LCOV - code coverage report
Current view: top level - pebble/sstable - block.go (source / functions)
Test: 2024-02-22 08:15Z e05c22dc - meta test only.lcov
Date: 2024-02-22 08:16:34
              Hit     Total   Coverage
Lines:        1059    1406    75.3 %
Functions:    0       0       -

          Line data    Source code
       1             : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package sstable
       6             : 
       7             : import (
       8             :         "context"
       9             :         "encoding/binary"
      10             :         "unsafe"
      11             : 
      12             :         "github.com/cockroachdb/errors"
      13             :         "github.com/cockroachdb/pebble/internal/base"
      14             :         "github.com/cockroachdb/pebble/internal/invariants"
      15             :         "github.com/cockroachdb/pebble/internal/keyspan"
      16             :         "github.com/cockroachdb/pebble/internal/manual"
      17             :         "github.com/cockroachdb/pebble/internal/rangedel"
      18             :         "github.com/cockroachdb/pebble/internal/rangekey"
      19             : )
      20             : 
      21           1 : func uvarintLen(v uint32) int {
      22           1 :         i := 0
      23           1 :         for v >= 0x80 {
      24           0 :                 v >>= 7
      25           0 :                 i++
      26           0 :         }
      27           1 :         return i + 1
      28             : }
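                      :
                      : // For illustration, uvarintLen agrees with the byte count produced by
                      : // binary.PutUvarint (a sketch exercising the function above):
                      : //
                      : //      var tmp [binary.MaxVarintLen32]byte
                      : //      for _, v := range []uint32{0x7f, 0x80, 1 << 20} {
                      : //              if uvarintLen(v) != binary.PutUvarint(tmp[:], uint64(v)) {
                      : //                      panic("uvarintLen disagrees with binary.PutUvarint")
                      : //              }
                      : //      }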
      29             : 
      30             : type blockWriter struct {
      31             :         restartInterval int
      32             :         nEntries        int
      33             :         nextRestart     int
      34             :         buf             []byte
      35             :         // For datablocks in TableFormatPebblev3, we steal the most significant bit
      36             :         // in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves
      37             :         // us with 31 bits, which is more than enough (no one needs > 2GB blocks).
      38             :         // Typically, restarts occur every 16 keys, and by storing this bit with the
      39             :         // restart, we can optimize for the case where a user wants to skip to the
      40             :         // next prefix which happens to be in the same data block, but is > 16 keys
      41             :         // away. We have seen production situations with 100+ versions per MVCC key
      42             :         // (which share the same prefix). Additionally, for such writers, the prefix
      43             :         // compression of the key, that shares the key with the preceding key, is
      44             :         // limited to the prefix part of the preceding key -- this ensures that when
       45             :         // doing NextPrefix (see blockIter) we don't need to assemble the full key
      46             :         // for each step since by limiting the length of the shared key we are
      47             :         // ensuring that any of the keys with the same prefix can be used to
      48             :         // assemble the full key when the prefix does change.
      49             :         restarts []uint32
      50             :         // Do not read curKey directly from outside blockWriter since it can have
      51             :         // the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or
      52             :         // getCurUserKey() instead.
      53             :         curKey []byte
      54             :         // curValue excludes the optional prefix provided to
      55             :         // storeWithOptionalValuePrefix.
      56             :         curValue []byte
      57             :         prevKey  []byte
      58             :         tmp      [4]byte
      59             :         // We don't know the state of the sets that were at the end of the previous
      60             :         // block, so this is initially 0. It may be true for the second and later
      61             :         // restarts in a block. Not having inter-block information is fine since we
      62             :         // will optimize by stepping through restarts only within the same block.
      63             :         // Note that the first restart is the first key in the block.
      64             :         setHasSameKeyPrefixSinceLastRestart bool
      65             : }
      66             : 
      67           1 : func (w *blockWriter) clear() {
      68           1 :         *w = blockWriter{
      69           1 :                 buf:      w.buf[:0],
      70           1 :                 restarts: w.restarts[:0],
      71           1 :                 curKey:   w.curKey[:0],
      72           1 :                 curValue: w.curValue[:0],
      73           1 :                 prevKey:  w.prevKey[:0],
      74           1 :         }
      75           1 : }
      76             : 
      77             : // MaximumBlockSize is an extremely generous maximum block size of 256MiB. We
      78             : // explicitly place this limit to reserve a few bits in the restart for
      79             : // internal use.
      80             : const MaximumBlockSize = 1 << 28
      81             : const setHasSameKeyPrefixRestartMask uint32 = 1 << 31
      82             : const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
      83             : const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
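                      :
                      : // For illustration, packing and unpacking a restart word with the mask
                      : // above (a sketch; decodeRestart on the read side performs the same
                      : // unpacking):
                      : //
                      : //      offset := uint32(1 << 20)                       // restart offset, < MaximumBlockSize
                      : //      word := offset | setHasSameKeyPrefixRestartMask // offset plus the stolen bit
                      : //      gotBit := word&setHasSameKeyPrefixRestartMask != 0
                      : //      gotOffset := word &^ setHasSameKeyPrefixRestartMask
                      : //      // gotBit == true, gotOffset == offset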
      84             : 
      85           1 : func (w *blockWriter) getCurKey() InternalKey {
      86           1 :         k := base.DecodeInternalKey(w.curKey)
      87           1 :         k.Trailer = k.Trailer & trailerObsoleteMask
      88           1 :         return k
      89           1 : }
      90             : 
      91           1 : func (w *blockWriter) getCurUserKey() []byte {
      92           1 :         n := len(w.curKey) - base.InternalTrailerLen
      93           1 :         if n < 0 {
      94           0 :                 panic(errors.AssertionFailedf("corrupt key in blockWriter buffer"))
      95             :         }
      96           1 :         return w.curKey[:n:n]
      97             : }
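                      :
                      : // For illustration, the full slice expression w.curKey[:n:n] above caps the
                      : // returned slice's capacity at n, so an append by the caller reallocates
                      : // rather than overwriting the trailer bytes that follow in w.curKey (a
                      : // sketch):
                      : //
                      : //      k := w.getCurUserKey()
                      : //      k = append(k, 'x') // cap(k) == len(k), so this copies; w.curKey is untouched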
      98             : 
      99             : // If !addValuePrefix, the valuePrefix is ignored.
     100             : func (w *blockWriter) storeWithOptionalValuePrefix(
     101             :         keySize int,
     102             :         value []byte,
     103             :         maxSharedKeyLen int,
     104             :         addValuePrefix bool,
     105             :         valuePrefix valuePrefix,
     106             :         setHasSameKeyPrefix bool,
     107           1 : ) {
     108           1 :         shared := 0
     109           1 :         if !setHasSameKeyPrefix {
     110           1 :                 w.setHasSameKeyPrefixSinceLastRestart = false
     111           1 :         }
     112           1 :         if w.nEntries == w.nextRestart {
     113           1 :                 w.nextRestart = w.nEntries + w.restartInterval
     114           1 :                 restart := uint32(len(w.buf))
     115           1 :                 if w.setHasSameKeyPrefixSinceLastRestart {
     116           1 :                         restart = restart | setHasSameKeyPrefixRestartMask
     117           1 :                 }
     118           1 :                 w.setHasSameKeyPrefixSinceLastRestart = true
     119           1 :                 w.restarts = append(w.restarts, restart)
     120           1 :         } else {
     121           1 :                 // TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
     122           1 :                 // is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
      123           1 :                 // show this to not be a performance win. For now, functions that
     124           1 :                 // unsafe cannot be inlined.
     125           1 :                 n := maxSharedKeyLen
     126           1 :                 if n > len(w.prevKey) {
     127           1 :                         n = len(w.prevKey)
     128           1 :                 }
     129           1 :                 asUint64 := func(b []byte, i int) uint64 {
     130           1 :                         return binary.LittleEndian.Uint64(b[i:])
     131           1 :                 }
     132           1 :                 for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
     133           1 :                         shared += 8
     134           1 :                 }
     135           1 :                 for shared < n && w.curKey[shared] == w.prevKey[shared] {
     136           1 :                         shared++
     137           1 :                 }
     138             :         }
     139             : 
     140           1 :         lenValuePlusOptionalPrefix := len(value)
     141           1 :         if addValuePrefix {
     142           1 :                 lenValuePlusOptionalPrefix++
     143           1 :         }
     144           1 :         needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix
     145           1 :         n := len(w.buf)
     146           1 :         if cap(w.buf) < n+needed {
     147           1 :                 newCap := 2 * cap(w.buf)
     148           1 :                 if newCap == 0 {
     149           1 :                         newCap = 1024
     150           1 :                 }
     151           1 :                 for newCap < n+needed {
     152           1 :                         newCap *= 2
     153           1 :                 }
     154           1 :                 newBuf := make([]byte, n, newCap)
     155           1 :                 copy(newBuf, w.buf)
     156           1 :                 w.buf = newBuf
     157             :         }
     158           1 :         w.buf = w.buf[:n+needed]
     159           1 : 
     160           1 :         // TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
     161           1 :         // faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
     162           1 :         // show this to not be a performance win.
     163           1 :         {
     164           1 :                 x := uint32(shared)
     165           1 :                 for x >= 0x80 {
     166           0 :                         w.buf[n] = byte(x) | 0x80
     167           0 :                         x >>= 7
     168           0 :                         n++
     169           0 :                 }
     170           1 :                 w.buf[n] = byte(x)
     171           1 :                 n++
     172             :         }
     173             : 
     174           1 :         {
     175           1 :                 x := uint32(keySize - shared)
     176           1 :                 for x >= 0x80 {
     177           0 :                         w.buf[n] = byte(x) | 0x80
     178           0 :                         x >>= 7
     179           0 :                         n++
     180           0 :                 }
     181           1 :                 w.buf[n] = byte(x)
     182           1 :                 n++
     183             :         }
     184             : 
     185           1 :         {
     186           1 :                 x := uint32(lenValuePlusOptionalPrefix)
     187           1 :                 for x >= 0x80 {
     188           1 :                         w.buf[n] = byte(x) | 0x80
     189           1 :                         x >>= 7
     190           1 :                         n++
     191           1 :                 }
     192           1 :                 w.buf[n] = byte(x)
     193           1 :                 n++
     194             :         }
     195             : 
     196           1 :         n += copy(w.buf[n:], w.curKey[shared:])
     197           1 :         if addValuePrefix {
     198           1 :                 w.buf[n : n+1][0] = byte(valuePrefix)
     199           1 :                 n++
     200           1 :         }
     201           1 :         n += copy(w.buf[n:], value)
     202           1 :         w.buf = w.buf[:n]
     203           1 : 
     204           1 :         w.curValue = w.buf[n-len(value):]
     205           1 : 
     206           1 :         w.nEntries++
     207             : }
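                      :
                      : // For illustration, each entry written above has the following layout in
                      : // w.buf (a sketch of the wire format, matching the varints and copies
                      : // above):
                      : //
                      : //      varint(shared) varint(keySize-shared) varint(len(value)[+1])
                      : //      curKey[shared:] [valuePrefix byte] value
                      : //
                      : // The value-length varint includes the optional prefix byte, and
                      : // curKey[:shared] is implied by the preceding entry.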
     208             : 
     209           1 : func (w *blockWriter) add(key InternalKey, value []byte) {
     210           1 :         w.addWithOptionalValuePrefix(
     211           1 :                 key, false, value, len(key.UserKey), false, 0, false)
     212           1 : }
     213             : 
     214             : // Callers that always set addValuePrefix to false should use add() instead.
     215             : //
     216             : // isObsolete indicates whether this key-value pair is obsolete in this
     217             : // sstable (only applicable when writing data blocks) -- see the comment in
     218             : // table.go and the longer one in format.go. addValuePrefix adds a 1 byte
     219             : // prefix to the value, specified in valuePrefix -- this is used for data
     220             : // blocks in TableFormatPebblev3 onwards for SETs (see the comment in
     221             : // format.go, with more details in value_block.go). setHasSameKeyPrefix is
     222             : // also used in TableFormatPebblev3 onwards for SETs.
     223             : func (w *blockWriter) addWithOptionalValuePrefix(
     224             :         key InternalKey,
     225             :         isObsolete bool,
     226             :         value []byte,
     227             :         maxSharedKeyLen int,
     228             :         addValuePrefix bool,
     229             :         valuePrefix valuePrefix,
     230             :         setHasSameKeyPrefix bool,
     231           1 : ) {
     232           1 :         w.curKey, w.prevKey = w.prevKey, w.curKey
     233           1 : 
     234           1 :         size := key.Size()
     235           1 :         if cap(w.curKey) < size {
     236           1 :                 w.curKey = make([]byte, 0, size*2)
     237           1 :         }
     238           1 :         w.curKey = w.curKey[:size]
     239           1 :         if isObsolete {
     240           1 :                 key.Trailer = key.Trailer | trailerObsoleteBit
     241           1 :         }
     242           1 :         key.Encode(w.curKey)
     243           1 : 
     244           1 :         w.storeWithOptionalValuePrefix(
     245           1 :                 size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix)
     246             : }
     247             : 
     248           1 : func (w *blockWriter) finish() []byte {
     249           1 :         // Write the restart points to the buffer.
     250           1 :         if w.nEntries == 0 {
     251           1 :                 // Every block must have at least one restart point.
     252           1 :                 if cap(w.restarts) > 0 {
     253           1 :                         w.restarts = w.restarts[:1]
     254           1 :                         w.restarts[0] = 0
     255           1 :                 } else {
     256           1 :                         w.restarts = append(w.restarts, 0)
     257           1 :                 }
     258             :         }
     259           1 :         tmp4 := w.tmp[:4]
     260           1 :         for _, x := range w.restarts {
     261           1 :                 binary.LittleEndian.PutUint32(tmp4, x)
     262           1 :                 w.buf = append(w.buf, tmp4...)
     263           1 :         }
     264           1 :         binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
     265           1 :         w.buf = append(w.buf, tmp4...)
     266           1 :         result := w.buf
     267           1 : 
     268           1 :         // Reset the block state.
     269           1 :         w.nEntries = 0
     270           1 :         w.nextRestart = 0
     271           1 :         w.buf = w.buf[:0]
     272           1 :         w.restarts = w.restarts[:0]
     273           1 :         return result
     274             : }
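                      :
                      : // For illustration, parsing the trailer that finish() appends (a sketch;
                      : // blockIter.init below performs the same arithmetic):
                      : //
                      : //      numRestarts := binary.LittleEndian.Uint32(block[len(block)-4:])
                      : //      restartsStart := len(block) - 4*(1+int(numRestarts))
                      : //      for j := 0; j < int(numRestarts); j++ {
                      : //              offset := binary.LittleEndian.Uint32(block[restartsStart+4*j:])
                      : //              _ = offset // offset of the j'th restart point (MSB may hold the stolen bit)
                      : //      }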
     275             : 
     276             : // emptyBlockSize holds the size of an empty block. Every block ends
     277             : // in a uint32 trailer encoding the number of restart points within the
     278             : // block.
     279             : const emptyBlockSize = 4
     280             : 
     281           1 : func (w *blockWriter) estimatedSize() int {
     282           1 :         return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
     283           1 : }
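                      :
                      : // For example, with 100 bytes of buffered entries and 2 restart points,
                      : // estimatedSize() = 100 + 4*2 + 4 = 112 bytes.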
     284             : 
     285             : type blockEntry struct {
     286             :         offset   int32
     287             :         keyStart int32
     288             :         keyEnd   int32
     289             :         valStart int32
     290             :         valSize  int32
     291             : }
     292             : 
     293             : // blockIter is an iterator over a single block of data.
     294             : //
     295             : // A blockIter provides an additional guarantee around key stability when a
     296             : // block has a restart interval of 1 (i.e. when there is no prefix
     297             : // compression). Key stability refers to whether the InternalKey.UserKey bytes
     298             : // returned by a positioning call will remain stable after a subsequent
     299             : // positioning call. The normal case is that a positioning call will invalidate
     300             : // any previously returned InternalKey.UserKey. If a block has a restart
     301             : // interval of 1 (no prefix compression), blockIter guarantees that
     302             : // InternalKey.UserKey will point to the key as stored in the block itself
     303             : // which will remain valid until the blockIter is closed. The key stability
     304             : // guarantee is used by the range tombstone and range key code, which knows that
     305             : // the respective blocks are always encoded with a restart interval of 1. This
     306             : // per-block key stability guarantee is sufficient for range tombstones and
      307             : // range keys as they are always encoded in a single block. Note: this
     308             : // stability guarantee no longer holds for a block iter with synthetic suffix
     309             : // replacement, but this doesn't matter, as the user will not open
     310             : // an iterator with a synthetic suffix on a block with rangekeys (for now).
     311             : //
     312             : // A blockIter also provides a value stability guarantee for range deletions and
     313             : // range keys since there is only a single range deletion and range key block
     314             : // per sstable and the blockIter will not release the bytes for the block until
     315             : // it is closed.
     316             : //
     317             : // Note on why blockIter knows about lazyValueHandling:
     318             : //
     319             : // blockIter's positioning functions (that return a LazyValue), are too
     320             : // complex to inline even prior to lazyValueHandling. blockIter.Next and
     321             : // blockIter.First were by far the cheapest and had costs 195 and 180
     322             : // respectively, which exceeds the budget of 80. We initially tried to keep
     323             : // the lazyValueHandling logic out of blockIter by wrapping it with a
     324             : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
     325             : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
     326             : // they called the corresponding blockIter func and then decided whether the
     327             : // value was in fact in-place (so return immediately) or needed further
     328             : // handling. But these also turned out too costly for mid-stack inlining since
     329             : // simple calls like the following have a high cost that is barely under the
     330             : // budget of 80
     331             : //
     332             : //      k, v := i.data.SeekGE(key, flags)  // cost 74
     333             : //      k, v := i.data.Next()              // cost 72
     334             : //
     335             : // We have 2 options for minimizing performance regressions:
     336             : //   - Include the lazyValueHandling logic in the already non-inlineable
     337             : //     blockIter functions: Since most of the time is spent in data block iters,
     338             : //     it is acceptable to take the small hit of unnecessary branching (which
     339             : //     hopefully branch prediction will predict correctly) for other kinds of
     340             : //     blocks.
     341             : //   - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
     342             : //     v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
     343             : //     v3 sstable. We would want to manage these copies via code generation.
     344             : //
     345             : // We have picked the first option here.
     346             : type blockIter struct {
     347             :         cmp   Compare
     348             :         split Split
     349             :         // offset is the byte index that marks where the current key/value is
     350             :         // encoded in the block.
     351             :         offset int32
     352             :         // nextOffset is the byte index where the next key/value is encoded in the
     353             :         // block.
     354             :         nextOffset int32
     355             :         // A "restart point" in a block is a point where the full key is encoded,
     356             :         // instead of just having a suffix of the key encoded. See readEntry() for
     357             :         // how prefix compression of keys works. Keys in between two restart points
     358             :         // only have a suffix encoded in the block. When restart interval is 1, no
     359             :         // prefix compression of keys happens. This is the case with range tombstone
     360             :         // blocks.
     361             :         //
     362             :         // All restart offsets are listed in increasing order in
     363             :         // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
     364             :         // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
     365             :         // therefore be seen as the point where data in the block ends, and a list
     366             :         // of offsets of all restart points begins.
     367             :         restarts int32
     368             :         // Number of restart points in this block. Encoded at the end of the block
     369             :         // as a uint32.
     370             :         numRestarts  int32
     371             :         globalSeqNum uint64
     372             :         ptr          unsafe.Pointer
     373             :         data         []byte
     374             :         // key contains the raw key the iterator is currently pointed at. This may
     375             :         // point directly to data stored in the block (for a key which has no prefix
     376             :         // compression), to fullKey (for a prefix compressed key), or to a slice of
     377             :         // data stored in cachedBuf (during reverse iteration).
     378             :         //
     379             :         // NB: In general, key contains the same logical content as ikey
     380             :         // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
     381             :         // replacement rule, this will not be the case. Therefore, key should never
     382             :         // be used after ikey is set.
     383             :         key []byte
     384             :         // fullKey is a buffer used for key prefix decompression.
     385             :         fullKey []byte
     386             :         // val contains the value the iterator is currently pointed at. If non-nil,
     387             :         // this points to a slice of the block data.
     388             :         val []byte
     389             :         // lazyValue is val turned into a LazyValue, whenever a positioning method
     390             :         // returns a non-nil key-value pair.
     391             :         lazyValue base.LazyValue
     392             :         // ikey contains the decoded InternalKey the iterator is currently pointed
     393             :         // at. Note that the memory backing ikey.UserKey is either data stored
     394             :         // directly in the block, fullKey, cachedBuf, or synthSuffixBuf. The key
     395             :         // stability guarantee for blocks built with a restart interval of 1 is
     396             :         // achieved by having ikey.UserKey always point to data stored directly in the
     397             :         // block.
     398             :         ikey InternalKey
     399             :         // cached and cachedBuf are used during reverse iteration. They are needed
     400             :         // because we can't perform prefix decoding in reverse, only in the forward
     401             :         // direction. In order to iterate in reverse, we decode and cache the entries
     402             :         // between two restart points.
     403             :         //
     404             :         // Note that cached[len(cached)-1] contains the previous entry to the one the
     405             :         // blockIter is currently pointed at. As usual, nextOffset will contain the
     406             :         // offset of the next entry. During reverse iteration, nextOffset will be
     407             :         // updated to point to offset, and we'll set the blockIter to point at the
     408             :         // entry cached[len(cached)-1]. See Prev() for more details.
     409             :         //
     410             :         // For a block encoded with a restart interval of 1, cached and cachedBuf
     411             :         // will not be used as there are no prefix compressed entries between the
     412             :         // restart points.
     413             :         cached    []blockEntry
     414             :         cachedBuf []byte
     415             :         handle    bufferHandle
      416             :         // for block iteration over already-loaded blocks.
     417             :         firstUserKey      []byte
     418             :         lazyValueHandling struct {
     419             :                 vbr            *valueBlockReader
     420             :                 hasValuePrefix bool
     421             :         }
     422             :         hideObsoletePoints bool
     423             : 
     424             :         // syntheticSuffix, if not nil, will replace the decoded ikey.UserKey suffix
     425             :         // before the key is returned to the user. A sequence of iter operations on a
     426             :         // block with a syntheticSuffix rule should return keys as if those operations
     427             :         // ran on a block with keys that all had the syntheticSuffix. As an example:
     428             :         // any sequence of block iter cmds should return the same keys for the
     429             :         // following two blocks:
     430             :         //
     431             :         // blockA: a@3,b@3,c@3
     432             :         // blockB: a@1,b@2,c@1 with syntheticSuffix=3
     433             :         //
     434             :         // To ensure this, Suffix replacement will not change the ordering of keys in
     435             :         // the block because the iter assumes that no two keys in the block share the
     436             :         // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
     437             :         // iterator handles "off by one" errors (explained in more detail in those
     438             :         // functions) when, for a given key, originalSuffix < searchSuffix <
     439             :         // replacementSuffix, with integer comparison. To handle these cases, the
     440             :         // iterator assumes:
     441             :         //
     442             :         //  pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
     443             :         //  for keys with a suffix.
     444             :         //
     445             :         //  NB: it is possible for a block iter to add a synthetic suffix on a key
     446             :         //  without a suffix, which implies
      447             :         //  pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0,
     448             :         //  however, the iterator would never need to handle an off by one error in
     449             :         //  this case since originalSuffix (empty) > searchSuffix (non empty), with
     450             :         //  integer comparison.
     451             :         //
     452             :         //
     453             :         // In addition, we also assume that any block with rangekeys will not contain
     454             :         // a synthetic suffix.
     455             :         syntheticSuffix SyntheticSuffix
     456             :         synthSuffixBuf  []byte
     457             : }
     458             : 
     459             : // blockIter implements the base.InternalIterator interface.
     460             : var _ base.InternalIterator = (*blockIter)(nil)
     461             : 
     462             : func newBlockIter(
     463             :         cmp Compare, split Split, block block, syntheticSuffix SyntheticSuffix,
     464           1 : ) (*blockIter, error) {
     465           1 :         i := &blockIter{}
     466           1 :         return i, i.init(cmp, split, block, 0, false, syntheticSuffix)
     467           1 : }
     468             : 
     469           0 : func (i *blockIter) String() string {
     470           0 :         return "block"
     471           0 : }
     472             : 
     473             : func (i *blockIter) init(
     474             :         cmp Compare,
     475             :         split Split,
     476             :         block block,
     477             :         globalSeqNum uint64,
     478             :         hideObsoletePoints bool,
     479             :         syntheticSuffix SyntheticSuffix,
     480           1 : ) error {
     481           1 :         numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
     482           1 :         if numRestarts == 0 {
     483           0 :                 return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
     484           0 :         }
     485           1 :         i.syntheticSuffix = syntheticSuffix
     486           1 :         i.synthSuffixBuf = i.synthSuffixBuf[:0]
     487           1 :         i.split = split
     488           1 :         i.cmp = cmp
     489           1 :         i.restarts = int32(len(block)) - 4*(1+numRestarts)
     490           1 :         i.numRestarts = numRestarts
     491           1 :         i.globalSeqNum = globalSeqNum
     492           1 :         i.ptr = unsafe.Pointer(&block[0])
     493           1 :         i.data = block
     494           1 :         i.fullKey = i.fullKey[:0]
     495           1 :         i.val = nil
     496           1 :         i.hideObsoletePoints = hideObsoletePoints
     497           1 :         i.clearCache()
     498           1 :         if i.restarts > 0 {
     499           1 :                 if err := i.readFirstKey(); err != nil {
     500           0 :                         return err
     501           0 :                 }
     502           1 :         } else {
     503           1 :                 // Block is empty.
     504           1 :                 i.firstUserKey = nil
     505           1 :         }
     506           1 :         return nil
     507             : }
     508             : 
     509             : // NB: two cases of hideObsoletePoints:
     510             : //   - Local sstable iteration: globalSeqNum will be set iff the sstable was
     511             : //     ingested.
     512             : //   - Foreign sstable iteration: globalSeqNum is always set.
     513             : func (i *blockIter) initHandle(
     514             :         cmp Compare,
     515             :         split Split,
     516             :         block bufferHandle,
     517             :         globalSeqNum uint64,
     518             :         hideObsoletePoints bool,
     519             :         syntheticSuffix SyntheticSuffix,
     520           1 : ) error {
     521           1 :         i.handle.Release()
     522           1 :         i.handle = block
     523           1 :         return i.init(cmp, split, block.Get(), globalSeqNum, hideObsoletePoints, syntheticSuffix)
     524           1 : }
     525             : 
     526           1 : func (i *blockIter) invalidate() {
     527           1 :         i.clearCache()
     528           1 :         i.offset = 0
     529           1 :         i.nextOffset = 0
     530           1 :         i.restarts = 0
     531           1 :         i.numRestarts = 0
     532           1 :         i.data = nil
     533           1 : }
     534             : 
     535             : // isDataInvalidated returns true when the blockIter has been invalidated
     536             : // using an invalidate call. NB: this is different from blockIter.Valid
     537             : // which is part of the InternalIterator implementation.
     538           1 : func (i *blockIter) isDataInvalidated() bool {
     539           1 :         return i.data == nil
     540           1 : }
     541             : 
     542           1 : func (i *blockIter) resetForReuse() blockIter {
     543           1 :         return blockIter{
     544           1 :                 fullKey:   i.fullKey[:0],
     545           1 :                 cached:    i.cached[:0],
     546           1 :                 cachedBuf: i.cachedBuf[:0],
     547           1 :                 data:      nil,
     548           1 :         }
     549           1 : }
     550             : 
     551           1 : func (i *blockIter) readEntry() {
     552           1 :         ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
     553           1 : 
     554           1 :         // This is an ugly performance hack. Reading entries from blocks is one of
     555           1 :         // the inner-most routines and decoding the 3 varints per-entry takes
      556           1 :         // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
     557           1 :         // us, so we do it manually. This provides a 10-15% performance improvement
     558           1 :         // on blockIter benchmarks on both go1.11 and go1.12.
     559           1 :         //
     560           1 :         // TODO(peter): remove this hack if go:inline is ever supported.
     561           1 : 
     562           1 :         var shared uint32
     563           1 :         if a := *((*uint8)(ptr)); a < 128 {
     564           1 :                 shared = uint32(a)
     565           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     566           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     567           0 :                 shared = uint32(b)<<7 | uint32(a)
     568           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     569           0 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     570           0 :                 shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     571           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     572           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     573           0 :                 shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     574           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     575           0 :         } else {
     576           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     577           0 :                 shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     578           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     579           0 :         }
     580             : 
     581           1 :         var unshared uint32
     582           1 :         if a := *((*uint8)(ptr)); a < 128 {
     583           1 :                 unshared = uint32(a)
     584           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     585           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     586           0 :                 unshared = uint32(b)<<7 | uint32(a)
     587           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     588           0 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     589           0 :                 unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     590           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     591           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     592           0 :                 unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     593           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     594           0 :         } else {
     595           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     596           0 :                 unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     597           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     598           0 :         }
     599             : 
     600           1 :         var value uint32
     601           1 :         if a := *((*uint8)(ptr)); a < 128 {
     602           1 :                 value = uint32(a)
     603           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     604           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     605           1 :                 value = uint32(b)<<7 | uint32(a)
     606           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     607           1 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     608           0 :                 value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     609           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     610           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     611           0 :                 value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     612           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     613           0 :         } else {
     614           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     615           0 :                 value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     616           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     617           0 :         }
     618             : 
     619           1 :         unsharedKey := getBytes(ptr, int(unshared))
     620           1 :         // TODO(sumeer): move this into the else block below.
     621           1 :         i.fullKey = append(i.fullKey[:shared], unsharedKey...)
     622           1 :         if shared == 0 {
     623           1 :                 // Provide stability for the key across positioning calls if the key
     624           1 :                 // doesn't share a prefix with the previous key. This removes requiring the
     625           1 :                 // key to be copied if the caller knows the block has a restart interval of
     626           1 :                 // 1. An important example of this is range-del blocks.
     627           1 :                 i.key = unsharedKey
     628           1 :         } else {
     629           1 :                 i.key = i.fullKey
     630           1 :         }
     631           1 :         ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
     632           1 :         i.val = getBytes(ptr, int(value))
     633           1 :         i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
     634             : }
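                      :
                      : // For illustration, the manual decoding in readEntry amounts to the
                      : // following in safe Go (a sketch; binary.Uvarint is not inlined by the
                      : // compiler, which is why the code above open-codes it):
                      : //
                      : //      data := i.data[i.offset:]
                      : //      shared, n1 := binary.Uvarint(data)
                      : //      unshared, n2 := binary.Uvarint(data[n1:])
                      : //      value, n3 := binary.Uvarint(data[n1+n2:])
                      : //      // the unshared key bytes and then the value follow at data[n1+n2+n3:]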
     635             : 
     636           1 : func (i *blockIter) readFirstKey() error {
     637           1 :         ptr := i.ptr
     638           1 : 
     639           1 :         // This is an ugly performance hack. Reading entries from blocks is one of
     640           1 :         // the inner-most routines and decoding the 3 varints per-entry takes
      641           1 :         // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
     642           1 :         // us, so we do it manually. This provides a 10-15% performance improvement
     643           1 :         // on blockIter benchmarks on both go1.11 and go1.12.
     644           1 :         //
     645           1 :         // TODO(peter): remove this hack if go:inline is ever supported.
     646           1 : 
     647           1 :         if shared := *((*uint8)(ptr)); shared == 0 {
     648           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     649           1 :         } else {
     650           0 :                 // The shared length is != 0, which is invalid.
     651           0 :                 panic("first key in block must have zero shared length")
     652             :         }
     653             : 
     654           1 :         var unshared uint32
     655           1 :         if a := *((*uint8)(ptr)); a < 128 {
     656           1 :                 unshared = uint32(a)
     657           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     658           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     659           0 :                 unshared = uint32(b)<<7 | uint32(a)
     660           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     661           0 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     662           0 :                 unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     663           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     664           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     665           0 :                 unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     666           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     667           0 :         } else {
     668           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     669           0 :                 unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     670           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     671           0 :         }
     672             : 
     673             :         // Skip the value length.
     674           1 :         if a := *((*uint8)(ptr)); a < 128 {
     675           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     676           1 :         } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
     677           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     678           1 :         } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
     679           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     680           0 :         } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
     681           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     682           0 :         } else {
     683           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     684           0 :         }
     685             : 
     686           1 :         firstKey := getBytes(ptr, int(unshared))
     687           1 :         // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
     688           1 :         // BlockIter benchmarks.
     689           1 :         if n := len(firstKey) - 8; n >= 0 {
     690           1 :                 i.firstUserKey = firstKey[:n:n]
     691           1 :         } else {
     692           0 :                 i.firstUserKey = nil
     693           0 :                 return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
     694           0 :         }
     695           1 :         return nil
     696             : }
     697             : 
     698             : // The sstable internal obsolete bit is set when writing a block and unset by
     699             : // blockIter, so no code outside block writing/reading code ever sees it.
     700             : const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
     701             : const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
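                      :
                      : // For illustration, the round trip of the obsolete bit (a sketch, assuming
                      : // the usual trailer layout (seqNum << 8) | kind and a hypothetical seqnum):
                      : //
                      : //      trailer := (uint64(42) << 8) | uint64(InternalKeyKindSet)
                      : //      marked := trailer | trailerObsoleteBit   // as encoded by blockWriter
                      : //      restored := marked & trailerObsoleteMask // as decoded by blockIter
                      : //      // restored == trailer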
     702             : 
     703           1 : func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
     704           1 :         // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
     705           1 :         // BlockIter benchmarks.
     706           1 :         if n := len(key) - 8; n >= 0 {
     707           1 :                 trailer := binary.LittleEndian.Uint64(key[n:])
     708           1 :                 hiddenPoint = i.hideObsoletePoints &&
     709           1 :                         (trailer&trailerObsoleteBit != 0)
     710           1 :                 i.ikey.Trailer = trailer & trailerObsoleteMask
     711           1 :                 i.ikey.UserKey = key[:n:n]
     712           1 :                 if i.globalSeqNum != 0 {
     713           1 :                         i.ikey.SetSeqNum(i.globalSeqNum)
     714           1 :                 }
     715           1 :         } else {
     716           1 :                 i.ikey.Trailer = uint64(InternalKeyKindInvalid)
     717           1 :                 i.ikey.UserKey = nil
     718           1 :         }
     719           1 :         return hiddenPoint
     720             : }
     721             : 
     722             : // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
     723             : // i.syntheticSuffix. allowInPlace is set to false if there's a chance that
     724             : // i.ikey.UserKey points to the same buffer as i.cachedBuf (i.e. during reverse
     725             : // iteration).
     726           1 : func (i *blockIter) maybeReplaceSuffix(allowInPlace bool) {
     727           1 :         if i.syntheticSuffix != nil && i.ikey.UserKey != nil {
     728           1 :                 prefixLen := i.split(i.ikey.UserKey)
     729           1 :                 if allowInPlace && cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) {
     730           1 :                         i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...)
     731           1 :                         return
     732           1 :                 }
     733             :                 // If ikey is cached or may get cached, we must copy
     734             :                 // UserKey to a new buffer before prefix replacement.
     735           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
     736           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
     737           1 :                 i.ikey.UserKey = i.synthSuffixBuf
     738             :         }
     739             : }
     740             : 
     741           1 : func (i *blockIter) clearCache() {
     742           1 :         i.cached = i.cached[:0]
     743           1 :         i.cachedBuf = i.cachedBuf[:0]
     744           1 : }
     745             : 
     746           1 : func (i *blockIter) cacheEntry() {
     747           1 :         var valStart int32
     748           1 :         valSize := int32(len(i.val))
     749           1 :         if valSize > 0 {
     750           1 :                 valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
     751           1 :         }
     752             : 
     753           1 :         i.cached = append(i.cached, blockEntry{
     754           1 :                 offset:   i.offset,
     755           1 :                 keyStart: int32(len(i.cachedBuf)),
     756           1 :                 keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
     757           1 :                 valStart: valStart,
     758           1 :                 valSize:  valSize,
     759           1 :         })
     760           1 :         i.cachedBuf = append(i.cachedBuf, i.key...)
     761             : }
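                      :
                      : // For illustration, how a cached entry is consumed during reverse iteration
                      : // (a sketch mirroring the Prev() behavior described in the blockIter
                      : // comments above):
                      : //
                      : //      e := i.cached[len(i.cached)-1]
                      : //      i.key = i.cachedBuf[e.keyStart:e.keyEnd]
                      : //      i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
                      : //      i.nextOffset = i.offset
                      : //      i.offset = e.offset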
     762             : 
     763           1 : func (i *blockIter) getFirstUserKey() []byte {
     764           1 :         return i.firstUserKey
     765           1 : }
     766             : 
     767             : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
     768             : // package.
     769           1 : func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
     770           1 :         if invariants.Enabled && i.isDataInvalidated() {
     771           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
     772             :         }
     773             : 
     774           1 :         i.clearCache()
     775           1 :         // Find the index of the smallest restart point whose key is > the key
     776           1 :         // sought; index will be numRestarts if there is no such restart point.
     777           1 :         i.offset = 0
     778           1 :         var index int32
     779           1 : 
     780           1 :         {
      781           1 :                 // NB: manually inlined sort.Search is ~5% faster.
     782           1 :                 //
     783           1 :                 // Define f(-1) == false and f(n) == true.
     784           1 :                 // Invariant: f(index-1) == false, f(upper) == true.
     785           1 :                 upper := i.numRestarts
     786           1 :                 for index < upper {
     787           1 :                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
     788           1 :                         // index ≤ h < upper
     789           1 :                         offset := decodeRestart(i.data[i.restarts+4*h:])
     790           1 :                         // For a restart point, there are 0 bytes shared with the previous key.
     791           1 :                         // The varint encoding of 0 occupies 1 byte.
     792           1 :                         ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
     793           1 : 
     794           1 :                         // Decode the key at that restart point, and compare it to the key
     795           1 :                         // sought. See the comment in readEntry for why we manually inline the
     796           1 :                         // varint decoding.
     797           1 :                         var v1 uint32
     798           1 :                         if a := *((*uint8)(ptr)); a < 128 {
     799           1 :                                 v1 = uint32(a)
     800           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     801           1 :                         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     802           0 :                                 v1 = uint32(b)<<7 | uint32(a)
     803           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     804           0 :                         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     805           0 :                                 v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     806           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     807           0 :                         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     808           0 :                                 v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     809           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     810           0 :                         } else {
     811           0 :                                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     812           0 :                                 v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     813           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     814           0 :                         }
     815             : 
     816           1 :                         if *((*uint8)(ptr)) < 128 {
     817           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     818           1 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
     819           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     820           0 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
     821           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     822           0 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
     823           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     824           0 :                         } else {
     825           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     826           0 :                         }
     827             : 
     828             :                         // Manually inlining part of base.DecodeInternalKey provides a 5-10%
     829             :                         // speedup on BlockIter benchmarks.
     830           1 :                         s := getBytes(ptr, int(v1))
     831           1 :                         var k []byte
     832           1 :                         if n := len(s) - 8; n >= 0 {
     833           1 :                                 k = s[:n:n]
     834           1 :                         }
     835             :                         // Else k is invalid, and left as nil
     836             : 
     837           1 :                         if i.cmp(key, k) > 0 {
     838           1 :                                 // The search key is greater than the user key at this restart point.
     839           1 :                                 // Search beyond this restart point, since we are trying to find the
     840           1 :                                 // first restart point with a user key >= the search key.
      841           1 :                                 index = h + 1 // preserves f(index-1) == false
     842           1 :                         } else {
     843           1 :                                 // k >= search key, so prune everything after index (since index
     844           1 :                                 // satisfies the property we are looking for).
      845           1 :                                 upper = h // preserves f(upper) == true
     846           1 :                         }
     847             :                 }
     848             :                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
     849             :                 // => answer is index.
     850             :         }
     851             : 
     852             :         // index is the first restart point with key >= search key. Define the keys
     853             :         // between a restart point and the next restart point as belonging to that
     854             :         // restart point.
     855             :         //
     856             :         // Since keys are strictly increasing, if index > 0 then the restart point
     857             :         // at index-1 will be the first one that has some keys belonging to it that
     858             :         // could be equal to the search key.  If index == 0, then all keys in this
     859             :         // block are larger than the key sought, and offset remains at zero.
     860           1 :         if index > 0 {
     861           1 :                 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
     862           1 :         }
     863           1 :         i.readEntry()
     864           1 :         hiddenPoint := i.decodeInternalKey(i.key)
     865           1 : 
     866           1 :         // Iterate from that restart point to somewhere >= the key sought.
     867           1 :         if !i.valid() {
     868           0 :                 return nil, base.LazyValue{}
     869           0 :         }
     870             : 
     871             :         // A note on seeking in a block with a suffix replacement rule: even though
     872             :         // the binary search above was conducted on keys without suffix replacement,
     873             :         // Seek will still return the correct suffix replaced key. A binary
     874             :         // search without suffix replacement will land on a key that is _less_ than
     875             :         // the key the search would have landed on if all keys were already suffix
     876             :         // replaced. Since Seek then conducts forward iteration to the first suffix
     877             :         // replaced user key that is greater than or equal to the search key, the
     878             :         // correct key is still returned.
     879             :         //
     880             :         // As an example, consider the following block with a restart interval of 1,
     881             :         // with a replacement suffix of "4":
     882             :         // - Pre-suffix replacement: apple@1, banana@3
     883             :         // - Post-suffix replacement: apple@4, banana@4
     884             :         //
     885             :         // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
      886             :         // chronological order (i.e. apple@1 > apple@3), the binary search without
     887             :         // suffix replacement would return apple@1. A binary search with suffix
     888             :         // replacement would return banana@4. After beginning forward iteration from
     889             :         // either returned restart point, forward iteration would
     890             :         // always return the correct key, banana@4.
     891             :         //
     892             :         // Further, if the user searched with apple@0 (i.e. a suffix less than the
      893             :         // pre-replacement suffix) or with apple@5 (a suffix larger than the post-
      894             :         // replacement suffix), the binary search with or without suffix replacement
     895             :         // would land on the same key, as we assume the following:
     896             :         // (1) no two keys in the sst share the same prefix.
      897             :         // (2) pebble.Compare(replacementSuffix, originalSuffix) > 0
     898             : 
     899           1 :         i.maybeReplaceSuffix(true /*allowInPlace*/)
     900           1 : 
     901           1 :         if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 {
     902           1 :                 // Initialize i.lazyValue
     903           1 :                 if !i.lazyValueHandling.hasValuePrefix ||
     904           1 :                         base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
     905           1 :                         i.lazyValue = base.MakeInPlaceValue(i.val)
     906           1 :                 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
     907           1 :                         i.lazyValue = base.MakeInPlaceValue(i.val[1:])
     908           1 :                 } else {
     909           1 :                         i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
     910           1 :                 }
     911           1 :                 return &i.ikey, i.lazyValue
     912             :         }
     913           1 :         for i.Next(); i.valid(); i.Next() {
     914           1 :                 if i.cmp(i.ikey.UserKey, key) >= 0 {
     915           1 :                         // i.Next() has already initialized i.lazyValue.
     916           1 :                         return &i.ikey, i.lazyValue
     917           1 :                 }
     918             :         }
     919           1 :         return nil, base.LazyValue{}
     920             : }
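                      : 
                      : // [Editor's sketch, not part of the original block.go]: the three-way
                      : // lazy-value initialization repeated in SeekGE, SeekLT, First, Last and
                      : // Next can be read as one helper. It assumes the value-prefix scheme used
                      : // in this file: SET keys in newer table formats carry a leading prefix
                      : // byte that marks the value as either stored in place or as a handle
                      : // into a value block (read through lazyValueHandling.vbr).
                      : func (i *blockIter) makeLazyValueSketch() base.LazyValue {
                      :         if !i.lazyValueHandling.hasValuePrefix ||
                      :                 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
                      :                 // No prefix byte present: the raw value is the value.
                      :                 return base.MakeInPlaceValue(i.val)
                      :         }
                      :         if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
                      :                 // Prefix byte present, but the value itself is stored in place.
                      :                 return base.MakeInPlaceValue(i.val[1:])
                      :         }
                      :         // The prefix marks a value handle; fetch lazily from the value block.
                      :         return i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
                      : }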
     921             : 
     922             : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
     923             : // pebble package.
     924             : func (i *blockIter) SeekPrefixGE(
     925             :         prefix, key []byte, flags base.SeekGEFlags,
     926           0 : ) (*base.InternalKey, base.LazyValue) {
     927           0 :         // This should never be called as prefix iteration is handled by sstable.Iterator.
     928           0 :         panic("pebble: SeekPrefixGE unimplemented")
     929             : }
     930             : 
     931             : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
     932             : // package.
     933           1 : func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
     934           1 :         if invariants.Enabled && i.isDataInvalidated() {
     935           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
     936             :         }
     937             : 
     938           1 :         i.clearCache()
     939           1 :         // Find the index of the smallest restart point whose key is >= the key
     940           1 :         // sought; index will be numRestarts if there is no such restart point.
     941           1 :         i.offset = 0
     942           1 :         var index int32
     943           1 : 
     944           1 :         {
     945           1 :                 // NB: manually inlined sort.Search is ~5% faster.
     946           1 :                 //
     947           1 :                 // Define f(-1) == false and f(n) == true.
     948           1 :                 // Invariant: f(index-1) == false, f(upper) == true.
     949           1 :                 upper := i.numRestarts
     950           1 :                 for index < upper {
     951           1 :                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
     952           1 :                         // index ≤ h < upper
     953           1 :                         offset := decodeRestart(i.data[i.restarts+4*h:])
     954           1 :                         // For a restart point, there are 0 bytes shared with the previous key.
     955           1 :                         // The varint encoding of 0 occupies 1 byte.
     956           1 :                         ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
     957           1 : 
     958           1 :                         // Decode the key at that restart point, and compare it to the key
     959           1 :                         // sought. See the comment in readEntry for why we manually inline the
     960           1 :                         // varint decoding.
     961           1 :                         var v1 uint32
     962           1 :                         if a := *((*uint8)(ptr)); a < 128 {
     963           1 :                                 v1 = uint32(a)
     964           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     965           1 :                         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     966           0 :                                 v1 = uint32(b)<<7 | uint32(a)
     967           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     968           0 :                         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     969           0 :                                 v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     970           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     971           0 :                         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     972           0 :                                 v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     973           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     974           0 :                         } else {
     975           0 :                                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     976           0 :                                 v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     977           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     978           0 :                         }
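                      :                         // [Editor's aside] This unrolled chain (and the one
                      :                         // below) decodes one base-128 varint; a safe but slower
                      :                         // equivalent is u, n := binary.Uvarint(buf), taking
                      :                         // v1 = uint32(u) and advancing ptr by n bytes.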
     979             : 
     980           1 :                         if *((*uint8)(ptr)) < 128 {
     981           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     982           1 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
     983           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     984           1 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
     985           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     986           0 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
     987           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     988           0 :                         } else {
     989           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     990           0 :                         }
     991             : 
     992             :                         // Manually inlining part of base.DecodeInternalKey provides a 5-10%
     993             :                         // speedup on BlockIter benchmarks.
     994           1 :                         s := getBytes(ptr, int(v1))
     995           1 :                         var k []byte
     996           1 :                         if n := len(s) - 8; n >= 0 {
     997           1 :                                 k = s[:n:n]
     998           1 :                         }
     999             :                         // Else k is invalid, and left as nil
    1000             : 
    1001           1 :                         if i.cmp(key, k) > 0 {
    1002           1 :                                 // The search key is greater than the user key at this restart point.
    1003           1 :                                 // Search beyond this restart point, since we are trying to find the
    1004           1 :                                 // first restart point with a user key >= the search key.
    1005           1 :                                 index = h + 1 // preserves f(i-1) == false
    1006           1 :                         } else {
    1007           1 :                                 // k >= search key, so prune everything after index (since index
    1008           1 :                                 // satisfies the property we are looking for).
     1009           1 :                                 upper = h // preserves f(upper) == true
    1010           1 :                         }
    1011             :                 }
    1012             :                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
    1013             :                 // => answer is index.
    1014             :         }
    1015             : 
    1016           1 :         if index == 0 {
    1017           1 :                 if i.syntheticSuffix != nil {
    1018           1 :                         // The binary search was conducted on keys without suffix replacement,
    1019           1 :                         // implying the first key in the block may be less than the search key. To
    1020           1 :                         // double check, get the first key in the block with suffix replacement
    1021           1 :                         // and compare to the search key. Consider the following example: suppose
    1022           1 :                         // the user searches with a@3, the first key in the block is a@2 and the
    1023           1 :                         // block contains a suffix replacement rule of 4. Since a@3 sorts before
    1024           1 :                         // a@2, the binary search would return index==0. Without conducting the
    1025           1 :                         // suffix replacement, the SeekLT would incorrectly return nil. With
    1026           1 :                         // suffix replacement though, a@4 should be returned as a@4 sorts before
    1027           1 :                         // a@3.
    1028           1 :                         ikey, lazyVal := i.First()
    1029           1 :                         if i.cmp(ikey.UserKey, key) < 0 {
    1030           0 :                                 return ikey, lazyVal
    1031           0 :                         }
    1032             :                 }
    1033             :                 // If index == 0 then all keys in this block are larger than the key
    1034             :                 // sought, so there is no match.
    1035           1 :                 i.offset = -1
    1036           1 :                 i.nextOffset = 0
    1037           1 :                 return nil, base.LazyValue{}
    1038             :         }
    1039             : 
    1040             :         // INVARIANT: index > 0
    1041             : 
    1042             :         // Ignoring suffix replacement, index is the first restart point with key >=
    1043             :         // search key. Define the keys between a restart point and the next restart
    1044             :         // point as belonging to that restart point. Note that index could be equal to
    1045             :         // i.numRestarts, i.e., we are past the last restart.  Since keys are strictly
     1046             : // increasing, the restart point at index-1 will be the first one that
    1047             :         // has some keys belonging to it that are less than the search key.
    1048             :         //
    1049             :         // Next, we will search between the restart at index-1 and the restart point
    1050             :         // at index, for the first key >= key, and then on finding it, return
    1051             :         // i.Prev(). We need to know when we have hit the offset for index, since then
    1052             :         // we can stop searching. targetOffset encodes that offset for index.
    1053           1 :         targetOffset := i.restarts
    1054           1 :         i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
    1055           1 :         if index < i.numRestarts {
    1056           1 :                 targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
    1057           1 : 
    1058           1 :                 if i.syntheticSuffix != nil {
    1059           0 :                         // The binary search was conducted on keys without suffix replacement,
    1060           0 :                         // implying the returned restart point (index) may be less than the search
    1061           0 :                         // key, breaking the assumption described above.
    1062           0 :                         //
    1063           0 :                         // For example: consider this block with a replacement ts of 4, and
     1064           0 :                         // restart interval of 1: pre-replacement a@3, b@2, c@3; post-
     1065           0 :                         // replacement a@4, b@4, c@4.
    1066           0 :                         //
     1067           0 :                         // Suppose the client calls SeekLT(b@3); SeekLT must return b@4.
    1068           0 :                         //
     1069           0 :                         // If the client calls SeekLT(b@3), the binary search would return b@2,
    1070           0 :                         // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
    1071           0 :                         // begin forward iteration from a@3, the previous restart point, to
    1072           0 :                         // b{suffix}. The iteration stops when it encounters a key geq to the
    1073           0 :                         // search key or if it reaches the upper bound. Without suffix
    1074           0 :                         // replacement, we can assume that the upper bound of this forward
    1075           0 :                         // iteration, b{suffix}, is greater than the search key, as implied by the
    1076           0 :                         // binary search.
    1077           0 :                         //
    1078           0 :                         // If we naively hold this assumption with suffix replacement, the
    1079           0 :                         // iteration would terminate at the upper bound, b@4, call i.Prev, and
    1080           0 :                         // incorrectly return a@4. To correct for this, if the original returned
    1081           0 :                         // index is less than the search key, shift our forward iteration to begin
     1082           0 :                         // at index instead of index-1. With suffix replacement, the key at index
     1083           0 :                         // is guaranteed to be the highest restart point less than the search key
    1084           0 :                         // (i.e. the same property of index-1 for a block without suffix
    1085           0 :                         // replacement). This property holds because of the invariant that a block
    1086           0 :                         // with suffix replacement will not have two keys that share the same
    1087           0 :                         // prefix. To consider the above example, binary searching with b@3 landed
    1088           0 :                         // naively at a@3, but since b@4<b@3, we shift our forward iteration to
    1089           0 :                         // begin at b@4. We never need to shift by more than one restart point
    1090           0 :                         // (i.e. to c@4) because it's impossible for the search key to be greater
     1091           0 :                         // than the key at the next restart point in the block because that
    1092           0 :                         // key will always have a different prefix. Put another way, because no
    1093           0 :                         // key in the block shares the same prefix, naive binary search should
    1094           0 :                         // always land at most 1 restart point off the correct one.
    1095           0 : 
    1096           0 :                         naiveOffset := i.offset
    1097           0 :                         // Shift up to the original binary search result and decode the key.
    1098           0 :                         i.offset = targetOffset
    1099           0 :                         i.readEntry()
    1100           0 :                         i.decodeInternalKey(i.key)
    1101           0 :                         i.maybeReplaceSuffix(false /* allowInPlace */)
    1102           0 : 
    1103           0 :                         // If the binary search point is actually less than the search key, post
    1104           0 :                         // replacement, bump the target offset.
    1105           0 :                         if i.cmp(i.ikey.UserKey, key) < 0 {
    1106           0 :                                 i.offset = targetOffset
    1107           0 :                                 if index+1 < i.numRestarts {
    1108           0 :                                         // if index+1 is within the i.data bounds, use it to find the target
    1109           0 :                                         // offset.
    1110           0 :                                         targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
    1111           0 :                                 } else {
    1112           0 :                                         targetOffset = i.restarts
    1113           0 :                                 }
    1114           0 :                         } else {
    1115           0 :                                 i.offset = naiveOffset
    1116           0 :                         }
    1117             :                 }
    1118             :         }
    1119             : 
    1120             :         // Init nextOffset for the forward iteration below.
    1121           1 :         i.nextOffset = i.offset
    1122           1 : 
    1123           1 :         for {
    1124           1 :                 i.offset = i.nextOffset
    1125           1 :                 i.readEntry()
    1126           1 :                 // When hidden keys are common, there is additional optimization possible
    1127           1 :                 // by not caching entries that are hidden (note that some calls to
    1128           1 :                 // cacheEntry don't decode the internal key before caching, but checking
    1129           1 :                 // whether a key is hidden does not require full decoding). However, we do
    1130           1 :                 // need to use the blockEntry.offset in the cache for the first entry at
     1131           1 :                 // the restart point to do the binary search when the cache is empty -- so
     1132           1 :                 // we would need to cache that first entry (though not the key) even if it
     1133           1 :                 // was hidden. Our current assumption is that if there are large numbers
    1134           1 :                 // of hidden keys we will be able to skip whole blocks (using block
    1135           1 :                 // property filters) so we don't bother optimizing.
    1136           1 :                 hiddenPoint := i.decodeInternalKey(i.key)
    1137           1 :                 i.maybeReplaceSuffix(false /*allowInPlace*/)
    1138           1 : 
    1139           1 :                 // NB: we don't use the hiddenPoint return value of decodeInternalKey
    1140           1 :                 // since we want to stop as soon as we reach a key >= ikey.UserKey, so
    1141           1 :                 // that we can reverse.
    1142           1 :                 if i.cmp(i.ikey.UserKey, key) >= 0 {
    1143           1 :                         // The current key is greater than or equal to our search key. Back up to
    1144           1 :                         // the previous key which was less than our search key. Note that this for
    1145           1 :                         // loop will execute at least once with this if-block not being true, so
    1146           1 :                         // the key we are backing up to is the last one this loop cached.
    1147           1 :                         return i.Prev()
    1148           1 :                 }
    1149             : 
    1150           1 :                 if i.nextOffset >= targetOffset {
    1151           1 :                         // We've reached the end of the current restart block. Return the
    1152           1 :                         // current key if not hidden, else call Prev().
    1153           1 :                         //
    1154           1 :                         // When the restart interval is 1, the first iteration of the for loop
    1155           1 :                         // will bring us here. In that case ikey is backed by the block so we
    1156           1 :                         // get the desired key stability guarantee for the lifetime of the
    1157           1 :                         // blockIter. That is, we never cache anything and therefore never
    1158           1 :                         // return a key backed by cachedBuf.
    1159           1 :                         if hiddenPoint {
    1160           1 :                                 return i.Prev()
    1161           1 :                         }
    1162           1 :                         break
    1163             :                 }
    1164           1 :                 i.cacheEntry()
    1165             :         }
    1166             : 
    1167           1 :         if !i.valid() {
    1168           1 :                 return nil, base.LazyValue{}
    1169           1 :         }
    1170           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1171           1 :                 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
    1172           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val)
    1173           1 :         } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
    1174           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val[1:])
    1175           1 :         } else {
    1176           1 :                 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
    1177           1 :         }
    1178           1 :         return &i.ikey, i.lazyValue
    1179             : }
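                      : 
                      : // [Editor's sketch, not part of the original block.go]: the manually
                      : // inlined binary search at the top of SeekGE and SeekLT computes the same
                      : // index as this sort.Search formulation. restartUserKey is a hypothetical
                      : // helper standing in for the inline varint/pointer decoding, and "sort"
                      : // is not currently imported by this file.
                      : func (i *blockIter) searchRestartsSketch(
                      :         key []byte, restartUserKey func(j int32) []byte,
                      : ) int32 {
                      :         // f(j) is true once the user key at restart j is >= the search key.
                      :         return int32(sort.Search(int(i.numRestarts), func(j int) bool {
                      :                 return i.cmp(key, restartUserKey(int32(j))) <= 0
                      :         }))
                      : }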
    1180             : 
    1181             : // First implements internalIterator.First, as documented in the pebble
    1182             : // package.
    1183           1 : func (i *blockIter) First() (*InternalKey, base.LazyValue) {
    1184           1 :         if invariants.Enabled && i.isDataInvalidated() {
    1185           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
    1186             :         }
    1187             : 
    1188           1 :         i.offset = 0
    1189           1 :         if !i.valid() {
    1190           1 :                 return nil, base.LazyValue{}
    1191           1 :         }
    1192           1 :         i.clearCache()
    1193           1 :         i.readEntry()
    1194           1 :         hiddenPoint := i.decodeInternalKey(i.key)
    1195           1 :         if hiddenPoint {
    1196           1 :                 return i.Next()
    1197           1 :         }
    1198           1 :         i.maybeReplaceSuffix(true /*allowInPlace*/)
    1199           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1200           1 :                 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
    1201           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val)
    1202           1 :         } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
    1203           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val[1:])
    1204           1 :         } else {
    1205           1 :                 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
    1206           1 :         }
    1207           1 :         return &i.ikey, i.lazyValue
    1208             : }
    1209             : 
    1210           1 : func decodeRestart(b []byte) int32 {
    1211           1 :         _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
    1212           1 :         return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
    1213           1 :                 uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
    1214           1 : }
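                      : 
                      : // [Editor's sketch, not part of the original block.go]: the inverse of
                      : // decodeRestart, shown to illustrate the layout it reads: a 4-byte
                      : // little-endian offset whose most significant bit is stolen for the
                      : // set-has-same-prefix flag (masked off above). Assumes offset < 1<<31,
                      : // which the 31-bit restart encoding already guarantees.
                      : func encodeRestartSketch(offset int32, setHasSamePrefix bool) [4]byte {
                      :         var b [4]byte
                      :         binary.LittleEndian.PutUint32(b[:], uint32(offset))
                      :         if setHasSamePrefix {
                      :                 b[3] |= 0x80 // the stolen high bit of the high byte
                      :         }
                      :         return b
                      : }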
    1215             : 
    1216             : // Last implements internalIterator.Last, as documented in the pebble package.
    1217           1 : func (i *blockIter) Last() (*InternalKey, base.LazyValue) {
    1218           1 :         if invariants.Enabled && i.isDataInvalidated() {
    1219           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
    1220             :         }
    1221             : 
    1222             :         // Seek forward from the last restart point.
    1223           1 :         i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
    1224           1 :         if !i.valid() {
    1225           1 :                 return nil, base.LazyValue{}
    1226           1 :         }
    1227             : 
    1228           1 :         i.readEntry()
    1229           1 :         i.clearCache()
    1230           1 : 
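                      :         // [Editor's aside] Walk forward from the last restart point, caching
                      :         // each entry so that subsequent Prev calls can step backward through
                      :         // the block without re-seeking.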
    1231           1 :         for i.nextOffset < i.restarts {
    1232           1 :                 i.cacheEntry()
    1233           1 :                 i.offset = i.nextOffset
    1234           1 :                 i.readEntry()
    1235           1 :         }
    1236             : 
    1237           1 :         hiddenPoint := i.decodeInternalKey(i.key)
    1238           1 :         if hiddenPoint {
    1239           1 :                 return i.Prev()
    1240           1 :         }
    1241           1 :         i.maybeReplaceSuffix(false /*allowInPlace*/)
    1242           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1243           1 :                 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
    1244           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val)
    1245           1 :         } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
    1246           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val[1:])
    1247           1 :         } else {
    1248           1 :                 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
    1249           1 :         }
    1250           1 :         return &i.ikey, i.lazyValue
    1251             : }
    1252             : 
    1253             : // Next implements internalIterator.Next, as documented in the pebble
    1254             : // package.
    1255           1 : func (i *blockIter) Next() (*InternalKey, base.LazyValue) {
    1256           1 :         if len(i.cachedBuf) > 0 {
    1257           1 :                 // We're switching from reverse iteration to forward iteration. We need to
    1258           1 :                 // populate i.fullKey with the current key we're positioned at so that
    1259           1 :                 // readEntry() can use i.fullKey for key prefix decompression. Note that we
    1260           1 :                 // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
    1261           1 :                 // SeekLT was the previous call, i.key may be backed by i.fullKey), but
    1262           1 :                 // copying into i.fullKey works for both cases.
    1263           1 :                 //
    1264           1 :                 // TODO(peter): Rather than clearing the cache, we could instead use the
    1265           1 :                 // cache until it is exhausted. This would likely be faster than falling
    1266           1 :                 // through to the normal forward iteration code below.
    1267           1 :                 i.fullKey = append(i.fullKey[:0], i.key...)
    1268           1 :                 i.clearCache()
    1269           1 :         }
    1270             : 
    1271             : start:
    1272           1 :         i.offset = i.nextOffset
    1273           1 :         if !i.valid() {
    1274           1 :                 return nil, base.LazyValue{}
    1275           1 :         }
    1276           1 :         i.readEntry()
    1277           1 :         // Manually inlined version of i.decodeInternalKey(i.key).
    1278           1 :         if n := len(i.key) - 8; n >= 0 {
    1279           1 :                 trailer := binary.LittleEndian.Uint64(i.key[n:])
    1280           1 :                 hiddenPoint := i.hideObsoletePoints &&
    1281           1 :                         (trailer&trailerObsoleteBit != 0)
    1282           1 :                 i.ikey.Trailer = trailer & trailerObsoleteMask
    1283           1 :                 i.ikey.UserKey = i.key[:n:n]
    1284           1 :                 if i.globalSeqNum != 0 {
    1285           1 :                         i.ikey.SetSeqNum(i.globalSeqNum)
    1286           1 :                 }
    1287           1 :                 if hiddenPoint {
    1288           1 :                         goto start
    1289             :                 }
    1290           1 :                 if i.syntheticSuffix != nil {
    1291           1 :                         // Inlined version of i.maybeReplaceSuffix(true /* allowInPlace */)
    1292           1 :                         prefixLen := i.split(i.ikey.UserKey)
    1293           1 :                         if cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) {
    1294           1 :                                 i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...)
    1295           1 :                         } else {
    1296           1 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
    1297           1 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
    1298           1 :                                 i.ikey.UserKey = i.synthSuffixBuf
    1299           1 :                         }
    1300             :                 }
    1301           0 :         } else {
    1302           0 :                 i.ikey.Trailer = uint64(InternalKeyKindInvalid)
    1303           0 :                 i.ikey.UserKey = nil
    1304           0 :         }
    1305           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1306           1 :                 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
    1307           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val)
    1308           1 :         } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
    1309           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val[1:])
    1310           1 :         } else {
    1311           1 :                 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
    1312           1 :         }
    1313           1 :         return &i.ikey, i.lazyValue
    1314             : }
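                      : 
                      : // [Editor's sketch, not part of the original block.go]: the inlined decode
                      : // in Next splits an internal key into its user key and 8-byte
                      : // little-endian trailer, and strips the obsolete-point bit. Names and the
                      : // multi-return shape here are illustrative.
                      : func splitInternalKeySketch(encoded []byte) (userKey []byte, trailer uint64, hidden bool, ok bool) {
                      :         n := len(encoded) - 8
                      :         if n < 0 {
                      :                 return nil, 0, false, false // too short to hold a trailer
                      :         }
                      :         trailer = binary.LittleEndian.Uint64(encoded[n:])
                      :         hidden = trailer&trailerObsoleteBit != 0 // obsolete marker
                      :         trailer &= trailerObsoleteMask           // strip the marker bit
                      :         // Full slice expression: cap(userKey) == n, so a later append (e.g.
                      :         // the synthetic-suffix path) reallocates instead of overwriting the
                      :         // trailer bytes that follow in the same backing array.
                      :         return encoded[:n:n], trailer, hidden, true
                      : }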
    1315             : 
    1316             : // NextPrefix implements (base.InternalIterator).NextPrefix.
    1317           1 : func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
    1318           1 :         if i.lazyValueHandling.hasValuePrefix {
    1319           1 :                 return i.nextPrefixV3(succKey)
    1320           1 :         }
    1321           1 :         const nextsBeforeSeek = 3
    1322           1 :         k, v := i.Next()
    1323           1 :         for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ {
    1324           1 :                 if j >= nextsBeforeSeek {
    1325           1 :                         return i.SeekGE(succKey, base.SeekGEFlagsNone)
    1326           1 :                 }
    1327           1 :                 k, v = i.Next()
    1328             :         }
    1329           1 :         return k, v
    1330             : }
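                      : 
                      : // [Editor's sketch, not part of the original block.go]: the shape of the
                      : // heuristic above in isolation: take a small, bounded number of Next
                      : // steps before paying for a seek, since a seek costs a binary search.
                      : // The function-valued parameters are hypothetical stand-ins.
                      : func nextPrefixViaNextsSketch(
                      :         next func() ([]byte, bool),
                      :         seekGE func(k []byte) ([]byte, bool),
                      :         cmp func(a, b []byte) int,
                      :         succKey []byte,
                      : ) ([]byte, bool) {
                      :         const nextsBeforeSeek = 3
                      :         k, ok := next()
                      :         for j := 1; ok && cmp(k, succKey) < 0; j++ {
                      :                 if j >= nextsBeforeSeek {
                      :                         return seekGE(succKey) // fall back to binary search
                      :                 }
                      :                 k, ok = next()
                      :         }
                      :         return k, ok
                      : }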
    1331             : 
    1332           1 : func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) {
    1333           1 :         // Doing nexts that involve a key comparison can be expensive (and the cost
    1334           1 :         // depends on the key length), so we use the same threshold of 3 that we use
     1335           1 :         // for TableFormatPebblev2 in blockIter.NextPrefix above. The next fast path
    1336           1 :         // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
    1337           1 :         // than doing a SeekGE within the block, so we do this 16 times
    1338           1 :         // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
    1339           1 :         // search for the restart consumes > 100ns. If the number of versions is >
    1340           1 :         // 17, we will increment nextFastCount to 17, then do a binary search, and
    1341           1 :         // on average need to find a key between two restarts, so another 8 steps
    1342           1 :         // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
    1343           1 :         // steps.
    1344           1 :         //
    1345           1 :         // TODO(sumeer): use the configured restartInterval for the sstable when it
    1346           1 :         // was written (which we don't currently store) instead of the default value
    1347           1 :         // of 16.
    1348           1 :         const nextCmpThresholdBeforeSeek = 3
    1349           1 :         const nextFastThresholdBeforeRestarts = 16
    1350           1 :         nextCmpCount := 0
    1351           1 :         nextFastCount := 0
    1352           1 :         usedRestarts := false
    1353           1 :         // INVARIANT: blockIter is valid.
    1354           1 :         if invariants.Enabled && !i.valid() {
    1355           0 :                 panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
    1356             :         }
    1357           1 :         prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet
    1358           1 :         for {
    1359           1 :                 i.offset = i.nextOffset
    1360           1 :                 if !i.valid() {
    1361           1 :                         return nil, base.LazyValue{}
    1362           1 :                 }
    1363             :                 // Need to decode the length integers, so we can compute nextOffset.
    1364           1 :                 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
    1365           1 :                 // This is an ugly performance hack. Reading entries from blocks is one of
    1366           1 :                 // the inner-most routines and decoding the 3 varints per-entry takes
     1367           1 :                 // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
    1368           1 :                 // us, so we do it manually. This provides a 10-15% performance improvement
    1369           1 :                 // on blockIter benchmarks on both go1.11 and go1.12.
    1370           1 :                 //
    1371           1 :                 // TODO(peter): remove this hack if go:inline is ever supported.
    1372           1 : 
    1373           1 :                 // Decode the shared key length integer.
    1374           1 :                 var shared uint32
    1375           1 :                 if a := *((*uint8)(ptr)); a < 128 {
    1376           1 :                         shared = uint32(a)
    1377           1 :                         ptr = unsafe.Pointer(uintptr(ptr) + 1)
    1378           1 :                 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
    1379           0 :                         shared = uint32(b)<<7 | uint32(a)
    1380           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 2)
    1381           0 :                 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
    1382           0 :                         shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1383           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 3)
    1384           0 :                 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
    1385           0 :                         shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1386           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 4)
    1387           0 :                 } else {
    1388           0 :                         d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
    1389           0 :                         shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1390           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 5)
    1391           0 :                 }
    1392             :                 // Decode the unshared key length integer.
    1393           1 :                 var unshared uint32
    1394           1 :                 if a := *((*uint8)(ptr)); a < 128 {
    1395           1 :                         unshared = uint32(a)
    1396           1 :                         ptr = unsafe.Pointer(uintptr(ptr) + 1)
    1397           1 :                 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
    1398           0 :                         unshared = uint32(b)<<7 | uint32(a)
    1399           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 2)
    1400           0 :                 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
    1401           0 :                         unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1402           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 3)
    1403           0 :                 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
    1404           0 :                         unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1405           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 4)
    1406           0 :                 } else {
    1407           0 :                         d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
    1408           0 :                         unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1409           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 5)
    1410           0 :                 }
    1411             :                 // Decode the value length integer.
    1412           1 :                 var value uint32
    1413           1 :                 if a := *((*uint8)(ptr)); a < 128 {
    1414           1 :                         value = uint32(a)
    1415           1 :                         ptr = unsafe.Pointer(uintptr(ptr) + 1)
    1416           1 :                 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
    1417           0 :                         value = uint32(b)<<7 | uint32(a)
    1418           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 2)
    1419           0 :                 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
    1420           0 :                         value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1421           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 3)
    1422           0 :                 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
    1423           0 :                         value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1424           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 4)
    1425           0 :                 } else {
    1426           0 :                         d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
    1427           0 :                         value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1428           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 5)
    1429           0 :                 }
    1430             :                 // The starting position of the value.
    1431           1 :                 valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
    1432           1 :                 i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
    1433           1 :                 if invariants.Enabled && unshared < 8 {
    1434           0 :                         // This should not happen since only the key prefix is shared, so even
    1435           0 :                         // if the prefix length is the same as the user key length, the unshared
    1436           0 :                         // will include the trailer.
    1437           0 :                         panic(errors.AssertionFailedf("unshared %d is too small", unshared))
    1438             :                 }
    1439             :                 // The trailer is written in little endian, so the key kind is the first
    1440             :                 // byte in the trailer that is encoded in the slice [unshared-8:unshared].
    1441           1 :                 keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
    1442           1 :                 keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
    1443           1 :                 prefixChanged := false
    1444           1 :                 if keyKind == InternalKeyKindSet {
    1445           1 :                         if invariants.Enabled && value == 0 {
    1446           0 :                                 panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
    1447             :                         }
    1448           1 :                         valPrefix := *((*valuePrefix)(valuePtr))
    1449           1 :                         if setHasSamePrefix(valPrefix) {
    1450           1 :                                 // Fast-path. No need to assemble i.fullKey, or update i.key. We know
    1451           1 :                                 // that subsequent keys will not have a shared length that is greater
    1452           1 :                                 // than the prefix of the current key, which is also the prefix of
    1453           1 :                                 // i.key. Since we are continuing to iterate, we don't need to
    1454           1 :                                 // initialize i.ikey and i.lazyValue (these are initialized before
    1455           1 :                                 // returning).
    1456           1 :                                 nextFastCount++
    1457           1 :                                 if nextFastCount > nextFastThresholdBeforeRestarts {
    1458           0 :                                         if usedRestarts {
    1459           0 :                                                 // Exhausted iteration budget. This will never happen unless
    1460           0 :                                                 // someone is using a restart interval > 16. It is just to guard
    1461           0 :                                                 // against long restart intervals causing too much iteration.
    1462           0 :                                                 break
    1463             :                                         }
    1464             :                                         // Haven't used restarts yet, so find the first restart at or beyond
    1465             :                                         // the current offset.
    1466           0 :                                         targetOffset := i.offset
    1467           0 :                                         var index int32
    1468           0 :                                         {
     1469           0 :                                                 // NB: manually inlined sort.Search is ~5% faster.
    1470           0 :                                                 //
    1471           0 :                                                 // f defined for a restart point is true iff the offset >=
    1472           0 :                                                 // targetOffset.
    1473           0 :                                                 // Define f(-1) == false and f(i.numRestarts) == true.
    1474           0 :                                                 // Invariant: f(index-1) == false, f(upper) == true.
    1475           0 :                                                 upper := i.numRestarts
    1476           0 :                                                 for index < upper {
    1477           0 :                                                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
    1478           0 :                                                         // index ≤ h < upper
    1479           0 :                                                         offset := decodeRestart(i.data[i.restarts+4*h:])
    1480           0 :                                                         if offset < targetOffset {
    1481           0 :                                                                 index = h + 1 // preserves f(index-1) == false
    1482           0 :                                                         } else {
    1483           0 :                                                                 upper = h // preserves f(upper) == true
    1484           0 :                                                         }
    1485             :                                                 }
    1486             :                                                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
    1487             :                                                 // => answer is index.
    1488             :                                         }
    1489           0 :                                         usedRestarts = true
    1490           0 :                                         nextFastCount = 0
    1491           0 :                                         if index == i.numRestarts {
    1492           0 :                                                 // Already past the last real restart, so iterate a bit more until
    1493           0 :                                                 // we are done with the block.
    1494           0 :                                                 continue
    1495             :                                         }
    1496             :                                         // Have some real restarts after index. NB: index is the first
    1497             :                                         // restart at or beyond the current offset.
    1498           0 :                                         startingIndex := index
    1499           0 :                                         for index != i.numRestarts &&
    1500           0 :                                                 // The restart at index is 4 bytes written in little endian format
    1501           0 :                                                 // starting at i.restart+4*index. The 0th byte is the least
    1502           0 :                                                 // significant and the 3rd byte is the most significant. Since the
    1503           0 :                                                 // most significant bit of the 3rd byte is what we use for
    1504           0 :                                                 // encoding the set-has-same-prefix information, the indexing
    1505           0 :                                                 // below has +3.
    1506           0 :                                                 i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
    1507           0 :                                                 // We still have the same prefix, so move to the next restart.
    1508           0 :                                                 index++
    1509           0 :                                         }
    1510             :                                         // index is the first restart that did not have the same prefix.
    1511           0 :                                         if index != startingIndex {
    1512           0 :                                                 // Managed to skip past at least one restart. Resume iteration
    1513           0 :                                                 // from index-1. Since nextFastCount has been reset to 0, we
    1514           0 :                                                 // should be able to iterate to the next prefix.
    1515           0 :                                                 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
    1516           0 :                                                 i.readEntry()
    1517           0 :                                         }
    1518             :                                         // Else, unable to skip past any restart. Resume iteration. Since
    1519             :                                         // nextFastCount has been reset to 0, we should be able to iterate
    1520             :                                         // to the next prefix.
    1521           0 :                                         continue
    1522             :                                 }
    1523           1 :                                 continue
    1524           1 :                         } else if prevKeyIsSet {
    1525           1 :                                 prefixChanged = true
    1526           1 :                         }
    1527           1 :                 } else {
    1528           1 :                         prevKeyIsSet = false
    1529           1 :                 }
    1530             :                 // Slow-path cases:
    1531             :                 // - (Likely) The prefix has changed.
    1532             :                 // - (Unlikely) The prefix has not changed.
    1533             :                 // We assemble the key etc. under the assumption that it is the likely
    1534             :                 // case.
    1535           1 :                 unsharedKey := getBytes(ptr, int(unshared))
    1536           1 :                 // TODO(sumeer): move this into the else block below. This is a bit tricky
    1537           1 :                 // since the current logic assumes we have always copied the latest key
    1538           1 :                 // into fullKey, which is why when we get to the next key we can (a)
    1539           1 :                 // access i.fullKey[:shared], (b) append only the unsharedKey to
    1540           1 :                 // i.fullKey. For (a), we can access i.key[:shared] since that memory is
    1541           1 :                 // valid (even if unshared). For (b), we will need to remember whether
    1542           1 :         // i.key refers to i.fullKey or not: we can append just the unsharedKey
    1543           1 :         // in the former case, whereas in the latter case we must copy the shared part
    1544           1 :                 // too. This same comment applies to the other place where we can do this
    1545           1 :                 // optimization, in readEntry().
    1546           1 :                 i.fullKey = append(i.fullKey[:shared], unsharedKey...)
    1547           1 :                 i.val = getBytes(valuePtr, int(value))
    1548           1 :                 if shared == 0 {
    1549           1 :                         // Provide stability for the key across positioning calls if the key
    1550           1 :                         // doesn't share a prefix with the previous key. This removes the need to
    1551           1 :                         // copy the key when the caller knows the block has a restart interval of
    1552           1 :                         // 1. An important example of this is range-del blocks.
    1553           1 :                         i.key = unsharedKey
    1554           1 :                 } else {
    1555           1 :                         i.key = i.fullKey
    1556           1 :                 }
    1557             :                 // Manually inlined version of i.decodeInternalKey(i.key).
    1558           1 :                 hiddenPoint := false
    1559           1 :                 if n := len(i.key) - 8; n >= 0 {
    1560           1 :                         trailer := binary.LittleEndian.Uint64(i.key[n:])
    1561           1 :                         hiddenPoint = i.hideObsoletePoints &&
    1562           1 :                                 (trailer&trailerObsoleteBit != 0)
    1563           1 :                         i.ikey.Trailer = trailer & trailerObsoleteMask
    1564           1 :                         i.ikey.UserKey = i.key[:n:n]
    1565           1 :                         if i.globalSeqNum != 0 {
    1566           0 :                                 i.ikey.SetSeqNum(i.globalSeqNum)
    1567           0 :                         }
    1568           1 :                         if i.syntheticSuffix != nil {
    1569           0 :                                 // Inlined version of i.maybeReplaceSuffix(true /* allowInPlace */)
    1570           0 :                                 prefixLen := i.split(i.ikey.UserKey)
    1571           0 :                                 if cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) {
    1572           0 :                                         i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...)
    1573           0 :                                 } else {
    1574           0 :                                         i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
    1575           0 :                                         i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
    1576           0 :                                         i.ikey.UserKey = i.synthSuffixBuf
    1577           0 :                                 }
    1578             :                         }
    1579           0 :                 } else {
    1580           0 :                         i.ikey.Trailer = uint64(InternalKeyKindInvalid)
    1581           0 :                         i.ikey.UserKey = nil
    1582           0 :                 }
    1583           1 :                 nextCmpCount++
    1584           1 :                 if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 {
    1585           0 :                         panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
    1586           0 :                                 i.ikey.UserKey, succKey))
    1587             :                 }
    1588           1 :                 if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 {
    1589           1 :                         // Prefix has changed.
    1590           1 :                         if hiddenPoint {
    1591           1 :                                 return i.Next()
    1592           1 :                         }
    1593           1 :                         if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
    1594           0 :                                 panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
    1595             :                         }
    1596           1 :                         if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
    1597           1 :                                 i.lazyValue = base.MakeInPlaceValue(i.val)
    1598           1 :                         } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
    1599           1 :                                 i.lazyValue = base.MakeInPlaceValue(i.val[1:])
    1600           1 :                         } else {
    1601           0 :                                 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
    1602           0 :                         }
    1603           1 :                         return &i.ikey, i.lazyValue
    1604             :                 }
    1605             :                 // Else prefix has not changed.
    1606             : 
    1607           1 :                 if nextCmpCount >= nextCmpThresholdBeforeSeek {
    1608           1 :                         break
    1609             :                 }
    1610             :         }
    1611           1 :         return i.SeekGE(succKey, base.SeekGEFlagsNone)
    1612             : }
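
// nextViaStepsThenSeek is an illustrative, standalone sketch (not part of
// block.go) of the pattern nextPrefixV3 ends with above: take a bounded
// number of cheap Next steps toward succKey, then fall back to a seek once
// the comparison budget (nextCmpThresholdBeforeSeek above) is exhausted.
// The iterator interface, budget parameter, and function name are
// assumptions made for exposition.
func nextViaStepsThenSeek(
        it interface {
                Next() ([]byte, bool)
                SeekGE(key []byte) ([]byte, bool)
        },
        succKey []byte,
        budget int,
        cmp func(a, b []byte) int,
) ([]byte, bool) {
        for n := 0; n < budget; n++ {
                k, ok := it.Next()
                if !ok {
                        // Iterator exhausted before reaching succKey.
                        return nil, false
                }
                if cmp(k, succKey) >= 0 {
                        // Reached the next prefix via cheap linear steps.
                        return k, true
                }
        }
        // Budget exhausted: many versions share the current prefix, so a
        // binary-search seek is now cheaper than continuing to step.
        return it.SeekGE(succKey)
}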
    1613             : 
    1614             : // Prev implements internalIterator.Prev, as documented in the pebble
    1615             : // package.
    1616           1 : func (i *blockIter) Prev() (*InternalKey, base.LazyValue) {
    1617           1 : start:
    1618           1 :         for n := len(i.cached) - 1; n >= 0; n-- {
    1619           1 :                 i.nextOffset = i.offset
    1620           1 :                 e := &i.cached[n]
    1621           1 :                 i.offset = e.offset
    1622           1 :                 i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
    1623           1 :                 // Manually inlined version of i.decodeInternalKey(i.key).
    1624           1 :                 i.key = i.cachedBuf[e.keyStart:e.keyEnd]
    1625           1 :                 if n := len(i.key) - 8; n >= 0 {
    1626           1 :                         trailer := binary.LittleEndian.Uint64(i.key[n:])
    1627           1 :                         hiddenPoint := i.hideObsoletePoints &&
    1628           1 :                                 (trailer&trailerObsoleteBit != 0)
    1629           1 :                         if hiddenPoint {
    1630           1 :                                 continue
    1631             :                         }
    1632           1 :                         i.ikey.Trailer = trailer & trailerObsoleteMask
    1633           1 :                         i.ikey.UserKey = i.key[:n:n]
    1634           1 :                         if i.globalSeqNum != 0 {
    1635           1 :                                 i.ikey.SetSeqNum(i.globalSeqNum)
    1636           1 :                         }
    1637           1 :                         if i.syntheticSuffix != nil {
    1638           1 :                                 // Inlined version of i.maybeReplaceSuffix(false /* allowInPlace */)
    1639           1 :                                 prefixLen := i.split(i.ikey.UserKey)
    1640           1 :                         // If ikey is cached or may get cached, we must copy UserKey into a
    1641           1 :                         // fresh buffer before suffix replacement.
    1642           1 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
    1643           1 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
    1644           1 :                                 i.ikey.UserKey = i.synthSuffixBuf
    1645           1 :                         }
    1646           0 :                 } else {
    1647           0 :                         i.ikey.Trailer = uint64(InternalKeyKindInvalid)
    1648           0 :                         i.ikey.UserKey = nil
    1649           0 :                 }
    1650           1 :                 i.cached = i.cached[:n]
    1651           1 :                 if !i.lazyValueHandling.hasValuePrefix ||
    1652           1 :                         base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
    1653           1 :                         i.lazyValue = base.MakeInPlaceValue(i.val)
    1654           1 :                 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
    1655           1 :                         i.lazyValue = base.MakeInPlaceValue(i.val[1:])
    1656           1 :                 } else {
    1657           1 :                         i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
    1658           1 :                 }
    1659           1 :                 return &i.ikey, i.lazyValue
    1660             :         }
    1661             : 
    1662           1 :         i.clearCache()
    1663           1 :         if i.offset <= 0 {
    1664           1 :                 i.offset = -1
    1665           1 :                 i.nextOffset = 0
    1666           1 :                 return nil, base.LazyValue{}
    1667           1 :         }
    1668             : 
    1669           1 :         targetOffset := i.offset
    1670           1 :         var index int32
    1671           1 : 
    1672           1 :         {
    1673           1 :                 // NB: manually inlined sort.Search is ~5% faster.
    1674           1 :                 //
    1675           1 :                 // Define f(-1) == false and f(n) == true.
    1676           1 :                 // Invariant: f(index-1) == false, f(upper) == true.
    1677           1 :                 upper := i.numRestarts
    1678           1 :                 for index < upper {
    1679           1 :                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
    1680           1 :                         // index ≤ h < upper
    1681           1 :                         offset := decodeRestart(i.data[i.restarts+4*h:])
    1682           1 :                         if offset < targetOffset {
    1683           1 :                                 // Looking for the first restart that has offset >= targetOffset, so
    1684           1 :                                 // ignore h and earlier.
    1685           1 :                                 index = h + 1 // preserves f(i-1) == false
    1686           1 :                         } else {
    1687           1 :                                 upper = h // preserves f(j) == true
    1688           1 :                         }
    1689             :                 }
    1690             :                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
    1691             :                 // => answer is index.
    1692             :         }
    1693             : 
    1694             :         // index is the first restart with offset >= targetOffset. Note that
    1695             :         // targetOffset may not be at a restart point since one can call Prev()
    1696             :         // after Next() (so the cache was not populated) and targetOffset refers to
    1697             :         // the current entry. index-1 must have an offset < targetOffset (it can't
    1698             :         // be equal to targetOffset since the binary search would have selected that
    1699             :         // as the index).
    1700           1 :         i.offset = 0
    1701           1 :         if index > 0 {
    1702           1 :                 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
    1703           1 :         }
    1704             :         // TODO(sumeer): why is the else case not an error, given that targetOffset
    1705             :         // is a valid offset?
    1706             : 
    1707           1 :         i.readEntry()
    1708           1 : 
    1709           1 :         // We stop when i.nextOffset == targetOffset since the targetOffset is the
    1710           1 :         // entry we are stepping back from, and we don't need to cache the entry
    1711           1 :         // before it, since it is the candidate to return.
    1712           1 :         for i.nextOffset < targetOffset {
    1713           1 :                 i.cacheEntry()
    1714           1 :                 i.offset = i.nextOffset
    1715           1 :                 i.readEntry()
    1716           1 :         }
    1717             : 
    1718           1 :         hiddenPoint := i.decodeInternalKey(i.key)
    1719           1 :         if hiddenPoint {
    1720           1 :                 // Use the cache.
    1721           1 :                 goto start
    1722             :         }
    1723           1 :         if i.syntheticSuffix != nil {
    1724           1 :                 // Inlined version of i.maybeReplaceSuffix(false /* allowInPlace */)
    1725           1 :                 prefixLen := i.split(i.ikey.UserKey)
    1726           1 :                 // If ikey is cached or may get cached, we must copy UserKey into a
    1727           1 :                 // fresh buffer before suffix replacement.
    1728           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
    1729           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
    1730           1 :                 i.ikey.UserKey = i.synthSuffixBuf
    1731           1 :         }
    1732           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1733           1 :                 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
    1734           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val)
    1735           1 :         } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
    1736           1 :                 i.lazyValue = base.MakeInPlaceValue(i.val[1:])
    1737           1 :         } else {
    1738           1 :                 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
    1739           1 :         }
    1740           1 :         return &i.ikey, i.lazyValue
    1741             : }
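
// restartIndexForOffset is an explanatory stand-in (not part of block.go)
// for the binary search manually inlined in Prev above. Given already-decoded
// restart offsets, it returns the index of the first restart point whose
// offset is >= targetOffset, using the standard library (assumes
// `import "sort"`); the hand-inlined form above is the same algorithm, kept
// inline because it benchmarks ~5% faster.
func restartIndexForOffset(restartOffsets []int32, targetOffset int32) int {
        // Same invariant as the inlined loop: f(-1) == false, f(n) == true,
        // where f(h) is "restartOffsets[h] >= targetOffset".
        return sort.Search(len(restartOffsets), func(h int) bool {
                return restartOffsets[h] >= targetOffset
        })
}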
    1742             : 
    1743             : // Key implements internalIterator.Key, as documented in the pebble package.
    1744           1 : func (i *blockIter) Key() *InternalKey {
    1745           1 :         return &i.ikey
    1746           1 : }
    1747             : 
    1748           1 : func (i *blockIter) value() base.LazyValue {
    1749           1 :         return i.lazyValue
    1750           1 : }
    1751             : 
    1752             : // Error implements internalIterator.Error, as documented in the pebble
    1753             : // package.
    1754           1 : func (i *blockIter) Error() error {
    1755           1 :         return nil // infallible
    1756           1 : }
    1757             : 
    1758             : // Close implements internalIterator.Close, as documented in the pebble
    1759             : // package.
    1760           1 : func (i *blockIter) Close() error {
    1761           1 :         i.handle.Release()
    1762           1 :         i.handle = bufferHandle{}
    1763           1 :         i.val = nil
    1764           1 :         i.lazyValue = base.LazyValue{}
    1765           1 :         i.lazyValueHandling.vbr = nil
    1766           1 :         return nil
    1767           1 : }
    1768             : 
    1769           0 : func (i *blockIter) SetBounds(lower, upper []byte) {
    1770           0 :         // This should never be called as bounds are handled by sstable.Iterator.
    1771           0 :         panic("pebble: SetBounds unimplemented")
    1772             : }
    1773             : 
    1774           0 : func (i *blockIter) SetContext(_ context.Context) {}
    1775             : 
    1776           1 : func (i *blockIter) valid() bool {
    1777           1 :         return i.offset >= 0 && i.offset < i.restarts
    1778           1 : }
    1779             : 
    1780             : // fragmentBlockIter wraps a blockIter, implementing the
    1781             : // keyspan.FragmentIterator interface. It's used for reading range deletion and
    1782             : // range key blocks.
    1783             : //
    1784             : // Range deletions and range keys are fragmented before they're persisted to the
    1785             : // block. Overlapping fragments have identical bounds.  The fragmentBlockIter
    1786             : // gathers all the fragments with identical bounds within a block and returns a
    1787             : // single keyspan.Span describing all the keys defined over the span.
    1788             : //
    1789             : // # Memory lifetime
    1790             : //
    1791             : // A Span returned by fragmentBlockIter is only guaranteed to be stable until
    1792             : // the next fragmentBlockIter iteration positioning method. A Span's Keys slice
    1793             : // may be reused, so the user must not assume it's stable.
    1794             : //
    1795             : // Blocks holding range deletions and range keys are configured to use a restart
    1796             : // interval of 1. This provides key stability. The caller may treat the various
    1797             : // byte slices (start, end, suffix, value) as stable for the lifetime of the
    1798             : // iterator.
    1799             : type fragmentBlockIter struct {
    1800             :         blockIter blockIter
    1801             :         keyBuf    [2]keyspan.Key
    1802             :         span      keyspan.Span
    1803             :         dir       int8
    1804             :         closeHook func(i keyspan.FragmentIterator) error
    1805             : 
    1806             :         // elideSameSeqnum, if true, keeps only the first-occurring (in forward
    1807             :         // order) Key for each sequence number.
    1808             :         elideSameSeqnum bool
    1809             : }
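
// Per the memory-lifetime contract documented above, a caller that needs a
// Span's Keys beyond the next positioning call must copy them first. A
// minimal, hypothetical caller-side helper (not part of block.go):
func copySpanKeys(keys []keyspan.Key) []keyspan.Key {
        // The iterator may reuse the backing slice on the next positioning
        // call, so clone it.
        return append([]keyspan.Key(nil), keys...)
}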
    1810             : 
    1811           1 : func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
    1812           1 :         return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
    1813           1 : }
    1814             : 
    1815           1 : func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) error {
    1816           1 :         // TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
    1817           1 :         // calls to Decode is too confusing and subtle. Refactor to make it
    1818           1 :         // explicit.
    1819           1 : 
    1820           1 :         // Decode the contents of the fragment's value. This always includes at
    1821           1 :         // least the end key: RANGEDELs store the end key directly as the value,
    1822           1 :         // whereas the various range key kinds are more complicated.  The
    1823           1 :         // details of the range key internal value format are documented within the
    1824           1 :         // internal/rangekey package.
    1825           1 :         var err error
    1826           1 :         switch k.Kind() {
    1827           1 :         case base.InternalKeyKindRangeDelete:
    1828           1 :                 i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
    1829           1 :         case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
    1830           1 :                 i.span, err = rangekey.Decode(*k, internalValue, i.span.Keys)
    1831           0 :         default:
    1832           0 :                 i.span = keyspan.Span{}
    1833           0 :                 err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind())
    1834             :         }
    1835           1 :         return err
    1836             : }
    1837             : 
    1838           1 : func (i *fragmentBlockIter) elideKeysOfSameSeqNum() {
    1839           1 :         if invariants.Enabled {
    1840           1 :                 if !i.elideSameSeqnum || len(i.span.Keys) == 0 {
    1841           0 :                         panic("elideKeysOfSameSeqNum called when it should not be")
    1842             :                 }
    1843             :         }
    1844           1 :         lastSeqNum := i.span.Keys[0].SeqNum()
    1845           1 :         k := 1
    1846           1 :         for j := 1; j < len(i.span.Keys); j++ {
    1847           1 :                 if lastSeqNum != i.span.Keys[j].SeqNum() {
    1848           1 :                         lastSeqNum = i.span.Keys[j].SeqNum()
    1849           1 :                         i.span.Keys[k] = i.span.Keys[j]
    1850           1 :                         k++
    1851           1 :                 }
    1852             :         }
    1853           1 :         i.span.Keys = i.span.Keys[:k]
    1854             : }
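
// To make elideKeysOfSameSeqNum concrete: if the span's keys carry sequence
// numbers [7, 7, 7, 5, 3, 3] (keys are sorted by trailer, so equal seqnums
// are adjacent), the loop above keeps the first key of each run, leaving
// [7, 5, 3]. The same in-place compaction over bare uint64s, as a standalone
// sketch (hypothetical helper, not part of block.go):
func elideSameSeqNums(seqNums []uint64) []uint64 {
        if len(seqNums) == 0 {
                return seqNums
        }
        k := 1
        for j := 1; j < len(seqNums); j++ {
                if seqNums[j] != seqNums[k-1] {
                        // seqNums[k-1] is the last kept value; index j
                        // starts a new run, so keep it.
                        seqNums[k] = seqNums[j]
                        k++
                }
        }
        return seqNums[:k]
}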
    1855             : 
    1856             : // gatherForward gathers internal keys with identical bounds. Keys defined over
    1857             : // spans of the keyspace are fragmented such that any overlapping key spans have
    1858             : // identical bounds. When these spans are persisted to a range deletion or range
    1859             : // key block, they may be persisted as multiple internal keys in order to encode
    1860             : // multiple sequence numbers or key kinds.
    1861             : //
    1862             : // gatherForward iterates forward, re-combining the fragmented internal keys to
    1863             : // reconstruct a keyspan.Span that holds all the keys defined over the span.
    1864             : func (i *fragmentBlockIter) gatherForward(
    1865             :         k *InternalKey, lazyValue base.LazyValue,
    1866           1 : ) (*keyspan.Span, error) {
    1867           1 :         i.span = keyspan.Span{}
    1868           1 :         if k == nil || !i.blockIter.valid() {
    1869           1 :                 return nil, nil
    1870           1 :         }
    1871             :         // Use the i.keyBuf array to back the Keys slice to prevent an allocation
    1872             :         // when a span contains few keys.
    1873           1 :         i.span.Keys = i.keyBuf[:0]
    1874           1 : 
    1875           1 :         // Decode the span's end key and individual keys from the value.
    1876           1 :         internalValue := lazyValue.InPlaceValue()
    1877           1 :         if err := i.decodeSpanKeys(k, internalValue); err != nil {
    1878           0 :                 return nil, err
    1879           0 :         }
    1880           1 :         prevEnd := i.span.End
    1881           1 : 
    1882           1 :         // There might exist additional internal keys with identical bounds encoded
    1883           1 :         // within the block. Iterate forward, accumulating all the keys with
    1884           1 :         // identical bounds into i.span.
    1885           1 :         k, lazyValue = i.blockIter.Next()
    1886           1 :         internalValue = lazyValue.InPlaceValue()
    1887           1 :         for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
    1888           1 :                 if err := i.decodeSpanKeys(k, internalValue); err != nil {
    1889           0 :                         return nil, err
    1890           0 :                 }
    1891             : 
    1892             :                 // Since k indicates an equal start key, the encoded end key must
    1893             :                 // exactly equal the original end key from the first internal key.
    1894             :                 // Overlapping fragments are required to have exactly equal start and
    1895             :                 // end bounds.
    1896           1 :                 if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
    1897           0 :                         i.span = keyspan.Span{}
    1898           0 :                         return nil, base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
    1899           0 :                 }
    1900           1 :                 k, lazyValue = i.blockIter.Next()
    1901           1 :                 internalValue = lazyValue.InPlaceValue()
    1902             :         }
    1903           1 :         if i.elideSameSeqnum && len(i.span.Keys) > 0 {
    1904           1 :                 i.elideKeysOfSameSeqNum()
    1905           1 :         }
    1906             :         // i.blockIter is positioned over the first internal key for the next span.
    1907           1 :         return &i.span, nil
    1908             : }
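
// A concrete picture of what gatherForward reassembles: three overlapping
// range keys written at seqnums 9, 8, and 7 over [b, c) are fragmented to
// identical bounds and stored as three consecutive block entries; gathering
// folds them back into a single span carrying all three keys. A standalone
// sketch with stand-in types (keyspan.Span lives in internal/keyspan; the
// names below are illustrative only):
type fragment struct {
        start, end string
        seqNum     uint64
}

type gatheredSpan struct {
        start, end string
        seqNums    []uint64
}

// gatherFragments folds consecutive fragments with identical bounds into
// single spans, mirroring the accumulation loop of gatherForward (minus its
// corruption checks).
func gatherFragments(frags []fragment) []gatheredSpan {
        var out []gatheredSpan
        for _, f := range frags {
                n := len(out)
                if n > 0 && out[n-1].start == f.start && out[n-1].end == f.end {
                        out[n-1].seqNums = append(out[n-1].seqNums, f.seqNum)
                        continue
                }
                out = append(out, gatheredSpan{
                        start:   f.start,
                        end:     f.end,
                        seqNums: []uint64{f.seqNum},
                })
        }
        return out
}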
    1909             : 
    1910             : // gatherBackward gathers internal keys with identical bounds. Keys defined over
    1911             : // spans of the keyspace are fragmented such that any overlapping key spans have
    1912             : // identical bounds. When these spans are persisted to a range deletion or range
    1913             : // key block, they may be persisted as multiple internal keys in order to encode
    1914             : // multiple sequence numbers or key kinds.
    1915             : //
    1916             : // gatherBackward iterates backwards, re-combining the fragmented internal keys
    1917             : // to reconstruct a keyspan.Span that holds all the keys defined over the span.
    1918             : func (i *fragmentBlockIter) gatherBackward(
    1919             :         k *InternalKey, lazyValue base.LazyValue,
    1920           1 : ) (*keyspan.Span, error) {
    1921           1 :         i.span = keyspan.Span{}
    1922           1 :         if k == nil || !i.blockIter.valid() {
    1923           1 :                 return nil, nil
    1924           1 :         }
    1925             :         // Use the i.keyBuf array to back the Keys slice to prevent an allocation
    1926             :         // when a span contains few keys.
    1927           1 :         i.span.Keys = i.keyBuf[:0]
    1928           1 : 
    1929           1 :         // Decode the span's end key and individual keys from the value.
    1930           1 :         internalValue := lazyValue.InPlaceValue()
    1931           1 :         if err := i.decodeSpanKeys(k, internalValue); err != nil {
    1932           0 :                 return nil, err
    1933           0 :         }
    1934           1 :         prevEnd := i.span.End
    1935           1 : 
    1936           1 :         // There might exist additional internal keys with identical bounds encoded
    1937           1 :         // within the block. Iterate backward, accumulating all the keys with
    1938           1 :         // identical bounds into i.span.
    1939           1 :         k, lazyValue = i.blockIter.Prev()
    1940           1 :         internalValue = lazyValue.InPlaceValue()
    1941           1 :         for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
    1942           1 :                 if err := i.decodeSpanKeys(k, internalValue); err != nil {
    1943           0 :                         return nil, err
    1944           0 :                 }
    1945             : 
    1946             :                 // Since k indicates an equal start key, the encoded end key must
    1947             :                 // exactly equal the original end key from the first internal key.
    1948             :                 // Overlapping fragments are required to have exactly equal start and
    1949             :                 // end bounds.
    1950           1 :                 if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
    1951           0 :                         i.span = keyspan.Span{}
    1952           0 :                         return nil, base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
    1953           0 :                 }
    1954           1 :                 k, lazyValue = i.blockIter.Prev()
    1955           1 :                 internalValue = lazyValue.InPlaceValue()
    1956             :         }
    1957             :         // i.blockIter is positioned over the last internal key for the previous
    1958             :         // span.
    1959             : 
    1960             :         // Backwards iteration encounters internal keys in ascending-trailer order.
    1961           1 :         keyspan.SortKeysByTrailer(&i.span.Keys)
    1962           1 : 
    1963           1 :         if i.elideSameSeqnum && len(i.span.Keys) > 0 {
    1964           1 :                 i.elideKeysOfSameSeqNum()
    1965           1 :         }
    1966           1 :         return &i.span, nil
    1967             : }
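
// gatherBackward visits a span's fragments in the reverse of their stored
// order, so keys are appended with ascending trailers, while a Span's Keys
// must end up in the descending-trailer order that forward gathering
// produces naturally; hence the SortKeysByTrailer call above. A standalone
// stand-in over bare trailer values (illustrative only; assumes
// `import "sort"`):
func sortTrailersDescending(trailers []uint64) {
        sort.Slice(trailers, func(a, b int) bool {
                return trailers[a] > trailers[b]
        })
}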
    1968             : 
    1969             : // Close implements (keyspan.FragmentIterator).Close.
    1970           1 : func (i *fragmentBlockIter) Close() error {
    1971           1 :         var err error
    1972           1 :         if i.closeHook != nil {
    1973           0 :                 err = i.closeHook(i)
    1974           0 :         }
    1975           1 :         err = firstError(err, i.blockIter.Close())
    1976           1 :         return err
    1977             : }
    1978             : 
    1979             : // First implements (keyspan.FragmentIterator).First
    1980           1 : func (i *fragmentBlockIter) First() (*keyspan.Span, error) {
    1981           1 :         i.dir = +1
    1982           1 :         return i.gatherForward(i.blockIter.First())
    1983           1 : }
    1984             : 
    1985             : // Last implements (keyspan.FragmentIterator).Last.
    1986           1 : func (i *fragmentBlockIter) Last() (*keyspan.Span, error) {
    1987           1 :         i.dir = -1
    1988           1 :         return i.gatherBackward(i.blockIter.Last())
    1989           1 : }
    1990             : 
    1991             : // Next implements (keyspan.FragmentIterator).Next.
    1992           1 : func (i *fragmentBlockIter) Next() (*keyspan.Span, error) {
    1993           1 :         switch {
    1994           1 :         case i.dir == -1 && !i.span.Valid():
    1995           1 :                 // Switching directions.
    1996           1 :                 //
    1997           1 :                 // i.blockIter is exhausted, before the first key. Move onto the first.
    1998           1 :                 i.blockIter.First()
    1999           1 :                 i.dir = +1
    2000           1 :         case i.dir == -1 && i.span.Valid():
    2001           1 :                 // Switching directions.
    2002           1 :                 //
    2003           1 :                 // i.blockIter is currently positioned over the last internal key for
    2004           1 :                 // the previous span. Next it once to move to the first internal key
    2005           1 :                 // that makes up the current span, and gatherForward to land on the
    2006           1 :                 // first internal key making up the next span.
    2007           1 :                 //
    2008           1 :                 // In the diagram below, if the last span returned to the user during
    2009           1 :                 // reverse iteration was [b,c), i.blockIter is currently positioned at
    2010           1 :                 // [a,b). The block iter must be positioned over [d,e) to gather the
    2011           1 :                 // next span's fragments.
    2012           1 :                 //
    2013           1 :                 //    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
    2014           1 :                 //          ^                       ^
    2015           1 :                 //     i.blockIter                 want
    2016           1 :                 if x, err := i.gatherForward(i.blockIter.Next()); err != nil {
    2017           0 :                         return nil, err
    2018           1 :                 } else if invariants.Enabled && !x.Valid() {
    2019           0 :                         panic("pebble: invariant violation: next entry unexpectedly invalid")
    2020             :                 }
    2021           1 :                 i.dir = +1
    2022             :         }
    2023             :         // We know that this blockIter has in-place values.
    2024           1 :         return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
    2025             : }
    2026             : 
    2027             : // Prev implements (keyspan.FragmentIterator).Prev.
    2028           1 : func (i *fragmentBlockIter) Prev() (*keyspan.Span, error) {
    2029           1 :         switch {
    2030           1 :         case i.dir == +1 && !i.span.Valid():
    2031           1 :                 // Switching directions.
    2032           1 :                 //
    2033           1 :                 // i.blockIter is exhausted, after the last key. Move onto the last.
    2034           1 :                 i.blockIter.Last()
    2035           1 :                 i.dir = -1
    2036           1 :         case i.dir == +1 && i.span.Valid():
    2037           1 :                 // Switching directions.
    2038           1 :                 //
    2039           1 :                 // i.blockIter is currently positioned over the first internal key for
    2040           1 :                 // the next span. Prev it once to move to the last internal key that
    2041           1 :                 // makes up the current span, and gatherBackward to land on the last
    2042           1 :                 // internal key making up the previous span.
    2043           1 :                 //
    2044           1 :                 // In the diagram below, if the last span returned to the user during
    2045           1 :                 // forward iteration was [b,c), i.blockIter is currently positioned at
    2046           1 :                 // [d,e). The block iter must be positioned over [a,b) to gather the
    2047           1 :                 // previous span's fragments.
    2048           1 :                 //
    2049           1 :                 //    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
    2050           1 :                 //          ^                       ^
    2051           1 :                 //        want                  i.blockIter
    2052           1 :                 if x, err := i.gatherBackward(i.blockIter.Prev()); err != nil {
    2053           0 :                         return nil, err
    2054           1 :                 } else if invariants.Enabled && !x.Valid() {
    2055           0 :                         panic("pebble: invariant violation: previous entry unexpectedly invalid")
    2056             :                 }
    2057           1 :                 i.dir = -1
    2058             :         }
    2059             :         // We know that this blockIter has in-place values.
    2060           1 :         return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
    2061             : }
    2062             : 
    2063             : // SeekGE implements (keyspan.FragmentIterator).SeekGE.
    2064           1 : func (i *fragmentBlockIter) SeekGE(k []byte) (*keyspan.Span, error) {
    2065           1 :         if s, err := i.SeekLT(k); err != nil {
    2066           0 :                 return nil, err
    2067           1 :         } else if s != nil && i.blockIter.cmp(k, s.End) < 0 {
    2068           1 :                 return s, nil
    2069           1 :         }
    2070             :         // TODO(jackson): If the above i.SeekLT(k) discovers a span but the span
    2071             :         // doesn't meet the k < s.End comparison, then there's no need for the
    2072             :         // SeekLT to gatherBackward.
    2073           1 :         return i.Next()
    2074             : }
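
// seekGEViaSeekLT is a standalone sketch (stand-in types, not part of
// block.go) of why SeekGE above can be answered by a SeekLT plus one
// containment check: over fragmented, non-overlapping half-open spans, the
// span containing k (if any) is the last span starting before k; if that
// span ends at or before k, the answer is the next span instead. Assumes
// `import "sort"`.
type interval struct{ start, end string }

func seekGEViaSeekLT(spans []interval, k string) (interval, bool) {
        // "SeekLT": index of the last span whose start is < k.
        j := sort.Search(len(spans), func(h int) bool {
                return spans[h].start >= k
        }) - 1
        if j >= 0 && k < spans[j].end {
                // The span found by SeekLT contains k.
                return spans[j], true
        }
        // "Next": the first span starting at or after k, if any.
        if j+1 < len(spans) {
                return spans[j+1], true
        }
        return interval{}, false
}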
    2075             : 
    2076             : // SeekLT implements (keyspan.FragmentIterator).SeekLT.
    2077           1 : func (i *fragmentBlockIter) SeekLT(k []byte) (*keyspan.Span, error) {
    2078           1 :         i.dir = -1
    2079           1 :         return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
    2080           1 : }
    2081             : 
    2082             : // String implements fmt.Stringer.
    2083           0 : func (i *fragmentBlockIter) String() string {
    2084           0 :         return "fragment-block-iter"
    2085           0 : }
    2086             : 
    2087             : // SetCloseHook implements sstable.FragmentIterator.
    2088           0 : func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
    2089           0 :         i.closeHook = fn
    2090           0 : }
    2091             : 
    2092             : // WrapChildren implements FragmentIterator.
    2093           0 : func (i *fragmentBlockIter) WrapChildren(wrap keyspan.WrapFn) {}

Generated by: LCOV version 1.14