LCOV - code coverage report
Current view: top level - pebble - merging_iter.go (source / functions) Hit Total Coverage
Test: 2024-07-12 08:16Z e762e056 - tests only.lcov Lines: 704 790 89.1 %
Date: 2024-07-12 08:17:09 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package pebble
       6             : 
       7             : import (
       8             :         "bytes"
       9             :         "context"
      10             :         "fmt"
      11             :         "runtime/debug"
      12             :         "unsafe"
      13             : 
      14             :         "github.com/cockroachdb/errors"
      15             :         "github.com/cockroachdb/pebble/internal/base"
      16             :         "github.com/cockroachdb/pebble/internal/invariants"
      17             :         "github.com/cockroachdb/pebble/internal/keyspan"
      18             :         "github.com/cockroachdb/pebble/internal/treeprinter"
      19             : )
      20             : 
      21             : type mergingIterLevel struct {
      22             :         index int
      23             :         iter  internalIterator
      24             :         // rangeDelIter is set to the range-deletion iterator for the level. When
      25             :         // configured with a levelIter, this pointer changes as sstable boundaries
      26             :         // are crossed. See levelIter.initRangeDel and the Range Deletions comment
      27             :         // below.
      28             :         rangeDelIter keyspan.FragmentIterator
      29             :         // rangeDelIterGeneration is incremented whenever rangeDelIter changes.
      30             :         rangeDelIterGeneration int
      31             :         // iterKV caches the current key-value pair iter points to.
      32             :         iterKV *base.InternalKV
      33             :         // levelIter is non-nil if this level's iter is ultimately backed by a
      34             :         // *levelIter. The handle in iter may have wrapped the levelIter with
      35             :         // intermediary internalIterator implementations.
      36             :         levelIter *levelIter
      37             : 
      38             :         // tombstone caches the tombstone rangeDelIter is currently pointed at. If
      39             :         // tombstone is nil, there are no further tombstones within the
      40             :         // current sstable in the current iterator direction. The cached tombstone is
      41             :         // only valid for the levels in the range [0,heap[0].index]. This avoids
      42             :         // positioning tombstones at lower levels which cannot possibly shadow the
      43             :         // current key.
      44             :         tombstone *keyspan.Span
      45             : }
      46             : 
      47           1 : func (ml *mergingIterLevel) setRangeDelIter(iter keyspan.FragmentIterator) {
      48           1 :         ml.tombstone = nil
      49           1 :         if ml.rangeDelIter != nil {
      50           1 :                 ml.rangeDelIter.Close()
      51           1 :         }
      52           1 :         ml.rangeDelIter = iter
      53           1 :         ml.rangeDelIterGeneration++
      54             : }
      55             : 
      56             : // mergingIter provides a merged view of multiple iterators from different
      57             : // levels of the LSM.
      58             : //
      59             : // The core of a mergingIter is a heap of internalIterators (see
      60             : // mergingIterHeap). The heap can operate as either a min-heap, used during
      61             : // forward iteration (First, SeekGE, Next) or a max-heap, used during reverse
      62             : // iteration (Last, SeekLT, Prev). The heap is initialized in calls to First,
      63             : // Last, SeekGE, and SeekLT. A call to Next or Prev takes the current top
      64             : // element on the heap, advances its iterator, and then "fixes" the heap
      65             : // property. When one of the child iterators is exhausted during Next/Prev
      66             : // iteration, it is removed from the heap.
      67             : //
      68             : // # Range Deletions
      69             : //
      70             : // A mergingIter can optionally be configured with a slice of range deletion
      71             : // iterators. The range deletion iterator slice must exactly parallel the point
      72             : // iterators and the range deletion iterator must correspond to the same level
      73             : // in the LSM as the point iterator. Note that each memtable and each table in
      74             : // L0 is a different "level" from the mergingIter perspective. So level 0 below
      75             : // does not correspond to L0 in the LSM.
      76             : //
      77             : // A range deletion iterator iterates over fragmented range tombstones. Range
      78             : // tombstones are fragmented by splitting them at any overlapping points. This
      79             : // fragmentation guarantees that within an sstable tombstones will either be
      80             : // distinct or will have identical start and end user keys. While range
      81             : // tombstones are fragmented within an sstable, the start and end keys are not truncated
      82             : // to sstable boundaries. This is necessary because the tombstone end key is
      83             : // exclusive and does not have a sequence number. Consider an sstable
      84             : // containing the range tombstone [a,c)#9 and the key "b#8". The tombstone must
      85             : // delete "b#8", yet older versions of "b" might spill over to the next
      86             : // sstable. So the boundary key for this sstable must be "b#8". Adjusting the
      87             : // end key of tombstones to be optionally inclusive or contain a sequence
      88             : // number would be possible solutions (such solutions have potentially serious
      89             : // issues: tombstones have exclusive end keys since an inclusive deletion end can
      90             : // be converted to an exclusive one while the reverse transformation is not possible;
      91             : // the semantics of a sequence number for the end key of a range tombstone are murky).
      92             : //
      93             : // The approach taken here performs an
      94             : // implicit truncation of the tombstone to the sstable boundaries.
      95             : //
      96             : // During initialization of a mergingIter, the range deletion iterators for
      97             : // batches, memtables, and L0 tables are populated up front. Note that Batches
      98             : // and memtables index unfragmented tombstones.  Batch.newRangeDelIter() and
      99             : // memTable.newRangeDelIter() fragment and cache the tombstones on demand. The
     100             : // L1-L6 range deletion iterators are populated by levelIter. When configured
     101             : // to load range deletion iterators, whenever a levelIter loads a table it
     102             : // loads both the point iterator and the range deletion
     103             : // iterator. levelIter.rangeDelIter is configured to point to the right entry
     104             : // in mergingIter.levels. The effect of this setup is that
     105             : // mergingIter.levels[i].rangeDelIter always contains the fragmented range
     106             : // tombstone for the current table in level i that the levelIter has open.
     107             : //
     108             : // Another crucial mechanism of levelIter is that it materializes fake point
     109             : // entries for the table boundaries if the boundary is range deletion
     110             : // key. Consider a table that contains only a range tombstone [a-e)#10. The
     111             : // sstable boundaries for this table will be a#10,15 and
     112             : // e#72057594037927935,15. During forward iteration levelIter will return
     113             : // e#72057594037927935,15 as a key. During reverse iteration levelIter will
     114             : // return a#10,15 as a key. These sentinel keys act as bookends to point
     115             : // iteration and allow mergingIter to keep a table and its associated range
     116             : // tombstones loaded as long as there are keys at lower levels that are within
     117             : // the bounds of the table.
     118             : //
     119             : // The final piece to the range deletion puzzle is the LSM invariant that for a
     120             : // given key K newer versions of K can only exist earlier in the level, or at
     121             : // higher levels of the tree. For example, if K#4 exists in L3, k#5 can only
     122             : // exist earlier in the L3 or in L0, L1, L2 or a memtable. Get very explicitly
     123             : // uses this invariant to find the value for a key by walking the LSM level by
     124             : // level. For range deletions, this invariant means that a range deletion at
     125             : // level N will necessarily shadow any keys within its bounds in level Y where
     126             : // Y > N. One wrinkle to this statement is that it only applies to keys that
     127             : // lie within the sstable bounds as well, but we get that guarantee due to the
     128             : // way the range deletion iterator and point iterator are bound together by a
     129             : // levelIter.
     130             : //
     131             : // Tying the above all together, we get a picture where each level (index in
     132             : // mergingIter.levels) is composed of both point operations (pX) and range
     133             : // deletions (rX). The range deletions for level X shadow both the point
     134             : // operations and range deletions for level Y where Y > X allowing mergingIter
     135             : // to skip processing entries in that shadow. For example, consider the
     136             : // scenario:
     137             : //
     138             : //      r0: a---e
     139             : //      r1:    d---h
     140             : //      r2:       g---k
     141             : //      r3:          j---n
     142             : //      r4:             m---q
     143             : //
     144             : // This is showing 5 levels of range deletions. Consider what happens upon
     145             : // SeekGE("b"). We first seek the point iterator for level 0 (the point values
     146             : // are not shown above) and we then seek the range deletion iterator. That
     147             : // returns the tombstone [a,e). This tombstone tells us that all keys in the
     148             : // range [a,e) in lower levels are deleted so we can skip them. So we can
     149             : // adjust the seek key to "e", the tombstone end key. For level 1 we seek to
     150             : // "e" and find the range tombstone [d,h) and similar logic holds. By the time
     151             : // we get to level 4 we're seeking to "n".
     152             : //
     153             : // One consequence of not truncating tombstone end keys to sstable boundaries
     154             : // is the seeking process described above cannot always seek to the tombstone
     155             : // end key in the older level. For example, imagine in the above example r3 is
     156             : // a partitioned level (i.e., L1+ in our LSM), and the sstable containing [j,
     157             : // n) has "k" as its upper boundary. In this situation, compactions involving
     158             : // keys at or after "k" can output those keys to r4+, even if they're newer
     159             : // than our tombstone [j, n). So instead of seeking to "n" in r4 we can only
     160             : // seek to "k".  To achieve this, the instance variable `largestUserKey.`
     161             : // maintains the upper bounds of the current sstables in the partitioned
     162             : // levels. In this example, `levels[3].largestUserKey` holds "k", telling us to
     163             : // limit the seek triggered by a tombstone in r3 to "k".
     164             : //
     165             : // During actual iteration levels can contain both point operations and range
     166             : // deletions. Within a level, when a range deletion contains a point operation
     167             : // the sequence numbers must be checked to determine if the point operation is
     168             : // newer or older than the range deletion tombstone. The mergingIter maintains
     169             : // the invariant that the range deletion iterators for all levels newer that
     170             : // the current iteration key (L < m.heap.items[0].index) are positioned at the
     171             : // next (or previous during reverse iteration) range deletion tombstone. We
     172             : // know those levels don't contain a range deletion tombstone that covers the
     173             : // current key because if they did the current key would be deleted. The range
     174             : // deletion iterator for the current key's level is positioned at a range
     175             : // tombstone covering or past the current key. The position of all of other
     176             : // range deletion iterators is unspecified. Whenever a key from those levels
     177             : // becomes the current key, their range deletion iterators need to be
     178             : // positioned. This lazy positioning avoids seeking the range deletion
     179             : // iterators for keys that are never considered. (A similar bit of lazy
     180             : // evaluation can be done for the point iterators, but is still TBD).
     181             : //
     182             : // For a full example, consider the following setup:
     183             : //
     184             : //      p0:               o
     185             : //      r0:             m---q
     186             : //
     187             : //      p1:              n p
     188             : //      r1:       g---k
     189             : //
     190             : //      p2:  b d    i
     191             : //      r2: a---e           q----v
     192             : //
     193             : //      p3:     e
     194             : //      r3:
     195             : //
     196             : // If we start iterating from the beginning, the first key we encounter is "b"
     197             : // in p2. When the mergingIter is pointing at a valid entry, the range deletion
     198             : // iterators for all of the levels < m.heap.items[0].index are positioned at
     199             : // the next range tombstone past the current key. So r0 will point at [m,q) and
     200             : // r1 at [g,k). When the key "b" is encountered, we check to see if the current
     201             : // tombstone for r0 or r1 contains it, and whether the tombstone for r2, [a,e),
     202             : // contains and is newer than "b".
     203             : //
     204             : // Advancing the iterator finds the next key at "d". This is in the same level
     205             : // as the previous key "b" so we don't have to reposition any of the range
     206             : // deletion iterators, but merely check whether "d" is now contained by any of
     207             : // the range tombstones at higher levels or has stepped past the range
     208             : // tombstone in its own level or higher levels. In this case, there is nothing to be done.
     209             : //
     210             : // Advancing the iterator again finds "e". Since "e" comes from p3, we have to
     211             : // position the r3 range deletion iterator, which is empty. "e" is past the r2
     212             : // tombstone of [a,e) so we need to advance the r2 range deletion iterator to
     213             : // [q,v).
     214             : //
     215             : // The next key is "i". Because this key is in p2, a level above "e", we don't
     216             : // have to reposition any range deletion iterators and instead see that "i" is
     217             : // covered by the range tombstone [g,k). The iterator is immediately advanced
     218             : // to "n" which is covered by the range tombstone [m,q) causing the iterator to
     219             : // advance to "o" which is visible.
     220             : //
     221             : // # Error handling
     222             : //
     223             : // Any iterator operation may fail. The InternalIterator contract dictates that
     224             : // an iterator must return a nil internal key when an error occurs, and a
     225             : // subsequent call to Error() should return the error value. The exported
     226             : // merging iterator positioning methods must adhere to this contract by setting
     227             : // m.err to hold any error encountered by the individual level iterators and
     228             : // returning a nil internal key. Some internal helpers (eg,
     229             : // find[Next|Prev]Entry) also adhere to this contract, setting m.err directly).
     230             : // Other internal functions return an explicit error return value and DO NOT set
     231             : // m.err, relying on the caller to set m.err appropriately.
     232             : //
     233             : // TODO(jackson): Update the InternalIterator interface to return explicit error
     234             : // return values (and an *InternalKV pointer).
     235             : //
     236             : // TODO(peter,rangedel): For testing, advance the iterator through various
     237             : // scenarios and have each step display the current state (i.e. the current
     238             : // heap and range-del iterator positioning).
     239             : type mergingIter struct {
     240             :         logger        Logger
     241             :         split         Split
     242             :         dir           int
     243             :         snapshot      base.SeqNum
     244             :         batchSnapshot base.SeqNum
     245             :         levels        []mergingIterLevel
     246             :         heap          mergingIterHeap
     247             :         err           error
     248             :         prefix        []byte
     249             :         lower         []byte
     250             :         upper         []byte
     251             :         stats         *InternalIteratorStats
     252             :         seekKeyBuf    []byte
     253             : 
     254             :         // levelsPositioned, if non-nil, is a slice of the same length as levels.
     255             :         // It's used by NextPrefix to record which levels have already been
     256             :         // repositioned. It's created lazily by the first call to NextPrefix.
     257             :         levelsPositioned []bool
     258             : 
     259             :         combinedIterState *combinedIterState
     260             : 
     261             :         // Used in some tests to disable the random disabling of seek optimizations.
     262             :         forceEnableSeekOpt bool
     263             : }
     264             : 
     265             : // mergingIter implements the base.InternalIterator interface.
     266             : var _ base.InternalIterator = (*mergingIter)(nil)
     267             : 
     268             : // newMergingIter returns an iterator that merges its input. Walking the
     269             : // resultant iterator will return all key/value pairs of all input iterators
     270             : // in strictly increasing key order, as defined by cmp. It is permissible to
     271             : // pass a nil split parameter if the caller is never going to call
     272             : // SeekPrefixGE.
     273             : //
     274             : // The input's key ranges may overlap, but there are assumed to be no duplicate
     275             : // keys: if iters[i] contains a key k then iters[j] will not contain that key k.
     276             : //
     277             : // None of the iters may be nil.
     278             : func newMergingIter(
     279             :         logger Logger,
     280             :         stats *base.InternalIteratorStats,
     281             :         cmp Compare,
     282             :         split Split,
     283             :         iters ...internalIterator,
     284           1 : ) *mergingIter {
     285           1 :         m := &mergingIter{}
     286           1 :         levels := make([]mergingIterLevel, len(iters))
     287           1 :         for i := range levels {
     288           1 :                 levels[i].iter = iters[i]
     289           1 :         }
     290           1 :         m.init(&IterOptions{logger: logger}, stats, cmp, split, levels...)
     291           1 :         return m
     292             : }
     293             : 
     294             : func (m *mergingIter) init(
     295             :         opts *IterOptions,
     296             :         stats *base.InternalIteratorStats,
     297             :         cmp Compare,
     298             :         split Split,
     299             :         levels ...mergingIterLevel,
     300           1 : ) {
     301           1 :         m.err = nil // clear cached iteration error
     302           1 :         m.logger = opts.getLogger()
     303           1 :         if opts != nil {
     304           1 :                 m.lower = opts.LowerBound
     305           1 :                 m.upper = opts.UpperBound
     306           1 :         }
     307           1 :         m.snapshot = base.SeqNumMax
     308           1 :         m.batchSnapshot = base.SeqNumMax
     309           1 :         m.levels = levels
     310           1 :         m.heap.cmp = cmp
     311           1 :         m.split = split
     312           1 :         m.stats = stats
     313           1 :         if cap(m.heap.items) < len(levels) {
     314           1 :                 m.heap.items = make([]*mergingIterLevel, 0, len(levels))
     315           1 :         } else {
     316           1 :                 m.heap.items = m.heap.items[:0]
     317           1 :         }
     318           1 :         for l := range m.levels {
     319           1 :                 m.levels[l].index = l
     320           1 :         }
     321             : }
     322             : 
     323           1 : func (m *mergingIter) initHeap() {
     324           1 :         m.heap.items = m.heap.items[:0]
     325           1 :         for i := range m.levels {
     326           1 :                 if l := &m.levels[i]; l.iterKV != nil {
     327           1 :                         m.heap.items = append(m.heap.items, l)
     328           1 :                 }
     329             :         }
     330           1 :         m.heap.init()
     331             : }
     332             : 
     333           1 : func (m *mergingIter) initMinHeap() error {
     334           1 :         m.dir = 1
     335           1 :         m.heap.reverse = false
     336           1 :         m.initHeap()
     337           1 :         return m.initMinRangeDelIters(-1)
     338           1 : }
     339             : 
     340             : // The level of the previous top element was oldTopLevel. Note that all range delete
     341             : // iterators < oldTopLevel are positioned past the key of the previous top element and
     342             : // the range delete iterator == oldTopLevel is positioned at or past the key of the
     343             : // previous top element. We need to position the range delete iterators from oldTopLevel + 1
     344             : // to the level of the current top element.
     345           1 : func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) error {
     346           1 :         if m.heap.len() == 0 {
     347           1 :                 return nil
     348           1 :         }
     349             : 
     350             :         // Position the range-del iterators at levels <= m.heap.items[0].index.
     351           1 :         item := m.heap.items[0]
     352           1 :         for level := oldTopLevel + 1; level <= item.index; level++ {
     353           1 :                 l := &m.levels[level]
     354           1 :                 if l.rangeDelIter == nil {
     355           1 :                         continue
     356             :                 }
     357           1 :                 var err error
     358           1 :                 l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey)
     359           1 :                 if err != nil {
     360           0 :                         return err
     361           0 :                 }
     362             :         }
     363           1 :         return nil
     364             : }
     365             : 
     366           1 : func (m *mergingIter) initMaxHeap() error {
     367           1 :         m.dir = -1
     368           1 :         m.heap.reverse = true
     369           1 :         m.initHeap()
     370           1 :         return m.initMaxRangeDelIters(-1)
     371           1 : }
     372             : 
     373             : // The level of the previous top element was oldTopLevel. Note that all range delete
     374             : // iterators < oldTopLevel are positioned before the key of the previous top element and
     375             : // the range delete iterator == oldTopLevel is positioned at or before the key of the
     376             : // previous top element. We need to position the range delete iterators from oldTopLevel + 1
     377             : // to the level of the current top element.
     378           1 : func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) error {
     379           1 :         if m.heap.len() == 0 {
     380           1 :                 return nil
     381           1 :         }
     382             :         // Position the range-del iterators at levels <= m.heap.items[0].index.
     383           1 :         item := m.heap.items[0]
     384           1 :         for level := oldTopLevel + 1; level <= item.index; level++ {
     385           1 :                 l := &m.levels[level]
     386           1 :                 if l.rangeDelIter == nil {
     387           1 :                         continue
     388             :                 }
     389           1 :                 tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey)
     390           1 :                 if err != nil {
     391           0 :                         return err
     392           0 :                 }
     393           1 :                 l.tombstone = tomb
     394             :         }
     395           1 :         return nil
     396             : }
     397             : 
     398           1 : func (m *mergingIter) switchToMinHeap() error {
     399           1 :         if m.heap.len() == 0 {
     400           1 :                 if m.lower != nil {
     401           1 :                         m.SeekGE(m.lower, base.SeekGEFlagsNone)
     402           1 :                 } else {
     403           1 :                         m.First()
     404           1 :                 }
     405           1 :                 return m.err
     406             :         }
     407             : 
     408             :         // We're switching from using a max heap to a min heap. We need to advance
     409             :         // any iterator that is less than or equal to the current key. Consider the
     410             :         // scenario where we have 2 iterators being merged (user-key:seq-num):
     411             :         //
     412             :         // i1:     *a:2     b:2
     413             :         // i2: a:1      b:1
     414             :         //
     415             :         // The current key is a:2 and i2 is pointed at a:1. When we switch to forward
     416             :         // iteration, we want to return a key that is greater than a:2.
     417             : 
     418           1 :         key := m.heap.items[0].iterKV.K
     419           1 :         cur := m.heap.items[0]
     420           1 : 
     421           1 :         for i := range m.levels {
     422           1 :                 l := &m.levels[i]
     423           1 :                 if l == cur {
     424           1 :                         continue
     425             :                 }
     426           1 :                 for l.iterKV = l.iter.Next(); l.iterKV != nil; l.iterKV = l.iter.Next() {
     427           1 :                         if base.InternalCompare(m.heap.cmp, key, l.iterKV.K) < 0 {
     428           1 :                                 // key < iter-key
     429           1 :                                 break
     430             :                         }
     431             :                         // key >= iter-key
     432             :                 }
     433           1 :                 if l.iterKV == nil {
     434           1 :                         if err := l.iter.Error(); err != nil {
     435           1 :                                 return err
     436           1 :                         }
     437             :                 }
     438             :         }
     439             : 
     440             :         // Special handling for the current iterator because we were using its key
     441             :         // above.
     442           1 :         cur.iterKV = cur.iter.Next()
     443           1 :         if cur.iterKV == nil {
     444           1 :                 if err := cur.iter.Error(); err != nil {
     445           1 :                         return err
     446           1 :                 }
     447             :         }
     448           1 :         return m.initMinHeap()
     449             : }
     450             : 
     451           1 : func (m *mergingIter) switchToMaxHeap() error {
     452           1 :         if m.heap.len() == 0 {
     453           1 :                 if m.upper != nil {
     454           1 :                         m.SeekLT(m.upper, base.SeekLTFlagsNone)
     455           1 :                 } else {
     456           1 :                         m.Last()
     457           1 :                 }
     458           1 :                 return m.err
     459             :         }
     460             : 
     461             :         // We're switching from using a min heap to a max heap. We need to backup any
     462             :         // iterator that is greater than or equal to the current key. Consider the
     463             :         // scenario where we have 2 iterators being merged (user-key:seq-num):
     464             :         //
     465             :         // i1: a:2     *b:2
     466             :         // i2:     a:1      b:1
     467             :         //
     468             :         // The current key is b:2 and i2 is pointing at b:1. When we switch to
     469             :         // reverse iteration, we want to return a key that is less than b:2.
     470           1 :         key := m.heap.items[0].iterKV.K
     471           1 :         cur := m.heap.items[0]
     472           1 : 
     473           1 :         for i := range m.levels {
     474           1 :                 l := &m.levels[i]
     475           1 :                 if l == cur {
     476           1 :                         continue
     477             :                 }
     478             : 
     479           1 :                 for l.iterKV = l.iter.Prev(); l.iterKV != nil; l.iterKV = l.iter.Prev() {
     480           1 :                         if base.InternalCompare(m.heap.cmp, key, l.iterKV.K) > 0 {
     481           1 :                                 // key > iter-key
     482           1 :                                 break
     483             :                         }
     484             :                         // key <= iter-key
     485             :                 }
     486           1 :                 if l.iterKV == nil {
     487           1 :                         if err := l.iter.Error(); err != nil {
     488           0 :                                 return err
     489           0 :                         }
     490             :                 }
     491             :         }
     492             : 
     493             :         // Special handling for the current iterator because we were using its key
     494             :         // above.
     495           1 :         cur.iterKV = cur.iter.Prev()
     496           1 :         if cur.iterKV == nil {
     497           1 :                 if err := cur.iter.Error(); err != nil {
     498           1 :                         return err
     499           1 :                 }
     500             :         }
     501           1 :         return m.initMaxHeap()
     502             : }
     503             : 
     504             : // nextEntry unconditionally steps to the next entry. item is the current top
     505             : // item in the heap.
     506           1 : func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error {
     507           1 :         // INVARIANT: If in prefix iteration mode, item.iterKey must have a prefix equal
     508           1 :         // to m.prefix. This invariant is important for ensuring TrySeekUsingNext
     509           1 :         // optimizations behave correctly.
     510           1 :         //
     511           1 :         // During prefix iteration, the iterator does not have a full view of the
     512           1 :         // LSM. Some level iterators may omit keys that are known to fall outside
     513           1 :         // the seek prefix (eg, due to sstable bloom filter exclusion). It's
     514           1 :         // important that in such cases we don't position any iterators beyond
     515           1 :         // m.prefix, because doing so may interfere with future seeks.
     516           1 :         //
     517           1 :         // Let prefixes P1 < P2 < P3. Imagine a SeekPrefixGE to prefix P1, followed
     518           1 :         // by a SeekPrefixGE to prefix P2. Imagine there exist live keys at prefix
     519           1 :         // P2, but they're not visible to the SeekPrefixGE(P1) (because of
     520           1 :         // bloom-filter exclusion or a range tombstone that deletes prefix P1 but
     521           1 :         // not P2). If the SeekPrefixGE(P1) is allowed to move any level iterators
     522           1 :         // to P3, the SeekPrefixGE(P2, TrySeekUsingNext=true) may mistakenly think
     523           1 :         // the level contains no point keys or range tombstones within the prefix
     524           1 :         // P2. Care is taken to avoid ever advancing the iterator beyond the current
     525           1 :         // prefix. If nextEntry is ever invoked while we're already beyond the
     526           1 :         // current prefix, we're violating the invariant.
     527           1 :         if invariants.Enabled && m.prefix != nil {
     528           1 :                 if p := m.split.Prefix(l.iterKV.K.UserKey); !bytes.Equal(m.prefix, p) {
     529           0 :                         m.logger.Fatalf("mergingIter: prefix violation: nexting beyond prefix %q; existing heap root %q\n%s",
     530           0 :                                 m.prefix, l.iterKV, debug.Stack())
     531           0 :                 }
     532             :         }
     533             : 
     534           1 :         oldTopLevel := l.index
     535           1 :         oldRangeDelIterGeneration := l.rangeDelIterGeneration
     536           1 : 
     537           1 :         if succKey == nil {
     538           1 :                 l.iterKV = l.iter.Next()
     539           1 :         } else {
     540           1 :                 l.iterKV = l.iter.NextPrefix(succKey)
     541           1 :         }
     542             : 
     543           1 :         if l.iterKV == nil {
     544           1 :                 if err := l.iter.Error(); err != nil {
     545           1 :                         return err
     546           1 :                 }
     547           1 :                 m.heap.pop()
     548           1 :         } else {
     549           1 :                 if m.prefix != nil && !bytes.Equal(m.prefix, m.split.Prefix(l.iterKV.K.UserKey)) {
     550           1 :                         // Set keys without a matching prefix to their zero values when in prefix
     551           1 :                         // iteration mode and remove iterated level from heap.
     552           1 :                         l.iterKV = nil
     553           1 :                         m.heap.pop()
     554           1 :                 } else if m.heap.len() > 1 {
     555           1 :                         m.heap.fix(0)
     556           1 :                 }
     557           1 :                 if l.rangeDelIterGeneration != oldRangeDelIterGeneration {
     558           1 :                         // The rangeDelIter changed which indicates that the l.iter moved to the
     559           1 :                         // next sstable. We have to update the tombstone for oldTopLevel as well.
     560           1 :                         oldTopLevel--
     561           1 :                 }
     562             :         }
     563             : 
     564             :         // The cached tombstones are only valid for the levels
     565             :         // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range
     566             :         // [oldTopLevel+1,heap[0].index].
     567           1 :         return m.initMinRangeDelIters(oldTopLevel)
     568             : }
     569             : 
     570             : // isNextEntryDeleted starts from the current entry (as the next entry) and if
     571             : // it is deleted, moves the iterators forward as needed and returns true, else
     572             : // it returns false. item is the top item in the heap. If any of the required
     573             : // iterator operations error, the error is returned without updating m.err.
     574             : //
     575             : // During prefix iteration mode, isNextEntryDeleted will exhaust the iterator by
     576             : // clearing the heap if the deleted key(s) extend beyond the iteration prefix
     577             : // during prefix-iteration mode.
     578           1 : func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) (bool, error) {
     579           1 :         // Look for a range deletion tombstone containing item.iterKV at higher
     580           1 :         // levels (level < item.index). If we find such a range tombstone we know
     581           1 :         // it deletes the key in the current level. Also look for a range
     582           1 :         // deletion at the current level (level == item.index). If we find such a
     583           1 :         // range deletion we need to check whether it is newer than the current
     584           1 :         // entry.
     585           1 :         for level := 0; level <= item.index; level++ {
     586           1 :                 l := &m.levels[level]
     587           1 :                 if l.rangeDelIter == nil || l.tombstone == nil {
     588           1 :                         // If l.tombstone is nil, there are no further tombstones
     589           1 :                         // in the current sstable in the current (forward) iteration
     590           1 :                         // direction.
     591           1 :                         continue
     592             :                 }
     593           1 :                 if m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) <= 0 {
     594           1 :                         // The current key is at or past the tombstone end key.
     595           1 :                         //
     596           1 :                         // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that
     597           1 :                         // the levelIter must be positioned at a key >= item.iterKV. So it is sufficient to seek the
     598           1 :                         // current l.rangeDelIter (since any range del iterators that will be provided by the
     599           1 :                         // levelIter in the future cannot contain item.iterKV). Also, it is possible that we
     600           1 :                         // will encounter parts of the range delete that should be ignored -- we handle that
     601           1 :                         // below.
     602           1 :                         var err error
     603           1 :                         l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey)
     604           1 :                         if err != nil {
     605           1 :                                 return false, err
     606           1 :                         }
     607             :                 }
     608           1 :                 if l.tombstone == nil {
     609           1 :                         continue
     610             :                 }
     611             : 
     612           1 :                 if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, item.iterKV.K.UserKey) <= 0 {
     613           1 :                         if level < item.index {
     614           1 :                                 // We could also do m.seekGE(..., level + 1). The levels from
     615           1 :                                 // [level + 1, item.index) are already after item.iterKV so seeking them may be
     616           1 :                                 // wasteful.
     617           1 : 
     618           1 :                                 // We can seek up to tombstone.End.
     619           1 :                                 //
     620           1 :                                 // Progress argument: Since this file is at a higher level than item.iterKV we know
     621           1 :                                 // that the iterator in this file must be positioned within its bounds and at a key
     622           1 :                                 // X > item.iterKV (otherwise it would be the min of the heap). It is not
     623           1 :                                 // possible for X.UserKey == item.iterKV.UserKey, since it is incompatible with
     624           1 :                                 // X > item.iterKV (a lower version cannot be in a higher sstable), so it must be that
     625           1 :                                 // X.UserKey > item.iterKV.UserKey. Which means l.largestUserKey > item.key.UserKey.
     626           1 :                                 // We also know that l.tombstone.End > item.iterKV.UserKey. So the min of these,
     627           1 :                                 // seekKey, computed below, is > item.iterKV.UserKey, so the call to seekGE() will
     628           1 :                                 // make forward progress.
     629           1 :                                 m.seekKeyBuf = append(m.seekKeyBuf[:0], l.tombstone.End...)
     630           1 :                                 seekKey := m.seekKeyBuf
     631           1 :                                 // This seek is not directly due to a SeekGE call, so we don't know
     632           1 :                                 // enough about the underlying iterator positions, and so we keep the
     633           1 :                                 // try-seek-using-next optimization disabled. Additionally, if we're in
     634           1 :                                 // prefix-seek mode and a re-seek would have moved us past the original
     635           1 :                                 // prefix, we can remove all merging iter levels below the rangedel
     636           1 :                                 // tombstone's level and return immediately instead of re-seeking. This
     637           1 :                                 // is correct since those levels cannot provide a key that matches the
     638           1 :                                 // prefix, and is also visible. Additionally, this is important to make
     639           1 :                                 // subsequent `TrySeekUsingNext` work correctly, as a re-seek on a
     640           1 :                                 // different prefix could have resulted in this iterator skipping visible
     641           1 :                                 // keys at prefixes in between m.prefix and seekKey, that are currently
     642           1 :                                 // not in the heap due to a bloom filter mismatch.
     643           1 :                                 //
     644           1 :                                 // Additionally, we set the relative-seek flag. This is
     645           1 :                                 // important when iterating with lazy combined iteration. If
     646           1 :                                 // there's a range key between this level's current file and the
     647           1 :                                 // file the seek will land on, we need to detect it in order to
     648           1 :                                 // trigger construction of the combined iterator.
     649           1 :                                 if m.prefix != nil {
     650           1 :                                         if !bytes.Equal(m.prefix, m.split.Prefix(seekKey)) {
     651           1 :                                                 for i := item.index; i < len(m.levels); i++ {
     652           1 :                                                         // Remove this level from the heap. Setting iterKV
     653           1 :                                                         // to nil should be sufficient for initMinHeap to
     654           1 :                                                         // not re-initialize the heap with them in it. Other
     655           1 :                                                         // fields in mergingIterLevel can remain as-is; the
     656           1 :                                                         // iter/rangeDelIter needs to stay intact for future
     657           1 :                                                         // trySeekUsingNexts to work, the level iter
     658           1 :                                                         // boundary context is owned by the levelIter which
     659           1 :                                                         // is not being repositioned, and any tombstones in
     660           1 :                                                         // these levels will be irrelevant for us anyway.
     661           1 :                                                         m.levels[i].iterKV = nil
     662           1 :                                                 }
     663             :                                                 // TODO(bilal): Consider a more efficient way of removing levels from
     664             :                                                 // the heap without reinitializing all of it. This would likely
     665             :                                                 // necessitate tracking the heap positions of each mergingIterHeap
     666             :                                                 // item in the mergingIterLevel, and then swapping that item in the
     667             :                                                 // heap with the last-positioned heap item, and shrinking the heap by
     668             :                                                 // one.
     669           1 :                                                 if err := m.initMinHeap(); err != nil {
     670           0 :                                                         return false, err
     671           0 :                                                 }
     672           1 :                                                 return true, nil
     673             :                                         }
     674             :                                 }
     675           1 :                                 if err := m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()); err != nil {
     676           0 :                                         return false, err
     677           0 :                                 }
     678           1 :                                 return true, nil
     679             :                         }
     680           1 :                         if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) {
     681           1 :                                 if err := m.nextEntry(item, nil /* succKey */); err != nil {
     682           0 :                                         return false, err
     683           0 :                                 }
     684           1 :                                 return true, nil
     685             :                         }
     686             :                 }
     687             :         }
     688           1 :         return false, nil
     689             : }
     690             : 
     691             : // Starting from the current entry, finds the first (next) entry that can be returned.
     692             : //
     693             : // If an error occurs, m.err is updated to hold the error and findNextentry
     694             : // returns a nil internal key.
     695           1 : func (m *mergingIter) findNextEntry() *base.InternalKV {
     696           1 :         for m.heap.len() > 0 && m.err == nil {
     697           1 :                 item := m.heap.items[0]
     698           1 : 
     699           1 :                 // The levelIter internal iterator will interleave exclusive sentinel
     700           1 :                 // keys to keep files open until their range deletions are no longer
     701           1 :                 // necessary. Sometimes these are interleaved with the user key of a
     702           1 :                 // file's largest key, in which case they may simply be stepped over to
     703           1 :                 // move to the next file in the forward direction. Other times they're
     704           1 :                 // interleaved at the user key of the user-iteration boundary, if that
     705           1 :                 // falls within the bounds of a file. In the latter case, there are no
     706           1 :                 // more keys < m.upper, and we can stop iterating.
     707           1 :                 //
     708           1 :                 // We perform a key comparison to differentiate between these two cases.
     709           1 :                 // This key comparison is considered okay because it only happens for
     710           1 :                 // sentinel keys. It may be eliminated after #2863.
     711           1 :                 if m.levels[item.index].iterKV.K.IsExclusiveSentinel() {
     712           1 :                         if m.upper != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.upper) >= 0 {
     713           1 :                                 break
     714             :                         }
     715             :                         // This key is the largest boundary of a file and can be skipped now
     716             :                         // that the file's range deletions are no longer relevant.
     717           1 :                         m.err = m.nextEntry(item, nil /* succKey */)
     718           1 :                         if m.err != nil {
     719           0 :                                 return nil
     720           0 :                         }
     721           1 :                         continue
     722             :                 }
     723             : 
     724           1 :                 m.addItemStats(item)
     725           1 : 
     726           1 :                 // Check if the heap root key is deleted by a range tombstone in a
     727           1 :                 // higher level. If it is, isNextEntryDeleted will advance the iterator
     728           1 :                 // to a later key (through seeking or nexting).
     729           1 :                 isDeleted, err := m.isNextEntryDeleted(item)
     730           1 :                 if err != nil {
     731           1 :                         m.err = err
     732           1 :                         return nil
     733           1 :                 } else if isDeleted {
     734           1 :                         m.stats.PointsCoveredByRangeTombstones++
     735           1 :                         continue
     736             :                 }
     737             : 
     738             :                 // Check if the key is visible at the iterator sequence numbers.
     739           1 :                 if !item.iterKV.Visible(m.snapshot, m.batchSnapshot) {
     740           1 :                         m.err = m.nextEntry(item, nil /* succKey */)
     741           1 :                         if m.err != nil {
     742           0 :                                 return nil
     743           0 :                         }
     744           1 :                         continue
     745             :                 }
     746             : 
     747             :                 // The heap root is visible and not deleted by any range tombstones.
     748             :                 // Return it.
     749           1 :                 return item.iterKV
     750             :         }
     751           1 :         return nil
     752             : }
     753             : 
     754             : // Steps to the prev entry. item is the current top item in the heap.
     755           1 : func (m *mergingIter) prevEntry(l *mergingIterLevel) error {
     756           1 :         oldTopLevel := l.index
     757           1 :         oldRangeDelIterGeneration := l.rangeDelIterGeneration
     758           1 :         if l.iterKV = l.iter.Prev(); l.iterKV != nil {
     759           1 :                 if m.heap.len() > 1 {
     760           1 :                         m.heap.fix(0)
     761           1 :                 }
     762           1 :                 if l.rangeDelIterGeneration != oldRangeDelIterGeneration && l.rangeDelIter != nil {
     763           1 :                         // The rangeDelIter changed which indicates that the l.iter moved to the
     764           1 :                         // previous sstable. We have to update the tombstone for oldTopLevel as
     765           1 :                         // well.
     766           1 :                         oldTopLevel--
     767           1 :                 }
     768           1 :         } else {
     769           1 :                 if err := l.iter.Error(); err != nil {
     770           1 :                         return err
     771           1 :                 }
     772           1 :                 m.heap.pop()
     773             :         }
     774             : 
     775             :         // The cached tombstones are only valid for the levels
     776             :         // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range
     777             :         // [oldTopLevel+1,heap[0].index].
     778           1 :         return m.initMaxRangeDelIters(oldTopLevel)
     779             : }
     780             : 
     781             : // isPrevEntryDeleted() starts from the current entry (as the prev entry) and if it is deleted,
     782             : // moves the iterators backward as needed and returns true, else it returns false. item is the top
     783             : // item in the heap.
     784           1 : func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) (bool, error) {
     785           1 :         // Look for a range deletion tombstone containing item.iterKV at higher
     786           1 :         // levels (level < item.index). If we find such a range tombstone we know
     787           1 :         // it deletes the key in the current level. Also look for a range
     788           1 :         // deletion at the current level (level == item.index). If we find such a
     789           1 :         // range deletion we need to check whether it is newer than the current
     790           1 :         // entry.
     791           1 :         for level := 0; level <= item.index; level++ {
     792           1 :                 l := &m.levels[level]
     793           1 :                 if l.rangeDelIter == nil || l.tombstone == nil {
     794           1 :                         // If l.tombstone is nil, there are no further tombstones
     795           1 :                         // in the current sstable in the current (reverse) iteration
     796           1 :                         // direction.
     797           1 :                         continue
     798             :                 }
     799           1 :                 if m.heap.cmp(item.iterKV.K.UserKey, l.tombstone.Start) < 0 {
     800           1 :                         // The current key is before the tombstone start key.
     801           1 :                         //
     802           1 :                         // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that
     803           1 :                         // the levelIter must be positioned at a key < item.iterKV. So it is sufficient to seek the
     804           1 :                         // current l.rangeDelIter (since any range del iterators that will be provided by the
     805           1 :                         // levelIter in the future cannot contain item.iterKV). Also, it is it is possible that we
     806           1 :                         // will encounter parts of the range delete that should be ignored -- we handle that
     807           1 :                         // below.
     808           1 : 
     809           1 :                         tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey)
     810           1 :                         if err != nil {
     811           0 :                                 return false, err
     812           0 :                         }
     813           1 :                         l.tombstone = tomb
     814             :                 }
     815           1 :                 if l.tombstone == nil {
     816           1 :                         continue
     817             :                 }
     818           1 :                 if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) > 0 {
     819           1 :                         if level < item.index {
     820           1 :                                 // We could also do m.seekLT(..., level + 1). The levels from
     821           1 :                                 // [level + 1, item.index) are already before item.iterKV so seeking them may be
     822           1 :                                 // wasteful.
     823           1 : 
     824           1 :                                 // We can seek up to tombstone.Start.UserKey.
     825           1 :                                 //
     826           1 :                                 // Progress argument: We know that the iterator in this file is positioned within
     827           1 :                                 // its bounds and at a key X < item.iterKV (otherwise it would be the max of the heap).
     828           1 :                                 // So smallestUserKey <= item.iterKV.UserKey and we already know that
     829           1 :                                 // l.tombstone.Start.UserKey <= item.iterKV.UserKey. So the seekKey computed below
     830           1 :                                 // is <= item.iterKV.UserKey, and since we do a seekLT() we will make backwards
     831           1 :                                 // progress.
     832           1 :                                 m.seekKeyBuf = append(m.seekKeyBuf[:0], l.tombstone.Start...)
     833           1 :                                 seekKey := m.seekKeyBuf
     834           1 :                                 // We set the relative-seek flag. This is important when
     835           1 :                                 // iterating with lazy combined iteration. If there's a range
     836           1 :                                 // key between this level's current file and the file the seek
     837           1 :                                 // will land on, we need to detect it in order to trigger
     838           1 :                                 // construction of the combined iterator.
     839           1 :                                 if err := m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()); err != nil {
     840           0 :                                         return false, err
     841           0 :                                 }
     842           1 :                                 return true, nil
     843             :                         }
     844           1 :                         if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) {
     845           1 :                                 if err := m.prevEntry(item); err != nil {
     846           0 :                                         return false, err
     847           0 :                                 }
     848           1 :                                 return true, nil
     849             :                         }
     850             :                 }
     851             :         }
     852           1 :         return false, nil
     853             : }
     854             : 
     855             : // Starting from the current entry, finds the first (prev) entry that can be returned.
     856             : //
     857             : // If an error occurs, m.err is updated to hold the error and findNextentry
     858             : // returns a nil internal key.
     859           1 : func (m *mergingIter) findPrevEntry() *base.InternalKV {
     860           1 :         for m.heap.len() > 0 && m.err == nil {
     861           1 :                 item := m.heap.items[0]
     862           1 : 
     863           1 :                 // The levelIter internal iterator will interleave exclusive sentinel
     864           1 :                 // keys to keep files open until their range deletions are no longer
     865           1 :                 // necessary. Sometimes these are interleaved with the user key of a
     866           1 :                 // file's smallest key, in which case they may simply be stepped over to
     867           1 :                 // move to the next file in the backward direction. Other times they're
     868           1 :                 // interleaved at the user key of the user-iteration boundary, if that
     869           1 :                 // falls within the bounds of a file. In the latter case, there are no
     870           1 :                 // more keys ≥ m.lower, and we can stop iterating.
     871           1 :                 //
     872           1 :                 // We perform a key comparison to differentiate between these two cases.
     873           1 :                 // This key comparison is considered okay because it only happens for
     874           1 :                 // sentinel keys. It may be eliminated after #2863.
     875           1 :                 if m.levels[item.index].iterKV.K.IsExclusiveSentinel() {
     876           1 :                         if m.lower != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.lower) <= 0 {
     877           1 :                                 break
     878             :                         }
     879             :                         // This key is the smallest boundary of a file and can be skipped
     880             :                         // now that the file's range deletions are no longer relevant.
     881           1 :                         m.err = m.prevEntry(item)
     882           1 :                         if m.err != nil {
     883           0 :                                 return nil
     884           0 :                         }
     885           1 :                         continue
     886             :                 }
     887             : 
     888           1 :                 m.addItemStats(item)
     889           1 :                 if isDeleted, err := m.isPrevEntryDeleted(item); err != nil {
     890           0 :                         m.err = err
     891           0 :                         return nil
     892           1 :                 } else if isDeleted {
     893           1 :                         m.stats.PointsCoveredByRangeTombstones++
     894           1 :                         continue
     895             :                 }
     896           1 :                 if item.iterKV.Visible(m.snapshot, m.batchSnapshot) {
     897           1 :                         return item.iterKV
     898           1 :                 }
     899           1 :                 m.err = m.prevEntry(item)
     900             :         }
     901           1 :         return nil
     902             : }
     903             : 
     904             : // Seeks levels >= level to >= key. Additionally uses range tombstones to extend the seeks.
     905             : //
     906             : // If an error occurs, seekGE returns the error without setting m.err.
     907           1 : func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) error {
     908           1 :         // When seeking, we can use tombstones to adjust the key we seek to on each
     909           1 :         // level. Consider the series of range tombstones:
     910           1 :         //
     911           1 :         //   1: a---e
     912           1 :         //   2:    d---h
     913           1 :         //   3:       g---k
     914           1 :         //   4:          j---n
     915           1 :         //   5:             m---q
     916           1 :         //
     917           1 :         // If we SeekGE("b") we also find the tombstone "b" resides within in the
     918           1 :         // first level which is [a,e). Regardless of whether this tombstone deletes
     919           1 :         // "b" in that level, we know it deletes "b" in all lower levels, so we
     920           1 :         // adjust the search key in the next level to the tombstone end key "e". We
     921           1 :         // then SeekGE("e") in the second level and find the corresponding tombstone
     922           1 :         // [d,h). This process continues and we end up seeking for "h" in the 3rd
     923           1 :         // level, "k" in the 4th level and "n" in the last level.
     924           1 :         //
     925           1 :         // TODO(peter,rangedel): In addition to the above we can delay seeking a
     926           1 :         // level (and any lower levels) when the current iterator position is
     927           1 :         // contained within a range tombstone at a higher level.
     928           1 : 
     929           1 :         // Deterministically disable the TrySeekUsingNext optimizations sometimes in
     930           1 :         // invariant builds to encourage the metamorphic tests to surface bugs. Note
     931           1 :         // that we cannot disable the optimization within individual levels. It must
     932           1 :         // be disabled for all levels or none. If one lower-level iterator performs
     933           1 :         // a fresh seek whereas another takes advantage of its current iterator
     934           1 :         // position, the heap can become inconsistent. Consider the following
     935           1 :         // example:
     936           1 :         //
     937           1 :         //     L5:  [ [b-c) ]  [ d ]*
     938           1 :         //     L6:  [  b ]           [e]*
     939           1 :         //
     940           1 :         // Imagine a SeekGE(a). The [b-c) range tombstone deletes the L6 point key
     941           1 :         // 'b', resulting in the iterator positioned at d with the heap:
     942           1 :         //
     943           1 :         //     {L5: d, L6: e}
     944           1 :         //
     945           1 :         // A subsequent SeekGE(b) is seeking to a larger key, so the caller may set
     946           1 :         // TrySeekUsingNext()=true. If the L5 iterator used the TrySeekUsingNext
     947           1 :         // optimization but the L6 iterator did not, the iterator would have the
     948           1 :         // heap:
     949           1 :         //
     950           1 :         //     {L6: b, L5: d}
     951           1 :         //
     952           1 :         // Because the L5 iterator has already advanced to the next sstable, the
     953           1 :         // merging iterator cannot observe the [b-c) range tombstone and will
     954           1 :         // mistakenly return L6's deleted point key 'b'.
     955           1 :         if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(m))) && !m.forceEnableSeekOpt {
     956           1 :                 flags = flags.DisableTrySeekUsingNext()
     957           1 :         }
     958             : 
     959           1 :         for ; level < len(m.levels); level++ {
     960           1 :                 if invariants.Enabled && m.lower != nil && m.heap.cmp(key, m.lower) < 0 {
     961           0 :                         m.logger.Fatalf("mergingIter: lower bound violation: %s < %s\n%s", key, m.lower, debug.Stack())
     962           0 :                 }
     963             : 
     964           1 :                 l := &m.levels[level]
     965           1 :                 if m.prefix != nil {
     966           1 :                         l.iterKV = l.iter.SeekPrefixGE(m.prefix, key, flags)
     967           1 :                         if l.iterKV != nil {
     968           1 :                                 if !bytes.Equal(m.prefix, m.split.Prefix(l.iterKV.K.UserKey)) {
     969           1 :                                         // Prevent keys without a matching prefix from being added to the heap by setting
     970           1 :                                         // iterKey and iterValue to their zero values before calling initMinHeap.
     971           1 :                                         l.iterKV = nil
     972           1 :                                 }
     973             :                         }
     974           1 :                 } else {
     975           1 :                         l.iterKV = l.iter.SeekGE(key, flags)
     976           1 :                 }
     977           1 :                 if l.iterKV == nil {
     978           1 :                         if err := l.iter.Error(); err != nil {
     979           1 :                                 return err
     980           1 :                         }
     981             :                 }
     982             : 
     983             :                 // If this level contains overlapping range tombstones, alter the seek
     984             :                 // key accordingly. Caveat: If we're performing lazy-combined iteration,
     985             :                 // we cannot alter the seek key: Range tombstones don't delete range
     986             :                 // keys, and there might exist live range keys within the range
     987             :                 // tombstone's span that need to be observed to trigger a switch to
     988             :                 // combined iteration.
     989           1 :                 if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
     990           1 :                         (m.combinedIterState == nil || m.combinedIterState.initialized) {
     991           1 :                         // The level has a range-del iterator. Find the tombstone containing
     992           1 :                         // the search key.
     993           1 :                         var err error
     994           1 :                         l.tombstone, err = rangeDelIter.SeekGE(key)
     995           1 :                         if err != nil {
     996           0 :                                 return err
     997           0 :                         }
     998           1 :                         if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, key) <= 0 {
     999           1 :                                 // Based on the containment condition tombstone.End > key, so
    1000           1 :                                 // the assignment to key results in a monotonically
    1001           1 :                                 // non-decreasing key across iterations of this loop.
    1002           1 :                                 //
    1003           1 :                                 // The adjustment of key here can only move it to a larger key.
    1004           1 :                                 // Since the caller of seekGE guaranteed that the original key
    1005           1 :                                 // was greater than or equal to m.lower, the new key will
    1006           1 :                                 // continue to be greater than or equal to m.lower.
    1007           1 :                                 key = l.tombstone.End
    1008           1 :                         }
    1009             :                 }
    1010             :         }
    1011           1 :         return m.initMinHeap()
    1012             : }
    1013             : 
    1014           0 : func (m *mergingIter) String() string {
    1015           0 :         return "merging"
    1016           0 : }
    1017             : 
    1018             : // SeekGE implements base.InternalIterator.SeekGE. Note that SeekGE only checks
    1019             : // the upper bound. It is up to the caller to ensure that key is greater than
    1020             : // or equal to the lower bound.
    1021           1 : func (m *mergingIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
    1022           1 :         m.prefix = nil
    1023           1 :         m.err = m.seekGE(key, 0 /* start level */, flags)
    1024           1 :         if m.err != nil {
    1025           1 :                 return nil
    1026           1 :         }
    1027           1 :         return m.findNextEntry()
    1028             : }
    1029             : 
    1030             : // SeekPrefixGE implements base.InternalIterator.SeekPrefixGE.
    1031           1 : func (m *mergingIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
    1032           1 :         return m.SeekPrefixGEStrict(prefix, key, flags)
    1033           1 : }
    1034             : 
    1035             : // SeekPrefixGEStrict implements topLevelIterator.SeekPrefixGEStrict. Note that
    1036             : // SeekPrefixGEStrict explicitly checks that the key has a matching prefix.
    1037             : func (m *mergingIter) SeekPrefixGEStrict(
    1038             :         prefix, key []byte, flags base.SeekGEFlags,
    1039           1 : ) *base.InternalKV {
    1040           1 :         m.prefix = prefix
    1041           1 :         m.err = m.seekGE(key, 0 /* start level */, flags)
    1042           1 :         if m.err != nil {
    1043           1 :                 return nil
    1044           1 :         }
    1045             : 
    1046           1 :         iterKV := m.findNextEntry()
    1047           1 :         if invariants.Enabled && iterKV != nil {
    1048           1 :                 if !bytes.Equal(m.prefix, m.split.Prefix(iterKV.K.UserKey)) {
    1049           0 :                         m.logger.Fatalf("mergingIter: prefix violation: returning key %q without prefix %q\n", iterKV, m.prefix)
    1050           0 :                 }
    1051             :         }
    1052           1 :         return iterKV
    1053             : }
    1054             : 
    1055             : // Seeks levels >= level to < key. Additionally uses range tombstones to extend the seeks.
    1056           1 : func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) error {
    1057           1 :         // See the comment in seekGE regarding using tombstones to adjust the seek
    1058           1 :         // target per level.
    1059           1 :         m.prefix = nil
    1060           1 :         for ; level < len(m.levels); level++ {
    1061           1 :                 if invariants.Enabled && m.upper != nil && m.heap.cmp(key, m.upper) > 0 {
    1062           0 :                         m.logger.Fatalf("mergingIter: upper bound violation: %s > %s\n%s", key, m.upper, debug.Stack())
    1063           0 :                 }
    1064             : 
    1065           1 :                 l := &m.levels[level]
    1066           1 :                 l.iterKV = l.iter.SeekLT(key, flags)
    1067           1 :                 if l.iterKV == nil {
    1068           1 :                         if err := l.iter.Error(); err != nil {
    1069           1 :                                 return err
    1070           1 :                         }
    1071             :                 }
    1072             : 
    1073             :                 // If this level contains overlapping range tombstones, alter the seek
    1074             :                 // key accordingly. Caveat: If we're performing lazy-combined iteration,
    1075             :                 // we cannot alter the seek key: Range tombstones don't delete range
    1076             :                 // keys, and there might exist live range keys within the range
    1077             :                 // tombstone's span that need to be observed to trigger a switch to
    1078             :                 // combined iteration.
    1079           1 :                 if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
    1080           1 :                         (m.combinedIterState == nil || m.combinedIterState.initialized) {
    1081           1 :                         // The level has a range-del iterator. Find the tombstone containing
    1082           1 :                         // the search key.
    1083           1 :                         tomb, err := keyspan.SeekLE(m.heap.cmp, rangeDelIter, key)
    1084           1 :                         if err != nil {
    1085           0 :                                 return err
    1086           0 :                         }
    1087           1 :                         l.tombstone = tomb
    1088           1 :                         // Since SeekLT is exclusive on `key` and a tombstone's end key is
    1089           1 :                         // also exclusive, a seek key equal to a tombstone's end key still
    1090           1 :                         // enables the seek optimization (Note this is different than the
    1091           1 :                         // check performed by (*keyspan.Span).Contains).
    1092           1 :                         if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) &&
    1093           1 :                                 m.heap.cmp(key, l.tombstone.End) <= 0 {
    1094           1 :                                 // NB: Based on the containment condition
    1095           1 :                                 // tombstone.Start.UserKey <= key, so the assignment to key
    1096           1 :                                 // results in a monotonically non-increasing key across
    1097           1 :                                 // iterations of this loop.
    1098           1 :                                 //
    1099           1 :                                 // The adjustment of key here can only move it to a smaller key.
    1100           1 :                                 // Since the caller of seekLT guaranteed that the original key
    1101           1 :                                 // was less than or equal to m.upper, the new key will continue
    1102           1 :                                 // to be less than or equal to m.upper.
    1103           1 :                                 key = l.tombstone.Start
    1104           1 :                         }
    1105             :                 }
    1106             :         }
    1107             : 
    1108           1 :         return m.initMaxHeap()
    1109             : }
    1110             : 
    1111             : // SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks
    1112             : // the lower bound. It is up to the caller to ensure that key is less than the
    1113             : // upper bound.
    1114           1 : func (m *mergingIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
    1115           1 :         m.prefix = nil
    1116           1 :         m.err = m.seekLT(key, 0 /* start level */, flags)
    1117           1 :         if m.err != nil {
    1118           1 :                 return nil
    1119           1 :         }
    1120           1 :         return m.findPrevEntry()
    1121             : }
    1122             : 
    1123             : // First implements base.InternalIterator.First. Note that First only checks
    1124             : // the upper bound. It is up to the caller to ensure that key is greater than
    1125             : // or equal to the lower bound (e.g. via a call to SeekGE(lower)).
    1126           1 : func (m *mergingIter) First() *base.InternalKV {
    1127           1 :         m.err = nil // clear cached iteration error
    1128           1 :         m.prefix = nil
    1129           1 :         m.heap.items = m.heap.items[:0]
    1130           1 :         for i := range m.levels {
    1131           1 :                 l := &m.levels[i]
    1132           1 :                 l.iterKV = l.iter.First()
    1133           1 :                 if l.iterKV == nil {
    1134           1 :                         if m.err = l.iter.Error(); m.err != nil {
    1135           1 :                                 return nil
    1136           1 :                         }
    1137             :                 }
    1138             :         }
    1139           1 :         if m.err = m.initMinHeap(); m.err != nil {
    1140           0 :                 return nil
    1141           0 :         }
    1142           1 :         return m.findNextEntry()
    1143             : }
    1144             : 
    1145             : // Last implements base.InternalIterator.Last. Note that Last only checks the
    1146             : // lower bound. It is up to the caller to ensure that key is less than the
    1147             : // upper bound (e.g. via a call to SeekLT(upper))
    1148           1 : func (m *mergingIter) Last() *base.InternalKV {
    1149           1 :         m.err = nil // clear cached iteration error
    1150           1 :         m.prefix = nil
    1151           1 :         for i := range m.levels {
    1152           1 :                 l := &m.levels[i]
    1153           1 :                 l.iterKV = l.iter.Last()
    1154           1 :                 if l.iterKV == nil {
    1155           1 :                         if m.err = l.iter.Error(); m.err != nil {
    1156           1 :                                 return nil
    1157           1 :                         }
    1158             :                 }
    1159             :         }
    1160           1 :         if m.err = m.initMaxHeap(); m.err != nil {
    1161           0 :                 return nil
    1162           0 :         }
    1163           1 :         return m.findPrevEntry()
    1164             : }
    1165             : 
    1166           1 : func (m *mergingIter) Next() *base.InternalKV {
    1167           1 :         if m.err != nil {
    1168           1 :                 return nil
    1169           1 :         }
    1170             : 
    1171           1 :         if m.dir != 1 {
    1172           1 :                 if m.err = m.switchToMinHeap(); m.err != nil {
    1173           1 :                         return nil
    1174           1 :                 }
    1175           1 :                 return m.findNextEntry()
    1176             :         }
    1177             : 
    1178           1 :         if m.heap.len() == 0 {
    1179           1 :                 return nil
    1180           1 :         }
    1181             : 
    1182             :         // NB: It's okay to call nextEntry directly even during prefix iteration
    1183             :         // mode. During prefix iteration mode, we rely on the caller to not call
    1184             :         // Next if the iterator has already advanced beyond the iteration prefix.
    1185             :         // See the comment above the base.InternalIterator interface.
    1186           1 :         if m.err = m.nextEntry(m.heap.items[0], nil /* succKey */); m.err != nil {
    1187           1 :                 return nil
    1188           1 :         }
    1189             : 
    1190           1 :         iterKV := m.findNextEntry()
    1191           1 :         if invariants.Enabled && m.prefix != nil && iterKV != nil {
    1192           1 :                 if !bytes.Equal(m.prefix, m.split.Prefix(iterKV.K.UserKey)) {
    1193           0 :                         m.logger.Fatalf("mergingIter: prefix violation: returning key %q without prefix %q\n", iterKV, m.prefix)
    1194           0 :                 }
    1195             :         }
    1196           1 :         return iterKV
    1197             : }
    1198             : 
    1199           1 : func (m *mergingIter) NextPrefix(succKey []byte) *base.InternalKV {
    1200           1 :         if m.dir != 1 {
    1201           0 :                 panic("pebble: cannot switch directions with NextPrefix")
    1202             :         }
    1203           1 :         if m.err != nil || m.heap.len() == 0 {
    1204           0 :                 return nil
    1205           0 :         }
    1206           1 :         if m.levelsPositioned == nil {
    1207           1 :                 m.levelsPositioned = make([]bool, len(m.levels))
    1208           1 :         } else {
    1209           1 :                 for i := range m.levelsPositioned {
    1210           1 :                         m.levelsPositioned[i] = false
    1211           1 :                 }
    1212             :         }
    1213             : 
    1214             :         // The heap root necessarily must be positioned at a key < succKey, because
    1215             :         // NextPrefix was invoked.
    1216           1 :         root := m.heap.items[0]
    1217           1 :         if invariants.Enabled && m.heap.cmp((*root).iterKV.K.UserKey, succKey) >= 0 {
    1218           0 :                 m.logger.Fatalf("pebble: invariant violation: NextPrefix(%q) called on merging iterator already positioned at %q",
    1219           0 :                         succKey, (*root).iterKV)
    1220           0 :         }
    1221             :         // NB: root is the heap root before we call nextEntry; nextEntry may change
    1222             :         // the heap root, so we must not `root` to still be the root of the heap, or
    1223             :         // even to be in the heap if the level's iterator becomes exhausted.
    1224           1 :         if m.err = m.nextEntry(root, succKey); m.err != nil {
    1225           1 :                 return nil
    1226           1 :         }
    1227             :         // We only consider the level to be conclusively positioned at the next
    1228             :         // prefix if our call to nextEntry did not advance the level onto a range
    1229             :         // deletion's boundary. Range deletions may have bounds within the prefix
    1230             :         // that are still surfaced by NextPrefix.
    1231           1 :         m.levelsPositioned[root.index] = root.iterKV == nil || !root.iterKV.K.IsExclusiveSentinel()
    1232           1 : 
    1233           1 :         for m.heap.len() > 0 {
    1234           1 :                 root := m.heap.items[0]
    1235           1 :                 if m.levelsPositioned[root.index] {
    1236           1 :                         // A level we've previously positioned is at the top of the heap, so
    1237           1 :                         // there are no other levels positioned at keys < succKey. We've
    1238           1 :                         // advanced as far as we need to.
    1239           1 :                         break
    1240             :                 }
    1241             :                 // If the current heap root is a sentinel key, we need to skip it.
    1242             :                 // Calling NextPrefix while positioned at a sentinel key is not
    1243             :                 // supported.
    1244           1 :                 if root.iterKV.K.IsExclusiveSentinel() {
    1245           1 :                         if m.err = m.nextEntry(root, nil); m.err != nil {
    1246           0 :                                 return nil
    1247           0 :                         }
    1248           1 :                         continue
    1249             :                 }
    1250             : 
    1251             :                 // Since this level was not the original heap root when NextPrefix was
    1252             :                 // called, we don't know whether this level's current key has the
    1253             :                 // previous prefix or a new one.
    1254           1 :                 if m.heap.cmp(root.iterKV.K.UserKey, succKey) >= 0 {
    1255           1 :                         break
    1256             :                 }
    1257           1 :                 if m.err = m.nextEntry(root, succKey); m.err != nil {
    1258           0 :                         return nil
    1259           0 :                 }
    1260             :                 // We only consider the level to be conclusively positioned at the next
    1261             :                 // prefix if our call to nextEntry did not land onto a range deletion's
    1262             :                 // boundary. Range deletions may have bounds within the prefix that are
    1263             :                 // still surfaced by NextPrefix.
    1264           1 :                 m.levelsPositioned[root.index] = root.iterKV == nil || !root.iterKV.K.IsExclusiveSentinel()
    1265             :         }
    1266           1 :         return m.findNextEntry()
    1267             : }
    1268             : 
    1269           1 : func (m *mergingIter) Prev() *base.InternalKV {
    1270           1 :         if m.err != nil {
    1271           0 :                 return nil
    1272           0 :         }
    1273             : 
    1274           1 :         if m.dir != -1 {
    1275           1 :                 if m.prefix != nil {
    1276           1 :                         m.err = errors.New("pebble: unsupported reverse prefix iteration")
    1277           1 :                         return nil
    1278           1 :                 }
    1279           1 :                 if m.err = m.switchToMaxHeap(); m.err != nil {
    1280           1 :                         return nil
    1281           1 :                 }
    1282           1 :                 return m.findPrevEntry()
    1283             :         }
    1284             : 
    1285           1 :         if m.heap.len() == 0 {
    1286           1 :                 return nil
    1287           1 :         }
    1288           1 :         if m.err = m.prevEntry(m.heap.items[0]); m.err != nil {
    1289           1 :                 return nil
    1290           1 :         }
    1291           1 :         return m.findPrevEntry()
    1292             : }
    1293             : 
    1294           1 : func (m *mergingIter) Error() error {
    1295           1 :         if m.heap.len() == 0 || m.err != nil {
    1296           1 :                 return m.err
    1297           1 :         }
    1298           1 :         return m.levels[m.heap.items[0].index].iter.Error()
    1299             : }
    1300             : 
    1301           1 : func (m *mergingIter) Close() error {
    1302           1 :         for i := range m.levels {
    1303           1 :                 iter := m.levels[i].iter
    1304           1 :                 if err := iter.Close(); err != nil && m.err == nil {
    1305           0 :                         m.err = err
    1306           0 :                 }
    1307           1 :                 m.levels[i].setRangeDelIter(nil)
    1308             :         }
    1309           1 :         m.levels = nil
    1310           1 :         m.heap.items = m.heap.items[:0]
    1311           1 :         return m.err
    1312             : }
    1313             : 
    1314           1 : func (m *mergingIter) SetBounds(lower, upper []byte) {
    1315           1 :         m.prefix = nil
    1316           1 :         m.lower = lower
    1317           1 :         m.upper = upper
    1318           1 :         for i := range m.levels {
    1319           1 :                 m.levels[i].iter.SetBounds(lower, upper)
    1320           1 :         }
    1321           1 :         m.heap.clear()
    1322             : }
    1323             : 
    1324           1 : func (m *mergingIter) SetContext(ctx context.Context) {
    1325           1 :         for i := range m.levels {
    1326           1 :                 m.levels[i].iter.SetContext(ctx)
    1327           1 :         }
    1328             : }
    1329             : 
    1330             : // DebugTree is part of the InternalIterator interface.
    1331           0 : func (m *mergingIter) DebugTree(tp treeprinter.Node) {
    1332           0 :         n := tp.Childf("%T(%p)", m, m)
    1333           0 :         for i := range m.levels {
    1334           0 :                 if iter := m.levels[i].iter; iter != nil {
    1335           0 :                         iter.DebugTree(n)
    1336           0 :                 }
    1337             :         }
    1338             : }
    1339             : 
    1340           0 : func (m *mergingIter) DebugString() string {
    1341           0 :         var buf bytes.Buffer
    1342           0 :         sep := ""
    1343           0 :         for m.heap.len() > 0 {
    1344           0 :                 item := m.heap.pop()
    1345           0 :                 fmt.Fprintf(&buf, "%s%s", sep, item.iterKV.K)
    1346           0 :                 sep = " "
    1347           0 :         }
    1348           0 :         var err error
    1349           0 :         if m.dir == 1 {
    1350           0 :                 err = m.initMinHeap()
    1351           0 :         } else {
    1352           0 :                 err = m.initMaxHeap()
    1353           0 :         }
    1354           0 :         if err != nil {
    1355           0 :                 fmt.Fprintf(&buf, "err=<%s>", err)
    1356           0 :         }
    1357           0 :         return buf.String()
    1358             : }
    1359             : 
    1360           1 : func (m *mergingIter) ForEachLevelIter(fn func(li *levelIter) bool) {
    1361           1 :         for _, ml := range m.levels {
    1362           1 :                 if ml.levelIter != nil {
    1363           1 :                         if done := fn(ml.levelIter); done {
    1364           1 :                                 break
    1365             :                         }
    1366             :                 }
    1367             :         }
    1368             : }
    1369             : 
    1370           1 : func (m *mergingIter) addItemStats(l *mergingIterLevel) {
    1371           1 :         m.stats.PointCount++
    1372           1 :         m.stats.KeyBytes += uint64(len(l.iterKV.K.UserKey))
    1373           1 :         m.stats.ValueBytes += uint64(len(l.iterKV.V.ValueOrHandle))
    1374           1 : }
    1375             : 
    1376             : var _ internalIterator = &mergingIter{}

Generated by: LCOV version 1.14