Line data Source code
1 : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package manifest
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "sort"
11 : "strconv"
12 : "strings"
13 : "sync"
14 : "sync/atomic"
15 : "unicode"
16 :
17 : "github.com/cockroachdb/errors"
18 : "github.com/cockroachdb/pebble/internal/base"
19 : "github.com/cockroachdb/pebble/internal/invariants"
20 : )
21 :
// Compare exports the base.Compare type, a key-comparison function.
type Compare = base.Compare

// InternalKey exports the base.InternalKey type, a user key plus trailer
// (sequence number and kind).
type InternalKey = base.InternalKey
27 :
// TableInfo contains the common information for table related events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}
43 :
// TableStats contains statistics on a table used for compaction heuristics,
// and export via Metrics.
type TableStats struct {
	// NumEntries is the total number of entries in the table.
	NumEntries uint64
	// NumDeletions is the number of point and range deletion entries in the
	// table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	NumRangeKeySets uint64
	// PointDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's range deletions by compacting them. This
	// estimate is at data-block granularity and is not updated if compactions
	// beneath the table reduce the amount of reclaimable disk space. It also
	// does not account for overlapping data in L0 and ignores L0 sublevels,
	// but the error that introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate if
	// snapshots or move compactions prevented the elision of their range
	// tombstones. A table in the bottommost level that was ingested into L6
	// will have a zero estimate, because the file's sequence numbers indicate
	// that the tombstone cannot drop any data contained within the file itself.
	RangeDeletionsBytesEstimate uint64
	// ValueBlocksSize is the total size of value blocks and the value index
	// block.
	ValueBlocksSize uint64
}
72 :
// boundType represents the type of key (point or range) present as the smallest
// and largest keys. The zero value is reserved to mean "unset".
type boundType uint8

const (
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)
81 :
// CompactionState is the compaction state of a file.
//
// The following shows the valid state transitions:
//
//	NotCompacting --> Compacting --> Compacted
//	      ^               |
//	      |               |
//	      +-------<-------+
//
// Input files to a compaction transition to Compacting when a compaction is
// picked. A file that has finished compacting typically transitions into the
// Compacted state, at which point it is effectively obsolete ("zombied") and
// will eventually be removed from the LSM. A file that has been move-compacted
// will transition from Compacting back into the NotCompacting state, signaling
// that the file may be selected for a subsequent compaction. A failed
// compaction will result in all input tables transitioning from Compacting to
// NotCompacting.
//
// This state is in-memory only. It is not persisted to the manifest.
type CompactionState uint8

// CompactionStates.
const (
	CompactionStateNotCompacting CompactionState = iota
	CompactionStateCompacting
	CompactionStateCompacted
)
109 :
110 : // String implements fmt.Stringer.
111 0 : func (s CompactionState) String() string {
112 0 : switch s {
113 0 : case CompactionStateNotCompacting:
114 0 : return "NotCompacting"
115 0 : case CompactionStateCompacting:
116 0 : return "Compacting"
117 0 : case CompactionStateCompacted:
118 0 : return "Compacted"
119 0 : default:
120 0 : panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
121 : }
122 : }
123 :
// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is FileBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//
//  3. Once a backing-sst is referred to by a virtual sst in the latest
//     version, it cannot go back to being referred to by a physical sst in
//     any future version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// FileBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented when read sampling
	// in pebble.Iterator after every positioning operation that returns
	// a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64

	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if valid is true.
	statsValid atomic.Bool

	// FileBacking is the state which backs either a physical or virtual
	// sstable.
	FileBacking *FileBacking

	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANT: when !FileMetadata.Virtual, Size == FileBacking.Size.
	//
	// TODO(bananabrick): Size is currently used in metrics, and for many key
	// Pebble level heuristics. Make sure that the heuristics will still work
	// appropriately with an approximate value of size.
	Size uint64
	// CreationTime is the file creation time in seconds since the epoch
	// (1970-01-01 00:00:00 UTC). For ingested sstables, this corresponds to
	// the time the file was ingested. For virtual sstables, this corresponds
	// to the wall clock time when the FileMetadata for the virtual sstable was
	// first created.
	CreationTime int64
	// SmallestSeqNum and LargestSeqNum are lower and upper bounds for the
	// smallest and largest sequence numbers in the table, across both point
	// and range keys. For physical sstables, these values are tight bounds.
	// For virtual sstables, there is no guarantee that there will be keys with
	// SmallestSeqNum or LargestSeqNum within virtual sstable bounds.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey  InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest  InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	// CompactionState holds the in-memory compaction state of the file; see
	// the CompactionState type for the valid transitions.
	CompactionState CompactionState
	// MarkedForCompaction is true if compaction of this file has been
	// explicitly requested. Previously, RocksDB and earlier versions of
	// Pebble allowed this flag to be set by a user table property collector.
	// Some earlier versions of Pebble respected this flag, while other more
	// recent versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds (Smallest/Largest) have been
	// set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
}
271 :
// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta.
type PhysicalFileMeta struct {
	*FileMetadata
}
280 :
// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// A VirtualFileMeta inherits all the same fields as a FileMetadata. These
// fields have additional invariants imposed on them, and/or slightly varying
// meanings:
//   - Smallest and Largest (and their counterparts
//     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
//     The underlying file's size is stored in FileBacking.Size, though it could
//     also be estimated or could correspond to just the referenced portion of
//     a file (eg. if the file originated on another node).
//   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
//     This means that all keys in the virtual sstable must have seqnums within
//     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
//     a key with a seqnum at either of the bounds. Calculating tight seqnum
//     bounds would be too expensive and deliver little value.
//
// NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
type VirtualFileMeta struct {
	*FileMetadata
}
315 :
316 : // PhysicalMeta should be the only source of creating the PhysicalFileMeta
317 : // wrapper type.
318 2 : func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
319 2 : if m.Virtual {
320 0 : panic("pebble: file metadata does not belong to a physical sstable")
321 : }
322 2 : return PhysicalFileMeta{
323 2 : m,
324 2 : }
325 : }
326 :
327 : // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
328 : // type.
329 1 : func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
330 1 : if !m.Virtual {
331 0 : panic("pebble: file metadata does not belong to a virtual sstable")
332 : }
333 1 : return VirtualFileMeta{
334 1 : m,
335 1 : }
336 : }
337 :
// FileBacking either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the FileMetadata type for sstable terminology.
type FileBacking struct {
	// refs is the reference count for the backing file on disk: incremented
	// when a physical or virtual sstable which is backed by the FileBacking is
	// added to a version and decremented when the version is unreferenced.
	// We ref count in order to determine when it is safe to delete a
	// backing sst file from disk. The backing file is obsolete when the
	// reference count falls to zero.
	refs atomic.Int32
	// latestVersionRefs are the references to the FileBacking in the
	// latest version. This reference can be through a single physical
	// sstable in the latest version, or one or more virtual sstables in the
	// latest version.
	//
	// INVARIANT: latestVersionRefs <= refs.
	latestVersionRefs atomic.Int32
	// VirtualizedSize is set iff the backing sst is only referred to by
	// virtual ssts in the latest version. VirtualizedSize is the sum of the
	// virtual sstable sizes of all of the virtual sstables in the latest
	// version which are backed by the physical sstable. When a virtual
	// sstable is removed from the latest version, we will decrement the
	// VirtualizedSize. During compaction picking, we'll compensate a
	// virtual sstable file size by
	// (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs.
	// The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize
	// is high, then the space amplification due to virtual sstables is
	// high, and we should pick the virtual sstable with a higher priority.
	//
	// TODO(bananabrick): Compensate the virtual sstable file size using
	// the VirtualizedSize during compaction picking and test.
	VirtualizedSize atomic.Uint64
	// DiskFileNum is the file number of the backing file on disk.
	DiskFileNum base.DiskFileNum
	// Size is the size of the backing file on disk, in bytes.
	Size uint64
}
375 :
376 : // InitPhysicalBacking allocates and sets the FileBacking which is required by a
377 : // physical sstable FileMetadata.
378 : //
379 : // Ensure that the state required by FileBacking, such as the FileNum, is
380 : // already set on the FileMetadata before InitPhysicalBacking is called.
381 : // Calling InitPhysicalBacking only after the relevant state has been set in the
382 : // FileMetadata is not necessary in tests which don't rely on FileBacking.
383 2 : func (m *FileMetadata) InitPhysicalBacking() {
384 2 : if m.Virtual {
385 0 : panic("pebble: virtual sstables should use a pre-existing FileBacking")
386 : }
387 2 : if m.FileBacking == nil {
388 2 : m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()}
389 2 : }
390 : }
391 :
392 : // InitProviderBacking creates a new FileBacking for a file backed by
393 : // an objstorage.Provider.
394 1 : func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) {
395 1 : if !m.Virtual {
396 0 : panic("pebble: provider-backed sstables must be virtual")
397 : }
398 1 : if m.FileBacking == nil {
399 1 : m.FileBacking = &FileBacking{DiskFileNum: fileNum}
400 1 : }
401 : }
402 :
403 : // ValidateVirtual should be called once the FileMetadata for a virtual sstable
404 : // is created to verify that the fields of the virtual sstable are sound.
405 1 : func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
406 1 : if !m.Virtual {
407 0 : panic("pebble: invalid virtual sstable")
408 : }
409 :
410 1 : if createdFrom.SmallestSeqNum != m.SmallestSeqNum {
411 0 : panic("pebble: invalid smallest sequence number for virtual sstable")
412 : }
413 :
414 1 : if createdFrom.LargestSeqNum != m.LargestSeqNum {
415 0 : panic("pebble: invalid largest sequence number for virtual sstable")
416 : }
417 :
418 1 : if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking {
419 0 : panic("pebble: invalid physical sstable state for virtual sstable")
420 : }
421 :
422 1 : if m.Size == 0 {
423 0 : panic("pebble: virtual sstable size must be set upon creation")
424 : }
425 : }
426 :
427 : // Refs returns the refcount of backing sstable.
428 2 : func (m *FileMetadata) Refs() int32 {
429 2 : return m.FileBacking.refs.Load()
430 2 : }
431 :
432 : // Ref increments the ref count associated with the backing sstable.
433 2 : func (m *FileMetadata) Ref() {
434 2 : m.FileBacking.refs.Add(1)
435 2 : }
436 :
437 : // Unref decrements the ref count associated with the backing sstable.
438 2 : func (m *FileMetadata) Unref() int32 {
439 2 : v := m.FileBacking.refs.Add(-1)
440 2 : if invariants.Enabled && v < 0 {
441 0 : panic("pebble: invalid FileMetadata refcounting")
442 : }
443 2 : return v
444 : }
445 :
446 : // LatestRef increments the latest ref count associated with the backing
447 : // sstable.
448 2 : func (m *FileMetadata) LatestRef() {
449 2 : m.FileBacking.latestVersionRefs.Add(1)
450 2 :
451 2 : if m.Virtual {
452 1 : m.FileBacking.VirtualizedSize.Add(m.Size)
453 1 : }
454 : }
455 :
456 : // LatestUnref decrements the latest ref count associated with the backing
457 : // sstable.
458 2 : func (m *FileMetadata) LatestUnref() int32 {
459 2 : if m.Virtual {
460 1 : m.FileBacking.VirtualizedSize.Add(-m.Size)
461 1 : }
462 :
463 2 : v := m.FileBacking.latestVersionRefs.Add(-1)
464 2 : if invariants.Enabled && v < 0 {
465 0 : panic("pebble: invalid FileMetadata latest refcounting")
466 : }
467 2 : return v
468 : }
469 :
470 : // LatestRefs returns the latest ref count associated with the backing sstable.
471 2 : func (m *FileMetadata) LatestRefs() int32 {
472 2 : return m.FileBacking.latestVersionRefs.Load()
473 2 : }
474 :
475 : // SetCompactionState transitions this file's compaction state to the given
476 : // state. Protected by DB.mu.
477 2 : func (m *FileMetadata) SetCompactionState(to CompactionState) {
478 2 : if invariants.Enabled {
479 2 : transitionErr := func() error {
480 0 : return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
481 0 : }
482 2 : switch m.CompactionState {
483 2 : case CompactionStateNotCompacting:
484 2 : if to != CompactionStateCompacting {
485 0 : panic(transitionErr())
486 : }
487 2 : case CompactionStateCompacting:
488 2 : if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
489 0 : panic(transitionErr())
490 : }
491 0 : case CompactionStateCompacted:
492 0 : panic(transitionErr())
493 0 : default:
494 0 : panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
495 : }
496 : }
497 2 : m.CompactionState = to
498 : }
499 :
500 : // IsCompacting returns true if this file's compaction state is
501 : // CompactionStateCompacting. Protected by DB.mu.
502 2 : func (m *FileMetadata) IsCompacting() bool {
503 2 : return m.CompactionState == CompactionStateCompacting
504 2 : }
505 :
506 : // StatsValid returns true if the table stats have been populated. If StatValid
507 : // returns true, the Stats field may be read (with or without holding the
508 : // database mutex).
509 2 : func (m *FileMetadata) StatsValid() bool {
510 2 : return m.statsValid.Load()
511 2 : }
512 :
513 : // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
514 : // while populating TableStats and calling StatsMarkValud. Once stats are
515 : // populated, they must not be mutated.
516 2 : func (m *FileMetadata) StatsMarkValid() {
517 2 : m.statsValid.Store(true)
518 2 : }
519 :
520 : // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
521 : // and overall table bounds with the given smallest and largest keys. The
522 : // smallest and largest bounds may not be extended if the table already has a
523 : // bound that is smaller or larger, respectively. The receiver is returned.
524 : // NB: calling this method should be preferred to manually setting the bounds by
525 : // manipulating the fields directly, to maintain certain invariants.
526 : func (m *FileMetadata) ExtendPointKeyBounds(
527 : cmp Compare, smallest, largest InternalKey,
528 2 : ) *FileMetadata {
529 2 : // Update the point key bounds.
530 2 : if !m.HasPointKeys {
531 2 : m.SmallestPointKey, m.LargestPointKey = smallest, largest
532 2 : m.HasPointKeys = true
533 2 : } else {
534 2 : if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
535 2 : m.SmallestPointKey = smallest
536 2 : }
537 2 : if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
538 2 : m.LargestPointKey = largest
539 2 : }
540 : }
541 : // Update the overall bounds.
542 2 : m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
543 2 : return m
544 : }
545 :
546 : // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
547 : // and overall table bounds with the given smallest and largest keys. The
548 : // smallest and largest bounds may not be extended if the table already has a
549 : // bound that is smaller or larger, respectively. The receiver is returned.
550 : // NB: calling this method should be preferred to manually setting the bounds by
551 : // manipulating the fields directly, to maintain certain invariants.
552 : func (m *FileMetadata) ExtendRangeKeyBounds(
553 : cmp Compare, smallest, largest InternalKey,
554 2 : ) *FileMetadata {
555 2 : // Update the range key bounds.
556 2 : if !m.HasRangeKeys {
557 2 : m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
558 2 : m.HasRangeKeys = true
559 2 : } else {
560 1 : if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
561 0 : m.SmallestRangeKey = smallest
562 0 : }
563 1 : if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
564 1 : m.LargestRangeKey = largest
565 1 : }
566 : }
567 : // Update the overall bounds.
568 2 : m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
569 2 : return m
570 : }
571 :
572 : // extendOverallBounds attempts to extend the overall table lower and upper
573 : // bounds. The given bounds may not be used if a lower or upper bound already
574 : // exists that is smaller or larger than the given keys, respectively. The given
575 : // boundType will be used if the bounds are updated.
576 : func (m *FileMetadata) extendOverallBounds(
577 : cmp Compare, smallest, largest InternalKey, bTyp boundType,
578 2 : ) {
579 2 : if !m.boundsSet {
580 2 : m.Smallest, m.Largest = smallest, largest
581 2 : m.boundsSet = true
582 2 : m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
583 2 : } else {
584 2 : if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
585 2 : m.Smallest = smallest
586 2 : m.boundTypeSmallest = bTyp
587 2 : }
588 2 : if base.InternalCompare(cmp, largest, m.Largest) > 0 {
589 2 : m.Largest = largest
590 2 : m.boundTypeLargest = bTyp
591 2 : }
592 : }
593 : }
594 :
595 : // Overlaps returns true if the file key range overlaps with the given range.
596 2 : func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
597 2 : if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
598 2 : // f is completely before the specified range; no overlap.
599 2 : return false
600 2 : }
601 2 : if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) {
602 2 : // f is completely after the specified range; no overlap.
603 2 : return false
604 2 : }
605 2 : return true
606 : }
607 :
608 : // ContainedWithinSpan returns true if the file key range completely overlaps with the
609 : // given range ("end" is assumed to exclusive).
610 1 : func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
611 1 : lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
612 1 : return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
613 1 : }
614 :
615 : // ContainsKeyType returns whether or not the file contains keys of the provided
616 : // type.
617 2 : func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
618 2 : switch kt {
619 2 : case KeyTypePointAndRange:
620 2 : return true
621 2 : case KeyTypePoint:
622 2 : return m.HasPointKeys
623 2 : case KeyTypeRange:
624 2 : return m.HasRangeKeys
625 0 : default:
626 0 : panic("unrecognized key type")
627 : }
628 : }
629 :
630 : // SmallestBound returns the file's smallest bound of the key type. It returns a
631 : // false second return value if the file does not contain any keys of the key
632 : // type.
633 2 : func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
634 2 : switch kt {
635 0 : case KeyTypePointAndRange:
636 0 : return &m.Smallest, true
637 2 : case KeyTypePoint:
638 2 : return &m.SmallestPointKey, m.HasPointKeys
639 2 : case KeyTypeRange:
640 2 : return &m.SmallestRangeKey, m.HasRangeKeys
641 0 : default:
642 0 : panic("unrecognized key type")
643 : }
644 : }
645 :
646 : // LargestBound returns the file's largest bound of the key type. It returns a
647 : // false second return value if the file does not contain any keys of the key
648 : // type.
649 2 : func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
650 2 : switch kt {
651 0 : case KeyTypePointAndRange:
652 0 : return &m.Largest, true
653 2 : case KeyTypePoint:
654 2 : return &m.LargestPointKey, m.HasPointKeys
655 2 : case KeyTypeRange:
656 2 : return &m.LargestRangeKey, m.HasRangeKeys
657 0 : default:
658 0 : panic("unrecognized key type")
659 : }
660 : }
661 :
// Bit masks used by boundsMarker to encode bound information in a single byte.
const (
	maskContainsPointKeys = 1 << 0 // table contains point keys
	maskSmallest          = 1 << 1 // smallest overall bound is a point key
	maskLargest           = 1 << 2 // largest overall bound is a point key
)
667 :
668 : // boundsMarker returns a marker byte whose bits encode the following
669 : // information (in order from least significant bit):
670 : // - if the table contains point keys
671 : // - if the table's smallest key is a point key
672 : // - if the table's largest key is a point key
673 2 : func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
674 2 : if m.HasPointKeys {
675 2 : sentinel |= maskContainsPointKeys
676 2 : }
677 2 : switch m.boundTypeSmallest {
678 2 : case boundTypePointKey:
679 2 : sentinel |= maskSmallest
680 2 : case boundTypeRangeKey:
681 : // No op - leave bit unset.
682 0 : default:
683 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
684 : }
685 2 : switch m.boundTypeLargest {
686 2 : case boundTypePointKey:
687 2 : sentinel |= maskLargest
688 2 : case boundTypeRangeKey:
689 : // No op - leave bit unset.
690 0 : default:
691 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
692 : }
693 2 : return
694 : }
695 :
696 : // String implements fmt.Stringer, printing the file number and the overall
697 : // table bounds.
698 2 : func (m *FileMetadata) String() string {
699 2 : return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
700 2 : }
701 :
702 : // DebugString returns a verbose representation of FileMetadata, typically for
703 : // use in tests and debugging, returning the file number and the point, range
704 : // and overall bounds for the table.
705 2 : func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
706 2 : var b bytes.Buffer
707 2 : fmt.Fprintf(&b, "%s:[%s-%s]",
708 2 : m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
709 2 : if !verbose {
710 2 : return b.String()
711 2 : }
712 1 : fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
713 1 : if m.HasPointKeys {
714 1 : fmt.Fprintf(&b, " points:[%s-%s]",
715 1 : m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
716 1 : }
717 1 : if m.HasRangeKeys {
718 1 : fmt.Fprintf(&b, " ranges:[%s-%s]",
719 1 : m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
720 1 : }
721 1 : return b.String()
722 : }
723 :
// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation.
//
// The parser consumes fields in groups of three, so the input must tokenize
// into a multiple of three fields; the loop below relies on that exact
// consumption order.
func ParseFileMetadataDebug(s string) (*FileMetadata, error) {
	// Split lines of the form:
	//  000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
	// The separators ':', '[', '-', ']' and whitespace are all dropped, so each
	// "prefix:[a-b]" group contributes exactly three fields.
	fields := strings.FieldsFunc(s, func(c rune) bool {
		switch c {
		case ':', '[', '-', ']':
			return true
		default:
			return unicode.IsSpace(c) // NB: also trim whitespace padding.
		}
	})
	if len(fields)%3 != 0 {
		return nil, errors.Newf("malformed input: %s", s)
	}
	m := &FileMetadata{}
	for len(fields) > 0 {
		prefix := fields[0]
		// seqnums carries two integers rather than two internal keys, so it is
		// handled before the generic key-parsing path below.
		if prefix == "seqnums" {
			smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum
			fields = fields[3:]
			continue
		}
		smallest := base.ParsePrettyInternalKey(fields[1])
		largest := base.ParsePrettyInternalKey(fields[2])
		switch prefix {
		case "points":
			m.SmallestPointKey, m.LargestPointKey = smallest, largest
			m.HasPointKeys = true
		case "ranges":
			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
			m.HasRangeKeys = true
		default:
			// Any other prefix is the file number followed by the overall
			// table bounds.
			fileNum, err := strconv.ParseUint(prefix, 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.FileNum = base.FileNum(fileNum)
			m.Smallest, m.Largest = smallest, largest
			m.boundsSet = true
		}
		fields = fields[3:]
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	m.InitPhysicalBacking()
	return m, nil
}
786 :
// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent. It checks that: the file has at least one of point
// or range keys; each bound pair is ordered; the point/range key bounds fall
// within the overall bounds; and that FileMetadata.Init was called (a
// FileBacking is set).
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.

	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}

	// Point key validation.

	if m.HasPointKeys {
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		// The point key bounds must be contained within the overall bounds.
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}

	// Range key validation.

	if m.HasRangeKeys {
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		// The range key bounds must be contained within the overall bounds.
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}

	// Ensure that FileMetadata.Init was called.
	if m.FileBacking == nil {
		return base.CorruptionErrorf("file metadata FileBacking not set")
	}

	return nil
}
853 :
854 : // TableInfo returns a subset of the FileMetadata state formatted as a
855 : // TableInfo.
856 2 : func (m *FileMetadata) TableInfo() TableInfo {
857 2 : return TableInfo{
858 2 : FileNum: m.FileNum,
859 2 : Size: m.Size,
860 2 : Smallest: m.Smallest,
861 2 : Largest: m.Largest,
862 2 : SmallestSeqNum: m.SmallestSeqNum,
863 2 : LargestSeqNum: m.LargestSeqNum,
864 2 : }
865 2 : }
866 :
// cmpUint64 three-way compares a and b, returning -1, 0 or +1.
func cmpUint64(a, b uint64) int {
	if a < b {
		return -1
	}
	if a > b {
		return +1
	}
	return 0
}
877 :
878 2 : func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
879 2 : // NB: This is the same ordering that RocksDB uses for L0 files.
880 2 :
881 2 : // Sort first by largest sequence number.
882 2 : if m.LargestSeqNum != b.LargestSeqNum {
883 2 : return cmpUint64(m.LargestSeqNum, b.LargestSeqNum)
884 2 : }
885 : // Then by smallest sequence number.
886 2 : if m.SmallestSeqNum != b.SmallestSeqNum {
887 2 : return cmpUint64(m.SmallestSeqNum, b.SmallestSeqNum)
888 2 : }
889 : // Break ties by file number.
890 2 : return cmpUint64(uint64(m.FileNum), uint64(b.FileNum))
891 : }
892 :
// lessSeqNum reports whether m orders before b under the seqnum ordering
// defined by cmpSeqNum.
func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
	return m.cmpSeqNum(b) < 0
}
896 :
// cmpSmallestKey three-way compares two files by their smallest internal key.
func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
}
900 :
901 : // KeyRange returns the minimum smallest and maximum largest internalKey for
902 : // all the FileMetadata in iters.
903 2 : func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
904 2 : first := true
905 2 : for _, iter := range iters {
906 2 : for meta := iter.First(); meta != nil; meta = iter.Next() {
907 2 : if first {
908 2 : first = false
909 2 : smallest, largest = meta.Smallest, meta.Largest
910 2 : continue
911 : }
912 2 : if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
913 2 : smallest = meta.Smallest
914 2 : }
915 2 : if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
916 2 : largest = meta.Largest
917 2 : }
918 : }
919 : }
920 2 : return smallest, largest
921 : }
922 :
// bySeqNum adapts a slice of files to sort.Interface, ordering by the seqnum
// ordering defined by FileMetadata.lessSeqNum.
type bySeqNum []*FileMetadata

func (b bySeqNum) Len() int { return len(b) }
func (b bySeqNum) Less(i, j int) bool {
	return b[i].lessSeqNum(b[j])
}
func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// SortBySeqNum sorts the specified files by increasing sequence number.
func SortBySeqNum(files []*FileMetadata) {
	sort.Sort(bySeqNum(files))
}
935 :
// bySmallest adapts a slice of files to sort.Interface, ordering by smallest
// internal key under the supplied user-key comparator.
type bySmallest struct {
	files []*FileMetadata
	cmp   Compare
}

func (b bySmallest) Len() int { return len(b.files) }
func (b bySmallest) Less(i, j int) bool {
	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
}
func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }

// SortBySmallest sorts the specified files by smallest key using the supplied
// comparison function to order user keys.
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}
952 :
// overlaps returns the slice of files in iter whose user-key ranges intersect
// [start, end] (or [start, end) when exclusiveEnd is true). It positions two
// clones of iter at the first and last overlapping files — the positioning is
// the side effect that matters, which is why the file returns below are
// deliberately discarded — and returns a LevelSlice bounded by those
// iterators. Callers must ensure the files in iter are non-overlapping and
// sorted by key (i.e. a non-L0 level).
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
	startIter := iter.Clone()
	{
		startIterFile := startIter.SeekGE(cmp, start)
		// SeekGE compares user keys. The user key `start` may be equal to the
		// f.Largest because f.Largest is a range deletion sentinel, indicating
		// that the user key `start` is NOT contained within the file f. If
		// that's the case, we can narrow the overlapping bounds to exclude the
		// file with the sentinel.
		if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
			cmp(startIterFile.Largest.UserKey, start) == 0 {
			startIterFile = startIter.Next()
		}
		_ = startIterFile // Ignore unused assignment.
	}

	endIter := iter.Clone()
	{
		endIterFile := endIter.SeekGE(cmp, end)

		if !exclusiveEnd {
			// endIter is now pointing at the *first* file with a largest key >= end.
			// If there are multiple files including the user key `end`, we want all
			// of them, so move forward.
			for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
				endIterFile = endIter.Next()
			}
		}

		// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
		// or nexted too far because Largest.UserKey equaled `end`, go back.
		//
		// Consider !exclusiveEnd and end = 'f', with the following file bounds:
		//
		//     [b,d] [e, f] [f, f] [g, h]
		//
		// the above for loop will Next until it arrives at [g, h]. We need to
		// observe that g > f, and Prev to the file with bounds [f, f].
		if endIterFile == nil {
			endIterFile = endIter.Prev()
		} else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
			endIterFile = endIter.Prev()
		}
		_ = endIterFile // Ignore unused assignment.
	}
	return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
}
1000 :
// NumLevels is the number of levels a Version contains (L0 through L6).
const NumLevels = 7
1003 :
// NewVersion constructs a new Version with the provided files. It requires
// the provided files are already well-ordered. It's intended for testing.
// It panics if initializing the L0 sublevels fails (e.g. the provided L0
// files violate the sublevel invariants).
func NewVersion(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
) *Version {
	var v Version
	for l := range files {
		// NB: We specifically insert `files` into the B-Tree in the order
		// they appear within `files`. Some tests depend on this behavior in
		// order to test consistency checking, etc. Once we've constructed the
		// initial B-Tree, we swap out the btreeCmp for the correct one.
		// TODO(jackson): Adjust or remove the tests and remove this.
		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
		v.Levels[l].level = l
		// L0 is ordered by seqnum; every other level by smallest key.
		if l == 0 {
			v.Levels[l].tree.cmp = btreeCmpSeqNum
		} else {
			v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
		}
		for _, f := range files[l] {
			v.Levels[l].totalSize += f.Size
		}
	}
	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
		panic(err)
	}
	return &v
}
1032 :
// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contain in
// level 0 sstables. In particular, it is valid for one level 0 sstable to have
// the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	// refs counts the references held on this version; when it drops to
	// zero the version's file references are released (see Unref).
	refs atomic.Int32

	// The level 0 sstables are organized in a series of sublevels. Similar to
	// the seqnum invariant in normal levels, there is no internal key in a
	// higher level table that has both the same user key and a higher sequence
	// number. Within a sublevel, tables are sorted by their internal key range
	// and any two tables at the same sublevel do not overlap. Unlike the normal
	// levels, sublevel n contains older tables (lower sequence numbers) than
	// sublevel n+1.
	//
	// The L0Sublevels struct is mostly used for compaction picking. As most
	// internal data structures in it are only necessary for compaction picking
	// and not for iterator creation, the reference to L0Sublevels is nil'd
	// after this version becomes the non-newest version, to reduce memory
	// usage.
	//
	// L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
	// in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
	// a reference to that slice, as that slice is necessary for iterator
	// creation and needs to outlast L0Sublevels.
	L0Sublevels     *L0Sublevels
	L0SublevelFiles []LevelSlice

	// Levels holds the file metadata for each of the NumLevels levels of the
	// LSM.
	Levels [NumLevels]LevelMetadata

	// RangeKeyLevels holds a subset of the same files as Levels that contain range
	// keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
	// duplication should be minimal, as range keys are expected to be rare.
	RangeKeyLevels [NumLevels]LevelMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held.
	Deleted func(obsolete []*FileBacking)

	// Stats holds aggregated stats about the version maintained from
	// version to version.
	Stats struct {
		// MarkedForCompaction records the count of files marked for
		// compaction within the version.
		MarkedForCompaction int
	}

	// The list the version is linked into.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}
1106 :
// String implements fmt.Stringer, printing the FileMetadata for each level in
// the Version.
func (v *Version) String() string {
	return v.string(base.DefaultFormatter, false)
}
1112 :
// DebugString returns an alternative format to String() which includes sequence
// number and kind information for the sstable boundaries.
func (v *Version) DebugString(format base.FormatKey) string {
	return v.string(format, true)
}
1118 :
1119 2 : func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
1120 2 : var buf bytes.Buffer
1121 2 : for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
1122 2 : fmt.Fprintf(&buf, "0.%d:\n", sublevel)
1123 2 : sublevels[sublevel].Each(func(f *FileMetadata) {
1124 2 : fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose))
1125 2 : })
1126 : }
1127 2 : return buf.String()
1128 : }
1129 :
1130 1 : func (v *Version) string(format base.FormatKey, verbose bool) string {
1131 1 : var buf bytes.Buffer
1132 1 : if len(v.L0SublevelFiles) > 0 {
1133 1 : fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles))
1134 1 : }
1135 1 : for level := 1; level < NumLevels; level++ {
1136 1 : if v.Levels[level].Empty() {
1137 1 : continue
1138 : }
1139 1 : fmt.Fprintf(&buf, "%d:\n", level)
1140 1 : iter := v.Levels[level].Iter()
1141 1 : for f := iter.First(); f != nil; f = iter.Next() {
1142 1 : fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose))
1143 1 : }
1144 : }
1145 1 : return buf.String()
1146 : }
1147 :
1148 : // ParseVersionDebug parses a Version from its DebugString output.
1149 : func ParseVersionDebug(
1150 : cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string,
1151 1 : ) (*Version, error) {
1152 1 : var level int
1153 1 : var files [NumLevels][]*FileMetadata
1154 1 : for _, l := range strings.Split(s, "\n") {
1155 1 : l = strings.TrimSpace(l)
1156 1 :
1157 1 : switch l[:2] {
1158 1 : case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:":
1159 1 : var err error
1160 1 : level, err = strconv.Atoi(l[:1])
1161 1 : if err != nil {
1162 0 : return nil, err
1163 0 : }
1164 1 : default:
1165 1 : m, err := ParseFileMetadataDebug(l)
1166 1 : if err != nil {
1167 0 : return nil, err
1168 0 : }
1169 : // If we only parsed overall bounds, default to setting the point bounds.
1170 1 : if !m.HasPointKeys && !m.HasRangeKeys {
1171 0 : m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
1172 0 : m.HasPointKeys = true
1173 0 : }
1174 1 : files[level] = append(files[level], m)
1175 : }
1176 : }
1177 : // Reverse the order of L0 files. This ensures we construct the same
1178 : // sublevels. (They're printed from higher sublevel to lower, which means in
1179 : // a partial order that represents newest to oldest).
1180 1 : for i := 0; i < len(files[0])/2; i++ {
1181 1 : files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i]
1182 1 : }
1183 1 : return NewVersion(cmp, formatKey, flushSplitBytes, files), nil
1184 : }
1185 :
// Refs returns the number of references to the version.
func (v *Version) Refs() int32 {
	return v.refs.Load()
}
1190 :
// Ref increments the version refcount.
func (v *Version) Ref() {
	v.refs.Add(1)
}
1195 :
// Unref decrements the version refcount. If the last reference to the version
// was removed, the version is removed from the list of versions and the
// Deleted callback is invoked. Requires that the VersionList mutex is NOT
// locked.
func (v *Version) Unref() {
	if v.refs.Add(-1) == 0 {
		l := v.list
		l.mu.Lock()
		l.Remove(v)
		obsolete := v.unrefFiles()
		// Hand the backings of the now-obsolete files to the Deleted
		// callback, which is invoked with the list mutex held.
		fileBacking := make([]*FileBacking, len(obsolete))
		for i, f := range obsolete {
			fileBacking[i] = f.FileBacking
		}
		v.Deleted(fileBacking)
		l.mu.Unlock()
	}
}
1214 :
// UnrefLocked decrements the version refcount. If the last reference to the
// version was removed, the version is removed from the list of versions and
// the Deleted callback is invoked. Requires that the VersionList mutex is
// already locked.
func (v *Version) UnrefLocked() {
	if v.refs.Add(-1) == 0 {
		v.list.Remove(v)
		obsolete := v.unrefFiles()
		// Hand the backings of the now-obsolete files to the Deleted
		// callback (the list mutex is already held by the caller).
		fileBacking := make([]*FileBacking, len(obsolete))
		for i, f := range obsolete {
			fileBacking[i] = f.FileBacking
		}
		v.Deleted(fileBacking)
	}
}
1230 :
// unrefFiles releases this version's reference on every file in every level
// (including the RangeKeyLevels duplicates) and returns the files whose last
// reference was just dropped.
// NOTE(review): the range copies each LevelMetadata value and release() is
// invoked on the copy — presumably safe because the refcounted state is
// behind pointers inside LevelMetadata; confirm before restructuring.
func (v *Version) unrefFiles() []*FileMetadata {
	var obsolete []*FileMetadata
	for _, lm := range v.Levels {
		obsolete = append(obsolete, lm.release()...)
	}
	for _, lm := range v.RangeKeyLevels {
		obsolete = append(obsolete, lm.release()...)
	}
	return obsolete
}
1241 :
// Next returns the next version in the list of versions.
func (v *Version) Next() *Version {
	return v.next
}
1246 :
1247 : // InitL0Sublevels initializes the L0Sublevels
1248 : func (v *Version) InitL0Sublevels(
1249 : cmp Compare, formatKey base.FormatKey, flushSplitBytes int64,
1250 2 : ) error {
1251 2 : var err error
1252 2 : v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
1253 2 : if err == nil && v.L0Sublevels != nil {
1254 2 : v.L0SublevelFiles = v.L0Sublevels.Levels
1255 2 : }
1256 2 : return err
1257 : }
1258 :
1259 : // Contains returns a boolean indicating whether the provided file exists in
1260 : // the version at the given level. If level is non-zero then Contains binary
1261 : // searches among the files. If level is zero, Contains scans the entire
1262 : // level.
1263 2 : func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
1264 2 : iter := v.Levels[level].Iter()
1265 2 : if level > 0 {
1266 2 : overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
1267 2 : m.Largest.IsExclusiveSentinel())
1268 2 : iter = overlaps.Iter()
1269 2 : }
1270 2 : for f := iter.First(); f != nil; f = iter.Next() {
1271 2 : if f == m {
1272 2 : return true
1273 2 : }
1274 : }
1275 2 : return false
1276 : }
1277 :
// Overlaps returns all elements of v.files[level] whose user key range
// intersects the given range. If level is non-zero then the user key ranges of
// v.files[level] are assumed to not overlap (although they may touch). If level
// is zero then that assumption cannot be made, and the [start, end] range is
// expanded to the union of those matching ranges so far and the computation is
// repeated until [start, end] stabilizes.
// The returned files are a subsequence of the input files, i.e., the ordering
// is not changed.
func (v *Version) Overlaps(
	level int, cmp Compare, start, end []byte, exclusiveEnd bool,
) LevelSlice {
	if level == 0 {
		// Indices that have been selected as overlapping.
		l0 := v.Levels[level]
		l0Iter := l0.Iter()
		selectedIndices := make([]bool, l0.Len())
		numSelected := 0
		var slice LevelSlice
		// Fixpoint loop: keep rescanning until a full pass selects no new
		// files and does not expand [start, end].
		for {
			restart := false
			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
				selected := selectedIndices[i]
				if selected {
					continue
				}
				if !meta.Overlaps(cmp, start, end, exclusiveEnd) {
					// meta is completely outside the specified range; skip it.
					continue
				}
				// Overlaps.
				selectedIndices[i] = true
				numSelected++

				smallest := meta.Smallest.UserKey
				largest := meta.Largest.UserKey
				// Since level == 0, check if the newly added fileMetadata has
				// expanded the range. We expand the range immediately for files
				// we have remaining to check in this loop. All already checked
				// and unselected files will need to be rechecked via the
				// restart below.
				if cmp(smallest, start) < 0 {
					start = smallest
					restart = true
				}
				if v := cmp(largest, end); v > 0 {
					end = largest
					exclusiveEnd = meta.Largest.IsExclusiveSentinel()
					restart = true
				} else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
					// Only update the exclusivity of our existing `end`
					// bound.
					exclusiveEnd = false
					restart = true
				}
			}

			if !restart {
				// Construct a B-Tree containing only the matching items.
				var tr btree
				tr.cmp = v.Levels[level].tree.cmp
				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
					if selectedIndices[i] {
						err := tr.Insert(meta)
						if err != nil {
							panic(err)
						}
					}
				}
				slice = newLevelSlice(tr.Iter())
				// TODO(jackson): Avoid the oddity of constructing and
				// immediately releasing a B-Tree. Make LevelSlice an
				// interface?
				tr.Release()
				break
			}
			// Continue looping to retry the files that were not selected.
		}
		return slice
	}

	// Non-L0 levels are sorted and non-overlapping; delegate to the simple
	// iterator-based search.
	return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
}
1360 :
// CheckOrdering checks that the files are consistent with respect to
// seqnum ordering (for level 0 sublevels and files) and increasing and non-
// overlapping internal key ranges (for level non-0 files). It delegates the
// per-level invariants to the free function CheckOrdering, appending the
// version's DebugString to any error for context.
func (v *Version) CheckOrdering(cmp Compare, format base.FormatKey) error {
	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
		if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}

	for level, lm := range v.Levels {
		if err := CheckOrdering(cmp, format, Level(level), lm.Iter()); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}
	return nil
}
1379 :
// VersionList holds a list of versions. The versions are ordered from oldest
// to newest. It is implemented as a circular doubly-linked list with a
// sentinel root node.
type VersionList struct {
	mu   *sync.Mutex
	root Version
}
1386 :
// Init initializes the version list, linking the sentinel root to itself so
// the list starts empty.
func (l *VersionList) Init(mu *sync.Mutex) {
	l.mu = mu
	l.root.next = &l.root
	l.root.prev = &l.root
}
1393 :
// Empty returns true if the list is empty, and false otherwise.
func (l *VersionList) Empty() bool {
	return l.root.next == &l.root
}
1398 :
// Front returns the oldest version in the list. Note that this version is only
// valid if Empty() returns false (with an empty list, Front returns the
// sentinel root node).
func (l *VersionList) Front() *Version {
	return l.root.next
}
1404 :
// Back returns the newest version in the list. Note that this version is only
// valid if Empty() returns false (with an empty list, Back returns the
// sentinel root node).
func (l *VersionList) Back() *Version {
	return l.root.prev
}
1410 :
// PushBack adds a new version to the back of the list. This new version
// becomes the "newest" version in the list. Panics if v is already linked
// into a list.
func (l *VersionList) PushBack(v *Version) {
	if v.list != nil || v.prev != nil || v.next != nil {
		panic("pebble: version list is inconsistent")
	}
	// Splice v in just before the sentinel root (the back of the list).
	v.prev = l.root.prev
	v.prev.next = v
	v.next = &l.root
	v.next.prev = v
	v.list = l
	// Let L0Sublevels on the second newest version get GC'd, as it is no longer
	// necessary. See the comment in Version.
	v.prev.L0Sublevels = nil
}
1426 :
// Remove removes the specified version from the list. Panics if v is the
// sentinel root or belongs to a different list.
func (l *VersionList) Remove(v *Version) {
	if v == &l.root {
		panic("pebble: cannot remove version list root node")
	}
	if v.list != l {
		panic("pebble: version list is inconsistent")
	}
	v.prev.next = v.next
	v.next.prev = v.prev
	v.next = nil // avoid memory leaks
	v.prev = nil // avoid memory leaks
	v.list = nil // avoid memory leaks
}
1441 :
// CheckOrdering checks that the files are consistent with respect to
// seqnums (for level 0 files -- see detailed comment below) and increasing and non-
// overlapping internal key ranges (for non-level 0 files). Returns a
// corruption error describing the first violated invariant, or nil.
func CheckOrdering(cmp Compare, format base.FormatKey, level Level, files LevelIterator) error {
	// The invariants to check for L0 sublevels are the same as the ones to
	// check for all other levels. However, if L0 is not organized into
	// sublevels, or if all L0 files are being passed in, we do the legacy L0
	// checks, defined in the detailed comment below.
	if level == Level(0) {
		// We have 2 kinds of files:
		// - Files with exactly one sequence number: these could be either ingested files
		//   or flushed files. We cannot tell the difference between them based on FileMetadata,
		//   so our consistency checking here uses the weaker checks assuming it is a narrow
		//   flushed file. We cannot error on ingested files having sequence numbers coincident
		//   with flushed files as the seemingly ingested file could just be a flushed file
		//   with just one key in it which is a truncated range tombstone sharing sequence numbers
		//   with other files in the same flush.
		// - Files with multiple sequence numbers: these are necessarily flushed files.
		//
		// Three cases of overlapping sequence numbers:
		// Case 1:
		// An ingested file contained in the sequence numbers of the flushed file -- it must be
		// fully contained (not coincident with either end of the flushed file) since the memtable
		// must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
		// num b, and the memtable got a subsequent update that was given sequence num b+1, before
		// being flushed.
		//
		// So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
		// third file are inconsistent with each other. So comparing adjacent files is insufficient
		// for consistency checking.
		//
		// Visually we have something like
		// x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
		//     y       y  y        y             (y's represent ingested files)
		// And these are ordered in increasing order of y. Note that y's must be unique.
		//
		// Case 2:
		// A flushed file that did not overlap in keys with any file in any level, but does overlap
		// in the file key intervals. This file is placed in L0 since it overlaps in the file
		// key intervals but since it has no overlapping data, it is assigned a sequence number
		// of 0 in RocksDB. We handle this case for compatibility with RocksDB.
		//
		// Case 3:
		// A sequence of flushed files that overlap in sequence numbers with one another,
		// but do not overlap in keys inside the sstables. These files correspond to
		// partitioned flushes or the results of intra-L0 compactions of partitioned
		// flushes.
		//
		// Since these types of SSTables violate most other sequence number
		// overlap invariants, and handling this case is important for compatibility
		// with future versions of pebble, this method relaxes most L0 invariant
		// checks.

		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if prev == nil {
				continue
			}
			// Validate that the sorting is sane.
			if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
				// Multiple files satisfying case 2 mentioned above.
			} else if !prev.lessSeqNum(f) {
				return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
					errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
					errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
					errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
			}
		}
	} else {
		// Non-L0 levels: each file must be internally consistent, files must
		// be ordered by smallest key, and adjacent files must not overlap.
		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if err := f.Validate(cmp, format); err != nil {
				return errors.Wrapf(err, "%s ", level)
			}
			if prev != nil {
				if prev.cmpSmallestKey(f, cmp) >= 0 {
					return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
						f.Smallest.Pretty(format), f.Largest.Pretty(format))
				}
				if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
					return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
						f.Smallest.Pretty(format), f.Largest.Pretty(format))
				}
			}
		}
	}
	return nil
}
|