LCOV - code coverage report
Current view: top level - pebble/internal/manifest - version.go (source / functions) Hit Total Coverage
Test: 2024-11-23 08:16Z 3ec779d3 - tests only.lcov Lines: 743 869 85.5 %
Date: 2024-11-23 08:17:02 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package manifest
       6             : 
       7             : import (
       8             :         "bytes"
       9             :         stdcmp "cmp"
      10             :         "fmt"
      11             :         "slices"
      12             :         "sort"
      13             :         "strings"
      14             :         "sync"
      15             :         "sync/atomic"
      16             : 
      17             :         "github.com/cockroachdb/errors"
      18             :         "github.com/cockroachdb/pebble/internal/base"
      19             :         "github.com/cockroachdb/pebble/internal/invariants"
      20             :         "github.com/cockroachdb/pebble/sstable"
      21             :         "github.com/cockroachdb/pebble/sstable/block"
      22             : )
      23             : 
      24             : // Compare exports the base.Compare type.
      25             : type Compare = base.Compare
      26             : 
      27             : // InternalKey exports the base.InternalKey type.
      28             : type InternalKey = base.InternalKey
      29             : 
      30             : // TableInfo contains the common information for table related events.
      31             : type TableInfo struct {
      32             :         // FileNum is the internal DB identifier for the table.
      33             :         FileNum base.FileNum
      34             :         // Size is the size of the file in bytes.
      35             :         Size uint64
      36             :         // Smallest is the smallest internal key in the table.
      37             :         Smallest InternalKey
      38             :         // Largest is the largest internal key in the table.
      39             :         Largest InternalKey
      40             :         // SmallestSeqNum is the smallest sequence number in the table.
      41             :         SmallestSeqNum base.SeqNum
      42             :         // LargestSeqNum is the largest sequence number in the table.
      43             :         LargestSeqNum base.SeqNum
      44             : }
      45             : 
      46             : // TableStats contains statistics on a table used for compaction heuristics,
      47             : // and export via Metrics.
      48             : type TableStats struct {
      49             :         // The total number of entries in the table.
      50             :         NumEntries uint64
      51             :         // The number of point and range deletion entries in the table.
      52             :         NumDeletions uint64
      53             :         // NumRangeKeySets is the total number of range key sets in the table.
      54             :         //
      55             :         // NB: If there's a chance that the sstable contains any range key sets,
      56             :         // then NumRangeKeySets must be > 0.
      57             :         NumRangeKeySets uint64
      58             :         // Estimate of the total disk space that may be dropped by this table's
      59             :         // point deletions by compacting them.
      60             :         PointDeletionsBytesEstimate uint64
      61             :         // Estimate of the total disk space that may be dropped by this table's
      62             :         // range deletions by compacting them. This estimate is at data-block
      63             :         // granularity and is not updated if compactions beneath the table reduce
      64             :         // the amount of reclaimable disk space. It also does not account for
      65             :         // overlapping data in L0 and ignores L0 sublevels, but the error that
      66             :         // introduces is expected to be small.
      67             :         //
      68             :         // Tables in the bottommost level of the LSM may have a nonzero estimate if
      69             :         // snapshots or move compactions prevented the elision of their range
      70             :         // tombstones. A table in the bottommost level that was ingested into L6
      71             :         // will have a zero estimate, because the file's sequence numbers indicate
      72             :         // that the tombstone cannot drop any data contained within the file itself.
      73             :         RangeDeletionsBytesEstimate uint64
      74             :         // Total size of value blocks and value index block.
      75             :         ValueBlocksSize uint64
      76             :         // CompressionType is the compression type of the table.
      77             :         CompressionType block.Compression
      78             :         // TombstoneDenseBlocksRatio is the ratio of data blocks in this table that
      79             :         // fulfills at least one of the following:
      80             :         // 1. The block contains at least options.Experimental.NumDeletionsThreshold
      81             :         //    point tombstones.
      82             :         // 2. The ratio of the uncompressed size of point tombstones to the
      83             :         //    uncompressed size of the block is at least
      84             :         //    options.Experimental.DeletionSizeRatioThreshold.
      85             :         // This statistic is used to determine eligibility for a tombstone density
      86             :         // compaction.
      87             :         TombstoneDenseBlocksRatio float64
      88             : }
      89             : 
      90             : // boundType represents the type of key (point or range) present as the smallest
      91             : // and largest keys.
      92             : type boundType uint8
      93             : 
      94             : const (
      95             :         boundTypePointKey boundType = iota + 1
      96             :         boundTypeRangeKey
      97             : )
      98             : 
      99             : // CompactionState is the compaction state of a file.
     100             : //
     101             : // The following shows the valid state transitions:
     102             : //
     103             : //      NotCompacting --> Compacting --> Compacted
     104             : //            ^               |
     105             : //            |               |
     106             : //            +-------<-------+
     107             : //
     108             : // Input files to a compaction transition to Compacting when a compaction is
     109             : // picked. A file that has finished compacting typically transitions into the
     110             : // Compacted state, at which point it is effectively obsolete ("zombied") and
     111             : // will eventually be removed from the LSM. A file that has been move-compacted
     112             : // will transition from Compacting back into the NotCompacting state, signaling
     113             : // that the file may be selected for a subsequent compaction. A failed
     114             : // compaction will result in all input tables transitioning from Compacting to
     115             : // NotCompacting.
     116             : //
     117             : // This state is in-memory only. It is not persisted to the manifest.
     118             : type CompactionState uint8
     119             : 
     120             : // CompactionStates.
     121             : const (
     122             :         CompactionStateNotCompacting CompactionState = iota
     123             :         CompactionStateCompacting
     124             :         CompactionStateCompacted
     125             : )
     126             : 
     127             : // String implements fmt.Stringer.
     128           0 : func (s CompactionState) String() string {
     129           0 :         switch s {
     130           0 :         case CompactionStateNotCompacting:
     131           0 :                 return "NotCompacting"
     132           0 :         case CompactionStateCompacting:
     133           0 :                 return "Compacting"
     134           0 :         case CompactionStateCompacted:
     135           0 :                 return "Compacted"
     136           0 :         default:
     137           0 :                 panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
     138             :         }
     139             : }
     140             : 
     141             : // FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
     142             : // some version. FileMetadata does not contain the actual level of the sst,
     143             : // since such leveled-ssts can move across levels in different versions, while
     144             : // sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
     145             : // and virtual. Underlying both leveled-ssts is a backing-sst, for which the
     146             : // only state is FileBacking. A backing-sst is level-less. It is possible for a
     147             : // backing-sst to be referred to by a physical sst in one version and by one or
     148             : // more virtual ssts in one or more versions. A backing-sst becomes obsolete
     149             : // and can be deleted once it is no longer required by any physical or virtual
     150             : // sst in any version.
     151             : //
     152             : // We maintain some invariants:
     153             : //
     154             : //  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
     155             : //     and there will be exactly one FileMetadata associated with the FileNum.
     156             : //
     157             : //  2. Within a version, a backing-sst is either only referred to by one
     158             : //     physical sst or one or more virtual ssts.
     159             : //
     160             : //  3. Once a backing-sst is referred to by a virtual sst in the latest version,
     161             : //     it cannot go back to being referred to by a physical sst in any future
     162             : //     version.
     163             : //
     164             : // Once a physical sst is no longer needed by any version, we will no longer
     165             : // maintain the file metadata associated with it. We will still maintain the
     166             : // FileBacking associated with the physical sst if the backing sst is required
     167             : // by any virtual ssts in any version.
     168             : type FileMetadata struct {
     169             :         // AllowedSeeks is used to determine if a file should be picked for
     170             :         // a read triggered compaction. It is decremented when read sampling
     171             :         // in pebble.Iterator after every after every positioning operation
     172             :         // that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
     173             :         AllowedSeeks atomic.Int64
     174             : 
     175             :         // statsValid indicates if stats have been loaded for the table. The
     176             :         // TableStats structure is populated only if valid is true.
     177             :         statsValid atomic.Bool
     178             : 
     179             :         // FileBacking is the state which backs either a physical or virtual
     180             :         // sstables.
     181             :         FileBacking *FileBacking
     182             : 
     183             :         // InitAllowedSeeks is the inital value of allowed seeks. This is used
     184             :         // to re-set allowed seeks on a file once it hits 0.
     185             :         InitAllowedSeeks int64
     186             :         // FileNum is the file number.
     187             :         //
     188             :         // INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
     189             :         FileNum base.FileNum
     190             :         // Size is the size of the file, in bytes. Size is an approximate value for
     191             :         // virtual sstables.
     192             :         //
     193             :         // INVARIANTS:
     194             :         // - When !FileMetadata.Virtual, Size == FileBacking.Size.
     195             :         // - Size should be non-zero. Size 0 virtual sstables must not be created.
     196             :         Size uint64
     197             :         // File creation time in seconds since the epoch (1970-01-01 00:00:00
     198             :         // UTC). For ingested sstables, this corresponds to the time the file was
     199             :         // ingested. For virtual sstables, this corresponds to the wall clock time
     200             :         // when the FileMetadata for the virtual sstable was first created.
     201             :         CreationTime int64
     202             :         // LargestSeqNumAbsolute is an upper bound for the largest sequence number
     203             :         // in the table. This upper bound is guaranteed to be higher than any
     204             :         // sequence number any of the table's keys have held at any point in time
     205             :         // while the database has been open. Specifically, if the table contains
     206             :         // keys that have had their sequence numbers zeroed during a compaction,
     207             :         // LargestSeqNumAbsolute will be at least as high as the pre-zeroing
     208             :         // sequence number. LargestSeqNumAbsolute is NOT durably persisted, so after
     209             :         // a database restart it takes on the value of LargestSeqNum.
     210             :         LargestSeqNumAbsolute base.SeqNum
     211             :         // Lower and upper bounds for the smallest and largest sequence numbers in
     212             :         // the table, across both point and range keys. For physical sstables, these
     213             :         // values are tight bounds. For virtual sstables, there is no guarantee that
     214             :         // there will be keys with SmallestSeqNum or LargestSeqNum within virtual
     215             :         // sstable bounds.
     216             :         SmallestSeqNum base.SeqNum
     217             :         LargestSeqNum  base.SeqNum
     218             :         // SmallestPointKey and LargestPointKey are the inclusive bounds for the
     219             :         // internal point keys stored in the table. This includes RANGEDELs, which
     220             :         // alter point keys.
     221             :         // NB: these field should be set using ExtendPointKeyBounds. They are left
     222             :         // exported for reads as an optimization.
     223             :         SmallestPointKey InternalKey
     224             :         LargestPointKey  InternalKey
     225             :         // SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
     226             :         // internal range keys stored in the table.
     227             :         // NB: these field should be set using ExtendRangeKeyBounds. They are left
     228             :         // exported for reads as an optimization.
     229             :         SmallestRangeKey InternalKey
     230             :         LargestRangeKey  InternalKey
     231             :         // Smallest and Largest are the inclusive bounds for the internal keys stored
     232             :         // in the table, across both point and range keys.
     233             :         // NB: these fields are derived from their point and range key equivalents,
     234             :         // and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
     235             :         Smallest InternalKey
     236             :         Largest  InternalKey
     237             :         // Stats describe table statistics. Protected by DB.mu.
     238             :         //
     239             :         // For virtual sstables, set stats upon virtual sstable creation as
     240             :         // asynchronous computation of stats is not currently supported.
     241             :         //
     242             :         // TODO(bananabrick): To support manifest replay for virtual sstables, we
     243             :         // probably need to compute virtual sstable stats asynchronously. Otherwise,
     244             :         // we'd have to write virtual sstable stats to the version edit.
     245             :         Stats TableStats
     246             : 
     247             :         // For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
     248             :         // pick L0 compactions. Only accurate for the most recent Version.
     249             :         SubLevel         int
     250             :         L0Index          int
     251             :         minIntervalIndex int
     252             :         maxIntervalIndex int
     253             : 
     254             :         // NB: the alignment of this struct is 8 bytes. We pack all the bools to
     255             :         // ensure an optimal packing.
     256             : 
     257             :         // IsIntraL0Compacting is set to True if this file is part of an intra-L0
     258             :         // compaction. When it's true, IsCompacting must also return true. If
     259             :         // Compacting is true and IsIntraL0Compacting is false for an L0 file, the
     260             :         // file must be part of a compaction to Lbase.
     261             :         IsIntraL0Compacting bool
     262             :         CompactionState     CompactionState
     263             :         // True if compaction of this file has been explicitly requested.
     264             :         // Previously, RocksDB and earlier versions of Pebble allowed this
     265             :         // flag to be set by a user table property collector. Some earlier
     266             :         // versions of Pebble respected this flag, while other more recent
     267             :         // versions ignored this flag.
     268             :         //
     269             :         // More recently this flag has been repurposed to facilitate the
     270             :         // compaction of 'atomic compaction units'. Files marked for
     271             :         // compaction are compacted in a rewrite compaction at the lowest
     272             :         // possible compaction priority.
     273             :         //
     274             :         // NB: A count of files marked for compaction is maintained on
     275             :         // Version, and compaction picking reads cached annotations
     276             :         // determined by this field.
     277             :         //
     278             :         // Protected by DB.mu.
     279             :         MarkedForCompaction bool
     280             :         // HasPointKeys tracks whether the table contains point keys (including
     281             :         // RANGEDELs). If a table contains only range deletions, HasPointsKeys is
     282             :         // still true.
     283             :         HasPointKeys bool
     284             :         // HasRangeKeys tracks whether the table contains any range keys.
     285             :         HasRangeKeys bool
     286             :         // smallestSet and largestSet track whether the overall bounds have been set.
     287             :         boundsSet bool
     288             :         // boundTypeSmallest and boundTypeLargest provide an indication as to which
     289             :         // key type (point or range) corresponds to the smallest and largest overall
     290             :         // table bounds.
     291             :         boundTypeSmallest, boundTypeLargest boundType
     292             :         // Virtual is true if the FileMetadata belongs to a virtual sstable.
     293             :         Virtual bool
     294             : 
     295             :         // SyntheticPrefix is used to prepend a prefix to all keys and/or override all
     296             :         // suffixes in a table; used for some virtual tables.
     297             :         SyntheticPrefixAndSuffix sstable.SyntheticPrefixAndSuffix
     298             : }
     299             : 
     300             : // InternalKeyBounds returns the set of overall table bounds.
     301           0 : func (m *FileMetadata) InternalKeyBounds() (InternalKey, InternalKey) {
     302           0 :         return m.Smallest, m.Largest
     303           0 : }
     304             : 
     305             : // UserKeyBounds returns the user key bounds that correspond to m.Smallest and
     306             : // Largest. Because we do not allow split user keys, the user key bounds of
     307             : // files within a level do not overlap.
     308           1 : func (m *FileMetadata) UserKeyBounds() base.UserKeyBounds {
     309           1 :         return base.UserKeyBoundsFromInternal(m.Smallest, m.Largest)
     310           1 : }
     311             : 
     312             : // UserKeyBoundsByType returns the user key bounds for the given key types.
     313             : // Note that the returned bounds are invalid when requesting KeyTypePoint but
     314             : // HasPointKeys is false, or when requesting KeyTypeRange and HasRangeKeys is
     315             : // false.
     316           1 : func (m *FileMetadata) UserKeyBoundsByType(keyType KeyType) base.UserKeyBounds {
     317           1 :         switch keyType {
     318           1 :         case KeyTypePoint:
     319           1 :                 return base.UserKeyBoundsFromInternal(m.SmallestPointKey, m.LargestPointKey)
     320           1 :         case KeyTypeRange:
     321           1 :                 return base.UserKeyBoundsFromInternal(m.SmallestRangeKey, m.LargestRangeKey)
     322           0 :         default:
     323           0 :                 return base.UserKeyBoundsFromInternal(m.Smallest, m.Largest)
     324             :         }
     325             : }
     326             : 
     327             : // SyntheticSeqNum returns a SyntheticSeqNum which is set when SmallestSeqNum
     328             : // equals LargestSeqNum.
     329           1 : func (m *FileMetadata) SyntheticSeqNum() sstable.SyntheticSeqNum {
     330           1 :         if m.SmallestSeqNum == m.LargestSeqNum {
     331           1 :                 return sstable.SyntheticSeqNum(m.SmallestSeqNum)
     332           1 :         }
     333           1 :         return sstable.NoSyntheticSeqNum
     334             : }
     335             : 
     336             : // IterTransforms returns an sstable.IterTransforms populated according to the
     337             : // file.
     338           1 : func (m *FileMetadata) IterTransforms() sstable.IterTransforms {
     339           1 :         return sstable.IterTransforms{
     340           1 :                 SyntheticSeqNum:          m.SyntheticSeqNum(),
     341           1 :                 SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix,
     342           1 :         }
     343           1 : }
     344             : 
     345             : // FragmentIterTransforms returns an sstable.FragmentIterTransforms populated
     346             : // according to the file.
     347           1 : func (m *FileMetadata) FragmentIterTransforms() sstable.FragmentIterTransforms {
     348           1 :         return sstable.FragmentIterTransforms{
     349           1 :                 SyntheticSeqNum:          m.SyntheticSeqNum(),
     350           1 :                 SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix,
     351           1 :         }
     352           1 : }
     353             : 
     354             : // PhysicalFileMeta is used by functions which want a guarantee that their input
     355             : // belongs to a physical sst and not a virtual sst.
     356             : //
     357             : // NB: This type should only be constructed by calling
     358             : // FileMetadata.PhysicalMeta.
     359             : type PhysicalFileMeta struct {
     360             :         *FileMetadata
     361             : }
     362             : 
     363             : // VirtualFileMeta is used by functions which want a guarantee that their input
     364             : // belongs to a virtual sst and not a physical sst.
     365             : //
     366             : // A VirtualFileMeta inherits all the same fields as a FileMetadata. These
     367             : // fields have additional invariants imposed on them, and/or slightly varying
     368             : // meanings:
     369             : //   - Smallest and Largest (and their counterparts
     370             : //     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
     371             : //     key at that exact bound. We make the effort to determine the next smallest
     372             : //     or largest key in an sstable after virtualizing it, to maintain this
     373             : //     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
     374             : //     could mean that a rangedel or range key ends at that user key, or has been
     375             : //     truncated to that user key.
     376             : //   - One invariant is that if a rangedel or range key is truncated on its
     377             : //     upper bound, the virtual sstable *must* have a rangedel or range key
     378             : //     sentinel key as its upper bound. This is because truncation yields
     379             : //     an exclusive upper bound for the rangedel/rangekey, and if there are
     380             : //     any points at that exclusive upper bound within the same virtual
     381             : //     sstable, those could get uncovered by this truncation. We enforce this
     382             : //     invariant in calls to keyspan.Truncate.
     383             : //   - Size is an estimate of the size of the virtualized portion of this sstable.
     384             : //     The underlying file's size is stored in FileBacking.Size, though it could
     385             : //     also be estimated or could correspond to just the referenced portion of
     386             : //     a file (eg. if the file originated on another node).
     387             : //   - Size must be > 0.
     388             : //   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
     389             : //     This means that all keys in the virtual sstable must have seqnums within
     390             : //     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
     391             : //     a key with a seqnum at either of the bounds. Calculating tight seqnum
     392             : //     bounds would be too expensive and deliver little value.
     393             : //
     394             : // NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
     395             : type VirtualFileMeta struct {
     396             :         *FileMetadata
     397             : }
     398             : 
     399             : // VirtualReaderParams fills in the parameters necessary to create a virtual
     400             : // sstable reader.
     401           1 : func (m VirtualFileMeta) VirtualReaderParams(isShared bool) sstable.VirtualReaderParams {
     402           1 :         return sstable.VirtualReaderParams{
     403           1 :                 Lower:            m.Smallest,
     404           1 :                 Upper:            m.Largest,
     405           1 :                 FileNum:          m.FileNum,
     406           1 :                 IsSharedIngested: isShared && m.SyntheticSeqNum() != 0,
     407           1 :                 Size:             m.Size,
     408           1 :                 BackingSize:      m.FileBacking.Size,
     409           1 :         }
     410           1 : }
     411             : 
     412             : // PhysicalMeta should be the only source of creating the PhysicalFileMeta
     413             : // wrapper type.
     414           1 : func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
     415           1 :         if m.Virtual {
     416           0 :                 panic("pebble: file metadata does not belong to a physical sstable")
     417             :         }
     418           1 :         return PhysicalFileMeta{
     419           1 :                 m,
     420           1 :         }
     421             : }
     422             : 
     423             : // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
     424             : // type.
     425           1 : func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
     426           1 :         if !m.Virtual {
     427           0 :                 panic("pebble: file metadata does not belong to a virtual sstable")
     428             :         }
     429           1 :         return VirtualFileMeta{
     430           1 :                 m,
     431           1 :         }
     432             : }
     433             : 
     434             : // FileBacking either backs a single physical sstable, or one or more virtual
     435             : // sstables.
     436             : //
     437             : // See the comment above the FileMetadata type for sstable terminology.
     438             : type FileBacking struct {
     439             :         DiskFileNum base.DiskFileNum
     440             :         Size        uint64
     441             : 
     442             :         // Reference count for the backing file, used to determine when a backing file
     443             :         // is obsolete and can be removed.
     444             :         //
     445             :         // The reference count is at least the number of distinct tables that use this
     446             :         // backing across all versions that have a non-zero reference count. The tables
     447             :         // in each version are maintained in a copy-on-write B-tree and each B-tree node
     448             :         // keeps a reference on the respective backings.
     449             :         //
     450             :         // In addition, a reference count is taken for every backing in the latest
     451             :         // version's VirtualBackings (necessary to support Protect/Unprotect).
     452             :         refs atomic.Int32
     453             : }
     454             : 
     455             : // MustHaveRefs asserts that the backing has a positive refcount.
     456           1 : func (b *FileBacking) MustHaveRefs() {
     457           1 :         if refs := b.refs.Load(); refs <= 0 {
     458           0 :                 panic(errors.AssertionFailedf("backing %s must have positive refcount (refs=%d)",
     459           0 :                         b.DiskFileNum, refs))
     460             :         }
     461             : }
     462             : 
     463             : // Ref increments the backing's ref count.
     464           1 : func (b *FileBacking) Ref() {
     465           1 :         b.refs.Add(1)
     466           1 : }
     467             : 
     468             : // IsUnused returns if the backing is not being used by any tables in a version
     469             : // or btree.
     470           0 : func (b *FileBacking) IsUnused() bool {
     471           0 :         return b.refs.Load() == 0
     472           0 : }
     473             : 
     474             : // Unref decrements the backing's ref count (and returns the new count).
     475           1 : func (b *FileBacking) Unref() int32 {
     476           1 :         v := b.refs.Add(-1)
     477           1 :         if invariants.Enabled && v < 0 {
     478           0 :                 panic("pebble: invalid FileMetadata refcounting")
     479             :         }
     480           1 :         return v
     481             : }
     482             : 
     483             : // InitPhysicalBacking allocates and sets the FileBacking which is required by a
     484             : // physical sstable FileMetadata.
     485             : //
     486             : // Ensure that the state required by FileBacking, such as the FileNum, is
     487             : // already set on the FileMetadata before InitPhysicalBacking is called.
     488             : // Calling InitPhysicalBacking only after the relevant state has been set in the
     489             : // FileMetadata is not necessary in tests which don't rely on FileBacking.
     490           1 : func (m *FileMetadata) InitPhysicalBacking() {
     491           1 :         if m.Virtual {
     492           0 :                 panic("pebble: virtual sstables should use a pre-existing FileBacking")
     493             :         }
     494           1 :         if m.FileBacking == nil {
     495           1 :                 m.FileBacking = &FileBacking{
     496           1 :                         DiskFileNum: base.PhysicalTableDiskFileNum(m.FileNum),
     497           1 :                         Size:        m.Size,
     498           1 :                 }
     499           1 :         }
     500             : }
     501             : 
     502             : // InitProviderBacking creates a new FileBacking for a file backed by
     503             : // an objstorage.Provider.
     504           1 : func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum, size uint64) {
     505           1 :         if !m.Virtual {
     506           0 :                 panic("pebble: provider-backed sstables must be virtual")
     507             :         }
     508           1 :         if m.FileBacking == nil {
     509           1 :                 m.FileBacking = &FileBacking{DiskFileNum: fileNum}
     510           1 :         }
     511           1 :         m.FileBacking.Size = size
     512             : }
     513             : 
     514             : // ValidateVirtual should be called once the FileMetadata for a virtual sstable
     515             : // is created to verify that the fields of the virtual sstable are sound.
     516           1 : func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
     517           1 :         switch {
     518           0 :         case !m.Virtual:
     519           0 :                 panic("pebble: invalid virtual sstable")
     520           0 :         case createdFrom.SmallestSeqNum != m.SmallestSeqNum:
     521           0 :                 panic("pebble: invalid smallest sequence number for virtual sstable")
     522           0 :         case createdFrom.LargestSeqNum != m.LargestSeqNum:
     523           0 :                 panic("pebble: invalid largest sequence number for virtual sstable")
     524           0 :         case createdFrom.LargestSeqNumAbsolute != m.LargestSeqNumAbsolute:
     525           0 :                 panic("pebble: invalid largest absolute sequence number for virtual sstable")
     526           0 :         case createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking:
     527           0 :                 panic("pebble: invalid physical sstable state for virtual sstable")
     528           0 :         case m.Size == 0:
     529           0 :                 panic("pebble: virtual sstable size must be set upon creation")
     530             :         }
     531             : }
     532             : 
     533             : // SetCompactionState transitions this file's compaction state to the given
     534             : // state. Protected by DB.mu.
     535           1 : func (m *FileMetadata) SetCompactionState(to CompactionState) {
     536           1 :         if invariants.Enabled {
     537           1 :                 transitionErr := func() error {
     538           0 :                         return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
     539           0 :                 }
     540           1 :                 switch m.CompactionState {
     541           1 :                 case CompactionStateNotCompacting:
     542           1 :                         if to != CompactionStateCompacting {
     543           0 :                                 panic(transitionErr())
     544             :                         }
     545           1 :                 case CompactionStateCompacting:
     546           1 :                         if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
     547           0 :                                 panic(transitionErr())
     548             :                         }
     549           0 :                 case CompactionStateCompacted:
     550           0 :                         panic(transitionErr())
     551           0 :                 default:
     552           0 :                         panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
     553             :                 }
     554             :         }
     555           1 :         m.CompactionState = to
     556             : }
     557             : 
     558             : // IsCompacting returns true if this file's compaction state is
     559             : // CompactionStateCompacting. Protected by DB.mu.
     560           1 : func (m *FileMetadata) IsCompacting() bool {
     561           1 :         return m.CompactionState == CompactionStateCompacting
     562           1 : }
     563             : 
     564             : // StatsValid returns true if the table stats have been populated. If StatValid
     565             : // returns true, the Stats field may be read (with or without holding the
     566             : // database mutex).
     567           1 : func (m *FileMetadata) StatsValid() bool {
     568           1 :         return m.statsValid.Load()
     569           1 : }
     570             : 
     571             : // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
     572             : // while populating TableStats and calling StatsMarkValud. Once stats are
     573             : // populated, they must not be mutated.
     574           1 : func (m *FileMetadata) StatsMarkValid() {
     575           1 :         m.statsValid.Store(true)
     576           1 : }
     577             : 
     578             : // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
     579             : // and overall table bounds with the given smallest and largest keys. The
     580             : // smallest and largest bounds may not be extended if the table already has a
     581             : // bound that is smaller or larger, respectively. The receiver is returned.
     582             : // NB: calling this method should be preferred to manually setting the bounds by
     583             : // manipulating the fields directly, to maintain certain invariants.
     584             : func (m *FileMetadata) ExtendPointKeyBounds(
     585             :         cmp Compare, smallest, largest InternalKey,
     586           1 : ) *FileMetadata {
     587           1 :         // Update the point key bounds.
     588           1 :         if !m.HasPointKeys {
     589           1 :                 m.SmallestPointKey, m.LargestPointKey = smallest, largest
     590           1 :                 m.HasPointKeys = true
     591           1 :         } else {
     592           1 :                 if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
     593           1 :                         m.SmallestPointKey = smallest
     594           1 :                 }
     595           1 :                 if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
     596           1 :                         m.LargestPointKey = largest
     597           1 :                 }
     598             :         }
     599             :         // Update the overall bounds.
     600           1 :         m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
     601           1 :         return m
     602             : }
     603             : 
     604             : // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
     605             : // and overall table bounds with the given smallest and largest keys. The
     606             : // smallest and largest bounds may not be extended if the table already has a
     607             : // bound that is smaller or larger, respectively. The receiver is returned.
     608             : // NB: calling this method should be preferred to manually setting the bounds by
     609             : // manipulating the fields directly, to maintain certain invariants.
     610             : func (m *FileMetadata) ExtendRangeKeyBounds(
     611             :         cmp Compare, smallest, largest InternalKey,
     612           1 : ) *FileMetadata {
     613           1 :         // Update the range key bounds.
     614           1 :         if !m.HasRangeKeys {
     615           1 :                 m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
     616           1 :                 m.HasRangeKeys = true
     617           1 :         } else {
     618           1 :                 if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
     619           1 :                         m.SmallestRangeKey = smallest
     620           1 :                 }
     621           1 :                 if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
     622           1 :                         m.LargestRangeKey = largest
     623           1 :                 }
     624             :         }
     625             :         // Update the overall bounds.
     626           1 :         m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
     627           1 :         return m
     628             : }
     629             : 
     630             : // extendOverallBounds attempts to extend the overall table lower and upper
     631             : // bounds. The given bounds may not be used if a lower or upper bound already
     632             : // exists that is smaller or larger than the given keys, respectively. The given
     633             : // boundType will be used if the bounds are updated.
     634             : func (m *FileMetadata) extendOverallBounds(
     635             :         cmp Compare, smallest, largest InternalKey, bTyp boundType,
     636           1 : ) {
     637           1 :         if !m.boundsSet {
     638           1 :                 m.Smallest, m.Largest = smallest, largest
     639           1 :                 m.boundsSet = true
     640           1 :                 m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
     641           1 :         } else {
     642           1 :                 if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
     643           1 :                         m.Smallest = smallest
     644           1 :                         m.boundTypeSmallest = bTyp
     645           1 :                 }
     646           1 :                 if base.InternalCompare(cmp, largest, m.Largest) > 0 {
     647           1 :                         m.Largest = largest
     648           1 :                         m.boundTypeLargest = bTyp
     649           1 :                 }
     650             :         }
     651             : }
     652             : 
     653             : // Overlaps returns true if the file key range overlaps with the given user key bounds.
     654           1 : func (m *FileMetadata) Overlaps(cmp Compare, bounds *base.UserKeyBounds) bool {
     655           1 :         b := m.UserKeyBounds()
     656           1 :         return b.Overlaps(cmp, bounds)
     657           1 : }
     658             : 
     659             : // ContainedWithinSpan returns true if the file key range completely overlaps with the
     660             : // given range ("end" is assumed to exclusive).
     661           1 : func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
     662           1 :         lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
     663           1 :         return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
     664           1 : }
     665             : 
     666             : // ContainsKeyType returns whether or not the file contains keys of the provided
     667             : // type.
     668           1 : func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
     669           1 :         switch kt {
     670           1 :         case KeyTypePointAndRange:
     671           1 :                 return true
     672           1 :         case KeyTypePoint:
     673           1 :                 return m.HasPointKeys
     674           1 :         case KeyTypeRange:
     675           1 :                 return m.HasRangeKeys
     676           0 :         default:
     677           0 :                 panic("unrecognized key type")
     678             :         }
     679             : }
     680             : 
     681             : // SmallestBound returns the file's smallest bound of the key type. It returns a
     682             : // false second return value if the file does not contain any keys of the key
     683             : // type.
     684           1 : func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
     685           1 :         switch kt {
     686           0 :         case KeyTypePointAndRange:
     687           0 :                 return &m.Smallest, true
     688           1 :         case KeyTypePoint:
     689           1 :                 return &m.SmallestPointKey, m.HasPointKeys
     690           1 :         case KeyTypeRange:
     691           1 :                 return &m.SmallestRangeKey, m.HasRangeKeys
     692           0 :         default:
     693           0 :                 panic("unrecognized key type")
     694             :         }
     695             : }
     696             : 
     697             : // LargestBound returns the file's largest bound of the key type. It returns a
     698             : // false second return value if the file does not contain any keys of the key
     699             : // type.
     700           1 : func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
     701           1 :         switch kt {
     702           0 :         case KeyTypePointAndRange:
     703           0 :                 return &m.Largest, true
     704           1 :         case KeyTypePoint:
     705           1 :                 return &m.LargestPointKey, m.HasPointKeys
     706           1 :         case KeyTypeRange:
     707           1 :                 return &m.LargestRangeKey, m.HasRangeKeys
     708           0 :         default:
     709           0 :                 panic("unrecognized key type")
     710             :         }
     711             : }
     712             : 
     713             : const (
     714             :         maskContainsPointKeys = 1 << 0
     715             :         maskSmallest          = 1 << 1
     716             :         maskLargest           = 1 << 2
     717             : )
     718             : 
     719             : // boundsMarker returns a marker byte whose bits encode the following
     720             : // information (in order from least significant bit):
     721             : // - if the table contains point keys
     722             : // - if the table's smallest key is a point key
     723             : // - if the table's largest key is a point key
     724           1 : func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
     725           1 :         if m.HasPointKeys {
     726           1 :                 sentinel |= maskContainsPointKeys
     727           1 :         }
     728           1 :         switch m.boundTypeSmallest {
     729           1 :         case boundTypePointKey:
     730           1 :                 sentinel |= maskSmallest
     731           1 :         case boundTypeRangeKey:
     732             :                 // No op - leave bit unset.
     733           0 :         default:
     734           0 :                 return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
     735             :         }
     736           1 :         switch m.boundTypeLargest {
     737           1 :         case boundTypePointKey:
     738           1 :                 sentinel |= maskLargest
     739           1 :         case boundTypeRangeKey:
     740             :                 // No op - leave bit unset.
     741           0 :         default:
     742           0 :                 return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
     743             :         }
     744           1 :         return
     745             : }
     746             : 
     747             : // String implements fmt.Stringer, printing the file number and the overall
     748             : // table bounds.
     749           1 : func (m *FileMetadata) String() string {
     750           1 :         return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
     751           1 : }
     752             : 
     753             : // DebugString returns a verbose representation of FileMetadata, typically for
     754             : // use in tests and debugging, returning the file number and the point, range
     755             : // and overall bounds for the table.
     756           1 : func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
     757           1 :         var b bytes.Buffer
     758           1 :         if m.Virtual {
     759           1 :                 fmt.Fprintf(&b, "%s(%s):[%s-%s]",
     760           1 :                         m.FileNum, m.FileBacking.DiskFileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
     761           1 :         } else {
     762           1 :                 fmt.Fprintf(&b, "%s:[%s-%s]",
     763           1 :                         m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
     764           1 :         }
     765           1 :         if !verbose {
     766           1 :                 return b.String()
     767           1 :         }
     768           1 :         fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
     769           1 :         if m.HasPointKeys {
     770           1 :                 fmt.Fprintf(&b, " points:[%s-%s]",
     771           1 :                         m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
     772           1 :         }
     773           1 :         if m.HasRangeKeys {
     774           1 :                 fmt.Fprintf(&b, " ranges:[%s-%s]",
     775           1 :                         m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
     776           1 :         }
     777           1 :         if m.Size != 0 {
     778           1 :                 fmt.Fprintf(&b, " size:%d", m.Size)
     779           1 :         }
     780           1 :         return b.String()
     781             : }
     782             : 
     783             : // ParseFileMetadataDebug parses a FileMetadata from its DebugString
     784             : // representation.
     785           1 : func ParseFileMetadataDebug(s string) (_ *FileMetadata, err error) {
     786           1 :         defer func() {
     787           1 :                 if r := recover(); r != nil {
     788           0 :                         err = errors.CombineErrors(err, errFromPanic(r))
     789           0 :                 }
     790             :         }()
     791             : 
     792             :         // Input format:
     793             :         //      000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...] size:5
     794           1 :         m := &FileMetadata{}
     795           1 :         p := makeDebugParser(s)
     796           1 :         m.FileNum = p.FileNum()
     797           1 :         var backingNum base.DiskFileNum
     798           1 :         if p.Peek() == "(" {
     799           1 :                 p.Expect("(")
     800           1 :                 backingNum = p.DiskFileNum()
     801           1 :                 p.Expect(")")
     802           1 :         }
     803           1 :         p.Expect(":", "[")
     804           1 :         m.Smallest = p.InternalKey()
     805           1 :         p.Expect("-")
     806           1 :         m.Largest = p.InternalKey()
     807           1 :         p.Expect("]")
     808           1 : 
     809           1 :         for !p.Done() {
     810           1 :                 field := p.Next()
     811           1 :                 p.Expect(":")
     812           1 :                 switch field {
     813           1 :                 case "seqnums":
     814           1 :                         p.Expect("[")
     815           1 :                         m.SmallestSeqNum = p.SeqNum()
     816           1 :                         p.Expect("-")
     817           1 :                         m.LargestSeqNum = p.SeqNum()
     818           1 :                         p.Expect("]")
     819           1 :                         m.LargestSeqNumAbsolute = m.LargestSeqNum
     820             : 
     821           1 :                 case "points":
     822           1 :                         p.Expect("[")
     823           1 :                         m.SmallestPointKey = p.InternalKey()
     824           1 :                         p.Expect("-")
     825           1 :                         m.LargestPointKey = p.InternalKey()
     826           1 :                         m.HasPointKeys = true
     827           1 :                         p.Expect("]")
     828             : 
     829           1 :                 case "ranges":
     830           1 :                         p.Expect("[")
     831           1 :                         m.SmallestRangeKey = p.InternalKey()
     832           1 :                         p.Expect("-")
     833           1 :                         m.LargestRangeKey = p.InternalKey()
     834           1 :                         m.HasRangeKeys = true
     835           1 :                         p.Expect("]")
     836             : 
     837           1 :                 case "size":
     838           1 :                         m.Size = p.Uint64()
     839             : 
     840           0 :                 default:
     841           0 :                         p.Errf("unknown field %q", field)
     842             :                 }
     843             :         }
     844             : 
     845             :         // By default, when the parser sees just the overall bounds, we set the point
     846             :         // keys. This preserves backwards compatability with existing test cases that
     847             :         // specify only the overall bounds.
     848           1 :         if !m.HasPointKeys && !m.HasRangeKeys {
     849           1 :                 m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
     850           1 :                 m.HasPointKeys = true
     851           1 :         }
     852           1 :         if backingNum == 0 {
     853           1 :                 m.InitPhysicalBacking()
     854           1 :         } else {
     855           1 :                 m.Virtual = true
     856           1 :                 m.InitProviderBacking(backingNum, 0 /* size */)
     857           1 :         }
     858           1 :         return m, nil
     859             : }
     860             : 
     861             : // Validate validates the metadata for consistency with itself, returning an
     862             : // error if inconsistent.
     863           1 : func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
     864           1 :         // Combined range and point key validation.
     865           1 : 
     866           1 :         if !m.HasPointKeys && !m.HasRangeKeys {
     867           0 :                 return base.CorruptionErrorf("file %s has neither point nor range keys",
     868           0 :                         errors.Safe(m.FileNum))
     869           0 :         }
     870           1 :         if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
     871           1 :                 return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
     872           1 :                         errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
     873           1 :                         m.Largest.Pretty(formatKey))
     874           1 :         }
     875           1 :         if m.SmallestSeqNum > m.LargestSeqNum {
     876           0 :                 return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
     877           0 :                         errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
     878           0 :         }
     879           1 :         if m.LargestSeqNumAbsolute < m.LargestSeqNum {
     880           0 :                 return base.CorruptionErrorf("file %s has inconsistent absolute largest seqnum bounds: %d vs %d",
     881           0 :                         errors.Safe(m.FileNum), m.LargestSeqNumAbsolute, m.LargestSeqNum)
     882           0 :         }
     883             : 
     884             :         // Point key validation.
     885             : 
     886           1 :         if m.HasPointKeys {
     887           1 :                 if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
     888           0 :                         return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
     889           0 :                                 errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
     890           0 :                                 m.LargestPointKey.Pretty(formatKey))
     891           0 :                 }
     892           1 :                 if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
     893           1 :                         base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
     894           0 :                         return base.CorruptionErrorf(
     895           0 :                                 "file %s has inconsistent point key bounds relative to overall bounds: "+
     896           0 :                                         "overall = [%s-%s], point keys = [%s-%s]",
     897           0 :                                 errors.Safe(m.FileNum),
     898           0 :                                 m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
     899           0 :                                 m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
     900           0 :                         )
     901           0 :                 }
     902           1 :                 if !isValidPointBoundKeyKind[m.SmallestPointKey.Kind()] {
     903           0 :                         return base.CorruptionErrorf("file %s has invalid smallest point key kind", m)
     904           0 :                 }
     905           1 :                 if !isValidPointBoundKeyKind[m.LargestPointKey.Kind()] {
     906           0 :                         return base.CorruptionErrorf("file %s has invalid largest point key kind", m)
     907           0 :                 }
     908             :         }
     909             : 
     910             :         // Range key validation.
     911             : 
     912           1 :         if m.HasRangeKeys {
     913           1 :                 if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
     914           0 :                         return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
     915           0 :                                 errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
     916           0 :                                 m.LargestRangeKey.Pretty(formatKey))
     917           0 :                 }
     918           1 :                 if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
     919           1 :                         base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
     920           0 :                         return base.CorruptionErrorf(
     921           0 :                                 "file %s has inconsistent range key bounds relative to overall bounds: "+
     922           0 :                                         "overall = [%s-%s], range keys = [%s-%s]",
     923           0 :                                 errors.Safe(m.FileNum),
     924           0 :                                 m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
     925           0 :                                 m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
     926           0 :                         )
     927           0 :                 }
     928           1 :                 if !isValidRangeKeyBoundKeyKind[m.SmallestRangeKey.Kind()] {
     929           0 :                         return base.CorruptionErrorf("file %s has invalid smallest range key kind", m)
     930           0 :                 }
     931           1 :                 if !isValidRangeKeyBoundKeyKind[m.LargestRangeKey.Kind()] {
     932           0 :                         return base.CorruptionErrorf("file %s has invalid largest range key kind", m)
     933           0 :                 }
     934             :         }
     935             : 
     936             :         // Ensure that FileMetadata.Init was called.
     937           1 :         if m.FileBacking == nil {
     938           0 :                 return base.CorruptionErrorf("file metadata FileBacking not set")
     939           0 :         }
     940             : 
     941           1 :         if m.SyntheticPrefixAndSuffix.HasPrefix() {
     942           1 :                 if !m.Virtual {
     943           0 :                         return base.CorruptionErrorf("non-virtual file with synthetic prefix")
     944           0 :                 }
     945           1 :                 if !bytes.HasPrefix(m.Smallest.UserKey, m.SyntheticPrefixAndSuffix.Prefix()) {
     946           0 :                         return base.CorruptionErrorf("virtual file with synthetic prefix has smallest key with a different prefix: %s", m.Smallest.Pretty(formatKey))
     947           0 :                 }
     948           1 :                 if !bytes.HasPrefix(m.Largest.UserKey, m.SyntheticPrefixAndSuffix.Prefix()) {
     949           0 :                         return base.CorruptionErrorf("virtual file with synthetic prefix has largest key with a different prefix: %s", m.Largest.Pretty(formatKey))
     950           0 :                 }
     951             :         }
     952             : 
     953           1 :         if m.SyntheticPrefixAndSuffix.HasSuffix() {
     954           1 :                 if !m.Virtual {
     955           0 :                         return base.CorruptionErrorf("non-virtual file with synthetic suffix")
     956           0 :                 }
     957             :         }
     958             : 
     959           1 :         return nil
     960             : }
     961             : 
     962             : var (
     963             :         isValidPointBoundKeyKind = [base.InternalKeyKindMax + 1]bool{
     964             :                 base.InternalKeyKindDelete:        true,
     965             :                 base.InternalKeyKindSet:           true,
     966             :                 base.InternalKeyKindMerge:         true,
     967             :                 base.InternalKeyKindSingleDelete:  true,
     968             :                 base.InternalKeyKindRangeDelete:   true,
     969             :                 base.InternalKeyKindSetWithDelete: true,
     970             :                 base.InternalKeyKindDeleteSized:   true,
     971             :         }
     972             :         isValidRangeKeyBoundKeyKind = [base.InternalKeyKindMax + 1]bool{
     973             :                 base.InternalKeyKindRangeKeySet:    true,
     974             :                 base.InternalKeyKindRangeKeyUnset:  true,
     975             :                 base.InternalKeyKindRangeKeyDelete: true,
     976             :         }
     977             : )
     978             : 
     979             : // TableInfo returns a subset of the FileMetadata state formatted as a
     980             : // TableInfo.
     981           1 : func (m *FileMetadata) TableInfo() TableInfo {
     982           1 :         return TableInfo{
     983           1 :                 FileNum:        m.FileNum,
     984           1 :                 Size:           m.Size,
     985           1 :                 Smallest:       m.Smallest,
     986           1 :                 Largest:        m.Largest,
     987           1 :                 SmallestSeqNum: m.SmallestSeqNum,
     988           1 :                 LargestSeqNum:  m.LargestSeqNum,
     989           1 :         }
     990           1 : }
     991             : 
     992           1 : func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
     993           1 :         // NB: This is the same ordering that RocksDB uses for L0 files.
     994           1 : 
     995           1 :         // Sort first by largest sequence number.
     996           1 :         if v := stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum); v != 0 {
     997           1 :                 return v
     998           1 :         }
     999             :         // Then by smallest sequence number.
    1000           1 :         if v := stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum); v != 0 {
    1001           1 :                 return v
    1002           1 :         }
    1003             :         // Break ties by file number.
    1004           1 :         return stdcmp.Compare(m.FileNum, b.FileNum)
    1005             : }
    1006             : 
    1007           1 : func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
    1008           1 :         return m.cmpSeqNum(b) < 0
    1009           1 : }
    1010             : 
    1011           1 : func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
    1012           1 :         return base.InternalCompare(cmp, m.Smallest, b.Smallest)
    1013           1 : }
    1014             : 
    1015             : // KeyRange returns the minimum smallest and maximum largest internalKey for
    1016             : // all the FileMetadata in iters.
    1017           1 : func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
    1018           1 :         first := true
    1019           1 :         for _, iter := range iters {
    1020           1 :                 for meta := iter.First(); meta != nil; meta = iter.Next() {
    1021           1 :                         if first {
    1022           1 :                                 first = false
    1023           1 :                                 smallest, largest = meta.Smallest, meta.Largest
    1024           1 :                                 continue
    1025             :                         }
    1026           1 :                         if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
    1027           1 :                                 smallest = meta.Smallest
    1028           1 :                         }
    1029           1 :                         if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
    1030           1 :                                 largest = meta.Largest
    1031           1 :                         }
    1032             :                 }
    1033             :         }
    1034           1 :         return smallest, largest
    1035             : }
    1036             : 
    1037             : type bySeqNum []*FileMetadata
    1038             : 
    1039           1 : func (b bySeqNum) Len() int { return len(b) }
    1040           1 : func (b bySeqNum) Less(i, j int) bool {
    1041           1 :         return b[i].lessSeqNum(b[j])
    1042           1 : }
    1043           1 : func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
    1044             : 
    1045             : // SortBySeqNum sorts the specified files by increasing sequence number.
    1046           1 : func SortBySeqNum(files []*FileMetadata) {
    1047           1 :         sort.Sort(bySeqNum(files))
    1048           1 : }
    1049             : 
    1050             : type bySmallest struct {
    1051             :         files []*FileMetadata
    1052             :         cmp   Compare
    1053             : }
    1054             : 
    1055           1 : func (b bySmallest) Len() int { return len(b.files) }
    1056           1 : func (b bySmallest) Less(i, j int) bool {
    1057           1 :         return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
    1058           1 : }
    1059           0 : func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }
    1060             : 
    1061             : // SortBySmallest sorts the specified files by smallest key using the supplied
    1062             : // comparison function to order user keys.
    1063           1 : func SortBySmallest(files []*FileMetadata, cmp Compare) {
    1064           1 :         sort.Sort(bySmallest{files, cmp})
    1065           1 : }
    1066             : 
    1067             : // NumLevels is the number of levels a Version contains.
    1068             : const NumLevels = 7
    1069             : 
    1070             : // NewVersion constructs a new Version with the provided files. It requires
    1071             : // the provided files are already well-ordered. It's intended for testing.
    1072             : func NewVersion(
    1073             :         comparer *base.Comparer, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
    1074           1 : ) *Version {
    1075           1 :         v := &Version{
    1076           1 :                 cmp: comparer,
    1077           1 :         }
    1078           1 :         for l := range files {
    1079           1 :                 // NB: We specifically insert `files` into the B-Tree in the order
    1080           1 :                 // they appear within `files`. Some tests depend on this behavior in
    1081           1 :                 // order to test consistency checking, etc. Once we've constructed the
    1082           1 :                 // initial B-Tree, we swap out the btreeCmp for the correct one.
    1083           1 :                 // TODO(jackson): Adjust or remove the tests and remove this.
    1084           1 :                 v.Levels[l].tree, _ = makeBTree(comparer.Compare, btreeCmpSpecificOrder(files[l]), files[l])
    1085           1 :                 v.Levels[l].level = l
    1086           1 :                 if l == 0 {
    1087           1 :                         v.Levels[l].tree.bcmp = btreeCmpSeqNum
    1088           1 :                 } else {
    1089           1 :                         v.Levels[l].tree.bcmp = btreeCmpSmallestKey(comparer.Compare)
    1090           1 :                 }
    1091           1 :                 for _, f := range files[l] {
    1092           1 :                         v.Levels[l].totalSize += f.Size
    1093           1 :                 }
    1094             :         }
    1095           1 :         if err := v.InitL0Sublevels(flushSplitBytes); err != nil {
    1096           0 :                 panic(err)
    1097             :         }
    1098           1 :         return v
    1099             : }
    1100             : 
    1101             : // TestingNewVersion returns a blank Version, used for tests.
    1102           1 : func TestingNewVersion(comparer *base.Comparer) *Version {
    1103           1 :         return &Version{
    1104           1 :                 cmp: comparer,
    1105           1 :         }
    1106           1 : }
    1107             : 
    1108             : // Version is a collection of file metadata for on-disk tables at various
    1109             : // levels. In-memory DBs are written to level-0 tables, and compactions
    1110             : // migrate data from level N to level N+1. The tables map internal keys (which
    1111             : // are a user key, a delete or set bit, and a sequence number) to user values.
    1112             : //
    1113             : // The tables at level 0 are sorted by largest sequence number. Due to file
    1114             : // ingestion, there may be overlap in the ranges of sequence numbers contain in
    1115             : // level 0 sstables. In particular, it is valid for one level 0 sstable to have
    1116             : // the seqnum range [1,100] while an adjacent sstable has the seqnum range
    1117             : // [50,50]. This occurs when the [50,50] table was ingested and given a global
    1118             : // seqnum. The ingestion code will have ensured that the [50,50] sstable will
    1119             : // not have any keys that overlap with the [1,100] in the seqnum range
    1120             : // [1,49]. The range of internal keys [fileMetadata.smallest,
    1121             : // fileMetadata.largest] in each level 0 table may overlap.
    1122             : //
    1123             : // The tables at any non-0 level are sorted by their internal key range and any
    1124             : // two tables at the same non-0 level do not overlap.
    1125             : //
    1126             : // The internal key ranges of two tables at different levels X and Y may
    1127             : // overlap, for any X != Y.
    1128             : //
    1129             : // Finally, for every internal key in a table at level X, there is no internal
    1130             : // key in a higher level table that has both the same user key and a higher
    1131             : // sequence number.
    1132             : type Version struct {
    1133             :         refs atomic.Int32
    1134             : 
    1135             :         // The level 0 sstables are organized in a series of sublevels. Similar to
    1136             :         // the seqnum invariant in normal levels, there is no internal key in a
    1137             :         // higher level table that has both the same user key and a higher sequence
    1138             :         // number. Within a sublevel, tables are sorted by their internal key range
    1139             :         // and any two tables at the same sublevel do not overlap. Unlike the normal
    1140             :         // levels, sublevel n contains older tables (lower sequence numbers) than
    1141             :         // sublevel n+1.
    1142             :         //
    1143             :         // The L0Sublevels struct is mostly used for compaction picking. As most
    1144             :         // internal data structures in it are only necessary for compaction picking
    1145             :         // and not for iterator creation, the reference to L0Sublevels is nil'd
    1146             :         // after this version becomes the non-newest version, to reduce memory
    1147             :         // usage.
    1148             :         //
    1149             :         // L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
    1150             :         // in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
    1151             :         // a reference to that slice, as that slice is necessary for iterator
    1152             :         // creation and needs to outlast L0Sublevels.
    1153             :         L0Sublevels     *L0Sublevels
    1154             :         L0SublevelFiles []LevelSlice
    1155             : 
    1156             :         Levels [NumLevels]LevelMetadata
    1157             : 
    1158             :         // RangeKeyLevels holds a subset of the same files as Levels that contain range
    1159             :         // keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
    1160             :         // duplication should be minimal, as range keys are expected to be rare.
    1161             :         RangeKeyLevels [NumLevels]LevelMetadata
    1162             : 
    1163             :         // The callback to invoke when the last reference to a version is
    1164             :         // removed. Will be called with list.mu held.
    1165             :         Deleted func(obsolete []*FileBacking)
    1166             : 
    1167             :         // Stats holds aggregated stats about the version maintained from
    1168             :         // version to version.
    1169             :         Stats struct {
    1170             :                 // MarkedForCompaction records the count of files marked for
    1171             :                 // compaction within the version.
    1172             :                 MarkedForCompaction int
    1173             :         }
    1174             : 
    1175             :         cmp *base.Comparer
    1176             : 
    1177             :         // The list the version is linked into.
    1178             :         list *VersionList
    1179             : 
    1180             :         // The next/prev link for the versionList doubly-linked list of versions.
    1181             :         prev, next *Version
    1182             : }
    1183             : 
    1184             : // String implements fmt.Stringer, printing the FileMetadata for each level in
    1185             : // the Version.
    1186           1 : func (v *Version) String() string {
    1187           1 :         return v.string(false)
    1188           1 : }
    1189             : 
    1190             : // DebugString returns an alternative format to String() which includes sequence
    1191             : // number and kind information for the sstable boundaries.
    1192           1 : func (v *Version) DebugString() string {
    1193           1 :         return v.string(true)
    1194           1 : }
    1195             : 
    1196           1 : func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
    1197           1 :         var buf bytes.Buffer
    1198           1 :         for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
    1199           1 :                 fmt.Fprintf(&buf, "L0.%d:\n", sublevel)
    1200           1 :                 sublevels[sublevel].Each(func(f *FileMetadata) {
    1201           1 :                         fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
    1202           1 :                 })
    1203             :         }
    1204           1 :         return buf.String()
    1205             : }
    1206             : 
    1207           1 : func (v *Version) string(verbose bool) string {
    1208           1 :         var buf bytes.Buffer
    1209           1 :         if len(v.L0SublevelFiles) > 0 {
    1210           1 :                 fmt.Fprintf(&buf, "%s", describeSublevels(v.cmp.FormatKey, verbose, v.L0SublevelFiles))
    1211           1 :         }
    1212           1 :         for level := 1; level < NumLevels; level++ {
    1213           1 :                 if v.Levels[level].Empty() {
    1214           1 :                         continue
    1215             :                 }
    1216           1 :                 fmt.Fprintf(&buf, "L%d:\n", level)
    1217           1 :                 iter := v.Levels[level].Iter()
    1218           1 :                 for f := iter.First(); f != nil; f = iter.Next() {
    1219           1 :                         fmt.Fprintf(&buf, "  %s\n", f.DebugString(v.cmp.FormatKey, verbose))
    1220           1 :                 }
    1221             :         }
    1222           1 :         return buf.String()
    1223             : }
    1224             : 
    1225             : // ParseVersionDebug parses a Version from its DebugString output.
    1226           1 : func ParseVersionDebug(comparer *base.Comparer, flushSplitBytes int64, s string) (*Version, error) {
    1227           1 :         var files [NumLevels][]*FileMetadata
    1228           1 :         level := -1
    1229           1 :         for _, l := range strings.Split(s, "\n") {
    1230           1 :                 if l == "" {
    1231           1 :                         continue
    1232             :                 }
    1233           1 :                 p := makeDebugParser(l)
    1234           1 :                 if l, ok := p.TryLevel(); ok {
    1235           1 :                         level = l
    1236           1 :                         continue
    1237             :                 }
    1238             : 
    1239           1 :                 if level == -1 {
    1240           0 :                         return nil, errors.Errorf("version string must start with a level")
    1241           0 :                 }
    1242           1 :                 m, err := ParseFileMetadataDebug(l)
    1243           1 :                 if err != nil {
    1244           0 :                         return nil, err
    1245           0 :                 }
    1246           1 :                 files[level] = append(files[level], m)
    1247             :         }
    1248             :         // L0 files are printed from higher sublevel to lower, which means in a
    1249             :         // partial order that represents newest to oldest. Reverse the order of L0
    1250             :         // files to ensure we construct the same sublevels.
    1251           1 :         slices.Reverse(files[0])
    1252           1 :         v := NewVersion(comparer, flushSplitBytes, files)
    1253           1 :         if err := v.CheckOrdering(); err != nil {
    1254           1 :                 return nil, err
    1255           1 :         }
    1256           1 :         return v, nil
    1257             : }
    1258             : 
    1259             : // Refs returns the number of references to the version.
    1260           1 : func (v *Version) Refs() int32 {
    1261           1 :         return v.refs.Load()
    1262           1 : }
    1263             : 
    1264             : // Ref increments the version refcount.
    1265           1 : func (v *Version) Ref() {
    1266           1 :         v.refs.Add(1)
    1267           1 : }
    1268             : 
    1269             : // Unref decrements the version refcount. If the last reference to the version
    1270             : // was removed, the version is removed from the list of versions and the
    1271             : // Deleted callback is invoked. Requires that the VersionList mutex is NOT
    1272             : // locked.
    1273           1 : func (v *Version) Unref() {
    1274           1 :         if v.refs.Add(-1) == 0 {
    1275           1 :                 l := v.list
    1276           1 :                 l.mu.Lock()
    1277           1 :                 l.Remove(v)
    1278           1 :                 v.Deleted(v.unrefFiles())
    1279           1 :                 l.mu.Unlock()
    1280           1 :         }
    1281             : }
    1282             : 
    1283             : // UnrefLocked decrements the version refcount. If the last reference to the
    1284             : // version was removed, the version is removed from the list of versions and
    1285             : // the Deleted callback is invoked. Requires that the VersionList mutex is
    1286             : // already locked.
    1287           1 : func (v *Version) UnrefLocked() {
    1288           1 :         if v.refs.Add(-1) == 0 {
    1289           1 :                 v.list.Remove(v)
    1290           1 :                 v.Deleted(v.unrefFiles())
    1291           1 :         }
    1292             : }
    1293             : 
    1294           1 : func (v *Version) unrefFiles() []*FileBacking {
    1295           1 :         var obsolete []*FileBacking
    1296           1 :         for _, lm := range v.Levels {
    1297           1 :                 obsolete = append(obsolete, lm.release()...)
    1298           1 :         }
    1299           1 :         for _, lm := range v.RangeKeyLevels {
    1300           1 :                 obsolete = append(obsolete, lm.release()...)
    1301           1 :         }
    1302           1 :         return obsolete
    1303             : }
    1304             : 
    1305             : // Next returns the next version in the list of versions.
    1306           1 : func (v *Version) Next() *Version {
    1307           1 :         return v.next
    1308           1 : }
    1309             : 
    1310             : // InitL0Sublevels initializes the L0Sublevels
    1311           1 : func (v *Version) InitL0Sublevels(flushSplitBytes int64) error {
    1312           1 :         var err error
    1313           1 :         v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], v.cmp.Compare, v.cmp.FormatKey, flushSplitBytes)
    1314           1 :         if err == nil && v.L0Sublevels != nil {
    1315           1 :                 v.L0SublevelFiles = v.L0Sublevels.Levels
    1316           1 :         }
    1317           1 :         return err
    1318             : }
    1319             : 
    1320             : // CalculateInuseKeyRanges examines file metadata in levels [level, maxLevel]
    1321             : // within bounds [smallest,largest], returning an ordered slice of key ranges
    1322             : // that include all keys that exist within levels [level, maxLevel] and within
    1323             : // [smallest,largest].
    1324             : func (v *Version) CalculateInuseKeyRanges(
    1325             :         level, maxLevel int, smallest, largest []byte,
    1326           1 : ) []base.UserKeyBounds {
    1327           1 :         // Use two slices, alternating which one is input and which one is output
    1328           1 :         // as we descend the LSM.
    1329           1 :         var input, output []base.UserKeyBounds
    1330           1 : 
    1331           1 :         // L0 requires special treatment, since sstables within L0 may overlap.
    1332           1 :         // We use the L0 Sublevels structure to efficiently calculate the merged
    1333           1 :         // in-use key ranges.
    1334           1 :         if level == 0 {
    1335           1 :                 output = v.L0Sublevels.InUseKeyRanges(smallest, largest)
    1336           1 :                 level++
    1337           1 :         }
    1338             : 
    1339             :         // NB: We always treat `largest` as inclusive for simplicity, because
    1340             :         // there's little consequence to calculating slightly broader in-use key
    1341             :         // ranges.
    1342           1 :         bounds := base.UserKeyBoundsInclusive(smallest, largest)
    1343           1 :         for ; level <= maxLevel; level++ {
    1344           1 :                 overlaps := v.Overlaps(level, bounds)
    1345           1 :                 iter := overlaps.Iter()
    1346           1 : 
    1347           1 :                 // We may already have in-use key ranges from higher levels. Iterate
    1348           1 :                 // through both our accumulated in-use key ranges and this level's
    1349           1 :                 // files, merging the two.
    1350           1 :                 //
    1351           1 :                 // Tables higher within the LSM have broader key spaces. We use this
    1352           1 :                 // when possible to seek past a level's files that are contained by
    1353           1 :                 // our current accumulated in-use key ranges. This helps avoid
    1354           1 :                 // per-sstable work during flushes or compactions in high levels which
    1355           1 :                 // overlap the majority of the LSM's sstables.
    1356           1 :                 input, output = output, input
    1357           1 :                 output = output[:0]
    1358           1 : 
    1359           1 :                 cmp := v.cmp.Compare
    1360           1 :                 inputIdx := 0
    1361           1 :                 var currFile *FileMetadata
    1362           1 :                 // If we have an accumulated key range and its start is ≤ smallest,
    1363           1 :                 // we can seek to the accumulated range's end. Otherwise, we need to
    1364           1 :                 // start at the first overlapping file within the level.
    1365           1 :                 if len(input) > 0 && cmp(input[0].Start, smallest) <= 0 {
    1366           1 :                         currFile = seekGT(&iter, cmp, input[0].End)
    1367           1 :                 } else {
    1368           1 :                         currFile = iter.First()
    1369           1 :                 }
    1370             : 
    1371           1 :                 for currFile != nil && inputIdx < len(input) {
    1372           1 :                         // Invariant: Neither currFile nor input[inputIdx] overlaps any earlier
    1373           1 :                         // ranges.
    1374           1 :                         switch {
    1375           1 :                         case cmp(currFile.Largest.UserKey, input[inputIdx].Start) < 0:
    1376           1 :                                 // File is completely before input range.
    1377           1 :                                 output = append(output, currFile.UserKeyBounds())
    1378           1 :                                 currFile = iter.Next()
    1379             : 
    1380           1 :                         case cmp(input[inputIdx].End.Key, currFile.Smallest.UserKey) < 0:
    1381           1 :                                 // Input range is completely before the next file.
    1382           1 :                                 output = append(output, input[inputIdx])
    1383           1 :                                 inputIdx++
    1384             : 
    1385           1 :                         default:
    1386           1 :                                 // Input range and file range overlap or touch. We will maximally extend
    1387           1 :                                 // the range with more overlapping inputs and files.
    1388           1 :                                 currAccum := currFile.UserKeyBounds()
    1389           1 :                                 if cmp(input[inputIdx].Start, currAccum.Start) < 0 {
    1390           1 :                                         currAccum.Start = input[inputIdx].Start
    1391           1 :                                 }
    1392           1 :                                 currFile = iter.Next()
    1393           1 : 
    1394           1 :                                 // Extend curAccum with any overlapping (or touching) input intervals or
    1395           1 :                                 // files. Note that we will always consume at least input[inputIdx].
    1396           1 :                                 for {
    1397           1 :                                         if inputIdx < len(input) && cmp(input[inputIdx].Start, currAccum.End.Key) <= 0 {
    1398           1 :                                                 if currAccum.End.CompareUpperBounds(cmp, input[inputIdx].End) < 0 {
    1399           1 :                                                         currAccum.End = input[inputIdx].End
    1400           1 :                                                         // Skip over files that are entirely inside this newly extended
    1401           1 :                                                         // accumulated range; we expect ranges to be wider in levels that
    1402           1 :                                                         // are higher up so this might skip over a non-trivial number of
    1403           1 :                                                         // files.
    1404           1 :                                                         currFile = seekGT(&iter, cmp, currAccum.End)
    1405           1 :                                                 }
    1406           1 :                                                 inputIdx++
    1407           1 :                                         } else if currFile != nil && cmp(currFile.Smallest.UserKey, currAccum.End.Key) <= 0 {
    1408           1 :                                                 if b := currFile.UserKeyBounds(); currAccum.End.CompareUpperBounds(cmp, b.End) < 0 {
    1409           1 :                                                         currAccum.End = b.End
    1410           1 :                                                 }
    1411           1 :                                                 currFile = iter.Next()
    1412           1 :                                         } else {
    1413           1 :                                                 // No overlaps remaining.
    1414           1 :                                                 break
    1415             :                                         }
    1416             :                                 }
    1417           1 :                                 output = append(output, currAccum)
    1418             :                         }
    1419             :                 }
    1420             :                 // If we have either files or input ranges left over, add them to the
    1421             :                 // output.
    1422           1 :                 output = append(output, input[inputIdx:]...)
    1423           1 :                 for ; currFile != nil; currFile = iter.Next() {
    1424           1 :                         output = append(output, currFile.UserKeyBounds())
    1425           1 :                 }
    1426             :         }
    1427           1 :         return output
    1428             : }
    1429             : 
    1430             : // seekGT seeks to the first file that ends with a boundary that is after the
    1431             : // given boundary. Specifically:
    1432             : //   - if boundary.End is inclusive, the returned file ending boundary is strictly
    1433             : //     greater than boundary.End.Key
    1434             : //   - if boundary.End is exclusive, the returned file ending boundary is either
    1435             : //     greater than boundary.End.Key, or it's inclusive at boundary.End.Key.
    1436           1 : func seekGT(iter *LevelIterator, cmp base.Compare, boundary base.UserKeyBoundary) *FileMetadata {
    1437           1 :         f := iter.SeekGE(cmp, boundary.Key)
    1438           1 :         if f == nil {
    1439           1 :                 return nil
    1440           1 :         }
    1441             :         // If boundary is inclusive or the file boundary is exclusive we do not
    1442             :         // tolerate an equal largest key.
    1443             :         // Note: we know f.Largest.UserKey >= boundary.End.Key so this condition is
    1444             :         // equivalent to boundary.End.IsUpperBoundForInternalKey(cmp, f.Largest).
    1445           1 :         if (boundary.Kind == base.Inclusive || f.Largest.IsExclusiveSentinel()) && cmp(boundary.Key, f.Largest.UserKey) == 0 {
    1446           1 :                 return iter.Next()
    1447           1 :         }
    1448           1 :         return f
    1449             : }
    1450             : 
    1451             : // Contains returns a boolean indicating whether the provided file exists in
    1452             : // the version at the given level. If level is non-zero then Contains binary
    1453             : // searches among the files. If level is zero, Contains scans the entire
    1454             : // level.
    1455           1 : func (v *Version) Contains(level int, m *FileMetadata) bool {
    1456           1 :         iter := v.Levels[level].Iter()
    1457           1 :         if level > 0 {
    1458           1 :                 overlaps := v.Overlaps(level, m.UserKeyBounds())
    1459           1 :                 iter = overlaps.Iter()
    1460           1 :         }
    1461           1 :         for f := iter.First(); f != nil; f = iter.Next() {
    1462           1 :                 if f == m {
    1463           1 :                         return true
    1464           1 :                 }
    1465             :         }
    1466           1 :         return false
    1467             : }
    1468             : 
    1469             : // Overlaps returns all elements of v.files[level] whose user key range
    1470             : // intersects the given bounds. If level is non-zero then the user key bounds of
    1471             : // v.files[level] are assumed to not overlap (although they may touch). If level
    1472             : // is zero then that assumption cannot be made, and the given bounds are
    1473             : // expanded to the union of those matching bounds so far and the computation is
    1474             : // repeated until the bounds stabilize.
    1475             : // The returned files are a subsequence of the input files, i.e., the ordering
    1476             : // is not changed.
    1477           1 : func (v *Version) Overlaps(level int, bounds base.UserKeyBounds) LevelSlice {
    1478           1 :         if level == 0 {
    1479           1 :                 // Indices that have been selected as overlapping.
    1480           1 :                 l0 := v.Levels[level]
    1481           1 :                 l0Iter := l0.Iter()
    1482           1 :                 selectedIndices := make([]bool, l0.Len())
    1483           1 :                 numSelected := 0
    1484           1 :                 var slice LevelSlice
    1485           1 :                 for {
    1486           1 :                         restart := false
    1487           1 :                         for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
    1488           1 :                                 selected := selectedIndices[i]
    1489           1 :                                 if selected {
    1490           1 :                                         continue
    1491             :                                 }
    1492           1 :                                 if !meta.Overlaps(v.cmp.Compare, &bounds) {
    1493           1 :                                         // meta is completely outside the specified range; skip it.
    1494           1 :                                         continue
    1495             :                                 }
    1496             :                                 // Overlaps.
    1497           1 :                                 selectedIndices[i] = true
    1498           1 :                                 numSelected++
    1499           1 : 
    1500           1 :                                 // Since this is L0, check if the newly added fileMetadata has expanded
    1501           1 :                                 // the range. We expand the range immediately for files we have
    1502           1 :                                 // remaining to check in this loop. All already checked and unselected
    1503           1 :                                 // files will need to be rechecked via the restart below.
    1504           1 :                                 if v.cmp.Compare(meta.Smallest.UserKey, bounds.Start) < 0 {
    1505           1 :                                         bounds.Start = meta.Smallest.UserKey
    1506           1 :                                         restart = true
    1507           1 :                                 }
    1508           1 :                                 if !bounds.End.IsUpperBoundForInternalKey(v.cmp.Compare, meta.Largest) {
    1509           1 :                                         bounds.End = base.UserKeyExclusiveIf(meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
    1510           1 :                                         restart = true
    1511           1 :                                 }
    1512             :                         }
    1513             : 
    1514           1 :                         if !restart {
    1515           1 :                                 // Construct a B-Tree containing only the matching items.
    1516           1 :                                 var tr btree
    1517           1 :                                 tr.bcmp = v.Levels[level].tree.bcmp
    1518           1 :                                 for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
    1519           1 :                                         if selectedIndices[i] {
    1520           1 :                                                 err := tr.Insert(meta)
    1521           1 :                                                 if err != nil {
    1522           0 :                                                         panic(err)
    1523             :                                                 }
    1524             :                                         }
    1525             :                                 }
    1526           1 :                                 slice = newLevelSlice(tr.Iter())
    1527           1 :                                 // TODO(jackson): Avoid the oddity of constructing and
    1528           1 :                                 // immediately releasing a B-Tree. Make LevelSlice an
    1529           1 :                                 // interface?
    1530           1 :                                 tr.Release()
    1531           1 :                                 break
    1532             :                         }
    1533             :                         // Continue looping to retry the files that were not selected.
    1534             :                 }
    1535           1 :                 return slice
    1536             :         }
    1537             : 
    1538           1 :         return v.Levels[level].Slice().Overlaps(v.cmp.Compare, bounds)
    1539             : }
    1540             : 
    1541             : // IterAllLevelsAndSublevels calls fn with an iterator for each L0 sublevel
    1542             : // (from top to bottom), then once for each level below L0.
    1543           1 : func (v *Version) IterAllLevelsAndSublevels(fn func(it LevelIterator, level Layer)) {
    1544           1 :         for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
    1545           1 :                 fn(v.L0SublevelFiles[sublevel].Iter(), L0Sublevel(sublevel))
    1546           1 :         }
    1547           1 :         for level := 1; level < NumLevels; level++ {
    1548           1 :                 fn(v.Levels[level].Iter(), Level(level))
    1549           1 :         }
    1550             : }
    1551             : 
    1552             : // CheckOrdering checks that the files are consistent with respect to
    1553             : // increasing file numbers (for level 0 files) and increasing and non-
    1554             : // overlapping internal key ranges (for level non-0 files).
    1555           1 : func (v *Version) CheckOrdering() error {
    1556           1 :         for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
    1557           1 :                 sublevelIter := v.L0SublevelFiles[sublevel].Iter()
    1558           1 :                 if err := CheckOrdering(v.cmp.Compare, v.cmp.FormatKey, L0Sublevel(sublevel), sublevelIter); err != nil {
    1559           0 :                         return base.CorruptionErrorf("%s\n%s", err, v.DebugString())
    1560           0 :                 }
    1561             :         }
    1562             : 
    1563           1 :         for level, lm := range v.Levels {
    1564           1 :                 if err := CheckOrdering(v.cmp.Compare, v.cmp.FormatKey, Level(level), lm.Iter()); err != nil {
    1565           1 :                         return base.CorruptionErrorf("%s\n%s", err, v.DebugString())
    1566           1 :                 }
    1567             :         }
    1568           1 :         return nil
    1569             : }
    1570             : 
    1571             : // VersionList holds a list of versions. The versions are ordered from oldest
    1572             : // to newest.
    1573             : type VersionList struct {
    1574             :         mu   *sync.Mutex
    1575             :         root Version
    1576             : }
    1577             : 
    1578             : // Init initializes the version list.
    1579           1 : func (l *VersionList) Init(mu *sync.Mutex) {
    1580           1 :         l.mu = mu
    1581           1 :         l.root.next = &l.root
    1582           1 :         l.root.prev = &l.root
    1583           1 : }
    1584             : 
    1585             : // Empty returns true if the list is empty, and false otherwise.
    1586           1 : func (l *VersionList) Empty() bool {
    1587           1 :         return l.root.next == &l.root
    1588           1 : }
    1589             : 
    1590             : // Front returns the oldest version in the list. Note that this version is only
    1591             : // valid if Empty() returns true.
    1592           1 : func (l *VersionList) Front() *Version {
    1593           1 :         return l.root.next
    1594           1 : }
    1595             : 
    1596             : // Back returns the newest version in the list. Note that this version is only
    1597             : // valid if Empty() returns true.
    1598           1 : func (l *VersionList) Back() *Version {
    1599           1 :         return l.root.prev
    1600           1 : }
    1601             : 
    1602             : // PushBack adds a new version to the back of the list. This new version
    1603             : // becomes the "newest" version in the list.
    1604           1 : func (l *VersionList) PushBack(v *Version) {
    1605           1 :         if v.list != nil || v.prev != nil || v.next != nil {
    1606           0 :                 panic("pebble: version list is inconsistent")
    1607             :         }
    1608           1 :         v.prev = l.root.prev
    1609           1 :         v.prev.next = v
    1610           1 :         v.next = &l.root
    1611           1 :         v.next.prev = v
    1612           1 :         v.list = l
    1613           1 :         // Let L0Sublevels on the second newest version get GC'd, as it is no longer
    1614           1 :         // necessary. See the comment in Version.
    1615           1 :         v.prev.L0Sublevels = nil
    1616             : }
    1617             : 
    1618             : // Remove removes the specified version from the list.
    1619           1 : func (l *VersionList) Remove(v *Version) {
    1620           1 :         if v == &l.root {
    1621           0 :                 panic("pebble: cannot remove version list root node")
    1622             :         }
    1623           1 :         if v.list != l {
    1624           0 :                 panic("pebble: version list is inconsistent")
    1625             :         }
    1626           1 :         v.prev.next = v.next
    1627           1 :         v.next.prev = v.prev
    1628           1 :         v.next = nil // avoid memory leaks
    1629           1 :         v.prev = nil // avoid memory leaks
    1630           1 :         v.list = nil // avoid memory leaks
    1631             : }
    1632             : 
    1633             : // CheckOrdering checks that the files are consistent with respect to
    1634             : // seqnums (for level 0 files -- see detailed comment below) and increasing and non-
    1635             : // overlapping internal key ranges (for non-level 0 files).
    1636           1 : func CheckOrdering(cmp Compare, format base.FormatKey, level Layer, files LevelIterator) error {
    1637           1 :         // The invariants to check for L0 sublevels are the same as the ones to
    1638           1 :         // check for all other levels. However, if L0 is not organized into
    1639           1 :         // sublevels, or if all L0 files are being passed in, we do the legacy L0
    1640           1 :         // checks, defined in the detailed comment below.
    1641           1 :         if level == Level(0) {
    1642           1 :                 // We have 2 kinds of files:
    1643           1 :                 // - Files with exactly one sequence number: these could be either ingested files
    1644           1 :                 //   or flushed files. We cannot tell the difference between them based on FileMetadata,
    1645           1 :                 //   so our consistency checking here uses the weaker checks assuming it is a narrow
    1646           1 :                 //   flushed file. We cannot error on ingested files having sequence numbers coincident
    1647           1 :                 //   with flushed files as the seemingly ingested file could just be a flushed file
    1648           1 :                 //   with just one key in it which is a truncated range tombstone sharing sequence numbers
    1649           1 :                 //   with other files in the same flush.
    1650           1 :                 // - Files with multiple sequence numbers: these are necessarily flushed files.
    1651           1 :                 //
    1652           1 :                 // Three cases of overlapping sequence numbers:
    1653           1 :                 // Case 1:
    1654           1 :                 // An ingested file contained in the sequence numbers of the flushed file -- it must be
    1655           1 :                 // fully contained (not coincident with either end of the flushed file) since the memtable
    1656           1 :                 // must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
    1657           1 :                 // num b, and the memtable got a subsequent update that was given sequence num b+1, before
    1658           1 :                 // being flushed.
    1659           1 :                 //
    1660           1 :                 // So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
    1661           1 :                 // third file are inconsistent with each other. So comparing adjacent files is insufficient
    1662           1 :                 // for consistency checking.
    1663           1 :                 //
    1664           1 :                 // Visually we have something like
    1665           1 :                 // x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
    1666           1 :                 //     y       y  y        y             (y's represent ingested files)
    1667           1 :                 // And these are ordered in increasing order of y. Note that y's must be unique.
    1668           1 :                 //
    1669           1 :                 // Case 2:
    1670           1 :                 // A flushed file that did not overlap in keys with any file in any level, but does overlap
    1671           1 :                 // in the file key intervals. This file is placed in L0 since it overlaps in the file
    1672           1 :                 // key intervals but since it has no overlapping data, it is assigned a sequence number
    1673           1 :                 // of 0 in RocksDB. We handle this case for compatibility with RocksDB.
    1674           1 :                 //
    1675           1 :                 // Case 3:
    1676           1 :                 // A sequence of flushed files that overlap in sequence numbers with one another,
    1677           1 :                 // but do not overlap in keys inside the sstables. These files correspond to
    1678           1 :                 // partitioned flushes or the results of intra-L0 compactions of partitioned
    1679           1 :                 // flushes.
    1680           1 :                 //
    1681           1 :                 // Since these types of SSTables violate most other sequence number
    1682           1 :                 // overlap invariants, and handling this case is important for compatibility
    1683           1 :                 // with future versions of pebble, this method relaxes most L0 invariant
    1684           1 :                 // checks.
    1685           1 : 
    1686           1 :                 var prev *FileMetadata
    1687           1 :                 for f := files.First(); f != nil; f, prev = files.Next(), f {
    1688           1 :                         if prev == nil {
    1689           1 :                                 continue
    1690             :                         }
    1691             :                         // Validate that the sorting is sane.
    1692           1 :                         if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
    1693           1 :                                 // Multiple files satisfying case 2 mentioned above.
    1694           1 :                         } else if !prev.lessSeqNum(f) {
    1695           1 :                                 return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
    1696           1 :                                         errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
    1697           1 :                                         errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
    1698           1 :                                         errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
    1699           1 :                         }
    1700             :                 }
    1701           1 :         } else {
    1702           1 :                 var prev *FileMetadata
    1703           1 :                 for f := files.First(); f != nil; f, prev = files.Next(), f {
    1704           1 :                         if err := f.Validate(cmp, format); err != nil {
    1705           1 :                                 return errors.Wrapf(err, "%s ", level)
    1706           1 :                         }
    1707           1 :                         if prev != nil {
    1708           1 :                                 if prev.cmpSmallestKey(f, cmp) >= 0 {
    1709           1 :                                         return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
    1710           1 :                                                 errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
    1711           1 :                                                 prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
    1712           1 :                                                 f.Smallest.Pretty(format), f.Largest.Pretty(format))
    1713           1 :                                 }
    1714             : 
    1715             :                                 // In all supported format major version, split user keys are
    1716             :                                 // prohibited, so both files cannot contain keys with the same user
    1717             :                                 // keys. If the bounds have the same user key, the previous file's
    1718             :                                 // boundary must have a InternalKeyTrailer indicating that it's exclusive.
    1719           1 :                                 if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) {
    1720           1 :                                         return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
    1721           1 :                                                 errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
    1722           1 :                                                 prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
    1723           1 :                                                 f.Smallest.Pretty(format), f.Largest.Pretty(format))
    1724           1 :                                 }
    1725             :                         }
    1726             :                 }
    1727             :         }
    1728           1 :         return nil
    1729             : }

Generated by: LCOV version 1.14