LCOV - code coverage report
Current view: top level - pebble - format_major_version.go (source / functions) Coverage Total Hit
Test: 2025-08-19 08:18Z 07506b8d - tests + meta.lcov Lines: 60.5 % 261 158
Test Date: 2025-08-19 08:19:58 Functions: - 0 0

            Line data    Source code
       1              : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2              : // of this source code is governed by a BSD-style license that can be found in
       3              : // the LICENSE file.
       4              : 
       5              : package pebble
       6              : 
       7              : import (
       8              :         "fmt"
       9              :         "strconv"
      10              : 
      11              :         "github.com/cockroachdb/errors"
      12              :         "github.com/cockroachdb/pebble/internal/manifest"
      13              :         "github.com/cockroachdb/pebble/objstorage/remote"
      14              :         "github.com/cockroachdb/pebble/sstable"
      15              :         "github.com/cockroachdb/pebble/sstable/blob"
      16              :         "github.com/cockroachdb/pebble/vfs"
      17              :         "github.com/cockroachdb/pebble/vfs/atomicfs"
      18              : )
      19              : 
      20              : // FormatMajorVersion is a constant controlling the format of persisted
      21              : // data. Backwards incompatible changes to durable formats are gated
      22              : // behind new format major versions.
      23              : //
      24              : // At any point, a database's format major version may be bumped.
      25              : // However, once a database's format major version is increased,
      26              : // previous versions of Pebble will refuse to open the database.
      27              : //
      28              : // The zero value format is the FormatDefault constant. The exact
      29              : // FormatVersion that the default corresponds to may change with time.
      30              : type FormatMajorVersion uint64
      31              : 
      32              : // SafeValue implements redact.SafeValue.
      33            0 : func (v FormatMajorVersion) SafeValue() {}
      34              : 
      35              : // String implements fmt.Stringer.
      36            2 : func (v FormatMajorVersion) String() string {
      37            2 :         // NB: This must not change. It's used as the value for the on-disk
      38            2 :         // version marker file.
      39            2 :         //
      40            2 :         // Specifically, this value must always parse as a base 10 integer
      41            2 :         // that fits in a uint64. We format it as a zero-padded, 3-digit
      42            2 :         // number today, but the padding may change.
      43            2 :         return fmt.Sprintf("%03d", v)
      44            2 : }
      45              : 
      46              : const (
      47              :         // FormatDefault leaves the format version unspecified. When used to create a
      48              :         // new store, Pebble will choose the earliest format version it supports.
      49              :         FormatDefault FormatMajorVersion = iota
      50              : 
      51              :         // 21.2 versions.
      52              : 
      53              :         // FormatMostCompatible maintains the most backwards compatibility,
      54              :         // maintaining bi-directional compatibility with RocksDB 6.2.1 in
      55              :         // the particular configuration described in the Pebble README.
      56              :         // Deprecated.
      57              :         _ // FormatMostCompatible
      58              : 
      59              :         // formatVersionedManifestMarker is the first
      60              :         // backwards-incompatible change made to Pebble, introducing the
      61              :         // format-version marker file for handling backwards-incompatible
      62              :         // changes more broadly, and replacing the `CURRENT` file with a
      63              :         // marker file.
      64              :         //
      65              :         // This format version is intended as an intermediary version state.
      66              :         // It is deliberately unexported to discourage direct use of this
      67              :         // format major version.  Clients should use FormatVersioned which
      68              :         // also ensures earlier versions of Pebble fail to open a database
      69              :         // written in a future format major version.
      70              :         // Deprecated.
      71              :         _ // formatVersionedManifestMarker
      72              : 
      73              :         // FormatVersioned is a new format major version that replaces the
      74              :         // old `CURRENT` file with a new 'marker' file scheme.  Previous
      75              :         // Pebble versions will be unable to open the database unless
      76              :         // they're aware of format versions.
      77              :         // Deprecated.
      78              :         _ // FormatVersioned
      79              : 
      80              :         // FormatSetWithDelete is a format major version that introduces a new key
      81              :         // kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
      82              :         // unable to open this database.
      83              :         // Deprecated.
      84              :         _ // FormatSetWithDelete
      85              : 
      86              :         // 22.1 versions.
      87              : 
      88              :         // FormatBlockPropertyCollector is a format major version that introduces
      89              :         // BlockPropertyCollectors.
      90              :         // Deprecated.
      91              :         _ // FormatBlockPropertyCollector
      92              : 
      93              :         // FormatSplitUserKeysMarked is a format major version that guarantees that
      94              :         // all files that share user keys with neighbors are marked for compaction
      95              :         // in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
      96              :         // (without holding mutexes) until the scan of the LSM is complete and the
      97              :         // manifest has been rotated.
      98              :         // Deprecated.
      99              :         _ // FormatSplitUserKeysMarked
     100              : 
     101              :         // 22.2 versions.
     102              : 
     103              :         // FormatSplitUserKeysMarkedCompacted is a format major version that
     104              :         // guarantees that all files explicitly marked for compaction in the manifest
     105              :         // have been compacted. Combined with the FormatSplitUserKeysMarked format
     106              :         // major version, this version guarantees that there are no user keys split
     107              :         // across multiple files within a level L1+. Ratcheting to this format version
     108              :         // will block (without holding mutexes) until all necessary compactions for
     109              :         // files marked for compaction are complete.
     110              :         // Deprecated.
     111              :         _ // FormatSplitUserKeysMarkedCompacted
     112              : 
     113              :         // FormatRangeKeys is a format major version that introduces range keys.
     114              :         // Deprecated.
     115              :         _ // FormatRangeKeys
     116              : 
     117              :         // FormatMinTableFormatPebblev1 is a format major version that guarantees that
     118              :         // tables created by or ingested into the DB at or above this format major
     119              :         // version will have a table format version of at least Pebblev1 (Block
     120              :         // Properties).
     121              :         // Deprecated.
     122              :         _ // FormatMinTableFormatPebblev1
     123              : 
     124              :         // FormatPrePebblev1Marked is a format major version that guarantees that all
     125              :         // sstables with a table format version pre-Pebblev1 (i.e. those that are
     126              :         // guaranteed to not contain block properties) are marked for compaction in
     127              :         // the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
     128              :         // holding mutexes) until the scan of the LSM is complete and the manifest has
     129              :         // been rotated.
     130              :         // Deprecated.
     131              :         _ // FormatPrePebblev1Marked
     132              : 
     133              :         // 23.1 versions.
     134              : 
     135              :         // formatUnusedPrePebblev1MarkedCompacted is an unused format major version.
     136              :         // This format major version was originally intended to ship in the 23.1
     137              :         // release. It was later decided that this should be deferred until a
     138              :         // subsequent release. The original ordering is preserved so as not to
     139              :         // introduce breaking changes in Cockroach.
     140              :         _ // formatUnusedPrePebblev1MarkedCompacted
     141              : 
     142              :         // FormatSSTableValueBlocks is a format major version that adds support for
     143              :         // storing values in value blocks in the sstable. Value block support is not
     144              :         // necessarily enabled when writing sstables, when running with this format
     145              :         // major version.
     146              :         _ // FormatSSTableValueBlocks
     147              : 
     148              :         // FormatFlushableIngest is a format major version that enables lazy
     149              :         // addition of ingested sstables into the LSM structure. When an ingest
     150              :         // overlaps with a memtable, a record of the ingest is written to the WAL
     151              :         // without waiting for a flush. Subsequent reads treat the ingested files as
     152              :         // a level above the overlapping memtable. Once the memtable is flushed, the
     153              :         // ingested files are moved into the lowest possible levels.
     154              :         //
     155              :         // This feature is behind a format major version because it required
     156              :         // breaking changes to the WAL format.
     157              :         FormatFlushableIngest
     158              : 
     159              :         // 23.2 versions.
     160              : 
     161              :         // FormatPrePebblev1MarkedCompacted is a format major version that guarantees
     162              :         // that all sstables explicitly marked for compaction in the manifest (see
     163              :         // FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
     164              :         // version will block (without holding mutexes) until all necessary
     165              :         // compactions for files marked for compaction are complete.
     166              :         FormatPrePebblev1MarkedCompacted
     167              : 
     168              :         // FormatDeleteSizedAndObsolete is a format major version that adds support
     169              :         // for deletion tombstones that encode the size of the value they're
     170              :         // expected to delete. This format major version is required before the
     171              :         // associated key kind may be committed through batch applications or
     172              :         // ingests. It also adds support for keys that are marked obsolete (see
     173              :         // sstable/format.go for details).
     174              :         FormatDeleteSizedAndObsolete
     175              : 
     176              :         // FormatVirtualSSTables is a format major version that adds support for
     177              :         // virtual sstables that can reference a sub-range of keys in an underlying
     178              :         // physical sstable. This information is persisted through new,
     179              :         // backward-incompatible fields in the Manifest, and therefore requires
     180              :         // a format major version.
     181              :         FormatVirtualSSTables
     182              : 
     183              :         // FormatSyntheticPrefixSuffix is a format major version that adds support for
     184              :         // sstables to have their content exposed in a different prefix or suffix of
     185              :         // keyspace than the actual prefix/suffix persisted in the keys in such
     186              :         // sstables. The prefix and suffix replacement information is stored in new
     187              :         // fields in the Manifest and thus requires a format major version.
     188              :         FormatSyntheticPrefixSuffix
     189              : 
     190              :         // FormatFlushableIngestExcises is a format major version that adds support for
     191              :         // unconditionally writing excises as flushable ingestions. This is
     192              :         // implemented by adding a new key kind that can go in the same batches
     193              :         // as flushable ingested sstables.
     194              :         FormatFlushableIngestExcises
     195              : 
     196              :         // FormatColumnarBlocks is a format major version enabling use of the
     197              :         // TableFormatPebblev5 table format, that encodes sstable data blocks, index
     198              :         // blocks and keyspan blocks by organizing the KVs into columns within the
     199              :         // block.
     200              :         FormatColumnarBlocks
     201              : 
     202              :         // FormatWALSyncChunks is a format major version enabling the writing of
     203              :         // WAL sync chunks. These new chunks are used to disambiguate between corruption
     204              :         // and logical EOF during WAL replay. This is implemented by adding a new
     205              :         // chunk wire format that encodes an additional "Synced Offset" field which acts
     206              :         // as a commitment that the WAL should have been synced up until the offset.
     207              :         FormatWALSyncChunks
     208              : 
     209              :         // FormatTableFormatV6 is a format major version enabling the sstable table
     210              :         // format TableFormatPebblev6.
     211              :         //
     212              :         // The TableFormatPebblev6 sstable format introduces a checksum within the
     213              :         // sstable footer, allows inclusion of blob handle references within the
     214              :         // value column of a sstable block, and supports columnar meta index +
     215              :         // properties blocks.
     216              :         //
     217              :         // This format major version does not yet enable use of value separation.
     218              :         FormatTableFormatV6
     219              : 
     220              :         // formatDeprecatedExperimentalValueSeparation was used to enable an
     221              :         // experimental version of value separation, separating values into external
     222              :         // blob files that do not participate in every compaction.
     223              :         //
     224              :         // Value separation now depends on TableFormatPebblev7 which this format
     225              :         // major version precedes. This format major version is deprecated and
     226              :         // unexported, and value separation now requires FormatValueSeparation.
     227              :         formatDeprecatedExperimentalValueSeparation
     228              : 
     229              :         // formatFooterAttributes is a format major version that adds support for
     230              :         // writing sstable.Attributes in the footer of sstables.
     231              :         formatFooterAttributes
     232              : 
     233              :         // FormatValueSeparation is a format major version that adds support for
     234              :         // value separation, separating values into external blob files that do not
     235              :         // participate in every compaction.
     236              :         FormatValueSeparation
     237              : 
     238              :         // FormatExciseBoundsRecord is a format major version that adds support for
     239              :         // persisting excise bounds records in the manifest (VersionEdit).
     240              :         FormatExciseBoundsRecord
     241              : 
     242              :         // FormatV2BlobFiles is a format major version that adds support for the V2
     243              :         // blob file format (which adds compression statistics).
     244              :         FormatV2BlobFiles
     245              : 
     246              :         // -- Add new versions here --
     247              : 
     248              :         // FormatNewest is the most recent format major version.
     249              :         FormatNewest FormatMajorVersion = iota - 1
     250              : 
     251              :         // Experimental versions, which are excluded by FormatNewest (but can be used
     252              :         // in tests) can be defined here.
     253              : 
     254              :         // -- Add experimental versions here --
     255              : 
     256              :         // internalFormatNewest is the most recent, possibly experimental format major
     257              :         // version.
     258              :         internalFormatNewest FormatMajorVersion = iota - 2
     259              : )
     260              : 
     261              : // FormatMinSupported is the minimum format version that is supported by this
     262              : // Pebble version.
     263              : const FormatMinSupported = FormatFlushableIngest
     264              : 
     265              : // FormatMinForSharedObjects is the minimum format version that supports shared
     266              : // objects (see CreateOnShared option).
     267              : const FormatMinForSharedObjects = FormatVirtualSSTables
     268              : 
     269              : // resolveDefault asserts that the given version is supported, and returns the
     270              : // given version, replacing FormatDefault with FormatMinSupported.
     271            2 : func (v FormatMajorVersion) resolveDefault() FormatMajorVersion {
     272            2 :         if v == FormatDefault {
     273            1 :                 return FormatMinSupported
     274            1 :         }
     275            2 :         if v < FormatMinSupported || v > internalFormatNewest {
     276            1 :                 panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
     277              :         }
     278            2 :         return v
     279              : }
     280              : 
     281              : // MaxTableFormat returns the maximum sstable.TableFormat that can be used at
     282              : // this FormatMajorVersion.
     283            2 : func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
     284            2 :         v = v.resolveDefault()
     285            2 :         switch {
     286            2 :         case v >= formatFooterAttributes:
     287            2 :                 return sstable.TableFormatPebblev7
     288            2 :         case v >= FormatTableFormatV6:
     289            2 :                 return sstable.TableFormatPebblev6
     290            2 :         case v >= FormatColumnarBlocks:
     291            2 :                 return sstable.TableFormatPebblev5
     292            2 :         case v >= FormatDeleteSizedAndObsolete:
     293            2 :                 return sstable.TableFormatPebblev4
     294            2 :         default:
     295            2 :                 return sstable.TableFormatPebblev3
     296              :         }
     297              : }
     298              : 
     299              : // MinTableFormat returns the minimum sstable.TableFormat that can be used at
     300              : // this FormatMajorVersion.
     301            2 : func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
     302            2 :         _ = v.resolveDefault()
     303            2 :         return sstable.TableFormatPebblev1
     304            2 : }
     305              : 
     306              : // MaxBlobFileFormat returns the maximum blob.FileFormat that can be used at
     307              : // this FormatMajorVersion. It can only be used on versions that support value
     308              : // separation.
     309            2 : func (v FormatMajorVersion) MaxBlobFileFormat() blob.FileFormat {
     310            2 :         v = v.resolveDefault()
     311            2 :         switch {
     312            2 :         case v >= FormatV2BlobFiles:
     313            2 :                 return blob.FileFormatV2
     314            2 :         case v >= FormatValueSeparation:
     315            2 :                 return blob.FileFormatV1
     316            1 :         default:
     317            1 :                 panic(fmt.Sprintf("pebble: format major version %s does not support blob files", v))
     318              :         }
     319              : }
     320              : 
     321              : // formatMajorVersionMigrations defines the migrations from one format
     322              : // major version to the next. Each migration is defined as a closure
     323              : // which will be invoked on the database before the new format major
     324              : // version is committed. Migrations must be idempotent. Migrations are
     325              : // invoked with d.mu locked.
     326              : //
     327              : // Each migration is responsible for invoking finalizeFormatVersUpgrade
     328              : // to set the new format major version.  RatchetFormatMajorVersion will
     329              : // panic if a migration returns a nil error but fails to finalize the
     330              : // new format major version.
     331              : var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
     332            0 :         FormatFlushableIngest: func(d *DB) error { return nil },
     333            2 :         FormatPrePebblev1MarkedCompacted: func(d *DB) error {
     334            2 :                 // Before finalizing the format major version, rewrite any sstables
     335            2 :                 // still marked for compaction. Note that all format major version
     336            2 :                 // migrations are invoked with DB.mu locked.
     337            2 :                 if err := d.compactMarkedFilesLocked(); err != nil {
     338            0 :                         return err
     339            0 :                 }
     340            2 :                 return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted)
     341              :         },
     342            2 :         FormatDeleteSizedAndObsolete: func(d *DB) error {
     343            2 :                 return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete)
     344            2 :         },
     345            2 :         FormatVirtualSSTables: func(d *DB) error {
     346            2 :                 return d.finalizeFormatVersUpgrade(FormatVirtualSSTables)
     347            2 :         },
     348            2 :         FormatSyntheticPrefixSuffix: func(d *DB) error {
     349            2 :                 return d.finalizeFormatVersUpgrade(FormatSyntheticPrefixSuffix)
     350            2 :         },
     351            2 :         FormatFlushableIngestExcises: func(d *DB) error {
     352            2 :                 return d.finalizeFormatVersUpgrade(FormatFlushableIngestExcises)
     353            2 :         },
     354            2 :         FormatColumnarBlocks: func(d *DB) error {
     355            2 :                 return d.finalizeFormatVersUpgrade(FormatColumnarBlocks)
     356            2 :         },
     357            2 :         FormatWALSyncChunks: func(d *DB) error {
     358            2 :                 return d.finalizeFormatVersUpgrade(FormatWALSyncChunks)
     359            2 :         },
     360            2 :         FormatTableFormatV6: func(d *DB) error {
     361            2 :                 return d.finalizeFormatVersUpgrade(FormatTableFormatV6)
     362            2 :         },
     363            2 :         formatDeprecatedExperimentalValueSeparation: func(d *DB) error {
     364            2 :                 return d.finalizeFormatVersUpgrade(formatDeprecatedExperimentalValueSeparation)
     365            2 :         },
     366            2 :         formatFooterAttributes: func(d *DB) error {
     367            2 :                 return d.finalizeFormatVersUpgrade(formatFooterAttributes)
     368            2 :         },
     369            2 :         FormatValueSeparation: func(d *DB) error {
     370            2 :                 return d.finalizeFormatVersUpgrade(FormatValueSeparation)
     371            2 :         },
     372            2 :         FormatExciseBoundsRecord: func(d *DB) error {
     373            2 :                 return d.finalizeFormatVersUpgrade(FormatExciseBoundsRecord)
     374            2 :         },
     375            2 :         FormatV2BlobFiles: func(d *DB) error {
     376            2 :                 return d.finalizeFormatVersUpgrade(FormatV2BlobFiles)
     377            2 :         },
     378              : }
     379              : 
     380              : const formatVersionMarkerName = `format-version`
     381              : 
     382              : // lookupFormatMajorVersion retrieves the format version from the format version
     383              : // marker file.
     384              : //
     385              : // If such a file does not exist, returns FormatDefault. Note that this case is
     386              : // only acceptable if we are creating a new store (we no longer support
     387              : // FormatMostCompatible which is the only one with no version marker file).
     388              : func lookupFormatMajorVersion(
     389              :         fs vfs.FS, dirname string, ls []string,
     390            2 : ) (FormatMajorVersion, *atomicfs.Marker, error) {
     391            2 :         m, versString, err := atomicfs.LocateMarkerInListing(fs, dirname, formatVersionMarkerName, ls)
     392            2 :         if err != nil {
     393            1 :                 return 0, nil, err
     394            1 :         }
     395            2 :         if versString == "" {
     396            2 :                 return FormatDefault, m, nil
     397            2 :         }
     398            2 :         v, err := strconv.ParseUint(versString, 10, 64)
     399            2 :         if err != nil {
     400            0 :                 return 0, nil, errors.Wrap(err, "parsing format major version")
     401            0 :         }
     402            2 :         vers := FormatMajorVersion(v)
     403            2 :         if vers == FormatDefault {
     404            0 :                 return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers)
     405            0 :         }
     406            2 :         if vers > internalFormatNewest {
     407            1 :                 return 0, nil, errors.Newf("pebble: database %q written in unknown format major version %d", dirname, vers)
     408            1 :         }
     409            2 :         if vers < FormatMinSupported {
     410            0 :                 return 0, nil, errors.Newf("pebble: database %q written in format major version %d which is no longer supported", dirname, vers)
     411            0 :         }
     412            2 :         return vers, m, nil
     413              : }
     414              : 
     415              : // FormatMajorVersion returns the database's active format major
     416              : // version. The format major version may be higher than the one
     417              : // provided in Options when the database was opened if the existing
     418              : // database was written with a higher format version.
     419            2 : func (d *DB) FormatMajorVersion() FormatMajorVersion {
     420            2 :         return FormatMajorVersion(d.mu.formatVers.vers.Load())
     421            2 : }
     422              : 
     423              : // TableFormat returns the TableFormat that the database is currently using when
     424              : // writing sstables. The table format is determined by the database's format
     425              : // major version, as well as experimental settings like EnableValueBlocks and
     426              : // EnableColumnarBlocks.
     427            2 : func (d *DB) TableFormat() sstable.TableFormat {
     428            2 :         // The table is typically written at the maximum allowable format implied by
     429            2 :         // the current format major version of the DB.
     430            2 :         f := d.FormatMajorVersion().MaxTableFormat()
     431            2 :         if f == sstable.TableFormatPebblev3 {
     432            2 :                 // In format major versions with maximum table formats of Pebblev3,
     433            2 :                 // value blocks were conditional on an experimental setting. In format
     434            2 :                 // major versions with maximum table formats of Pebblev4 and higher,
     435            2 :                 // value blocks are always enabled.
     436            2 :                 if d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks() {
     437            2 :                         f = sstable.TableFormatPebblev2
     438            2 :                 }
     439              :         }
     440            2 :         return f
     441              : }
     442              : 
     443              : // BlobFileFormat returns the blob.FileFormat that the database is currently
     444              : // using when writing blob files.
     445            2 : func (d *DB) BlobFileFormat() blob.FileFormat {
     446            2 :         return d.FormatMajorVersion().MaxBlobFileFormat()
     447            2 : }
     448              : 
     449              : // shouldCreateShared returns true if new objects on the given level should be
     450              : // shared: CreateOnShared must permit it and the format must be >= FormatMinForSharedObjects.
     451            2 : func (d *DB) shouldCreateShared(targetLevel int) bool {
     452            2 :         return remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, targetLevel) &&
     453            2 :                 d.FormatMajorVersion() >= FormatMinForSharedObjects
     454            2 : }
     455              : 
     456              : // RatchetFormatMajorVersion ratchets the opened database's format major
     457              : // version to the provided version. It errors if the provided format
     458              : // major version is below the database's current version, and panics
     459              : // if the database is closed. Once a database's format major version is
     460              : // upgraded, previous Pebble versions that do not know of the format
     461              : // version will be unable to open the database.
     462            2 : func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
     463            2 :         if err := d.closed.Load(); err != nil {
     464            1 :                 panic(err)
     465              :         }
     466              : 
     467            2 :         d.mu.Lock()
     468            2 :         defer d.mu.Unlock()
     469            2 :         return d.ratchetFormatMajorVersionLocked(fmv)
     470              : }
     471              : 
     472            2 : func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
     473            2 :         if d.opts.ReadOnly {
     474            0 :                 return ErrReadOnly
     475            0 :         }
     476            2 :         if formatVers > internalFormatNewest {
     477            0 :                 // Guard against accidentally forgetting to update internalFormatNewest.
     478            0 :                 return errors.Errorf("pebble: unknown format version %d", formatVers)
     479            0 :         }
     480            2 :         if currentVers := d.FormatMajorVersion(); currentVers > formatVers {
     481            0 :                 return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
     482            0 :                         currentVers, formatVers)
     483            0 :         }
     484            2 :         if d.mu.formatVers.ratcheting {
     485            0 :                 return errors.Newf("pebble: database format major version upgrade is in-progress")
     486            0 :         }
     487            2 :         d.mu.formatVers.ratcheting = true // rejects overlapping ratchets (see check above) until this call returns
     488            2 :         defer func() { d.mu.formatVers.ratcheting = false }()
     489              : 
     490            2 :         for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ { // one migration per version, in order
     491            2 :                 if err := formatMajorVersionMigrations[nextVers](d); err != nil {
     492            0 :                         return errors.Wrapf(err, "migrating to version %d", nextVers)
     493            0 :                 }
     494              : 
     495              :                 // NB: The migration is responsible for calling
     496              :                 // finalizeFormatVersUpgrade to finalize the upgrade. This
     497              :                 // structure is necessary because some migrations may need to
     498              :                 // update in-memory state (without ever dropping locks) after
     499              :                 // the upgrade is finalized. Here we assert that the upgrade
     500              :                 // did occur.
     501            2 :                 if d.FormatMajorVersion() != nextVers {
     502            0 :                         d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers)
     503            0 :                 }
     504              :         }
     505            2 :         return nil
     506              : }
     507              : 
     508              : // finalizeFormatVersUpgrade is typically only called from within a
     509              : // format major version migration. It writes the version marker, stores
     510              : // the new version in memory, and then notifies the EventListener.
     511              : // See formatMajorVersionMigrations.
     512            2 : func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
     513            2 :         if err := d.writeFormatVersionMarker(formatVers); err != nil {
     514            0 :                 return err
     515            0 :         }
     516            2 :         d.mu.formatVers.vers.Store(uint64(formatVers))
     517            2 :         d.opts.EventListener.FormatUpgrade(formatVers)
     518            2 :         return nil
     519              : }
     520              : 
     521            2 : func (d *DB) writeFormatVersionMarker(formatVers FormatMajorVersion) error {
     522            2 :         // The active format version is encoded in the marker filename
     523            2 :         // itself. Unlike other uses of the atomic marker, there is no
     524            2 :         // file with the filename `formatVers.String()` on the
     525            2 :         // filesystem.
     526            2 :         return d.mu.formatVers.marker.Move(formatVers.String())
     527            2 : }
     528              : 
     529              : // compactMarkedFilesLocked performs a migration that schedules rewrite
     530              : // compactions to compact away any sstables marked for compaction.
     531              : // compactMarkedFilesLocked is run while ratcheting the database's format major
     532              : // version to FormatSplitUserKeysMarkedCompacted.
     533              : //
     534              : // Note that while this method is called with the DB.mu held, and will not
     535              : // return until all marked files have been compacted (or the DB is closed), the
     536              : // mutex is dropped while waiting for compactions to complete (or for slots to free up).
     537            2 : func (d *DB) compactMarkedFilesLocked() error {
     538            2 :         curr := d.mu.versions.currentVersion()
     539            2 :         if curr.Stats.MarkedForCompaction == 0 {
     540            2 :                 return nil
     541            2 :         }
     542              :         // Attempt to schedule a compaction to rewrite a file marked for compaction.
     543              :         // We simply call maybeScheduleCompaction since it also picks rewrite
     544              :         // compactions. Note that we don't need to call this repeatedly in the for
     545              :         // loop below since the completion of a compaction either starts a new one
     546              :         // or ensures a compaction is queued for scheduling. By calling
     547              :         // maybeScheduleCompaction here we are simply kicking off this behavior.
     548            0 :         d.maybeScheduleCompaction()
     549            0 : 
     550            0 :         // The above attempt might succeed and schedule a rewrite compaction. Or
     551            0 :         // there might not be available compaction concurrency to schedule the
     552            0 :         // compaction. Or compaction of the file might have already been in
     553            0 :         // progress. In any scenario, wait until there's some change in the
     554            0 :         // state of active compactions.
     555            0 :         for curr.Stats.MarkedForCompaction > 0 {
     556            0 :                 // Before waiting, check that the database hasn't been closed. Trying to
     557            0 :                 // schedule the compaction may have dropped d.mu while waiting for a
     558            0 :                 // manifest write to complete. In that dropped interim, the database may
     559            0 :                 // have been closed.
     560            0 :                 if err := d.closed.Load(); err != nil {
     561            0 :                         return err.(error)
     562            0 :                 }
     563              : 
     564              :                 // Some flush or compaction may have scheduled or completed while we waited
     565              :                 // for the manifest lock in maybeScheduleCompactionPicker. Get the latest
     566              :                 // Version before waiting on a compaction.
     567            0 :                 curr = d.mu.versions.currentVersion()
     568            0 : 
     569            0 :                 // Only wait on compactions if there are files still marked for compaction.
     570            0 :                 // NB: Waiting on this condition variable drops d.mu while blocked.
     571            0 :                 if curr.Stats.MarkedForCompaction > 0 {
     572            0 :                         // NB: we cannot assert that d.mu.compact.compactingCount > 0, since
     573            0 :                         // with a CompactionScheduler a DB may not have even one ongoing
     574            0 :                         // compaction (if other competing activities are being preferred by the
     575            0 :                         // scheduler).
     576            0 :                         d.mu.compact.cond.Wait()
     577            0 :                         // Refresh the current version again.
     578            0 :                         curr = d.mu.versions.currentVersion()
     579            0 :                 }
     580              :         }
     581            0 :         return nil
     582              : }
     583              : 
     584              : // findFilesFunc scans the LSM for files, returning true if at least one
     585              : // file was found. The returned array contains the matched files, if any, per
     586              : // level.
     587              : type findFilesFunc func(v *manifest.Version) (found bool, files [numLevels][]*manifest.TableMetadata, _ error)
     588              : 
     589              : // markFilesLocked is not used currently, but it will be useful the next time we
     590              : // need to mark files for compaction; this blank assignment keeps it referenced.
     591              : var _ = (*DB)(nil).markFilesLocked
     592              : 
     593              : // markFilesLocked durably marks the files that match the given findFilesFunc for
     594              : // compaction. It is called with d.mu held, but drops d.mu while findFn runs.
     595            0 : func (d *DB) markFilesLocked(findFn findFilesFunc) error {
     596            0 :         jobID := d.newJobIDLocked()
     597            0 : 
     598            0 :         // Acquire a read state to have a view of the LSM and a guarantee that none
     599            0 :         // of the referenced files will be deleted until we've unreferenced the read
     600            0 :         // state. Some findFilesFuncs may read the files, requiring they not be
     601            0 :         // deleted.
     602            0 :         rs := d.loadReadState()
     603            0 :         var (
     604            0 :                 found bool
     605            0 :                 files [numLevels][]*manifest.TableMetadata
     606            0 :                 err   error
     607            0 :         )
     608            0 :         func() {
     609            0 :                 defer rs.unrefLocked()
     610            0 :                 // Note the unusual locking: unlock, defer Lock(). The scan of the files in
     611            0 :                 // the version does not need to block other operations that require the
     612            0 :                 // DB.mu. Drop it for the scan, before re-acquiring it.
     613            0 :                 d.mu.Unlock()
     614            0 :                 defer d.mu.Lock()
     615            0 :                 found, files, err = findFn(rs.current)
     616            0 :         }()
     617            0 :         if err != nil {
     618            0 :                 return err
     619            0 :         }
     620              : 
     621              :         // The database lock has been acquired again by the defer within the above
     622              :         // anonymous function.
     623            0 :         if !found {
     624            0 :                 // Nothing to do.
     625            0 :                 return nil
     626            0 :         }
     627              : 
     628              :         // After scanning, if we found files to mark, we fetch the current state of
     629              :         // the LSM (which may have changed) and set MarkedForCompaction on the files,
     630              :         // and update the version's Stats.MarkedForCompaction count, which are both
     631              :         // protected by d.mu.
     632              : 
     633              :         // Lock the manifest for a coherent view of the LSM. The database lock has
     634              :         // been re-acquired by the defer within the above anonymous function.
     635            0 :         _, err = d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) {
     636            0 :                 vers := d.mu.versions.currentVersion()
     637            0 :                 for l, filesToMark := range files {
     638            0 :                         if len(filesToMark) == 0 {
     639            0 :                                 continue
     640              :                         }
     641            0 :                         for _, f := range filesToMark {
     642            0 :                                 // Ignore files to be marked that have already been compacted or marked.
     643            0 :                                 if f.CompactionState == manifest.CompactionStateCompacted ||
     644            0 :                                         f.MarkedForCompaction {
     645            0 :                                         continue
     646              :                                 }
     647              :                                 // Else, mark the file for compaction in this version.
     648            0 :                                 vers.Stats.MarkedForCompaction++
     649            0 :                                 f.MarkedForCompaction = true
     650              :                         }
     651              :                         // The compaction picker uses the markedForCompactionAnnotator to
     652              :                         // quickly find files marked for compaction, or to quickly determine
     653              :                         // that there are no such files marked for compaction within a level.
     654              :                         // A b-tree node may be annotated with an annotation recording that
     655              :                         // there are no files marked for compaction within the node's subtree,
     656              :                         // based on the assumption that it's static.
     657              :                         //
     658              :                         // Since we're marking files for compaction, these b-tree nodes'
     659              :                         // annotations will be out of date. Clear the compaction-picking
     660              :                         // annotation, so that it's recomputed the next time the compaction
     661              :                         // picker looks for a file marked for compaction.
     662            0 :                         markedForCompactionAnnotator.InvalidateLevelAnnotation(vers.Levels[l])
     663              :                 }
     664              :                 // The 'marked-for-compaction' bit is persisted in the MANIFEST file
     665              :                 // metadata. We've already modified the in-memory table metadata, but the
     666              :                 // manifest hasn't been updated. Force rotation to a new MANIFEST file,
     667              :                 // which will write every table metadata to the new manifest file and ensure
     668              :                 // that the now marked-for-compaction table metadata are persisted as marked.
     669            0 :                 return versionUpdate{
     670            0 :                         VE:                      &manifest.VersionEdit{},
     671            0 :                         JobID:                   jobID,
     672            0 :                         ForceManifestRotation:   true,
     673            0 :                         InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) },
     674              :                 }, nil
     675              :         })
     676            0 :         return err
     677              : }
        

Generated by: LCOV version 2.0-1