LCOV - 2024-12-11 08:18Z ab9741a3 - tests + meta.lcov

LCOV - code coverage report

Current view:	top level - pebble - compaction.go (source / functions)		Hit	Total	Coverage
Test:	2024-12-11 08:18Z ab9741a3 - tests + meta.lcov	Lines:	2075	2219	93.5 %
Date:	2024-12-11 08:19:09	Functions:	0	0	-

          Line data    Source code

       1             : // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package pebble
       6             : 
       7             : import (
       8             :         "bytes"
       9             :         "context"
      10             :         "fmt"
      11             :         "math"
      12             :         "runtime/pprof"
      13             :         "slices"
      14             :         "sort"
      15             :         "sync/atomic"
      16             :         "time"
      17             : 
      18             :         "github.com/cockroachdb/crlib/crtime"
      19             :         "github.com/cockroachdb/errors"
      20             :         "github.com/cockroachdb/pebble/internal/base"
      21             :         "github.com/cockroachdb/pebble/internal/compact"
      22             :         "github.com/cockroachdb/pebble/internal/keyspan"
      23             :         "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl"
      24             :         "github.com/cockroachdb/pebble/internal/manifest"
      25             :         "github.com/cockroachdb/pebble/internal/sstableinternal"
      26             :         "github.com/cockroachdb/pebble/objstorage"
      27             :         "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
      28             :         "github.com/cockroachdb/pebble/objstorage/remote"
      29             :         "github.com/cockroachdb/pebble/sstable"
      30             :         "github.com/cockroachdb/pebble/vfs"
      31             : )
      32             : 
      33             : var errEmptyTable = errors.New("pebble: empty table")
      34             : 
      35             : // ErrCancelledCompaction is returned if a compaction is cancelled by a
      36             : // concurrent excise or ingest-split operation.
      37             : var ErrCancelledCompaction = errors.New("pebble: compaction cancelled by a concurrent operation, will retry compaction")
      38             : 
      39             : var compactLabels = pprof.Labels("pebble", "compact")
      40             : var flushLabels = pprof.Labels("pebble", "flush")
      41             : var gcLabels = pprof.Labels("pebble", "gc")
      42             : 
      43             : // expandedCompactionByteSizeLimit is the maximum number of bytes in all
      44             : // compacted files. We avoid expanding the lower level file set of a compaction
      45             : // if it would make the total compaction cover more than this many bytes.
      46           2 : func expandedCompactionByteSizeLimit(opts *Options, level int, availBytes uint64) uint64 {
      47           2 :         v := uint64(25 * opts.Level(level).TargetFileSize)
      48           2 : 
      49           2 :         // Never expand a compaction beyond half the available capacity, divided
      50           2 :         // by the maximum number of concurrent compactions. Each of the concurrent
      51           2 :         // compactions may expand up to this limit, so this attempts to limit
      52           2 :         // compactions to half of available disk space. Note that this will not
      53           2 :         // prevent compaction picking from pursuing compactions that are larger
      54           2 :         // than this threshold before expansion.
      55           2 :         diskMax := (availBytes / 2) / uint64(opts.MaxConcurrentCompactions())
      56           2 :         if v > diskMax {
      57           1 :                 v = diskMax
      58           1 :         }
      59           2 :         return v
      60             : }
      61             : 
      62             : // maxGrandparentOverlapBytes is the maximum bytes of overlap with level+1
      63             : // before we stop building a single file in a level-1 to level compaction.
      64           2 : func maxGrandparentOverlapBytes(opts *Options, level int) uint64 {
      65           2 :         return uint64(10 * opts.Level(level).TargetFileSize)
      66           2 : }
      67             : 
      68             : // maxReadCompactionBytes is used to prevent read compactions which
      69             : // are too wide.
      70           2 : func maxReadCompactionBytes(opts *Options, level int) uint64 {
      71           2 :         return uint64(10 * opts.Level(level).TargetFileSize)
      72           2 : }
      73             : 
      74             : // noCloseIter wraps around a FragmentIterator, intercepting and eliding
      75             : // calls to Close. It is used during compaction to ensure that rangeDelIters
      76             : // are not closed prematurely.
      77             : type noCloseIter struct {
      78             :         keyspan.FragmentIterator
      79             : }
      80             : 
      81           2 : func (i *noCloseIter) Close() {}
      82             : 
      83             : type compactionLevel struct {
      84             :         level int
      85             :         files manifest.LevelSlice
      86             :         // l0SublevelInfo contains information about L0 sublevels being compacted.
      87             :         // It's only set for the start level of a compaction starting out of L0 and
      88             :         // is nil for all other compactions.
      89             :         l0SublevelInfo []sublevelInfo
      90             : }
      91             : 
      92           2 : func (cl compactionLevel) Clone() compactionLevel {
      93           2 :         newCL := compactionLevel{
      94           2 :                 level: cl.level,
      95           2 :                 files: cl.files,
      96           2 :         }
      97           2 :         return newCL
      98           2 : }
      99           1 : func (cl compactionLevel) String() string {
     100           1 :         return fmt.Sprintf(`Level %d, Files %s`, cl.level, cl.files)
     101           1 : }
     102             : 
     103             : // compactionWritable is a objstorage.Writable wrapper that, on every write,
     104             : // updates a metric in `versions` on bytes written by in-progress compactions so
     105             : // far. It also increments a per-compaction `written` int.
     106             : type compactionWritable struct {
     107             :         objstorage.Writable
     108             : 
     109             :         versions *versionSet
     110             :         written  *int64
     111             : }
     112             : 
     113             : // Write is part of the objstorage.Writable interface.
     114           2 : func (c *compactionWritable) Write(p []byte) error {
     115           2 :         if err := c.Writable.Write(p); err != nil {
     116           0 :                 return err
     117           0 :         }
     118             : 
     119           2 :         *c.written += int64(len(p))
     120           2 :         c.versions.incrementCompactionBytes(int64(len(p)))
     121           2 :         return nil
     122             : }
     123             : 
     124             : type compactionKind int
     125             : 
     126             : const (
     127             :         compactionKindDefault compactionKind = iota
     128             :         compactionKindFlush
     129             :         // compactionKindMove denotes a move compaction where the input file is
     130             :         // retained and linked in a new level without being obsoleted.
     131             :         compactionKindMove
     132             :         // compactionKindCopy denotes a copy compaction where the input file is
     133             :         // copied byte-by-byte into a new file with a new FileNum in the output level.
     134             :         compactionKindCopy
     135             :         // compactionKindDeleteOnly denotes a compaction that only deletes input
     136             :         // files. It can occur when wide range tombstones completely contain sstables.
     137             :         compactionKindDeleteOnly
     138             :         compactionKindElisionOnly
     139             :         compactionKindRead
     140             :         compactionKindTombstoneDensity
     141             :         compactionKindRewrite
     142             :         compactionKindIngestedFlushable
     143             : )
     144             : 
     145           2 : func (k compactionKind) String() string {
     146           2 :         switch k {
     147           2 :         case compactionKindDefault:
     148           2 :                 return "default"
     149           0 :         case compactionKindFlush:
     150           0 :                 return "flush"
     151           2 :         case compactionKindMove:
     152           2 :                 return "move"
     153           2 :         case compactionKindDeleteOnly:
     154           2 :                 return "delete-only"
     155           2 :         case compactionKindElisionOnly:
     156           2 :                 return "elision-only"
     157           1 :         case compactionKindRead:
     158           1 :                 return "read"
     159           1 :         case compactionKindTombstoneDensity:
     160           1 :                 return "tombstone-density"
     161           2 :         case compactionKindRewrite:
     162           2 :                 return "rewrite"
     163           0 :         case compactionKindIngestedFlushable:
     164           0 :                 return "ingested-flushable"
     165           2 :         case compactionKindCopy:
     166           2 :                 return "copy"
     167             :         }
     168           0 :         return "?"
     169             : }
     170             : 
     171             : // compaction is a table compaction from one level to the next, starting from a
     172             : // given version.
     173             : type compaction struct {
     174             :         // cancel is a bool that can be used by other goroutines to signal a compaction
     175             :         // to cancel, such as if a conflicting excise operation raced it to manifest
     176             :         // application. Only holders of the manifest lock will write to this atomic.
     177             :         cancel atomic.Bool
     178             : 
     179             :         kind compactionKind
     180             :         // isDownload is true if this compaction was started as part of a Download
     181             :         // operation. In this case kind is compactionKindCopy or
     182             :         // compactionKindRewrite.
     183             :         isDownload bool
     184             : 
     185             :         cmp       Compare
     186             :         equal     Equal
     187             :         comparer  *base.Comparer
     188             :         formatKey base.FormatKey
     189             :         logger    Logger
     190             :         version   *version
     191             :         stats     base.InternalIteratorStats
     192             :         beganAt   time.Time
     193             :         // versionEditApplied is set to true when a compaction has completed and the
     194             :         // resulting version has been installed (if successful), but the compaction
     195             :         // goroutine is still cleaning up (eg, deleting obsolete files).
     196             :         versionEditApplied bool
     197             :         bufferPool         sstable.BufferPool
     198             : 
     199             :         // startLevel is the level that is being compacted. Inputs from startLevel
     200             :         // and outputLevel will be merged to produce a set of outputLevel files.
     201             :         startLevel *compactionLevel
     202             : 
     203             :         // outputLevel is the level that files are being produced in. outputLevel is
     204             :         // equal to startLevel+1 except when:
     205             :         //    - if startLevel is 0, the output level equals compactionPicker.baseLevel().
     206             :         //    - in multilevel compaction, the output level is the lowest level involved in
     207             :         //      the compaction
     208             :         // A compaction's outputLevel is nil for delete-only compactions.
     209             :         outputLevel *compactionLevel
     210             : 
     211             :         // extraLevels point to additional levels in between the input and output
     212             :         // levels that get compacted in multilevel compactions
     213             :         extraLevels []*compactionLevel
     214             : 
     215             :         inputs []compactionLevel
     216             : 
     217             :         // maxOutputFileSize is the maximum size of an individual table created
     218             :         // during compaction.
     219             :         maxOutputFileSize uint64
     220             :         // maxOverlapBytes is the maximum number of bytes of overlap allowed for a
     221             :         // single output table with the tables in the grandparent level.
     222             :         maxOverlapBytes uint64
     223             : 
     224             :         // flushing contains the flushables (aka memtables) that are being flushed.
     225             :         flushing flushableList
     226             :         // bytesWritten contains the number of bytes that have been written to outputs.
     227             :         bytesWritten int64
     228             : 
     229             :         // The boundaries of the input data.
     230             :         smallest InternalKey
     231             :         largest  InternalKey
     232             : 
     233             :         // A list of fragment iterators to close when the compaction finishes. Used by
     234             :         // input iteration to keep rangeDelIters open for the lifetime of the
     235             :         // compaction, and only close them when the compaction finishes.
     236             :         closers []*noCloseIter
     237             : 
     238             :         // grandparents are the tables in level+2 that overlap with the files being
     239             :         // compacted. Used to determine output table boundaries. Do not assume that the actual files
     240             :         // in the grandparent when this compaction finishes will be the same.
     241             :         grandparents manifest.LevelSlice
     242             : 
     243             :         // Boundaries at which flushes to L0 should be split. Determined by
     244             :         // L0Sublevels. If nil, flushes aren't split.
     245             :         l0Limits [][]byte
     246             : 
     247             :         delElision      compact.TombstoneElision
     248             :         rangeKeyElision compact.TombstoneElision
     249             : 
     250             :         // allowedZeroSeqNum is true if seqnums can be zeroed if there are no
     251             :         // snapshots requiring them to be kept. This determination is made by
     252             :         // looking for an sstable which overlaps the bounds of the compaction at a
     253             :         // lower level in the LSM during runCompaction.
     254             :         allowedZeroSeqNum bool
     255             : 
     256             :         // deletionHints are set if this is a compactionKindDeleteOnly. Used to figure
     257             :         // out whether an input must be deleted in its entirety, or excised into
     258             :         // virtual sstables.
     259             :         deletionHints []deleteCompactionHint
     260             : 
     261             :         // exciseEnabled is set to true if this is a compactionKindDeleteOnly and
     262             :         // this compaction is allowed to excise files.
     263             :         exciseEnabled bool
     264             : 
     265             :         metrics map[int]*LevelMetrics
     266             : 
     267             :         pickerMetrics compactionPickerMetrics
     268             : 
     269             :         slot base.CompactionSlot
     270             : }
     271             : 
     272             : // inputLargestSeqNumAbsolute returns the maximum LargestSeqNumAbsolute of any
     273             : // input sstables.
     274           2 : func (c *compaction) inputLargestSeqNumAbsolute() base.SeqNum {
     275           2 :         var seqNum base.SeqNum
     276           2 :         for _, cl := range c.inputs {
     277           2 :                 cl.files.Each(func(m *manifest.FileMetadata) {
     278           2 :                         seqNum = max(seqNum, m.LargestSeqNumAbsolute)
     279           2 :                 })
     280             :         }
     281           2 :         return seqNum
     282             : }
     283             : 
     284           2 : func (c *compaction) makeInfo(jobID JobID) CompactionInfo {
     285           2 :         info := CompactionInfo{
     286           2 :                 JobID:       int(jobID),
     287           2 :                 Reason:      c.kind.String(),
     288           2 :                 Input:       make([]LevelInfo, 0, len(c.inputs)),
     289           2 :                 Annotations: []string{},
     290           2 :         }
     291           2 :         if c.isDownload {
     292           2 :                 info.Reason = "download," + info.Reason
     293           2 :         }
     294           2 :         for _, cl := range c.inputs {
     295           2 :                 inputInfo := LevelInfo{Level: cl.level, Tables: nil}
     296           2 :                 iter := cl.files.Iter()
     297           2 :                 for m := iter.First(); m != nil; m = iter.Next() {
     298           2 :                         inputInfo.Tables = append(inputInfo.Tables, m.TableInfo())
     299           2 :                 }
     300           2 :                 info.Input = append(info.Input, inputInfo)
     301             :         }
     302           2 :         if c.outputLevel != nil {
     303           2 :                 info.Output.Level = c.outputLevel.level
     304           2 : 
     305           2 :                 // If there are no inputs from the output level (eg, a move
     306           2 :                 // compaction), add an empty LevelInfo to info.Input.
     307           2 :                 if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level {
     308           0 :                         info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level})
     309           0 :                 }
     310           2 :         } else {
     311           2 :                 // For a delete-only compaction, set the output level to L6. The
     312           2 :                 // output level is not meaningful here, but complicating the
     313           2 :                 // info.Output interface with a pointer doesn't seem worth the
     314           2 :                 // semantic distinction.
     315           2 :                 info.Output.Level = numLevels - 1
     316           2 :         }
     317             : 
     318           2 :         for i, score := range c.pickerMetrics.scores {
     319           2 :                 info.Input[i].Score = score
     320           2 :         }
     321           2 :         info.SingleLevelOverlappingRatio = c.pickerMetrics.singleLevelOverlappingRatio
     322           2 :         info.MultiLevelOverlappingRatio = c.pickerMetrics.multiLevelOverlappingRatio
     323           2 :         if len(info.Input) > 2 {
     324           2 :                 info.Annotations = append(info.Annotations, "multilevel")
     325           2 :         }
     326           2 :         return info
     327             : }
     328             : 
     329           2 : func (c *compaction) userKeyBounds() base.UserKeyBounds {
     330           2 :         return base.UserKeyBoundsFromInternal(c.smallest, c.largest)
     331           2 : }
     332             : 
     333             : func newCompaction(
     334             :         pc *pickedCompaction, opts *Options, beganAt time.Time, provider objstorage.Provider,
     335           2 : ) *compaction {
     336           2 :         c := &compaction{
     337           2 :                 kind:              compactionKindDefault,
     338           2 :                 cmp:               pc.cmp,
     339           2 :                 equal:             opts.Comparer.Equal,
     340           2 :                 comparer:          opts.Comparer,
     341           2 :                 formatKey:         opts.Comparer.FormatKey,
     342           2 :                 inputs:            pc.inputs,
     343           2 :                 smallest:          pc.smallest,
     344           2 :                 largest:           pc.largest,
     345           2 :                 logger:            opts.Logger,
     346           2 :                 version:           pc.version,
     347           2 :                 beganAt:           beganAt,
     348           2 :                 maxOutputFileSize: pc.maxOutputFileSize,
     349           2 :                 maxOverlapBytes:   pc.maxOverlapBytes,
     350           2 :                 pickerMetrics:     pc.pickerMetrics,
     351           2 :                 slot:              pc.slot,
     352           2 :         }
     353           2 :         c.startLevel = &c.inputs[0]
     354           2 :         if pc.startLevel.l0SublevelInfo != nil {
     355           2 :                 c.startLevel.l0SublevelInfo = pc.startLevel.l0SublevelInfo
     356           2 :         }
     357           2 :         c.outputLevel = &c.inputs[1]
     358           2 :         if c.slot == nil {
     359           2 :                 c.slot = opts.Experimental.CompactionLimiter.TookWithoutPermission(context.TODO())
     360           2 :                 c.slot.CompactionSelected(c.startLevel.level, c.outputLevel.level, c.startLevel.files.SizeSum())
     361           2 :         }
     362             : 
     363           2 :         if len(pc.extraLevels) > 0 {
     364           2 :                 c.extraLevels = pc.extraLevels
     365           2 :                 c.outputLevel = &c.inputs[len(c.inputs)-1]
     366           2 :         }
     367             :         // Compute the set of outputLevel+1 files that overlap this compaction (these
     368             :         // are the grandparent sstables).
     369           2 :         if c.outputLevel.level+1 < numLevels {
     370           2 :                 c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.userKeyBounds())
     371           2 :         }
     372           2 :         c.delElision, c.rangeKeyElision = compact.SetupTombstoneElision(
     373           2 :                 c.cmp, c.version, c.outputLevel.level, base.UserKeyBoundsFromInternal(c.smallest, c.largest),
     374           2 :         )
     375           2 :         c.kind = pc.kind
     376           2 : 
     377           2 :         if c.kind == compactionKindDefault && c.outputLevel.files.Empty() && !c.hasExtraLevelData() &&
     378           2 :                 c.startLevel.files.Len() == 1 && c.grandparents.SizeSum() <= c.maxOverlapBytes {
     379           2 :                 // This compaction can be converted into a move or copy from one level
     380           2 :                 // to the next. We avoid such a move if there is lots of overlapping
     381           2 :                 // grandparent data. Otherwise, the move could create a parent file
     382           2 :                 // that will require a very expensive merge later on.
     383           2 :                 iter := c.startLevel.files.Iter()
     384           2 :                 meta := iter.First()
     385           2 :                 isRemote := false
     386           2 :                 // We should always be passed a provider, except in some unit tests.
     387           2 :                 if provider != nil {
     388           2 :                         isRemote = !objstorage.IsLocalTable(provider, meta.FileBacking.DiskFileNum)
     389           2 :                 }
     390             :                 // Avoid a trivial move or copy if all of these are true, as rewriting a
     391             :                 // new file is better:
     392             :                 //
     393             :                 // 1) The source file is a virtual sstable
     394             :                 // 2) The existing file `meta` is on non-remote storage
     395             :                 // 3) The output level prefers shared storage
     396           2 :                 mustCopy := !isRemote && remote.ShouldCreateShared(opts.Experimental.CreateOnShared, c.outputLevel.level)
     397           2 :                 if mustCopy {
     398           2 :                         // If the source is virtual, it's best to just rewrite the file as all
     399           2 :                         // conditions in the above comment are met.
     400           2 :                         if !meta.Virtual {
     401           2 :                                 c.kind = compactionKindCopy
     402           2 :                         }
     403           2 :                 } else {
     404           2 :                         c.kind = compactionKindMove
     405           2 :                 }
     406             :         }
     407           2 :         return c
     408             : }
     409             : 
     410             : func newDeleteOnlyCompaction(
     411             :         opts *Options,
     412             :         cur *version,
     413             :         inputs []compactionLevel,
     414             :         beganAt time.Time,
     415             :         hints []deleteCompactionHint,
     416             :         exciseEnabled bool,
     417           2 : ) *compaction {
     418           2 :         c := &compaction{
     419           2 :                 kind:          compactionKindDeleteOnly,
     420           2 :                 cmp:           opts.Comparer.Compare,
     421           2 :                 equal:         opts.Comparer.Equal,
     422           2 :                 comparer:      opts.Comparer,
     423           2 :                 formatKey:     opts.Comparer.FormatKey,
     424           2 :                 logger:        opts.Logger,
     425           2 :                 version:       cur,
     426           2 :                 beganAt:       beganAt,
     427           2 :                 inputs:        inputs,
     428           2 :                 deletionHints: hints,
     429           2 :                 exciseEnabled: exciseEnabled,
     430           2 :         }
     431           2 : 
     432           2 :         // Set c.smallest, c.largest.
     433           2 :         files := make([]manifest.LevelIterator, 0, len(inputs))
     434           2 :         for _, in := range inputs {
     435           2 :                 files = append(files, in.files.Iter())
     436           2 :         }
     437           2 :         c.smallest, c.largest = manifest.KeyRange(opts.Comparer.Compare, files...)
     438           2 :         return c
     439             : }
     440             : 
     441           2 : func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64) {
     442           2 :         // Heuristic to place a lower bound on compaction output file size
     443           2 :         // caused by Lbase. Prior to this heuristic we have observed an L0 in
     444           2 :         // production with 310K files of which 290K files were < 10KB in size.
     445           2 :         // Our hypothesis is that it was caused by L1 having 2600 files and
     446           2 :         // ~10GB, such that each flush got split into many tiny files due to
     447           2 :         // overlapping with most of the files in Lbase.
     448           2 :         //
     449           2 :         // The computation below is general in that it accounts
     450           2 :         // for flushing different volumes of data (e.g. we may be flushing
     451           2 :         // many memtables). For illustration, we consider the typical
     452           2 :         // example of flushing a 64MB memtable. So 12.8MB output,
     453           2 :         // based on the compression guess below. If the compressed bytes
     454           2 :         // guess is an over-estimate we will end up with smaller files,
     455           2 :         // and if an under-estimate we will end up with larger files.
     456           2 :         // With a 2MB target file size, 7 files. We are willing to accept
     457           2 :         // 4x the number of files, if it results in better write amplification
     458           2 :         // when later compacting to Lbase, i.e., ~450KB files (target file
     459           2 :         // size / 4).
     460           2 :         //
     461           2 :         // Note that this is a pessimistic heuristic in that
     462           2 :         // fileCountUpperBoundDueToGrandparents could be far from the actual
     463           2 :         // number of files produced due to the grandparent limits. For
     464           2 :         // example, in the extreme, consider a flush that overlaps with 1000
     465           2 :         // files in Lbase f0...f999, and the initially calculated value of
     466           2 :         // maxOverlapBytes will cause splits at f10, f20,..., f990, which
     467           2 :         // means an upper bound file count of 100 files. Say the input bytes
     468           2 :         // in the flush are such that acceptableFileCount=10. We will fatten
     469           2 :         // up maxOverlapBytes by 10x to ensure that the upper bound file count
     470           2 :         // drops to 10. However, it is possible that in practice, even without
     471           2 :         // this change, we would have produced no more than 10 files, and that
     472           2 :         // this change makes the files unnecessarily wide. Say the input bytes
     473           2 :         // are distributed such that 10% are in f0...f9, 10% in f10...f19, ...
     474           2 :         // 10% in f80...f89 and 10% in f990...f999. The original value of
     475           2 :         // maxOverlapBytes would have actually produced only 10 sstables. But
     476           2 :         // by increasing maxOverlapBytes by 10x, we may produce 1 sstable that
     477           2 :         // spans f0...f89, i.e., a much wider sstable than necessary.
     478           2 :         //
     479           2 :         // We could produce a tighter estimate of
     480           2 :         // fileCountUpperBoundDueToGrandparents if we had knowledge of the key
     481           2 :         // distribution of the flush. The 4x multiplier mentioned earlier is
     482           2 :         // a way to try to compensate for this pessimism.
     483           2 :         //
     484           2 :         // TODO(sumeer): we don't have compression info for the data being
     485           2 :         // flushed, but it is likely that existing files that overlap with
     486           2 :         // this flush in Lbase are representative wrt compression ratio. We
     487           2 :         // could store the uncompressed size in FileMetadata and estimate
     488           2 :         // the compression ratio.
     489           2 :         const approxCompressionRatio = 0.2
     490           2 :         approxOutputBytes := approxCompressionRatio * float64(flushingBytes)
     491           2 :         approxNumFilesBasedOnTargetSize :=
     492           2 :                 int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize)))
     493           2 :         acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize)
     494           2 :         // The byte calculation is linear in numGrandparentFiles, but we will
     495           2 :         // incur this linear cost in compact.Runner.TableSplitLimit() too, so we are
     496           2 :         // also willing to pay it now. We could approximate this cheaply by using the
     497           2 :         // mean file size of Lbase.
     498           2 :         grandparentFileBytes := c.grandparents.SizeSum()
     499           2 :         fileCountUpperBoundDueToGrandparents :=
     500           2 :                 float64(grandparentFileBytes) / float64(c.maxOverlapBytes)
     501           2 :         if fileCountUpperBoundDueToGrandparents > acceptableFileCount {
     502           2 :                 c.maxOverlapBytes = uint64(
     503           2 :                         float64(c.maxOverlapBytes) *
     504           2 :                                 (fileCountUpperBoundDueToGrandparents / acceptableFileCount))
     505           2 :         }
     506             : }
     507             : 
     508             : func newFlush(
     509             :         opts *Options, cur *version, baseLevel int, flushing flushableList, beganAt time.Time,
     510           2 : ) (*compaction, error) {
     511           2 :         c := &compaction{
     512           2 :                 kind:              compactionKindFlush,
     513           2 :                 cmp:               opts.Comparer.Compare,
     514           2 :                 equal:             opts.Comparer.Equal,
     515           2 :                 comparer:          opts.Comparer,
     516           2 :                 formatKey:         opts.Comparer.FormatKey,
     517           2 :                 logger:            opts.Logger,
     518           2 :                 version:           cur,
     519           2 :                 beganAt:           beganAt,
     520           2 :                 inputs:            []compactionLevel{{level: -1}, {level: 0}},
     521           2 :                 maxOutputFileSize: math.MaxUint64,
     522           2 :                 maxOverlapBytes:   math.MaxUint64,
     523           2 :                 flushing:          flushing,
     524           2 :         }
     525           2 :         c.startLevel = &c.inputs[0]
     526           2 :         c.outputLevel = &c.inputs[1]
     527           2 : 
     528           2 :         // Flush slots are always taken without permission.
     529           2 :         //
     530           2 :         // NB: CompactionLimiter defaults to a no-op limiter unless one is implemented
     531           2 :         // and passed-in as an option during Open.
     532           2 :         slot := opts.Experimental.CompactionLimiter.TookWithoutPermission(context.TODO())
     533           2 :         var flushingSize uint64
     534           2 :         for i := range flushing {
     535           2 :                 flushingSize += flushing[i].totalBytes()
     536           2 :         }
     537           2 :         slot.CompactionSelected(-1, 0, flushingSize)
     538           2 :         c.slot = slot
     539           2 : 
     540           2 :         if len(flushing) > 0 {
     541           2 :                 if _, ok := flushing[0].flushable.(*ingestedFlushable); ok {
     542           2 :                         if len(flushing) != 1 {
     543           0 :                                 panic("pebble: ingestedFlushable must be flushed one at a time.")
     544             :                         }
     545           2 :                         c.kind = compactionKindIngestedFlushable
     546           2 :                         return c, nil
     547             :                 }
     548             :         }
     549             : 
     550             :         // Make sure there's no ingestedFlushable after the first flushable in the
     551             :         // list.
     552           2 :         for _, f := range flushing {
     553           2 :                 if _, ok := f.flushable.(*ingestedFlushable); ok {
     554           0 :                         panic("pebble: flushing shouldn't contain ingestedFlushable flushable")
     555             :                 }
     556             :         }
     557             : 
     558           2 :         if cur.L0Sublevels != nil {
     559           2 :                 c.l0Limits = cur.L0Sublevels.FlushSplitKeys()
     560           2 :         }
     561             : 
     562           2 :         smallestSet, largestSet := false, false
     563           2 :         updatePointBounds := func(iter internalIterator) {
     564           2 :                 if kv := iter.First(); kv != nil {
     565           2 :                         if !smallestSet ||
     566           2 :                                 base.InternalCompare(c.cmp, c.smallest, kv.K) > 0 {
     567           2 :                                 smallestSet = true
     568           2 :                                 c.smallest = kv.K.Clone()
     569           2 :                         }
     570             :                 }
     571           2 :                 if kv := iter.Last(); kv != nil {
     572           2 :                         if !largestSet ||
     573           2 :                                 base.InternalCompare(c.cmp, c.largest, kv.K) < 0 {
     574           2 :                                 largestSet = true
     575           2 :                                 c.largest = kv.K.Clone()
     576           2 :                         }
     577             :                 }
     578             :         }
     579             : 
     580           2 :         updateRangeBounds := func(iter keyspan.FragmentIterator) error {
     581           2 :                 // File bounds require s != nil && !s.Empty(). We only need to check for
     582           2 :                 // s != nil here, as the memtable's FragmentIterator would never surface
     583           2 :                 // empty spans.
     584           2 :                 if s, err := iter.First(); err != nil {
     585           0 :                         return err
     586           2 :                 } else if s != nil {
     587           2 :                         if key := s.SmallestKey(); !smallestSet ||
     588           2 :                                 base.InternalCompare(c.cmp, c.smallest, key) > 0 {
     589           2 :                                 smallestSet = true
     590           2 :                                 c.smallest = key.Clone()
     591           2 :                         }
     592             :                 }
     593           2 :                 if s, err := iter.Last(); err != nil {
     594           0 :                         return err
     595           2 :                 } else if s != nil {
     596           2 :                         if key := s.LargestKey(); !largestSet ||
     597           2 :                                 base.InternalCompare(c.cmp, c.largest, key) < 0 {
     598           2 :                                 largestSet = true
     599           2 :                                 c.largest = key.Clone()
     600           2 :                         }
     601             :                 }
     602           2 :                 return nil
     603             :         }
     604             : 
     605           2 :         var flushingBytes uint64
     606           2 :         for i := range flushing {
     607           2 :                 f := flushing[i]
     608           2 :                 updatePointBounds(f.newIter(nil))
     609           2 :                 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil {
     610           2 :                         if err := updateRangeBounds(rangeDelIter); err != nil {
     611           0 :                                 return nil, err
     612           0 :                         }
     613             :                 }
     614           2 :                 if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
     615           2 :                         if err := updateRangeBounds(rangeKeyIter); err != nil {
     616           0 :                                 return nil, err
     617           0 :                         }
     618             :                 }
     619           2 :                 flushingBytes += f.inuseBytes()
     620             :         }
     621             : 
     622           2 :         if opts.FlushSplitBytes > 0 {
     623           2 :                 c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize)
     624           2 :                 c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0)
     625           2 :                 c.grandparents = c.version.Overlaps(baseLevel, c.userKeyBounds())
     626           2 :                 adjustGrandparentOverlapBytesForFlush(c, flushingBytes)
     627           2 :         }
     628             : 
     629             :         // We don't elide tombstones for flushes.
     630           2 :         c.delElision, c.rangeKeyElision = compact.NoTombstoneElision(), compact.NoTombstoneElision()
     631           2 :         return c, nil
     632             : }
     633             : 
     634           2 : func (c *compaction) hasExtraLevelData() bool {
     635           2 :         if len(c.extraLevels) == 0 {
     636           2 :                 // not a multi level compaction
     637           2 :                 return false
     638           2 :         } else if c.extraLevels[0].files.Empty() {
     639           2 :                 // a multi level compaction without data in the intermediate input level;
     640           2 :                 // e.g. for a multi level compaction with levels 4,5, and 6, this could
     641           2 :                 // occur if there are no files to compact in 5, or in 5 and 6 (i.e. a move).
     642           2 :                 return false
     643           2 :         }
     644           2 :         return true
     645             : }
     646             : 
     647             : // errorOnUserKeyOverlap returns an error if the last two written sstables in
     648             : // this compaction have revisions of the same user key present in both sstables,
     649             : // when it shouldn't (eg. when splitting flushes).
     650           1 : func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error {
     651           1 :         if n := len(ve.NewFiles); n > 1 {
     652           1 :                 meta := ve.NewFiles[n-1].Meta
     653           1 :                 prevMeta := ve.NewFiles[n-2].Meta
     654           1 :                 if !prevMeta.Largest.IsExclusiveSentinel() &&
     655           1 :                         c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 {
     656           1 :                         return errors.Errorf("pebble: compaction split user key across two sstables: %s in %s and %s",
     657           1 :                                 prevMeta.Largest.Pretty(c.formatKey),
     658           1 :                                 prevMeta.FileNum,
     659           1 :                                 meta.FileNum)
     660           1 :                 }
     661             :         }
     662           1 :         return nil
     663             : }
     664             : 
     665             : // allowZeroSeqNum returns true if seqnums can be zeroed if there are no
     666             : // snapshots requiring them to be kept. It performs this determination by
     667             : // looking at the TombstoneElision values which are set up based on sstables
     668             : // which overlap the bounds of the compaction at a lower level in the LSM.
     669           2 : func (c *compaction) allowZeroSeqNum() bool {
     670           2 :         // TODO(peter): we disable zeroing of seqnums during flushing to match
     671           2 :         // RocksDB behavior and to avoid generating overlapping sstables during
     672           2 :         // DB.replayWAL. When replaying WAL files at startup, we flush after each
     673           2 :         // WAL is replayed building up a single version edit that is
     674           2 :         // applied. Because we don't apply the version edit after each flush, this
     675           2 :         // code doesn't know that L0 contains files and zeroing of seqnums should
     676           2 :         // be disabled. That is fixable, but it seems safer to just match the
     677           2 :         // RocksDB behavior for now.
     678           2 :         return len(c.flushing) == 0 && c.delElision.ElidesEverything() && c.rangeKeyElision.ElidesEverything()
     679           2 : }
     680             : 
     681             : // newInputIters returns an iterator over all the input tables in a compaction.
     682             : func (c *compaction) newInputIters(
     683             :         newIters tableNewIters, newRangeKeyIter keyspanimpl.TableNewSpanIter,
     684             : ) (
     685             :         pointIter internalIterator,
     686             :         rangeDelIter, rangeKeyIter keyspan.FragmentIterator,
     687             :         retErr error,
     688           2 : ) {
     689           2 :         // Validate the ordering of compaction input files for defense in depth.
     690           2 :         if len(c.flushing) == 0 {
     691           2 :                 if c.startLevel.level >= 0 {
     692           2 :                         err := manifest.CheckOrdering(c.cmp, c.formatKey,
     693           2 :                                 manifest.Level(c.startLevel.level), c.startLevel.files.Iter())
     694           2 :                         if err != nil {
     695           1 :                                 return nil, nil, nil, err
     696           1 :                         }
     697             :                 }
     698           2 :                 err := manifest.CheckOrdering(c.cmp, c.formatKey,
     699           2 :                         manifest.Level(c.outputLevel.level), c.outputLevel.files.Iter())
     700           2 :                 if err != nil {
     701           1 :                         return nil, nil, nil, err
     702           1 :                 }
     703           2 :                 if c.startLevel.level == 0 {
     704           2 :                         if c.startLevel.l0SublevelInfo == nil {
     705           0 :                                 panic("l0SublevelInfo not created for compaction out of L0")
     706             :                         }
     707           2 :                         for _, info := range c.startLevel.l0SublevelInfo {
     708           2 :                                 err := manifest.CheckOrdering(c.cmp, c.formatKey,
     709           2 :                                         info.sublevel, info.Iter())
     710           2 :                                 if err != nil {
     711           1 :                                         return nil, nil, nil, err
     712           1 :                                 }
     713             :                         }
     714             :                 }
     715           2 :                 if len(c.extraLevels) > 0 {
     716           2 :                         if len(c.extraLevels) > 1 {
     717           0 :                                 panic("n>2 multi level compaction not implemented yet")
     718             :                         }
     719           2 :                         interLevel := c.extraLevels[0]
     720           2 :                         err := manifest.CheckOrdering(c.cmp, c.formatKey,
     721           2 :                                 manifest.Level(interLevel.level), interLevel.files.Iter())
     722           2 :                         if err != nil {
     723           0 :                                 return nil, nil, nil, err
     724           0 :                         }
     725             :                 }
     726             :         }
     727             : 
     728             :         // There are three classes of keys that a compaction needs to process: point
     729             :         // keys, range deletion tombstones and range keys. Collect all iterators for
     730             :         // all these classes of keys from all the levels. We'll aggregate them
     731             :         // together farther below.
     732             :         //
     733             :         // numInputLevels is an approximation of the number of iterator levels. Due
     734             :         // to idiosyncrasies in iterator construction, we may (rarely) exceed this
     735             :         // initial capacity.
     736           2 :         numInputLevels := max(len(c.flushing), len(c.inputs))
     737           2 :         iters := make([]internalIterator, 0, numInputLevels)
     738           2 :         rangeDelIters := make([]keyspan.FragmentIterator, 0, numInputLevels)
     739           2 :         rangeKeyIters := make([]keyspan.FragmentIterator, 0, numInputLevels)
     740           2 : 
     741           2 :         // If construction of the iterator inputs fails, ensure that we close all
     742           2 :         // the constituent iterators.
     743           2 :         defer func() {
     744           2 :                 if retErr != nil {
     745           0 :                         for _, iter := range iters {
     746           0 :                                 if iter != nil {
     747           0 :                                         iter.Close()
     748           0 :                                 }
     749             :                         }
     750           0 :                         for _, rangeDelIter := range rangeDelIters {
     751           0 :                                 rangeDelIter.Close()
     752           0 :                         }
     753             :                 }
     754             :         }()
     755           2 :         iterOpts := IterOptions{
     756           2 :                 Category: categoryCompaction,
     757           2 :                 logger:   c.logger,
     758           2 :         }
     759           2 : 
     760           2 :         // Populate iters, rangeDelIters and rangeKeyIters with the appropriate
     761           2 :         // constituent iterators. This depends on whether this is a flush or a
     762           2 :         // compaction.
     763           2 :         if len(c.flushing) != 0 {
     764           2 :                 // If flushing, we need to build the input iterators over the memtables
     765           2 :                 // stored in c.flushing.
     766           2 :                 for i := range c.flushing {
     767           2 :                         f := c.flushing[i]
     768           2 :                         iters = append(iters, f.newFlushIter(nil))
     769           2 :                         rangeDelIter := f.newRangeDelIter(nil)
     770           2 :                         if rangeDelIter != nil {
     771           2 :                                 rangeDelIters = append(rangeDelIters, rangeDelIter)
     772           2 :                         }
     773           2 :                         if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
     774           2 :                                 rangeKeyIters = append(rangeKeyIters, rangeKeyIter)
     775           2 :                         }
     776             :                 }
     777           2 :         } else {
     778           2 :                 addItersForLevel := func(level *compactionLevel, l manifest.Layer) error {
     779           2 :                         // Add a *levelIter for point iterators. Because we don't call
     780           2 :                         // initRangeDel, the levelIter will close and forget the range
     781           2 :                         // deletion iterator when it steps on to a new file. Surfacing range
     782           2 :                         // deletions to compactions is handled below.
     783           2 :                         iters = append(iters, newLevelIter(context.Background(),
     784           2 :                                 iterOpts, c.comparer, newIters, level.files.Iter(), l, internalIterOpts{
     785           2 :                                         compaction: true,
     786           2 :                                         bufferPool: &c.bufferPool,
     787           2 :                                         stats:      &c.stats,
     788           2 :                                 }))
     789           2 :                         // TODO(jackson): Use keyspanimpl.LevelIter to avoid loading all the range
     790           2 :                         // deletions into memory upfront. (See #2015, which reverted this.) There
     791           2 :                         // will be no user keys that are split between sstables within a level in
     792           2 :                         // Cockroach 23.1, which unblocks this optimization.
     793           2 : 
     794           2 :                         // Add the range deletion iterator for each file as an independent level
     795           2 :                         // in mergingIter, as opposed to making a levelIter out of those. This
     796           2 :                         // is safer as levelIter expects all keys coming from underlying
     797           2 :                         // iterators to be in order. Due to compaction / tombstone writing
     798           2 :                         // logic in finishOutput(), it is possible for range tombstones to not
     799           2 :                         // be strictly ordered across all files in one level.
     800           2 :                         //
     801           2 :                         // Consider this example from the metamorphic tests (also repeated in
     802           2 :                         // finishOutput()), consisting of three L3 files with their bounds
     803           2 :                         // specified in square brackets next to the file name:
     804           2 :                         //
     805           2 :                         // ./000240.sst   [tmgc#391,MERGE-tmgc#391,MERGE]
     806           2 :                         // tmgc#391,MERGE [786e627a]
     807           2 :                         // tmgc-udkatvs#331,RANGEDEL
     808           2 :                         //
     809           2 :                         // ./000241.sst   [tmgc#384,MERGE-tmgc#384,MERGE]
     810           2 :                         // tmgc#384,MERGE [666c7070]
     811           2 :                         // tmgc-tvsalezade#383,RANGEDEL
     812           2 :                         // tmgc-tvsalezade#331,RANGEDEL
     813           2 :                         //
     814           2 :                         // ./000242.sst   [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL]
     815           2 :                         // tmgc-tvsalezade#383,RANGEDEL
     816           2 :                         // tmgc#375,SET [72646c78766965616c72776865676e79]
     817           2 :                         // tmgc-tvsalezade#356,RANGEDEL
     818           2 :                         //
     819           2 :                         // Here, the range tombstone in 000240.sst falls "after" one in
     820           2 :                         // 000241.sst, despite 000240.sst being ordered "before" 000241.sst for
     821           2 :                         // levelIter's purposes. While each file is still consistent before its
     822           2 :                         // bounds, it's safer to have all rangedel iterators be visible to
     823           2 :                         // mergingIter.
     824           2 :                         iter := level.files.Iter()
     825           2 :                         for f := iter.First(); f != nil; f = iter.Next() {
     826           2 :                                 rangeDelIter, err := c.newRangeDelIter(newIters, iter.Take(), iterOpts, l)
     827           2 :                                 if err != nil {
     828           0 :                                         // The error will already be annotated with the BackingFileNum, so
     829           0 :                                         // we annotate it with the FileNum.
     830           0 :                                         return errors.Wrapf(err, "pebble: could not open table %s", errors.Safe(f.FileNum))
     831           0 :                                 }
     832           2 :                                 if rangeDelIter == nil {
     833           2 :                                         continue
     834             :                                 }
     835           2 :                                 rangeDelIters = append(rangeDelIters, rangeDelIter)
     836           2 :                                 c.closers = append(c.closers, rangeDelIter)
     837             :                         }
     838             : 
     839             :                         // Check if this level has any range keys.
     840           2 :                         hasRangeKeys := false
     841           2 :                         for f := iter.First(); f != nil; f = iter.Next() {
     842           2 :                                 if f.HasRangeKeys {
     843           2 :                                         hasRangeKeys = true
     844           2 :                                         break
     845             :                                 }
     846             :                         }
     847           2 :                         if hasRangeKeys {
     848           2 :                                 newRangeKeyIterWrapper := func(ctx context.Context, file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) {
     849           2 :                                         rangeKeyIter, err := newRangeKeyIter(ctx, file, iterOptions)
     850           2 :                                         if err != nil {
     851           0 :                                                 return nil, err
     852           2 :                                         } else if rangeKeyIter == nil {
     853           0 :                                                 return emptyKeyspanIter, nil
     854           0 :                                         }
     855             :                                         // Ensure that the range key iter is not closed until the compaction is
     856             :                                         // finished. This is necessary because range key processing
     857             :                                         // requires the range keys to be held in memory for up to the
     858             :                                         // lifetime of the compaction.
     859           2 :                                         noCloseIter := &noCloseIter{rangeKeyIter}
     860           2 :                                         c.closers = append(c.closers, noCloseIter)
     861           2 : 
     862           2 :                                         // We do not need to truncate range keys to sstable boundaries, or
     863           2 :                                         // only read within the file's atomic compaction units, unlike with
     864           2 :                                         // range tombstones. This is because range keys were added after we
     865           2 :                                         // stopped splitting user keys across sstables, so all the range keys
     866           2 :                                         // in this sstable must wholly lie within the file's bounds.
     867           2 :                                         return noCloseIter, err
     868             :                                 }
     869           2 :                                 li := keyspanimpl.NewLevelIter(
     870           2 :                                         context.Background(), keyspan.SpanIterOptions{}, c.cmp,
     871           2 :                                         newRangeKeyIterWrapper, level.files.Iter(), l, manifest.KeyTypeRange,
     872           2 :                                 )
     873           2 :                                 rangeKeyIters = append(rangeKeyIters, li)
     874             :                         }
     875           2 :                         return nil
     876             :                 }
     877             : 
     878           2 :                 for i := range c.inputs {
     879           2 :                         // If the level is annotated with l0SublevelInfo, expand it into one
     880           2 :                         // level per sublevel.
     881           2 :                         // TODO(jackson): Perform this expansion even earlier when we pick the
     882           2 :                         // compaction?
     883           2 :                         if len(c.inputs[i].l0SublevelInfo) > 0 {
     884           2 :                                 for _, info := range c.startLevel.l0SublevelInfo {
     885           2 :                                         sublevelCompactionLevel := &compactionLevel{0, info.LevelSlice, nil}
     886           2 :                                         if err := addItersForLevel(sublevelCompactionLevel, info.sublevel); err != nil {
     887           0 :                                                 return nil, nil, nil, err
     888           0 :                                         }
     889             :                                 }
     890           2 :                                 continue
     891             :                         }
     892           2 :                         if err := addItersForLevel(&c.inputs[i], manifest.Level(c.inputs[i].level)); err != nil {
     893           0 :                                 return nil, nil, nil, err
     894           0 :                         }
     895             :                 }
     896             :         }
     897             : 
     898             :         // If there's only one constituent point iterator, we can avoid the overhead
     899             :         // of a *mergingIter. This is possible, for example, when performing a flush
     900             :         // of a single memtable. Otherwise, combine all the iterators into a merging
     901             :         // iter.
     902           2 :         pointIter = iters[0]
     903           2 :         if len(iters) > 1 {
     904           2 :                 pointIter = newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...)
     905           2 :         }
     906             : 
     907             :         // In normal operation, levelIter iterates over the point operations in a
     908             :         // level, and initializes a rangeDelIter pointer for the range deletions in
     909             :         // each table. During compaction, we want to iterate over the merged view of
     910             :         // point operations and range deletions. In order to do this we create one
     911             :         // levelIter per level to iterate over the point operations, and collect up
     912             :         // all the range deletion files.
     913             :         //
     914             :         // The range deletion levels are combined with a keyspanimpl.MergingIter. The
     915             :         // resulting merged rangedel iterator is then included using an
     916             :         // InterleavingIter.
     917             :         // TODO(jackson): Consider using a defragmenting iterator to stitch together
     918             :         // logical range deletions that were fragmented due to previous file
     919             :         // boundaries.
     920           2 :         if len(rangeDelIters) > 0 {
     921           2 :                 mi := &keyspanimpl.MergingIter{}
     922           2 :                 mi.Init(c.comparer, keyspan.NoopTransform, new(keyspanimpl.MergingBuffers), rangeDelIters...)
     923           2 :                 rangeDelIter = mi
     924           2 :         }
     925             : 
     926             :         // If there are range key iterators, we need to combine them using
     927             :         // keyspanimpl.MergingIter, and then interleave them among the points.
     928           2 :         if len(rangeKeyIters) > 0 {
     929           2 :                 mi := &keyspanimpl.MergingIter{}
     930           2 :                 mi.Init(c.comparer, keyspan.NoopTransform, new(keyspanimpl.MergingBuffers), rangeKeyIters...)
     931           2 :                 // TODO(radu): why do we have a defragmenter here but not above?
     932           2 :                 di := &keyspan.DefragmentingIter{}
     933           2 :                 di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, new(keyspan.DefragmentingBuffers))
     934           2 :                 rangeKeyIter = di
     935           2 :         }
     936           2 :         return pointIter, rangeDelIter, rangeKeyIter, nil
     937             : }
     938             : 
     939             : func (c *compaction) newRangeDelIter(
     940             :         newIters tableNewIters, f manifest.LevelFile, opts IterOptions, l manifest.Layer,
     941           2 : ) (*noCloseIter, error) {
     942           2 :         opts.layer = l
     943           2 :         iterSet, err := newIters(context.Background(), f.FileMetadata, &opts,
     944           2 :                 internalIterOpts{
     945           2 :                         compaction: true,
     946           2 :                         bufferPool: &c.bufferPool,
     947           2 :                 }, iterRangeDeletions)
     948           2 :         if err != nil {
     949           0 :                 return nil, err
     950           2 :         } else if iterSet.rangeDeletion == nil {
     951           2 :                 // The file doesn't contain any range deletions.
     952           2 :                 return nil, nil
     953           2 :         }
     954             :         // Ensure that rangeDelIter is not closed until the compaction is
     955             :         // finished. This is necessary because range tombstone processing
     956             :         // requires the range tombstones to be held in memory for up to the
     957             :         // lifetime of the compaction.
     958           2 :         return &noCloseIter{iterSet.rangeDeletion}, nil
     959             : }
     960             : 
     961           1 : func (c *compaction) String() string {
     962           1 :         if len(c.flushing) != 0 {
     963           0 :                 return "flush\n"
     964           0 :         }
     965             : 
     966           1 :         var buf bytes.Buffer
     967           1 :         for level := c.startLevel.level; level <= c.outputLevel.level; level++ {
     968           1 :                 i := level - c.startLevel.level
     969           1 :                 fmt.Fprintf(&buf, "%d:", level)
     970           1 :                 iter := c.inputs[i].files.Iter()
     971           1 :                 for f := iter.First(); f != nil; f = iter.Next() {
     972           1 :                         fmt.Fprintf(&buf, " %s:%s-%s", f.FileNum, f.Smallest, f.Largest)
     973           1 :                 }
     974           1 :                 fmt.Fprintf(&buf, "\n")
     975             :         }
     976           1 :         return buf.String()
     977             : }
     978             : 
     979             : type manualCompaction struct {
     980             :         // Count of the retries either due to too many concurrent compactions, or a
     981             :         // concurrent compaction to overlapping levels.
     982             :         retries     int
     983             :         level       int
     984             :         outputLevel int
     985             :         done        chan error
     986             :         start       []byte
     987             :         end         []byte
     988             :         split       bool
     989             : }
     990             : 
     991             : type readCompaction struct {
     992             :         level int
     993             :         // [start, end] key ranges are used for de-duping.
     994             :         start []byte
     995             :         end   []byte
     996             : 
     997             :         // The file associated with the compaction.
     998             :         // If the file no longer belongs in the same
     999             :         // level, then we skip the compaction.
    1000             :         fileNum base.FileNum
    1001             : }
    1002             : 
    1003           2 : func (d *DB) addInProgressCompaction(c *compaction) {
    1004           2 :         d.mu.compact.inProgress[c] = struct{}{}
    1005           2 :         var isBase, isIntraL0 bool
    1006           2 :         for _, cl := range c.inputs {
    1007           2 :                 iter := cl.files.Iter()
    1008           2 :                 for f := iter.First(); f != nil; f = iter.Next() {
    1009           2 :                         if f.IsCompacting() {
    1010           0 :                                 d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
    1011           0 :                         }
    1012           2 :                         f.SetCompactionState(manifest.CompactionStateCompacting)
    1013           2 :                         if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 {
    1014           2 :                                 if c.outputLevel.level == 0 {
    1015           2 :                                         f.IsIntraL0Compacting = true
    1016           2 :                                         isIntraL0 = true
    1017           2 :                                 } else {
    1018           2 :                                         isBase = true
    1019           2 :                                 }
    1020             :                         }
    1021             :                 }
    1022             :         }
    1023             : 
    1024           2 :         if (isIntraL0 || isBase) && c.version.L0Sublevels != nil {
    1025           2 :                 l0Inputs := []manifest.LevelSlice{c.startLevel.files}
    1026           2 :                 if isIntraL0 {
    1027           2 :                         l0Inputs = append(l0Inputs, c.outputLevel.files)
    1028           2 :                 }
    1029           2 :                 if err := c.version.L0Sublevels.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil {
    1030           0 :                         d.opts.Logger.Fatalf("could not update state for compaction: %s", err)
    1031           0 :                 }
    1032             :         }
    1033             : }
    1034             : 
    1035             : // Removes compaction markers from files in a compaction. The rollback parameter
    1036             : // indicates whether the compaction state should be rolled back to its original
    1037             : // state in the case of an unsuccessful compaction.
    1038             : //
    1039             : // DB.mu must be held when calling this method, however this method can drop and
    1040             : // re-acquire that mutex. All writes to the manifest for this compaction should
    1041             : // have completed by this point.
    1042           2 : func (d *DB) clearCompactingState(c *compaction, rollback bool) {
    1043           2 :         c.versionEditApplied = true
    1044           2 :         for _, cl := range c.inputs {
    1045           2 :                 iter := cl.files.Iter()
    1046           2 :                 for f := iter.First(); f != nil; f = iter.Next() {
    1047           2 :                         if !f.IsCompacting() {
    1048           0 :                                 d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
    1049           0 :                         }
    1050           2 :                         if !rollback {
    1051           2 :                                 // On success all compactions other than move and delete-only compactions
    1052           2 :                                 // transition the file into the Compacted state. Move-compacted files
    1053           2 :                                 // become eligible for compaction again and transition back to NotCompacting.
    1054           2 :                                 // Delete-only compactions could, on rare occasion, leave files untouched
    1055           2 :                                 // (eg. if files have a loose bound), so we revert them all to NotCompacting
    1056           2 :                                 // just in case they need to be compacted again.
    1057           2 :                                 if c.kind != compactionKindMove && c.kind != compactionKindDeleteOnly {
    1058           2 :                                         f.SetCompactionState(manifest.CompactionStateCompacted)
    1059           2 :                                 } else {
    1060           2 :                                         f.SetCompactionState(manifest.CompactionStateNotCompacting)
    1061           2 :                                 }
    1062           2 :                         } else {
    1063           2 :                                 // Else, on rollback, all input files unconditionally transition back to
    1064           2 :                                 // NotCompacting.
    1065           2 :                                 f.SetCompactionState(manifest.CompactionStateNotCompacting)
    1066           2 :                         }
    1067           2 :                         f.IsIntraL0Compacting = false
    1068             :                 }
    1069             :         }
    1070           2 :         l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c))
    1071           2 :         func() {
    1072           2 :                 // InitCompactingFileInfo requires that no other manifest writes be
    1073           2 :                 // happening in parallel with it, i.e. we're not in the midst of installing
    1074           2 :                 // another version. Otherwise, it's possible that we've created another
    1075           2 :                 // L0Sublevels instance, but not added it to the versions list, causing
    1076           2 :                 // all the indices in FileMetadata to be inaccurate. To ensure this,
    1077           2 :                 // grab the manifest lock.
    1078           2 :                 d.mu.versions.logLock()
    1079           2 :                 defer d.mu.versions.logUnlock()
    1080           2 :                 d.mu.versions.currentVersion().L0Sublevels.InitCompactingFileInfo(l0InProgress)
    1081           2 :         }()
    1082             : }
    1083             : 
    1084           2 : func (d *DB) calculateDiskAvailableBytes() uint64 {
    1085           2 :         if space, err := d.opts.FS.GetDiskUsage(d.dirname); err == nil {
    1086           2 :                 d.diskAvailBytes.Store(space.AvailBytes)
    1087           2 :                 return space.AvailBytes
    1088           2 :         } else if !errors.Is(err, vfs.ErrUnsupported) {
    1089           1 :                 d.opts.EventListener.BackgroundError(err)
    1090           1 :         }
    1091           2 :         return d.diskAvailBytes.Load()
    1092             : }
    1093             : 
    1094             : // maybeScheduleFlush schedules a flush if necessary.
    1095             : //
    1096             : // d.mu must be held when calling this.
    1097           2 : func (d *DB) maybeScheduleFlush() {
    1098           2 :         if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly {
    1099           2 :                 return
    1100           2 :         }
    1101           2 :         if len(d.mu.mem.queue) <= 1 {
    1102           2 :                 return
    1103           2 :         }
    1104             : 
    1105           2 :         if !d.passedFlushThreshold() {
    1106           2 :                 return
    1107           2 :         }
    1108             : 
    1109           2 :         d.mu.compact.flushing = true
    1110           2 :         go d.flush()
    1111             : }
    1112             : 
    1113           2 : func (d *DB) passedFlushThreshold() bool {
    1114           2 :         var n int
    1115           2 :         var size uint64
    1116           2 :         for ; n < len(d.mu.mem.queue)-1; n++ {
    1117           2 :                 if !d.mu.mem.queue[n].readyForFlush() {
    1118           2 :                         break
    1119             :                 }
    1120           2 :                 if d.mu.mem.queue[n].flushForced {
    1121           2 :                         // A flush was forced. Pretend the memtable size is the configured
    1122           2 :                         // size. See minFlushSize below.
    1123           2 :                         size += d.opts.MemTableSize
    1124           2 :                 } else {
    1125           2 :                         size += d.mu.mem.queue[n].totalBytes()
    1126           2 :                 }
    1127             :         }
    1128           2 :         if n == 0 {
    1129           2 :                 // None of the immutable memtables are ready for flushing.
    1130           2 :                 return false
    1131           2 :         }
    1132             : 
    1133             :         // Only flush once the sum of the queued memtable sizes exceeds half the
    1134             :         // configured memtable size. This prevents flushing of memtables at startup
    1135             :         // while we're undergoing the ramp period on the memtable size. See
    1136             :         // DB.newMemTable().
    1137           2 :         minFlushSize := d.opts.MemTableSize / 2
    1138           2 :         return size >= minFlushSize
    1139             : }
    1140             : 
    1141           2 : func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) {
    1142           2 :         var mem *flushableEntry
    1143           2 :         for _, m := range d.mu.mem.queue {
    1144           2 :                 if m.flushable == tbl {
    1145           2 :                         mem = m
    1146           2 :                         break
    1147             :                 }
    1148             :         }
    1149           2 :         if mem == nil || mem.flushForced {
    1150           2 :                 return
    1151           2 :         }
    1152           2 :         deadline := d.timeNow().Add(dur)
    1153           2 :         if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) {
    1154           2 :                 // Already scheduled to flush sooner than within `dur`.
    1155           2 :                 return
    1156           2 :         }
    1157           2 :         mem.delayedFlushForcedAt = deadline
    1158           2 :         go func() {
    1159           2 :                 timer := time.NewTimer(dur)
    1160           2 :                 defer timer.Stop()
    1161           2 : 
    1162           2 :                 select {
    1163           2 :                 case <-d.closedCh:
    1164           2 :                         return
    1165           2 :                 case <-mem.flushed:
    1166           2 :                         return
    1167           2 :                 case <-timer.C:
    1168           2 :                         d.commit.mu.Lock()
    1169           2 :                         defer d.commit.mu.Unlock()
    1170           2 :                         d.mu.Lock()
    1171           2 :                         defer d.mu.Unlock()
    1172           2 : 
    1173           2 :                         // NB: The timer may fire concurrently with a call to Close.  If a
    1174           2 :                         // Close call beat us to acquiring d.mu, d.closed holds ErrClosed,
    1175           2 :                         // and it's too late to flush anything. Otherwise, the Close call
    1176           2 :                         // will block on locking d.mu until we've finished scheduling the
    1177           2 :                         // flush and set `d.mu.compact.flushing` to true. Close will wait
    1178           2 :                         // for the current flush to complete.
    1179           2 :                         if d.closed.Load() != nil {
    1180           2 :                                 return
    1181           2 :                         }
    1182             : 
    1183           2 :                         if d.mu.mem.mutable == tbl {
    1184           2 :                                 d.makeRoomForWrite(nil)
    1185           2 :                         } else {
    1186           2 :                                 mem.flushForced = true
    1187           2 :                         }
    1188           2 :                         d.maybeScheduleFlush()
    1189             :                 }
    1190             :         }()
    1191             : }
    1192             : 
    1193           2 : func (d *DB) flush() {
    1194           2 :         pprof.Do(context.Background(), flushLabels, func(context.Context) {
    1195           2 :                 flushingWorkStart := crtime.NowMono()
    1196           2 :                 d.mu.Lock()
    1197           2 :                 defer d.mu.Unlock()
    1198           2 :                 idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime)
    1199           2 :                 var bytesFlushed uint64
    1200           2 :                 var err error
    1201           2 :                 if bytesFlushed, err = d.flush1(); err != nil {
    1202           1 :                         // TODO(peter): count consecutive flush errors and backoff.
    1203           1 :                         d.opts.EventListener.BackgroundError(err)
    1204           1 :                 }
    1205           2 :                 d.mu.compact.flushing = false
    1206           2 :                 d.mu.compact.noOngoingFlushStartTime = crtime.NowMono()
    1207           2 :                 workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart)
    1208           2 :                 d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed)
    1209           2 :                 d.mu.compact.flushWriteThroughput.WorkDuration += workDuration
    1210           2 :                 d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration
    1211           2 :                 // More flush work may have arrived while we were flushing, so schedule
    1212           2 :                 // another flush if needed.
    1213           2 :                 d.maybeScheduleFlush()
    1214           2 :                 // The flush may have produced too many files in a level, so schedule a
    1215           2 :                 // compaction if needed.
    1216           2 :                 d.maybeScheduleCompaction()
    1217           2 :                 d.mu.compact.cond.Broadcast()
    1218             :         })
    1219             : }
    1220             : 
    1221             : // runIngestFlush is used to generate a flush version edit for sstables which
    1222             : // were ingested as flushables. Both DB.mu and the manifest lock must be held
    1223             : // while runIngestFlush is called.
    1224           2 : func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) {
    1225           2 :         if len(c.flushing) != 1 {
    1226           0 :                 panic("pebble: ingestedFlushable must be flushed one at a time.")
    1227             :         }
    1228             : 
    1229             :         // Construct the VersionEdit, levelMetrics etc.
    1230           2 :         c.metrics = make(map[int]*LevelMetrics, numLevels)
    1231           2 :         // Finding the target level for ingestion must use the latest version
    1232           2 :         // after the logLock has been acquired.
    1233           2 :         c.version = d.mu.versions.currentVersion()
    1234           2 : 
    1235           2 :         baseLevel := d.mu.versions.picker.getBaseLevel()
    1236           2 :         ve := &versionEdit{}
    1237           2 :         var ingestSplitFiles []ingestSplitFile
    1238           2 :         ingestFlushable := c.flushing[0].flushable.(*ingestedFlushable)
    1239           2 : 
    1240           2 :         updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) {
    1241           2 :                 levelMetrics := c.metrics[level]
    1242           2 :                 if levelMetrics == nil {
    1243           2 :                         levelMetrics = &LevelMetrics{}
    1244           2 :                         c.metrics[level] = levelMetrics
    1245           2 :                 }
    1246           2 :                 levelMetrics.NumFiles--
    1247           2 :                 levelMetrics.Size -= int64(m.Size)
    1248           2 :                 for i := range added {
    1249           2 :                         levelMetrics.NumFiles++
    1250           2 :                         levelMetrics.Size += int64(added[i].Meta.Size)
    1251           2 :                 }
    1252             :         }
    1253             : 
    1254           2 :         suggestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() &&
    1255           2 :                 d.FormatMajorVersion() >= FormatVirtualSSTables
    1256           2 : 
    1257           2 :         if suggestSplit || ingestFlushable.exciseSpan.Valid() {
    1258           2 :                 // We could add deleted files to ve.
    1259           2 :                 ve.DeletedFiles = make(map[manifest.DeletedFileEntry]*manifest.FileMetadata)
    1260           2 :         }
    1261             : 
    1262           2 :         ctx := context.Background()
    1263           2 :         overlapChecker := &overlapChecker{
    1264           2 :                 comparer: d.opts.Comparer,
    1265           2 :                 newIters: d.newIters,
    1266           2 :                 opts: IterOptions{
    1267           2 :                         logger:   d.opts.Logger,
    1268           2 :                         Category: categoryIngest,
    1269           2 :                 },
    1270           2 :                 v: c.version,
    1271           2 :         }
    1272           2 :         replacedFiles := make(map[base.FileNum][]newFileEntry)
    1273           2 :         for _, file := range ingestFlushable.files {
    1274           2 :                 var fileToSplit *fileMetadata
    1275           2 :                 var level int
    1276           2 : 
    1277           2 :                 // This file fits perfectly within the excise span, so we can slot it at L6.
    1278           2 :                 if ingestFlushable.exciseSpan.Valid() &&
    1279           2 :                         ingestFlushable.exciseSpan.Contains(d.cmp, file.FileMetadata.Smallest) &&
    1280           2 :                         ingestFlushable.exciseSpan.Contains(d.cmp, file.FileMetadata.Largest) {
    1281           2 :                         level = 6
    1282           2 :                 } else {
    1283           2 :                         // TODO(radu): this can perform I/O; we should not do this while holding DB.mu.
    1284           2 :                         lsmOverlap, err := overlapChecker.DetermineLSMOverlap(ctx, file.UserKeyBounds())
    1285           2 :                         if err != nil {
    1286           0 :                                 return nil, err
    1287           0 :                         }
    1288           2 :                         level, fileToSplit, err = ingestTargetLevel(
    1289           2 :                                 ctx, d.cmp, lsmOverlap, baseLevel, d.mu.compact.inProgress, file.FileMetadata, suggestSplit,
    1290           2 :                         )
    1291           2 :                         if err != nil {
    1292           0 :                                 return nil, err
    1293           0 :                         }
    1294             :                 }
    1295             : 
    1296             :                 // Add the current flushableIngest file to the version.
    1297           2 :                 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: file.FileMetadata})
    1298           2 :                 if fileToSplit != nil {
    1299           1 :                         ingestSplitFiles = append(ingestSplitFiles, ingestSplitFile{
    1300           1 :                                 ingestFile: file.FileMetadata,
    1301           1 :                                 splitFile:  fileToSplit,
    1302           1 :                                 level:      level,
    1303           1 :                         })
    1304           1 :                 }
    1305           2 :                 levelMetrics := c.metrics[level]
    1306           2 :                 if levelMetrics == nil {
    1307           2 :                         levelMetrics = &LevelMetrics{}
    1308           2 :                         c.metrics[level] = levelMetrics
    1309           2 :                 }
    1310           2 :                 levelMetrics.BytesIngested += file.Size
    1311           2 :                 levelMetrics.TablesIngested++
    1312             :         }
    1313           2 :         if ingestFlushable.exciseSpan.Valid() {
    1314           2 :                 // Iterate through all levels and find files that intersect with exciseSpan.
    1315           2 :                 for l := range c.version.Levels {
    1316           2 :                         overlaps := c.version.Overlaps(l, base.UserKeyBoundsEndExclusive(ingestFlushable.exciseSpan.Start, ingestFlushable.exciseSpan.End))
    1317           2 :                         iter := overlaps.Iter()
    1318           2 : 
    1319           2 :                         for m := iter.First(); m != nil; m = iter.Next() {
    1320           2 :                                 newFiles, err := d.excise(context.TODO(), ingestFlushable.exciseSpan.UserKeyBounds(), m, ve, l)
    1321           2 :                                 if err != nil {
    1322           0 :                                         return nil, err
    1323           0 :                                 }
    1324             : 
    1325           2 :                                 if _, ok := ve.DeletedFiles[deletedFileEntry{
    1326           2 :                                         Level:   l,
    1327           2 :                                         FileNum: m.FileNum,
    1328           2 :                                 }]; !ok {
    1329           2 :                                         // We did not excise this file.
    1330           2 :                                         continue
    1331             :                                 }
    1332           2 :                                 replacedFiles[m.FileNum] = newFiles
    1333           2 :                                 updateLevelMetricsOnExcise(m, l, newFiles)
    1334             :                         }
    1335             :                 }
    1336             :         }
    1337             : 
    1338           2 :         if len(ingestSplitFiles) > 0 {
    1339           1 :                 if err := d.ingestSplit(context.TODO(), ve, updateLevelMetricsOnExcise, ingestSplitFiles, replacedFiles); err != nil {
    1340           0 :                         return nil, err
    1341           0 :                 }
    1342             :         }
    1343             : 
    1344           2 :         return ve, nil
    1345             : }
    1346             : 
    1347             : // flush1 runs a compaction that copies the immutable memtables from memory to
    1348             : // disk.
    1349             : //
    1350             : // d.mu must be held when calling this, but the mutex may be dropped and
    1351             : // re-acquired during the course of this method.
    1352           2 : func (d *DB) flush1() (bytesFlushed uint64, err error) {
    1353           2 :         // NB: The flushable queue can contain flushables of type ingestedFlushable.
    1354           2 :         // The sstables in ingestedFlushable.files must be placed into the appropriate
    1355           2 :         // level in the lsm. Let's say the flushable queue contains a prefix of
    1356           2 :         // regular immutable memtables, then an ingestedFlushable, and then the
    1357           2 :         // mutable memtable. When the flush of the ingestedFlushable is performed,
    1358           2 :         // it needs an updated view of the lsm. That is, the prefix of immutable
    1359           2 :         // memtables must have already been flushed. Similarly, if there are two
    1360           2 :         // contiguous ingestedFlushables in the queue, then the first flushable must
    1361           2 :         // be flushed, so that the second flushable can see an updated view of the
    1362           2 :         // lsm.
    1363           2 :         //
    1364           2 :         // Given the above, we restrict flushes to either some prefix of regular
    1365           2 :         // memtables, or a single flushable of type ingestedFlushable. The DB.flush
    1366           2 :         // function will call DB.maybeScheduleFlush again, so a new flush to finish
    1367           2 :         // the remaining flush work should be scheduled right away.
    1368           2 :         //
    1369           2 :         // NB: Large batches placed in the flushable queue share the WAL with the
    1370           2 :         // previous memtable in the queue. We must ensure the property that both the
    1371           2 :         // large batch and the memtable with which it shares a WAL are flushed
    1372           2 :         // together. The property ensures that the minimum unflushed log number
    1373           2 :         // isn't incremented incorrectly. Since a flushableBatch.readyToFlush always
    1374           2 :         // returns true, and since the large batch will always be placed right after
    1375           2 :         // the memtable with which it shares a WAL, the property is naturally
    1376           2 :         // ensured. The large batch will always be placed after the memtable with
    1377           2 :         // which it shares a WAL because we ensure it in DB.commitWrite by holding
    1378           2 :         // the commitPipeline.mu and then holding DB.mu. As an extra defensive
    1379           2 :         // measure, if we try to flush the memtable without also flushing the
    1380           2 :         // flushable batch in the same flush, since the memtable and flushableBatch
    1381           2 :         // have the same logNum, the logNum invariant check below will trigger.
    1382           2 :         var n, inputs int
    1383           2 :         var inputBytes uint64
    1384           2 :         var ingest bool
    1385           2 :         for ; n < len(d.mu.mem.queue)-1; n++ {
    1386           2 :                 if f, ok := d.mu.mem.queue[n].flushable.(*ingestedFlushable); ok {
    1387           2 :                         if n == 0 {
    1388           2 :                                 // The first flushable is of type ingestedFlushable. Since these
    1389           2 :                                 // must be flushed individually, we perform a flush for just
    1390           2 :                                 // this.
    1391           2 :                                 if !f.readyForFlush() {
    1392           0 :                                         // This check is almost unnecessary, but we guard against it
    1393           0 :                                         // just in case this invariant changes in the future.
    1394           0 :                                         panic("pebble: ingestedFlushable should always be ready to flush.")
    1395             :                                 }
    1396             :                                 // By setting n = 1, we ensure that the first flushable(n == 0)
    1397             :                                 // is scheduled for a flush. The number of tables added is equal to the
    1398             :                                 // number of files in the ingest operation.
    1399           2 :                                 n = 1
    1400           2 :                                 inputs = len(f.files)
    1401           2 :                                 ingest = true
    1402           2 :                                 break
    1403           2 :                         } else {
    1404           2 :                                 // There was some prefix of flushables which weren't of type
    1405           2 :                                 // ingestedFlushable. So, perform a flush for those.
    1406           2 :                                 break
    1407             :                         }
    1408             :                 }
    1409           2 :                 if !d.mu.mem.queue[n].readyForFlush() {
    1410           1 :                         break
    1411             :                 }
    1412           2 :                 inputBytes += d.mu.mem.queue[n].inuseBytes()
    1413             :         }
    1414           2 :         if n == 0 {
    1415           0 :                 // None of the immutable memtables are ready for flushing.
    1416           0 :                 return 0, nil
    1417           0 :         }
    1418           2 :         if !ingest {
    1419           2 :                 // Flushes of memtables add the prefix of n memtables from the flushable
    1420           2 :                 // queue.
    1421           2 :                 inputs = n
    1422           2 :         }
    1423             : 
    1424             :         // Require that every memtable being flushed has a log number less than the
    1425             :         // new minimum unflushed log number.
    1426           2 :         minUnflushedLogNum := d.mu.mem.queue[n].logNum
    1427           2 :         if !d.opts.DisableWAL {
    1428           2 :                 for i := 0; i < n; i++ {
    1429           2 :                         if logNum := d.mu.mem.queue[i].logNum; logNum >= minUnflushedLogNum {
    1430           0 :                                 panic(errors.AssertionFailedf("logNum invariant violated: flushing %d items; %d:type=%T,logNum=%d; %d:type=%T,logNum=%d",
    1431           0 :                                         n,
    1432           0 :                                         i, d.mu.mem.queue[i].flushable, logNum,
    1433           0 :                                         n, d.mu.mem.queue[n].flushable, minUnflushedLogNum))
    1434             :                         }
    1435             :                 }
    1436             :         }
    1437             : 
    1438           2 :         c, err := newFlush(d.opts, d.mu.versions.currentVersion(),
    1439           2 :                 d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n], d.timeNow())
    1440           2 :         if err != nil {
    1441           0 :                 return 0, err
    1442           0 :         }
    1443           2 :         d.addInProgressCompaction(c)
    1444           2 : 
    1445           2 :         jobID := d.newJobIDLocked()
    1446           2 :         d.opts.EventListener.FlushBegin(FlushInfo{
    1447           2 :                 JobID:      int(jobID),
    1448           2 :                 Input:      inputs,
    1449           2 :                 InputBytes: inputBytes,
    1450           2 :                 Ingest:     ingest,
    1451           2 :         })
    1452           2 :         startTime := d.timeNow()
    1453           2 : 
    1454           2 :         var ve *manifest.VersionEdit
    1455           2 :         var stats compact.Stats
    1456           2 :         // To determine the target level of the files in the ingestedFlushable, we
    1457           2 :         // need to acquire the logLock, and not release it for that duration. Since,
    1458           2 :         // we need to acquire the logLock below to perform the logAndApply step
    1459           2 :         // anyway, we create the VersionEdit for ingestedFlushable outside of
    1460           2 :         // runCompaction. For all other flush cases, we construct the VersionEdit
    1461           2 :         // inside runCompaction.
    1462           2 :         if c.kind != compactionKindIngestedFlushable {
    1463           2 :                 ve, stats, err = d.runCompaction(jobID, c)
    1464           2 :         }
    1465             : 
    1466             :         // Acquire logLock. This will be released either on an error, by way of
    1467             :         // logUnlock, or through a call to logAndApply if there is no error.
    1468           2 :         d.mu.versions.logLock()
    1469           2 : 
    1470           2 :         if c.kind == compactionKindIngestedFlushable {
    1471           2 :                 ve, err = d.runIngestFlush(c)
    1472           2 :         }
    1473             : 
    1474           2 :         info := FlushInfo{
    1475           2 :                 JobID:      int(jobID),
    1476           2 :                 Input:      inputs,
    1477           2 :                 InputBytes: inputBytes,
    1478           2 :                 Duration:   d.timeNow().Sub(startTime),
    1479           2 :                 Done:       true,
    1480           2 :                 Ingest:     ingest,
    1481           2 :                 Err:        err,
    1482           2 :         }
    1483           2 :         if err == nil {
    1484           2 :                 validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey, d.opts.Logger)
    1485           2 :                 for i := range ve.NewFiles {
    1486           2 :                         e := &ve.NewFiles[i]
    1487           2 :                         info.Output = append(info.Output, e.Meta.TableInfo())
    1488           2 :                         // Ingested tables are not necessarily flushed to L0. Record the level of
    1489           2 :                         // each ingested file explicitly.
    1490           2 :                         if ingest {
    1491           2 :                                 info.IngestLevels = append(info.IngestLevels, e.Level)
    1492           2 :                         }
    1493             :                 }
    1494           2 :                 if len(ve.NewFiles) == 0 {
    1495           2 :                         info.Err = errEmptyTable
    1496           2 :                 }
    1497             : 
    1498             :                 // The flush succeeded or it produced an empty sstable. In either case we
    1499             :                 // want to bump the minimum unflushed log number to the log number of the
    1500             :                 // oldest unflushed memtable.
    1501           2 :                 ve.MinUnflushedLogNum = minUnflushedLogNum
    1502           2 :                 if c.kind != compactionKindIngestedFlushable {
    1503           2 :                         metrics := c.metrics[0]
    1504           2 :                         if d.opts.DisableWAL {
    1505           2 :                                 // If the WAL is disabled, every flushable has a zero [logSize],
    1506           2 :                                 // resulting in zero bytes in. Instead, use the number of bytes we
    1507           2 :                                 // flushed as the BytesIn. This ensures we get a reasonable w-amp
    1508           2 :                                 // calculation even when the WAL is disabled.
    1509           2 :                                 metrics.BytesIn = metrics.BytesFlushed
    1510           2 :                         } else {
    1511           2 :                                 for i := 0; i < n; i++ {
    1512           2 :                                         metrics.BytesIn += d.mu.mem.queue[i].logSize
    1513           2 :                                 }
    1514             :                         }
    1515           2 :                 } else {
    1516           2 :                         // c.kind == compactionKindIngestedFlushable && we could have deleted files due
    1517           2 :                         // to ingest-time splits or excises.
    1518           2 :                         ingestFlushable := c.flushing[0].flushable.(*ingestedFlushable)
    1519           2 :                         for c2 := range d.mu.compact.inProgress {
    1520           2 :                                 // Check if this compaction overlaps with the excise span. Note that just
    1521           2 :                                 // checking if the inputs individually overlap with the excise span
    1522           2 :                                 // isn't sufficient; for instance, a compaction could have [a,b] and [e,f]
    1523           2 :                                 // as inputs and write it all out as [a,b,e,f] in one sstable. If we're
    1524           2 :                                 // doing a [c,d) excise at the same time as this compaction, we will have
    1525           2 :                                 // to error out the whole compaction as we can't guarantee it hasn't/won't
    1526           2 :                                 // write a file overlapping with the excise span.
    1527           2 :                                 if ingestFlushable.exciseSpan.OverlapsInternalKeyRange(d.cmp, c2.smallest, c2.largest) {
    1528           2 :                                         c2.cancel.Store(true)
    1529           2 :                                         continue
    1530             :                                 }
    1531             :                         }
    1532             : 
    1533           2 :                         if len(ve.DeletedFiles) > 0 {
    1534           2 :                                 // Iterate through all other compactions, and check if their inputs have
    1535           2 :                                 // been replaced due to an ingest-time split or excise. In that case,
    1536           2 :                                 // cancel the compaction.
    1537           2 :                                 for c2 := range d.mu.compact.inProgress {
    1538           2 :                                         for i := range c2.inputs {
    1539           2 :                                                 iter := c2.inputs[i].files.Iter()
    1540           2 :                                                 for f := iter.First(); f != nil; f = iter.Next() {
    1541           2 :                                                         if _, ok := ve.DeletedFiles[deletedFileEntry{FileNum: f.FileNum, Level: c2.inputs[i].level}]; ok {
    1542           2 :                                                                 c2.cancel.Store(true)
    1543           2 :                                                                 break
    1544             :                                                         }
    1545             :                                                 }
    1546             :                                         }
    1547             :                                 }
    1548             :                         }
    1549             :                 }
    1550           2 :                 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false, /* forceRotation */
    1551           2 :                         func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) })
    1552           2 :                 if err != nil {
    1553           1 :                         info.Err = err
    1554           1 :                 }
    1555           1 :         } else {
    1556           1 :                 // We won't be performing the logAndApply step because of the error,
    1557           1 :                 // so logUnlock.
    1558           1 :                 d.mu.versions.logUnlock()
    1559           1 :         }
    1560             : 
    1561             :         // If err != nil, then the flush will be retried, and we will recalculate
    1562             :         // these metrics.
    1563           2 :         if err == nil {
    1564           2 :                 d.mu.snapshots.cumulativePinnedCount += stats.CumulativePinnedKeys
    1565           2 :                 d.mu.snapshots.cumulativePinnedSize += stats.CumulativePinnedSize
    1566           2 :                 d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.CountMissizedDels
    1567           2 :         }
    1568             : 
    1569           2 :         d.clearCompactingState(c, err != nil)
    1570           2 :         delete(d.mu.compact.inProgress, c)
    1571           2 :         d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics)
    1572           2 : 
    1573           2 :         var flushed flushableList
    1574           2 :         if err == nil {
    1575           2 :                 flushed = d.mu.mem.queue[:n]
    1576           2 :                 d.mu.mem.queue = d.mu.mem.queue[n:]
    1577           2 :                 d.updateReadStateLocked(d.opts.DebugCheck)
    1578           2 :                 d.updateTableStatsLocked(ve.NewFiles)
    1579           2 :                 if ingest {
    1580           2 :                         d.mu.versions.metrics.Flush.AsIngestCount++
    1581           2 :                         for _, l := range c.metrics {
    1582           2 :                                 d.mu.versions.metrics.Flush.AsIngestBytes += l.BytesIngested
    1583           2 :                                 d.mu.versions.metrics.Flush.AsIngestTableCount += l.TablesIngested
    1584           2 :                         }
    1585             :                 }
    1586           2 :                 d.maybeTransitionSnapshotsToFileOnlyLocked()
    1587             : 
    1588             :         }
    1589             :         // Signal FlushEnd after installing the new readState. This helps for unit
    1590             :         // tests that use the callback to trigger a read using an iterator with
    1591             :         // IterOptions.OnlyReadGuaranteedDurable.
    1592           2 :         info.TotalDuration = d.timeNow().Sub(startTime)
    1593           2 :         d.opts.EventListener.FlushEnd(info)
    1594           2 : 
    1595           2 :         // The order of these operations matters here for ease of testing.
    1596           2 :         // Removing the reader reference first allows tests to be guaranteed that
    1597           2 :         // the memtable reservation has been released by the time a synchronous
    1598           2 :         // flush returns. readerUnrefLocked may also produce obsolete files so the
    1599           2 :         // call to deleteObsoleteFiles must happen after it.
    1600           2 :         for i := range flushed {
    1601           2 :                 flushed[i].readerUnrefLocked(true)
    1602           2 :         }
    1603             : 
    1604           2 :         d.deleteObsoleteFiles(jobID)
    1605           2 : 
    1606           2 :         // Mark all the memtables we flushed as flushed.
    1607           2 :         for i := range flushed {
    1608           2 :                 close(flushed[i].flushed)
    1609           2 :         }
    1610             : 
    1611           2 :         return inputBytes, err
    1612             : }
    1613             : 
    1614             : // maybeTransitionSnapshotsToFileOnlyLocked transitions any "eventually
    1615             : // file-only" snapshots to be file-only if all their visible state has been
    1616             : // flushed to sstables.
    1617             : //
    1618             : // REQUIRES: d.mu.
    1619           2 : func (d *DB) maybeTransitionSnapshotsToFileOnlyLocked() {
    1620           2 :         earliestUnflushedSeqNum := d.getEarliestUnflushedSeqNumLocked()
    1621           2 :         currentVersion := d.mu.versions.currentVersion()
    1622           2 :         for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; {
    1623           2 :                 if s.efos == nil {
    1624           2 :                         s = s.next
    1625           2 :                         continue
    1626             :                 }
    1627           2 :                 overlapsFlushable := false
    1628           2 :                 if base.Visible(earliestUnflushedSeqNum, s.efos.seqNum, base.SeqNumMax) {
    1629           2 :                         // There are some unflushed keys that are still visible to the EFOS.
    1630           2 :                         // Check if any memtables older than the EFOS contain keys within a
    1631           2 :                         // protected range of the EFOS. If no, we can transition.
    1632           2 :                         protectedRanges := make([]bounded, len(s.efos.protectedRanges))
    1633           2 :                         for i := range s.efos.protectedRanges {
    1634           2 :                                 protectedRanges[i] = s.efos.protectedRanges[i]
    1635           2 :                         }
    1636           2 :                         for i := range d.mu.mem.queue {
    1637           2 :                                 if !base.Visible(d.mu.mem.queue[i].logSeqNum, s.efos.seqNum, base.SeqNumMax) {
    1638           1 :                                         // All keys in this memtable are newer than the EFOS. Skip this
    1639           1 :                                         // memtable.
    1640           1 :                                         continue
    1641             :                                 }
    1642             :                                 // NB: computePossibleOverlaps could have false positives, such as if
    1643             :                                 // the flushable is a flushable ingest and not a memtable. In that
    1644             :                                 // case we don't open the sstables to check; we just pessimistically
    1645             :                                 // assume an overlap.
    1646           2 :                                 d.mu.mem.queue[i].computePossibleOverlaps(func(b bounded) shouldContinue {
    1647           2 :                                         overlapsFlushable = true
    1648           2 :                                         return stopIteration
    1649           2 :                                 }, protectedRanges...)
    1650           2 :                                 if overlapsFlushable {
    1651           2 :                                         break
    1652             :                                 }
    1653             :                         }
    1654             :                 }
    1655           2 :                 if overlapsFlushable {
    1656           2 :                         s = s.next
    1657           2 :                         continue
    1658             :                 }
    1659           2 :                 currentVersion.Ref()
    1660           2 : 
    1661           2 :                 // NB: s.efos.transitionToFileOnlySnapshot could close s, in which
    1662           2 :                 // case s.next would be nil. Save it before calling it.
    1663           2 :                 next := s.next
    1664           2 :                 _ = s.efos.transitionToFileOnlySnapshot(currentVersion)
    1665           2 :                 s = next
    1666             :         }
    1667             : }
    1668             : 
// maybeScheduleCompactionAsync should be used when we want to possibly
// schedule a compaction, but don't want to eat the cost of running
// maybeScheduleCompaction on the calling goroutine.
//
// This method should be launched in a separate goroutine. d.mu must not be
// held when this is called; the method acquires and releases it itself.
func (d *DB) maybeScheduleCompactionAsync() {
	// Deferred, so Done runs only after d.mu has been released below.
	// NOTE(review): compactionSchedulers is presumably a sync.WaitGroup whose
	// Add is performed by the caller before spawning this goroutine — confirm.
	defer d.compactionSchedulers.Done()

	d.mu.Lock()
	d.maybeScheduleCompaction()
	d.mu.Unlock()
}
    1681             : 
// maybeScheduleCompaction schedules a compaction if necessary. It delegates
// to maybeScheduleCompactionPicker, using pickAuto to choose automatic
// compactions.
//
// d.mu must be held when calling this.
func (d *DB) maybeScheduleCompaction() {
	d.maybeScheduleCompactionPicker(pickAuto)
}
    1688             : 
    1689           2 : func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction {
    1690           2 :         return picker.pickAuto(env)
    1691           2 : }
    1692             : 
    1693           2 : func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction {
    1694           2 :         return picker.pickElisionOnlyCompaction(env)
    1695           2 : }
    1696             : 
    1697             : // tryScheduleDownloadCompaction tries to start a download compaction.
    1698             : //
    1699             : // Returns true if we started a download compaction (or completed it
    1700             : // immediately because it is a no-op or we hit an error).
    1701             : //
    1702             : // Requires d.mu to be held. Updates d.mu.compact.downloads.
    1703           2 : func (d *DB) tryScheduleDownloadCompaction(env compactionEnv, maxConcurrentDownloads int) bool {
    1704           2 :         vers := d.mu.versions.currentVersion()
    1705           2 :         for i := 0; i < len(d.mu.compact.downloads); {
    1706           2 :                 download := d.mu.compact.downloads[i]
    1707           2 :                 switch d.tryLaunchDownloadCompaction(download, vers, env, maxConcurrentDownloads) {
    1708           2 :                 case launchedCompaction:
    1709           2 :                         return true
    1710           1 :                 case didNotLaunchCompaction:
    1711           1 :                         // See if we can launch a compaction for another download task.
    1712           1 :                         i++
    1713           2 :                 case downloadTaskCompleted:
    1714           2 :                         // Task is completed and must be removed.
    1715           2 :                         d.mu.compact.downloads = slices.Delete(d.mu.compact.downloads, i, i+1)
    1716             :                 }
    1717             :         }
    1718           2 :         return false
    1719             : }
    1720             : 
    1721             : // maybeScheduleCompactionPicker schedules a compaction if necessary,
    1722             : // calling `pickFunc` to pick automatic compactions.
    1723             : //
    1724             : // Requires d.mu to be held.
    1725             : func (d *DB) maybeScheduleCompactionPicker(
    1726             :         pickFunc func(compactionPicker, compactionEnv) *pickedCompaction,
    1727           2 : ) {
    1728           2 :         if d.closed.Load() != nil || d.opts.ReadOnly {
    1729           2 :                 return
    1730           2 :         }
    1731           2 :         maxCompactions := d.opts.MaxConcurrentCompactions()
    1732           2 :         maxDownloads := d.opts.MaxConcurrentDownloads()
    1733           2 : 
    1734           2 :         if d.mu.compact.compactingCount >= maxCompactions &&
    1735           2 :                 (len(d.mu.compact.downloads) == 0 || d.mu.compact.downloadingCount >= maxDownloads) {
    1736           2 :                 if len(d.mu.compact.manual) > 0 {
    1737           2 :                         // Inability to run head blocks later manual compactions.
    1738           2 :                         d.mu.compact.manual[0].retries++
    1739           2 :                 }
    1740           2 :                 return
    1741             :         }
    1742             : 
    1743             :         // Compaction picking needs a coherent view of a Version. In particular, we
    1744             :         // need to exclude concurrent ingestions from making a decision on which level
    1745             :         // to ingest into that conflicts with our compaction
    1746             :         // decision. versionSet.logLock provides the necessary mutual exclusion.
    1747           2 :         d.mu.versions.logLock()
    1748           2 :         defer d.mu.versions.logUnlock()
    1749           2 : 
    1750           2 :         // Check for the closed flag again, in case the DB was closed while we were
    1751           2 :         // waiting for logLock().
    1752           2 :         if d.closed.Load() != nil {
    1753           2 :                 return
    1754           2 :         }
    1755             : 
    1756           2 :         env := compactionEnv{
    1757           2 :                 diskAvailBytes:          d.diskAvailBytes.Load(),
    1758           2 :                 earliestSnapshotSeqNum:  d.mu.snapshots.earliest(),
    1759           2 :                 earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(),
    1760           2 :         }
    1761           2 : 
    1762           2 :         if d.mu.compact.compactingCount < maxCompactions {
    1763           2 :                 // Check for delete-only compactions first, because they're expected to be
    1764           2 :                 // cheap and reduce future compaction work.
    1765           2 :                 if !d.opts.private.disableDeleteOnlyCompactions &&
    1766           2 :                         !d.opts.DisableAutomaticCompactions &&
    1767           2 :                         len(d.mu.compact.deletionHints) > 0 {
    1768           2 :                         d.tryScheduleDeleteOnlyCompaction()
    1769           2 :                 }
    1770             : 
    1771           2 :                 for len(d.mu.compact.manual) > 0 && d.mu.compact.compactingCount < maxCompactions {
    1772           2 :                         if manual := d.mu.compact.manual[0]; !d.tryScheduleManualCompaction(env, manual) {
    1773           2 :                                 // Inability to run head blocks later manual compactions.
    1774           2 :                                 manual.retries++
    1775           2 :                                 break
    1776             :                         }
    1777           2 :                         d.mu.compact.manual = d.mu.compact.manual[1:]
    1778             :                 }
    1779             : 
    1780           2 :                 for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxCompactions &&
    1781           2 :                         d.tryScheduleAutoCompaction(env, pickFunc) {
    1782           2 :                 }
    1783             :         }
    1784             : 
    1785           2 :         for len(d.mu.compact.downloads) > 0 && d.mu.compact.downloadingCount < maxDownloads &&
    1786           2 :                 d.tryScheduleDownloadCompaction(env, maxDownloads) {
    1787           2 :         }
    1788             : }
    1789             : 
    1790             : // tryScheduleDeleteOnlyCompaction tries to kick off a delete-only compaction
    1791             : // for all files that can be deleted as suggested by deletionHints.
    1792             : //
    1793             : // Requires d.mu to be held. Updates d.mu.compact.deletionHints.
    1794           2 : func (d *DB) tryScheduleDeleteOnlyCompaction() {
    1795           2 :         v := d.mu.versions.currentVersion()
    1796           2 :         snapshots := d.mu.snapshots.toSlice()
    1797           2 :         // We need to save the value of exciseEnabled in the compaction itself, as
    1798           2 :         // it can change dynamically between now and when the compaction runs.
    1799           2 :         exciseEnabled := d.FormatMajorVersion() >= FormatVirtualSSTables &&
    1800           2 :                 d.opts.Experimental.EnableDeleteOnlyCompactionExcises != nil && d.opts.Experimental.EnableDeleteOnlyCompactionExcises()
    1801           2 :         inputs, resolvedHints, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots, exciseEnabled)
    1802           2 :         d.mu.compact.deletionHints = unresolvedHints
    1803           2 : 
    1804           2 :         if len(inputs) > 0 {
    1805           2 :                 c := newDeleteOnlyCompaction(d.opts, v, inputs, d.timeNow(), resolvedHints, exciseEnabled)
    1806           2 :                 d.mu.compact.compactingCount++
    1807           2 :                 d.addInProgressCompaction(c)
    1808           2 :                 go d.compact(c, nil)
    1809           2 :         }
    1810             : }
    1811             : 
    1812             : // tryScheduleManualCompaction tries to kick off the given manual compaction.
    1813             : //
    1814             : // Returns false if we are not able to run this compaction at this time.
    1815             : //
    1816             : // Requires d.mu to be held.
    1817           2 : func (d *DB) tryScheduleManualCompaction(env compactionEnv, manual *manualCompaction) bool {
    1818           2 :         v := d.mu.versions.currentVersion()
    1819           2 :         env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
    1820           2 :         pc, retryLater := pickManualCompaction(v, d.opts, env, d.mu.versions.picker.getBaseLevel(), manual)
    1821           2 :         if pc == nil {
    1822           2 :                 if !retryLater {
    1823           2 :                         // Manual compaction is a no-op. Signal completion and exit.
    1824           2 :                         manual.done <- nil
    1825           2 :                         return true
    1826           2 :                 }
    1827             :                 // We are not able to run this manual compaction at this time.
    1828           2 :                 return false
    1829             :         }
    1830             : 
    1831           2 :         c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider())
    1832           2 :         d.mu.compact.compactingCount++
    1833           2 :         d.addInProgressCompaction(c)
    1834           2 :         go d.compact(c, manual.done)
    1835           2 :         return true
    1836             : }
    1837             : 
    1838             : // tryScheduleAutoCompaction tries to kick off an automatic compaction.
    1839             : //
    1840             : // Returns false if no automatic compactions are necessary or able to run at
    1841             : // this time.
    1842             : //
    1843             : // Requires d.mu to be held.
    1844             : func (d *DB) tryScheduleAutoCompaction(
    1845             :         env compactionEnv, pickFunc func(compactionPicker, compactionEnv) *pickedCompaction,
    1846           2 : ) bool {
    1847           2 :         env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
    1848           2 :         env.readCompactionEnv = readCompactionEnv{
    1849           2 :                 readCompactions:          &d.mu.compact.readCompactions,
    1850           2 :                 flushing:                 d.mu.compact.flushing || d.passedFlushThreshold(),
    1851           2 :                 rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction,
    1852           2 :         }
    1853           2 :         pc := pickFunc(d.mu.versions.picker, env)
    1854           2 :         if pc == nil {
    1855           2 :                 return false
    1856           2 :         }
    1857           2 :         c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider())
    1858           2 :         d.mu.compact.compactingCount++
    1859           2 :         d.addInProgressCompaction(c)
    1860           2 :         go d.compact(c, nil)
    1861           2 :         return true
    1862             : }
    1863             : 
    1864             : // deleteCompactionHintType indicates whether the deleteCompactionHint was
    1865             : // generated from a span containing a range del (point key only), a range key
    1866             : // delete (range key only), or both a point and range key.
    1867             : type deleteCompactionHintType uint8
    1868             : 
    1869             : const (
    1870             :         // NOTE: While these are primarily used as enumeration types, they are also
    1871             :         // used for some bitwise operations. Care should be taken when updating.
    1872             :         deleteCompactionHintTypeUnknown deleteCompactionHintType = iota
    1873             :         deleteCompactionHintTypePointKeyOnly
    1874             :         deleteCompactionHintTypeRangeKeyOnly
    1875             :         deleteCompactionHintTypePointAndRangeKey
    1876             : )
    1877             : 
    1878             : // String implements fmt.Stringer.
    1879           1 : func (h deleteCompactionHintType) String() string {
    1880           1 :         switch h {
    1881           0 :         case deleteCompactionHintTypeUnknown:
    1882           0 :                 return "unknown"
    1883           1 :         case deleteCompactionHintTypePointKeyOnly:
    1884           1 :                 return "point-key-only"
    1885           1 :         case deleteCompactionHintTypeRangeKeyOnly:
    1886           1 :                 return "range-key-only"
    1887           1 :         case deleteCompactionHintTypePointAndRangeKey:
    1888           1 :                 return "point-and-range-key"
    1889           0 :         default:
    1890           0 :                 panic(fmt.Sprintf("unknown hint type: %d", h))
    1891             :         }
    1892             : }
    1893             : 
    1894             : // compactionHintFromKeys returns a deleteCompactionHintType given a slice of
    1895             : // keyspan.Keys.
    1896           2 : func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType {
    1897           2 :         var hintType deleteCompactionHintType
    1898           2 :         for _, k := range keys {
    1899           2 :                 switch k.Kind() {
    1900           2 :                 case base.InternalKeyKindRangeDelete:
    1901           2 :                         hintType |= deleteCompactionHintTypePointKeyOnly
    1902           2 :                 case base.InternalKeyKindRangeKeyDelete:
    1903           2 :                         hintType |= deleteCompactionHintTypeRangeKeyOnly
    1904           0 :                 default:
    1905           0 :                         panic(fmt.Sprintf("unsupported key kind: %s", k.Kind()))
    1906             :                 }
    1907             :         }
    1908           2 :         return hintType
    1909             : }
    1910             : 
    1911             : // A deleteCompactionHint records a user key and sequence number span that has been
    1912             : // deleted by a range tombstone. A hint is recorded if at least one sstable
    1913             : // falls completely within both the user key and sequence number spans.
    1914             : // Once the tombstones and the observed completely-contained sstables fall
    1915             : // into the same snapshot stripe, a delete-only compaction may delete any
    1916             : // sstables within the range.
    1917             : type deleteCompactionHint struct {
    1918             :         // The type of key span that generated this hint (point key, range key, or
    1919             :         // both).
    1920             :         hintType deleteCompactionHintType
    1921             :         // start and end are user keys specifying a key range [start, end) of
    1922             :         // deleted keys.
    1923             :         start []byte
    1924             :         end   []byte
    1925             :         // The level of the file containing the range tombstone(s) when the hint
    1926             :         // was created. Only lower levels need to be searched for files that may
    1927             :         // be deleted.
    1928             :         tombstoneLevel int
    1929             :         // The file containing the range tombstone(s) that created the hint.
    1930             :         tombstoneFile *fileMetadata
    1931             :         // The smallest and largest sequence numbers of the abutting tombstones
    1932             :         // merged to form this hint. All of a table's keys must be less than the
    1933             :         // tombstone smallest sequence number to be deleted. All of a table's
    1934             :         // sequence numbers must fall into the same snapshot stripe as the
    1935             :         // tombstone largest sequence number to be deleted.
    1936             :         tombstoneLargestSeqNum  base.SeqNum
    1937             :         tombstoneSmallestSeqNum base.SeqNum
    1938             :         // The smallest sequence number of a sstable that was found to be covered
    1939             :         // by this hint. The hint cannot be resolved until this sequence number is
    1940             :         // in the same snapshot stripe as the largest tombstone sequence number.
    1941             :         // This is set when a hint is created, so the LSM may look different and
    1942             :         // notably no longer contain the sstable that contained the key at this
    1943             :         // sequence number.
    1944             :         fileSmallestSeqNum base.SeqNum
    1945             : }
    1946             : 
    1947             : type deletionHintOverlap int8
    1948             : 
    1949             : const (
    1950             :         // hintDoesNotApply indicates that the hint does not apply to the file.
    1951             :         hintDoesNotApply deletionHintOverlap = iota
    1952             :         // hintExcisesFile indicates that the hint excises a portion of the file,
    1953             :         // and the format major version of the DB supports excises.
    1954             :         hintExcisesFile
    1955             :         // hintDeletesFile indicates that the hint deletes the entirety of the file.
    1956             :         hintDeletesFile
    1957             : )
    1958             : 
    1959           1 : func (h deleteCompactionHint) String() string {
    1960           1 :         return fmt.Sprintf(
    1961           1 :                 "L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)",
    1962           1 :                 h.tombstoneLevel, h.tombstoneFile.FileNum, h.start, h.end,
    1963           1 :                 h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum,
    1964           1 :                 h.hintType,
    1965           1 :         )
    1966           1 : }
    1967             : 
    1968             : func (h *deleteCompactionHint) canDeleteOrExcise(
    1969             :         cmp Compare, m *fileMetadata, snapshots compact.Snapshots, exciseEnabled bool,
    1970           2 : ) deletionHintOverlap {
    1971           2 :         // The file can only be deleted if all of its keys are older than the
    1972           2 :         // earliest tombstone aggregated into the hint. Note that we use
    1973           2 :         // m.LargestSeqNumAbsolute, not m.LargestSeqNum. Consider a compaction that
    1974           2 :         // zeroes sequence numbers. A compaction may zero the sequence number of a
    1975           2 :         // key with a sequence number > h.tombstoneSmallestSeqNum and set it to
    1976           2 :         // zero. If we looked at m.LargestSeqNum, the resulting output file would
    1977           2 :         // appear to not contain any keys more recent than the oldest tombstone. To
    1978           2 :         // avoid this error, the largest pre-zeroing sequence number is maintained
    1979           2 :         // in LargestSeqNumAbsolute and used here to make the determination whether
    1980           2 :         // the file's keys are older than all of the hint's tombstones.
    1981           2 :         if m.LargestSeqNumAbsolute >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum {
    1982           2 :                 return hintDoesNotApply
    1983           2 :         }
    1984             : 
    1985             :         // The file's oldest key must be in the same snapshot stripe as the
    1986             :         // newest tombstone. NB: We already checked the hint's sequence numbers,
    1987             :         // but this file's oldest sequence number might be lower than the hint's
    1988             :         // smallest sequence number despite the file falling within the key range
    1989             :         // if this file was constructed after the hint by a compaction.
    1990           2 :         if snapshots.Index(h.tombstoneLargestSeqNum) != snapshots.Index(m.SmallestSeqNum) {
    1991           0 :                 return hintDoesNotApply
    1992           0 :         }
    1993             : 
    1994           2 :         switch h.hintType {
    1995           2 :         case deleteCompactionHintTypePointKeyOnly:
    1996           2 :                 // A hint generated by a range del span cannot delete tables that contain
    1997           2 :                 // range keys.
    1998           2 :                 if m.HasRangeKeys {
    1999           2 :                         return hintDoesNotApply
    2000           2 :                 }
    2001           2 :         case deleteCompactionHintTypeRangeKeyOnly:
    2002           2 :                 // A hint generated by a range key del span cannot delete tables that
    2003           2 :                 // contain point keys.
    2004           2 :                 if m.HasPointKeys {
    2005           2 :                         return hintDoesNotApply
    2006           2 :                 }
    2007           2 :         case deleteCompactionHintTypePointAndRangeKey:
    2008             :                 // A hint from a span that contains both range dels *and* range keys can
    2009             :                 // only be deleted if both bounds fall within the hint. The next check takes
    2010             :                 // care of this.
    2011           0 :         default:
    2012           0 :                 panic(fmt.Sprintf("pebble: unknown delete compaction hint type: %d", h.hintType))
    2013             :         }
    2014           2 :         if cmp(h.start, m.Smallest.UserKey) <= 0 &&
    2015           2 :                 base.UserKeyExclusive(h.end).CompareUpperBounds(cmp, m.UserKeyBounds().End) >= 0 {
    2016           2 :                 return hintDeletesFile
    2017           2 :         }
    2018           2 :         if !exciseEnabled {
    2019           2 :                 // The file's keys must be completely contained within the hint range; excises
    2020           2 :                 // aren't allowed.
    2021           2 :                 return hintDoesNotApply
    2022           2 :         }
    2023             :         // Check for any overlap. In cases of partial overlap, we can excise the part of the file
    2024             :         // that overlaps with the deletion hint.
    2025           2 :         if cmp(h.end, m.Smallest.UserKey) > 0 &&
    2026           2 :                 (m.UserKeyBounds().End.CompareUpperBounds(cmp, base.UserKeyInclusive(h.start)) >= 0) {
    2027           2 :                 return hintExcisesFile
    2028           2 :         }
    2029           1 :         return hintDoesNotApply
    2030             : }
    2031             : 
    2032             : // checkDeleteCompactionHints checks the passed-in deleteCompactionHints for those that
    2033             : // can be resolved and those that cannot. A hint is considered resolved when its largest
    2034             : // tombstone sequence number and the smallest sequence number of covered files fall in
    2035             : // the same snapshot stripe. When exciseEnabled is true, no more than
    2036             : // maxHintsPerDeleteOnlyCompaction hints are resolved per call. Resolved and unresolved hints
    2037             : // are returned in separate return values. The files that the resolved hints apply to are returned as compactionLevels.
    2038             : func checkDeleteCompactionHints(
    2039             :         cmp Compare,
    2040             :         v *version,
    2041             :         hints []deleteCompactionHint,
    2042             :         snapshots compact.Snapshots,
    2043             :         exciseEnabled bool,
    2044           2 : ) (levels []compactionLevel, resolved, unresolved []deleteCompactionHint) {
    2045           2 :         var files map[*fileMetadata]bool
    2046           2 :         var byLevel [numLevels][]*fileMetadata
    2047           2 : 
    2048           2 :         // Delete-only compactions can be quadratic (O(mn)) in terms of runtime
    2049           2 :         // where m = number of files in the delete-only compaction and n = number
    2050           2 :         // of resolved hints. To prevent these from growing unbounded, we cap
    2051           2 :         // the number of hints we resolve for one delete-only compaction. This
    2052           2 :         // cap only applies if exciseEnabled == true.
    2053           2 :         const maxHintsPerDeleteOnlyCompaction = 10
    2054           2 : 
    2055           2 :         unresolvedHints := hints[:0]
    2056           2 :                 // Unlike files above, resolvedHints is allocated eagerly as an empty, non-nil slice.
    2057           2 :         resolvedHints := make([]deleteCompactionHint, 0)
    2058           2 :         for _, h := range hints {
    2059           2 :                 // Check each compaction hint to see if it's resolvable. Resolvable
    2060           2 :                 // hints are removed and trigger a delete-only compaction if any files
    2061           2 :                 // in the current LSM still meet their criteria. Unresolvable hints
    2062           2 :                 // are saved and don't trigger a delete-only compaction.
    2063           2 :                 //
    2064           2 :                 // When a compaction hint is created, the sequence numbers of the
    2065           2 :                 // range tombstones and the covered file with the oldest key are
    2066           2 :                 // recorded. The largest tombstone sequence number and the smallest
    2067           2 :                 // file sequence number must be in the same snapshot stripe for the
    2068           2 :                 // hint to be resolved. The below graphic models a compaction hint
    2069           2 :                 // covering the keyspace [b, r). The hint completely contains two
    2070           2 :                 // files, 000002 and 000003. The file 000003 contains the lowest
    2071           2 :                 // covered sequence number at #90. The tombstone b.RANGEDEL.230:h has
    2072           2 :                 // the highest tombstone sequence number incorporated into the hint.
    2073           2 :                 // The hint may be resolved only once the snapshots at #100, #180 and
    2074           2 :                 // #210 are all closed. File 000001 is not included within the hint
    2075           2 :                 // because it extends beyond the range tombstones in user key space.
    2076           2 :                 //
    2077           2 :                 // 250
    2078           2 :                 //
    2079           2 :                 //       |-b...230:h-|
    2080           2 :                 // _____________________________________________________ snapshot #210
    2081           2 :                 // 200               |--h.RANGEDEL.200:r--|
    2082           2 :                 //
    2083           2 :                 // _____________________________________________________ snapshot #180
    2084           2 :                 //
    2085           2 :                 // 150                     +--------+
    2086           2 :                 //           +---------+   | 000003 |
    2087           2 :                 //           | 000002  |   |        |
    2088           2 :                 //           +_________+   |        |
    2089           2 :                 // 100_____________________|________|___________________ snapshot #100
    2090           2 :                 //                         +--------+
    2091           2 :                 // _____________________________________________________ snapshot #70
    2092           2 :                 //                             +---------------+
    2093           2 :                 //  50                         | 000001        |
    2094           2 :                 //                             |               |
    2095           2 :                 //                             +---------------+
    2096           2 :                 // ______________________________________________________________
    2097           2 :                 //     a b c d e f g h i j k l m n o p q r s t u v w x y z
    2098           2 : 
    2099           2 :                 if snapshots.Index(h.tombstoneLargestSeqNum) != snapshots.Index(h.fileSmallestSeqNum) ||
    2100           2 :                         (len(resolvedHints) >= maxHintsPerDeleteOnlyCompaction && exciseEnabled) {
    2101           2 :                         // Cannot resolve yet.
    2102           2 :                         unresolvedHints = append(unresolvedHints, h)
    2103           2 :                         continue
    2104             :                 }
    2105             : 
    2106             :                 // The hint h will be resolved and dropped, if it either affects no files at all
    2107             :                 // or if the number of files it creates (eg. through excision) is less than or
    2108             :                 // equal to the number of files it deletes. First, determine how many files are
    2109             :                 // affected by this hint.
    2110           2 :                 filesDeletedByCurrentHint := 0
    2111           2 :                 var filesDeletedByLevel [7][]*fileMetadata
    2112           2 :                 for l := h.tombstoneLevel + 1; l < numLevels; l++ {
    2113           2 :                         overlaps := v.Overlaps(l, base.UserKeyBoundsEndExclusive(h.start, h.end))
    2114           2 :                         iter := overlaps.Iter()
    2115           2 : 
    2116           2 :                         for m := iter.First(); m != nil; m = iter.Next() {
    2117           2 :                                 doesHintApply := h.canDeleteOrExcise(cmp, m, snapshots, exciseEnabled)
    2118           2 :                                 if m.IsCompacting() || doesHintApply == hintDoesNotApply || files[m] {
    2119           2 :                                         continue
    2120             :                                 }
    2121           2 :                                 switch doesHintApply {
    2122           2 :                                 case hintDeletesFile:
    2123           2 :                                         filesDeletedByCurrentHint++
    2124           2 :                                 case hintExcisesFile:
    2125           2 :                                         // Account for the original file being deleted.
    2126           2 :                                         filesDeletedByCurrentHint++
    2127           2 :                                         // An excise could produce up to 2 new files. If the hint
    2128           2 :                                         // leaves a fragment of the file on the left, decrement
    2129           2 :                                         // the counter once. If the hint leaves a fragment of the
    2130           2 :                                         // file on the right, decrement the counter once.
    2131           2 :                                         if cmp(h.start, m.Smallest.UserKey) > 0 {
    2132           2 :                                                 filesDeletedByCurrentHint--
    2133           2 :                                         }
    2134           2 :                                         if m.UserKeyBounds().End.IsUpperBoundFor(cmp, h.end) {
    2135           2 :                                                 filesDeletedByCurrentHint--
    2136           2 :                                         }
    2137             :                                 }
    2138           2 :                                 filesDeletedByLevel[l] = append(filesDeletedByLevel[l], m)
    2139             :                         }
    2140             :                 }
    2141           2 :                 if filesDeletedByCurrentHint < 0 {
    2142           2 :                         // This hint does not delete a sufficient number of files to warrant
    2143           2 :                         // a delete-only compaction at this stage. Drop it (ie. don't add it
    2144           2 :                         // to either resolved or unresolved hints) so it doesn't stick around
    2145           2 :                         // forever.
    2146           2 :                         continue
    2147             :                 }
    2148             :                 // This hint will be resolved and dropped.
    2149           2 :                 for l := h.tombstoneLevel + 1; l < numLevels; l++ {
    2150           2 :                         byLevel[l] = append(byLevel[l], filesDeletedByLevel[l]...)
    2151           2 :                         for _, m := range filesDeletedByLevel[l] {
    2152           2 :                                 if files == nil {
    2153           2 :                                         // Construct files lazily, assuming most calls will not
    2154           2 :                                         // produce delete-only compactions.
    2155           2 :                                         files = make(map[*fileMetadata]bool)
    2156           2 :                                 }
    2157           2 :                                 files[m] = true
    2158             :                         }
    2159             :                 }
    2160           2 :                 resolvedHints = append(resolvedHints, h)
    2161             :         }
    2162             : 
    2163           2 :         var compactLevels []compactionLevel
    2164           2 :         for l, files := range byLevel {
    2165           2 :                 if len(files) == 0 {
    2166           2 :                         continue
    2167             :                 }
    2168           2 :                 compactLevels = append(compactLevels, compactionLevel{
    2169           2 :                         level: l,
    2170           2 :                         files: manifest.NewLevelSliceKeySorted(cmp, files),
    2171           2 :                 })
    2172             :         }
    2173           2 :         return compactLevels, resolvedHints, unresolvedHints
    2174             : }
    2175             : 
    2176             : // compact runs one compaction and maybe schedules another call to compact.
    2177           2 : func (d *DB) compact(c *compaction, errChannel chan error) {
    2178           2 :         pprof.Do(context.Background(), compactLabels, func(context.Context) {
    2179           2 :                 d.mu.Lock()
    2180           2 :                 defer d.mu.Unlock()
    2181           2 :                 if err := d.compact1(c, errChannel); err != nil {
    2182           2 :                         // TODO(peter): count consecutive compaction errors and backoff.
    2183           2 :                         d.opts.EventListener.BackgroundError(err)
    2184           2 :                 }
    2185           2 :                 if c.isDownload {
    2186           2 :                         d.mu.compact.downloadingCount--
    2187           2 :                 } else {
    2188           2 :                         d.mu.compact.compactingCount--
    2189           2 :                 }
    2190           2 :                 delete(d.mu.compact.inProgress, c)
    2191           2 :                 // Add this compaction's duration to the cumulative duration. NB: This
    2192           2 :                 // must be atomic with the above removal of c from
    2193           2 :                 // d.mu.compact.InProgress to ensure Metrics.Compact.Duration does not
    2194           2 :                 // miss or double count a completing compaction's duration.
    2195           2 :                 d.mu.compact.duration += d.timeNow().Sub(c.beganAt)
    2196           2 : 
    2197           2 :                 // The previous compaction may have produced too many files in a
    2198           2 :                 // level, so reschedule another compaction if needed.
    2199           2 :                 d.maybeScheduleCompaction()
    2200           2 :                 d.mu.compact.cond.Broadcast()
    2201             :         })
    2202             : }
    2203             : 
    2204             : // cleanupVersionEdit cleans up any on-disk artifacts that were created
    2205             : // for the application of a versionEdit that is no longer going to be applied.
    2206             : //
    2207             : // d.mu must be held when calling this method.
    2208           2 : func (d *DB) cleanupVersionEdit(ve *versionEdit) {
    2209           2 :         obsoleteFiles := make([]*fileBacking, 0, len(ve.NewFiles))
    2210           2 :         deletedFiles := make(map[base.FileNum]struct{})
    2211           2 :         for key := range ve.DeletedFiles {
    2212           2 :                 deletedFiles[key.FileNum] = struct{}{}
    2213           2 :         }
    2214           2 :         for i := range ve.NewFiles {
    2215           2 :                 if ve.NewFiles[i].Meta.Virtual {
    2216           1 :                         // We handle backing files separately.
    2217           1 :                         continue
    2218             :                 }
    2219           2 :                 if _, ok := deletedFiles[ve.NewFiles[i].Meta.FileNum]; ok {
    2220           1 :                         // This file is being moved in this ve to a different level.
    2221           1 :                         // Don't mark it as obsolete.
    2222           1 :                         continue
    2223             :                 }
    2224           2 :                 obsoleteFiles = append(obsoleteFiles, ve.NewFiles[i].Meta.PhysicalMeta().FileBacking)
    2225             :         }
    2226           2 :         for i := range ve.CreatedBackingTables {
    2227           1 :                 if ve.CreatedBackingTables[i].IsUnused() {
    2228           0 :                         obsoleteFiles = append(obsoleteFiles, ve.CreatedBackingTables[i])
    2229           0 :                 }
    2230             :         }
    2231           2 :         for i := range obsoleteFiles {
    2232           2 :                 // Add this file to zombie tables as well, as the versionSet
    2233           2 :                 // asserts on whether every obsolete file was at one point
    2234           2 :                 // marked zombie.
    2235           2 :                 d.mu.versions.zombieTables[obsoleteFiles[i].DiskFileNum] = tableInfo{
    2236           2 :                         fileInfo: fileInfo{
    2237           2 :                                 FileNum:  obsoleteFiles[i].DiskFileNum,
    2238           2 :                                 FileSize: obsoleteFiles[i].Size,
    2239           2 :                         },
    2240           2 :                         // TODO(bilal): This is harmless if it's wrong, as it only causes
    2241           2 :                         // incorrect accounting for the size of it in metrics. Currently
    2242           2 :                         // all compactions only write to local files anyway except with
    2243           2 :                         // disaggregated storage; if this becomes the norm, we should do
    2244           2 :                         // an objprovider lookup here.
    2245           2 :                         isLocal: true,
    2246           2 :                 }
    2247           2 :         }
    2248           2 :         d.mu.versions.addObsoleteLocked(obsoleteFiles)
    2249             : }
    2250             : 
    2251             : // compact1 runs one compaction.
    2252             : //
    2253             : // d.mu must be held when calling this, but the mutex may be dropped and
    2254             : // re-acquired during the course of this method.
    2255           2 : func (d *DB) compact1(c *compaction, errChannel chan error) (err error) {
    2256           2 :         if errChannel != nil {
    2257           2 :                 defer func() {
    2258           2 :                         errChannel <- err
    2259           2 :                 }()
    2260             :         }
    2261             : 
    2262           2 :         jobID := d.newJobIDLocked()
    2263           2 :         info := c.makeInfo(jobID)
    2264           2 :         d.opts.EventListener.CompactionBegin(info)
    2265           2 :         startTime := d.timeNow()
    2266           2 : 
    2267           2 :         ve, stats, err := d.runCompaction(jobID, c)
    2268           2 : 
    2269           2 :         info.Duration = d.timeNow().Sub(startTime)
    2270           2 :         if err == nil {
    2271           2 :                 validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey, d.opts.Logger)
    2272           2 :                 err = func() error {
    2273           2 :                         var err error
    2274           2 :                         d.mu.versions.logLock()
    2275           2 :                         // Check if this compaction had a conflicting operation (eg. a d.excise())
    2276           2 :                         // that necessitates it restarting from scratch. Note that since we hold
    2277           2 :                         // the manifest lock, we don't expect this bool to change its value
    2278           2 :                         // as only the holder of the manifest lock will ever write to it.
    2279           2 :                         if c.cancel.Load() {
    2280           2 :                                 d.mu.versions.metrics.Compact.CancelledCount++
    2281           2 :                                 d.mu.versions.metrics.Compact.CancelledBytes += c.bytesWritten
    2282           2 : 
    2283           2 :                                 err = firstError(err, ErrCancelledCompaction)
    2284           2 :                                 // This is the first time we've seen a cancellation during the
    2285           2 :                                 // life of this compaction (or the original condition on err == nil
    2286           2 :                                 // would not have been true). We should delete any tables already
    2287           2 :                                 // created, as d.runCompaction did not do that.
    2288           2 :                                 d.cleanupVersionEdit(ve)
    2289           2 :                                 // logAndApply calls logUnlock. If we didn't call it, we need to call
    2290           2 :                                 // logUnlock ourselves.
    2291           2 :                                 d.mu.versions.logUnlock()
    2292           2 :                                 return err
    2293           2 :                         }
    2294           2 :                         return d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */, func() []compactionInfo {
    2295           2 :                                 return d.getInProgressCompactionInfoLocked(c)
    2296           2 :                         })
    2297             :                 }()
    2298             :         }
    2299             : 
    2300           2 :         info.Done = true
    2301           2 :         info.Err = err
    2302           2 :         if err == nil {
    2303           2 :                 for i := range ve.NewFiles {
    2304           2 :                         e := &ve.NewFiles[i]
    2305           2 :                         info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo())
    2306           2 :                 }
    2307           2 :                 d.mu.snapshots.cumulativePinnedCount += stats.CumulativePinnedKeys
    2308           2 :                 d.mu.snapshots.cumulativePinnedSize += stats.CumulativePinnedSize
    2309           2 :                 d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.CountMissizedDels
    2310             :         }
    2311             : 
    2312             :         // NB: clearing compacting state must occur before updating the read state;
    2313             :         // L0Sublevels initialization depends on it.
    2314           2 :         d.clearCompactingState(c, err != nil)
    2315           2 :         if err != nil && errors.Is(err, ErrCancelledCompaction) {
    2316           2 :                 d.mu.versions.metrics.Compact.CancelledCount++
    2317           2 :                 d.mu.versions.metrics.Compact.CancelledBytes += c.bytesWritten
    2318           2 :         }
    2319           2 :         d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics)
    2320           2 :         d.mu.versions.incrementCompactionBytes(-c.bytesWritten)
    2321           2 : 
    2322           2 :         info.TotalDuration = d.timeNow().Sub(c.beganAt)
    2323           2 :         d.opts.EventListener.CompactionEnd(info)
    2324           2 : 
    2325           2 :         // Update the read state before deleting obsolete files because the
    2326           2 :         // read-state update will cause the previous version to be unref'd and if
    2327           2 :         // there are no references obsolete tables will be added to the obsolete
    2328           2 :         // table list.
    2329           2 :         if err == nil {
    2330           2 :                 d.updateReadStateLocked(d.opts.DebugCheck)
    2331           2 :                 d.updateTableStatsLocked(ve.NewFiles)
    2332           2 :         }
    2333           2 :         d.deleteObsoleteFiles(jobID)
    2334           2 : 
    2335           2 :         return err
    2336             : }
    2337             : 
    2338             : // runCopyCompaction runs a copy compaction where a new FileNum is created that
    2339             : // is a byte-for-byte copy of the input file or span thereof in some cases. This
    2340             : // is used in lieu of a move compaction when a file is being moved across the
    2341             : // local/remote storage boundary. It could also be used in lieu of a rewrite
    2342             : // compaction as part of a Download() call, which allows copying only a span of
    2343             : // the external file, provided the file does not contain range keys or value
    2344             : // blocks (see sstable.CopySpan).
    2345             : //
    2346             : // d.mu must be held when calling this method. The mutex will be released when
    2347             : // doing IO.
                       //
                       // On success the returned versionEdit deletes the input file and, unless the
                       // copied span turned out to be empty (sstable.ErrEmptySpan), adds the new copy
                       // at the output level. c.metrics is populated for the output level. The
                       // returned compact.Stats is always the zero value. It returns
                       // ErrCancelledCompaction if c.cancel is already set on entry.
    2348             : func (d *DB) runCopyCompaction(
    2349             :         jobID JobID, c *compaction,
    2350           2 : ) (ve *versionEdit, stats compact.Stats, _ error) {
                               // A copy compaction operates on exactly one input file.
    2351           2 :         iter := c.startLevel.files.Iter()
    2352           2 :         inputMeta := iter.First()
    2353           2 :         if iter.Next() != nil {
    2354           0 :                 return nil, compact.Stats{}, base.AssertionFailedf("got more than one file for a move compaction")
    2355           0 :         }
    2356           2 :         if c.cancel.Load() {
    2357           0 :                 return nil, compact.Stats{}, ErrCancelledCompaction
    2358           0 :         }
                               // The input file is always removed from its level by this edit; the
                               // replacement (if any) is appended to ve.NewFiles further below.
    2359           2 :         ve = &versionEdit{
    2360           2 :                 DeletedFiles: map[deletedFileEntry]*fileMetadata{
    2361           2 :                         {Level: c.startLevel.level, FileNum: inputMeta.FileNum}: inputMeta,
    2362           2 :                 },
    2363           2 :         }
    2364           2 : 
    2365           2 :         objMeta, err := d.objProvider.Lookup(fileTypeTable, inputMeta.FileBacking.DiskFileNum)
    2366           2 :         if err != nil {
    2367           0 :                 return nil, compact.Stats{}, err
    2368           0 :         }
                               // Sanity checks for the non-external case: the source must be a local,
                               // physical table whose output level is configured for shared storage.
    2369           2 :         if !objMeta.IsExternal() {
    2370           2 :                 if objMeta.IsRemote() || !remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level) {
    2371           0 :                         panic("pebble: scheduled a copy compaction that is not actually moving files to shared storage")
    2372             :                 }
    2373             :                 // Note that based on logic in the compaction picker, we're guaranteed
    2374             :                 // inputMeta.Virtual is false.
    2375           2 :                 if inputMeta.Virtual {
    2376           0 :                         panic(errors.AssertionFailedf("cannot do a copy compaction of a virtual sstable across local/remote storage"))
    2377             :                 }
    2378             :         }
    2379             : 
    2380             :         // We are in the relatively more complex case where we need to copy this
    2381             :         // file to remote storage. Drop the db mutex while we do the copy
    2382             :         //
    2383             :         // To ease up cleanup of the local file and tracking of refs, we create
    2384             :         // a new FileNum. This has the potential of making the block cache less
    2385             :         // effective, however.
    2386           2 :         newMeta := &fileMetadata{
    2387           2 :                 Size:                     inputMeta.Size,
    2388           2 :                 CreationTime:             inputMeta.CreationTime,
    2389           2 :                 SmallestSeqNum:           inputMeta.SmallestSeqNum,
    2390           2 :                 LargestSeqNum:            inputMeta.LargestSeqNum,
    2391           2 :                 LargestSeqNumAbsolute:    inputMeta.LargestSeqNumAbsolute,
    2392           2 :                 Stats:                    inputMeta.Stats,
    2393           2 :                 Virtual:                  inputMeta.Virtual,
    2394           2 :                 SyntheticPrefixAndSuffix: inputMeta.SyntheticPrefixAndSuffix,
    2395           2 :         }
                               // Carry over whichever key bounds (point and/or range) the input has.
    2396           2 :         if inputMeta.HasPointKeys {
    2397           2 :                 newMeta.ExtendPointKeyBounds(c.cmp, inputMeta.SmallestPointKey, inputMeta.LargestPointKey)
    2398           2 :         }
    2399           2 :         if inputMeta.HasRangeKeys {
    2400           2 :                 newMeta.ExtendRangeKeyBounds(c.cmp, inputMeta.SmallestRangeKey, inputMeta.LargestRangeKey)
    2401           2 :         }
    2402           2 :         newMeta.FileNum = d.mu.versions.getNextFileNum()
    2403           2 :         if objMeta.IsExternal() {
    2404           2 :                 // external -> local/shared copy. File must be virtual.
    2405           2 :                 // We will update this size later after we produce the new backing file.
    2406           2 :                 newMeta.InitProviderBacking(base.DiskFileNum(newMeta.FileNum), inputMeta.FileBacking.Size)
    2407           2 :         } else {
    2408           2 :                 // local -> shared copy. New file is guaranteed to not be virtual.
    2409           2 :                 newMeta.InitPhysicalBacking()
    2410           2 :         }
    2411             : 
    2412             :         // Before dropping the db mutex, grab a ref to the current version. This
    2413             :         // prevents any concurrent excises from deleting files that this compaction
    2414             :         // needs to read/maintain a reference to.
    2415           2 :         vers := d.mu.versions.currentVersion()
    2416           2 :         vers.Ref()
    2417           2 :         defer vers.UnrefLocked()
    2418           2 : 
    2419           2 :         // NB: The order here is reversed, lock after unlock. This is similar to
    2420           2 :         // runCompaction.
    2421           2 :         d.mu.Unlock()
    2422           2 :         defer d.mu.Lock()
    2423           2 : 
                               // deleteOnExit is set as soon as the output object has been created and is
                               // cleared only after the final Sync succeeds, so every other return after
                               // creation (including the empty-span success path) removes the new object.
                               // Deferred calls run last-in-first-out: this removal runs before d.mu is
                               // re-acquired, and vers.UnrefLocked runs after it has been re-acquired.
    2424           2 :         deleteOnExit := false
    2425           2 :         defer func() {
    2426           2 :                 if deleteOnExit {
    2427           1 :                         _ = d.objProvider.Remove(fileTypeTable, newMeta.FileBacking.DiskFileNum)
    2428           1 :                 }
    2429             :         }()
    2430             : 
    2431             :         // If the src obj is external, we're doing an external to local/shared copy.
    2432           2 :         if objMeta.IsExternal() {
    2433           2 :                 ctx := context.TODO()
    2434           2 :                 src, err := d.objProvider.OpenForReading(
    2435           2 :                         ctx, fileTypeTable, inputMeta.FileBacking.DiskFileNum, objstorage.OpenOptions{},
    2436           2 :                 )
    2437           2 :                 if err != nil {
    2438           0 :                         return nil, compact.Stats{}, err
    2439           0 :                 }
                                       // Close src on early returns; src is set to nil below once ownership
                                       // has been handed over to CopySpan.
    2440           2 :                 defer func() {
    2441           2 :                         if src != nil {
    2442           0 :                                 src.Close()
    2443           0 :                         }
    2444             :                 }()
    2445             : 
    2446           2 :                 w, _, err := d.objProvider.Create(
    2447           2 :                         ctx, fileTypeTable, newMeta.FileBacking.DiskFileNum,
    2448           2 :                         objstorage.CreateOptions{
    2449           2 :                                 PreferSharedStorage: remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level),
    2450           2 :                         },
    2451           2 :                 )
    2452           2 :                 if err != nil {
    2453           0 :                         return nil, compact.Stats{}, err
    2454           0 :                 }
    2455           2 :                 deleteOnExit = true
    2456           2 : 
                                       // Translate the table's bounds into the key space of the backing
                                       // file by undoing the synthetic prefix, if there is one.
    2457           2 :                 start, end := newMeta.Smallest, newMeta.Largest
    2458           2 :                 if newMeta.SyntheticPrefixAndSuffix.HasPrefix() {
    2459           2 :                         syntheticPrefix := newMeta.SyntheticPrefixAndSuffix.Prefix()
    2460           2 :                         start.UserKey = syntheticPrefix.Invert(start.UserKey)
    2461           2 :                         end.UserKey = syntheticPrefix.Invert(end.UserKey)
    2462           2 :                 }
    2463           2 :                 if newMeta.SyntheticPrefixAndSuffix.HasSuffix() {
    2464           1 :                         // Extend the bounds as necessary so that the keys don't include suffixes.
    2465           1 :                         start.UserKey = start.UserKey[:c.comparer.Split(start.UserKey)]
    2466           1 :                         if n := c.comparer.Split(end.UserKey); n < len(end.UserKey) {
    2467           0 :                                 end = base.MakeRangeDeleteSentinelKey(c.comparer.ImmediateSuccessor(nil, end.UserKey[:n]))
    2468           0 :                         }
    2469             :                 }
    2470             : 
    2471             :                 // NB: external files are always virtual.
    2472           2 :                 var wrote uint64
    2473           2 :                 err = d.fileCache.withVirtualReader(inputMeta.VirtualMeta(), func(r sstable.VirtualReader) error {
    2474           2 :                         var err error
    2475           2 :                         wrote, err = sstable.CopySpan(ctx,
    2476           2 :                                 src, r.UnsafeReader(), d.opts.MakeReaderOptions(),
    2477           2 :                                 w, d.opts.MakeWriterOptions(c.outputLevel.level, d.TableFormat()),
    2478           2 :                                 start, end,
    2479           2 :                         )
    2480           2 :                         return err
    2481           2 :                 })
    2482             : 
    2483           2 :                 src = nil // We passed src to CopySpan; it's responsible for closing it.
    2484           2 :                 if err != nil {
    2485           1 :                         if errors.Is(err, sstable.ErrEmptySpan) {
    2486           1 :                                 // The virtual table was empty. Just remove the backing file.
    2487           1 :                                 // Note that deleteOnExit is true so we will delete the created object.
    2488           1 :                                 c.metrics = map[int]*LevelMetrics{
    2489           1 :                                         c.outputLevel.level: {
    2490           1 :                                                 BytesIn: inputMeta.Size,
    2491           1 :                                         },
    2492           1 :                                 }
                                                       // ve only contains the deletion of the input at this point.
    2493           1 :                                 return ve, compact.Stats{}, nil
    2494           1 :                         }
    2495           0 :                         return nil, compact.Stats{}, err
    2496             :                 }
                                       // Replace the provisional sizes with the value reported by CopySpan.
    2497           2 :                 newMeta.FileBacking.Size = wrote
    2498           2 :                 newMeta.Size = wrote
    2499           2 :         } else {
    2500           2 :                 _, err := d.objProvider.LinkOrCopyFromLocal(context.TODO(), d.opts.FS,
    2501           2 :                         d.objProvider.Path(objMeta), fileTypeTable, newMeta.FileBacking.DiskFileNum,
    2502           2 :                         objstorage.CreateOptions{PreferSharedStorage: true})
    2503           2 :                 if err != nil {
    2504           0 :                         return nil, compact.Stats{}, err
    2505           0 :                 }
    2506           2 :                 deleteOnExit = true
    2507             :         }
    2508           2 :         ve.NewFiles = []newFileEntry{{
    2509           2 :                 Level: c.outputLevel.level,
    2510           2 :                 Meta:  newMeta,
    2511           2 :         }}
                               // A virtual output needs its freshly created backing registered in the edit.
    2512           2 :         if newMeta.Virtual {
    2513           2 :                 ve.CreatedBackingTables = []*fileBacking{newMeta.FileBacking}
    2514           2 :         }
    2515           2 :         c.metrics = map[int]*LevelMetrics{
    2516           2 :                 c.outputLevel.level: {
    2517           2 :                         BytesIn:         inputMeta.Size,
    2518           2 :                         BytesCompacted:  newMeta.Size,
    2519           2 :                         TablesCompacted: 1,
    2520           2 :                 },
    2521           2 :         }
    2522           2 : 
    2523           2 :         if err := d.objProvider.Sync(); err != nil {
    2524           0 :                 return nil, compact.Stats{}, err
    2525           0 :         }
                               // Success: keep the newly created object.
    2526           2 :         deleteOnExit = false
    2527           2 :         return ve, compact.Stats{}, nil
    2528             : }
    2529             : 
    2530             : // applyHintOnFile applies a deleteCompactionHint to a file, and updates the
    2531             : // versionEdit accordingly. It returns a list of new files that were created
    2532             : // if the hint was applied partially to a file (eg. through an excise as opposed
    2533             : // to an outright deletion). levelMetrics is kept up-to-date with the number
    2534             : // of tables deleted or excised.
    2535             : func (d *DB) applyHintOnFile(
    2536             :         h deleteCompactionHint,
    2537             :         f *fileMetadata,
    2538             :         level int,
    2539             :         levelMetrics *LevelMetrics,
    2540             :         ve *versionEdit,
    2541             :         hintOverlap deletionHintOverlap,
    2542           2 : ) (newFiles []manifest.NewFileEntry, err error) {
    2543           2 :         if hintOverlap == hintDoesNotApply {
    2544           0 :                 return nil, nil
    2545           0 :         }
    2546             : 
    2547             :         // The hint overlaps with at least part of the file.
    2548           2 :         if hintOverlap == hintDeletesFile {
    2549           2 :                 // The hint deletes the entirety of this file.
    2550           2 :                 ve.DeletedFiles[deletedFileEntry{
    2551           2 :                         Level:   level,
    2552           2 :                         FileNum: f.FileNum,
    2553           2 :                 }] = f
    2554           2 :                 levelMetrics.TablesDeleted++
    2555           2 :                 return nil, nil
    2556           2 :         }
    2557             :         // The hint overlaps with only a part of the file, not the entirety of it. We need
    2558             :         // to use d.excise. (hintOverlap == hintExcisesFile)
    2559           2 :         if d.FormatMajorVersion() < FormatVirtualSSTables {
    2560           0 :                 panic("pebble: delete-only compaction hint excising a file is not supported in this version")
    2561             :         }
    2562             : 
    2563           2 :         levelMetrics.TablesExcised++
    2564           2 :         newFiles, err = d.excise(context.TODO(), base.UserKeyBoundsEndExclusive(h.start, h.end), f, ve, level)
    2565           2 :         if err != nil {
    2566           0 :                 return nil, errors.Wrap(err, "error when running excise for delete-only compaction")
    2567           0 :         }
    2568           2 :         if _, ok := ve.DeletedFiles[deletedFileEntry{
    2569           2 :                 Level:   level,
    2570           2 :                 FileNum: f.FileNum,
    2571           2 :         }]; !ok {
    2572           0 :                 panic("pebble: delete-only compaction hint overlapping a file did not excise that file")
    2573             :         }
    2574           2 :         return newFiles, nil
    2575             : }
    2576             : 
                       // runDeleteOnlyCompactionForLevel applies the hints carried by fragments to
                       // every file of the input level cl. Deleted and excised files are recorded in
                       // ve and counted in levelMetrics; snapshots and exciseEnabled are forwarded to
                       // canDeleteOrExcise to decide how each hint applies to a file.
                       //
                       // fragments is walked in lockstep with the files of the level, so it is
                       // presumably expected to be ordered by start key, as produced by
                       // fragmentDeleteCompactionHints (confirm at the call site).
                       //
                       // It returns the first error reported by applyHintOnFile. It panics if cl is
                       // L0, or if some file of the level ends up neither deleted nor excised in ve.
    2577             : func (d *DB) runDeleteOnlyCompactionForLevel(
    2578             :         cl compactionLevel,
    2579             :         levelMetrics *LevelMetrics,
    2580             :         ve *versionEdit,
    2581             :         snapshots compact.Snapshots,
    2582             :         fragments []deleteCompactionHintFragment,
    2583             :         exciseEnabled bool,
    2584           2 : ) error {
                               // curFragment is shared across files: it only moves forward, apart from
                               // the single step back taken for each file below.
    2585           2 :         curFragment := 0
    2586           2 :         iter := cl.files.Iter()
    2587           2 :         if cl.level == 0 {
    2588           0 :                 panic("cannot run delete-only compaction for L0")
    2589             :         }
    2590             : 
    2591             :         // Outer loop loops on files. Middle loop loops on fragments. Inner loop
    2592             :         // loops on raw fragments of hints. Number of fragments are bounded by
    2593             :         // the number of hints this compaction was created with, which is capped
    2594             :         // in the compaction picker to avoid very CPU-hot loops here.
    2595           2 :         for f := iter.First(); f != nil; f = iter.Next() {
    2596           2 :                 // curFile usually matches f, except if f got excised in which case
    2597           2 :                 // it maps to a virtual file that replaces f, or nil if f got removed
    2598           2 :                 // in its entirety.
    2599           2 :                 curFile := f
                                       // Skip past every fragment starting at or before f's smallest user
                                       // key, then step back one so that curFragment is the last such
                                       // fragment (the one containing f's smallest key), if there is one.
    2600           2 :                 for curFragment < len(fragments) && d.cmp(fragments[curFragment].start, f.Smallest.UserKey) <= 0 {
    2601           2 :                         curFragment++
    2602           2 :                 }
    2603           2 :                 if curFragment > 0 {
    2604           2 :                         curFragment--
    2605           2 :                 }
    2606             : 
    2607           2 :                 for ; curFragment < len(fragments); curFragment++ {
                                               // Stop once the fragment starts beyond the end of f's bounds.
    2608           2 :                         if f.UserKeyBounds().End.CompareUpperBounds(d.cmp, base.UserKeyInclusive(fragments[curFragment].start)) < 0 {
    2609           2 :                                 break
    2610             :                         }
    2611             :                         // Process all overlapping hints with this file. Note that applying
    2612             :                         // a hint twice is idempotent; curFile should have already been excised
    2613             :                         // the first time, resulting in no change the second time.
    2614           2 :                         for _, h := range fragments[curFragment].hints {
    2615           2 :                                 if h.tombstoneLevel >= cl.level {
    2616           2 :                                         // We cannot excise out the deletion tombstone itself, or anything
    2617           2 :                                         // above it.
    2618           2 :                                         continue
    2619             :                                 }
    2620           2 :                                 hintOverlap := h.canDeleteOrExcise(d.cmp, curFile, snapshots, exciseEnabled)
    2621           2 :                                 if hintOverlap == hintDoesNotApply {
    2622           2 :                                         continue
    2623             :                                 }
    2624           2 :                                 newFiles, err := d.applyHintOnFile(h, curFile, cl.level, levelMetrics, ve, hintOverlap)
    2625           2 :                                 if err != nil {
    2626           0 :                                         return err
    2627           0 :                                 }
                                                       // If curFile was deleted by the edit, continue with the last file
                                                       // produced by the excise, or stop if nothing replaced it.
    2628           2 :                                 if _, ok := ve.DeletedFiles[manifest.DeletedFileEntry{Level: cl.level, FileNum: curFile.FileNum}]; ok {
    2629           2 :                                         curFile = nil
    2630           2 :                                 }
    2631           2 :                                 if len(newFiles) > 0 {
    2632           2 :                                         curFile = newFiles[len(newFiles)-1].Meta
    2633           2 :                                 } else if curFile == nil {
    2634           2 :                                         // Nothing remains of the file.
    2635           2 :                                         break
    2636             :                                 }
    2637             :                         }
    2638           2 :                         if curFile == nil {
    2639           2 :                                 // Nothing remains of the file.
    2640           2 :                                 break
    2641             :                         }
    2642             :                 }
                                       // Every input file of a delete-only compaction must have been deleted
                                       // or excised (an excise also records the original file as deleted).
    2643           2 :                 if _, ok := ve.DeletedFiles[deletedFileEntry{
    2644           2 :                         Level:   cl.level,
    2645           2 :                         FileNum: f.FileNum,
    2646           2 :                 }]; !ok {
    2647           0 :                         panic("pebble: delete-only compaction scheduled with hints that did not delete or excise a file")
    2648             :                 }
    2649             :         }
    2650           2 :         return nil
    2651             : }
    2652             : 
    2653             : // deleteCompactionHintFragment represents a fragment of the key space and
    2654             : // contains a set of deleteCompactionHints that apply to that fragment; a
    2655             : // fragment starts at the start field and ends where the next fragment starts.
    2656             : type deleteCompactionHintFragment struct {
                               // start is the user key at which this fragment begins.
    2657             :         start []byte
                               // hints holds the hints that apply to this fragment. It is left empty for
                               // a fragment that no hint covers.
    2658             :         hints []deleteCompactionHint
    2659             : }
    2660             : 
    2661             : // fragmentDeleteCompactionHints fragments a list of delete compaction hints. Hints
    2662             : // can overlap with each other, and multiple fragments can apply to a single file.
    2663             : // Every hint start and end key becomes a fragment boundary; the fragments come back
    2664             : // sorted by cmp, each holding all hints h with h.start <= fragment.start < h.end, so
    2665             : // that non-overlapping files in a level and hint fragments can be iterated in lockstep.
    2666             : func fragmentDeleteCompactionHints(
    2667             :         cmp Compare, hints []deleteCompactionHint,
    2668           2 : ) []deleteCompactionHintFragment {
    2669           2 :         fragments := make([]deleteCompactionHintFragment, 0, len(hints)*2) // two boundaries (start, end) per hint
    2670           2 :         for i := range hints {
    2671           2 :                 fragments = append(fragments, deleteCompactionHintFragment{start: hints[i].start},
    2672           2 :                         deleteCompactionHintFragment{start: hints[i].end})
    2673           2 :         }
    2674           2 :         slices.SortFunc(fragments, func(i, j deleteCompactionHintFragment) int { // order the boundaries by key
    2675           2 :                 return cmp(i.start, j.start)
    2676           2 :         })
    2677           2 :         fragments = slices.CompactFunc(fragments, func(i, j deleteCompactionHintFragment) bool { // drop adjacent duplicate boundaries
    2678           2 :                 return bytes.Equal(i.start, j.start) // NOTE(review): equality is bytes.Equal while ordering uses cmp; confirm this is intended
    2679           2 :         })
    2680           2 :         for _, h := range hints { // attach each hint to every fragment it covers
    2681           2 :                 startIdx := sort.Search(len(fragments), func(i int) bool { // first fragment starting at or after h.start
    2682           2 :                         return cmp(fragments[i].start, h.start) >= 0
    2683           2 :                 })
    2684           2 :                 endIdx := sort.Search(len(fragments), func(i int) bool { // first fragment starting at or after h.end (exclusive)
    2685           2 :                         return cmp(fragments[i].start, h.end) >= 0
    2686           2 :                 })
    2687           2 :                 for i := startIdx; i < endIdx; i++ {
    2688           2 :                         fragments[i].hints = append(fragments[i].hints, h)
    2689           2 :                 }
    2690             :         }
    2691           2 :         return fragments // the fragment starting at the largest boundary key ends up with no hints
    2692             : }
    2693             : 
    2694             : // runDeleteOnlyCompaction runs a delete-only compaction: it fragments c.deletionHints and
    2695             : // hands them to runDeleteOnlyCompactionForLevel for every input level, along with the versionEdit being built.
    2696             : // d.mu must *not* be held when calling this.
    2697             : func (d *DB) runDeleteOnlyCompaction(
    2698             :         jobID JobID, c *compaction, snapshots compact.Snapshots,
    2699           2 : ) (ve *versionEdit, stats compact.Stats, retErr error) {
    2700           2 :         c.metrics = make(map[int]*LevelMetrics, len(c.inputs)) // one entry per input level, filled in below
    2701           2 :         fragments := fragmentDeleteCompactionHints(d.cmp, c.deletionHints)
    2702           2 :         ve = &versionEdit{
    2703           2 :                 DeletedFiles: map[deletedFileEntry]*fileMetadata{},
    2704           2 :         }
    2705           2 :         for _, cl := range c.inputs {
    2706           2 :                 levelMetrics := &LevelMetrics{}
    2707           2 :                 if err := d.runDeleteOnlyCompactionForLevel(cl, levelMetrics, ve, snapshots, fragments, c.exciseEnabled); err != nil {
    2708           0 :                         return nil, stats, err
    2709           0 :                 }
    2710           2 :                 c.metrics[cl.level] = levelMetrics
    2711             :         }
    2712             :         // Cancel out any file that is both added and deleted (same level and FileNum) in this versionEdit.
    2713           2 :         ve.NewFiles = slices.DeleteFunc(ve.NewFiles, func(e manifest.NewFileEntry) bool {
    2714           2 :                 deletedFileEntry := manifest.DeletedFileEntry{Level: e.Level, FileNum: e.Meta.FileNum} // local name shadows the deletedFileEntry type inside this closure
    2715           2 :                 if _, deleted := ve.DeletedFiles[deletedFileEntry]; deleted {
    2716           2 :                         delete(ve.DeletedFiles, deletedFileEntry) // drop the deletion as well as the addition
    2717           2 :                         return true
    2718           2 :                 }
    2719           2 :                 return false
    2720             :         })
    2721             :         // Remove any entries from CreatedBackingTables that are not referenced by a
    2722             :         // virtual table remaining in NewFiles.
    2723           2 :         usedBackingFiles := make(map[base.DiskFileNum]struct{})
    2724           2 :         for _, e := range ve.NewFiles {
    2725           2 :                 if e.Meta.Virtual {
    2726           2 :                         usedBackingFiles[e.Meta.FileBacking.DiskFileNum] = struct{}{}
    2727           2 :                 }
    2728             :         }
    2729           2 :         ve.CreatedBackingTables = slices.DeleteFunc(ve.CreatedBackingTables, func(b *fileBacking) bool {
    2730           2 :                 _, used := usedBackingFiles[b.DiskFileNum]
    2731           2 :                 return !used
    2732           2 :         })
    2733             :         // Refresh the disk available statistic whenever a compaction/flush
    2734             :         // completes, before the caller re-acquires the mutex.
    2735           2 :         d.calculateDiskAvailableBytes()
    2736           2 :         return ve, stats, nil // stats is never assigned in this function, so it is the zero value
    2737             : }
    2738             : // runMoveCompaction builds the versionEdit for a move compaction: the single start-level file is deleted from its level and the same metadata is re-added at the output level.
    2739             : func (d *DB) runMoveCompaction(
    2740             :         jobID JobID, c *compaction,
    2741           2 : ) (ve *versionEdit, stats compact.Stats, _ error) {
    2742           2 :         iter := c.startLevel.files.Iter()
    2743           2 :         meta := iter.First()
    2744           2 :         if iter.Next() != nil { // a second start-level file exists
    2745           0 :                 return nil, stats, base.AssertionFailedf("got more than one file for a move compaction")
    2746           0 :         }
    2747           2 :         if c.cancel.Load() {
    2748           0 :                 return ve, stats, ErrCancelledCompaction // ve has not been assigned yet, so it is nil here
    2749           0 :         }
    2750           2 :         c.metrics = map[int]*LevelMetrics{ // the move is accounted to the output level only
    2751           2 :                 c.outputLevel.level: {
    2752           2 :                         BytesMoved:  meta.Size,
    2753           2 :                         TablesMoved: 1,
    2754           2 :                 },
    2755           2 :         }
    2756           2 :         ve = &versionEdit{
    2757           2 :                 DeletedFiles: map[deletedFileEntry]*fileMetadata{
    2758           2 :                         {Level: c.startLevel.level, FileNum: meta.FileNum}: meta,
    2759           2 :                 },
    2760           2 :                 NewFiles: []newFileEntry{
    2761           2 :                         {Level: c.outputLevel.level, Meta: meta}, // same *fileMetadata, new level
    2762           2 :                 },
    2763           2 :         }
    2764           2 : 
    2765           2 :         return ve, stats, nil // stats is never assigned in this function, so it is the zero value
    2766             : }
    2767             : 
    2768             : // runCompaction runs a compaction that produces new on-disk tables from
    2769             : // memtables or old on-disk tables. Delete-only, move and copy compactions are delegated to dedicated helpers.
    2770             : //
    2771             : // runCompaction cannot be used for compactionKindIngestedFlushable; it panics if given one.
    2772             : //
    2773             : // d.mu must be held when calling this, but the mutex may be dropped and
    2774             : // re-acquired during the course of this method.
    2775             : func (d *DB) runCompaction(
    2776             :         jobID JobID, c *compaction,
    2777           2 : ) (ve *versionEdit, stats compact.Stats, retErr error) {
    2778           2 :         if c.cancel.Load() {
    2779           2 :                 return ve, stats, ErrCancelledCompaction // ve is still nil at this point
    2780           2 :         }
    2781           2 :         switch c.kind {
    2782           2 :         case compactionKindDeleteOnly:
    2783           2 :                 // Before dropping the db mutex, grab a ref to the current version. This
    2784           2 :                 // prevents any concurrent excises from deleting files that this compaction
    2785           2 :                 // needs to read/maintain a reference to.
    2786           2 :                 //
    2787           2 :                 // Note that delete-only compactions can call excise(), which needs to be able
    2788           2 :                 // to read these files.
    2789           2 :                 vers := d.mu.versions.currentVersion()
    2790           2 :                 vers.Ref()
    2791           2 :                 defer vers.UnrefLocked() // defers run LIFO: this fires after the deferred d.mu.Lock() below, i.e. with d.mu held
    2792           2 :                 // Release the d.mu lock while doing I/O.
    2793           2 :                 // Note the unusual order: Unlock now, and the deferred Lock re-acquires d.mu on return.
    2794           2 :                 snapshots := d.mu.snapshots.toSlice() // captured while d.mu is still held
    2795           2 :                 d.mu.Unlock()
    2796           2 :                 defer d.mu.Lock()
    2797           2 :                 return d.runDeleteOnlyCompaction(jobID, c, snapshots)
    2798           2 :         case compactionKindMove:
    2799           2 :                 return d.runMoveCompaction(jobID, c)
    2800           2 :         case compactionKindCopy:
    2801           2 :                 return d.runCopyCompaction(jobID, c)
    2802           0 :         case compactionKindIngestedFlushable:
    2803           0 :                 panic("pebble: runCompaction cannot handle compactionKindIngestedFlushable.")
    2804             :         }
    2805             : 
    2806           2 :         snapshots := d.mu.snapshots.toSlice() // captured while d.mu is still held
    2807           2 : 
    2808           2 :         if c.flushing == nil {
    2809           2 :                 // Before dropping the db mutex, grab a ref to the current version. This
    2810           2 :                 // prevents any concurrent excises from deleting files that this compaction
    2811           2 :                 // needs to read/maintain a reference to.
    2812           2 :                 //
    2813           2 :                 // Note that unlike user iterators, compactionIter does not maintain a ref
    2814           2 :                 // of the version or read state.
    2815           2 :                 vers := d.mu.versions.currentVersion()
    2816           2 :                 vers.Ref()
    2817           2 :                 defer vers.UnrefLocked() // defers run LIFO: this fires after the deferred d.mu.Lock() below, i.e. with d.mu held
    2818           2 :         }
    2819             : 
    2820             :         // The table is typically written at the maximum allowable format implied by
    2821             :         // the current format major version of the DB, but Options may define
    2822             :         // additional constraints.
    2823           2 :         tableFormat := d.TableFormat()
    2824           2 : 
    2825           2 :         // Release the d.mu lock while doing I/O.
    2826           2 :         // Note the unusual order: Unlock now, and the deferred Lock re-acquires d.mu on return.
    2827           2 :         d.mu.Unlock()
    2828           2 :         defer d.mu.Lock()
    2829           2 : 
    2830           2 :         result := d.compactAndWrite(jobID, c, snapshots, tableFormat)
    2831           2 :         if result.Err == nil {
    2832           2 :                 ve, result.Err = c.makeVersionEdit(result)
    2833           2 :         }
    2834           2 :         if result.Err != nil { // covers failures from both compactAndWrite and makeVersionEdit
    2835           2 :                 // Delete any created tables.
    2836           2 :                 obsoleteFiles := make([]*fileBacking, 0, len(result.Tables))
    2837           2 :                 d.mu.Lock() // d.mu was released above; take it just for the d.mu.versions updates
    2838           2 :                 for i := range result.Tables {
    2839           2 :                         backing := &fileBacking{
    2840           2 :                                 DiskFileNum: result.Tables[i].ObjMeta.DiskFileNum,
    2841           2 :                                 Size:        result.Tables[i].WriterMeta.Size,
    2842           2 :                         }
    2843           2 :                         obsoleteFiles = append(obsoleteFiles, backing)
    2844           2 :                         // Add this file to zombie tables as well, as the versionSet
    2845           2 :                         // asserts on whether every obsolete file was at one point
    2846           2 :                         // marked zombie.
    2847           2 :                         d.mu.versions.zombieTables[backing.DiskFileNum] = tableInfo{
    2848           2 :                                 fileInfo: fileInfo{
    2849           2 :                                         FileNum:  backing.DiskFileNum,
    2850           2 :                                         FileSize: backing.Size,
    2851           2 :                                 },
    2852           2 :                                 isLocal: true,
    2853           2 :                         }
    2854           2 :                 }
    2855           2 :                 d.mu.versions.addObsoleteLocked(obsoleteFiles)
    2856           2 :                 d.mu.Unlock()
    2857             :         }
    2858             :         // Refresh the disk available statistic whenever a compaction/flush
    2859             :         // completes, before the deferred Lock re-acquires the mutex.
    2860           2 :         d.calculateDiskAvailableBytes()
    2861           2 :         return ve, result.Stats, result.Err
    2862             : }
    2863             : 
    2864             : // compactAndWrite runs the data part of a compaction, where we set up a
    2865             : // compaction iterator and use it to write output tables. Failures are reported through result.Err.
    2866             : func (d *DB) compactAndWrite(
    2867             :         jobID JobID, c *compaction, snapshots compact.Snapshots, tableFormat sstable.TableFormat,
    2868           2 : ) (result compact.Result) {
    2869           2 :         // Compactions use a pool of buffers to read blocks, avoiding polluting the
    2870           2 :         // block cache with blocks that will not be read again. We initialize the
    2871           2 :         // buffer pool with a size 12. This initial size does not need to be
    2872           2 :         // accurate, because the pool will grow to accommodate the maximum number of
    2873           2 :         // blocks allocated at a given time over the course of the compaction. But
    2874           2 :         // choosing a size larger than that working set avoids any additional
    2875           2 :         // allocations to grow the size of the pool over the course of iteration.
    2876           2 :         //
    2877           2 :         // Justification for initial size 12: In a two-level compaction, at any
    2878           2 :         // given moment we'll have 2 index blocks in-use and 2 data blocks in-use.
    2879           2 :         // Additionally, when decoding a compressed block, we'll temporarily
    2880           2 :         // allocate 1 additional block to hold the compressed buffer. In the worst
    2881           2 :         // case that all input sstables have two-level index blocks (+2), value
    2882           2 :         // blocks (+2), range deletion blocks (+n) and range key blocks (+n), we'll
    2883           2 :         // additionally require 2n+4 blocks where n is the number of input sstables.
    2884           2 :         // Range deletion and range key blocks are relatively rare, and the cost of
    2885           2 :         // an additional allocation or two over the course of the compaction is
    2886           2 :         // considered to be okay. A larger initial size would cause the pool to hold
    2887           2 :         // on to more memory, even when it's not in-use because the pool will
    2888           2 :         // recycle buffers up to the current capacity of the pool. The memory use of
    2889           2 :         // a 12-buffer pool is expected to be within reason, even if all the buffers
    2890           2 :         // grow to the typical size of an index block (256 KiB) which would
    2891           2 :         // translate to 3 MiB per compaction.
    2892           2 :         c.bufferPool.Init(12)
    2893           2 :         defer c.bufferPool.Release()
    2894           2 : 
    2895           2 :         pointIter, rangeDelIter, rangeKeyIter, err := c.newInputIters(d.newIters, d.tableNewRangeKeyIter)
    2896           2 :         defer func() { // registered before the err check, so c.closers are closed even if newInputIters failed
    2897           2 :                 for _, closer := range c.closers {
    2898           2 :                         closer.FragmentIterator.Close()
    2899           2 :                 }
    2900             :         }()
    2901           2 :         if err != nil {
    2902           0 :                 return compact.Result{Err: err}
    2903           0 :         }
    2904           2 :         c.allowedZeroSeqNum = c.allowZeroSeqNum() // computed once and reused in the iterator config below
    2905           2 :         cfg := compact.IterConfig{
    2906           2 :                 Comparer:                               c.comparer,
    2907           2 :                 Merge:                                  d.merge,
    2908           2 :                 TombstoneElision:                       c.delElision,
    2909           2 :                 RangeKeyElision:                        c.rangeKeyElision,
    2910           2 :                 Snapshots:                              snapshots,
    2911           2 :                 AllowZeroSeqNum:                        c.allowedZeroSeqNum,
    2912           2 :                 IneffectualSingleDeleteCallback:        d.opts.Experimental.IneffectualSingleDeleteCallback,
    2913           2 :                 SingleDeleteInvariantViolationCallback: d.opts.Experimental.SingleDeleteInvariantViolationCallback,
    2914           2 :         }
    2915           2 :         iter := compact.NewIter(cfg, pointIter, rangeDelIter, rangeKeyIter)
    2916           2 : 
    2917           2 :         runnerCfg := compact.RunnerConfig{
    2918           2 :                 CompactionBounds:           base.UserKeyBoundsFromInternal(c.smallest, c.largest),
    2919           2 :                 L0SplitKeys:                c.l0Limits,
    2920           2 :                 Grandparents:               c.grandparents,
    2921           2 :                 MaxGrandparentOverlapBytes: c.maxOverlapBytes,
    2922           2 :                 TargetOutputFileSize:       c.maxOutputFileSize,
    2923           2 :                 Slot:                       c.slot,
    2924           2 :                 IteratorStats:              &c.stats,
    2925           2 :         }
    2926           2 :         runner := compact.NewRunner(runnerCfg, iter)
    2927           2 :         for runner.MoreDataToWrite() { // one iteration per output table
    2928           2 :                 if c.cancel.Load() { // cancellation is polled before each output table
    2929           2 :                         return runner.Finish().WithError(ErrCancelledCompaction)
    2930           2 :                 }
    2931             :                 // Create a new table.
    2932           2 :                 writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat)
    2933           2 :                 objMeta, tw, cpuWorkHandle, err := d.newCompactionOutput(jobID, c, writerOpts)
    2934           2 :                 if err != nil {
    2935           1 :                         return runner.Finish().WithError(err)
    2936           1 :                 }
    2937           2 :                 runner.WriteTable(objMeta, tw)
    2938           2 :                 d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle) // report the handle from newCompactionOutput as done
    2939             :         }
    2940           2 :         result = runner.Finish()
    2941           2 :         if result.Err == nil { // only sync the object provider when the run itself succeeded
    2942           2 :                 result.Err = d.objProvider.Sync()
    2943           2 :         }
    2944           2 :         return result
    2945             : }
    2946             : 
    2947             : // makeVersionEdit creates the version edit for a compaction, based on the
    2948             : // tables in compact.Result. As a side effect it replaces c.metrics with freshly built per-level entries.
    2949           2 : func (c *compaction) makeVersionEdit(result compact.Result) (*versionEdit, error) {
    2950           2 :         ve := &versionEdit{
    2951           2 :                 DeletedFiles: map[deletedFileEntry]*fileMetadata{},
    2952           2 :         }
    2953           2 :         for _, cl := range c.inputs { // every input file of every input level is marked deleted
    2954           2 :                 iter := cl.files.Iter()
    2955           2 :                 for f := iter.First(); f != nil; f = iter.Next() {
    2956           2 :                         ve.DeletedFiles[deletedFileEntry{
    2957           2 :                                 Level:   cl.level,
    2958           2 :                                 FileNum: f.FileNum,
    2959           2 :                         }] = f
    2960           2 :                 }
    2961             :         }
    2962             : 
    2963           2 :         startLevelBytes := c.startLevel.files.SizeSum()
    2964           2 :         outputMetrics := &LevelMetrics{
    2965           2 :                 BytesIn:   startLevelBytes,
    2966           2 :                 BytesRead: c.outputLevel.files.SizeSum(),
    2967           2 :         }
    2968           2 :         if len(c.extraLevels) > 0 {
    2969           2 :                 outputMetrics.BytesIn += c.extraLevels[0].files.SizeSum() // only the first extra level is counted
    2970           2 :         }
    2971           2 :         outputMetrics.BytesRead += outputMetrics.BytesIn // BytesRead = output-level bytes + input bytes
    2972           2 : 
    2973           2 :         c.metrics = map[int]*LevelMetrics{
    2974           2 :                 c.outputLevel.level: outputMetrics,
    2975           2 :         }
    2976           2 :         if len(c.flushing) == 0 && c.metrics[c.startLevel.level] == nil { // entry is nil unless startLevel and outputLevel are the same level
    2977           2 :                 c.metrics[c.startLevel.level] = &LevelMetrics{}
    2978           2 :         }
    2979           2 :         if len(c.extraLevels) > 0 {
    2980           2 :                 c.metrics[c.extraLevels[0].level] = &LevelMetrics{}
    2981           2 :                 outputMetrics.MultiLevel.BytesInTop = startLevelBytes
    2982           2 :                 outputMetrics.MultiLevel.BytesIn = outputMetrics.BytesIn
    2983           2 :                 outputMetrics.MultiLevel.BytesRead = outputMetrics.BytesRead
    2984           2 :         }
    2985             : 
    2986           2 :         inputLargestSeqNumAbsolute := c.inputLargestSeqNumAbsolute()
    2987           2 :         ve.NewFiles = make([]newFileEntry, len(result.Tables)) // one new file per output table, all at the output level
    2988           2 :         for i := range result.Tables {
    2989           2 :                 t := &result.Tables[i]
    2990           2 : 
    2991           2 :                 fileMeta := &fileMetadata{
    2992           2 :                         FileNum:        base.PhysicalTableFileNum(t.ObjMeta.DiskFileNum),
    2993           2 :                         CreationTime:   t.CreationTime.Unix(),
    2994           2 :                         Size:           t.WriterMeta.Size,
    2995           2 :                         SmallestSeqNum: t.WriterMeta.SmallestSeqNum,
    2996           2 :                         LargestSeqNum:  t.WriterMeta.LargestSeqNum,
    2997           2 :                 }
    2998           2 :                 if c.flushing == nil {
    2999           2 :                         // Set the file's LargestSeqNumAbsolute to be the maximum value of any
    3000           2 :                         // of the compaction's input sstables.
    3001           2 :                         // TODO(jackson): This could be narrowed to be the maximum of input
    3002           2 :                         // sstables that overlap the output sstable's key range.
    3003           2 :                         fileMeta.LargestSeqNumAbsolute = inputLargestSeqNumAbsolute
    3004           2 :                 } else {
    3005           2 :                         fileMeta.LargestSeqNumAbsolute = t.WriterMeta.LargestSeqNum // flush: use the written table's own largest sequence number
    3006           2 :                 }
    3007           2 :                 fileMeta.InitPhysicalBacking()
    3008           2 : 
    3009           2 :                 // If the file didn't contain any range deletions, we can fill its
    3010           2 :                 // table stats now, avoiding unnecessarily loading the table later.
    3011           2 :                 maybeSetStatsFromProperties(
    3012           2 :                         fileMeta.PhysicalMeta(), &t.WriterMeta.Properties,
    3013           2 :                 )
    3014           2 : 
    3015           2 :                 if t.WriterMeta.HasPointKeys {
    3016           2 :                         fileMeta.ExtendPointKeyBounds(c.cmp, t.WriterMeta.SmallestPoint, t.WriterMeta.LargestPoint)
    3017           2 :                 }
    3018           2 :                 if t.WriterMeta.HasRangeDelKeys { // range deletions extend the point-key bounds, not the range-key bounds
    3019           2 :                         fileMeta.ExtendPointKeyBounds(c.cmp, t.WriterMeta.SmallestRangeDel, t.WriterMeta.LargestRangeDel)
    3020           2 :                 }
    3021           2 :                 if t.WriterMeta.HasRangeKeys {
    3022           2 :                         fileMeta.ExtendRangeKeyBounds(c.cmp, t.WriterMeta.SmallestRangeKey, t.WriterMeta.LargestRangeKey)
    3023           2 :                 }
    3024             : 
    3025           2 :                 ve.NewFiles[i] = newFileEntry{
    3026           2 :                         Level: c.outputLevel.level,
    3027           2 :                         Meta:  fileMeta,
    3028           2 :                 }
    3029           2 : 
    3030           2 :                 // Update metrics: compactions and flushes are counted in separate fields.
    3031           2 :                 if c.flushing == nil {
    3032           2 :                         outputMetrics.TablesCompacted++
    3033           2 :                         outputMetrics.BytesCompacted += fileMeta.Size
    3034           2 :                 } else {
    3035           2 :                         outputMetrics.TablesFlushed++
    3036           2 :                         outputMetrics.BytesFlushed += fileMeta.Size
    3037           2 :                 }
    3038           2 :                 outputMetrics.Size += int64(fileMeta.Size)
    3039           2 :                 outputMetrics.NumFiles++
    3040           2 :                 outputMetrics.Additional.BytesWrittenDataBlocks += t.WriterMeta.Properties.DataSize
    3041           2 :                 outputMetrics.Additional.BytesWrittenValueBlocks += t.WriterMeta.Properties.ValueBlocksSize
    3042             :         }
    3043             : 
    3044             :         // Sanity check that the tables are ordered and don't overlap: each table's
    3045           2 :         for i := 1; i < len(ve.NewFiles); i++ { // end bound must not be an upper bound for the next table's smallest user key
    3046           2 :                 if ve.NewFiles[i-1].Meta.UserKeyBounds().End.IsUpperBoundFor(c.cmp, ve.NewFiles[i].Meta.Smallest.UserKey) {
    3047           0 :                         return nil, base.AssertionFailedf("pebble: compaction output tables overlap: %s and %s",
    3048           0 :                                 ve.NewFiles[i-1].Meta.DebugString(c.formatKey, true),
    3049           0 :                                 ve.NewFiles[i].Meta.DebugString(c.formatKey, true),
    3050           0 :                         )
    3051           0 :                 }
    3052             :         }
    3053             : 
    3054           2 :         return ve, nil
    3055             : }
    3056             : 
    3057             : // newCompactionOutput creates an object for a new table produced by a
    3058             : // compaction or flush, returning its metadata, a raw sstable writer for it and a CPU work handle.
    3059             : func (d *DB) newCompactionOutput(
    3060             :         jobID JobID, c *compaction, writerOpts sstable.WriterOptions,
    3061           2 : ) (objstorage.ObjectMetadata, sstable.RawWriter, CPUWorkHandle, error) {
    3062           2 :         diskFileNum := d.mu.versions.getNextDiskFileNum()
    3063           2 : 
    3064           2 :         var writeCategory vfs.DiskWriteCategory
    3065           2 :         if d.opts.EnableSQLRowSpillMetrics { // takes precedence over the flush/compaction distinction
    3066           0 :                 // In the scenario that the Pebble engine is used for SQL row spills the
    3067           0 :                 // data written to the memtable will correspond to spills to disk and
    3068           0 :                 // should be categorized as such.
    3069           0 :                 writeCategory = "sql-row-spill"
    3070           2 :         } else if c.kind == compactionKindFlush {
    3071           2 :                 writeCategory = "pebble-memtable-flush"
    3072           2 :         } else {
    3073           2 :                 writeCategory = "pebble-compaction"
    3074           2 :         }
    3075             : 
    3076           2 :         var reason string // reported in the TableCreated event below
    3077           2 :         if c.kind == compactionKindFlush {
    3078           2 :                 reason = "flushing"
    3079           2 :         } else {
    3080           2 :                 reason = "compacting"
    3081           2 :         }
    3082             : 
    3083           2 :         ctx := context.TODO()
    3084           2 :         if objiotracing.Enabled { // annotate ctx with the output level and the flush/compaction reason
    3085           0 :                 ctx = objiotracing.WithLevel(ctx, c.outputLevel.level)
    3086           0 :                 if c.kind == compactionKindFlush {
    3087           0 :                         ctx = objiotracing.WithReason(ctx, objiotracing.ForFlush)
    3088           0 :                 } else {
    3089           0 :                         ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction)
    3090           0 :                 }
    3091             :         }
    3092             : 
    3093             :         // Prefer shared storage if remote.ShouldCreateShared says so for the output level.
    3094           2 :         createOpts := objstorage.CreateOptions{
    3095           2 :                 PreferSharedStorage: remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level),
    3096           2 :                 WriteCategory:       writeCategory,
    3097           2 :         }
    3098           2 :         writable, objMeta, err := d.objProvider.Create(ctx, fileTypeTable, diskFileNum, createOpts)
    3099           2 :         if err != nil {
    3100           1 :                 return objstorage.ObjectMetadata{}, nil, nil, err
    3101           1 :         }
    3102             : 
    3103           2 :         if c.kind != compactionKindFlush { // non-flush outputs are wrapped in a compactionWritable that is handed &c.bytesWritten
    3104           2 :                 writable = &compactionWritable{
    3105           2 :                         Writable: writable,
    3106           2 :                         versions: d.mu.versions,
    3107           2 :                         written:  &c.bytesWritten,
    3108           2 :                 }
    3109           2 :         }
    3110           2 :         d.opts.EventListener.TableCreated(TableCreateInfo{
    3111           2 :                 JobID:   int(jobID),
    3112           2 :                 Reason:  reason,
    3113           2 :                 Path:    d.objProvider.Path(objMeta),
    3114           2 :                 FileNum: diskFileNum,
    3115           2 :         })
    3116           2 : 
    3117           2 :         writerOpts.SetInternal(sstableinternal.WriterOptions{
    3118           2 :                 CacheOpts: sstableinternal.CacheOptions{
    3119           2 :                         Cache:   d.opts.Cache,
    3120           2 :                         CacheID: d.cacheID,
    3121           2 :                         FileNum: diskFileNum,
    3122           2 :                 },
    3123           2 :         })
    3124           2 : 
    3125           2 :         const MaxFileWriteAdditionalCPUTime = time.Millisecond * 100
    3126           2 :         cpuWorkHandle := d.opts.Experimental.CPUWorkPermissionGranter.GetPermission(
    3127           2 :                 MaxFileWriteAdditionalCPUTime,
    3128           2 :         )
    3129           2 :         writerOpts.Parallelism =
    3130           2 :                 d.opts.Experimental.MaxWriterConcurrency > 0 &&
    3131           2 :                         (cpuWorkHandle.Permitted() || d.opts.Experimental.ForceWriterParallelism) // needs MaxWriterConcurrency > 0 plus a granted permission or the force flag
    3132           2 : 
    3133           2 :         // TODO(jackson): Make the compaction body generic over the RawWriter type,
    3134           2 :         // so that we don't need to pay the cost of dynamic dispatch?
    3135           2 :         tw := sstable.NewRawWriter(writable, writerOpts)
    3136           2 :         return objMeta, tw, cpuWorkHandle, nil // cpuWorkHandle is returned so the caller can pass it to CPUWorkDone
    3137             : }
    3138             : 
    3139             : // validateVersionEdit validates that start and end keys across new and deleted
    3140             : // files in a versionEdit pass the given validation function; any failure is reported via logger.Fatalf.
    3141             : func validateVersionEdit(
    3142             :         ve *versionEdit, validateFn func([]byte) error, format base.FormatKey, logger Logger,
    3143           2 : ) {
    3144           2 :         validateKey := func(f *manifest.FileMetadata, key []byte) { // checks one key and names the offending file on failure
    3145           2 :                 if err := validateFn(key); err != nil {
    3146           1 :                         logger.Fatalf("pebble: version edit validation failed (key=%s file=%s): %v", format(key), f, err)
    3147           1 :                 }
    3148             :         }
    3149             : 
    3150             :         // Validate the smallest and largest user keys of both new and deleted files.
    3151           2 :         for _, f := range ve.NewFiles {
    3152           2 :                 validateKey(f.Meta, f.Meta.Smallest.UserKey)
    3153           2 :                 validateKey(f.Meta, f.Meta.Largest.UserKey)
    3154           2 :         }
    3155           2 :         for _, m := range ve.DeletedFiles {
    3156           2 :                 validateKey(m, m.Smallest.UserKey)
    3157           2 :                 validateKey(m, m.Largest.UserKey)
    3158           2 :         }
    3159             : }

Generated by: LCOV version 1.14