LCOV - code coverage report
Current view: top level - pebble - table_stats.go (source / functions) Hit Total Coverage
Test: 2024-03-12 08:15Z 635c6003 - tests + meta.lcov Lines: 638 702 90.9 %
Date: 2024-03-12 08:17:02 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package pebble
       6             : 
       7             : import (
       8             :         "fmt"
       9             :         "math"
      10             : 
      11             :         "github.com/cockroachdb/errors"
      12             :         "github.com/cockroachdb/pebble/internal/base"
      13             :         "github.com/cockroachdb/pebble/internal/invariants"
      14             :         "github.com/cockroachdb/pebble/internal/keyspan"
      15             :         "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl"
      16             :         "github.com/cockroachdb/pebble/internal/manifest"
      17             :         "github.com/cockroachdb/pebble/sstable"
      18             : )
      19             : 
      20             : // In-memory statistics about tables help inform compaction picking, but may
      21             : // be expensive to calculate or load from disk. Every time a database is
      22             : // opened, these statistics must be reloaded or recalculated. To minimize
      23             : // impact on user activity and compactions, we load these statistics
      24             : // asynchronously in the background and store loaded statistics in each
      25             : // table's *FileMetadata.
      26             : //
      27             : // This file implements the asynchronous loading of statistics by maintaining
      28             : // a list of files that require statistics, alongside their LSM levels.
      29             : // Whenever new files are added to the LSM, the files are appended to
      30             : // d.mu.tableStats.pending. If a stats collection job is not currently
      31             : // running, one is started in a separate goroutine.
      32             : //
      33             : // The stats collection job grabs and clears the pending list, computes table
      34             : // statistics relative to the current readState and updates the tables' file
      35             : // metadata. New pending files may accumulate during a stats collection job,
      36             : // so a completing job triggers a new job if necessary. Only one job runs at a
      37             : // time.
      38             : //
      39             : // When an existing database is opened, all files lack in-memory statistics.
      40             : // These files' stats are loaded incrementally whenever the pending list is
      41             : // empty by scanning a current readState for files missing statistics. Once a
      42             : // job completes a scan without finding any remaining files without
      43             : // statistics, it flips a `loadedInitial` flag. From then on, the stats
      44             : // collection job only needs to load statistics for new files appended to the
      45             : // pending list.
      46             : 
      47           2 : func (d *DB) maybeCollectTableStatsLocked() {
      48           2 :         if d.shouldCollectTableStatsLocked() {
      49           2 :                 go d.collectTableStats()
      50           2 :         }
      51             : }
      52             : 
      53             : // updateTableStatsLocked is called when new files are introduced, after the
      54             : // read state has been updated. It may trigger a new stat collection.
      55             : // DB.mu must be locked when calling.
      56           2 : func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
      57           2 :         var needStats bool
      58           2 :         for _, nf := range newFiles {
      59           2 :                 if !nf.Meta.StatsValid() {
      60           2 :                         needStats = true
      61           2 :                         break
      62             :                 }
      63             :         }
      64           2 :         if !needStats {
      65           2 :                 return
      66           2 :         }
      67             : 
      68           2 :         d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
      69           2 :         d.maybeCollectTableStatsLocked()
      70             : }
      71             : 
      72           2 : func (d *DB) shouldCollectTableStatsLocked() bool {
      73           2 :         return !d.mu.tableStats.loading &&
      74           2 :                 d.closed.Load() == nil &&
      75           2 :                 !d.opts.DisableTableStats &&
      76           2 :                 (len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
      77           2 : }
      78             : 
      79             : // collectTableStats runs a table stats collection job, returning true if the
      80             : // invocation did the collection work, false otherwise (e.g. if another job was
      81             : // already running).
      82           2 : func (d *DB) collectTableStats() bool {
      83           2 :         const maxTableStatsPerScan = 50
      84           2 : 
      85           2 :         d.mu.Lock()
      86           2 :         if !d.shouldCollectTableStatsLocked() {
      87           2 :                 d.mu.Unlock()
      88           2 :                 return false
      89           2 :         }
      90             : 
      91           2 :         pending := d.mu.tableStats.pending
      92           2 :         d.mu.tableStats.pending = nil
      93           2 :         d.mu.tableStats.loading = true
      94           2 :         jobID := d.mu.nextJobID
      95           2 :         d.mu.nextJobID++
      96           2 :         loadedInitial := d.mu.tableStats.loadedInitial
      97           2 :         // Drop DB.mu before performing IO.
      98           2 :         d.mu.Unlock()
      99           2 : 
     100           2 :         // Every run of collectTableStats either collects stats from the pending
     101           2 :         // list (if non-empty) or from scanning the version (loadedInitial is
     102           2 :         // false). This job only runs if at least one of those conditions holds.
     103           2 : 
     104           2 :         // Grab a read state to scan for tables.
     105           2 :         rs := d.loadReadState()
     106           2 :         var collected []collectedStats
     107           2 :         var hints []deleteCompactionHint
     108           2 :         if len(pending) > 0 {
     109           2 :                 collected, hints = d.loadNewFileStats(rs, pending)
     110           2 :         } else {
     111           2 :                 var moreRemain bool
     112           2 :                 var buf [maxTableStatsPerScan]collectedStats
     113           2 :                 collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
     114           2 :                 loadedInitial = !moreRemain
     115           2 :         }
     116           2 :         rs.unref()
     117           2 : 
     118           2 :         // Update the FileMetadata with the loaded stats while holding d.mu.
     119           2 :         d.mu.Lock()
     120           2 :         defer d.mu.Unlock()
     121           2 :         d.mu.tableStats.loading = false
     122           2 :         if loadedInitial && !d.mu.tableStats.loadedInitial {
     123           2 :                 d.mu.tableStats.loadedInitial = loadedInitial
     124           2 :                 d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
     125           2 :                         JobID: jobID,
     126           2 :                 })
     127           2 :         }
     128             : 
     129           2 :         maybeCompact := false
     130           2 :         for _, c := range collected {
     131           2 :                 c.fileMetadata.Stats = c.TableStats
     132           2 :                 maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0
     133           2 :                 c.fileMetadata.StatsMarkValid()
     134           2 :         }
     135           2 :         d.mu.tableStats.cond.Broadcast()
     136           2 :         d.maybeCollectTableStatsLocked()
     137           2 :         if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions {
     138           2 :                 // Verify that all of the hint tombstones' files still exist in the
     139           2 :                 // current version. Otherwise, the tombstone itself may have been
     140           2 :                 // compacted into L6 and more recent keys may have had their sequence
     141           2 :                 // numbers zeroed.
     142           2 :                 //
     143           2 :                 // Note that it's possible that the tombstone file is being compacted
     144           2 :                 // presently. In that case, the file will be present in v. When the
     145           2 :                 // compaction finishes compacting the tombstone file, it will detect
     146           2 :                 // and clear the hint.
     147           2 :                 //
     148           2 :                 // See DB.maybeUpdateDeleteCompactionHints.
     149           2 :                 v := d.mu.versions.currentVersion()
     150           2 :                 keepHints := hints[:0]
     151           2 :                 for _, h := range hints {
     152           2 :                         if v.Contains(h.tombstoneLevel, h.tombstoneFile) {
     153           2 :                                 keepHints = append(keepHints, h)
     154           2 :                         }
     155             :                 }
     156           2 :                 d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
     157             :         }
     158           2 :         if maybeCompact {
     159           2 :                 d.maybeScheduleCompaction()
     160           2 :         }
     161           2 :         return true
     162             : }
     163             : 
     164             : type collectedStats struct {
     165             :         *fileMetadata
     166             :         manifest.TableStats
     167             : }
     168             : 
     169             : func (d *DB) loadNewFileStats(
     170             :         rs *readState, pending []manifest.NewFileEntry,
     171           2 : ) ([]collectedStats, []deleteCompactionHint) {
     172           2 :         var hints []deleteCompactionHint
     173           2 :         collected := make([]collectedStats, 0, len(pending))
     174           2 :         for _, nf := range pending {
     175           2 :                 // A file's stats might have been populated by an earlier call to
     176           2 :                 // loadNewFileStats if the file was moved.
     177           2 :                 // NB: We're not holding d.mu which protects f.Stats, but only
     178           2 :                 // collectTableStats updates f.Stats for active files, and we
     179           2 :                 // ensure only one goroutine runs it at a time through
     180           2 :                 // d.mu.tableStats.loading.
     181           2 :                 if nf.Meta.StatsValid() {
     182           2 :                         continue
     183             :                 }
     184             : 
     185             :                 // The file isn't guaranteed to still be live in the readState's
     186             :                 // version. It may have been deleted or moved. Skip it if it's not in
     187             :                 // the expected level.
     188           2 :                 if !rs.current.Contains(nf.Level, nf.Meta) {
     189           2 :                         continue
     190             :                 }
     191             : 
     192           2 :                 stats, newHints, err := d.loadTableStats(
     193           2 :                         rs.current, nf.Level,
     194           2 :                         nf.Meta,
     195           2 :                 )
     196           2 :                 if err != nil {
     197           0 :                         d.opts.EventListener.BackgroundError(err)
     198           0 :                         continue
     199             :                 }
     200             :                 // NB: We don't update the FileMetadata yet, because we aren't
     201             :                 // holding DB.mu. We'll copy it to the FileMetadata after we're
     202             :                 // finished with IO.
     203           2 :                 collected = append(collected, collectedStats{
     204           2 :                         fileMetadata: nf.Meta,
     205           2 :                         TableStats:   stats,
     206           2 :                 })
     207           2 :                 hints = append(hints, newHints...)
     208             :         }
     209           2 :         return collected, hints
     210             : }
     211             : 
     212             : // scanReadStateTableStats is run by an active stat collection job when there
     213             : // are no pending new files, but there might be files that existed at Open for
     214             : // which we haven't loaded table stats.
     215             : func (d *DB) scanReadStateTableStats(
     216             :         rs *readState, fill []collectedStats,
     217           2 : ) ([]collectedStats, []deleteCompactionHint, bool) {
     218           2 :         moreRemain := false
     219           2 :         var hints []deleteCompactionHint
     220           2 :         sizesChecked := make(map[base.DiskFileNum]struct{})
     221           2 :         for l, levelMetadata := range rs.current.Levels {
     222           2 :                 iter := levelMetadata.Iter()
     223           2 :                 for f := iter.First(); f != nil; f = iter.Next() {
     224           2 :                         // NB: We're not holding d.mu which protects f.Stats, but only the
     225           2 :                         // active stats collection job updates f.Stats for active files,
     226           2 :                         // and we ensure only one goroutine runs it at a time through
     227           2 :                         // d.mu.tableStats.loading. This makes it safe to read validity
     228           2 :                         // through f.Stats.ValidLocked despite not holding d.mu.
     229           2 :                         if f.StatsValid() {
     230           2 :                                 continue
     231             :                         }
     232             : 
     233             :                         // Limit how much work we do per read state. The older the read
     234             :                         // state is, the higher the likelihood files are no longer being
     235             :                         // used in the current version. If we've exhausted our allowance,
     236             :                         // return true for the last return value to signal there's more
     237             :                         // work to do.
     238           2 :                         if len(fill) == cap(fill) {
     239           2 :                                 moreRemain = true
     240           2 :                                 return fill, hints, moreRemain
     241           2 :                         }
     242             : 
     243             :                         // If the file is remote and not SharedForeign, we should check if its size
     244             :                         // matches. This is because checkConsistency skips over remote files.
     245             :                         //
     246             :                         // SharedForeign and External files are skipped as their sizes are allowed
     247             :                         // to have a mismatch; the size stored in the FileBacking is just the part
     248             :                         // of the file that is referenced by this Pebble instance, not the size of
     249             :                         // the whole object.
     250           2 :                         objMeta, err := d.objProvider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
     251           2 :                         if err != nil {
     252           0 :                                 // Set `moreRemain` so we'll try again.
     253           0 :                                 moreRemain = true
     254           0 :                                 d.opts.EventListener.BackgroundError(err)
     255           0 :                                 continue
     256             :                         }
     257             : 
     258           2 :                         shouldCheckSize := objMeta.IsRemote() &&
     259           2 :                                 !d.objProvider.IsSharedForeign(objMeta) &&
     260           2 :                                 !objMeta.IsExternal()
     261           2 :                         if _, ok := sizesChecked[f.FileBacking.DiskFileNum]; !ok && shouldCheckSize {
     262           2 :                                 size, err := d.objProvider.Size(objMeta)
     263           2 :                                 fileSize := f.FileBacking.Size
     264           2 :                                 if err != nil {
     265           0 :                                         moreRemain = true
     266           0 :                                         d.opts.EventListener.BackgroundError(err)
     267           0 :                                         continue
     268             :                                 }
     269           2 :                                 if size != int64(fileSize) {
     270           0 :                                         err := errors.Errorf(
     271           0 :                                                 "during consistency check in loadTableStats: L%d: %s: object size mismatch (%s): %d (provider) != %d (MANIFEST)",
     272           0 :                                                 errors.Safe(l), f.FileNum, d.objProvider.Path(objMeta),
     273           0 :                                                 errors.Safe(size), errors.Safe(fileSize))
     274           0 :                                         d.opts.EventListener.BackgroundError(err)
     275           0 :                                         d.opts.Logger.Fatalf("%s", err)
     276           0 :                                 }
     277             : 
     278           2 :                                 sizesChecked[f.FileBacking.DiskFileNum] = struct{}{}
     279             :                         }
     280             : 
     281           2 :                         stats, newHints, err := d.loadTableStats(
     282           2 :                                 rs.current, l, f,
     283           2 :                         )
     284           2 :                         if err != nil {
     285           0 :                                 // Set `moreRemain` so we'll try again.
     286           0 :                                 moreRemain = true
     287           0 :                                 d.opts.EventListener.BackgroundError(err)
     288           0 :                                 continue
     289             :                         }
     290           2 :                         fill = append(fill, collectedStats{
     291           2 :                                 fileMetadata: f,
     292           2 :                                 TableStats:   stats,
     293           2 :                         })
     294           2 :                         hints = append(hints, newHints...)
     295             :                 }
     296             :         }
     297           2 :         return fill, hints, moreRemain
     298             : }
     299             : 
     300             : func (d *DB) loadTableStats(
     301             :         v *version, level int, meta *fileMetadata,
     302           2 : ) (manifest.TableStats, []deleteCompactionHint, error) {
     303           2 :         var stats manifest.TableStats
     304           2 :         var compactionHints []deleteCompactionHint
     305           2 :         err := d.tableCache.withCommonReader(
     306           2 :                 meta, func(r sstable.CommonReader) (err error) {
     307           2 :                         props := r.CommonProperties()
     308           2 :                         stats.NumEntries = props.NumEntries
     309           2 :                         stats.NumDeletions = props.NumDeletions
     310           2 :                         if props.NumPointDeletions() > 0 {
     311           2 :                                 if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil {
     312           0 :                                         return
     313           0 :                                 }
     314             :                         }
     315           2 :                         if props.NumRangeDeletions > 0 || props.NumRangeKeyDels > 0 {
     316           2 :                                 if compactionHints, err = d.loadTableRangeDelStats(
     317           2 :                                         r, v, level, meta, &stats,
     318           2 :                                 ); err != nil {
     319           0 :                                         return
     320           0 :                                 }
     321             :                         }
     322             :                         // TODO(travers): Once we have real-world data, consider collecting
     323             :                         // additional stats that may provide improved heuristics for compaction
     324             :                         // picking.
     325           2 :                         stats.NumRangeKeySets = props.NumRangeKeySets
     326           2 :                         stats.ValueBlocksSize = props.ValueBlocksSize
     327           2 :                         return
     328             :                 })
     329           2 :         if err != nil {
     330           0 :                 return stats, nil, err
     331           0 :         }
     332           2 :         return stats, compactionHints, nil
     333             : }
     334             : 
     335             : // loadTablePointKeyStats calculates the point key statistics for the given
     336             : // table. The provided manifest.TableStats are updated.
     337             : func (d *DB) loadTablePointKeyStats(
     338             :         props *sstable.CommonProperties,
     339             :         v *version,
     340             :         level int,
     341             :         meta *fileMetadata,
     342             :         stats *manifest.TableStats,
     343           2 : ) error {
     344           2 :         // TODO(jackson): If the file has a wide keyspace, the average
     345           2 :         // value size beneath the entire file might not be representative
     346           2 :         // of the size of the keys beneath the point tombstones.
     347           2 :         // We could write the ranges of 'clusters' of point tombstones to
     348           2 :         // a sstable property and call averageValueSizeBeneath for each of
     349           2 :         // these narrower ranges to improve the estimate.
     350           2 :         avgValLogicalSize, compressionRatio, err := d.estimateSizesBeneath(v, level, meta, props)
     351           2 :         if err != nil {
     352           0 :                 return err
     353           0 :         }
     354           2 :         stats.PointDeletionsBytesEstimate =
     355           2 :                 pointDeletionsBytesEstimate(meta.Size, props, avgValLogicalSize, compressionRatio)
     356           2 :         return nil
     357             : }
     358             : 
     359             : // loadTableRangeDelStats calculates the range deletion and range key deletion
     360             : // statistics for the given table.
     361             : func (d *DB) loadTableRangeDelStats(
     362             :         r sstable.CommonReader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
     363           2 : ) ([]deleteCompactionHint, error) {
     364           2 :         iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
     365           2 :         if err != nil {
     366           0 :                 return nil, err
     367           0 :         }
     368           2 :         defer iter.Close()
     369           2 :         var compactionHints []deleteCompactionHint
     370           2 :         // We iterate over the defragmented range tombstones and range key deletions,
     371           2 :         // which ensures we don't double count ranges deleted at different sequence
     372           2 :         // numbers. Also, merging abutting tombstones reduces the number of calls to
     373           2 :         // estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
     374           2 :         // our overall estimate.
     375           2 :         s, err := iter.First()
     376           2 :         for ; s != nil; s, err = iter.Next() {
     377           2 :                 start, end := s.Start, s.End
     378           2 :                 // We only need to consider deletion size estimates for tables that contain
     379           2 :                 // RANGEDELs.
     380           2 :                 var maxRangeDeleteSeqNum uint64
     381           2 :                 for _, k := range s.Keys {
     382           2 :                         if k.Kind() == base.InternalKeyKindRangeDelete && maxRangeDeleteSeqNum < k.SeqNum() {
     383           2 :                                 maxRangeDeleteSeqNum = k.SeqNum()
     384           2 :                                 break
     385             :                         }
     386             :                 }
     387             : 
     388             :                 // If the file is in the last level of the LSM, there is no data beneath
     389             :                 // it. The fact that there is still a range tombstone in a bottommost file
     390             :                 // indicates two possibilites:
     391             :                 //   1. an open snapshot kept the tombstone around, and the data the
     392             :                 //      tombstone deletes is contained within the file itself.
     393             :                 //   2. the file was ingested.
     394             :                 // In the first case, we'd like to estimate disk usage within the file
     395             :                 // itself since compacting the file will drop that covered data. In the
     396             :                 // second case, we expect that compacting the file will NOT drop any
     397             :                 // data and rewriting the file is a waste of write bandwidth. We can
     398             :                 // distinguish these cases by looking at the file metadata's sequence
     399             :                 // numbers. A file's range deletions can only delete data within the
     400             :                 // file at lower sequence numbers. All keys in an ingested sstable adopt
     401             :                 // the same sequence number, preventing tombstones from deleting keys
     402             :                 // within the same file. We check here if the largest RANGEDEL sequence
     403             :                 // number is greater than the file's smallest sequence number. If it is,
     404             :                 // the RANGEDEL could conceivably (although inconclusively) delete data
     405             :                 // within the same file.
     406             :                 //
     407             :                 // Note that this heuristic is imperfect. If a table containing a range
     408             :                 // deletion is ingested into L5 and subsequently compacted into L6 but
     409             :                 // an open snapshot prevents elision of covered keys in L6, the
     410             :                 // resulting RangeDeletionsBytesEstimate will incorrectly include all
     411             :                 // covered keys.
     412             :                 //
     413             :                 // TODO(jackson): We could prevent the above error in the heuristic by
     414             :                 // computing the file's RangeDeletionsBytesEstimate during the
     415             :                 // compaction itself. It's unclear how common this is.
     416             :                 //
     417             :                 // NOTE: If the span `s` wholly contains a table containing range keys,
     418             :                 // the returned size estimate will be slightly inflated by the range key
     419             :                 // block. However, in practice, range keys are expected to be rare, and
     420             :                 // the size of the range key block relative to the overall size of the
     421             :                 // table is expected to be small.
     422           2 :                 if level == numLevels-1 && meta.SmallestSeqNum < maxRangeDeleteSeqNum {
     423           2 :                         size, err := r.EstimateDiskUsage(start, end)
     424           2 :                         if err != nil {
     425           0 :                                 return nil, err
     426           0 :                         }
     427           2 :                         stats.RangeDeletionsBytesEstimate += size
     428           2 : 
     429           2 :                         // As the file is in the bottommost level, there is no need to collect a
     430           2 :                         // deletion hint.
     431           2 :                         continue
     432             :                 }
     433             : 
     434             :                 // While the size estimates for point keys should only be updated if this
     435             :                 // span contains a range del, the sequence numbers are required for the
     436             :                 // hint. Unconditionally descend, but conditionally update the estimates.
     437           2 :                 hintType := compactionHintFromKeys(s.Keys)
     438           2 :                 estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
     439           2 :                 if err != nil {
     440           0 :                         return nil, err
     441           0 :                 }
     442           2 :                 stats.RangeDeletionsBytesEstimate += estimate
     443           2 : 
     444           2 :                 // If any files were completely contained with the range,
     445           2 :                 // hintSeqNum is the smallest sequence number contained in any
     446           2 :                 // such file.
     447           2 :                 if hintSeqNum == math.MaxUint64 {
     448           2 :                         continue
     449             :                 }
     450           2 :                 hint := deleteCompactionHint{
     451           2 :                         hintType:                hintType,
     452           2 :                         start:                   make([]byte, len(start)),
     453           2 :                         end:                     make([]byte, len(end)),
     454           2 :                         tombstoneFile:           meta,
     455           2 :                         tombstoneLevel:          level,
     456           2 :                         tombstoneLargestSeqNum:  s.LargestSeqNum(),
     457           2 :                         tombstoneSmallestSeqNum: s.SmallestSeqNum(),
     458           2 :                         fileSmallestSeqNum:      hintSeqNum,
     459           2 :                 }
     460           2 :                 copy(hint.start, start)
     461           2 :                 copy(hint.end, end)
     462           2 :                 compactionHints = append(compactionHints, hint)
     463             :         }
     464           2 :         if err != nil {
     465           0 :                 return nil, err
     466           0 :         }
     467           2 :         return compactionHints, nil
     468             : }
     469             : 
     470             : func (d *DB) estimateSizesBeneath(
     471             :         v *version, level int, meta *fileMetadata, fileProps *sstable.CommonProperties,
     472           2 : ) (avgValueLogicalSize, compressionRatio float64, err error) {
     473           2 :         // Find all files in lower levels that overlap with meta,
     474           2 :         // summing their value sizes and entry counts.
     475           2 :         file := meta
     476           2 :         var fileSum, keySum, valSum, entryCount uint64
     477           2 :         // Include the file itself. This is important because in some instances, the
     478           2 :         // computed compression ratio is applied to the tombstones contained within
     479           2 :         // `meta` itself. If there are no files beneath `meta` in the LSM, we would
     480           2 :         // calculate a compression ratio of 0 which is not accurate for the file's
     481           2 :         // own tombstones.
     482           2 :         fileSum += file.Size
     483           2 :         entryCount += fileProps.NumEntries
     484           2 :         keySum += fileProps.RawKeySize
     485           2 :         valSum += fileProps.RawValueSize
     486           2 : 
     487           2 :         addPhysicalTableStats := func(r *sstable.Reader) (err error) {
     488           2 :                 fileSum += file.Size
     489           2 :                 entryCount += r.Properties.NumEntries
     490           2 :                 keySum += r.Properties.RawKeySize
     491           2 :                 valSum += r.Properties.RawValueSize
     492           2 :                 return nil
     493           2 :         }
     494           2 :         addVirtualTableStats := func(v sstable.VirtualReader) (err error) {
     495           1 :                 fileSum += file.Size
     496           1 :                 entryCount += file.Stats.NumEntries
     497           1 :                 keySum += v.Properties.RawKeySize
     498           1 :                 valSum += v.Properties.RawValueSize
     499           1 :                 return nil
     500           1 :         }
     501             : 
     502           2 :         for l := level + 1; l < numLevels; l++ {
     503           2 :                 overlaps := v.Overlaps(l, meta.Smallest.UserKey,
     504           2 :                         meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
     505           2 :                 iter := overlaps.Iter()
     506           2 :                 for file = iter.First(); file != nil; file = iter.Next() {
     507           2 :                         var err error
     508           2 :                         if file.Virtual {
     509           1 :                                 err = d.tableCache.withVirtualReader(file.VirtualMeta(), addVirtualTableStats)
     510           2 :                         } else {
     511           2 :                                 err = d.tableCache.withReader(file.PhysicalMeta(), addPhysicalTableStats)
     512           2 :                         }
     513           2 :                         if err != nil {
     514           0 :                                 return 0, 0, err
     515           0 :                         }
     516             :                 }
     517             :         }
     518           2 :         if entryCount == 0 {
     519           0 :                 return 0, 0, nil
     520           0 :         }
     521             :         // RawKeySize and RawValueSize are uncompressed totals. We'll need to scale
     522             :         // the value sum according to the data size to account for compression,
     523             :         // index blocks and metadata overhead. Eg:
     524             :         //
     525             :         //    Compression rate        ×  Average uncompressed value size
     526             :         //
     527             :         //                            ↓
     528             :         //
     529             :         //         FileSize              RawValueSize
     530             :         //   -----------------------  ×  ------------
     531             :         //   RawKeySize+RawValueSize     NumEntries
     532             :         //
     533             :         // We return the average logical value size plus the compression ratio,
     534             :         // leaving the scaling to the caller. This allows the caller to perform
     535             :         // additional compression ratio scaling if necessary.
     536           2 :         uncompressedSum := float64(keySum + valSum)
     537           2 :         compressionRatio = float64(fileSum) / uncompressedSum
     538           2 :         avgValueLogicalSize = (float64(valSum) / float64(entryCount))
     539           2 :         return avgValueLogicalSize, compressionRatio, nil
     540             : }
     541             : 
     542             : func (d *DB) estimateReclaimedSizeBeneath(
     543             :         v *version, level int, start, end []byte, hintType deleteCompactionHintType,
     544           2 : ) (estimate uint64, hintSeqNum uint64, err error) {
     545           2 :         // Find all files in lower levels that overlap with the deleted range
     546           2 :         // [start, end).
     547           2 :         //
     548           2 :         // An overlapping file might be completely contained by the range
     549           2 :         // tombstone, in which case we can count the entire file size in
     550           2 :         // our estimate without doing any additional I/O.
     551           2 :         //
     552           2 :         // Otherwise, estimating the range for the file requires
     553           2 :         // additional I/O to read the file's index blocks.
     554           2 :         hintSeqNum = math.MaxUint64
     555           2 :         for l := level + 1; l < numLevels; l++ {
     556           2 :                 overlaps := v.Overlaps(l, start, end, true /* exclusiveEnd */)
     557           2 :                 iter := overlaps.Iter()
     558           2 :                 for file := iter.First(); file != nil; file = iter.Next() {
     559           2 :                         startCmp := d.cmp(start, file.Smallest.UserKey)
     560           2 :                         endCmp := d.cmp(file.Largest.UserKey, end)
     561           2 :                         if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
     562           2 :                                 // The range fully contains the file, so skip looking it up in table
     563           2 :                                 // cache/looking at its indexes and add the full file size. Whether the
     564           2 :                                 // disk estimate and hint seqnums are updated depends on a) the type of
     565           2 :                                 // hint that requested the estimate and b) the keys contained in this
     566           2 :                                 // current file.
     567           2 :                                 var updateEstimates, updateHints bool
     568           2 :                                 switch hintType {
     569           2 :                                 case deleteCompactionHintTypePointKeyOnly:
     570           2 :                                         // The range deletion byte estimates should only be updated if this
     571           2 :                                         // table contains point keys. This ends up being an overestimate in
     572           2 :                                         // the case that table also has range keys, but such keys are expected
     573           2 :                                         // to contribute a negligible amount of the table's overall size,
     574           2 :                                         // relative to point keys.
     575           2 :                                         if file.HasPointKeys {
     576           2 :                                                 updateEstimates = true
     577           2 :                                         }
     578             :                                         // As the initiating span contained only range dels, hints can only be
     579             :                                         // updated if this table does _not_ contain range keys.
     580           2 :                                         if !file.HasRangeKeys {
     581           2 :                                                 updateHints = true
     582           2 :                                         }
     583           2 :                                 case deleteCompactionHintTypeRangeKeyOnly:
     584           2 :                                         // The initiating span contained only range key dels. The estimates
     585           2 :                                         // apply only to point keys, and are therefore not updated.
     586           2 :                                         updateEstimates = false
     587           2 :                                         // As the initiating span contained only range key dels, hints can
     588           2 :                                         // only be updated if this table does _not_ contain point keys.
     589           2 :                                         if !file.HasPointKeys {
     590           2 :                                                 updateHints = true
     591           2 :                                         }
     592           2 :                                 case deleteCompactionHintTypePointAndRangeKey:
     593           2 :                                         // Always update the estimates and hints, as this hint type can drop a
     594           2 :                                         // file, irrespective of the mixture of keys. Similar to above, the
     595           2 :                                         // range del bytes estimates is an overestimate.
     596           2 :                                         updateEstimates, updateHints = true, true
     597           0 :                                 default:
     598           0 :                                         panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
     599             :                                 }
     600           2 :                                 if updateEstimates {
     601           2 :                                         estimate += file.Size
     602           2 :                                 }
     603           2 :                                 if updateHints && hintSeqNum > file.SmallestSeqNum {
     604           2 :                                         hintSeqNum = file.SmallestSeqNum
     605           2 :                                 }
     606           2 :                         } else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
     607           2 :                                 // Partial overlap.
     608           2 :                                 if hintType == deleteCompactionHintTypeRangeKeyOnly {
     609           2 :                                         // If the hint that generated this overlap contains only range keys,
     610           2 :                                         // there is no need to calculate disk usage, as the reclaimable space
     611           2 :                                         // is expected to be minimal relative to point keys.
     612           2 :                                         continue
     613             :                                 }
     614           2 :                                 var size uint64
     615           2 :                                 var err error
     616           2 :                                 if file.Virtual {
     617           2 :                                         err = d.tableCache.withVirtualReader(
     618           2 :                                                 file.VirtualMeta(), func(r sstable.VirtualReader) (err error) {
     619           2 :                                                         size, err = r.EstimateDiskUsage(start, end)
     620           2 :                                                         return err
     621           2 :                                                 })
     622           2 :                                 } else {
     623           2 :                                         err = d.tableCache.withReader(
     624           2 :                                                 file.PhysicalMeta(), func(r *sstable.Reader) (err error) {
     625           2 :                                                         size, err = r.EstimateDiskUsage(start, end)
     626           2 :                                                         return err
     627           2 :                                                 })
     628             :                                 }
     629             : 
     630           2 :                                 if err != nil {
     631           0 :                                         return 0, hintSeqNum, err
     632           0 :                                 }
     633           2 :                                 estimate += size
     634             :                         }
     635             :                 }
     636             :         }
     637           2 :         return estimate, hintSeqNum, nil
     638             : }
     639             : 
     640           2 : func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) bool {
     641           2 :         // If a table contains range deletions or range key deletions, we defer the
     642           2 :         // stats collection. There are two main reasons for this:
     643           2 :         //
     644           2 :         //  1. Estimating the potential for reclaimed space due to a range deletion
     645           2 :         //     tombstone requires scanning the LSM - a potentially expensive operation
     646           2 :         //     that should be deferred.
     647           2 :         //  2. Range deletions and / or range key deletions present an opportunity to
     648           2 :         //     compute "deletion hints", which also requires a scan of the LSM to
     649           2 :         //     compute tables that would be eligible for deletion.
     650           2 :         //
     651           2 :         // These two tasks are deferred to the table stats collector goroutine.
     652           2 :         if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
     653           2 :                 return false
     654           2 :         }
     655             : 
     656             :         // If a table is more than 10% point deletions without user-provided size
     657             :         // estimates, don't calculate the PointDeletionsBytesEstimate statistic
     658             :         // using our limited knowledge. The table stats collector can populate the
     659             :         // stats and calculate an average of value size of all the tables beneath
     660             :         // the table in the LSM, which will be more accurate.
     661           2 :         if unsizedDels := (props.NumDeletions - props.NumSizedDeletions); unsizedDels > props.NumEntries/10 {
     662           2 :                 return false
     663           2 :         }
     664             : 
     665           2 :         var pointEstimate uint64
     666           2 :         if props.NumEntries > 0 {
     667           2 :                 // Use the file's own average key and value sizes as an estimate. This
     668           2 :                 // doesn't require any additional IO and since the number of point
     669           2 :                 // deletions in the file is low, the error introduced by this crude
     670           2 :                 // estimate is expected to be small.
     671           2 :                 commonProps := &props.CommonProperties
     672           2 :                 avgValSize, compressionRatio := estimatePhysicalSizes(meta.Size, commonProps)
     673           2 :                 pointEstimate = pointDeletionsBytesEstimate(meta.Size, commonProps, avgValSize, compressionRatio)
     674           2 :         }
     675             : 
     676           2 :         meta.Stats.NumEntries = props.NumEntries
     677           2 :         meta.Stats.NumDeletions = props.NumDeletions
     678           2 :         meta.Stats.NumRangeKeySets = props.NumRangeKeySets
     679           2 :         meta.Stats.PointDeletionsBytesEstimate = pointEstimate
     680           2 :         meta.Stats.RangeDeletionsBytesEstimate = 0
     681           2 :         meta.Stats.ValueBlocksSize = props.ValueBlocksSize
     682           2 :         meta.StatsMarkValid()
     683           2 :         return true
     684             : }
     685             : 
     686             : func pointDeletionsBytesEstimate(
     687             :         fileSize uint64, props *sstable.CommonProperties, avgValLogicalSize, compressionRatio float64,
     688           2 : ) (estimate uint64) {
     689           2 :         if props.NumEntries == 0 {
     690           0 :                 return 0
     691           0 :         }
     692           2 :         numPointDels := props.NumPointDeletions()
     693           2 :         if numPointDels == 0 {
     694           2 :                 return 0
     695           2 :         }
     696             :         // Estimate the potential space to reclaim using the table's own properties.
     697             :         // There may or may not be keys covered by any individual point tombstone.
     698             :         // If not, compacting the point tombstone into L6 will at least allow us to
     699             :         // drop the point deletion key and will reclaim the tombstone's key bytes.
     700             :         // If there are covered key(s), we also get to drop key and value bytes for
     701             :         // each covered key.
     702             :         //
     703             :         // Some point tombstones (DELSIZEDs) carry a user-provided estimate of the
     704             :         // uncompressed size of entries that will be elided by fully compacting the
     705             :         // tombstone. For these tombstones, there's no guesswork—we use the
     706             :         // RawPointTombstoneValueSizeHint property which is the sum of all these
     707             :         // tombstones' encoded values.
     708             :         //
     709             :         // For un-sized point tombstones (DELs), we estimate assuming that each
     710             :         // point tombstone on average covers 1 key and using average value sizes.
     711             :         // This is almost certainly an overestimate, but that's probably okay
     712             :         // because point tombstones can slow range iterations even when they don't
     713             :         // cover a key.
     714             :         //
     715             :         // TODO(jackson): This logic doesn't directly incorporate fixed per-key
     716             :         // overhead (8-byte trailer, plus at least 1 byte encoding the length of the
     717             :         // key and 1 byte encoding the length of the value). This overhead is
     718             :         // indirectly incorporated through the compression ratios, but that results
     719             :         // in the overhead being smeared per key-byte and value-byte, rather than
     720             :         // per-entry. This per-key fixed overhead can be nontrivial, especially for
     721             :         // dense swaths of point tombstones. Give some thought as to whether we
     722             :         // should directly include fixed per-key overhead in the calculations.
     723             : 
     724             :         // Below, we calculate the tombstone contributions and the shadowed keys'
     725             :         // contributions separately.
     726           2 :         var tombstonesLogicalSize float64
     727           2 :         var shadowedLogicalSize float64
     728           2 : 
     729           2 :         // 1. Calculate the contribution of the tombstone keys themselves.
     730           2 :         if props.RawPointTombstoneKeySize > 0 {
     731           2 :                 tombstonesLogicalSize += float64(props.RawPointTombstoneKeySize)
     732           2 :         } else {
     733           0 :                 // This sstable predates the existence of the RawPointTombstoneKeySize
     734           0 :                 // property. We can use the average key size within the file itself and
     735           0 :                 // the count of point deletions to estimate the size.
     736           0 :                 tombstonesLogicalSize += float64(numPointDels * props.RawKeySize / props.NumEntries)
     737           0 :         }
     738             : 
     739             :         // 2. Calculate the contribution of the keys shadowed by tombstones.
     740             :         //
     741             :         // 2a. First account for keys shadowed by DELSIZED tombstones. THE DELSIZED
     742             :         // tombstones encode the size of both the key and value of the shadowed KV
     743             :         // entries. These sizes are aggregated into a sstable property.
     744           2 :         shadowedLogicalSize += float64(props.RawPointTombstoneValueSize)
     745           2 : 
     746           2 :         // 2b. Calculate the contribution of the KV entries shadowed by ordinary DEL
     747           2 :         // keys.
     748           2 :         numUnsizedDels := numPointDels - props.NumSizedDeletions
     749           2 :         {
     750           2 :                 // The shadowed keys have the same exact user keys as the tombstones
     751           2 :                 // themselves, so we can use the `tombstonesLogicalSize` we computed
     752           2 :                 // earlier as an estimate. There's a complication that
     753           2 :                 // `tombstonesLogicalSize` may include DELSIZED keys we already
     754           2 :                 // accounted for.
     755           2 :                 shadowedLogicalSize += float64(tombstonesLogicalSize) / float64(numPointDels) * float64(numUnsizedDels)
     756           2 : 
     757           2 :                 // Calculate the contribution of the deleted values. The caller has
     758           2 :                 // already computed an average logical size (possibly computed across
     759           2 :                 // many sstables).
     760           2 :                 shadowedLogicalSize += float64(numUnsizedDels) * avgValLogicalSize
     761           2 :         }
     762             : 
     763             :         // Scale both tombstone and shadowed totals by logical:physical ratios to
     764             :         // account for compression, metadata overhead, etc.
     765             :         //
     766             :         //      Physical             FileSize
     767             :         //     -----------  = -----------------------
     768             :         //      Logical       RawKeySize+RawValueSize
     769             :         //
     770           2 :         return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
     771             : }
     772             : 
     773             : func estimatePhysicalSizes(
     774             :         fileSize uint64, props *sstable.CommonProperties,
     775           2 : ) (avgValLogicalSize, compressionRatio float64) {
     776           2 :         // RawKeySize and RawValueSize are uncompressed totals. Scale according to
     777           2 :         // the data size to account for compression, index blocks and metadata
     778           2 :         // overhead. Eg:
     779           2 :         //
     780           2 :         //    Compression rate        ×  Average uncompressed value size
     781           2 :         //
     782           2 :         //                            ↓
     783           2 :         //
     784           2 :         //         FileSize              RawValSize
     785           2 :         //   -----------------------  ×  ----------
     786           2 :         //   RawKeySize+RawValueSize     NumEntries
     787           2 :         //
     788           2 :         uncompressedSum := props.RawKeySize + props.RawValueSize
     789           2 :         compressionRatio = float64(fileSize) / float64(uncompressedSum)
     790           2 :         avgValLogicalSize = (float64(props.RawValueSize) / float64(props.NumEntries))
     791           2 :         return avgValLogicalSize, compressionRatio
     792           2 : }
     793             : 
     794             : // newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
     795             : // returns "ranged deletion" spans for a single table, providing a combined view
     796             : // of both range deletion and range key deletion spans. The
     797             : // tableRangedDeletionIter is intended for use in the specific case of computing
     798             : // the statistics and deleteCompactionHints for a single table.
     799             : //
     800             : // As an example, consider the following set of spans from the range deletion
     801             : // and range key blocks of a table:
     802             : //
     803             : //                    |---------|     |---------|         |-------| RANGEKEYDELs
     804             : //              |-----------|-------------|           |-----|       RANGEDELs
     805             : //        __________________________________________________________
     806             : //              a b c d e f g h i j k l m n o p q r s t u v w x y z
     807             : //
     808             : // The tableRangedDeletionIter produces the following set of output spans, where
     809             : // '1' indicates a span containing only range deletions, '2' is a span
     810             : // containing only range key deletions, and '3' is a span containing a mixture
     811             : // of both range deletions and range key deletions.
     812             : //
     813             : //                 1       3       1    3    2          1  3   2
     814             : //              |-----|---------|-----|---|-----|     |---|-|-----|
     815             : //        __________________________________________________________
     816             : //              a b c d e f g h i j k l m n o p q r s t u v w x y z
     817             : //
     818             : // Algorithm.
     819             : //
     820             : // The iterator first defragments the range deletion and range key blocks
     821             : // separately. During this defragmentation, the range key block is also filtered
     822             : // so that keys other than range key deletes are ignored. The range delete and
     823             : // range key delete keyspaces are then merged.
     824             : //
     825             : // Note that the only fragmentation introduced by merging is from where a range
     826             : // del span overlaps with a range key del span. Within the bounds of any overlap
     827             : // there is guaranteed to be no further fragmentation, as the constituent spans
     828             : // have already been defragmented. To the left and right of any overlap, the
     829             : // same reasoning applies. For example,
     830             : //
     831             : //                       |--------|         |-------| RANGEKEYDEL
     832             : //              |---------------------------|         RANGEDEL
     833             : //              |----1---|----3---|----1----|---2---| Merged, fragmented spans.
     834             : //        __________________________________________________________
     835             : //              a b c d e f g h i j k l m n o p q r s t u v w x y z
     836             : //
     837             : // Any fragmented abutting spans produced by the merging iter will be of
     838             : // differing types (i.e. a transition from a span with homogenous key kinds to a
     839             : // heterogeneous span, or a transition from a span with exclusively range dels
     840             : // to a span with exclusively range key dels). Therefore, further
     841             : // defragmentation is not required.
     842             : //
     843             : // Each span returned by the tableRangeDeletionIter will have at most four keys,
     844             : // corresponding to the largest and smallest sequence numbers encountered across
     845             : // the range deletes and range keys deletes that comprised the merged spans.
     846             : func newCombinedDeletionKeyspanIter(
     847             :         comparer *base.Comparer, cr sstable.CommonReader, m *fileMetadata,
     848           2 : ) (keyspan.FragmentIterator, error) {
     849           2 :         // The range del iter and range key iter are each wrapped in their own
     850           2 :         // defragmenting iter. For each iter, abutting spans can always be merged.
     851           2 :         var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true })
     852             :         // Reduce keys by maintaining a slice of at most length two, corresponding to
     853             :         // the largest and smallest keys in the defragmented span. This maintains the
     854             :         // contract that the emitted slice is sorted by (SeqNum, Kind) descending.
     855           2 :         reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
     856           2 :                 if len(current) == 0 && len(incoming) == 0 {
     857           0 :                         // While this should never occur in practice, a defensive return is used
     858           0 :                         // here to preserve correctness.
     859           0 :                         return current
     860           0 :                 }
     861           2 :                 var largest, smallest keyspan.Key
     862           2 :                 var set bool
     863           2 :                 for _, keys := range [2][]keyspan.Key{current, incoming} {
     864           2 :                         if len(keys) == 0 {
     865           0 :                                 continue
     866             :                         }
     867           2 :                         first, last := keys[0], keys[len(keys)-1]
     868           2 :                         if !set {
     869           2 :                                 largest, smallest = first, last
     870           2 :                                 set = true
     871           2 :                                 continue
     872             :                         }
     873           2 :                         if first.Trailer > largest.Trailer {
     874           2 :                                 largest = first
     875           2 :                         }
     876           2 :                         if last.Trailer < smallest.Trailer {
     877           2 :                                 smallest = last
     878           2 :                         }
     879             :                 }
     880           2 :                 if largest.Equal(comparer.Equal, smallest) {
     881           2 :                         current = append(current[:0], largest)
     882           2 :                 } else {
     883           2 :                         current = append(current[:0], largest, smallest)
     884           2 :                 }
     885           2 :                 return current
     886             :         }
     887             : 
     888             :         // The separate iters for the range dels and range keys are wrapped in a
     889             :         // merging iter to join the keyspaces into a single keyspace. The separate
     890             :         // iters are only added if the particular key kind is present.
     891           2 :         mIter := &keyspanimpl.MergingIter{}
     892           2 :         var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
     893           2 :                 if in.KeysOrder != keyspan.ByTrailerDesc {
     894           0 :                         panic("pebble: combined deletion iter encountered keys in non-trailer descending order")
     895             :                 }
     896           2 :                 out.Start, out.End = in.Start, in.End
     897           2 :                 out.Keys = append(out.Keys[:0], in.Keys...)
     898           2 :                 out.KeysOrder = keyspan.ByTrailerDesc
     899           2 :                 // NB: The order of by-trailer descending may have been violated,
     900           2 :                 // because we've layered rangekey and rangedel iterators from the same
     901           2 :                 // sstable into the same keyspanimpl.MergingIter. The MergingIter will
     902           2 :                 // return the keys in the order that the child iterators were provided.
     903           2 :                 // Sort the keys to ensure they're sorted by trailer descending.
     904           2 :                 keyspan.SortKeysByTrailer(&out.Keys)
     905           2 :                 return nil
     906             :         })
     907           2 :         mIter.Init(comparer.Compare, transform, new(keyspanimpl.MergingBuffers))
     908           2 : 
     909           2 :         iter, err := cr.NewRawRangeDelIter(m.IterTransforms())
     910           2 :         if err != nil {
     911           0 :                 return nil, err
     912           0 :         }
     913           2 :         if iter != nil {
     914           2 :                 // Assert expected bounds. In previous versions of Pebble, range
     915           2 :                 // deletions persisted to sstables could exceed the bounds of the
     916           2 :                 // containing files due to "split user keys." This required readers to
     917           2 :                 // constrain the tombstones' bounds to the containing file at read time.
     918           2 :                 // See docs/range_deletions.md for an extended discussion of the design
     919           2 :                 // and invariants at that time.
     920           2 :                 //
     921           2 :                 // We've since compacted away all 'split user-keys' and in the process
     922           2 :                 // eliminated all "untruncated range tombstones" for physical sstables.
     923           2 :                 // We no longer need to perform truncation at read time for these
     924           2 :                 // sstables.
     925           2 :                 //
     926           2 :                 // At the same time, we've also introduced the concept of "virtual
     927           2 :                 // SSTables" where the file metadata's effective bounds can again be
     928           2 :                 // reduced to be narrower than the contained tombstones. These virtual
     929           2 :                 // SSTables handle truncation differently, performing it using
     930           2 :                 // keyspan.Truncate when the sstable's range deletion iterator is
     931           2 :                 // opened.
     932           2 :                 //
     933           2 :                 // Together, these mean that we should never see untruncated range
     934           2 :                 // tombstones any more—and the merging iterator no longer accounts for
     935           2 :                 // their existence. Since there's abundant subtlety that we're relying
     936           2 :                 // on, we choose to be conservative and assert that these invariants
     937           2 :                 // hold. We could (and previously did) choose to only validate these
     938           2 :                 // bounds in invariants builds, but the most likely avenue for these
     939           2 :                 // tombstones' existence is through a bug in a migration and old data
     940           2 :                 // sitting around in an old store from long ago.
     941           2 :                 //
     942           2 :                 // The table stats collector will read all files range deletions
     943           2 :                 // asynchronously after Open, and provides a perfect opportunity to
     944           2 :                 // validate our invariants without harming user latency. We also
     945           2 :                 // previously performed truncation here which similarly required key
     946           2 :                 // comparisons, so replacing those key comparisons with assertions
     947           2 :                 // should be roughly similar in performance.
     948           2 :                 //
     949           2 :                 // TODO(jackson): Only use AssertBounds in invariants builds in the
     950           2 :                 // following release.
     951           2 :                 iter = keyspan.AssertBounds(
     952           2 :                         iter, m.SmallestPointKey, m.LargestPointKey.UserKey, comparer.Compare,
     953           2 :                 )
     954           2 :                 dIter := &keyspan.DefragmentingIter{}
     955           2 :                 dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
     956           2 :                 iter = dIter
     957           2 :                 mIter.AddLevel(iter)
     958           2 :         }
     959             : 
     960           2 :         iter, err = cr.NewRawRangeKeyIter(m.IterTransforms())
     961           2 :         if err != nil {
     962           0 :                 return nil, err
     963           0 :         }
     964           2 :         if iter != nil {
     965           2 :                 // Assert expected bounds in tests.
     966           2 :                 if invariants.Enabled {
     967           2 :                         iter = keyspan.AssertBounds(
     968           2 :                                 iter, m.SmallestRangeKey, m.LargestRangeKey.UserKey, comparer.Compare,
     969           2 :                         )
     970           2 :                 }
     971             :                 // Wrap the range key iterator in a filter that elides keys other than range
     972             :                 // key deletions.
     973           2 :                 iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
     974           2 :                         out.Start, out.End = in.Start, in.End
     975           2 :                         out.Keys = out.Keys[:0]
     976           2 :                         for _, k := range in.Keys {
     977           2 :                                 if k.Kind() != base.InternalKeyKindRangeKeyDelete {
     978           2 :                                         continue
     979             :                                 }
     980           2 :                                 out.Keys = append(out.Keys, k)
     981             :                         }
     982           2 :                         return len(out.Keys) > 0
     983             :                 }, comparer.Compare)
     984           2 :                 dIter := &keyspan.DefragmentingIter{}
     985           2 :                 dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
     986           2 :                 iter = dIter
     987           2 :                 mIter.AddLevel(iter)
     988             :         }
     989             : 
     990           2 :         return mIter, nil
     991             : }
     992             : 
     993             : // rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
     994             : // with the sum of the files' counts of range key fragments. Its annotation type
     995             : // is a *uint64. The count of range key sets may change once a table's stats are
     996             : // loaded asynchronously, so its values are marked as cacheable only if a file's
     997             : // stats have been loaded.
     998             : type rangeKeySetsAnnotator struct{}
     999             : 
    1000             : var _ manifest.Annotator = rangeKeySetsAnnotator{}
    1001             : 
    1002           2 : func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
    1003           2 :         if dst == nil {
    1004           2 :                 return new(uint64)
    1005           2 :         }
    1006           0 :         v := dst.(*uint64)
    1007           0 :         *v = 0
    1008           0 :         return v
    1009             : }
    1010             : 
    1011             : func (a rangeKeySetsAnnotator) Accumulate(
    1012             :         f *fileMetadata, dst interface{},
    1013           2 : ) (v interface{}, cacheOK bool) {
    1014           2 :         vptr := dst.(*uint64)
    1015           2 :         *vptr = *vptr + f.Stats.NumRangeKeySets
    1016           2 :         return vptr, f.StatsValid()
    1017           2 : }
    1018             : 
    1019           1 : func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
    1020           1 :         srcV := src.(*uint64)
    1021           1 :         dstV := dst.(*uint64)
    1022           1 :         *dstV = *dstV + *srcV
    1023           1 :         return dstV
    1024           1 : }
    1025             : 
    1026             : // countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
    1027             : // files of the LSM. It only counts keys in files for which table stats have
    1028             : // been loaded. It uses a b-tree annotator to cache intermediate values between
    1029             : // calculations when possible.
    1030           2 : func countRangeKeySetFragments(v *version) (count uint64) {
    1031           2 :         for l := 0; l < numLevels; l++ {
    1032           2 :                 if v.RangeKeyLevels[l].Empty() {
    1033           2 :                         continue
    1034             :                 }
    1035           2 :                 count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
    1036             :         }
    1037           2 :         return count
    1038             : }
    1039             : 
    1040             : // tombstonesAnnotator implements manifest.Annotator, annotating B-Tree nodes
    1041             : // with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDELk
    1042             : // eys). Its annotation type is a *uint64. The count of tombstones may change
    1043             : // once a table's stats are loaded asynchronously, so its values are marked as
    1044             : // cacheable only if a file's stats have been loaded.
    1045             : type tombstonesAnnotator struct{}
    1046             : 
    1047             : var _ manifest.Annotator = tombstonesAnnotator{}
    1048             : 
    1049           2 : func (a tombstonesAnnotator) Zero(dst interface{}) interface{} {
    1050           2 :         if dst == nil {
    1051           2 :                 return new(uint64)
    1052           2 :         }
    1053           1 :         v := dst.(*uint64)
    1054           1 :         *v = 0
    1055           1 :         return v
    1056             : }
    1057             : 
    1058             : func (a tombstonesAnnotator) Accumulate(
    1059             :         f *fileMetadata, dst interface{},
    1060           2 : ) (v interface{}, cacheOK bool) {
    1061           2 :         vptr := dst.(*uint64)
    1062           2 :         *vptr = *vptr + f.Stats.NumDeletions
    1063           2 :         return vptr, f.StatsValid()
    1064           2 : }
    1065             : 
    1066           2 : func (a tombstonesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
    1067           2 :         srcV := src.(*uint64)
    1068           2 :         dstV := dst.(*uint64)
    1069           2 :         *dstV = *dstV + *srcV
    1070           2 :         return dstV
    1071           2 : }
    1072             : 
    1073             : // countTombstones counts the number of tombstone (DEL, SINGLEDEL and RANGEDEL)
    1074             : // internal keys across all files of the LSM. It only counts keys in files for
    1075             : // which table stats have been loaded. It uses a b-tree annotator to cache
    1076             : // intermediate values between calculations when possible.
    1077           2 : func countTombstones(v *version) (count uint64) {
    1078           2 :         for l := 0; l < numLevels; l++ {
    1079           2 :                 if v.Levels[l].Empty() {
    1080           2 :                         continue
    1081             :                 }
    1082           2 :                 count += *v.Levels[l].Annotation(tombstonesAnnotator{}).(*uint64)
    1083             :         }
    1084           2 :         return count
    1085             : }
    1086             : 
    1087             : // valueBlocksSizeAnnotator implements manifest.Annotator, annotating B-Tree
    1088             : // nodes with the sum of the files' Properties.ValueBlocksSize. Its annotation
    1089             : // type is a *uint64. The value block size may change once a table's stats are
    1090             : // loaded asynchronously, so its values are marked as cacheable only if a
    1091             : // file's stats have been loaded.
    1092             : type valueBlocksSizeAnnotator struct{}
    1093             : 
    1094             : var _ manifest.Annotator = valueBlocksSizeAnnotator{}
    1095             : 
    1096           2 : func (a valueBlocksSizeAnnotator) Zero(dst interface{}) interface{} {
    1097           2 :         if dst == nil {
    1098           2 :                 return new(uint64)
    1099           2 :         }
    1100           1 :         v := dst.(*uint64)
    1101           1 :         *v = 0
    1102           1 :         return v
    1103             : }
    1104             : 
    1105             : func (a valueBlocksSizeAnnotator) Accumulate(
    1106             :         f *fileMetadata, dst interface{},
    1107           2 : ) (v interface{}, cacheOK bool) {
    1108           2 :         vptr := dst.(*uint64)
    1109           2 :         *vptr = *vptr + f.Stats.ValueBlocksSize
    1110           2 :         return vptr, f.StatsValid()
    1111           2 : }
    1112             : 
    1113           2 : func (a valueBlocksSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
    1114           2 :         srcV := src.(*uint64)
    1115           2 :         dstV := dst.(*uint64)
    1116           2 :         *dstV = *dstV + *srcV
    1117           2 :         return dstV
    1118           2 : }
    1119             : 
    1120             : // valueBlocksSizeForLevel returns the Properties.ValueBlocksSize across all
    1121             : // files for a level of the LSM. It only includes the size for files for which
    1122             : // table stats have been loaded. It uses a b-tree annotator to cache
    1123             : // intermediate values between calculations when possible. It must not be
    1124             : // called concurrently.
    1125             : //
    1126             : // REQUIRES: 0 <= level <= numLevels.
    1127           2 : func valueBlocksSizeForLevel(v *version, level int) (count uint64) {
    1128           2 :         if v.Levels[level].Empty() {
    1129           2 :                 return 0
    1130           2 :         }
    1131           2 :         return *v.Levels[level].Annotation(valueBlocksSizeAnnotator{}).(*uint64)
    1132             : }

Generated by: LCOV version 1.14