LCOV - code coverage report
Current view: top level - pebble - table_stats.go (source / functions)
Test: 2025-02-24 08:17Z 6949b900 - tests only.lcov
Test Date: 2025-02-24 08:18:26
Coverage: Lines: 90.6 % (606 of 669 hit), Functions: - (0 of 0)

            Line data    Source code
       1              : // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2              : // of this source code is governed by a BSD-style license that can be found in
       3              : // the LICENSE file.
       4              : 
       5              : package pebble
       6              : 
       7              : import (
       8              :         "context"
       9              :         "fmt"
      10              :         "math"
      11              : 
      12              :         "github.com/cockroachdb/errors"
      13              :         "github.com/cockroachdb/pebble/internal/base"
      14              :         "github.com/cockroachdb/pebble/internal/invariants"
      15              :         "github.com/cockroachdb/pebble/internal/keyspan"
      16              :         "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl"
      17              :         "github.com/cockroachdb/pebble/internal/manifest"
      18              :         "github.com/cockroachdb/pebble/sstable"
      19              :         "github.com/cockroachdb/pebble/sstable/block"
      20              : )
      21              : 
      22              : // In-memory statistics about tables help inform compaction picking, but may
      23              : // be expensive to calculate or load from disk. Every time a database is
      24              : // opened, these statistics must be reloaded or recalculated. To minimize
      25              : // impact on user activity and compactions, we load these statistics
      26              : // asynchronously in the background and store loaded statistics in each
      27              : // table's *TableMetadata.
      28              : //
      29              : // This file implements the asynchronous loading of statistics by maintaining
      30              : // a list of files that require statistics, alongside their LSM levels.
      31              : // Whenever new files are added to the LSM, the files are appended to
      32              : // d.mu.tableStats.pending. If a stats collection job is not currently
      33              : // running, one is started in a separate goroutine.
      34              : //
      35              : // The stats collection job grabs and clears the pending list, computes table
      36              : // statistics relative to the current readState and updates the tables' file
      37              : // metadata. New pending files may accumulate during a stats collection job,
      38              : // so a completing job triggers a new job if necessary. Only one job runs at a
      39              : // time.
      40              : //
      41              : // When an existing database is opened, all files lack in-memory statistics.
      42              : // These files' stats are loaded incrementally whenever the pending list is
      43              : // empty by scanning a current readState for files missing statistics. Once a
      44              : // job completes a scan without finding any remaining files without
      45              : // statistics, it flips a `loadedInitial` flag. From then on, the stats
      46              : // collection job only needs to load statistics for new files appended to the
      47              : // pending list.
      48              : 
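To make the lifecycle described above concrete, here is a minimal, self-contained Go sketch of the same pattern: a pending queue drained by at most one background goroutine, which retriggers itself if more work arrived while it ran. The names (statsCollector, pending, loading, collect) are hypothetical stand-ins for d.mu.tableStats and collectTableStats, not Pebble's actual API.

package main

import (
	"fmt"
	"sync"
	"time"
)

// statsCollector is a hypothetical illustration of the single-flight,
// pending-list pattern described in the comment above: items are queued
// under a mutex, and at most one goroutine drains the queue at a time.
type statsCollector struct {
	mu      sync.Mutex
	pending []string // stand-in for []manifest.NewTableEntry
	loading bool
}

// add enqueues new work and starts a collection job if none is running.
func (c *statsCollector) add(items ...string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.pending = append(c.pending, items...)
	c.maybeCollectLocked()
}

// maybeCollectLocked starts a background job only if one isn't already
// running and there is work to do. Must be called with c.mu held.
func (c *statsCollector) maybeCollectLocked() {
	if !c.loading && len(c.pending) > 0 {
		c.loading = true
		go c.collect()
	}
}

// collect grabs and clears the pending list, does the (simulated) work
// without holding the mutex, then retriggers itself if more work arrived.
func (c *statsCollector) collect() {
	c.mu.Lock()
	work := c.pending
	c.pending = nil
	c.mu.Unlock()

	// Perform "IO" without holding the lock.
	for _, w := range work {
		time.Sleep(time.Millisecond)
		fmt.Println("collected stats for", w)
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	c.loading = false
	c.maybeCollectLocked() // new items may have accumulated meanwhile
}

func main() {
	var c statsCollector
	c.add("000001.sst", "000002.sst")
	time.Sleep(50 * time.Millisecond)
}

The property mirrored here is that the loading flag is only ever flipped while holding the mutex, so a second job can never start while one is in flight.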
      49            1 : func (d *DB) maybeCollectTableStatsLocked() {
      50            1 :         if d.shouldCollectTableStatsLocked() {
      51            1 :                 go d.collectTableStats()
      52            1 :         }
      53              : }
      54              : 
      55              : // updateTableStatsLocked is called when new files are introduced, after the
      56              : // read state has been updated. It may trigger a new stat collection.
      57              : // DB.mu must be locked when calling.
      58            1 : func (d *DB) updateTableStatsLocked(newTables []manifest.NewTableEntry) {
      59            1 :         var needStats bool
      60            1 :         for _, nf := range newTables {
      61            1 :                 if !nf.Meta.StatsValid() {
      62            1 :                         needStats = true
      63            1 :                         break
      64              :                 }
      65              :         }
      66            1 :         if !needStats {
      67            1 :                 return
      68            1 :         }
      69              : 
      70            1 :         d.mu.tableStats.pending = append(d.mu.tableStats.pending, newTables...)
      71            1 :         d.maybeCollectTableStatsLocked()
      72              : }
      73              : 
      74            1 : func (d *DB) shouldCollectTableStatsLocked() bool {
      75            1 :         return !d.mu.tableStats.loading &&
      76            1 :                 d.closed.Load() == nil &&
      77            1 :                 !d.opts.DisableTableStats &&
      78            1 :                 (len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
      79            1 : }
      80              : 
      81              : // collectTableStats runs a table stats collection job, returning true if the
      82              : // invocation did the collection work, false otherwise (e.g. if another job was
      83              : // already running).
      84            1 : func (d *DB) collectTableStats() bool {
      85            1 :         const maxTableStatsPerScan = 50
      86            1 : 
      87            1 :         d.mu.Lock()
      88            1 :         if !d.shouldCollectTableStatsLocked() {
      89            1 :                 d.mu.Unlock()
      90            1 :                 return false
      91            1 :         }
      92              : 
      93            1 :         pending := d.mu.tableStats.pending
      94            1 :         d.mu.tableStats.pending = nil
      95            1 :         d.mu.tableStats.loading = true
      96            1 :         jobID := d.newJobIDLocked()
      97            1 :         loadedInitial := d.mu.tableStats.loadedInitial
      98            1 :         // Drop DB.mu before performing IO.
      99            1 :         d.mu.Unlock()
     100            1 : 
     101            1 :         // Every run of collectTableStats either collects stats from the pending
     102            1 :         // list (if non-empty) or from scanning the version (loadedInitial is
     103            1 :         // false). This job only runs if at least one of those conditions holds.
     104            1 : 
     105            1 :         // Grab a read state to scan for tables.
     106            1 :         rs := d.loadReadState()
     107            1 :         var collected []collectedStats
     108            1 :         var hints []deleteCompactionHint
     109            1 :         if len(pending) > 0 {
     110            1 :                 collected, hints = d.loadNewFileStats(rs, pending)
     111            1 :         } else {
     112            1 :                 var moreRemain bool
     113            1 :                 var buf [maxTableStatsPerScan]collectedStats
     114            1 :                 collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
     115            1 :                 loadedInitial = !moreRemain
     116            1 :         }
     117            1 :         rs.unref()
     118            1 : 
     119            1 :         // Update the TableMetadata with the loaded stats while holding d.mu.
     120            1 :         d.mu.Lock()
     121            1 :         defer d.mu.Unlock()
     122            1 :         d.mu.tableStats.loading = false
     123            1 :         if loadedInitial && !d.mu.tableStats.loadedInitial {
     124            1 :                 d.mu.tableStats.loadedInitial = loadedInitial
     125            1 :                 d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
     126            1 :                         JobID: int(jobID),
     127            1 :                 })
     128            1 :         }
     129              : 
     130            1 :         maybeCompact := false
     131            1 :         for _, c := range collected {
     132            1 :                 c.tableMetadata.Stats = c.TableStats
     133            1 :                 maybeCompact = maybeCompact || fileCompensation(c.tableMetadata) > 0
     134            1 :                 c.tableMetadata.StatsMarkValid()
     135            1 :         }
     136              : 
     137            1 :         d.mu.tableStats.cond.Broadcast()
     138            1 :         d.maybeCollectTableStatsLocked()
     139            1 :         if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions {
     140            1 :                 // Verify that all of the hint tombstones' files still exist in the
     141            1 :                 // current version. Otherwise, the tombstone itself may have been
     142            1 :                 // compacted into L6 and more recent keys may have had their sequence
     143            1 :                 // numbers zeroed.
     144            1 :                 //
     145            1 :                 // Note that it's possible that the tombstone file is being compacted
     146            1 :                 // presently. In that case, the file will be present in v. When the
     147            1 :                 // compaction finishes compacting the tombstone file, it will detect
     148            1 :                 // and clear the hint.
     149            1 :                 //
     150            1 :                 // See DB.maybeUpdateDeleteCompactionHints.
     151            1 :                 v := d.mu.versions.currentVersion()
     152            1 :                 keepHints := hints[:0]
     153            1 :                 for _, h := range hints {
     154            1 :                         if v.Contains(h.tombstoneLevel, h.tombstoneFile) {
     155            1 :                                 keepHints = append(keepHints, h)
     156            1 :                         }
     157              :                 }
     158            1 :                 d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
     159              :         }
     160            1 :         if maybeCompact {
     161            1 :                 d.maybeScheduleCompaction()
     162            1 :         }
     163            1 :         return true
     164              : }
     165              : 
     166              : type collectedStats struct {
     167              :         *tableMetadata
     168              :         manifest.TableStats
     169              : }
     170              : 
     171              : func (d *DB) loadNewFileStats(
     172              :         rs *readState, pending []manifest.NewTableEntry,
     173            1 : ) ([]collectedStats, []deleteCompactionHint) {
     174            1 :         var hints []deleteCompactionHint
     175            1 :         collected := make([]collectedStats, 0, len(pending))
     176            1 :         for _, nf := range pending {
     177            1 :                 // A file's stats might have been populated by an earlier call to
     178            1 :                 // loadNewFileStats if the file was moved.
     179            1 :                 // NB: We're not holding d.mu which protects f.Stats, but only
     180            1 :                 // collectTableStats updates f.Stats for active files, and we
     181            1 :                 // ensure only one goroutine runs it at a time through
     182            1 :                 // d.mu.tableStats.loading.
     183            1 :                 if nf.Meta.StatsValid() {
     184            1 :                         continue
     185              :                 }
     186              : 
     187              :                 // The file isn't guaranteed to still be live in the readState's
     188              :                 // version. It may have been deleted or moved. Skip it if it's not in
     189              :                 // the expected level.
     190            1 :                 if !rs.current.Contains(nf.Level, nf.Meta) {
     191            1 :                         continue
     192              :                 }
     193              : 
     194            1 :                 stats, newHints, err := d.loadTableStats(
     195            1 :                         rs.current, nf.Level,
     196            1 :                         nf.Meta,
     197            1 :                 )
     198            1 :                 if err != nil {
     199            0 :                         d.opts.EventListener.BackgroundError(err)
     200            0 :                         continue
     201              :                 }
     202              :                 // NB: We don't update the TableMetadata yet, because we aren't holding
     203              :                 // DB.mu. We'll copy it to the TableMetadata after we're finished with
     204              :                 // IO.
     205            1 :                 collected = append(collected, collectedStats{
     206            1 :                         tableMetadata: nf.Meta,
     207            1 :                         TableStats:    stats,
     208            1 :                 })
     209            1 :                 hints = append(hints, newHints...)
     210              :         }
     211            1 :         return collected, hints
     212              : }
     213              : 
     214              : // scanReadStateTableStats is run by an active stat collection job when there
     215              : // are no pending new files, but there might be files that existed at Open for
     216              : // which we haven't loaded table stats.
     217              : func (d *DB) scanReadStateTableStats(
     218              :         rs *readState, fill []collectedStats,
     219            1 : ) ([]collectedStats, []deleteCompactionHint, bool) {
     220            1 :         moreRemain := false
     221            1 :         var hints []deleteCompactionHint
     222            1 :         sizesChecked := make(map[base.DiskFileNum]struct{})
     223            1 :         for l, levelMetadata := range rs.current.Levels {
     224            1 :                 iter := levelMetadata.Iter()
     225            1 :                 for f := iter.First(); f != nil; f = iter.Next() {
     226            1 :                         // NB: We're not holding d.mu which protects f.Stats, but only the
     227            1 :                         // active stats collection job updates f.Stats for active files,
     228            1 :                         // and we ensure only one goroutine runs it at a time through
     229            1 :                         // d.mu.tableStats.loading. This makes it safe to read validity
      230            1 :                         // through f.StatsValid despite not holding d.mu.
     231            1 :                         if f.StatsValid() {
     232            1 :                                 continue
     233              :                         }
     234              : 
     235              :                         // Limit how much work we do per read state. The older the read
     236              :                         // state is, the higher the likelihood files are no longer being
     237              :                         // used in the current version. If we've exhausted our allowance,
     238              :                         // return true for the last return value to signal there's more
     239              :                         // work to do.
     240            1 :                         if len(fill) == cap(fill) {
     241            1 :                                 moreRemain = true
     242            1 :                                 return fill, hints, moreRemain
     243            1 :                         }
     244              : 
     245              :                         // If the file is remote and not SharedForeign, we should check if its size
     246              :                         // matches. This is because checkConsistency skips over remote files.
     247              :                         //
     248              :                         // SharedForeign and External files are skipped as their sizes are allowed
     249              :                         // to have a mismatch; the size stored in the FileBacking is just the part
     250              :                         // of the file that is referenced by this Pebble instance, not the size of
     251              :                         // the whole object.
     252            1 :                         objMeta, err := d.objProvider.Lookup(base.FileTypeTable, f.FileBacking.DiskFileNum)
     253            1 :                         if err != nil {
     254            0 :                                 // Set `moreRemain` so we'll try again.
     255            0 :                                 moreRemain = true
     256            0 :                                 d.opts.EventListener.BackgroundError(err)
     257            0 :                                 continue
     258              :                         }
     259              : 
     260            1 :                         shouldCheckSize := objMeta.IsRemote() &&
     261            1 :                                 !d.objProvider.IsSharedForeign(objMeta) &&
     262            1 :                                 !objMeta.IsExternal()
     263            1 :                         if _, ok := sizesChecked[f.FileBacking.DiskFileNum]; !ok && shouldCheckSize {
     264            1 :                                 size, err := d.objProvider.Size(objMeta)
     265            1 :                                 fileSize := f.FileBacking.Size
     266            1 :                                 if err != nil {
     267            0 :                                         moreRemain = true
     268            0 :                                         d.opts.EventListener.BackgroundError(err)
     269            0 :                                         continue
     270              :                                 }
     271            1 :                                 if size != int64(fileSize) {
     272            0 :                                         err := errors.Errorf(
     273            0 :                                                 "during consistency check in loadTableStats: L%d: %s: object size mismatch (%s): %d (provider) != %d (MANIFEST)",
     274            0 :                                                 errors.Safe(l), f.FileNum, d.objProvider.Path(objMeta),
     275            0 :                                                 errors.Safe(size), errors.Safe(fileSize))
     276            0 :                                         d.opts.EventListener.BackgroundError(err)
     277            0 :                                         d.opts.Logger.Fatalf("%s", err)
     278            0 :                                 }
     279              : 
     280            1 :                                 sizesChecked[f.FileBacking.DiskFileNum] = struct{}{}
     281              :                         }
     282              : 
     283            1 :                         stats, newHints, err := d.loadTableStats(
     284            1 :                                 rs.current, l, f,
     285            1 :                         )
     286            1 :                         if err != nil {
     287            0 :                                 // Set `moreRemain` so we'll try again.
     288            0 :                                 moreRemain = true
     289            0 :                                 d.opts.EventListener.BackgroundError(err)
     290            0 :                                 continue
     291              :                         }
     292            1 :                         fill = append(fill, collectedStats{
     293            1 :                                 tableMetadata: f,
     294            1 :                                 TableStats:    stats,
     295            1 :                         })
     296            1 :                         hints = append(hints, newHints...)
     297              :                 }
     298              :         }
     299            1 :         return fill, hints, moreRemain
     300              : }
     301              : 
     302              : func (d *DB) loadTableStats(
     303              :         v *version, level int, meta *tableMetadata,
     304            1 : ) (manifest.TableStats, []deleteCompactionHint, error) {
     305            1 :         var stats manifest.TableStats
     306            1 :         var compactionHints []deleteCompactionHint
     307            1 :         err := d.fileCache.withCommonReader(
     308            1 :                 meta, func(r sstable.CommonReader) (err error) {
     309            1 :                         props := r.CommonProperties()
     310            1 :                         stats.NumEntries = props.NumEntries
     311            1 :                         stats.NumDeletions = props.NumDeletions
     312            1 :                         stats.NumRangeKeySets = props.NumRangeKeySets
     313            1 :                         stats.ValueBlocksSize = props.ValueBlocksSize
     314            1 :                         stats.CompressionType = block.CompressionFromString(props.CompressionName)
     315            1 :                         if props.NumDataBlocks > 0 {
     316            1 :                                 stats.TombstoneDenseBlocksRatio = float64(props.NumTombstoneDenseBlocks) / float64(props.NumDataBlocks)
     317            1 :                         }
     318              : 
     319            1 :                         if props.NumPointDeletions() > 0 {
     320            1 :                                 if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil {
     321            0 :                                         return
     322            0 :                                 }
     323              :                         }
     324            1 :                         if props.NumRangeDeletions > 0 || props.NumRangeKeyDels > 0 {
     325            1 :                                 if compactionHints, err = d.loadTableRangeDelStats(
     326            1 :                                         r, v, level, meta, &stats,
     327            1 :                                 ); err != nil {
     328            0 :                                         return
     329            0 :                                 }
     330              :                         }
     331            1 :                         return
     332              :                 })
     333            1 :         if err != nil {
     334            0 :                 return stats, nil, err
     335            0 :         }
     336            1 :         return stats, compactionHints, nil
     337              : }
     338              : 
     339              : // loadTablePointKeyStats calculates the point key statistics for the given
     340              : // table. The provided manifest.TableStats are updated.
     341              : func (d *DB) loadTablePointKeyStats(
     342              :         props *sstable.CommonProperties,
     343              :         v *version,
     344              :         level int,
     345              :         meta *tableMetadata,
     346              :         stats *manifest.TableStats,
     347            1 : ) error {
     348            1 :         // TODO(jackson): If the file has a wide keyspace, the average
     349            1 :         // value size beneath the entire file might not be representative
     350            1 :         // of the size of the keys beneath the point tombstones.
     351            1 :         // We could write the ranges of 'clusters' of point tombstones to
     352            1 :         // a sstable property and call averageValueSizeBeneath for each of
     353            1 :         // these narrower ranges to improve the estimate.
     354            1 :         avgValLogicalSize, compressionRatio, err := d.estimateSizesBeneath(v, level, meta, props)
     355            1 :         if err != nil {
     356            0 :                 return err
     357            0 :         }
     358            1 :         stats.PointDeletionsBytesEstimate =
     359            1 :                 pointDeletionsBytesEstimate(meta.Size, props, avgValLogicalSize, compressionRatio)
     360            1 :         return nil
     361              : }
     362              : 
     363              : // loadTableRangeDelStats calculates the range deletion and range key deletion
     364              : // statistics for the given table.
     365              : func (d *DB) loadTableRangeDelStats(
     366              :         r sstable.CommonReader, v *version, level int, meta *tableMetadata, stats *manifest.TableStats,
     367            1 : ) ([]deleteCompactionHint, error) {
     368            1 :         iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
     369            1 :         if err != nil {
     370            0 :                 return nil, err
     371            0 :         }
     372            1 :         defer iter.Close()
     373            1 :         var compactionHints []deleteCompactionHint
     374            1 :         // We iterate over the defragmented range tombstones and range key deletions,
     375            1 :         // which ensures we don't double count ranges deleted at different sequence
     376            1 :         // numbers. Also, merging abutting tombstones reduces the number of calls to
     377            1 :         // estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
     378            1 :         // our overall estimate.
     379            1 :         s, err := iter.First()
     380            1 :         for ; s != nil; s, err = iter.Next() {
     381            1 :                 start, end := s.Start, s.End
     382            1 :                 // We only need to consider deletion size estimates for tables that contain
     383            1 :                 // RANGEDELs.
     384            1 :                 var maxRangeDeleteSeqNum base.SeqNum
     385            1 :                 for _, k := range s.Keys {
     386            1 :                         if k.Kind() == base.InternalKeyKindRangeDelete && maxRangeDeleteSeqNum < k.SeqNum() {
     387            1 :                                 maxRangeDeleteSeqNum = k.SeqNum()
     388            1 :                                 break
     389              :                         }
     390              :                 }
     391              : 
     392              :                 // If the file is in the last level of the LSM, there is no data beneath
     393              :                 // it. The fact that there is still a range tombstone in a bottommost file
      394              :                 // indicates two possibilities:
     395              :                 //   1. an open snapshot kept the tombstone around, and the data the
     396              :                 //      tombstone deletes is contained within the file itself.
     397              :                 //   2. the file was ingested.
     398              :                 // In the first case, we'd like to estimate disk usage within the file
     399              :                 // itself since compacting the file will drop that covered data. In the
     400              :                 // second case, we expect that compacting the file will NOT drop any
     401              :                 // data and rewriting the file is a waste of write bandwidth. We can
     402              :                 // distinguish these cases by looking at the table metadata's sequence
     403              :                 // numbers. A file's range deletions can only delete data within the
     404              :                 // file at lower sequence numbers. All keys in an ingested sstable adopt
     405              :                 // the same sequence number, preventing tombstones from deleting keys
     406              :                 // within the same file. We check here if the largest RANGEDEL sequence
     407              :                 // number is greater than the file's smallest sequence number. If it is,
     408              :                 // the RANGEDEL could conceivably (although inconclusively) delete data
     409              :                 // within the same file.
     410              :                 //
     411              :                 // Note that this heuristic is imperfect. If a table containing a range
     412              :                 // deletion is ingested into L5 and subsequently compacted into L6 but
     413              :                 // an open snapshot prevents elision of covered keys in L6, the
     414              :                 // resulting RangeDeletionsBytesEstimate will incorrectly include all
     415              :                 // covered keys.
     416              :                 //
     417              :                 // TODO(jackson): We could prevent the above error in the heuristic by
     418              :                 // computing the file's RangeDeletionsBytesEstimate during the
     419              :                 // compaction itself. It's unclear how common this is.
     420              :                 //
     421              :                 // NOTE: If the span `s` wholly contains a table containing range keys,
     422              :                 // the returned size estimate will be slightly inflated by the range key
     423              :                 // block. However, in practice, range keys are expected to be rare, and
     424              :                 // the size of the range key block relative to the overall size of the
     425              :                 // table is expected to be small.
     426            1 :                 if level == numLevels-1 && meta.SmallestSeqNum < maxRangeDeleteSeqNum {
     427            1 :                         size, err := r.EstimateDiskUsage(start, end)
     428            1 :                         if err != nil {
     429            0 :                                 return nil, err
     430            0 :                         }
     431            1 :                         stats.RangeDeletionsBytesEstimate += size
     432            1 : 
     433            1 :                         // As the file is in the bottommost level, there is no need to collect a
     434            1 :                         // deletion hint.
     435            1 :                         continue
     436              :                 }
     437              : 
     438              :                 // While the size estimates for point keys should only be updated if this
     439              :                 // span contains a range del, the sequence numbers are required for the
     440              :                 // hint. Unconditionally descend, but conditionally update the estimates.
     441            1 :                 hintType := compactionHintFromKeys(s.Keys)
     442            1 :                 estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
     443            1 :                 if err != nil {
     444            0 :                         return nil, err
     445            0 :                 }
     446            1 :                 stats.RangeDeletionsBytesEstimate += estimate
     447            1 : 
     448            1 :                 // hintSeqNum is the smallest sequence number contained in any
     449            1 :                 // file overlapping with the hint and in a level below it.
     450            1 :                 if hintSeqNum == math.MaxUint64 {
     451            1 :                         continue
     452              :                 }
     453            1 :                 hint := deleteCompactionHint{
     454            1 :                         hintType:                hintType,
     455            1 :                         start:                   make([]byte, len(start)),
     456            1 :                         end:                     make([]byte, len(end)),
     457            1 :                         tombstoneFile:           meta,
     458            1 :                         tombstoneLevel:          level,
     459            1 :                         tombstoneLargestSeqNum:  s.LargestSeqNum(),
     460            1 :                         tombstoneSmallestSeqNum: s.SmallestSeqNum(),
     461            1 :                         fileSmallestSeqNum:      hintSeqNum,
     462            1 :                 }
     463            1 :                 copy(hint.start, start)
     464            1 :                 copy(hint.end, end)
     465            1 :                 compactionHints = append(compactionHints, hint)
     466              :         }
     467            1 :         if err != nil {
     468            0 :                 return nil, err
     469            0 :         }
     470            1 :         return compactionHints, nil
     471              : }
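To make the ingestion heuristic above concrete with hypothetical sequence numbers: an ingested L6 table assigns every key, including its RANGEDELs, the same sequence number (say 1000), so the file's smallest sequence number (1000) is not less than its largest RANGEDEL sequence number (1000) and the span contributes no estimate; a flushed L6 table with smallest sequence number 10 and a RANGEDEL at sequence number 50 does pass the check, so the covered size within the file is estimated. A minimal sketch of just that comparison (the helper name is hypothetical, not part of Pebble's API):

package main

import "fmt"

// estimateWithinFile reports whether a bottommost file's RANGEDELs could
// conceivably delete data within the file itself, per the heuristic above.
func estimateWithinFile(smallestSeqNum, maxRangeDelSeqNum uint64) bool {
	return smallestSeqNum < maxRangeDelSeqNum
}

func main() {
	fmt.Println(estimateWithinFile(1000, 1000)) // ingested table: false
	fmt.Println(estimateWithinFile(10, 50))     // flushed table: true
}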
     472              : 
     473              : func (d *DB) estimateSizesBeneath(
     474              :         v *version, level int, meta *tableMetadata, fileProps *sstable.CommonProperties,
     475            1 : ) (avgValueLogicalSize, compressionRatio float64, err error) {
     476            1 :         // Find all files in lower levels that overlap with meta,
     477            1 :         // summing their value sizes and entry counts.
     478            1 :         file := meta
     479            1 :         var fileSum, keySum, valSum, entryCount uint64
     480            1 :         // Include the file itself. This is important because in some instances, the
     481            1 :         // computed compression ratio is applied to the tombstones contained within
     482            1 :         // `meta` itself. If there are no files beneath `meta` in the LSM, we would
     483            1 :         // calculate a compression ratio of 0 which is not accurate for the file's
     484            1 :         // own tombstones.
     485            1 :         fileSum += file.Size
     486            1 :         entryCount += fileProps.NumEntries
     487            1 :         keySum += fileProps.RawKeySize
     488            1 :         valSum += fileProps.RawValueSize
     489            1 : 
     490            1 :         addPhysicalTableStats := func(r *sstable.Reader) (err error) {
     491            1 :                 fileSum += file.Size
     492            1 :                 entryCount += r.Properties.NumEntries
     493            1 :                 keySum += r.Properties.RawKeySize
     494            1 :                 valSum += r.Properties.RawValueSize
     495            1 :                 return nil
     496            1 :         }
     497            1 :         addVirtualTableStats := func(v sstable.VirtualReader) (err error) {
     498            1 :                 fileSum += file.Size
     499            1 :                 entryCount += file.Stats.NumEntries
     500            1 :                 keySum += v.Properties.RawKeySize
     501            1 :                 valSum += v.Properties.RawValueSize
     502            1 :                 return nil
     503            1 :         }
     504              : 
     505            1 :         for l := level + 1; l < numLevels; l++ {
     506            1 :                 overlaps := v.Overlaps(l, meta.UserKeyBounds())
     507            1 :                 iter := overlaps.Iter()
     508            1 :                 for file = iter.First(); file != nil; file = iter.Next() {
     509            1 :                         var err error
     510            1 :                         if file.Virtual {
     511            1 :                                 err = d.fileCache.withVirtualReader(file.VirtualMeta(), addVirtualTableStats)
     512            1 :                         } else {
     513            1 :                                 err = d.fileCache.withReader(file.PhysicalMeta(), addPhysicalTableStats)
     514            1 :                         }
     515            1 :                         if err != nil {
     516            0 :                                 return 0, 0, err
     517            0 :                         }
     518              :                 }
     519              :         }
     520            1 :         if entryCount == 0 {
     521            0 :                 return 0, 0, nil
     522            0 :         }
     523              :         // RawKeySize and RawValueSize are uncompressed totals. We'll need to scale
     524              :         // the value sum according to the data size to account for compression,
     525              :         // index blocks and metadata overhead. Eg:
     526              :         //
     527              :         //    Compression rate        ×  Average uncompressed value size
     528              :         //
     529              :         //                            ↓
     530              :         //
     531              :         //         FileSize              RawValueSize
     532              :         //   -----------------------  ×  ------------
     533              :         //   RawKeySize+RawValueSize     NumEntries
     534              :         //
      535            1 :         // We return the average logical value size and the compression ratio,
     536              :         // leaving the scaling to the caller. This allows the caller to perform
     537              :         // additional compression ratio scaling if necessary.
     538            1 :         uncompressedSum := float64(keySum + valSum)
     539            1 :         compressionRatio = float64(fileSum) / uncompressedSum
     540            1 :         avgValueLogicalSize = (float64(valSum) / float64(entryCount))
     541            1 :         return avgValueLogicalSize, compressionRatio, nil
     542              : }
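As a hedged, worked illustration of the scaling performed at the end of estimateSizesBeneath, the standalone snippet below plugs in made-up totals (40 MB of physical file bytes, 20 MB of raw keys, 80 MB of raw values, one million entries; none of these come from a real table) and computes the same two return values:

package main

import "fmt"

func main() {
	// Hypothetical totals, purely illustrative.
	fileSum := uint64(40 << 20)   // physical bytes of the file(s)
	keySum := uint64(20 << 20)    // RawKeySize
	valSum := uint64(80 << 20)    // RawValueSize
	entryCount := uint64(1 << 20) // NumEntries

	compressionRatio := float64(fileSum) / float64(keySum+valSum) // 0.4
	avgValueLogicalSize := float64(valSum) / float64(entryCount)  // 80 bytes
	fmt.Println(compressionRatio, avgValueLogicalSize)
}

With these inputs the compression ratio is 0.4 and the average logical value size is 80 bytes; pointDeletionsBytesEstimate later multiplies logical byte counts by that ratio to translate them back into physical bytes.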
     543              : 
     544              : func (d *DB) estimateReclaimedSizeBeneath(
     545              :         v *version, level int, start, end []byte, hintType deleteCompactionHintType,
     546            1 : ) (estimate uint64, hintSeqNum base.SeqNum, err error) {
     547            1 :         // Find all files in lower levels that overlap with the deleted range
     548            1 :         // [start, end).
     549            1 :         //
     550            1 :         // An overlapping file might be completely contained by the range
     551            1 :         // tombstone, in which case we can count the entire file size in
     552            1 :         // our estimate without doing any additional I/O.
     553            1 :         //
     554            1 :         // Otherwise, estimating the range for the file requires
     555            1 :         // additional I/O to read the file's index blocks.
     556            1 :         hintSeqNum = math.MaxUint64
     557            1 :         for l := level + 1; l < numLevels; l++ {
     558            1 :                 overlaps := v.Overlaps(l, base.UserKeyBoundsEndExclusive(start, end))
     559            1 :                 iter := overlaps.Iter()
     560            1 :                 for file := iter.First(); file != nil; file = iter.Next() {
     561            1 :                         // Determine whether we need to update size estimates and hint seqnums
     562            1 :                         // based on the type of hint and the type of keys in this file.
     563            1 :                         var updateEstimates, updateHints bool
     564            1 :                         switch hintType {
     565            1 :                         case deleteCompactionHintTypePointKeyOnly:
     566            1 :                                 // The range deletion byte estimates should only be updated if this
     567            1 :                                 // table contains point keys. This ends up being an overestimate in
      568            1 :                 // the case that the table also has range keys, but such keys are expected
     569            1 :                                 // to contribute a negligible amount of the table's overall size,
     570            1 :                                 // relative to point keys.
     571            1 :                                 if file.HasPointKeys {
     572            1 :                                         updateEstimates = true
     573            1 :                                 }
     574              :                                 // As the initiating span contained only range dels, hints can only be
     575              :                                 // updated if this table does _not_ contain range keys.
     576            1 :                                 if !file.HasRangeKeys {
     577            1 :                                         updateHints = true
     578            1 :                                 }
     579            1 :                         case deleteCompactionHintTypeRangeKeyOnly:
     580            1 :                                 // The initiating span contained only range key dels. The estimates
     581            1 :                                 // apply only to point keys, and are therefore not updated.
     582            1 :                                 updateEstimates = false
     583            1 :                                 // As the initiating span contained only range key dels, hints can
     584            1 :                                 // only be updated if this table does _not_ contain point keys.
     585            1 :                                 if !file.HasPointKeys {
     586            1 :                                         updateHints = true
     587            1 :                                 }
     588            1 :                         case deleteCompactionHintTypePointAndRangeKey:
     589            1 :                                 // Always update the estimates and hints, as this hint type can drop a
     590            1 :                                 // file, irrespective of the mixture of keys. Similar to above, the
      591            1 :                 // range del bytes estimate is an overestimate.
     592            1 :                                 updateEstimates, updateHints = true, true
     593            0 :                         default:
     594            0 :                                 panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
     595              :                         }
     596            1 :                         startCmp := d.cmp(start, file.Smallest.UserKey)
     597            1 :                         endCmp := d.cmp(file.Largest.UserKey, end)
     598            1 :                         if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
     599            1 :                                 // The range fully contains the file, so skip looking it up in table
     600            1 :                                 // cache/looking at its indexes and add the full file size.
     601            1 :                                 if updateEstimates {
     602            1 :                                         estimate += file.Size
     603            1 :                                 }
     604            1 :                                 if updateHints && hintSeqNum > file.SmallestSeqNum {
     605            1 :                                         hintSeqNum = file.SmallestSeqNum
     606            1 :                                 }
     607            1 :                         } else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
     608            1 :                                 // Partial overlap.
     609            1 :                                 if hintType == deleteCompactionHintTypeRangeKeyOnly {
     610            1 :                                         // If the hint that generated this overlap contains only range keys,
     611            1 :                                         // there is no need to calculate disk usage, as the reclaimable space
     612            1 :                                         // is expected to be minimal relative to point keys.
     613            1 :                                         continue
     614              :                                 }
     615            1 :                                 var size uint64
     616            1 :                                 var err error
     617            1 :                                 if file.Virtual {
     618            1 :                                         err = d.fileCache.withVirtualReader(
     619            1 :                                                 file.VirtualMeta(), func(r sstable.VirtualReader) (err error) {
     620            1 :                                                         size, err = r.EstimateDiskUsage(start, end)
     621            1 :                                                         return err
     622            1 :                                                 })
     623            1 :                                 } else {
     624            1 :                                         err = d.fileCache.withReader(
     625            1 :                                                 file.PhysicalMeta(), func(r *sstable.Reader) (err error) {
     626            1 :                                                         size, err = r.EstimateDiskUsage(start, end)
     627            1 :                                                         return err
     628            1 :                                                 })
     629              :                                 }
     630              : 
     631            1 :                                 if err != nil {
     632            0 :                                         return 0, hintSeqNum, err
     633            0 :                                 }
     634            1 :                                 estimate += size
     635            1 :                                 if updateHints && hintSeqNum > file.SmallestSeqNum && d.FormatMajorVersion() >= FormatVirtualSSTables {
     636            1 :                                         // If the format major version is past Virtual SSTables, deletion only
     637            1 :                                         // hints can also apply to partial overlaps with sstables.
     638            1 :                                         hintSeqNum = file.SmallestSeqNum
     639            1 :                                 }
     640              :                         }
     641              :                 }
     642              :         }
     643            1 :         return estimate, hintSeqNum, nil
     644              : }
     645              : 
     646            1 : func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) bool {
     647            1 :         // If a table contains range deletions or range key deletions, we defer the
     648            1 :         // stats collection. There are two main reasons for this:
     649            1 :         //
     650            1 :         //  1. Estimating the potential for reclaimed space due to a range deletion
     651            1 :         //     tombstone requires scanning the LSM - a potentially expensive operation
     652            1 :         //     that should be deferred.
     653            1 :         //  2. Range deletions and / or range key deletions present an opportunity to
     654            1 :         //     compute "deletion hints", which also requires a scan of the LSM to
     655            1 :         //     compute tables that would be eligible for deletion.
     656            1 :         //
     657            1 :         // These two tasks are deferred to the table stats collector goroutine.
     658            1 :         if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
     659            1 :                 return false
     660            1 :         }
     661              : 
      662              :         // If more than 10% of a table's entries are point deletions without
      663              :         // user-provided size estimates, don't calculate the PointDeletionsBytesEstimate
      664              :         // statistic using our limited knowledge. The table stats collector can
      665              :         // populate the stats and calculate an average value size across all the
      666              :         // tables beneath this table in the LSM, which will be more accurate.
     667            1 :         if unsizedDels := (props.NumDeletions - props.NumSizedDeletions); unsizedDels > props.NumEntries/10 {
     668            1 :                 return false
     669            1 :         }
     670              : 
     671            1 :         var pointEstimate uint64
     672            1 :         if props.NumEntries > 0 {
     673            1 :                 // Use the file's own average key and value sizes as an estimate. This
     674            1 :                 // doesn't require any additional IO and since the number of point
     675            1 :                 // deletions in the file is low, the error introduced by this crude
     676            1 :                 // estimate is expected to be small.
     677            1 :                 commonProps := &props.CommonProperties
     678            1 :                 avgValSize, compressionRatio := estimatePhysicalSizes(meta.Size, commonProps)
     679            1 :                 pointEstimate = pointDeletionsBytesEstimate(meta.Size, commonProps, avgValSize, compressionRatio)
     680            1 :         }
     681              : 
     682            1 :         meta.Stats.NumEntries = props.NumEntries
     683            1 :         meta.Stats.NumDeletions = props.NumDeletions
     684            1 :         meta.Stats.NumRangeKeySets = props.NumRangeKeySets
     685            1 :         meta.Stats.PointDeletionsBytesEstimate = pointEstimate
     686            1 :         meta.Stats.RangeDeletionsBytesEstimate = 0
     687            1 :         meta.Stats.ValueBlocksSize = props.ValueBlocksSize
     688            1 :         meta.Stats.CompressionType = block.CompressionFromString(props.CompressionName)
     689            1 :         meta.StatsMarkValid()
     690            1 :         return true
     691              : }
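To illustrate the 10% threshold in maybeSetStatsFromProperties with made-up numbers (not taken from any real table): a table with 1,000 entries and 150 point deletions, of which 30 are DELSIZED, has 120 unsized deletions, which exceeds 1000/10 = 100, so the fast path returns false and defers to the stats collector. A minimal sketch of that check:

package main

import "fmt"

func main() {
	// Hypothetical property values, purely illustrative.
	numEntries := uint64(1000)
	numDeletions := uint64(150)     // DEL + DELSIZED
	numSizedDeletions := uint64(30) // DELSIZED only

	unsizedDels := numDeletions - numSizedDeletions // 120
	deferToCollector := unsizedDels > numEntries/10 // 120 > 100, so true
	fmt.Println(deferToCollector)
}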
     692              : 
     693              : func pointDeletionsBytesEstimate(
     694              :         fileSize uint64, props *sstable.CommonProperties, avgValLogicalSize, compressionRatio float64,
     695            1 : ) (estimate uint64) {
     696            1 :         if props.NumEntries == 0 {
     697            0 :                 return 0
     698            0 :         }
     699            1 :         numPointDels := props.NumPointDeletions()
     700            1 :         if numPointDels == 0 {
     701            1 :                 return 0
     702            1 :         }
     703              :         // Estimate the potential space to reclaim using the table's own properties.
     704              :         // There may or may not be keys covered by any individual point tombstone.
     705              :         // If not, compacting the point tombstone into L6 will at least allow us to
     706              :         // drop the point deletion key and will reclaim the tombstone's key bytes.
     707              :         // If there are covered key(s), we also get to drop key and value bytes for
     708              :         // each covered key.
     709              :         //
     710              :         // Some point tombstones (DELSIZEDs) carry a user-provided estimate of the
     711              :         // uncompressed size of entries that will be elided by fully compacting the
     712              :         // tombstone. For these tombstones, there's no guesswork—we use the
     713              :         // RawPointTombstoneValueSizeHint property which is the sum of all these
     714              :         // tombstones' encoded values.
     715              :         //
     716              :         // For un-sized point tombstones (DELs), we estimate assuming that each
     717              :         // point tombstone on average covers 1 key and using average value sizes.
     718              :         // This is almost certainly an overestimate, but that's probably okay
     719              :         // because point tombstones can slow range iterations even when they don't
     720              :         // cover a key.
     721              :         //
     722              :         // TODO(jackson): This logic doesn't directly incorporate fixed per-key
     723              :         // overhead (8-byte trailer, plus at least 1 byte encoding the length of the
     724              :         // key and 1 byte encoding the length of the value). This overhead is
     725              :         // indirectly incorporated through the compression ratios, but that results
     726              :         // in the overhead being smeared per key-byte and value-byte, rather than
     727              :         // per-entry. This per-key fixed overhead can be nontrivial, especially for
     728              :         // dense swaths of point tombstones. Give some thought as to whether we
     729              :         // should directly include fixed per-key overhead in the calculations.
     730              : 
     731              :         // Below, we calculate the tombstone contributions and the shadowed keys'
     732              :         // contributions separately.
     733            1 :         var tombstonesLogicalSize float64
     734            1 :         var shadowedLogicalSize float64
     735            1 : 
     736            1 :         // 1. Calculate the contribution of the tombstone keys themselves.
     737            1 :         if props.RawPointTombstoneKeySize > 0 {
     738            1 :                 tombstonesLogicalSize += float64(props.RawPointTombstoneKeySize)
     739            1 :         } else {
     740            0 :                 // This sstable predates the existence of the RawPointTombstoneKeySize
     741            0 :                 // property. We can use the average key size within the file itself and
     742            0 :                 // the count of point deletions to estimate the size.
     743            0 :                 tombstonesLogicalSize += float64(numPointDels * props.RawKeySize / props.NumEntries)
     744            0 :         }
     745              : 
     746              :         // 2. Calculate the contribution of the keys shadowed by tombstones.
     747              :         //
      748              :         // 2a. First account for keys shadowed by DELSIZED tombstones. The DELSIZED
     749              :         // tombstones encode the size of both the key and value of the shadowed KV
     750              :         // entries. These sizes are aggregated into a sstable property.
     751            1 :         shadowedLogicalSize += float64(props.RawPointTombstoneValueSize)
     752            1 : 
     753            1 :         // 2b. Calculate the contribution of the KV entries shadowed by ordinary DEL
     754            1 :         // keys.
     755            1 :         numUnsizedDels := numPointDels - props.NumSizedDeletions
     756            1 :         {
     757            1 :                 // The shadowed keys have the same exact user keys as the tombstones
     758            1 :                 // themselves, so we can use the `tombstonesLogicalSize` we computed
      759            1 :                 // earlier as an estimate. One complication: `tombstonesLogicalSize`
      760            1 :                 // also includes DELSIZED tombstones whose shadowed entries were handled
      761            1 :                 // in 2a, so we scale by the fraction of un-sized deletions.
     762            1 :                 shadowedLogicalSize += float64(tombstonesLogicalSize) / float64(numPointDels) * float64(numUnsizedDels)
     763            1 : 
     764            1 :                 // Calculate the contribution of the deleted values. The caller has
     765            1 :                 // already computed an average logical size (possibly computed across
     766            1 :                 // many sstables).
     767            1 :                 shadowedLogicalSize += float64(numUnsizedDels) * avgValLogicalSize
     768            1 :         }
     769              : 
     770              :         // Scale both tombstone and shadowed totals by logical:physical ratios to
     771              :         // account for compression, metadata overhead, etc.
     772              :         //
     773              :         //      Physical             FileSize
     774              :         //     -----------  = -----------------------
     775              :         //      Logical       RawKeySize+RawValueSize
     776              :         //
     777            1 :         return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
     778              : }
     779              : 
     780              : func estimatePhysicalSizes(
     781              :         fileSize uint64, props *sstable.CommonProperties,
     782            1 : ) (avgValLogicalSize, compressionRatio float64) {
     783            1 :         // RawKeySize and RawValueSize are uncompressed totals. Scale according to
     784            1 :         // the data size to account for compression, index blocks and metadata
     785            1 :         // overhead. Eg:
     786            1 :         //
     787            1 :         //    Compression rate        ×  Average uncompressed value size
     788            1 :         //
     789            1 :         //                            ↓
     790            1 :         //
     791            1 :         //         FileSize              RawValSize
     792            1 :         //   -----------------------  ×  ----------
     793            1 :         //   RawKeySize+RawValueSize     NumEntries
     794            1 :         //
     795            1 :         uncompressedSum := props.RawKeySize + props.RawValueSize
     796            1 :         compressionRatio = float64(fileSize) / float64(uncompressedSum)
     797            1 :         avgValLogicalSize = (float64(props.RawValueSize) / float64(props.NumEntries))
     798            1 :         return avgValLogicalSize, compressionRatio
     799            1 : }
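
As a rough illustration of how these two pieces combine (all numbers made up): a 4 MiB
sstable whose properties report RawKeySize+RawValueSize = 12 MiB, RawValueSize = 9 MiB
and NumEntries = 100,000 yields compressionRatio ≈ 4/12 ≈ 0.33 and avgValLogicalSize ≈
94 bytes. If the tombstone and shadowed logical sizes computed above sum to 3 MiB, the
estimated physical bytes attributable to those tombstones and the entries they shadow
is roughly 3 MiB × 0.33 ≈ 1 MiB.
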
     800              : 
     801              : // newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
     802              : // returns "ranged deletion" spans for a single table, providing a combined view
     803              : // of both range deletion and range key deletion spans. The
     804              : // tableRangedDeletionIter is intended for use in the specific case of computing
     805              : // the statistics and deleteCompactionHints for a single table.
     806              : //
     807              : // As an example, consider the following set of spans from the range deletion
     808              : // and range key blocks of a table:
     809              : //
     810              : //                    |---------|     |---------|         |-------| RANGEKEYDELs
     811              : //              |-----------|-------------|           |-----|       RANGEDELs
     812              : //        __________________________________________________________
     813              : //              a b c d e f g h i j k l m n o p q r s t u v w x y z
     814              : //
     815              : // The tableRangedDeletionIter produces the following set of output spans, where
     816              : // '1' indicates a span containing only range deletions, '2' is a span
     817              : // containing only range key deletions, and '3' is a span containing a mixture
     818              : // of both range deletions and range key deletions.
     819              : //
     820              : //                 1       3       1    3    2          1  3   2
     821              : //              |-----|---------|-----|---|-----|     |---|-|-----|
     822              : //        __________________________________________________________
     823              : //              a b c d e f g h i j k l m n o p q r s t u v w x y z
     824              : //
     825              : // Algorithm.
     826              : //
     827              : // The iterator first defragments the range deletion and range key blocks
     828              : // separately. During this defragmentation, the range key block is also filtered
     829              : // so that keys other than range key deletes are ignored. The range delete and
     830              : // range key delete keyspaces are then merged.
     831              : //
     832              : // Note that the only fragmentation introduced by merging is from where a range
     833              : // del span overlaps with a range key del span. Within the bounds of any overlap
     834              : // there is guaranteed to be no further fragmentation, as the constituent spans
     835              : // have already been defragmented. To the left and right of any overlap, the
     836              : // same reasoning applies. For example,
     837              : //
     838              : //                       |--------|         |-------| RANGEKEYDEL
     839              : //              |---------------------------|         RANGEDEL
     840              : //              |----1---|----3---|----1----|---2---| Merged, fragmented spans.
     841              : //        __________________________________________________________
     842              : //              a b c d e f g h i j k l m n o p q r s t u v w x y z
     843              : //
     844              : // Any fragmented abutting spans produced by the merging iter will be of
      845              : // differing types (i.e. a transition from a span with homogeneous key kinds to a
     846              : // heterogeneous span, or a transition from a span with exclusively range dels
     847              : // to a span with exclusively range key dels). Therefore, further
     848              : // defragmentation is not required.
     849              : //
      850              : // Each span returned by the tableRangedDeletionIter will have at most four keys,
      851              : // corresponding to the largest and smallest sequence numbers encountered across
      852              : // the range deletes and range key deletes that comprised the merged spans.
     853              : func newCombinedDeletionKeyspanIter(
     854              :         comparer *base.Comparer, cr sstable.CommonReader, m *tableMetadata,
     855            1 : ) (keyspan.FragmentIterator, error) {
     856            1 :         // The range del iter and range key iter are each wrapped in their own
     857            1 :         // defragmenting iter. For each iter, abutting spans can always be merged.
     858            1 :         var equal = keyspan.DefragmentMethodFunc(func(_ base.CompareRangeSuffixes, a, b *keyspan.Span) bool { return true })
     859              :         // Reduce keys by maintaining a slice of at most length two, corresponding to
     860              :         // the largest and smallest keys in the defragmented span. This maintains the
     861              :         // contract that the emitted slice is sorted by (SeqNum, Kind) descending.
     862            1 :         reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
     863            1 :                 if len(current) == 0 && len(incoming) == 0 {
     864            0 :                         // While this should never occur in practice, a defensive return is used
     865            0 :                         // here to preserve correctness.
     866            0 :                         return current
     867            0 :                 }
     868            1 :                 var largest, smallest keyspan.Key
     869            1 :                 var set bool
     870            1 :                 for _, keys := range [2][]keyspan.Key{current, incoming} {
     871            1 :                         if len(keys) == 0 {
     872            0 :                                 continue
     873              :                         }
     874            1 :                         first, last := keys[0], keys[len(keys)-1]
     875            1 :                         if !set {
     876            1 :                                 largest, smallest = first, last
     877            1 :                                 set = true
     878            1 :                                 continue
     879              :                         }
     880            1 :                         if first.Trailer > largest.Trailer {
     881            1 :                                 largest = first
     882            1 :                         }
     883            1 :                         if last.Trailer < smallest.Trailer {
     884            1 :                                 smallest = last
     885            1 :                         }
     886              :                 }
     887            1 :                 if largest.Equal(comparer.CompareRangeSuffixes, smallest) {
     888            1 :                         current = append(current[:0], largest)
     889            1 :                 } else {
     890            1 :                         current = append(current[:0], largest, smallest)
     891            1 :                 }
     892            1 :                 return current
     893              :         }
     894              : 
     895              :         // The separate iters for the range dels and range keys are wrapped in a
     896              :         // merging iter to join the keyspaces into a single keyspace. The separate
     897              :         // iters are only added if the particular key kind is present.
     898            1 :         mIter := &keyspanimpl.MergingIter{}
     899            1 :         var transform = keyspan.TransformerFunc(func(_ base.CompareRangeSuffixes, in keyspan.Span, out *keyspan.Span) error {
     900            1 :                 if in.KeysOrder != keyspan.ByTrailerDesc {
     901            0 :                         panic("pebble: combined deletion iter encountered keys in non-trailer descending order")
     902              :                 }
     903            1 :                 out.Start, out.End = in.Start, in.End
     904            1 :                 out.Keys = append(out.Keys[:0], in.Keys...)
     905            1 :                 out.KeysOrder = keyspan.ByTrailerDesc
     906            1 :                 // NB: The order of by-trailer descending may have been violated,
     907            1 :                 // because we've layered rangekey and rangedel iterators from the same
     908            1 :                 // sstable into the same keyspanimpl.MergingIter. The MergingIter will
     909            1 :                 // return the keys in the order that the child iterators were provided.
     910            1 :                 // Sort the keys to ensure they're sorted by trailer descending.
     911            1 :                 keyspan.SortKeysByTrailer(out.Keys)
     912            1 :                 return nil
     913              :         })
     914            1 :         mIter.Init(comparer, transform, new(keyspanimpl.MergingBuffers))
     915            1 : 
     916            1 :         iter, err := cr.NewRawRangeDelIter(context.TODO(), m.FragmentIterTransforms(), block.NoReadEnv)
     917            1 :         if err != nil {
     918            0 :                 return nil, err
     919            0 :         }
     920            1 :         if iter != nil {
     921            1 :                 // Assert expected bounds. In previous versions of Pebble, range
     922            1 :                 // deletions persisted to sstables could exceed the bounds of the
     923            1 :                 // containing files due to "split user keys." This required readers to
     924            1 :                 // constrain the tombstones' bounds to the containing file at read time.
     925            1 :                 // See docs/range_deletions.md for an extended discussion of the design
     926            1 :                 // and invariants at that time.
     927            1 :                 //
     928            1 :                 // We've since compacted away all 'split user-keys' and in the process
     929            1 :                 // eliminated all "untruncated range tombstones" for physical sstables.
     930            1 :                 // We no longer need to perform truncation at read time for these
     931            1 :                 // sstables.
     932            1 :                 //
     933            1 :                 // At the same time, we've also introduced the concept of "virtual
     934            1 :                 // SSTables" where the table metadata's effective bounds can again be
     935            1 :                 // reduced to be narrower than the contained tombstones. These virtual
     936            1 :                 // SSTables handle truncation differently, performing it using
     937            1 :                 // keyspan.Truncate when the sstable's range deletion iterator is
     938            1 :                 // opened.
     939            1 :                 //
     940            1 :                 // Together, these mean that we should never see untruncated range
     941            1 :                 // tombstones any more—and the merging iterator no longer accounts for
     942            1 :                 // their existence. Since there's abundant subtlety that we're relying
     943            1 :                 // on, we choose to be conservative and assert that these invariants
     944            1 :                 // hold. We could (and previously did) choose to only validate these
     945            1 :                 // bounds in invariants builds, but the most likely avenue for these
     946            1 :                 // tombstones' existence is through a bug in a migration and old data
     947            1 :                 // sitting around in an old store from long ago.
     948            1 :                 //
      949            1 :                 // The table stats collector will read all files' range deletions
     950            1 :                 // asynchronously after Open, and provides a perfect opportunity to
     951            1 :                 // validate our invariants without harming user latency. We also
     952            1 :                 // previously performed truncation here which similarly required key
     953            1 :                 // comparisons, so replacing those key comparisons with assertions
     954            1 :                 // should be roughly similar in performance.
     955            1 :                 //
     956            1 :                 // TODO(jackson): Only use AssertBounds in invariants builds in the
     957            1 :                 // following release.
     958            1 :                 iter = keyspan.AssertBounds(
     959            1 :                         iter, m.SmallestPointKey, m.LargestPointKey.UserKey, comparer.Compare,
     960            1 :                 )
     961            1 :                 dIter := &keyspan.DefragmentingIter{}
     962            1 :                 dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
     963            1 :                 iter = dIter
     964            1 :                 mIter.AddLevel(iter)
     965            1 :         }
     966              : 
     967            1 :         iter, err = cr.NewRawRangeKeyIter(context.TODO(), m.FragmentIterTransforms(), block.NoReadEnv)
     968            1 :         if err != nil {
     969            0 :                 return nil, err
     970            0 :         }
     971            1 :         if iter != nil {
     972            1 :                 // Assert expected bounds in tests.
     973            1 :                 if invariants.Sometimes(50) {
     974            1 :                         iter = keyspan.AssertBounds(
     975            1 :                                 iter, m.SmallestRangeKey, m.LargestRangeKey.UserKey, comparer.Compare,
     976            1 :                         )
     977            1 :                 }
     978              :                 // Wrap the range key iterator in a filter that elides keys other than range
     979              :                 // key deletions.
     980            1 :                 iter = keyspan.Filter(iter, func(in *keyspan.Span, buf []keyspan.Key) []keyspan.Key {
     981            1 :                         keys := buf[:0]
     982            1 :                         for _, k := range in.Keys {
     983            1 :                                 if k.Kind() != base.InternalKeyKindRangeKeyDelete {
     984            1 :                                         continue
     985              :                                 }
     986            1 :                                 keys = append(keys, k)
     987              :                         }
     988            1 :                         return keys
     989              :                 }, comparer.Compare)
     990            1 :                 dIter := &keyspan.DefragmentingIter{}
     991            1 :                 dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
     992            1 :                 iter = dIter
     993            1 :                 mIter.AddLevel(iter)
     994              :         }
     995              : 
     996            1 :         return mIter, nil
     997              : }
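
The doc comment above describes the span classification informally. The following
self-contained sketch (illustrative only, not Pebble code: it uses single-byte keys
and plain interval slices rather than keyspan iterators) labels merged fragments the
same way, under the assumption that each input list is already defragmented: 1 for
range-del-only, 2 for range-key-del-only, 3 for both.

package main

import (
	"fmt"
	"sort"
)

// frag is a half-open fragment [start, end) labeled 1 (range dels only),
// 2 (range key dels only), or 3 (both).
type frag struct {
	start, end byte
	label      int
}

func classify(rangeDels, rangeKeyDels [][2]byte) []frag {
	// Collect and sort every span boundary.
	var bounds []byte
	for _, s := range rangeDels {
		bounds = append(bounds, s[0], s[1])
	}
	for _, s := range rangeKeyDels {
		bounds = append(bounds, s[0], s[1])
	}
	sort.Slice(bounds, func(i, j int) bool { return bounds[i] < bounds[j] })

	covered := func(spans [][2]byte, lo, hi byte) bool {
		for _, s := range spans {
			if s[0] <= lo && hi <= s[1] {
				return true
			}
		}
		return false
	}

	// Label each elementary fragment between adjacent boundaries.
	var out []frag
	for i := 0; i+1 < len(bounds); i++ {
		lo, hi := bounds[i], bounds[i+1]
		if lo >= hi {
			continue
		}
		label := 0
		if covered(rangeDels, lo, hi) {
			label |= 1
		}
		if covered(rangeKeyDels, lo, hi) {
			label |= 2
		}
		if label != 0 {
			out = append(out, frag{lo, hi, label})
		}
	}

	// Coalesce abutting fragments that received the same label; with
	// defragmented inputs, abutting fragments normally differ in label.
	merged := out[:0]
	for _, f := range out {
		if n := len(merged); n > 0 && merged[n-1].end == f.start && merged[n-1].label == f.label {
			merged[n-1].end = f.end
		} else {
			merged = append(merged, f)
		}
	}
	return merged
}

func main() {
	// RANGEDEL [c, m) with RANGEKEYDELs [f, j) and [q, t) produces
	// [c,f)=1, [f,j)=3, [j,m)=1, [q,t)=2 (bytes print as their ASCII codes).
	fmt.Println(classify([][2]byte{{'c', 'm'}}, [][2]byte{{'f', 'j'}, {'q', 't'}}))
}
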
     998              : 
     999              : // rangeKeySetsAnnotator is a manifest.Annotator that annotates B-Tree nodes
     1000              : // with the sum of the files' counts of range key sets. The count of range
    1001              : // key sets may change once a table's stats are loaded asynchronously, so its
    1002              : // values are marked as cacheable only if a file's stats have been loaded.
    1003            1 : var rangeKeySetsAnnotator = manifest.SumAnnotator(func(f *manifest.TableMetadata) (uint64, bool) {
    1004            1 :         return f.Stats.NumRangeKeySets, f.StatsValid()
    1005            1 : })
    1006              : 
    1007              : // tombstonesAnnotator is a manifest.Annotator that annotates B-Tree nodes
    1008              : // with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDEL
    1009              : // keys). The count of tombstones may change once a table's stats are loaded
    1010              : // asynchronously, so its values are marked as cacheable only if a file's stats
    1011              : // have been loaded.
    1012            1 : var tombstonesAnnotator = manifest.SumAnnotator(func(f *manifest.TableMetadata) (uint64, bool) {
    1013            1 :         return f.Stats.NumDeletions, f.StatsValid()
    1014            1 : })
    1015              : 
     1016              : // valueBlockSizeAnnotator is a manifest.Annotator that annotates B-Tree
    1017              : // nodes with the sum of the files' Properties.ValueBlocksSize. The value block
    1018              : // size may change once a table's stats are loaded asynchronously, so its
    1019              : // values are marked as cacheable only if a file's stats have been loaded.
    1020            1 : var valueBlockSizeAnnotator = manifest.SumAnnotator(func(f *tableMetadata) (uint64, bool) {
    1021            1 :         return f.Stats.ValueBlocksSize, f.StatsValid()
    1022            1 : })
    1023              : 
    1024              : // compressionTypeAnnotator is a manifest.Annotator that annotates B-tree
    1025              : // nodes with the compression type of the file. Its annotation type is
    1026              : // compressionTypes. The compression type may change once a table's stats are
    1027              : // loaded asynchronously, so its values are marked as cacheable only if a file's
    1028              : // stats have been loaded.
    1029              : var compressionTypeAnnotator = manifest.Annotator[compressionTypes]{
    1030              :         Aggregator: compressionTypeAggregator{},
    1031              : }
    1032              : 
    1033              : type compressionTypeAggregator struct{}
    1034              : 
    1035              : type compressionTypes struct {
    1036              :         snappy, zstd, none, unknown uint64
    1037              : }
    1038              : 
    1039            1 : func (a compressionTypeAggregator) Zero(dst *compressionTypes) *compressionTypes {
    1040            1 :         if dst == nil {
    1041            1 :                 return new(compressionTypes)
    1042            1 :         }
    1043            0 :         *dst = compressionTypes{}
    1044            0 :         return dst
    1045              : }
    1046              : 
    1047              : func (a compressionTypeAggregator) Accumulate(
    1048              :         f *tableMetadata, dst *compressionTypes,
    1049            1 : ) (v *compressionTypes, cacheOK bool) {
    1050            1 :         switch f.Stats.CompressionType {
    1051            1 :         case SnappyCompression:
    1052            1 :                 dst.snappy++
    1053            1 :         case ZstdCompression:
    1054            1 :                 dst.zstd++
    1055            1 :         case NoCompression:
    1056            1 :                 dst.none++
    1057            1 :         default:
    1058            1 :                 dst.unknown++
    1059              :         }
    1060            1 :         return dst, f.StatsValid()
    1061              : }
    1062              : 
    1063              : func (a compressionTypeAggregator) Merge(
    1064              :         src *compressionTypes, dst *compressionTypes,
    1065            1 : ) *compressionTypes {
    1066            1 :         dst.snappy += src.snappy
    1067            1 :         dst.zstd += src.zstd
    1068            1 :         dst.none += src.none
    1069            1 :         dst.unknown += src.unknown
    1070            1 :         return dst
    1071            1 : }
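
For context, the aggregator above is consumed by manifest.Annotator, which (per the
comments above) caches partial aggregates on B-tree nodes and only marks them cacheable
once every contributing table's stats have loaded. Below is a minimal self-contained
sketch of the same Zero/Accumulate/Merge idea over a flat slice; the table type and
field names here are hypothetical stand-ins, not Pebble's actual API.

package main

import "fmt"

// tableInfo stands in for a table's stats; the field names are hypothetical.
type tableInfo struct {
	compression string // e.g. "snappy", "zstd", "none"
	statsValid  bool   // whether stats have finished loading
}

type counts struct{ snappy, zstd, none, unknown uint64 }

// accumulate plays the role of Zero+Accumulate over one group of tables.
func accumulate(tables []tableInfo) (total counts, cacheable bool) {
	cacheable = true
	for _, t := range tables {
		switch t.compression {
		case "snappy":
			total.snappy++
		case "zstd":
			total.zstd++
		case "none":
			total.none++
		default:
			total.unknown++
		}
		// Mirrors cacheOK above: a cached aggregate would go stale if any
		// contributing table's stats are still loading asynchronously.
		cacheable = cacheable && t.statsValid
	}
	return total, cacheable
}

// merge combines two partial aggregates, like Merge above.
func merge(a, b counts) counts {
	a.snappy += b.snappy
	a.zstd += b.zstd
	a.none += b.none
	a.unknown += b.unknown
	return a
}

func main() {
	left, okL := accumulate([]tableInfo{{"snappy", true}, {"zstd", true}})
	right, okR := accumulate([]tableInfo{{"none", false}})
	fmt.Println(merge(left, right), okL && okR) // {1 1 1 0} false
}
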
        

Generated by: LCOV version 2.0-1