LCOV - 2025-10-07 08:18Z ff0eb692 - meta test only.lcov

LCOV - code coverage report

Current view:	top level - pebble - ingest.go (source / functions)		Coverage	Total	Hit
Test:	2025-10-07 08:18Z ff0eb692 - meta test only.lcov	Lines:	81.0 %	1596	1292
Test Date:	2025-10-07 08:19:40	Functions:	-	0	0

            Line data    Source code

       1              : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2              : // of this source code is governed by a BSD-style license that can be found in
       3              : // the LICENSE file.
       4              : 
       5              : package pebble
       6              : 
       7              : import (
       8              :         "context"
       9              :         "fmt"
      10              :         "slices"
      11              :         "sort"
      12              :         "time"
      13              : 
      14              :         "github.com/cockroachdb/crlib/crtime"
      15              :         "github.com/cockroachdb/errors"
      16              :         "github.com/cockroachdb/pebble/internal/base"
      17              :         "github.com/cockroachdb/pebble/internal/cache"
      18              :         "github.com/cockroachdb/pebble/internal/invariants"
      19              :         "github.com/cockroachdb/pebble/internal/keyspan"
      20              :         "github.com/cockroachdb/pebble/internal/manifest"
      21              :         "github.com/cockroachdb/pebble/internal/overlap"
      22              :         "github.com/cockroachdb/pebble/internal/sstableinternal"
      23              :         "github.com/cockroachdb/pebble/objstorage"
      24              :         "github.com/cockroachdb/pebble/objstorage/remote"
      25              :         "github.com/cockroachdb/pebble/sstable"
      26              :         "github.com/cockroachdb/pebble/sstable/block"
      27              : )
      28              : 
      29            1 : func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
      30            1 :         c := userCmp(a.UserKey, b.UserKey)
      31            1 :         if c != 0 {
      32            1 :                 return c
      33            1 :         }
      34            1 :         if a.IsExclusiveSentinel() {
      35            1 :                 if !b.IsExclusiveSentinel() {
      36            1 :                         return -1
      37            1 :                 }
      38            1 :         } else if b.IsExclusiveSentinel() {
      39            1 :                 return +1
      40            1 :         }
      41            1 :         return 0
      42              : }
      43              : 
      44            1 : func ingestValidateKey(opts *Options, key *InternalKey) error {
      45            1 :         if key.Kind() == InternalKeyKindInvalid {
      46            0 :                 return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s",
      47            0 :                         key.Pretty(opts.Comparer.FormatKey))
      48            0 :         }
      49            1 :         if key.SeqNum() != 0 {
      50            0 :                 return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s",
      51            0 :                         key.Pretty(opts.Comparer.FormatKey))
      52            0 :         }
      53            1 :         if err := opts.Comparer.ValidateKey.Validate(key.UserKey); err != nil {
      54            0 :                 return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s, %w",
      55            0 :                         key.Pretty(opts.Comparer.FormatKey), err)
      56            0 :         }
      57            1 :         return nil
      58              : }
      59              : 
      60              : // ingestSynthesizeShared constructs a TableMetadata for one shared sstable owned
      61              : // or shared by another node.
      62              : func ingestSynthesizeShared(
      63              :         opts *Options, sm SharedSSTMeta, tableNum base.TableNum,
      64            1 : ) (*manifest.TableMetadata, error) {
      65            1 :         if sm.Size == 0 {
      66            0 :                 // Disallow 0 file sizes
      67            0 :                 return nil, errors.New("pebble: cannot ingest shared file with size 0")
      68            0 :         }
      69              :         // Don't load table stats. Doing a round trip to shared storage, one SST
      70              :         // at a time is not worth it as it slows down ingestion.
      71            1 :         meta := &manifest.TableMetadata{
      72            1 :                 TableNum:     tableNum,
      73            1 :                 CreationTime: time.Now().Unix(),
      74            1 :                 Virtual:      true,
      75            1 :                 Size:         sm.Size,
      76            1 :         }
      77            1 :         if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil {
      78            1 :                 // Initialize meta.{HasPointKeys,Smallest,Largest}, etc.
      79            1 :                 //
      80            1 :                 // NB: We create new internal keys and pass them into ExtendPointKeyBounds
      81            1 :                 // so that we can sub a zero sequence number into the bounds. We can set
      82            1 :                 // the sequence number to anything here; it'll be reset in ingestUpdateSeqNum
      83            1 :                 // anyway. However, we do need to use the same sequence number across all
      84            1 :                 // bound keys at this step so that we end up with bounds that are consistent
      85            1 :                 // across point/range keys.
      86            1 :                 //
      87            1 :                 // Because of the sequence number rewriting, we cannot use the Kind of
      88            1 :                 // sm.SmallestPointKey. For example, the original SST might start with
      89            1 :                 // a.SET.2 and a.RANGEDEL.1 (with a.SET.2 being the smallest key); after
      90            1 :                 // rewriting the sequence numbers, these keys become a.SET.100 and
      91            1 :                 // a.RANGEDEL.100, with a.RANGEDEL.100 being the smallest key. To create a
      92            1 :                 // correct bound, we just use the maximum key kind (which sorts first).
      93            1 :                 // Similarly, we use the smallest key kind for the largest key.
      94            1 :                 smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, base.InternalKeyKindMaxForSSTable)
      95            1 :                 largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, 0)
      96            1 :                 if sm.LargestPointKey.IsExclusiveSentinel() {
      97            1 :                         largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey)
      98            1 :                 }
      99            1 :                 if opts.Comparer.Equal(smallestPointKey.UserKey, largestPointKey.UserKey) &&
     100            1 :                         smallestPointKey.Trailer < largestPointKey.Trailer {
     101            0 :                         // We get kinds from the sender, however we substitute our own sequence
     102            0 :                         // numbers. This can result in cases where an sstable [b#5,SET-b#4,DELSIZED]
     103            0 :                         // becomes [b#0,SET-b#0,DELSIZED] when we synthesize it here, but the
     104            0 :                         // kinds need to be reversed now because DelSized > Set.
     105            0 :                         smallestPointKey, largestPointKey = largestPointKey, smallestPointKey
     106            0 :                 }
     107            1 :                 meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey)
     108              :         }
     109            1 :         if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil {
     110            1 :                 // Initialize meta.{HasRangeKeys,Smallest,Largest}, etc.
     111            1 :                 //
     112            1 :                 // See comment above on why we use a zero sequence number and these key
     113            1 :                 // kinds here.
     114            1 :                 smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, base.InternalKeyKindRangeKeyMax)
     115            1 :                 largestRangeKey := base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeyMin, sm.LargestRangeKey.UserKey)
     116            1 :                 meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey)
     117            1 :         }
     118              : 
     119              :         // For simplicity, we use the same number for both the TableNum and the
     120              :         // DiskFileNum (even though this is a virtual sstable). Set the underlying
     121              :         // TableBacking's size to the same size as the virtualized view of the sstable.
     122              :         // This ensures that we don't over-prioritize this sstable for compaction just
     123              :         // yet, as we do not have a clear sense of what parts of this sstable are
     124              :         // referenced by other nodes.
     125            1 :         meta.InitVirtualBacking(base.DiskFileNum(tableNum), sm.Size)
     126            1 : 
     127            1 :         if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
     128            0 :                 return nil, err
     129            0 :         }
     130            1 :         return meta, nil
     131              : }
     132              : 
     133              : // ingestLoad1External synthesizes the TableMetadata for one external sstable.
     134              : // Sequence number and target level calculation happens during prepare/apply.
     135              : func ingestLoad1External(
     136              :         opts *Options, e ExternalFile, tableNum base.TableNum,
     137            1 : ) (*manifest.TableMetadata, error) {
     138            1 :         if e.Size == 0 {
     139            0 :                 return nil, errors.New("pebble: cannot ingest external file with size 0")
     140            0 :         }
     141            1 :         if !e.HasRangeKey && !e.HasPointKey {
     142            0 :                 return nil, errors.New("pebble: cannot ingest external file with no point or range keys")
     143            0 :         }
     144              : 
     145            1 :         if opts.Comparer.Compare(e.StartKey, e.EndKey) > 0 {
     146            0 :                 return nil, errors.Newf("pebble: external file bounds [%q, %q) are invalid", e.StartKey, e.EndKey)
     147            0 :         }
     148            1 :         if opts.Comparer.Compare(e.StartKey, e.EndKey) == 0 && !e.EndKeyIsInclusive {
     149            0 :                 return nil, errors.Newf("pebble: external file bounds [%q, %q) are invalid", e.StartKey, e.EndKey)
     150            0 :         }
     151            1 :         if n := opts.Comparer.Split(e.StartKey); n != len(e.StartKey) {
     152            0 :                 return nil, errors.Newf("pebble: external file bounds start key %q has suffix", e.StartKey)
     153            0 :         }
     154            1 :         if n := opts.Comparer.Split(e.EndKey); n != len(e.EndKey) {
     155            0 :                 return nil, errors.Newf("pebble: external file bounds end key %q has suffix", e.EndKey)
     156            0 :         }
     157              : 
     158              :         // Don't load table stats. Doing a round trip to shared storage, one SST
     159              :         // at a time is not worth it as it slows down ingestion.
     160            1 :         meta := &manifest.TableMetadata{
     161            1 :                 TableNum:     tableNum,
     162            1 :                 CreationTime: time.Now().Unix(),
     163            1 :                 Size:         e.Size,
     164            1 :                 Virtual:      true,
     165            1 :         }
     166            1 :         // In the name of keeping this ingestion as fast as possible, we avoid *all*
     167            1 :         // existence checks and synthesize a table metadata with smallest/largest
     168            1 :         // keys that overlap whatever the passed-in span was.
     169            1 :         smallestCopy := slices.Clone(e.StartKey)
     170            1 :         largestCopy := slices.Clone(e.EndKey)
     171            1 :         if e.HasPointKey {
     172            1 :                 // Sequence numbers are updated later by
     173            1 :                 // ingestUpdateSeqNum, which assigns a single sequence number
     174            1 :                 // to all keys in the sstable.
     175            1 :                 if e.EndKeyIsInclusive {
     176            0 :                         meta.ExtendPointKeyBounds(
     177            0 :                                 opts.Comparer.Compare,
     178            0 :                                 base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindMaxForSSTable),
     179            0 :                                 base.MakeInternalKey(largestCopy, 0, 0))
     180            1 :                 } else {
     181            1 :                         meta.ExtendPointKeyBounds(
     182            1 :                                 opts.Comparer.Compare,
     183            1 :                                 base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindMaxForSSTable),
     184            1 :                                 base.MakeRangeDeleteSentinelKey(largestCopy))
     185            1 :                 }
     186              :         }
     187            1 :         if e.HasRangeKey {
     188            1 :                 meta.ExtendRangeKeyBounds(
     189            1 :                         opts.Comparer.Compare,
     190            1 :                         base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeyMax),
     191            1 :                         base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyMin, largestCopy),
     192            1 :                 )
     193            1 :         }
     194              : 
     195            1 :         meta.SyntheticPrefixAndSuffix = sstable.MakeSyntheticPrefixAndSuffix(e.SyntheticPrefix, e.SyntheticSuffix)
     196            1 : 
     197            1 :         return meta, nil
     198              : }
     199              : 
     200              : type rangeKeyIngestValidator struct {
     201              :         // lastRangeKey is the last range key seen in the previous file.
     202              :         lastRangeKey keyspan.Span
     203              :         // comparer, if unset, disables range key validation.
     204              :         comparer *base.Comparer
     205              : }
     206              : 
     207            1 : func disableRangeKeyChecks() rangeKeyIngestValidator {
     208            1 :         return rangeKeyIngestValidator{}
     209            1 : }
     210              : 
     211              : func validateSuffixedBoundaries(
     212              :         cmp *base.Comparer, lastRangeKey keyspan.Span,
     213            1 : ) rangeKeyIngestValidator {
     214            1 :         return rangeKeyIngestValidator{
     215            1 :                 lastRangeKey: lastRangeKey,
     216            1 :                 comparer:     cmp,
     217            1 :         }
     218            1 : }
     219              : 
     220              : // Validate checks whether the stored state of this rangeKeyIngestValidator allows for
     221              : // a file with the given nextFileSmallestKey to be ingested, such that the stored
     222              : // last file's largest range key defragments cleanly with the next file's smallest
     223              : // key if it was suffixed. If a value of nil is passed in for nextFileSmallestKey,
     224              : // that denotes the next file does not have a range key or there is no next file.
     225            1 : func (r *rangeKeyIngestValidator) Validate(nextFileSmallestKey *keyspan.Span) error {
     226            1 :         if r.comparer == nil {
     227            1 :                 return nil
     228            1 :         }
     229            1 :         if r.lastRangeKey.Valid() {
     230            1 :                 if r.comparer.Split.HasSuffix(r.lastRangeKey.End) {
     231            0 :                         if nextFileSmallestKey == nil || !r.comparer.Equal(r.lastRangeKey.End, nextFileSmallestKey.Start) {
     232            0 :                                 // The last range key has a suffix, and no next range key starts exactly at its end key.
     233            0 :                                 return errors.AssertionFailedf("pebble: ingest sstable has suffixed largest range key that does not match the start key of the next sstable: %s",
     234            0 :                                         r.comparer.FormatKey(r.lastRangeKey.End))
     235            0 :                         } else if !keyspan.DefragmentInternal.ShouldDefragment(r.comparer.CompareRangeSuffixes, &r.lastRangeKey, nextFileSmallestKey) {
     236            0 :                                 // The last range key has a suffix, and it doesn't defragment cleanly with this range key.
     237            0 :                                 return errors.AssertionFailedf("pebble: ingest sstable has suffixed range key that won't defragment with next sstable: %s",
     238            0 :                                         r.comparer.FormatKey(r.lastRangeKey.End))
     239            0 :                         }
     240              :                 }
     241            1 :         } else if nextFileSmallestKey != nil && r.comparer.Split.HasSuffix(nextFileSmallestKey.Start) {
     242            0 :                 return errors.Newf("pebble: ingest sstable has suffixed range key start that won't defragment: %s",
     243            0 :                         r.comparer.FormatKey(nextFileSmallestKey.Start))
     244            0 :         }
     245            1 :         return nil
     246              : }
     247              : 
     248              : // ingestLoad1 creates the TableMetadata for one file. This file will be owned
     249              : // by this store.
     250              : //
     251              : // rangeKeyValidator carries the last range key from the previous file. It is used
     252              : // to ensure that the range keys defragment cleanly across files. These checks
     253              : // are disabled if rangeKeyValidator was created by disableRangeKeyChecks().
     254              : func ingestLoad1(
     255              :         ctx context.Context,
     256              :         opts *Options,
     257              :         fmv FormatMajorVersion,
     258              :         readable objstorage.Readable,
     259              :         cacheHandle *cache.Handle,
     260              :         tableNum base.TableNum,
     261              :         rangeKeyValidator rangeKeyIngestValidator,
     262              : ) (
     263              :         meta *manifest.TableMetadata,
     264              :         lastRangeKey keyspan.Span,
     265              :         blockReadStats base.BlockReadStats,
     266              :         err error,
     267            1 : ) {
     268            1 :         o := opts.MakeReaderOptions()
     269            1 :         o.CacheOpts = sstableinternal.CacheOptions{
     270            1 :                 CacheHandle: cacheHandle,
     271            1 :                 FileNum:     base.PhysicalTableDiskFileNum(tableNum),
     272            1 :         }
     273            1 :         r, err := sstable.NewReader(ctx, readable, o)
     274            1 :         if err != nil {
     275            0 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, errors.CombineErrors(err, readable.Close())
     276            0 :         }
     277            1 :         defer func() { _ = r.Close() }()
     278              : 
     279              :         // Avoid ingesting tables with format versions this DB doesn't support.
     280            1 :         tf, err := r.TableFormat()
     281            1 :         if err != nil {
     282            0 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     283            0 :         }
     284            1 :         if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
     285            0 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, errors.Newf(
     286            0 :                         "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)",
     287            0 :                         tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
     288            0 :                 )
     289            0 :         }
     290              : 
     291            1 :         if r.Attributes.Has(sstable.AttributeBlobValues) {
     292            0 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, errors.Newf(
     293            0 :                         "pebble: ingesting tables with blob references is not supported")
     294            0 :         }
     295              : 
     296            1 :         props, err := r.ReadPropertiesBlock(ctx, nil /* buffer pool */)
     297            1 :         if err != nil {
     298            0 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     299            0 :         }
     300              : 
     301              :         // If this is a columnar block, read key schema name from properties block.
     302            1 :         if tf.BlockColumnar() {
     303            1 :                 if _, ok := opts.KeySchemas[props.KeySchemaName]; !ok {
     304            0 :                         return nil, keyspan.Span{}, base.BlockReadStats{}, errors.Newf(
     305            0 :                                 "pebble: table uses key schema %q unknown to the database",
     306            0 :                                 props.KeySchemaName)
     307            0 :                 }
     308              :         }
     309              : 
     310            1 :         meta = &manifest.TableMetadata{}
     311            1 :         meta.TableNum = tableNum
     312            1 :         meta.Size = max(uint64(readable.Size()), 1)
     313            1 :         meta.CreationTime = time.Now().Unix()
     314            1 :         meta.InitPhysicalBacking()
     315            1 : 
     316            1 :         // Avoid loading into the file cache for collecting stats if we
     317            1 :         // don't need to. If there are no range deletions, we have all the
     318            1 :         // information to compute the stats here.
     319            1 :         //
     320            1 :         // This is helpful in tests for avoiding awkwardness around deletion of
     321            1 :         // ingested files from MemFS. MemFS implements the Windows semantics of
     322            1 :         // disallowing removal of an open file. Under MemFS, if we don't populate
     323            1 :         // meta.Stats here, the file will be loaded into the file cache for
     324            1 :         // calculating stats before we can remove the original link.
     325            1 :         maybeSetStatsFromProperties(meta.PhysicalMeta(), &props)
     326            1 : 
     327            1 :         var iterStats base.InternalIteratorStats
     328            1 :         env := sstable.ReadEnv{
     329            1 :                 Block: block.ReadEnv{
     330            1 :                         Stats: &iterStats,
     331            1 :                 },
     332            1 :         }
     333            1 :         {
     334            1 :                 iterOpts := sstable.IterOptions{
     335            1 :                         Lower:                nil,
     336            1 :                         Upper:                nil,
     337            1 :                         Transforms:           sstable.NoTransforms,
     338            1 :                         Filterer:             nil,
     339            1 :                         FilterBlockSizeLimit: sstable.AlwaysUseFilterBlock,
     340            1 :                         Env:                  env,
     341            1 :                         ReaderProvider:       sstable.MakeTrivialReaderProvider(r),
     342            1 :                         BlobContext:          sstable.AssertNoBlobHandles,
     343            1 :                 }
     344            1 :                 iter, err := r.NewPointIter(ctx, iterOpts)
     345            1 :                 if err != nil {
     346            0 :                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     347            0 :                 }
     348            1 :                 defer func() { _ = iter.Close() }()
     349            1 :                 var smallest InternalKey
     350            1 :                 if kv := iter.First(); kv != nil {
     351            1 :                         if err := ingestValidateKey(opts, &kv.K); err != nil {
     352            0 :                                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     353            0 :                         }
     354            1 :                         smallest = kv.K.Clone()
     355              :                 }
     356            1 :                 if err := iter.Error(); err != nil {
     357            0 :                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     358            0 :                 }
     359            1 :                 if kv := iter.Last(); kv != nil {
     360            1 :                         if err := ingestValidateKey(opts, &kv.K); err != nil {
     361            0 :                                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     362            0 :                         }
     363            1 :                         meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, kv.K.Clone())
     364              :                 }
     365            1 :                 if err := iter.Error(); err != nil {
     366            0 :                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     367            0 :                 }
     368              :         }
     369              : 
     370            1 :         iter, err := r.NewRawRangeDelIter(ctx, sstable.NoFragmentTransforms, env)
     371            1 :         if err != nil {
     372            0 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     373            0 :         }
     374            1 :         if iter != nil {
     375            1 :                 defer iter.Close()
     376            1 :                 var smallest InternalKey
     377            1 :                 if s, err := iter.First(); err != nil {
     378            0 :                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     379            1 :                 } else if s != nil {
     380            1 :                         key := s.SmallestKey()
     381            1 :                         if err := ingestValidateKey(opts, &key); err != nil {
     382            0 :                                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     383            0 :                         }
     384            1 :                         smallest = key.Clone()
     385              :                 }
     386            1 :                 if s, err := iter.Last(); err != nil {
     387            0 :                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     388            1 :                 } else if s != nil {
     389            1 :                         k := s.SmallestKey()
     390            1 :                         if err := ingestValidateKey(opts, &k); err != nil {
     391            0 :                                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     392            0 :                         }
     393            1 :                         largest := s.LargestKey().Clone()
     394            1 :                         meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
     395              :                 }
     396              :         }
     397              : 
     398              :         // Update the range-key bounds for the table.
     399            1 :         {
     400            1 :                 iter, err := r.NewRawRangeKeyIter(ctx, sstable.NoFragmentTransforms, env)
     401            1 :                 if err != nil {
     402            0 :                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     403            0 :                 }
     404            1 :                 if iter != nil {
     405            1 :                         defer iter.Close()
     406            1 :                         var smallest InternalKey
     407            1 :                         if s, err := iter.First(); err != nil {
     408            0 :                                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     409            1 :                         } else if s != nil {
     410            1 :                                 key := s.SmallestKey()
     411            1 :                                 if err := ingestValidateKey(opts, &key); err != nil {
     412            0 :                                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     413            0 :                                 }
     414            1 :                                 smallest = key.Clone()
     415            1 :                                 // Range keys need some additional validation as we need to ensure they
     416            1 :                                 // defragment cleanly with the lastRangeKey from the previous file.
     417            1 :                                 if err := rangeKeyValidator.Validate(s); err != nil {
     418            0 :                                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     419            0 :                                 }
     420              :                         }
     421            1 :                         lastRangeKey = keyspan.Span{}
     422            1 :                         if s, err := iter.Last(); err != nil {
     423            0 :                                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     424            1 :                         } else if s != nil {
     425            1 :                                 k := s.SmallestKey()
     426            1 :                                 if err := ingestValidateKey(opts, &k); err != nil {
     427            0 :                                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     428            0 :                                 }
     429              :                                 // As range keys are fragmented, the end key of the last range key in
     430              :                                 // the table provides the upper bound for the table.
     431            1 :                                 largest := s.LargestKey().Clone()
     432            1 :                                 meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
     433            1 :                                 lastRangeKey = s.Clone()
     434            0 :                         } else {
     435            0 :                                 // s == nil.
     436            0 :                                 if err := rangeKeyValidator.Validate(nil /* nextFileSmallestKey */); err != nil {
     437            0 :                                         return nil, keyspan.Span{}, base.BlockReadStats{}, err
     438            0 :                                 }
     439              :                         }
     440            1 :                 } else {
     441            1 :                         if err := rangeKeyValidator.Validate(nil /* nextFileSmallestKey */); err != nil {
     442            0 :                                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     443            0 :                         }
     444            1 :                         lastRangeKey = keyspan.Span{}
     445              :                 }
     446              :         }
     447              : 
     448            1 :         if !meta.HasPointKeys && !meta.HasRangeKeys {
     449            1 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, nil
     450            1 :         }
     451              : 
     452              :         // Sanity check that the various bounds on the file were set consistently.
     453            1 :         if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
     454            0 :                 return nil, keyspan.Span{}, base.BlockReadStats{}, err
     455            0 :         }
     456              : 
     457            1 :         return meta, lastRangeKey, iterStats.TotalBlockReads(), nil
     458              : }
     459              : 
     460              : type ingestLoadResult struct {
     461              :         local    []ingestLocalMeta
     462              :         shared   []ingestSharedMeta
     463              :         external []ingestExternalMeta
     464              : 
     465              :         externalFilesHaveLevel bool
     466              :         blockReadStats         base.BlockReadStats
     467              : }
     468              : 
     469              : type ingestLocalMeta struct {
     470              :         *manifest.TableMetadata
     471              :         path string
     472              : }
     473              : 
     474              : type ingestSharedMeta struct {
     475              :         *manifest.TableMetadata
     476              :         shared SharedSSTMeta
     477              : }
     478              : 
     479              : type ingestExternalMeta struct {
     480              :         *manifest.TableMetadata
     481              :         external ExternalFile
     482              :         // usedExistingBacking is true if the external file is reusing a backing
     483              :         // that existed before this ingestion. In this case, we called
     484              :         // VirtualBackings.Protect() on that backing; we will need to call
     485              :         // Unprotect() after the ingestion.
     486              :         usedExistingBacking bool
     487              : }
     488              : 
     489            1 : func (r *ingestLoadResult) fileCount() int {
     490            1 :         return len(r.local) + len(r.shared) + len(r.external)
     491            1 : }
     492              : 
     493              : func ingestLoad(
     494              :         ctx context.Context,
     495              :         opts *Options,
     496              :         fmv FormatMajorVersion,
     497              :         paths []string,
     498              :         shared []SharedSSTMeta,
     499              :         external []ExternalFile,
     500              :         cacheHandle *cache.Handle,
     501              :         pending []base.TableNum,
     502            1 : ) (ingestLoadResult, error) {
     503            1 :         localFileNums := pending[:len(paths)]
     504            1 :         sharedFileNums := pending[len(paths) : len(paths)+len(shared)]
     505            1 :         externalFileNums := pending[len(paths)+len(shared) : len(paths)+len(shared)+len(external)]
     506            1 : 
     507            1 :         var result ingestLoadResult
     508            1 :         result.local = make([]ingestLocalMeta, 0, len(paths))
     509            1 :         var lastRangeKey keyspan.Span
     510            1 :         var blockReadStats base.BlockReadStats
     511            1 :         // NB: we disable range key boundary assertions if we have shared or external files
     512            1 :         // present in this ingestion. This is because a suffixed range key in a local file
     513            1 :         // can possibly defragment with a suffixed range key in a shared or external file.
     514            1 :         // We also disable range key boundary assertions if we have CreateOnShared set to
     515            1 :         // true, as that means we could have suffixed RangeKeyDels or Unsets in the local
     516            1 :         // files that won't ever be surfaced, even if there are no shared or external files
     517            1 :         // in the ingestion.
     518            1 :         shouldDisableRangeKeyChecks := len(shared) > 0 || len(external) > 0 || opts.Experimental.CreateOnShared != remote.CreateOnSharedNone
     519            1 :         for i := range paths {
     520            1 :                 f, err := opts.FS.Open(paths[i])
     521            1 :                 if err != nil {
     522            0 :                         return ingestLoadResult{}, err
     523            0 :                 }
     524              : 
     525            1 :                 readable, err := sstable.NewSimpleReadable(f)
     526            1 :                 if err != nil {
     527            0 :                         return ingestLoadResult{}, err
     528            0 :                 }
     529            1 :                 var m *manifest.TableMetadata
     530            1 :                 rangeKeyValidator := disableRangeKeyChecks()
     531            1 :                 if !shouldDisableRangeKeyChecks {
     532            1 :                         rangeKeyValidator = validateSuffixedBoundaries(opts.Comparer, lastRangeKey)
     533            1 :                 }
     534            1 :                 m, lastRangeKey, blockReadStats, err = ingestLoad1(ctx, opts, fmv, readable, cacheHandle, localFileNums[i], rangeKeyValidator)
     535            1 :                 if err != nil {
     536            0 :                         return ingestLoadResult{}, err
     537            0 :                 }
     538            1 :                 if m != nil {
     539            1 :                         result.local = append(result.local, ingestLocalMeta{
     540            1 :                                 TableMetadata: m,
     541            1 :                                 path:          paths[i],
     542            1 :                         })
     543            1 :                         result.blockReadStats = blockReadStats
     544            1 :                 }
     545              :         }
     546              : 
     547            1 :         if !shouldDisableRangeKeyChecks {
     548            1 :                 rangeKeyValidator := validateSuffixedBoundaries(opts.Comparer, lastRangeKey)
     549            1 :                 if err := rangeKeyValidator.Validate(nil /* nextFileSmallestKey */); err != nil {
     550            0 :                         return ingestLoadResult{}, err
     551            0 :                 }
     552              :         }
     553              : 
     554              :         // Sort the shared files according to level.
     555            1 :         sort.Sort(sharedByLevel(shared))
     556            1 : 
     557            1 :         result.shared = make([]ingestSharedMeta, 0, len(shared))
     558            1 :         for i := range shared {
     559            1 :                 m, err := ingestSynthesizeShared(opts, shared[i], sharedFileNums[i])
     560            1 :                 if err != nil {
     561            0 :                         return ingestLoadResult{}, err
     562            0 :                 }
     563            1 :                 if shared[i].Level < sharedLevelsStart {
     564            0 :                         return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart")
     565            0 :                 }
     566            1 :                 result.shared = append(result.shared, ingestSharedMeta{
     567            1 :                         TableMetadata: m,
     568            1 :                         shared:        shared[i],
     569            1 :                 })
     570              :         }
     571            1 :         result.external = make([]ingestExternalMeta, 0, len(external))
     572            1 :         for i := range external {
     573            1 :                 m, err := ingestLoad1External(opts, external[i], externalFileNums[i])
     574            1 :                 if err != nil {
     575            0 :                         return ingestLoadResult{}, err
     576            0 :                 }
     577            1 :                 result.external = append(result.external, ingestExternalMeta{
     578            1 :                         TableMetadata: m,
     579            1 :                         external:      external[i],
     580            1 :                 })
     581            1 :                 if external[i].Level > 0 {
     582            0 :                         if i != 0 && !result.externalFilesHaveLevel {
     583            0 :                                 return ingestLoadResult{}, base.AssertionFailedf("pebble: external sstables must all have level set or unset")
     584            0 :                         }
     585            0 :                         result.externalFilesHaveLevel = true
     586            1 :                 } else if result.externalFilesHaveLevel {
     587            0 :                         return ingestLoadResult{}, base.AssertionFailedf("pebble: external sstables must all have level set or unset")
     588            0 :                 }
     589              :         }
     590            1 :         return result, nil
     591              : }
     592              : 
     593            1 : func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error {
     594            1 :         // Verify that all the shared files (i.e. files in sharedMeta)
     595            1 :         // fit within the exciseSpan.
     596            1 :         for _, f := range lr.shared {
     597            1 :                 if !exciseSpan.Contains(cmp, f.Smallest()) || !exciseSpan.Contains(cmp, f.Largest()) {
     598            0 :                         return errors.Newf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String())
     599            0 :                 }
     600              :         }
     601              : 
     602            1 :         if lr.externalFilesHaveLevel {
     603            0 :                 for _, f := range lr.external {
     604            0 :                         if !exciseSpan.Contains(cmp, f.Smallest()) || !exciseSpan.Contains(cmp, f.Largest()) {
     605            0 :                                 return base.AssertionFailedf("pebble: external file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String())
     606            0 :                         }
     607              :                 }
     608              :         }
     609              : 
     610            1 :         if len(lr.external) > 0 {
     611            1 :                 if len(lr.shared) > 0 {
     612            0 :                         // If external files are present alongside shared files,
     613            0 :                         // return an error.
     614            0 :                         return base.AssertionFailedf("pebble: external files cannot be ingested atomically alongside shared files")
     615            0 :                 }
     616              : 
     617              :                 // Sort according to the smallest key.
     618            1 :                 slices.SortFunc(lr.external, func(a, b ingestExternalMeta) int {
     619            1 :                         return cmp(a.Smallest().UserKey, b.Smallest().UserKey)
     620            1 :                 })
     621            1 :                 for i := 1; i < len(lr.external); i++ {
     622            1 :                         if sstableKeyCompare(cmp, lr.external[i-1].Largest(), lr.external[i].Smallest()) >= 0 {
     623            0 :                                 return errors.Newf("pebble: external sstables have overlapping ranges")
     624            0 :                         }
     625              :                 }
     626            1 :                 return nil
     627              :         }
     628            1 :         if len(lr.local) <= 1 {
     629            1 :                 return nil
     630            1 :         }
     631              : 
     632              :         // Sort according to the smallest key.
     633            1 :         slices.SortFunc(lr.local, func(a, b ingestLocalMeta) int {
     634            1 :                 return cmp(a.Smallest().UserKey, b.Smallest().UserKey)
     635            1 :         })
     636              : 
     637            1 :         for i := 1; i < len(lr.local); i++ {
     638            1 :                 if sstableKeyCompare(cmp, lr.local[i-1].Largest(), lr.local[i].Smallest()) >= 0 {
     639            1 :                         return errors.Newf("pebble: local ingestion sstables have overlapping ranges")
     640            1 :                 }
     641              :         }
     642            1 :         if len(lr.shared) == 0 {
     643            1 :                 return nil
     644            1 :         }
     645            0 :         filesInLevel := make([]*manifest.TableMetadata, 0, len(lr.shared))
     646            0 :         for l := sharedLevelsStart; l < numLevels; l++ {
     647            0 :                 filesInLevel = filesInLevel[:0]
     648            0 :                 for i := range lr.shared {
     649            0 :                         if lr.shared[i].shared.Level == uint8(l) {
     650            0 :                                 filesInLevel = append(filesInLevel, lr.shared[i].TableMetadata)
     651            0 :                         }
     652              :                 }
     653            0 :                 for i := range lr.external {
     654            0 :                         if lr.external[i].external.Level == uint8(l) {
     655            0 :                                 filesInLevel = append(filesInLevel, lr.external[i].TableMetadata)
     656            0 :                         }
     657              :                 }
     658            0 :                 slices.SortFunc(filesInLevel, func(a, b *manifest.TableMetadata) int {
     659            0 :                         return cmp(a.Smallest().UserKey, b.Smallest().UserKey)
     660            0 :                 })
     661            0 :                 for i := 1; i < len(filesInLevel); i++ {
     662            0 :                         if sstableKeyCompare(cmp, filesInLevel[i-1].Largest(), filesInLevel[i].Smallest()) >= 0 {
     663            0 :                                 return base.AssertionFailedf("pebble: external shared sstables have overlapping ranges")
     664            0 :                         }
     665              :                 }
     666              :         }
     667            0 :         return nil
     668              : }
     669              : 
     670            0 : func ingestCleanup(objProvider objstorage.Provider, meta []ingestLocalMeta) error {
     671            0 :         var firstErr error
     672            0 :         for i := range meta {
     673            0 :                 if err := objProvider.Remove(base.FileTypeTable, meta[i].TableBacking.DiskFileNum); err != nil {
     674            0 :                         firstErr = firstError(firstErr, err)
     675            0 :                 }
     676              :         }
     677            0 :         return firstErr
     678              : }
     679              : 
     680              : // ingestLinkLocal creates new objects which are backed by either hardlinks to or
     681              : // copies of the ingested files.
     682              : func ingestLinkLocal(
     683              :         ctx context.Context,
     684              :         jobID JobID,
     685              :         opts *Options,
     686              :         objProvider objstorage.Provider,
     687              :         localMetas []ingestLocalMeta,
     688            1 : ) error {
     689            1 :         for i := range localMetas {
     690            1 :                 objMeta, err := objProvider.LinkOrCopyFromLocal(
     691            1 :                         ctx, opts.FS, localMetas[i].path, base.FileTypeTable, localMetas[i].TableBacking.DiskFileNum,
     692            1 :                         objstorage.CreateOptions{PreferSharedStorage: true},
     693            1 :                 )
     694            1 :                 if err != nil {
     695            0 :                         if err2 := ingestCleanup(objProvider, localMetas[:i]); err2 != nil {
     696            0 :                                 opts.Logger.Errorf("ingest cleanup failed: %v", err2)
     697            0 :                         }
     698            0 :                         return err
     699              :                 }
     700            1 :                 if opts.EventListener.TableCreated != nil {
     701            1 :                         opts.EventListener.TableCreated(TableCreateInfo{
     702            1 :                                 JobID:   int(jobID),
     703            1 :                                 Reason:  "ingesting",
     704            1 :                                 Path:    objProvider.Path(objMeta),
     705            1 :                                 FileNum: base.PhysicalTableDiskFileNum(localMetas[i].TableNum),
     706            1 :                         })
     707            1 :                 }
     708              :         }
     709            1 :         return nil
     710              : }
     711              : 
     712              : // ingestAttachRemote attaches remote objects to the storage provider.
     713              : //
     714              : // For external objects, we reuse existing FileBackings from the current version
     715              : // when possible.
     716              : //
     717              : // ingestUnprotectExternalBackings() must be called after this function (even in
     718              : // error cases).
     719            1 : func (d *DB) ingestAttachRemote(jobID JobID, lr ingestLoadResult) error {
     720            1 :         remoteObjs := make([]objstorage.RemoteObjectToAttach, 0, len(lr.shared)+len(lr.external))
     721            1 :         for i := range lr.shared {
     722            1 :                 backing, err := lr.shared[i].shared.Backing.Get()
     723            1 :                 if err != nil {
     724            0 :                         return err
     725            0 :                 }
     726            1 :                 remoteObjs = append(remoteObjs, objstorage.RemoteObjectToAttach{
     727            1 :                         FileNum:  lr.shared[i].TableBacking.DiskFileNum,
     728            1 :                         FileType: base.FileTypeTable,
     729            1 :                         Backing:  backing,
     730            1 :                 })
     731              :         }
     732              : 
     733            1 :         d.findExistingBackingsForExternalObjects(lr.external)
     734            1 : 
     735            1 :         newTableBackings := make(map[remote.ObjectKey]*manifest.TableBacking, len(lr.external))
     736            1 :         for i := range lr.external {
     737            1 :                 meta := lr.external[i].TableMetadata
     738            1 :                 if meta.TableBacking != nil {
     739            1 :                         // The backing was filled in by findExistingBackingsForExternalObjects().
     740            1 :                         continue
     741              :                 }
     742            1 :                 key := remote.MakeObjectKey(lr.external[i].external.Locator, lr.external[i].external.ObjName)
     743            1 :                 if backing, ok := newTableBackings[key]; ok {
     744            1 :                         // We already created the same backing in this loop. Update its size.
     745            1 :                         backing.Size += lr.external[i].external.Size
     746            1 :                         meta.AttachVirtualBacking(backing)
     747            1 :                         continue
     748              :                 }
     749            1 :                 providerBacking, err := d.objProvider.CreateExternalObjectBacking(key.Locator, key.ObjectName)
     750            1 :                 if err != nil {
     751            0 :                         return err
     752            0 :                 }
     753              :                 // We have to attach the remote object (and assign it a DiskFileNum). For
     754              :                 // simplicity, we use the same number for both the FileNum and the
     755              :                 // DiskFileNum (even though this is a virtual sstable).
     756            1 :                 size := max(lr.external[i].external.Size, 1)
     757            1 :                 meta.InitVirtualBacking(base.DiskFileNum(meta.TableNum), size)
     758            1 : 
     759            1 :                 // Set the underlying TableBacking's size to the same size as the virtualized
     760            1 :                 // view of the sstable. This ensures that we don't over-prioritize this
     761            1 :                 // sstable for compaction just yet, as we do not have a clear sense of
     762            1 :                 // what parts of this sstable are referenced by other nodes.
     763            1 :                 meta.TableBacking.Size = size
     764            1 :                 newTableBackings[key] = meta.TableBacking
     765            1 : 
     766            1 :                 remoteObjs = append(remoteObjs, objstorage.RemoteObjectToAttach{
     767            1 :                         FileNum:  meta.TableBacking.DiskFileNum,
     768            1 :                         FileType: base.FileTypeTable,
     769            1 :                         Backing:  providerBacking,
     770            1 :                 })
     771              :         }
     772              : 
     773            1 :         for i := range lr.external {
     774            1 :                 if err := lr.external[i].Validate(d.opts.Comparer.Compare, d.opts.Comparer.FormatKey); err != nil {
     775            0 :                         return err
     776            0 :                 }
     777              :         }
     778              : 
     779            1 :         remoteObjMetas, err := d.objProvider.AttachRemoteObjects(remoteObjs)
     780            1 :         if err != nil {
     781            0 :                 return err
     782            0 :         }
     783              : 
     784            1 :         for i := range lr.shared {
     785            1 :                 // One corner case around file sizes we need to be mindful of, is that
     786            1 :                 // if one of the shareObjs was initially created by us (and has boomeranged
     787            1 :                 // back from another node), we'll need to update the TableBacking's size
     788            1 :                 // to be the true underlying size. Otherwise, we could hit errors when we
     789            1 :                 // open the db again after a crash/restart (see checkConsistency in open.go),
     790            1 :                 // plus it more accurately allows us to prioritize compactions of files
     791            1 :                 // that were originally created by us.
     792            1 :                 if remoteObjMetas[i].IsShared() && !d.objProvider.IsSharedForeign(remoteObjMetas[i]) {
     793            1 :                         size, err := d.objProvider.Size(remoteObjMetas[i])
     794            1 :                         if err != nil {
     795            0 :                                 return err
     796            0 :                         }
     797            1 :                         lr.shared[i].TableBacking.Size = max(uint64(size), 1)
     798              :                 }
     799              :         }
     800              : 
     801            1 :         if d.opts.EventListener.TableCreated != nil {
     802            1 :                 for i := range remoteObjMetas {
     803            1 :                         d.opts.EventListener.TableCreated(TableCreateInfo{
     804            1 :                                 JobID:   int(jobID),
     805            1 :                                 Reason:  "ingesting",
     806            1 :                                 Path:    d.objProvider.Path(remoteObjMetas[i]),
     807            1 :                                 FileNum: remoteObjMetas[i].DiskFileNum,
     808            1 :                         })
     809            1 :                 }
     810              :         }
     811              : 
     812            1 :         return nil
     813              : }
     814              : 
     815              : // findExistingBackingsForExternalObjects populates the TableBacking for external
     816              : // files which are already in use by the current version.
     817              : //
     818              : // We take a Ref and LatestRef on populated backings.
     819            1 : func (d *DB) findExistingBackingsForExternalObjects(metas []ingestExternalMeta) {
     820            1 :         d.mu.Lock()
     821            1 :         defer d.mu.Unlock()
     822            1 : 
     823            1 :         for i := range metas {
     824            1 :                 diskFileNums := d.objProvider.GetExternalObjects(metas[i].external.Locator, metas[i].external.ObjName)
     825            1 :                 // We cross-check against fileBackings in the current version because it is
     826            1 :                 // possible that the external object is referenced by an sstable which only
     827            1 :                 // exists in a previous version. In that case, that object could be removed
     828            1 :                 // at any time so we cannot reuse it.
     829            1 :                 for _, n := range diskFileNums {
     830            1 :                         if backing, ok := d.mu.versions.latest.virtualBackings.Get(n); ok {
     831            1 :                                 // Protect this backing from being removed from the latest version. We
     832            1 :                                 // will unprotect in ingestUnprotectExternalBackings.
     833            1 :                                 d.mu.versions.latest.virtualBackings.Protect(n)
     834            1 :                                 metas[i].usedExistingBacking = true
     835            1 :                                 metas[i].AttachVirtualBacking(backing)
     836            1 : 
     837            1 :                                 // We can't update the size of the backing here, so make sure the
     838            1 :                                 // virtual size is sane.
     839            1 :                                 // TODO(radu): investigate what would it take to update the backing size.
     840            1 :                                 metas[i].Size = min(metas[i].Size, backing.Size)
     841            1 :                                 break
     842              :                         }
     843              :                 }
     844              :         }
     845              : }
     846              : 
     847              : // ingestUnprotectExternalBackings unprotects the file backings that were reused
     848              : // for external objects when the ingestion fails.
     849            1 : func (d *DB) ingestUnprotectExternalBackings(lr ingestLoadResult) {
     850            1 :         d.mu.Lock()
     851            1 :         defer d.mu.Unlock()
     852            1 : 
     853            1 :         for _, meta := range lr.external {
     854            1 :                 if meta.usedExistingBacking {
     855            1 :                         // If the backing is not use anywhere else and the ingest failed (or the
     856            1 :                         // ingested tables were already compacted away), this call will cause in
     857            1 :                         // the next version update to remove the backing.
     858            1 :                         d.mu.versions.latest.virtualBackings.Unprotect(meta.TableBacking.DiskFileNum)
     859            1 :                 }
     860              :         }
     861              : }
     862              : 
     863              : func setSeqNumInMetadata(
     864              :         m *manifest.TableMetadata, seqNum base.SeqNum, cmp Compare, format base.FormatKey,
     865            1 : ) error {
     866            1 :         setSeqFn := func(k base.InternalKey) base.InternalKey {
     867            1 :                 return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
     868            1 :         }
     869              :         // NB: we set the fields directly here, rather than via their Extend*
     870              :         // methods, as we are updating sequence numbers.
     871            1 :         if m.HasPointKeys {
     872            1 :                 m.PointKeyBounds.SetSmallest(setSeqFn(m.PointKeyBounds.Smallest()))
     873            1 :         }
     874            1 :         if m.HasRangeKeys {
     875            1 :                 m.RangeKeyBounds.SetSmallest(setSeqFn(m.RangeKeyBounds.Smallest()))
     876            1 :         }
     877              :         // Only update the seqnum for the largest key if that key is not an
     878              :         // "exclusive sentinel" (i.e. a range deletion sentinel or a range key
     879              :         // boundary), as doing so effectively drops the exclusive sentinel (by
     880              :         // lowering the seqnum from the max value), and extends the bounds of the
     881              :         // table.
     882              :         // NB: as the largest range key is always an exclusive sentinel, it is never
     883              :         // updated.
     884            1 :         if m.HasPointKeys && !m.PointKeyBounds.Largest().IsExclusiveSentinel() {
     885            1 :                 m.PointKeyBounds.SetLargest(setSeqFn(m.PointKeyBounds.Largest()))
     886            1 :         }
     887              :         // Setting smallestSeqNum == largestSeqNum triggers the setting of
     888              :         // Properties.GlobalSeqNum when an sstable is loaded.
     889            1 :         m.SmallestSeqNum = seqNum
     890            1 :         m.LargestSeqNum = seqNum
     891            1 :         m.LargestSeqNumAbsolute = seqNum
     892            1 :         // Ensure the new bounds are consistent.
     893            1 :         if err := m.Validate(cmp, format); err != nil {
     894            0 :                 return err
     895            0 :         }
     896            1 :         return nil
     897              : }
     898              : 
     899              : func ingestUpdateSeqNum(
     900              :         cmp Compare, format base.FormatKey, seqNum base.SeqNum, loadResult ingestLoadResult,
     901            1 : ) error {
     902            1 :         // Shared sstables are required to be sorted by level ascending. We then
     903            1 :         // iterate the shared sstables in reverse, assigning the lower sequence
     904            1 :         // numbers to the shared sstables that will be ingested into the lower
     905            1 :         // (larger numbered) levels first. This ensures sequence number shadowing is
     906            1 :         // correct.
     907            1 :         for i := len(loadResult.shared) - 1; i >= 0; i-- {
     908            1 :                 if i-1 >= 0 && loadResult.shared[i-1].shared.Level > loadResult.shared[i].shared.Level {
     909            0 :                         panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.shared[i-1], loadResult.shared[i]))
     910              :                 }
     911            1 :                 if err := setSeqNumInMetadata(loadResult.shared[i].TableMetadata, seqNum, cmp, format); err != nil {
     912            0 :                         return err
     913            0 :                 }
     914            1 :                 seqNum++
     915              :         }
     916            1 :         for i := range loadResult.external {
     917            1 :                 if err := setSeqNumInMetadata(loadResult.external[i].TableMetadata, seqNum, cmp, format); err != nil {
     918            0 :                         return err
     919            0 :                 }
     920            1 :                 seqNum++
     921              :         }
     922            1 :         for i := range loadResult.local {
     923            1 :                 if err := setSeqNumInMetadata(loadResult.local[i].TableMetadata, seqNum, cmp, format); err != nil {
     924            0 :                         return err
     925            0 :                 }
     926            1 :                 seqNum++
     927              :         }
     928            1 :         return nil
     929              : }
     930              : 
     931              : // ingestTargetLevel returns the target level for a file being ingested.
     932              : // If suggestSplit is true, it accounts for ingest-time splitting as part of
     933              : // its target level calculation, and if a split candidate is found, that file
     934              : // is returned as the splitFile.
     935              : func ingestTargetLevel(
     936              :         ctx context.Context,
     937              :         cmp base.Compare,
     938              :         lsmOverlap overlap.WithLSM,
     939              :         baseLevel int,
     940              :         compactions map[compaction]struct{},
     941              :         meta *manifest.TableMetadata,
     942              :         suggestSplit bool,
     943            1 : ) (targetLevel int, splitFile *manifest.TableMetadata, err error) {
     944            1 :         // Find the lowest level which does not have any files which overlap meta. We
     945            1 :         // search from L0 to L6 looking for whether there are any files in the level
     946            1 :         // which overlap meta. We want the "lowest" level (where lower means
     947            1 :         // increasing level number) in order to reduce write amplification.
     948            1 :         //
     949            1 :         // There are 2 kinds of overlap we need to check for: file boundary overlap
     950            1 :         // and data overlap. Data overlap implies file boundary overlap. Note that it
     951            1 :         // is always possible to ingest into L0.
     952            1 :         //
     953            1 :         // To place meta at level i where i > 0:
     954            1 :         // - there must not be any data overlap with levels <= i, since that will
     955            1 :         //   violate the sequence number invariant.
     956            1 :         // - no file boundary overlap with level i, since that will violate the
     957            1 :         //   invariant that files do not overlap in levels i > 0.
     958            1 :         //   - if there is only a file overlap at a given level, and no data overlap,
     959            1 :         //     we can still slot a file at that level. We return the TableMetadata with
     960            1 :         //     which we have file boundary overlap (must be only one file, as sstable
     961            1 :         //     bounds are usually tight on user keys) and the caller is expected to split
     962            1 :         //     that sstable into two virtual sstables, allowing this file to go into that
     963            1 :         //     level. Note that if we have file boundary overlap with two files, which
     964            1 :         //     should only happen on rare occasions, we treat it as data overlap and
     965            1 :         //     don't use this optimization.
     966            1 :         //
     967            1 :         // The file boundary overlap check is simpler to conceptualize. Consider the
     968            1 :         // following example, in which the ingested file lies completely before or
     969            1 :         // after the file being considered.
     970            1 :         //
     971            1 :         //   |--|           |--|  ingested file: [a,b] or [f,g]
     972            1 :         //         |-----|        existing file: [c,e]
     973            1 :         //  _____________________
     974            1 :         //   a  b  c  d  e  f  g
     975            1 :         //
     976            1 :         // In both cases the ingested file can move to considering the next level.
     977            1 :         //
     978            1 :         // File boundary overlap does not necessarily imply data overlap. The check
     979            1 :         // for data overlap is a little more nuanced. Consider the following examples:
     980            1 :         //
     981            1 :         //  1. No data overlap:
     982            1 :         //
     983            1 :         //          |-|   |--|    ingested file: [cc-d] or [ee-ff]
     984            1 :         //  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
     985            1 :         //  _____________________
     986            1 :         //   a  b  c  d  e  f  g
     987            1 :         //
     988            1 :         // In this case the ingested files can "fall through" this level. The checks
     989            1 :         // continue at the next level.
     990            1 :         //
     991            1 :         //  2. Data overlap:
     992            1 :         //
     993            1 :         //            |--|        ingested file: [d-e]
     994            1 :         //  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
     995            1 :         //  _____________________
     996            1 :         //   a  b  c  d  e  f  g
     997            1 :         //
     998            1 :         // In this case the file cannot be ingested into this level as the point 'dd'
     999            1 :         // is in the way.
    1000            1 :         //
    1001            1 :         // It is worth noting that the check for data overlap is only approximate. In
    1002            1 :         // the previous example, the ingested table [d-e] could contain only the
    1003            1 :         // points 'd' and 'e', in which case the table would be eligible for
    1004            1 :         // considering lower levels. However, such a fine-grained check would need to
    1005            1 :         // be exhaustive (comparing points and ranges in both the ingested and existing
    1006            1 :         // tables) and such a check is prohibitively expensive. Thus Pebble treats any
    1007            1 :         // existing point that falls within the ingested table bounds as being "data
    1008            1 :         // overlap".
    1009            1 : 
    1010            1 :         if lsmOverlap[0].Result == overlap.Data {
    1011            1 :                 return 0, nil, nil
    1012            1 :         }
    1013            1 :         targetLevel = 0
    1014            1 :         splitFile = nil
    1015            1 :         metaBounds := meta.UserKeyBounds()
    1016            1 :         for level := baseLevel; level < numLevels; level++ {
    1017            1 :                 var candidateSplitFile *manifest.TableMetadata
    1018            1 :                 switch lsmOverlap[level].Result {
    1019            1 :                 case overlap.Data:
    1020            1 :                         // We cannot ingest into or under this level; return the best target level
    1021            1 :                         // so far.
    1022            1 :                         return targetLevel, splitFile, nil
    1023              : 
    1024            1 :                 case overlap.OnlyBoundary:
    1025            1 :                         if !suggestSplit || lsmOverlap[level].SplitFile == nil {
    1026            1 :                                 // We can ingest under this level, but not into this level.
    1027            1 :                                 continue
    1028              :                         }
    1029              :                         // We can ingest into this level if we split this file.
    1030            1 :                         candidateSplitFile = lsmOverlap[level].SplitFile
    1031              : 
    1032            1 :                 case overlap.None:
    1033              :                 // We can ingest into this level.
    1034              : 
    1035            0 :                 default:
    1036            0 :                         return 0, nil, base.AssertionFailedf("unexpected WithLevel.Result: %v", lsmOverlap[level].Result)
    1037              :                 }
    1038              : 
    1039              :                 // Check boundary overlap with any ongoing compactions. We consider an
    1040              :                 // overlapping compaction that's writing files to an output level as
    1041              :                 // equivalent to boundary overlap with files in that output level.
    1042              :                 //
    1043              :                 // We cannot check for data overlap with the new SSTs compaction will produce
    1044              :                 // since compaction hasn't been done yet. However, there's no need to check
    1045              :                 // since all keys in them will be from levels in [c.startLevel,
    1046              :                 // c.outputLevel], and all those levels have already had their data overlap
    1047              :                 // tested negative (else we'd have returned earlier).
    1048              :                 //
    1049              :                 // An alternative approach would be to cancel these compactions and proceed
    1050              :                 // with an ingest-time split on this level if necessary. However, compaction
    1051              :                 // cancellation can result in significant wasted effort and is best avoided
    1052              :                 // unless necessary.
    1053            1 :                 overlaps := false
    1054            1 :                 for c := range compactions {
    1055            1 :                         tblCompaction, ok := c.(*tableCompaction)
    1056            1 :                         if !ok {
    1057            1 :                                 continue
    1058              :                         }
    1059            1 :                         if tblCompaction.outputLevel == nil || level != tblCompaction.outputLevel.level {
    1060            1 :                                 continue
    1061              :                         }
    1062            1 :                         bounds := tblCompaction.Bounds()
    1063            1 :                         if bounds != nil && metaBounds.Overlaps(cmp, bounds) {
    1064            1 :                                 overlaps = true
    1065            1 :                                 break
    1066              :                         }
    1067              :                 }
    1068            1 :                 if !overlaps {
    1069            1 :                         targetLevel = level
    1070            1 :                         splitFile = candidateSplitFile
    1071            1 :                 }
    1072              :         }
    1073            1 :         return targetLevel, splitFile, nil
    1074              : }
    1075              : 
    1076              : // Ingest ingests a set of sstables into the DB. Ingestion of the files is
    1077              : // atomic and semantically equivalent to creating a single batch containing all
    1078              : // of the mutations in the sstables. Ingestion may require the memtable to be
    1079              : // flushed. The ingested sstable files are moved into the DB and must reside on
    1080              : // the same filesystem as the DB. Sstables can be created for ingestion using
    1081              : // sstable.Writer. On success, Ingest removes the input paths.
    1082              : //
    1083              : // Ingested sstables must have been created with a known KeySchema (when written
    1084              : // with columnar blocks) and Comparer. They must not contain any references to
    1085              : // external blob files.
    1086              : //
    1087              : // Two types of sstables are accepted for ingestion: one is sstables present
    1088              : // in the instance's vfs.FS and can be referenced locally. The other is sstables
    1089              : // present in remote.Storage, referred to as shared or foreign sstables. These
    1090              : // shared sstables can be linked through objstorageprovider.Provider, and do not
    1091              : // need to already be present on the local vfs.FS. Foreign sstables must all fit
    1092              : // in an excise span, and are destined for a level specified in SharedSSTMeta.
    1093              : //
    1094              : // All sstables *must* be Sync()'d by the caller after all bytes are written
    1095              : // and before its file handle is closed; failure to do so could violate
    1096              : // durability or lead to corrupted on-disk state. This method cannot, in a
    1097              : // platform-and-FS-agnostic way, ensure that all sstables in the input are
    1098              : // properly synced to disk. Opening new file handles and Sync()-ing them
    1099              : // does not always guarantee durability; see the discussion here on that:
    1100              : // https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
    1101              : //
    1102              : // Ingestion loads each sstable into the lowest level of the LSM which it
    1103              : // doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
    1104              : // ingestion forces the memtable to flush, and then waits for the flush to
    1105              : // occur. In some cases, such as with no foreign sstables and no excise span,
    1106              : // ingestion that gets blocked on a memtable can join the flushable queue and
    1107              : // finish even before the memtable has been flushed.
    1108              : //
    1109              : // The steps for ingestion are:
    1110              : //
    1111              : //  1. Allocate table numbers for every sstable being ingested.
    1112              : //  2. Load the metadata for all sstables being ingested.
    1113              : //  3. Sort the sstables by smallest key, verifying non overlap (for local
    1114              : //     sstables).
    1115              : //  4. Hard link (or copy) the local sstables into the DB directory.
    1116              : //  5. Allocate a sequence number to use for all of the entries in the
    1117              : //     local sstables. This is the step where overlap with memtables is
    1118              : //     determined. If there is overlap, we remember the most recent memtable
    1119              : //     that overlaps.
    1120              : //  6. Update the sequence number in the ingested local sstables. (Remote
    1121              : //     sstables get fixed sequence numbers that were determined at load time.)
    1122              : //  7. Wait for the most recent memtable that overlaps to flush (if any).
    1123              : //  8. Add the ingested sstables to the version (DB.ingestApply).
    1124              : //     8.1.  If an excise span was specified, figure out what sstables in the
    1125              : //     current version overlap with the excise span, and create new virtual
    1126              : //     sstables out of those sstables that exclude the excised span (DB.excise).
    1127              : //  9. Publish the ingestion sequence number.
    1128              : //
    1129              : // Note that if the mutable memtable overlaps with ingestion, a flush of the
    1130              : // memtable is forced, equivalent to DB.Flush. Additionally, subsequent
    1131              : // mutations that get sequence numbers larger than the ingestion sequence
    1132              : // number get queued up behind the ingestion waiting for it to complete. This
    1133              : // can produce a noticeable hiccup in performance. See
    1134              : // https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix
    1135              : // this hiccup.
    1136            1 : func (d *DB) Ingest(ctx context.Context, paths []string) error {
    1137            1 :         if err := d.closed.Load(); err != nil {
    1138            0 :                 panic(err)
    1139              :         }
    1140            1 :         if d.opts.ReadOnly {
    1141            0 :                 return ErrReadOnly
    1142            0 :         }
    1143            1 :         _, err := d.ingest(ctx, ingestArgs{Local: paths})
    1144            1 :         return err
    1145              : }
    1146              : 
    1147              : // IngestOperationStats provides some information about where in the LSM the
    1148              : // bytes were ingested.
    1149              : type IngestOperationStats struct {
    1150              :         // Bytes is the total bytes in the ingested sstables.
    1151              :         Bytes uint64
    1152              :         // ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
    1153              :         // into L0. This value is approximate when flushable ingests are active and
    1154              :         // an ingest overlaps an entry in the flushable queue. Currently, this
    1155              :         // approximation is very rough, only including tables that overlapped the
    1156              :         // memtable. This estimate may be improved with #2112.
    1157              :         ApproxIngestedIntoL0Bytes uint64
    1158              :         // MemtableOverlappingFiles is the count of ingested sstables
    1159              :         // that overlapped keys in the memtables.
    1160              :         MemtableOverlappingFiles int
    1161              : }
    1162              : 
    1163              : // ExternalFile describes an external sstable that can be referenced through
    1164              : // objprovider and ingested as a remote file that will not be refcounted or
    1165              : // cleaned up. For use with online restore. Note that the underlying sstable
    1166              : // could contain keys outside the [Smallest,Largest) bounds; however Pebble
    1167              : // is expected to only read the keys within those bounds.
    1168              : type ExternalFile struct {
    1169              :         // Locator is the remote.Locator that can be used with objProvider to
    1170              :         // resolve a reference to this external sstable.
    1171              :         Locator remote.Locator
    1172              : 
    1173              :         // ObjName is the unique name of this sstable on Locator.
    1174              :         ObjName string
    1175              : 
    1176              :         // Size of the referenced portion of the virtualized sstable. An estimate
    1177              :         // is acceptable in lieu of the backing file size.
    1178              :         Size uint64
    1179              : 
    1180              :         // StartKey and EndKey define the bounds of the sstable; the ingestion
    1181              :         // of this file will only result in keys within [StartKey, EndKey) if
    1182              :         // EndKeyIsInclusive is false or [StartKey, EndKey] if it is true.
    1183              :         // These bounds are loose i.e. it's possible for keys to not span the
    1184              :         // entirety of this range.
    1185              :         //
    1186              :         // StartKey and EndKey user keys must not have suffixes.
    1187              :         //
    1188              :         // Multiple ExternalFiles in one ingestion must all have non-overlapping
    1189              :         // bounds.
    1190              :         StartKey, EndKey []byte
    1191              : 
    1192              :         // EndKeyIsInclusive is true if EndKey should be treated as inclusive.
    1193              :         EndKeyIsInclusive bool
    1194              : 
    1195              :         // HasPointKey and HasRangeKey denote whether this file contains point keys
    1196              :         // or range keys. If both fields are false, an error is returned during
    1197              :         // ingestion.
    1198              :         HasPointKey, HasRangeKey bool
    1199              : 
    1200              :         // SyntheticPrefix will prepend this prefix to all keys in the file during
    1201              :         // iteration. Note that the backing file itself is not modified.
    1202              :         //
    1203              :         // SyntheticPrefix must be a prefix of both StartKey and EndKey.
    1204              :         SyntheticPrefix []byte
    1205              : 
    1206              :         // SyntheticSuffix will replace the suffix of every key in the file during
    1207              :         // iteration. Note that the file itself is not modified, rather, every key
    1208              :         // returned by an iterator will have the synthetic suffix.
    1209              :         //
    1210              :         // SyntheticSuffix can only be used under the following conditions:
    1211              :         //  - the synthetic suffix must sort before any non-empty suffixes in the
    1212              :         //    backing sst (the entire sst, not just the part restricted to Bounds).
    1213              :         //  - the backing sst must not contain multiple keys with the same prefix.
    1214              :         SyntheticSuffix []byte
    1215              : 
    1216              :         // Level denotes the level at which this file was present at read time
    1217              :         // if the external file was returned by a scan of an existing Pebble
    1218              :         // instance. If Level is 0, this field is ignored.
    1219              :         Level uint8
    1220              : }
    1221              : 
    1222              : // IngestWithStats does the same as Ingest, and additionally returns
    1223              : // IngestOperationStats.
    1224            0 : func (d *DB) IngestWithStats(ctx context.Context, paths []string) (IngestOperationStats, error) {
    1225            0 :         if err := d.closed.Load(); err != nil {
    1226            0 :                 panic(err)
    1227              :         }
    1228            0 :         if d.opts.ReadOnly {
    1229            0 :                 return IngestOperationStats{}, ErrReadOnly
    1230            0 :         }
    1231            0 :         return d.ingest(ctx, ingestArgs{Local: paths})
    1232              : }
    1233              : 
    1234              : // IngestExternalFiles does the same as IngestWithStats, and additionally
    1235              : // accepts external files (with locator info that can be resolved using
    1236              : // d.opts.Experimental.RemoteStorage). These files must also be non-overlapping with
    1237              : // each other, and must be resolvable through d.objProvider.
    1238              : func (d *DB) IngestExternalFiles(
    1239              :         ctx context.Context, external []ExternalFile,
    1240            1 : ) (IngestOperationStats, error) {
    1241            1 :         if err := d.closed.Load(); err != nil {
    1242            0 :                 panic(err)
    1243              :         }
    1244              : 
    1245            1 :         if d.opts.ReadOnly {
    1246            0 :                 return IngestOperationStats{}, ErrReadOnly
    1247            0 :         }
    1248            1 :         if d.opts.Experimental.RemoteStorage == nil {
    1249            0 :                 return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured")
    1250            0 :         }
    1251            1 :         return d.ingest(ctx, ingestArgs{External: external})
    1252              : }
    1253              : 
    1254              : // IngestAndExcise does the same as IngestWithStats, and additionally accepts a
    1255              : // list of shared files to ingest that can be read from a remote.Storage through
    1256              : // a Provider. All the shared files must live within exciseSpan, and any existing
    1257              : // keys in exciseSpan are deleted by turning existing sstables into virtual
    1258              : // sstables (if not virtual already) and shrinking their spans to exclude
    1259              : // exciseSpan. See the comment at Ingest for a more complete picture of the
    1260              : // ingestion process.
    1261              : //
    1262              : // Panics if this DB instance was not instantiated with a remote.Storage and
    1263              : // shared sstables are present.
    1264              : func (d *DB) IngestAndExcise(
    1265              :         ctx context.Context,
    1266              :         paths []string,
    1267              :         shared []SharedSSTMeta,
    1268              :         external []ExternalFile,
    1269              :         exciseSpan KeyRange,
    1270            1 : ) (IngestOperationStats, error) {
    1271            1 :         if err := d.closed.Load(); err != nil {
    1272            0 :                 panic(err)
    1273              :         }
    1274            1 :         if d.opts.ReadOnly {
    1275            0 :                 return IngestOperationStats{}, ErrReadOnly
    1276            0 :         }
    1277              :         // Excise is only supported on prefix keys.
    1278            1 :         if d.opts.Comparer.Split(exciseSpan.Start) != len(exciseSpan.Start) {
    1279            0 :                 return IngestOperationStats{}, errors.New("IngestAndExcise called with suffixed start key")
    1280            0 :         }
    1281            1 :         if d.opts.Comparer.Split(exciseSpan.End) != len(exciseSpan.End) {
    1282            0 :                 return IngestOperationStats{}, errors.New("IngestAndExcise called with suffixed end key")
    1283            0 :         }
    1284            1 :         if v := d.FormatMajorVersion(); v < FormatMinForSharedObjects {
    1285            0 :                 return IngestOperationStats{}, errors.Newf(
    1286            0 :                         "store has format major version %d; IngestAndExcise requires at least %d",
    1287            0 :                         v, FormatMinForSharedObjects,
    1288            0 :                 )
    1289            0 :         }
    1290            1 :         args := ingestArgs{
    1291            1 :                 Local:              paths,
    1292            1 :                 Shared:             shared,
    1293            1 :                 External:           external,
    1294            1 :                 ExciseSpan:         exciseSpan,
    1295            1 :                 ExciseBoundsPolicy: tightExciseBounds,
    1296            1 :         }
    1297            1 :         return d.ingest(ctx, args)
    1298              : }
    1299              : 
    1300              : // Both DB.mu and commitPipeline.mu must be held while this is called.
    1301              : func (d *DB) newIngestedFlushableEntry(
    1302              :         meta []*manifest.TableMetadata, seqNum base.SeqNum, logNum base.DiskFileNum, exciseSpan KeyRange,
    1303            1 : ) (*flushableEntry, error) {
    1304            1 :         // If there's an excise being done atomically with the same ingest, we
    1305            1 :         // assign the lowest sequence number in the set of sequence numbers for this
    1306            1 :         // ingestion to the excise. Note that we've already allocated fileCount+1
    1307            1 :         // sequence numbers in this case.
    1308            1 :         //
    1309            1 :         // This mimics the behaviour in the non-flushable ingest case (see the callsite
    1310            1 :         // for ingestUpdateSeqNum).
    1311            1 :         fileSeqNumStart := seqNum
    1312            1 :         if exciseSpan.Valid() {
    1313            1 :                 fileSeqNumStart = seqNum + 1 // the first seqNum is reserved for the excise.
    1314            1 :                 // The excise span will be retained by the flushable, outliving the
    1315            1 :                 // caller's ingestion call. Copy it.
    1316            1 :                 exciseSpan = KeyRange{
    1317            1 :                         Start: slices.Clone(exciseSpan.Start),
    1318            1 :                         End:   slices.Clone(exciseSpan.End),
    1319            1 :                 }
    1320            1 :         }
    1321              :         // Update the sequence number for all of the sstables in the
    1322              :         // metadata. Writing the metadata to the manifest when the
    1323              :         // version edit is applied is the mechanism that persists the
    1324              :         // sequence number. The sstables themselves are left unmodified.
    1325              :         // In this case, a version edit will only be written to the manifest
    1326              :         // when the flushable is eventually flushed. If Pebble restarts in that
    1327              :         // time, then we'll lose the ingest sequence number information. But this
    1328              :         // information will also be reconstructed on node restart.
    1329            1 :         for i, m := range meta {
    1330            1 :                 if err := setSeqNumInMetadata(m, fileSeqNumStart+base.SeqNum(i), d.cmp, d.opts.Comparer.FormatKey); err != nil {
    1331            0 :                         return nil, err
    1332            0 :                 }
    1333              :         }
    1334              : 
    1335            1 :         f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter, exciseSpan, seqNum)
    1336            1 : 
    1337            1 :         // NB: The logNum/seqNum are the WAL number which we're writing this entry
    1338            1 :         // to and the sequence number within the WAL which we'll write this entry
    1339            1 :         // to.
    1340            1 :         entry := d.newFlushableEntry(f, logNum, seqNum)
    1341            1 :         // The flushable entry starts off with a single reader ref, so increment
    1342            1 :         // the TableMetadata.Refs.
    1343            1 :         for _, file := range f.files {
    1344            1 :                 file.Ref()
    1345            1 :         }
    1346            1 :         entry.unrefFiles = func(of *manifest.ObsoleteFiles) {
    1347            1 :                 // Invoke Unref on each table. If any files become obsolete, they'll be
    1348            1 :                 // added to the set of obsolete files.
    1349            1 :                 for _, file := range f.files {
    1350            1 :                         file.Unref(of)
    1351            1 :                 }
    1352              :         }
    1353              : 
    1354            1 :         entry.flushForced = true
    1355            1 :         entry.releaseMemAccounting = func() {}
    1356            1 :         return entry, nil
    1357              : }
    1358              : 
    1359              : // Both DB.mu and commitPipeline.mu must be held while this is called. Since
    1360              : // we're holding both locks, the order in which we rotate the memtable or
    1361              : // recycle the WAL in this function is irrelevant as long as the correct log
    1362              : // numbers are assigned to the appropriate flushable.
    1363              : func (d *DB) handleIngestAsFlushable(
    1364              :         meta []*manifest.TableMetadata, seqNum base.SeqNum, exciseSpan KeyRange,
    1365            1 : ) error {
    1366            1 :         b := d.NewBatch()
    1367            1 :         if exciseSpan.Valid() {
    1368            1 :                 b.excise(exciseSpan.Start, exciseSpan.End)
    1369            1 :         }
    1370            1 :         for _, m := range meta {
    1371            1 :                 b.ingestSST(m.TableNum)
    1372            1 :         }
    1373            1 :         b.setSeqNum(seqNum)
    1374            1 : 
    1375            1 :         // If the WAL is disabled, then the logNum used to create the flushable
    1376            1 :         // entry doesn't matter. We just use the logNum assigned to the current
    1377            1 :         // mutable memtable. If the WAL is enabled, then this logNum will be
    1378            1 :         // overwritten by the logNum of the log which will contain the log entry
    1379            1 :         // for the ingestedFlushable.
    1380            1 :         logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
    1381            1 :         if !d.opts.DisableWAL {
    1382            1 :                 // We create a new WAL for the flushable instead of reusing the end of
    1383            1 :                 // the previous WAL. This simplifies the increment of the minimum
    1384            1 :                 // unflushed log number, and also simplifies WAL replay.
    1385            1 :                 var prevLogSize uint64
    1386            1 :                 logNum, prevLogSize = d.rotateWAL()
    1387            1 :                 // As the rotator of the WAL, we're responsible for updating the
    1388            1 :                 // previous flushable queue tail's log size.
    1389            1 :                 d.mu.mem.queue[len(d.mu.mem.queue)-1].logSize = prevLogSize
    1390            1 : 
    1391            1 :                 d.mu.Unlock()
    1392            1 :                 err := d.commit.directWrite(b)
    1393            1 :                 if err != nil {
    1394            0 :                         d.opts.Logger.Fatalf("%v", err)
    1395            0 :                 }
    1396            1 :                 d.mu.Lock()
    1397              :         }
    1398              : 
    1399            1 :         entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum, exciseSpan)
    1400            1 :         if err != nil {
    1401            0 :                 return err
    1402            0 :         }
    1403            1 :         nextSeqNum := seqNum + base.SeqNum(b.Count())
    1404            1 : 
    1405            1 :         // Set newLogNum to the logNum of the previous flushable. This value is
    1406            1 :         // irrelevant if the WAL is disabled. If the WAL is enabled, then we set
    1407            1 :         // the appropriate value below.
    1408            1 :         newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
    1409            1 :         if !d.opts.DisableWAL {
    1410            1 :                 // newLogNum will be the WAL num of the next mutable memtable which
    1411            1 :                 // comes after the ingestedFlushable in the flushable queue. The mutable
    1412            1 :                 // memtable will be created below.
    1413            1 :                 //
    1414            1 :                 // The prevLogSize returned by rotateWAL is the WAL to which the
    1415            1 :                 // flushable ingest keys were appended. This intermediary WAL is only
    1416            1 :                 // used to record the flushable ingest and nothing else.
    1417            1 :                 newLogNum, entry.logSize = d.rotateWAL()
    1418            1 :         }
    1419              : 
    1420            1 :         d.mu.versions.metrics.Ingest.Count++
    1421            1 :         currMem := d.mu.mem.mutable
    1422            1 :         // NB: Placing ingested sstables above the current memtables
    1423            1 :         // requires rotating of the existing memtables/WAL. There is
    1424            1 :         // some concern of churning through tiny memtables due to
    1425            1 :         // ingested sstables being placed on top of them, but those
    1426            1 :         // memtables would have to be flushed anyways.
    1427            1 :         d.mu.mem.queue = append(d.mu.mem.queue, entry)
    1428            1 :         d.rotateMemtable(newLogNum, nextSeqNum, currMem, 0 /* minSize */)
    1429            1 :         d.updateReadStateLocked(d.opts.DebugCheck)
    1430            1 :         // TODO(aaditya): is this necessary? we call this already in rotateMemtable above
    1431            1 :         d.maybeScheduleFlush()
    1432            1 :         return nil
    1433              : }
    1434              : 
    1435              : type ingestArgs struct {
    1436              :         // Local sstables to ingest.
    1437              :         Local []string
    1438              :         // Shared sstables to ingest.
    1439              :         Shared []SharedSSTMeta
    1440              :         // External sstables to ingest.
    1441              :         External []ExternalFile
    1442              :         // ExciseSpan (unset if not excising).
    1443              :         ExciseSpan         KeyRange
    1444              :         ExciseBoundsPolicy exciseBoundsPolicy
    1445              : }
    1446              : 
    1447              : // See comment at Ingest() for details on how this works.
    1448            1 : func (d *DB) ingest(ctx context.Context, args ingestArgs) (IngestOperationStats, error) {
    1449            1 :         paths := args.Local
    1450            1 :         shared := args.Shared
    1451            1 :         external := args.External
    1452            1 :         if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil {
    1453            0 :                 panic("cannot ingest shared sstables with nil SharedStorage")
    1454              :         }
    1455            1 :         if (args.ExciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables {
    1456            0 :                 return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion")
    1457            0 :         }
    1458            1 :         if len(external) > 0 && d.FormatMajorVersion() < FormatSyntheticPrefixSuffix {
    1459            0 :                 for i := range external {
    1460            0 :                         if len(external[i].SyntheticPrefix) > 0 {
    1461            0 :                                 return IngestOperationStats{}, errors.New("pebble: format major version too old for synthetic prefix ingestion")
    1462            0 :                         }
    1463            0 :                         if len(external[i].SyntheticSuffix) > 0 {
    1464            0 :                                 return IngestOperationStats{}, errors.New("pebble: format major version too old for synthetic suffix ingestion")
    1465            0 :                         }
    1466              :                 }
    1467              :         }
    1468              :         // Allocate table numbers for all files being ingested and mark them as
    1469              :         // pending in order to prevent them from being deleted. Note that this causes
    1470              :         // the file number ordering to be out of alignment with sequence number
    1471              :         // ordering. The sorting of L0 tables by sequence number avoids relying on
    1472              :         // that (busted) invariant.
    1473            1 :         pendingOutputs := make([]base.TableNum, len(paths)+len(shared)+len(external))
    1474            1 :         for i := 0; i < len(paths)+len(shared)+len(external); i++ {
    1475            1 :                 pendingOutputs[i] = d.mu.versions.getNextTableNum()
    1476            1 :         }
    1477              : 
    1478            1 :         jobID := d.newJobID()
    1479            1 : 
    1480            1 :         // Load the metadata for all the files being ingested. This step detects
    1481            1 :         // and elides empty sstables.
    1482            1 :         loadResult, err := ingestLoad(ctx, d.opts, d.FormatMajorVersion(), paths, shared, external,
    1483            1 :                 d.cacheHandle, pendingOutputs)
    1484            1 :         if err != nil {
    1485            0 :                 return IngestOperationStats{}, err
    1486            0 :         }
    1487              : 
    1488            1 :         if loadResult.fileCount() == 0 && !args.ExciseSpan.Valid() {
    1489            1 :                 // All of the sstables to be ingested were empty. Nothing to do.
    1490            1 :                 return IngestOperationStats{}, nil
    1491            1 :         }
    1492              : 
    1493              :         // Verify the sstables do not overlap.
    1494            1 :         if err := ingestSortAndVerify(d.cmp, loadResult, args.ExciseSpan); err != nil {
    1495            1 :                 return IngestOperationStats{}, err
    1496            1 :         }
    1497              : 
    1498              :         // Hard link the sstables into the DB directory. Since the sstables aren't
    1499              :         // referenced by a version, they won't be used. If the hard linking fails
    1500              :         // (e.g. because the files reside on a different filesystem), ingestLinkLocal
    1501              :         // will fall back to copying, and if that fails we undo our work and return an
    1502              :         // error.
    1503            1 :         if err := ingestLinkLocal(ctx, jobID, d.opts, d.objProvider, loadResult.local); err != nil {
    1504            0 :                 return IngestOperationStats{}, err
    1505            0 :         }
    1506              : 
    1507            1 :         err = d.ingestAttachRemote(jobID, loadResult)
    1508            1 :         defer d.ingestUnprotectExternalBackings(loadResult)
    1509            1 :         if err != nil {
    1510            0 :                 return IngestOperationStats{}, err
    1511            0 :         }
    1512              : 
    1513              :         // Make the new tables durable. We need to do this at some point before we
    1514              :         // update the MANIFEST (via UpdateVersionLocked), otherwise a crash can have
    1515              :         // the tables referenced in the MANIFEST, but not present in the provider.
    1516            1 :         if err := d.objProvider.Sync(); err != nil {
    1517            0 :                 return IngestOperationStats{}, err
    1518            0 :         }
    1519              : 
    1520              :         // metaFlushableOverlaps is a map indicating which of the ingested sstables
    1521              :         // overlap some table in the flushable queue. It's used to approximate
    1522              :         // ingest-into-L0 stats when using flushable ingests.
    1523            1 :         metaFlushableOverlaps := make(map[base.TableNum]bool, loadResult.fileCount())
    1524            1 :         var mem *flushableEntry
    1525            1 :         var mut *memTable
    1526            1 :         // asFlushable indicates whether the sstable was ingested as a flushable.
    1527            1 :         var asFlushable bool
    1528            1 :         var waitFlushStart crtime.Mono
    1529            1 :         prepare := func(seqNum base.SeqNum) {
    1530            1 :                 // Note that d.commit.mu is held by commitPipeline when calling prepare.
    1531            1 : 
    1532            1 :                 // Determine the set of bounds we care about for the purpose of checking
    1533            1 :                 // for overlap among the flushables. If there's an excise span, we need
    1534            1 :                 // to check for overlap with its bounds as well.
    1535            1 :                 overlapBounds := make([]bounded, 0, loadResult.fileCount()+1)
    1536            1 :                 for _, m := range loadResult.local {
    1537            1 :                         overlapBounds = append(overlapBounds, m.TableMetadata)
    1538            1 :                 }
    1539            1 :                 for _, m := range loadResult.shared {
    1540            1 :                         overlapBounds = append(overlapBounds, m.TableMetadata)
    1541            1 :                 }
    1542            1 :                 for _, m := range loadResult.external {
    1543            1 :                         overlapBounds = append(overlapBounds, m.TableMetadata)
    1544            1 :                 }
    1545            1 :                 if args.ExciseSpan.Valid() {
    1546            1 :                         overlapBounds = append(overlapBounds, &args.ExciseSpan)
    1547            1 :                 }
    1548              : 
    1549            1 :                 d.mu.Lock()
    1550            1 :                 defer d.mu.Unlock()
    1551            1 : 
    1552            1 :                 if args.ExciseSpan.Valid() {
    1553            1 :                         // Check if any of the currently-open EventuallyFileOnlySnapshots
    1554            1 :                         // overlap in key ranges with the excise span. If so, we need to
    1555            1 :                         // check for memtable overlaps with all bounds of that
    1556            1 :                         // EventuallyFileOnlySnapshot in addition to the ingestion's own
    1557            1 :                         // bounds too.
    1558            1 :                         overlapBounds = append(overlapBounds, exciseOverlapBounds(
    1559            1 :                                 d.cmp, &d.mu.snapshots.snapshotList, args.ExciseSpan, seqNum)...)
    1560            1 :                 }
    1561              : 
    1562              :                 // Check to see if any files overlap with any of the memtables. The queue
    1563              :                 // is ordered from oldest to newest with the mutable memtable being the
    1564              :                 // last element in the slice. We want to wait for the newest table that
    1565              :                 // overlaps.
    1566              : 
    1567            1 :                 for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
    1568            1 :                         m := d.mu.mem.queue[i]
    1569            1 :                         m.computePossibleOverlaps(func(b bounded) shouldContinue {
    1570            1 :                                 // If this is the first table to overlap a flushable, save
    1571            1 :                                 // the flushable. This ingest must be ingested or flushed
    1572            1 :                                 // after it.
    1573            1 :                                 if mem == nil {
    1574            1 :                                         mem = m
    1575            1 :                                 }
    1576              : 
    1577            1 :                                 switch v := b.(type) {
    1578            1 :                                 case *manifest.TableMetadata:
    1579            1 :                                         // NB: False positives are possible if `m` is a flushable
    1580            1 :                                         // ingest that overlaps the file `v` in bounds but doesn't
    1581            1 :                                         // contain overlapping data. This is considered acceptable
    1582            1 :                                         // because it's rare (in CockroachDB a bound overlap likely
    1583            1 :                                         // indicates a data overlap), and blocking the commit
    1584            1 :                                         // pipeline while we perform I/O to check for overlap may be
    1585            1 :                                         // more disruptive than enqueueing this ingestion on the
    1586            1 :                                         // flushable queue and switching to a new memtable.
    1587            1 :                                         metaFlushableOverlaps[v.TableNum] = true
    1588            1 :                                 case *KeyRange:
    1589              :                                         // An excise span or an EventuallyFileOnlySnapshot protected range;
    1590              :                                         // not a file.
    1591            0 :                                 default:
    1592            0 :                                         panic("unreachable")
    1593              :                                 }
    1594            1 :                                 return continueIteration
    1595              :                         }, overlapBounds...)
    1596              :                 }
    1597              : 
    1598            1 :                 if mem == nil {
    1599            1 :                         // No overlap with any of the queued flushables, so no need to queue
    1600            1 :                         // after them.
    1601            1 : 
    1602            1 :                         // New writes with higher sequence numbers may be concurrently
    1603            1 :                         // committed. We must ensure they don't flush before this ingest
    1604            1 :                         // completes. To do that, we ref the mutable memtable as a writer,
    1605            1 :                         // preventing its flushing (and the flushing of all subsequent
    1606            1 :                         // flushables in the queue). Once we've acquired the manifest lock
    1607            1 :                         // to add the ingested sstables to the LSM, we can unref as we're
    1608            1 :                         // guaranteed that the flush won't edit the LSM before this ingest.
    1609            1 :                         mut = d.mu.mem.mutable
    1610            1 :                         mut.writerRef()
    1611            1 :                         return
    1612            1 :                 }
    1613              : 
    1614              :                 // The ingestion overlaps with some entry in the flushable queue. If the
    1615              :                 // pre-conditions are met below, we can treat this ingestion as a flushable
    1616              :                 // ingest, otherwise we wait on the memtable flush before ingestion.
    1617              :                 //
    1618              :                 // TODO(aaditya): We should make flushableIngest compatible with remote
    1619              :                 // files.
    1620            1 :                 hasRemoteFiles := len(shared) > 0 || len(external) > 0
    1621            1 :                 canIngestFlushable := d.FormatMajorVersion() >= FormatFlushableIngest &&
    1622            1 :                         // We require that either the queue of flushables is below the
    1623            1 :                         // stop-writes threshold (note that this is typically a conservative
    1624            1 :                         // check, since not every element of this queue will contribute the full
    1625            1 :                         // memtable memory size that could result in a write stall), or WAL
    1626            1 :                         // failover is permitting an unlimited queue without causing a write
    1627            1 :                         // stall. The latter condition is important to avoid delays in
    1628            1 :                         // visibility of concurrent writes that happen to get a sequence number
    1629            1 :                         // after this ingest and then must wait for this ingest that is itself
    1630            1 :                         // waiting on a large flush. See
    1631            1 :                         // https://github.com/cockroachdb/pebble/issues/4944 for an illustration
    1632            1 :                         // of this problem.
    1633            1 :                         (len(d.mu.mem.queue) < d.opts.MemTableStopWritesThreshold ||
    1634            1 :                                 d.mu.log.manager.ElevateWriteStallThresholdForFailover()) &&
    1635            1 :                         !d.opts.Experimental.DisableIngestAsFlushable() && !hasRemoteFiles &&
    1636            1 :                         (!args.ExciseSpan.Valid() || d.FormatMajorVersion() >= FormatFlushableIngestExcises)
    1637            1 :                 if !canIngestFlushable {
    1638            1 :                         // We're not able to ingest as a flushable,
    1639            1 :                         // so we must synchronously flush.
    1640            1 :                         //
    1641            1 :                         // TODO(bilal): Currently, if any of the files being ingested are shared,
    1642            1 :                         // we cannot use flushable ingests and need
    1643            1 :                         // to wait synchronously.
    1644            1 :                         if mem.flushable == d.mu.mem.mutable {
    1645            1 :                                 err = d.makeRoomForWrite(nil)
    1646            1 :                         }
    1647              :                         // New writes with higher sequence numbers may be concurrently
    1648              :                         // committed. We must ensure they don't flush before this ingest
    1649              :                         // completes. To do that, we ref the mutable memtable as a writer,
    1650              :                         // preventing its flushing (and the flushing of all subsequent
    1651              :                         // flushables in the queue). Once we've acquired the manifest lock
    1652              :                         // to add the ingested sstables to the LSM, we can unref as we're
    1653              :                         // guaranteed that the flush won't edit the LSM before this ingest.
    1654            1 :                         mut = d.mu.mem.mutable
    1655            1 :                         mut.writerRef()
    1656            1 :                         mem.flushForced = true
    1657            1 :                         waitFlushStart = crtime.NowMono()
    1658            1 :                         d.maybeScheduleFlush()
    1659            1 :                         return
    1660              :                 }
    1661              :                 // Since there aren't too many memtables already queued up, we can
    1662              :                 // slide the ingested sstables on top of the existing memtables.
    1663            1 :                 asFlushable = true
    1664            1 :                 fileMetas := make([]*manifest.TableMetadata, len(loadResult.local))
    1665            1 :                 for i := range fileMetas {
    1666            1 :                         fileMetas[i] = loadResult.local[i].TableMetadata
    1667            1 :                 }
    1668            1 :                 err = d.handleIngestAsFlushable(fileMetas, seqNum, args.ExciseSpan)
    1669              :         }
    1670              : 
    1671            1 :         var ve *manifest.VersionEdit
    1672            1 :         var waitFlushDuration time.Duration
    1673            1 :         var manifestUpdateDuration time.Duration
    1674            1 :         apply := func(seqNum base.SeqNum) {
    1675            1 :                 if err != nil || asFlushable {
    1676            1 :                         // An error occurred during prepare.
    1677            1 :                         if mut != nil {
    1678            0 :                                 if mut.writerUnref() {
    1679            0 :                                         d.mu.Lock()
    1680            0 :                                         d.maybeScheduleFlush()
    1681            0 :                                         d.mu.Unlock()
    1682            0 :                                 }
    1683              :                         }
    1684            1 :                         return
    1685              :                 }
    1686              : 
    1687              :                 // If there's an excise being done atomically with the same ingest, we
    1688              :                 // assign the lowest sequence number in the set of sequence numbers for this
    1689              :                 // ingestion to the excise. Note that we've already allocated fileCount+1
    1690              :                 // sequence numbers in this case.
    1691            1 :                 if args.ExciseSpan.Valid() {
    1692            1 :                         seqNum++ // the first seqNum is reserved for the excise.
    1693            1 :                 }
    1694              :                 // Update the sequence numbers for all ingested sstables'
    1695              :                 // metadata. When the version edit is applied, the metadata is
    1696              :                 // written to the manifest, persisting the sequence number.
    1697              :                 // The sstables themselves are left unmodified.
    1698            1 :                 if err = ingestUpdateSeqNum(
    1699            1 :                         d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult,
    1700            1 :                 ); err != nil {
    1701            0 :                         if mut != nil {
    1702            0 :                                 if mut.writerUnref() {
    1703            0 :                                         d.mu.Lock()
    1704            0 :                                         d.maybeScheduleFlush()
    1705            0 :                                         d.mu.Unlock()
    1706            0 :                                 }
    1707              :                         }
    1708            0 :                         return
    1709              :                 }
    1710              : 
    1711              :                 // If we overlapped with a memtable in prepare wait for the flush to
    1712              :                 // finish.
    1713            1 :                 if mem != nil {
    1714            1 :                         <-mem.flushed
    1715            1 :                         waitFlushDuration = waitFlushStart.Elapsed()
    1716            1 :                 }
    1717              : 
    1718            1 :                 if d.opts.private.testingBeforeIngestApplyFunc != nil {
    1719            0 :                         d.opts.private.testingBeforeIngestApplyFunc()
    1720            0 :                 }
    1721              :                 // Assign the sstables to the correct level in the LSM and apply the
    1722              :                 // version edit.
    1723            1 :                 ve, manifestUpdateDuration, err = d.ingestApply(ctx, jobID, loadResult, mut, args.ExciseSpan, args.ExciseBoundsPolicy, seqNum)
    1724              :         }
    1725              : 
    1726              :         // Only one ingest can occur at a time because if not, one would block waiting
    1727              :         // for the other to finish applying. This blocking would happen while holding
    1728              :         // the commit mutex which would prevent unrelated batches from writing their
    1729              :         // changes to the WAL and memtable. This will cause a bigger commit hiccup
    1730              :         // during ingestion.
    1731            1 :         seqNumCount := loadResult.fileCount()
    1732            1 :         if args.ExciseSpan.Valid() {
    1733            1 :                 seqNumCount++
    1734            1 :         }
    1735            1 :         d.commit.ingestSem <- struct{}{}
    1736            1 :         d.commit.AllocateSeqNum(seqNumCount, prepare, apply)
    1737            1 :         <-d.commit.ingestSem
    1738            1 : 
    1739            1 :         if err != nil {
    1740            0 :                 if err2 := ingestCleanup(d.objProvider, loadResult.local); err2 != nil {
    1741            0 :                         d.opts.Logger.Errorf("ingest cleanup failed: %v", err2)
    1742            0 :                 }
    1743            1 :         } else {
    1744            1 :                 // Since we either created a hard link to the ingesting files, or copied
    1745            1 :                 // them over, it is safe to remove the originals paths.
    1746            1 :                 for i := range loadResult.local {
    1747            1 :                         path := loadResult.local[i].path
    1748            1 :                         if err2 := d.opts.FS.Remove(path); err2 != nil {
    1749            0 :                                 d.opts.Logger.Errorf("ingest failed to remove original file: %s", err2)
    1750            0 :                         }
    1751              :                 }
    1752              :         }
    1753              : 
    1754              :         // TODO(jackson): Refactor this so that the case where there are no files
    1755              :         // but a valid excise span is not so exceptional.
    1756              : 
    1757            1 :         var stats IngestOperationStats
    1758            1 :         if loadResult.fileCount() > 0 {
    1759            1 :                 info := TableIngestInfo{
    1760            1 :                         JobID:                  int(jobID),
    1761            1 :                         Err:                    err,
    1762            1 :                         flushable:              asFlushable,
    1763            1 :                         WaitFlushDuration:      waitFlushDuration,
    1764            1 :                         ManifestUpdateDuration: manifestUpdateDuration,
    1765            1 :                         BlockReadDuration:      loadResult.blockReadStats.BlockReadDuration,
    1766            1 :                         BlockReadBytes:         loadResult.blockReadStats.BlockBytes - loadResult.blockReadStats.BlockBytesInCache,
    1767            1 :                 }
    1768            1 :                 if len(loadResult.local) > 0 {
    1769            1 :                         info.GlobalSeqNum = loadResult.local[0].SmallestSeqNum
    1770            1 :                 } else if len(loadResult.shared) > 0 {
    1771            1 :                         info.GlobalSeqNum = loadResult.shared[0].SmallestSeqNum
    1772            1 :                 } else {
    1773            1 :                         info.GlobalSeqNum = loadResult.external[0].SmallestSeqNum
    1774            1 :                 }
    1775            1 :                 if ve != nil {
    1776            1 :                         info.Tables = make([]struct {
    1777            1 :                                 TableInfo
    1778            1 :                                 Level int
    1779            1 :                         }, len(ve.NewTables))
    1780            1 :                         for i := range ve.NewTables {
    1781            1 :                                 e := &ve.NewTables[i]
    1782            1 :                                 info.Tables[i].Level = e.Level
    1783            1 :                                 info.Tables[i].TableInfo = e.Meta.TableInfo()
    1784            1 :                                 stats.Bytes += e.Meta.Size
    1785            1 :                                 if e.Level == 0 {
    1786            1 :                                         stats.ApproxIngestedIntoL0Bytes += e.Meta.Size
    1787            1 :                                 }
    1788            1 :                                 if metaFlushableOverlaps[e.Meta.TableNum] {
    1789            1 :                                         stats.MemtableOverlappingFiles++
    1790            1 :                                 }
    1791              :                         }
    1792            1 :                 } else if asFlushable {
    1793            1 :                         // NB: If asFlushable == true, there are no shared sstables.
    1794            1 :                         info.Tables = make([]struct {
    1795            1 :                                 TableInfo
    1796            1 :                                 Level int
    1797            1 :                         }, len(loadResult.local))
    1798            1 :                         for i, f := range loadResult.local {
    1799            1 :                                 info.Tables[i].Level = -1
    1800            1 :                                 info.Tables[i].TableInfo = f.TableInfo()
    1801            1 :                                 stats.Bytes += f.Size
    1802            1 :                                 // We don't have exact stats on which files will be ingested into
    1803            1 :                                 // L0, because actual ingestion into the LSM has been deferred until
    1804            1 :                                 // flush time. Instead, we infer based on memtable overlap.
    1805            1 :                                 //
    1806            1 :                                 // TODO(jackson): If we optimistically compute data overlap (#2112)
    1807            1 :                                 // before entering the commit pipeline, we can use that overlap to
    1808            1 :                                 // improve our approximation by incorporating overlap with L0, not
    1809            1 :                                 // just memtables.
    1810            1 :                                 if metaFlushableOverlaps[f.TableNum] {
    1811            1 :                                         stats.ApproxIngestedIntoL0Bytes += f.Size
    1812            1 :                                         stats.MemtableOverlappingFiles++
    1813            1 :                                 }
    1814              :                         }
    1815              :                 }
    1816            1 :                 d.opts.EventListener.TableIngested(info)
    1817              :         }
    1818              : 
    1819            1 :         return stats, err
    1820              : }
    1821              : 
    1822              : type ingestSplitFile struct {
    1823              :         // ingestFile is the metadata of the table being ingested.
    1824              :         ingestFile *manifest.TableMetadata
    1825              :         // splitFile is the existing table that needs to be split to allow ingestFile
    1826              :         // to slot into level `level`.
    1827              :         splitFile *manifest.TableMetadata
    1828              :         // level is the level where ingestFile will go (and where splitFile already is).
    1829              :         level int
    1830              : }
    1831              : 
    1832              : // ingestSplit splits the existing tables named in `files` and updates ve
    1833              : // in-place to account for each one being replaced by the virtual sstables that
    1834              : // remain on either side of the ingested table. `replacedTables` maps tables
    1835              : // already replaced in this version edit to their replacements; it is used to
    1836              : // resolve stale split targets and is updated in-place. updateMetrics is called
    1837              : // once per split with the table that was split, its level and its replacements.
    1838              : // d.mu as well as the manifest lock must be held when calling this method.
    1839              : func (d *DB) ingestSplit(
    1840              :         ctx context.Context,
    1841              :         ve *manifest.VersionEdit,
    1842              :         updateMetrics func(*manifest.TableMetadata, int, []manifest.NewTableEntry),
    1843              :         files []ingestSplitFile,
    1844              :         replacedTables map[base.TableNum][]manifest.NewTableEntry,
    1845            1 : ) error {
    1846            1 :         for _, s := range files {
    1847            1 :                 ingestFileBounds := s.ingestFile.UserKeyBounds()
    1848            1 :                 // replacedTables can be thought of as a tree, where we start iterating with
    1849            1 :                 // s.splitFile and run its TableNum through replacedTables, then find which of
    1850            1 :                 // the replacements overlaps with s.ingestFile, which becomes the new
    1851            1 :                 // splitFile, then we check splitFile's replacements in replacedTables again
    1852            1 :                 // for overlap with s.ingestFile, and so on until we either can't find the
    1853            1 :                 // current splitFile in replacedTables (i.e. that's the file that now needs to
    1854            1 :                 // be split), or we don't find a file that overlaps with s.ingestFile, which
    1855            1 :                 // means a prior ingest split already produced enough room for s.ingestFile
    1856            1 :                 // to go into this level without necessitating another ingest split.
    1857            1 :                 splitFile := s.splitFile
    1858            1 :                 for splitFile != nil {
    1859            1 :                         replaced, ok := replacedTables[splitFile.TableNum]
    1860            1 :                         if !ok {
    1861            1 :                                 break
    1862              :                         }
    1863            1 :                         updatedSplitFile := false
    1864            1 :                         for i := range replaced {
    1865            1 :                                 if replaced[i].Meta.Overlaps(d.cmp, &ingestFileBounds) {
    1866            1 :                                         if updatedSplitFile {
    1867            0 :                                                 // This should never happen because the earlier ingestTargetLevel
    1868            0 :                                                 // function only finds split file candidates that are guaranteed to
    1869            0 :                                                 // have no data overlap, only boundary overlap. See the comments
    1870            0 :                                                 // in that method to see the definitions of data vs boundary
    1871            0 :                                                 // overlap. That, plus the fact that files in `replaced` are
    1872            0 :                                                 // guaranteed to have file bounds that are tight on user keys
    1873            0 :                                                 // (as that's what `d.excise` produces), means that the only case
    1874            0 :                                                 // where we overlap with two or more files in `replaced` is if we
    1875            0 :                                                 // actually had data overlap all along, or if the ingestion files
    1876            0 :                                                 // were overlapping, either of which is an invariant violation.
    1877            0 :                                                 panic("updated with two files in ingestSplit")
    1878              :                                         }
    1879            1 :                                         splitFile = replaced[i].Meta
    1880            1 :                                         updatedSplitFile = true
    1881              :                                 }
    1882              :                         }
    1883            1 :                         if !updatedSplitFile {
    1884            1 :                                 // None of the replaced files overlapped with the file being ingested.
    1885            1 :                                 // This can happen if we've already excised a span overlapping with
    1886            1 :                                 // this file, or if we have consecutive ingested files that can slide
    1887            1 :                                 // within the same gap between keys in an existing file. For instance,
    1888            1 :                                 // if an existing file has keys a and g and we're ingesting b-c, d-e,
    1889            1 :                                 // the first loop iteration will split the existing file into one that
    1890            1 :                                 // ends in a and another that starts at g, and the second iteration will
    1891            1 :                                 // fall into this case and require no splitting.
    1892            1 :                                 //
    1893            1 :                                 // No splitting necessary.
    1894            1 :                                 splitFile = nil
    1895            1 :                         }
    1896              :                 }
    1897            1 :                 if splitFile == nil {
    1898            1 :                         continue
    1899              :                 }
    1900              :                 // NB: excise operates on [start, end). We're splitting at [start, end]
    1901              :                 // (assuming !s.ingestFile.Largest().IsExclusiveSentinel()). The conflation
    1902              :                 // of exclusive vs inclusive end bounds should not make a difference here
    1903              :                 // as we're guaranteed to not have any data overlap between splitFile and
    1904              :                 // s.ingestFile. d.excise will return an error if we pass an inclusive user
    1905              :                 // key bound _and_ we end up seeing data overlap at the end key.
    1906            1 :                 exciseBounds := base.UserKeyBoundsFromInternal(s.ingestFile.Smallest(), s.ingestFile.Largest())
    1907            1 :                 leftTable, rightTable, err := d.exciseTable(ctx, exciseBounds, splitFile, s.level, tightExciseBounds)
    1908            1 :                 if err != nil {
    1909            0 :                         return err
    1910            0 :                 }
    1911            1 :                 added := applyExciseToVersionEdit(ve, splitFile, leftTable, rightTable, s.level)
    1912            1 :                 replacedTables[splitFile.TableNum] = added
    1913            1 :                 for i := range added {
    1914            1 :                         addedBounds := added[i].Meta.UserKeyBounds()
    1915            1 :                         if s.ingestFile.Overlaps(d.cmp, &addedBounds) {
    1916            0 :                                 panic("ingest-time split produced a file that overlaps with ingested file")
    1917              :                         }
    1918              :                 }
    1919            1 :                 updateMetrics(splitFile, s.level, added)
    1920              :         }
    1921              :         // Flatten the version edit: an entry present in both ve.NewTables and
    1922              :         // ve.DeletedTables cancels out and is dropped from both.
    1923            1 :         newNewFiles := ve.NewTables[:0]
    1924            1 :         for i := range ve.NewTables {
    1925            1 :                 fn := ve.NewTables[i].Meta.TableNum
    1926            1 :                 deEntry := manifest.DeletedTableEntry{Level: ve.NewTables[i].Level, FileNum: fn}
    1927            1 :                 if _, ok := ve.DeletedTables[deEntry]; ok {
    1928            1 :                         delete(ve.DeletedTables, deEntry)
    1929            1 :                 } else {
    1930            1 :                         newNewFiles = append(newNewFiles, ve.NewTables[i])
    1931            1 :                 }
    1932              :         }
    1933            1 :         ve.NewTables = newNewFiles
    1934            1 :         return nil
    1935              : }
    1936              : 
    1937              : func (d *DB) ingestApply(
    1938              :         ctx context.Context,
    1939              :         jobID JobID,
    1940              :         lr ingestLoadResult,
    1941              :         mut *memTable,
    1942              :         exciseSpan KeyRange,
    1943              :         exciseBoundsPolicy exciseBoundsPolicy,
    1944              :         exciseSeqNum base.SeqNum,
    1945            1 : ) (*manifest.VersionEdit, time.Duration, error) {
    1946            1 :         d.mu.Lock()
    1947            1 :         defer d.mu.Unlock()
    1948            1 : 
    1949            1 :         ve := &manifest.VersionEdit{
    1950            1 :                 NewTables: make([]manifest.NewTableEntry, lr.fileCount()),
    1951            1 :         }
    1952            1 :         if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) {
    1953            1 :                 ve.DeletedTables = map[manifest.DeletedTableEntry]*manifest.TableMetadata{}
    1954            1 :         }
    1955            1 :         var metrics levelMetricsDelta
    1956            1 : 
    1957            1 :         // Determine the target level inside UpdateVersionLocked. This prevents two
    1958            1 :         // concurrent ingestion jobs from using the same version to determine the
    1959            1 :         // target level, and also provides serialization with concurrent compaction
    1960            1 :         // and flush jobs.
    1961            1 :         manifestUpdateDuration, err := d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) {
    1962            1 :                 if mut != nil {
    1963            1 :                         // Unref the mutable memtable to allows its flush to proceed. Now that we've
    1964            1 :                         // acquired the manifest lock, we can be certain that if the mutable
    1965            1 :                         // memtable has received more recent conflicting writes, the flush won't
    1966            1 :                         // beat us to applying to the manifest resulting in sequence number
    1967            1 :                         // inversion. Even though we call maybeScheduleFlush right now, this flush
    1968            1 :                         // will apply after our ingestion.
    1969            1 :                         if mut.writerUnref() {
    1970            1 :                                 d.maybeScheduleFlush()
    1971            1 :                         }
    1972              :                 }
    1973              : 
    1974            1 :                 current := d.mu.versions.currentVersion()
    1975            1 :                 overlapChecker := &overlapChecker{
    1976            1 :                         comparer: d.opts.Comparer,
    1977            1 :                         newIters: d.newIters,
    1978            1 :                         opts: IterOptions{
    1979            1 :                                 logger:   d.opts.Logger,
    1980            1 :                                 Category: categoryIngest,
    1981            1 :                         },
    1982            1 :                         v: current,
    1983            1 :                 }
    1984            1 :                 shouldIngestSplit := d.opts.Experimental.IngestSplit != nil &&
    1985            1 :                         d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables
    1986            1 :                 baseLevel := d.mu.versions.picker.getBaseLevel()
    1987            1 :                 // filesToSplit is a list where each element is a pair consisting of a file
    1988            1 :                 // being ingested and a file being split to make room for an ingestion into
    1989            1 :                 // that level. Each ingested file will appear at most once in this list. It
    1990            1 :                 // is possible for split files to appear twice in this list.
    1991            1 :                 filesToSplit := make([]ingestSplitFile, 0)
    1992            1 :                 checkCompactions := false
    1993            1 :                 for i := 0; i < lr.fileCount(); i++ {
    1994            1 :                         // Determine the lowest level in the LSM for which the sstable doesn't
    1995            1 :                         // overlap any existing files in the level.
    1996            1 :                         var m *manifest.TableMetadata
    1997            1 :                         specifiedLevel := -1
    1998            1 :                         isShared := false
    1999            1 :                         isExternal := false
    2000            1 :                         if i < len(lr.local) {
    2001            1 :                                 // local file.
    2002            1 :                                 m = lr.local[i].TableMetadata
    2003            1 :                         } else if (i - len(lr.local)) < len(lr.shared) {
    2004            1 :                                 // shared file.
    2005            1 :                                 isShared = true
    2006            1 :                                 sharedIdx := i - len(lr.local)
    2007            1 :                                 m = lr.shared[sharedIdx].TableMetadata
    2008            1 :                                 specifiedLevel = int(lr.shared[sharedIdx].shared.Level)
    2009            1 :                         } else {
    2010            1 :                                 // external file.
    2011            1 :                                 isExternal = true
    2012            1 :                                 externalIdx := i - (len(lr.local) + len(lr.shared))
    2013            1 :                                 m = lr.external[externalIdx].TableMetadata
    2014            1 :                                 if lr.externalFilesHaveLevel {
    2015            0 :                                         specifiedLevel = int(lr.external[externalIdx].external.Level)
    2016            0 :                                 }
    2017              :                         }
    2018              : 
    2019              :                         // Add to CreatedBackingTables if this is a new backing.
    2020              :                         //
    2021              :                         // Shared files always have a new backing. External files have new backings
    2022              :                         // iff the backing disk file num and the file num match (see ingestAttachRemote).
    2023            1 :                         if isShared || (isExternal && m.TableBacking.DiskFileNum == base.DiskFileNum(m.TableNum)) {
    2024            1 :                                 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.TableBacking)
    2025            1 :                         }
    2026              : 
    2027            1 :                         f := &ve.NewTables[i]
    2028            1 :                         var err error
    2029            1 :                         if specifiedLevel != -1 {
    2030            1 :                                 f.Level = specifiedLevel
    2031            1 :                         } else {
    2032            1 :                                 var splitTable *manifest.TableMetadata
    2033            1 :                                 if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest()) && exciseSpan.Contains(d.cmp, m.Largest()) {
    2034            1 :                                         // This file fits perfectly within the excise span. We can slot it at
    2035            1 :                                         // L6, or sharedLevelsStart - 1 if we have shared files.
    2036            1 :                                         if len(lr.shared) > 0 || lr.externalFilesHaveLevel {
    2037            1 :                                                 f.Level = sharedLevelsStart - 1
    2038            1 :                                                 if baseLevel > f.Level {
    2039            1 :                                                         f.Level = 0
    2040            1 :                                                 }
    2041            1 :                                         } else {
    2042            1 :                                                 f.Level = 6
    2043            1 :                                         }
    2044            1 :                                 } else {
    2045            1 :                                         // We check overlap against the LSM without holding DB.mu. Note that we
    2046            1 :                                         // are still holding the log lock, so the version cannot change.
    2047            1 :                                         // TODO(radu): perform this check optimistically outside of the log lock.
    2048            1 :                                         var lsmOverlap overlap.WithLSM
    2049            1 :                                         lsmOverlap, err = func() (overlap.WithLSM, error) {
    2050            1 :                                                 d.mu.Unlock()
    2051            1 :                                                 defer d.mu.Lock()
    2052            1 :                                                 return overlapChecker.DetermineLSMOverlap(ctx, m.UserKeyBounds())
    2053            1 :                                         }()
    2054            1 :                                         if err == nil {
    2055            1 :                                                 f.Level, splitTable, err = ingestTargetLevel(
    2056            1 :                                                         ctx, d.cmp, lsmOverlap, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit,
    2057            1 :                                                 )
    2058            1 :                                         }
    2059              :                                 }
    2060              : 
    2061            1 :                                 if splitTable != nil {
    2062            1 :                                         if invariants.Enabled {
    2063            1 :                                                 if lf := current.Levels[f.Level].Find(d.cmp, splitTable); lf.Empty() {
    2064            0 :                                                         panic("splitFile returned is not in level it should be")
    2065              :                                                 }
    2066              :                                         }
    2067              :                                         // We take advantage of the fact that we won't drop the db mutex
    2068              :                                         // between now and the call to UpdateVersionLocked. So, no files should
    2069              :                                         // get added to a new in-progress compaction at this point. We can
    2070              :                                         // avoid having to iterate on in-progress compactions to cancel them
    2071              :                                         // if none of the files being split have a compacting state.
    2072            1 :                                         if splitTable.IsCompacting() {
    2073            1 :                                                 checkCompactions = true
    2074            1 :                                         }
    2075            1 :                                         filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitTable, level: f.Level})
    2076              :                                 }
    2077              :                         }
    2078            1 :                         if err != nil {
    2079            0 :                                 return versionUpdate{}, err
    2080            0 :                         }
    2081            1 :                         if isShared && f.Level < sharedLevelsStart {
    2082            0 :                                 panic(fmt.Sprintf("cannot slot a shared file higher than the highest shared level: %d < %d",
    2083            0 :                                         f.Level, sharedLevelsStart))
    2084              :                         }
    2085            1 :                         f.Meta = m
    2086            1 :                         levelMetrics := metrics[f.Level]
    2087            1 :                         if levelMetrics == nil {
    2088            1 :                                 levelMetrics = &LevelMetrics{}
    2089            1 :                                 metrics[f.Level] = levelMetrics
    2090            1 :                         }
    2091            1 :                         levelMetrics.TablesCount++
    2092            1 :                         levelMetrics.TablesSize += int64(m.Size)
    2093            1 :                         levelMetrics.EstimatedReferencesSize += m.EstimatedReferenceSize()
    2094            1 :                         levelMetrics.TableBytesIngested += m.Size
    2095            1 :                         levelMetrics.TablesIngested++
    2096              :                 }
    2097              :                 // replacedTables maps files excised due to exciseSpan (or splitFiles returned
    2098              :                 // by ingestTargetLevel), to files that were created to replace it. This map
    2099              :                 // is used to resolve references to split files in filesToSplit, as it is
    2100              :                 // possible for a file that we want to split to no longer exist or have a
    2101              :                 // newer fileMetadata due to a split induced by another ingestion file, or an
    2102              :                 // excise.
    2103            1 :                 replacedTables := make(map[base.TableNum][]manifest.NewTableEntry)
    2104            1 :                 updateLevelMetricsOnExcise := func(m *manifest.TableMetadata, level int, added []manifest.NewTableEntry) {
    2105            1 :                         levelMetrics := metrics[level]
    2106            1 :                         if levelMetrics == nil {
    2107            1 :                                 levelMetrics = &LevelMetrics{}
    2108            1 :                                 metrics[level] = levelMetrics
    2109            1 :                         }
    2110            1 :                         levelMetrics.TablesCount--
    2111            1 :                         levelMetrics.TablesSize -= int64(m.Size)
    2112            1 :                         levelMetrics.EstimatedReferencesSize -= m.EstimatedReferenceSize()
    2113            1 :                         for i := range added {
    2114            1 :                                 levelMetrics.TablesCount++
    2115            1 :                                 levelMetrics.TablesSize += int64(added[i].Meta.Size)
    2116            1 :                                 levelMetrics.EstimatedReferencesSize += added[i].Meta.EstimatedReferenceSize()
    2117            1 :                         }
    2118              :                 }
    2119            1 :                 var exciseBounds base.UserKeyBounds
    2120            1 :                 if exciseSpan.Valid() {
    2121            1 :                         exciseBounds = exciseSpan.UserKeyBounds()
    2122            1 :                         d.mu.versions.metrics.Ingest.ExciseIngestCount++
    2123            1 :                         // Iterate through all levels and find files that intersect with exciseSpan.
    2124            1 :                         //
    2125            1 :                         // TODO(bilal): We could drop the DB mutex here as we don't need it for
    2126            1 :                         // excises; we only need to hold the version lock which we already are
    2127            1 :                         // holding. However releasing the DB mutex could mess with the
    2128            1 :                         // ingestTargetLevel calculation that happened above, as it assumed that it
    2129            1 :                         // had a complete view of in-progress compactions that wouldn't change
    2130            1 :                         // until UpdateVersionLocked is called. If we were to drop the mutex now,
    2131            1 :                         // we could schedule another in-progress compaction that would go into the
    2132            1 :                         // chosen target level and lead to file overlap within level (which would
    2133            1 :                         // panic in UpdateVersionLocked). We should drop the db mutex here, do the
    2134            1 :                         // excise, then re-grab the DB mutex and rerun just the in-progress
    2135            1 :                         // compaction check to see if any new compactions are conflicting with our
    2136            1 :                         // chosen target levels for files, and if they are, we should signal those
    2137            1 :                         // compactions to error out.
    2138            1 :                         for layer, ls := range current.AllLevelsAndSublevels() {
    2139            1 :                                 for m := range ls.Overlaps(d.cmp, exciseSpan.UserKeyBounds()).All() {
    2140            1 :                                         leftTable, rightTable, err := d.exciseTable(ctx, exciseBounds, m, layer.Level(), exciseBoundsPolicy)
    2141            1 :                                         if err != nil {
    2142            0 :                                                 return versionUpdate{}, err
    2143            0 :                                         }
    2144            1 :                                         newFiles := applyExciseToVersionEdit(ve, m, leftTable, rightTable, layer.Level())
    2145            1 :                                         replacedTables[m.TableNum] = newFiles
    2146            1 :                                         updateLevelMetricsOnExcise(m, layer.Level(), newFiles)
    2147              :                                 }
    2148              :                         }
    2149            1 :                         if d.FormatMajorVersion() >= FormatExciseBoundsRecord {
    2150            1 :                                 ve.ExciseBoundsRecord = append(ve.ExciseBoundsRecord, manifest.ExciseOpEntry{
    2151            1 :                                         Bounds: exciseBounds,
    2152            1 :                                         SeqNum: exciseSeqNum,
    2153            1 :                                 })
    2154            1 :                         }
    2155              :                 }
    2156            1 :                 if len(filesToSplit) > 0 {
    2157            1 :                         // For the same reasons as the above call to excise, we hold the db mutex
    2158            1 :                         // while calling this method.
    2159            1 :                         if err := d.ingestSplit(ctx, ve, updateLevelMetricsOnExcise, filesToSplit, replacedTables); err != nil {
    2160            0 :                                 return versionUpdate{}, err
    2161            0 :                         }
    2162              :                 }
    2163            1 :                 if len(filesToSplit) > 0 || exciseSpan.Valid() {
    2164            1 :                         for c := range d.mu.compact.inProgress {
    2165            1 :                                 if c.VersionEditApplied() {
    2166            1 :                                         continue
    2167              :                                 }
    2168              :                                 // Check if this compaction overlaps with the excise span. Note that just
    2169              :                                 // checking if the inputs individually overlap with the excise span
    2170              :                                 // isn't sufficient; for instance, a compaction could have [a,b] and [e,f]
    2171              :                                 // as inputs and write it all out as [a,b,e,f] in one sstable. If we're
    2172              :                                 // doing a [c,d) excise at the same time as this compaction, we will have
    2173              :                                 // to error out the whole compaction as we can't guarantee it hasn't/won't
    2174              :                                 // write a file overlapping with the excise span.
    2175            1 :                                 bounds := c.Bounds()
    2176            1 :                                 if bounds != nil && bounds.Overlaps(d.cmp, &exciseBounds) {
    2177            1 :                                         c.Cancel()
    2178            1 :                                 }
    2179              :                                 // Check if this compaction's inputs have been replaced due to an
    2180              :                                 // ingest-time split. In that case, cancel the compaction as a newly picked
    2181              :                                 // compaction would need to include any new files that slid in between
    2182              :                                 // previously-existing files. Note that we cancel any compaction that has a
    2183              :                                 // file that was ingest-split as an input, even if it started before this
    2184              :                                 // ingestion.
    2185            1 :                                 if checkCompactions {
    2186            1 :                                         for _, table := range c.Tables() {
    2187            1 :                                                 if _, ok := replacedTables[table.TableNum]; ok {
    2188            1 :                                                         c.Cancel()
    2189            1 :                                                         break
    2190              :                                                 }
    2191              :                                         }
    2192              :                                 }
    2193              :                         }
    2194              :                 }
    2195              : 
    2196            1 :                 return versionUpdate{
    2197            1 :                         VE:                      ve,
    2198            1 :                         JobID:                   jobID,
    2199            1 :                         Metrics:                 metrics,
    2200            1 :                         InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) },
    2201              :                 }, nil
    2202              :         })
    2203            1 :         if err != nil {
    2204            0 :                 return nil, 0, err
    2205            0 :         }
    2206              :         // Check for any EventuallyFileOnlySnapshots that could be watching for
    2207              :         // an excise on this span. There should be none as the
    2208              :         // computePossibleOverlaps steps should have forced these EFOS to transition
    2209              :         // to file-only snapshots by now. If we see any that conflict with this
    2210              :         // excise, panic.
    2211            1 :         if exciseSpan.Valid() {
    2212            1 :                 for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next {
    2213            1 :                         // Skip non-EFOS snapshots, and also skip any EFOS that were created
    2214            1 :                         // *after* the excise.
    2215            1 :                         if s.efos == nil || base.Visible(exciseSeqNum, s.efos.seqNum, base.SeqNumMax) {
    2216            0 :                                 continue
    2217              :                         }
    2218            1 :                         efos := s.efos
    2219            1 :                         // TODO(bilal): We can make this faster by taking advantage of the sorted
    2220            1 :                         // nature of protectedRanges to do a sort.Search, or even maintaining a
    2221            1 :                         // global list of all protected ranges instead of having to peer into every
    2222            1 :                         // snapshot.
    2223            1 :                         for i := range efos.protectedRanges {
    2224            1 :                                 if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) {
    2225            0 :                                         panic("unexpected excise of an EventuallyFileOnlySnapshot's bounds")
    2226              :                                 }
    2227              :                         }
    2228              :                 }
    2229              :         }
    2230              : 
    2231            1 :         d.mu.versions.metrics.Ingest.Count++
    2232            1 : 
    2233            1 :         d.updateReadStateLocked(d.opts.DebugCheck)
    2234            1 :         // updateReadStateLocked could have generated obsolete tables, schedule a
    2235            1 :         // cleanup job if necessary.
    2236            1 :         d.deleteObsoleteFiles(jobID)
    2237            1 :         d.updateTableStatsLocked(ve.NewTables)
    2238            1 :         // The ingestion may have pushed a level over the threshold for compaction,
    2239            1 :         // so check to see if one is necessary and schedule it.
    2240            1 :         d.maybeScheduleCompaction()
    2241            1 :         var toValidate []manifest.NewTableEntry
    2242            1 :         dedup := make(map[base.DiskFileNum]struct{})
    2243            1 :         for _, entry := range ve.NewTables {
    2244            1 :                 if _, ok := dedup[entry.Meta.TableBacking.DiskFileNum]; !ok {
    2245            1 :                         toValidate = append(toValidate, entry)
    2246            1 :                         dedup[entry.Meta.TableBacking.DiskFileNum] = struct{}{}
    2247            1 :                 }
    2248              :         }
    2249            1 :         d.maybeValidateSSTablesLocked(toValidate)
    2250            1 : 
    2251            1 :         return ve, manifestUpdateDuration, nil
    2252              : }
    2253              : 
    2254              : // maybeValidateSSTablesLocked appends newFiles to the pending table-validation
    2255              : // queue and, when shouldValidateSSTablesLocked allows it, starts validateSSTables
    2256              : // in a new goroutine. It does nothing unless d.opts.Experimental.ValidateOnIngest
    2257              : // is set. Entries are not deduplicated here: if two entries with the same backing
    2258              : // file are added, the block checksums for that backing file are validated twice.
    2259              : //
    2260              : // DB.mu must be locked when calling.
    2261            1 : func (d *DB) maybeValidateSSTablesLocked(newFiles []manifest.NewTableEntry) {
    2262            1 :         // Validation on ingest is opt-in; when it is disabled nothing is queued.
    2263            1 :         if !d.opts.Experimental.ValidateOnIngest {
    2264            1 :                 return
    2265            1 :         }
    2266              : 
    2267            1 :         d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...)
    2268            1 :         if d.shouldValidateSSTablesLocked() {
    2269            1 :                 go d.validateSSTables()
    2270            1 :         }
    2271              : }
    2272              : 
    2273              : // shouldValidateSSTablesLocked reports whether a validation round may start: none in flight,
    2274              : // d.closed unset, ValidateOnIngest on, and pending non-empty. DB.mu must be locked when calling.
    2275            1 : func (d *DB) shouldValidateSSTablesLocked() bool {
    2276            1 :         return !d.mu.tableValidation.validating &&
    2277            1 :                 d.closed.Load() == nil &&
    2278            1 :                 d.opts.Experimental.ValidateOnIngest &&
    2279            1 :                 len(d.mu.tableValidation.pending) > 0
    2280            1 : }
    2281              : 
    2282              : // validateSSTables runs one round of validation: it takes the pending queue, validates each
    2283              : // table's block checksums with DB.mu released, and re-queues tables that hit non-corruption errors.
    2284            1 : func (d *DB) validateSSTables() {
    2285            1 :         d.mu.Lock()
    2286            1 :         if !d.shouldValidateSSTablesLocked() {
    2287            1 :                 d.mu.Unlock()
    2288            1 :                 return
    2289            1 :         }
    2290              : 
    2291            1 :         pending := d.mu.tableValidation.pending
    2292            1 :         d.mu.tableValidation.pending = nil
    2293            1 :         d.mu.tableValidation.validating = true
    2294            1 :         jobID := d.newJobIDLocked()
    2295            1 :         rs := d.loadReadState()
    2296            1 : 
    2297            1 :         // Drop DB.mu before performing IO; it is re-acquired after the loop below.
    2298            1 :         d.mu.Unlock()
    2299            1 : 
    2300            1 :         // Validate all tables in the pending queue. This could lead to a situation
    2301            1 :         // where we are starving IO from other tasks due to having to page through
    2302            1 :         // all the blocks in all the sstables in the queue.
    2303            1 :         // TODO(travers): Add some form of pacing to avoid IO starvation.
    2304            1 : 
    2305            1 :         // Tables that fail to validate for reasons other than uncovered corruption
    2306            1 :         // are accumulated here and re-queued for another attempt.
    2307            1 :         var retry []manifest.NewTableEntry
    2308            1 : 
    2309            1 :         for _, f := range pending {
    2310            1 :                 // The file may have been moved or deleted since it was ingested. If it is no
    2311            1 :                 // longer at its ingested level, look for it below; skip it only if absent.
    2312            1 :                 if !rs.current.Contains(f.Level, f.Meta) {
    2313            1 :                         // Assume the file was moved to a lower level. It is rare enough
    2314            1 :                         // that a table is moved or deleted between the time it was ingested
    2315            1 :                         // and the time the validation routine runs that the overall cost of
    2316            1 :                         // this inner loop is tolerably low, when amortized over all
    2317            1 :                         // ingested tables.
    2318            1 :                         found := false
    2319            1 :                         for i := f.Level + 1; i < numLevels; i++ {
    2320            1 :                                 if rs.current.Contains(i, f.Meta) {
    2321            1 :                                         found = true
    2322            1 :                                         break
    2323              :                                 }
    2324              :                         }
    2325            1 :                         if !found {
    2326            1 :                                 continue
    2327              :                         }
    2328              :                 }
    2329              : 
    2330              :                 // TODO(radu): plumb a ReadEnv with a CategoryIngest stats collector through
    2331              :                 // to ValidateBlockChecksums.
    2332            1 :                 err := d.fileCache.withReader(context.TODO(), block.NoReadEnv,
    2333            1 :                         f.Meta, func(r *sstable.Reader, _ sstable.ReadEnv) error {
    2334            1 :                                 return r.ValidateBlockChecksums()
    2335            1 :                         })
    2336              : 
    2337            1 :                 if err != nil {
    2338            0 :                         if IsCorruptionError(err) {
    2339            0 :                                 // TODO(travers): Hook into the corruption reporting pipeline, once available. See pebble#1192.
    2340            0 :                                 // NOTE(review): if Fatalf returns, control falls through to TableValidated below; confirm intended.
    2341            0 :                                 d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err)
    2342            0 :                         } else {
    2343            0 :                                 // If there was some other, possibly transient, error that
    2344            0 :                                 // caused table validation to fail, inform the EventListener and
    2345            0 :                                 // move on. We remember the table so that we can retry it in a
    2346            0 :                                 // subsequent table validation job.
    2347            0 :                                 //
    2348            0 :                                 // TODO(jackson): If the error is not transient, this will retry
    2349            0 :                                 // validation indefinitely. While not great, it's the same
    2350            0 :                                 // behavior as erroring flushes and compactions. We should
    2351            0 :                                 // address this as a part of #270.
    2352            0 :                                 d.opts.EventListener.BackgroundError(err)
    2353            0 :                                 retry = append(retry, f)
    2354            0 :                                 continue
    2355              :                         }
    2356              :                 }
    2357              : 
    2358            1 :                 d.opts.EventListener.TableValidated(TableValidatedInfo{
    2359            1 :                         JobID: int(jobID),
    2360            1 :                         Meta:  f.Meta,
    2361            1 :                 })
    2362              :         }
    2363            1 :         rs.unref()
    2364            1 :         d.mu.Lock()
    2365            1 :         defer d.mu.Unlock()
    2366            1 :         d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, retry...)
    2367            1 :         d.mu.tableValidation.validating = false
    2368            1 :         d.mu.tableValidation.cond.Broadcast()
    2369            1 :         if d.shouldValidateSSTablesLocked() {
    2370            1 :                 go d.validateSSTables()
    2371            1 :         }
    2372              : }

Generated by: LCOV version 2.0-1