LCOV - code coverage report
Current view: top level - pebble - ingest.go (source / functions) Hit Total Coverage
Test: 2023-11-18 08:15Z 717d49c2 - meta test only.lcov Lines: 1159 1726 67.1 %
Date: 2023-11-18 08:16:19 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package pebble
       6             : 
       7             : import (
       8             :         "context"
       9             :         "slices"
      10             :         "sort"
      11             :         "time"
      12             : 
      13             :         "github.com/cockroachdb/errors"
      14             :         "github.com/cockroachdb/pebble/internal/base"
      15             :         "github.com/cockroachdb/pebble/internal/invariants"
      16             :         "github.com/cockroachdb/pebble/internal/keyspan"
      17             :         "github.com/cockroachdb/pebble/internal/manifest"
      18             :         "github.com/cockroachdb/pebble/internal/private"
      19             :         "github.com/cockroachdb/pebble/objstorage"
      20             :         "github.com/cockroachdb/pebble/objstorage/remote"
      21             :         "github.com/cockroachdb/pebble/sstable"
      22             : )
      23             : 
      24           1 : func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
      25           1 :         c := userCmp(a.UserKey, b.UserKey)
      26           1 :         if c != 0 {
      27           1 :                 return c
      28           1 :         }
      29           1 :         if a.IsExclusiveSentinel() {
      30           1 :                 if !b.IsExclusiveSentinel() {
      31           1 :                         return -1
      32           1 :                 }
      33           1 :         } else if b.IsExclusiveSentinel() {
      34           1 :                 return +1
      35           1 :         }
      36           1 :         return 0
      37             : }
      38             : 
      39             : // KeyRange encodes a key range in user key space. A KeyRange's Start is
      40             : // inclusive while its End is exclusive.
      41             : type KeyRange struct {
      42             :         Start, End []byte
      43             : }
      44             : 
      45             : // Valid returns true if the KeyRange is defined.
      46           1 : func (k *KeyRange) Valid() bool {
      47           1 :         return k.Start != nil && k.End != nil
      48           1 : }
      49             : 
      50             : // Contains returns whether the specified key exists in the KeyRange.
      51           1 : func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool {
      52           1 :         v := cmp(key.UserKey, k.End)
      53           1 :         return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0
      54           1 : }
      55             : 
      56             : // OverlapsInternalKeyRange checks if the specified internal key range has an
      57             : // overlap with the KeyRange. Note that we aren't checking for full containment
      58             : // of smallest-largest within k, rather just that there's some intersection
      59             : // between the two ranges.
      60           1 : func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool {
      61           1 :         v := cmp(k.Start, largest.UserKey)
      62           1 :         return v <= 0 && !(largest.IsExclusiveSentinel() && v == 0) &&
      63           1 :                 cmp(k.End, smallest.UserKey) > 0
      64           1 : }
      65             : 
      66             : // Overlaps checks if the specified file has an overlap with the KeyRange.
      67             : // Note that we aren't checking for full containment of m within k, rather just
      68             : // that there's some intersection between m and k's bounds.
      69           1 : func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool {
      70           1 :         return k.OverlapsInternalKeyRange(cmp, m.Smallest, m.Largest)
      71           1 : }
      72             : 
      73             : // OverlapsKeyRange checks if this span overlaps with the provided KeyRange.
      74             : // Note that we aren't checking for full containment of either span in the other,
      75             : // just that there's a key x that is in both key ranges.
      76           0 : func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool {
      77           0 :         return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0
      78           0 : }
      79             : 
      80           1 : func ingestValidateKey(opts *Options, key *InternalKey) error {
      81           1 :         if key.Kind() == InternalKeyKindInvalid {
      82           0 :                 return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s",
      83           0 :                         key.Pretty(opts.Comparer.FormatKey))
      84           0 :         }
      85           1 :         if key.SeqNum() != 0 {
      86           0 :                 return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s",
      87           0 :                         key.Pretty(opts.Comparer.FormatKey))
      88           0 :         }
      89           1 :         return nil
      90             : }
      91             : 
      92             : // ingestSynthesizeShared constructs a fileMetadata for one shared sstable owned
      93             : // or shared by another node.
      94             : func ingestSynthesizeShared(
      95             :         opts *Options, sm SharedSSTMeta, fileNum base.DiskFileNum,
      96           0 : ) (*fileMetadata, error) {
      97           0 :         if sm.Size == 0 {
      98           0 :                 // Disallow 0 file sizes
      99           0 :                 return nil, errors.New("pebble: cannot ingest shared file with size 0")
     100           0 :         }
     101             :         // Don't load table stats. Doing a round trip to shared storage, one SST
     102             :         // at a time is not worth it as it slows down ingestion.
     103           0 :         meta := &fileMetadata{
     104           0 :                 FileNum:      fileNum.FileNum(),
     105           0 :                 CreationTime: time.Now().Unix(),
     106           0 :                 Virtual:      true,
     107           0 :                 Size:         sm.Size,
     108           0 :         }
     109           0 :         meta.InitProviderBacking(fileNum)
     110           0 :         // Set the underlying FileBacking's size to the same size as the virtualized
     111           0 :         // view of the sstable. This ensures that we don't over-prioritize this
     112           0 :         // sstable for compaction just yet, as we do not have a clear sense of what
     113           0 :         // parts of this sstable are referenced by other nodes.
     114           0 :         meta.FileBacking.Size = sm.Size
     115           0 :         if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil {
     116           0 :                 // Initialize meta.{HasRangeKeys,Smallest,Largest}, etc.
     117           0 :                 //
     118           0 :                 // NB: We create new internal keys and pass them into ExternalRangeKeyBounds
     119           0 :                 // so that we can sub a zero sequence number into the bounds. We can set
     120           0 :                 // the sequence number to anything here; it'll be reset in ingestUpdateSeqNum
     121           0 :                 // anyway. However we do need to use the same sequence number across all
     122           0 :                 // bound keys at this step so that we end up with bounds that are consistent
     123           0 :                 // across point/range keys.
     124           0 :                 smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, sm.SmallestRangeKey.Kind())
     125           0 :                 largestRangeKey := base.MakeExclusiveSentinelKey(sm.LargestRangeKey.Kind(), sm.LargestRangeKey.UserKey)
     126           0 :                 meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey)
     127           0 :         }
     128           0 :         if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil {
     129           0 :                 // Initialize meta.{HasPointKeys,Smallest,Largest}, etc.
     130           0 :                 //
     131           0 :                 // See point above in the ExtendRangeKeyBounds call on why we use a zero
     132           0 :                 // sequence number here.
     133           0 :                 smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, sm.SmallestPointKey.Kind())
     134           0 :                 largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, sm.LargestPointKey.Kind())
     135           0 :                 if sm.LargestPointKey.IsExclusiveSentinel() {
     136           0 :                         largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey)
     137           0 :                 }
     138           0 :                 meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey)
     139             :         }
     140           0 :         if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
     141           0 :                 return nil, err
     142           0 :         }
     143           0 :         return meta, nil
     144             : }
     145             : 
     146             : // ingestLoad1External loads the fileMetadata for one external sstable.
     147             : // Sequence number and target level calculation happens during prepare/apply.
     148             : func ingestLoad1External(
     149             :         opts *Options,
     150             :         e ExternalFile,
     151             :         fileNum base.DiskFileNum,
     152             :         objprovider objstorage.Provider,
     153             :         jobID int,
     154           0 : ) (*fileMetadata, error) {
     155           0 :         if e.Size == 0 {
     156           0 :                 // Disallow 0 file sizes
     157           0 :                 return nil, errors.New("pebble: cannot ingest external file with size 0")
     158           0 :         }
     159           0 :         if !e.HasRangeKey && !e.HasPointKey {
     160           0 :                 return nil, errors.New("pebble: cannot ingest external file with no point or range keys")
     161           0 :         }
     162             :         // Don't load table stats. Doing a round trip to shared storage, one SST
     163             :         // at a time is not worth it as it slows down ingestion.
     164           0 :         meta := &fileMetadata{}
     165           0 :         meta.FileNum = fileNum.FileNum()
     166           0 :         meta.CreationTime = time.Now().Unix()
     167           0 :         meta.Virtual = true
     168           0 :         meta.Size = e.Size
     169           0 :         meta.InitProviderBacking(fileNum)
     170           0 : 
     171           0 :         // Try to resolve a reference to the external file.
     172           0 :         backing, err := objprovider.CreateExternalObjectBacking(e.Locator, e.ObjName)
     173           0 :         if err != nil {
     174           0 :                 return nil, err
     175           0 :         }
     176           0 :         metas, err := objprovider.AttachRemoteObjects([]objstorage.RemoteObjectToAttach{{
     177           0 :                 FileNum:  fileNum,
     178           0 :                 FileType: fileTypeTable,
     179           0 :                 Backing:  backing,
     180           0 :         }})
     181           0 :         if err != nil {
     182           0 :                 return nil, err
     183           0 :         }
     184           0 :         if opts.EventListener.TableCreated != nil {
     185           0 :                 opts.EventListener.TableCreated(TableCreateInfo{
     186           0 :                         JobID:   jobID,
     187           0 :                         Reason:  "ingesting",
     188           0 :                         Path:    objprovider.Path(metas[0]),
     189           0 :                         FileNum: fileNum.FileNum(),
     190           0 :                 })
     191           0 :         }
     192             :         // In the name of keeping this ingestion as fast as possible, we avoid
     193             :         // *all* existence checks and synthesize a file metadata with smallest/largest
     194             :         // keys that overlap whatever the passed-in span was.
     195           0 :         smallestCopy := make([]byte, len(e.SmallestUserKey))
     196           0 :         copy(smallestCopy, e.SmallestUserKey)
     197           0 :         largestCopy := make([]byte, len(e.LargestUserKey))
     198           0 :         copy(largestCopy, e.LargestUserKey)
     199           0 :         if e.HasPointKey {
     200           0 :                 meta.ExtendPointKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax),
     201           0 :                         base.MakeRangeDeleteSentinelKey(largestCopy))
     202           0 :         }
     203           0 :         if e.HasRangeKey {
     204           0 :                 meta.ExtendRangeKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeySet),
     205           0 :                         base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyDelete, largestCopy))
     206           0 :         }
     207             : 
     208             :         // Set the underlying FileBacking's size to the same size as the virtualized
     209             :         // view of the sstable. This ensures that we don't over-prioritize this
     210             :         // sstable for compaction just yet, as we do not have a clear sense of
     211             :         // what parts of this sstable are referenced by other nodes.
     212           0 :         meta.FileBacking.Size = e.Size
     213           0 : 
     214           0 :         if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
     215           0 :                 return nil, err
     216           0 :         }
     217           0 :         return meta, nil
     218             : }
     219             : 
     220             : // ingestLoad1 creates the FileMetadata for one file. This file will be owned
     221             : // by this store.
     222             : func ingestLoad1(
     223             :         opts *Options,
     224             :         fmv FormatMajorVersion,
     225             :         readable objstorage.Readable,
     226             :         cacheID uint64,
     227             :         fileNum base.DiskFileNum,
     228           1 : ) (*fileMetadata, error) {
     229           1 :         cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption)
     230           1 :         r, err := sstable.NewReader(readable, opts.MakeReaderOptions(), cacheOpts)
     231           1 :         if err != nil {
     232           0 :                 return nil, err
     233           0 :         }
     234           1 :         defer r.Close()
     235           1 : 
     236           1 :         // Avoid ingesting tables with format versions this DB doesn't support.
     237           1 :         tf, err := r.TableFormat()
     238           1 :         if err != nil {
     239           0 :                 return nil, err
     240           0 :         }
     241           1 :         if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
     242           0 :                 return nil, errors.Newf(
     243           0 :                         "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)",
     244           0 :                         tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
     245           0 :                 )
     246           0 :         }
     247             : 
     248           1 :         meta := &fileMetadata{}
     249           1 :         meta.FileNum = fileNum.FileNum()
     250           1 :         meta.Size = uint64(readable.Size())
     251           1 :         meta.CreationTime = time.Now().Unix()
     252           1 :         meta.InitPhysicalBacking()
     253           1 : 
     254           1 :         // Avoid loading into the table cache for collecting stats if we
     255           1 :         // don't need to. If there are no range deletions, we have all the
     256           1 :         // information to compute the stats here.
     257           1 :         //
     258           1 :         // This is helpful in tests for avoiding awkwardness around deletion of
     259           1 :         // ingested files from MemFS. MemFS implements the Windows semantics of
     260           1 :         // disallowing removal of an open file. Under MemFS, if we don't populate
     261           1 :         // meta.Stats here, the file will be loaded into the table cache for
     262           1 :         // calculating stats before we can remove the original link.
     263           1 :         maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties)
     264           1 : 
     265           1 :         {
     266           1 :                 iter, err := r.NewIter(nil /* lower */, nil /* upper */)
     267           1 :                 if err != nil {
     268           0 :                         return nil, err
     269           0 :                 }
     270           1 :                 defer iter.Close()
     271           1 :                 var smallest InternalKey
     272           1 :                 if key, _ := iter.First(); key != nil {
     273           1 :                         if err := ingestValidateKey(opts, key); err != nil {
     274           0 :                                 return nil, err
     275           0 :                         }
     276           1 :                         smallest = (*key).Clone()
     277             :                 }
     278           1 :                 if err := iter.Error(); err != nil {
     279           0 :                         return nil, err
     280           0 :                 }
     281           1 :                 if key, _ := iter.Last(); key != nil {
     282           1 :                         if err := ingestValidateKey(opts, key); err != nil {
     283           0 :                                 return nil, err
     284           0 :                         }
     285           1 :                         meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone())
     286             :                 }
     287           1 :                 if err := iter.Error(); err != nil {
     288           0 :                         return nil, err
     289           0 :                 }
     290             :         }
     291             : 
     292           1 :         iter, err := r.NewRawRangeDelIter()
     293           1 :         if err != nil {
     294           0 :                 return nil, err
     295           0 :         }
     296           1 :         if iter != nil {
     297           1 :                 defer iter.Close()
     298           1 :                 var smallest InternalKey
     299           1 :                 if s := iter.First(); s != nil {
     300           1 :                         key := s.SmallestKey()
     301           1 :                         if err := ingestValidateKey(opts, &key); err != nil {
     302           0 :                                 return nil, err
     303           0 :                         }
     304           1 :                         smallest = key.Clone()
     305             :                 }
     306           1 :                 if err := iter.Error(); err != nil {
     307           0 :                         return nil, err
     308           0 :                 }
     309           1 :                 if s := iter.Last(); s != nil {
     310           1 :                         k := s.SmallestKey()
     311           1 :                         if err := ingestValidateKey(opts, &k); err != nil {
     312           0 :                                 return nil, err
     313           0 :                         }
     314           1 :                         largest := s.LargestKey().Clone()
     315           1 :                         meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
     316             :                 }
     317             :         }
     318             : 
     319             :         // Update the range-key bounds for the table.
     320           1 :         {
     321           1 :                 iter, err := r.NewRawRangeKeyIter()
     322           1 :                 if err != nil {
     323           0 :                         return nil, err
     324           0 :                 }
     325           1 :                 if iter != nil {
     326           1 :                         defer iter.Close()
     327           1 :                         var smallest InternalKey
     328           1 :                         if s := iter.First(); s != nil {
     329           1 :                                 key := s.SmallestKey()
     330           1 :                                 if err := ingestValidateKey(opts, &key); err != nil {
     331           0 :                                         return nil, err
     332           0 :                                 }
     333           1 :                                 smallest = key.Clone()
     334             :                         }
     335           1 :                         if err := iter.Error(); err != nil {
     336           0 :                                 return nil, err
     337           0 :                         }
     338           1 :                         if s := iter.Last(); s != nil {
     339           1 :                                 k := s.SmallestKey()
     340           1 :                                 if err := ingestValidateKey(opts, &k); err != nil {
     341           0 :                                         return nil, err
     342           0 :                                 }
     343             :                                 // As range keys are fragmented, the end key of the last range key in
     344             :                                 // the table provides the upper bound for the table.
     345           1 :                                 largest := s.LargestKey().Clone()
     346           1 :                                 meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
     347             :                         }
     348           1 :                         if err := iter.Error(); err != nil {
     349           0 :                                 return nil, err
     350           0 :                         }
     351             :                 }
     352             :         }
     353             : 
     354           1 :         if !meta.HasPointKeys && !meta.HasRangeKeys {
     355           1 :                 return nil, nil
     356           1 :         }
     357             : 
     358             :         // Sanity check that the various bounds on the file were set consistently.
     359           1 :         if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
     360           0 :                 return nil, err
     361           0 :         }
     362             : 
     363           1 :         return meta, nil
     364             : }
     365             : 
     366             : type ingestLoadResult struct {
     367             :         localMeta, sharedMeta []*fileMetadata
     368             :         externalMeta          []*fileMetadata
     369             :         localPaths            []string
     370             :         sharedLevels          []uint8
     371             :         fileCount             int
     372             : }
     373             : 
     374             : func ingestLoad(
     375             :         opts *Options,
     376             :         fmv FormatMajorVersion,
     377             :         paths []string,
     378             :         shared []SharedSSTMeta,
     379             :         external []ExternalFile,
     380             :         cacheID uint64,
     381             :         pending []base.DiskFileNum,
     382             :         objProvider objstorage.Provider,
     383             :         jobID int,
     384           1 : ) (ingestLoadResult, error) {
     385           1 :         meta := make([]*fileMetadata, 0, len(paths))
     386           1 :         newPaths := make([]string, 0, len(paths))
     387           1 :         for i := range paths {
     388           1 :                 f, err := opts.FS.Open(paths[i])
     389           1 :                 if err != nil {
     390           0 :                         return ingestLoadResult{}, err
     391           0 :                 }
     392             : 
     393           1 :                 readable, err := sstable.NewSimpleReadable(f)
     394           1 :                 if err != nil {
     395           0 :                         return ingestLoadResult{}, err
     396           0 :                 }
     397           1 :                 m, err := ingestLoad1(opts, fmv, readable, cacheID, pending[i])
     398           1 :                 if err != nil {
     399           0 :                         return ingestLoadResult{}, err
     400           0 :                 }
     401           1 :                 if m != nil {
     402           1 :                         meta = append(meta, m)
     403           1 :                         newPaths = append(newPaths, paths[i])
     404           1 :                 }
     405             :         }
     406           1 :         if len(shared) == 0 && len(external) == 0 {
     407           1 :                 return ingestLoadResult{localMeta: meta, localPaths: newPaths, fileCount: len(meta)}, nil
     408           1 :         }
     409             : 
     410             :         // Sort the shared files according to level.
     411           0 :         sort.Sort(sharedByLevel(shared))
     412           0 : 
     413           0 :         sharedMeta := make([]*fileMetadata, 0, len(shared))
     414           0 :         levels := make([]uint8, 0, len(shared))
     415           0 :         for i := range shared {
     416           0 :                 m, err := ingestSynthesizeShared(opts, shared[i], pending[len(paths)+i])
     417           0 :                 if err != nil {
     418           0 :                         return ingestLoadResult{}, err
     419           0 :                 }
     420           0 :                 if shared[i].Level < sharedLevelsStart {
     421           0 :                         return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart")
     422           0 :                 }
     423           0 :                 sharedMeta = append(sharedMeta, m)
     424           0 :                 levels = append(levels, shared[i].Level)
     425             :         }
     426           0 :         externalMeta := make([]*fileMetadata, 0, len(external))
     427           0 :         for i := range external {
     428           0 :                 m, err := ingestLoad1External(opts, external[i], pending[len(paths)+len(shared)+i], objProvider, jobID)
     429           0 :                 if err != nil {
     430           0 :                         return ingestLoadResult{}, err
     431           0 :                 }
     432           0 :                 externalMeta = append(externalMeta, m)
     433             :         }
     434           0 :         result := ingestLoadResult{
     435           0 :                 localMeta:    meta,
     436           0 :                 sharedMeta:   sharedMeta,
     437           0 :                 externalMeta: externalMeta,
     438           0 :                 localPaths:   newPaths,
     439           0 :                 sharedLevels: levels,
     440           0 :                 fileCount:    len(meta) + len(sharedMeta) + len(externalMeta),
     441           0 :         }
     442           0 :         return result, nil
     443             : }
     444             : 
     445             : // Struct for sorting metadatas by smallest user keys, while ensuring the
     446             : // matching path also gets swapped to the same index. For use in
     447             : // ingestSortAndVerify.
     448             : type metaAndPaths struct {
     449             :         meta  []*fileMetadata
     450             :         paths []string
     451             :         cmp   Compare
     452             : }
     453             : 
     454           1 : func (m metaAndPaths) Len() int {
     455           1 :         return len(m.meta)
     456           1 : }
     457             : 
     458           1 : func (m metaAndPaths) Less(i, j int) bool {
     459           1 :         return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0
     460           1 : }
     461             : 
     462           1 : func (m metaAndPaths) Swap(i, j int) {
     463           1 :         m.meta[i], m.meta[j] = m.meta[j], m.meta[i]
     464           1 :         if m.paths != nil {
     465           1 :                 m.paths[i], m.paths[j] = m.paths[j], m.paths[i]
     466           1 :         }
     467             : }
     468             : 
     469           1 : func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error {
     470           1 :         // Verify that all the shared files (i.e. files in sharedMeta)
     471           1 :         // fit within the exciseSpan.
     472           1 :         for i := range lr.sharedMeta {
     473           0 :                 f := lr.sharedMeta[i]
     474           0 :                 if !exciseSpan.Contains(cmp, f.Smallest) || !exciseSpan.Contains(cmp, f.Largest) {
     475           0 :                         return errors.AssertionFailedf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String())
     476           0 :                 }
     477             :         }
     478           1 :         if len(lr.externalMeta) > 0 {
     479           0 :                 if len(lr.localMeta) > 0 || len(lr.sharedMeta) > 0 {
     480           0 :                         // Currently we only support external ingests on their own. If external
     481           0 :                         // files are present alongside local/shared files, return an error.
     482           0 :                         return errors.AssertionFailedf("pebble: external files cannot be ingested atomically alongside other types of files")
     483           0 :                 }
     484           0 :                 sort.Sort(&metaAndPaths{
     485           0 :                         meta: lr.externalMeta,
     486           0 :                         cmp:  cmp,
     487           0 :                 })
     488           0 :                 for i := 1; i < len(lr.externalMeta); i++ {
     489           0 :                         if sstableKeyCompare(cmp, lr.externalMeta[i-1].Largest, lr.externalMeta[i].Smallest) >= 0 {
     490           0 :                                 return errors.AssertionFailedf("pebble: external sstables have overlapping ranges")
     491           0 :                         }
     492             :                 }
     493           0 :                 return nil
     494             :         }
     495           1 :         if len(lr.localMeta) <= 1 || len(lr.localPaths) <= 1 {
     496           1 :                 return nil
     497           1 :         }
     498             : 
     499           1 :         sort.Sort(&metaAndPaths{
     500           1 :                 meta:  lr.localMeta,
     501           1 :                 paths: lr.localPaths,
     502           1 :                 cmp:   cmp,
     503           1 :         })
     504           1 : 
     505           1 :         for i := 1; i < len(lr.localPaths); i++ {
     506           1 :                 if sstableKeyCompare(cmp, lr.localMeta[i-1].Largest, lr.localMeta[i].Smallest) >= 0 {
     507           1 :                         return errors.AssertionFailedf("pebble: local ingestion sstables have overlapping ranges")
     508           1 :                 }
     509             :         }
     510           1 :         if len(lr.sharedMeta) == 0 {
     511           1 :                 return nil
     512           1 :         }
     513           0 :         filesInLevel := make([]*fileMetadata, 0, len(lr.sharedMeta))
     514           0 :         for l := sharedLevelsStart; l < numLevels; l++ {
     515           0 :                 filesInLevel = filesInLevel[:0]
     516           0 :                 for i := range lr.sharedMeta {
     517           0 :                         if lr.sharedLevels[i] == uint8(l) {
     518           0 :                                 filesInLevel = append(filesInLevel, lr.sharedMeta[i])
     519           0 :                         }
     520             :                 }
     521           0 :                 slices.SortFunc(filesInLevel, func(a, b *fileMetadata) int {
     522           0 :                         return cmp(a.Smallest.UserKey, b.Smallest.UserKey)
     523           0 :                 })
     524           0 :                 for i := 1; i < len(filesInLevel); i++ {
     525           0 :                         if sstableKeyCompare(cmp, filesInLevel[i-1].Largest, filesInLevel[i].Smallest) >= 0 {
     526           0 :                                 return errors.AssertionFailedf("pebble: external shared sstables have overlapping ranges")
     527           0 :                         }
     528             :                 }
     529             :         }
     530           0 :         return nil
     531             : }
     532             : 
     533           0 : func ingestCleanup(objProvider objstorage.Provider, meta []*fileMetadata) error {
     534           0 :         var firstErr error
     535           0 :         for i := range meta {
     536           0 :                 if err := objProvider.Remove(fileTypeTable, meta[i].FileBacking.DiskFileNum); err != nil {
     537           0 :                         firstErr = firstError(firstErr, err)
     538           0 :                 }
     539             :         }
     540           0 :         return firstErr
     541             : }
     542             : 
     543             : // ingestLink creates new objects which are backed by either hardlinks to or
     544             : // copies of the ingested files. It also attaches shared objects to the provider.
     545             : func ingestLink(
     546             :         jobID int,
     547             :         opts *Options,
     548             :         objProvider objstorage.Provider,
     549             :         lr ingestLoadResult,
     550             :         shared []SharedSSTMeta,
     551           1 : ) error {
     552           1 :         for i := range lr.localPaths {
     553           1 :                 objMeta, err := objProvider.LinkOrCopyFromLocal(
     554           1 :                         context.TODO(), opts.FS, lr.localPaths[i], fileTypeTable, lr.localMeta[i].FileBacking.DiskFileNum,
     555           1 :                         objstorage.CreateOptions{PreferSharedStorage: true},
     556           1 :                 )
     557           1 :                 if err != nil {
     558           0 :                         if err2 := ingestCleanup(objProvider, lr.localMeta[:i]); err2 != nil {
     559           0 :                                 opts.Logger.Errorf("ingest cleanup failed: %v", err2)
     560           0 :                         }
     561           0 :                         return err
     562             :                 }
     563           1 :                 if opts.EventListener.TableCreated != nil {
     564           1 :                         opts.EventListener.TableCreated(TableCreateInfo{
     565           1 :                                 JobID:   jobID,
     566           1 :                                 Reason:  "ingesting",
     567           1 :                                 Path:    objProvider.Path(objMeta),
     568           1 :                                 FileNum: lr.localMeta[i].FileNum,
     569           1 :                         })
     570           1 :                 }
     571             :         }
     572           1 :         sharedObjs := make([]objstorage.RemoteObjectToAttach, 0, len(shared))
     573           1 :         for i := range shared {
     574           0 :                 backing, err := shared[i].Backing.Get()
     575           0 :                 if err != nil {
     576           0 :                         return err
     577           0 :                 }
     578           0 :                 sharedObjs = append(sharedObjs, objstorage.RemoteObjectToAttach{
     579           0 :                         FileNum:  lr.sharedMeta[i].FileBacking.DiskFileNum,
     580           0 :                         FileType: fileTypeTable,
     581           0 :                         Backing:  backing,
     582           0 :                 })
     583             :         }
     584           1 :         sharedObjMetas, err := objProvider.AttachRemoteObjects(sharedObjs)
     585           1 :         if err != nil {
     586           0 :                 return err
     587           0 :         }
     588           1 :         for i := range sharedObjMetas {
     589           0 :                 // One corner case around file sizes we need to be mindful of, is that
     590           0 :                 // if one of the shareObjs was initially created by us (and has boomeranged
     591           0 :                 // back from another node), we'll need to update the FileBacking's size
     592           0 :                 // to be the true underlying size. Otherwise, we could hit errors when we
     593           0 :                 // open the db again after a crash/restart (see checkConsistency in open.go),
     594           0 :                 // plus it more accurately allows us to prioritize compactions of files
     595           0 :                 // that were originally created by us.
     596           0 :                 if sharedObjMetas[i].IsShared() && !objProvider.IsSharedForeign(sharedObjMetas[i]) {
     597           0 :                         size, err := objProvider.Size(sharedObjMetas[i])
     598           0 :                         if err != nil {
     599           0 :                                 return err
     600           0 :                         }
     601           0 :                         lr.sharedMeta[i].FileBacking.Size = uint64(size)
     602             :                 }
     603           0 :                 if opts.EventListener.TableCreated != nil {
     604           0 :                         opts.EventListener.TableCreated(TableCreateInfo{
     605           0 :                                 JobID:   jobID,
     606           0 :                                 Reason:  "ingesting",
     607           0 :                                 Path:    objProvider.Path(sharedObjMetas[i]),
     608           0 :                                 FileNum: lr.sharedMeta[i].FileNum,
     609           0 :                         })
     610           0 :                 }
     611             :         }
     612             :         // We do not need to do anything about lr.externalMetas. Those were already
     613             :         // linked in ingestLoad.
     614             : 
     615           1 :         return nil
     616             : }
     617             : 
     618           1 : func ingestMemtableOverlaps(cmp Compare, mem flushable, keyRanges []internalKeyRange) bool {
     619           1 :         iter := mem.newIter(nil)
     620           1 :         rangeDelIter := mem.newRangeDelIter(nil)
     621           1 :         rkeyIter := mem.newRangeKeyIter(nil)
     622           1 : 
     623           1 :         closeIters := func() error {
     624           1 :                 err := iter.Close()
     625           1 :                 if rangeDelIter != nil {
     626           1 :                         err = firstError(err, rangeDelIter.Close())
     627           1 :                 }
     628           1 :                 if rkeyIter != nil {
     629           1 :                         err = firstError(err, rkeyIter.Close())
     630           1 :                 }
     631           1 :                 return err
     632             :         }
     633             : 
     634           1 :         for _, kr := range keyRanges {
     635           1 :                 if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, cmp) {
     636           1 :                         closeIters()
     637           1 :                         return true
     638           1 :                 }
     639             :         }
     640             : 
     641             :         // Assume overlap if any iterator errored out.
     642           1 :         return closeIters() != nil
     643             : }
     644             : 
     645             : func ingestUpdateSeqNum(
     646             :         cmp Compare, format base.FormatKey, seqNum uint64, loadResult ingestLoadResult,
     647           1 : ) error {
     648           1 :         setSeqFn := func(k base.InternalKey) base.InternalKey {
     649           1 :                 return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
     650           1 :         }
     651           1 :         updateMetadata := func(m *fileMetadata) error {
     652           1 :                 // NB: we set the fields directly here, rather than via their Extend*
     653           1 :                 // methods, as we are updating sequence numbers.
     654           1 :                 if m.HasPointKeys {
     655           1 :                         m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
     656           1 :                 }
     657           1 :                 if m.HasRangeKeys {
     658           1 :                         m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
     659           1 :                 }
     660           1 :                 m.Smallest = setSeqFn(m.Smallest)
     661           1 :                 // Only update the seqnum for the largest key if that key is not an
     662           1 :                 // "exclusive sentinel" (i.e. a range deletion sentinel or a range key
     663           1 :                 // boundary), as doing so effectively drops the exclusive sentinel (by
     664           1 :                 // lowering the seqnum from the max value), and extends the bounds of the
     665           1 :                 // table.
     666           1 :                 // NB: as the largest range key is always an exclusive sentinel, it is never
     667           1 :                 // updated.
     668           1 :                 if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
     669           1 :                         m.LargestPointKey = setSeqFn(m.LargestPointKey)
     670           1 :                 }
     671           1 :                 if !m.Largest.IsExclusiveSentinel() {
     672           1 :                         m.Largest = setSeqFn(m.Largest)
     673           1 :                 }
     674             :                 // Setting smallestSeqNum == largestSeqNum triggers the setting of
     675             :                 // Properties.GlobalSeqNum when an sstable is loaded.
     676           1 :                 m.SmallestSeqNum = seqNum
     677           1 :                 m.LargestSeqNum = seqNum
     678           1 :                 // Ensure the new bounds are consistent.
     679           1 :                 if err := m.Validate(cmp, format); err != nil {
     680           0 :                         return err
     681           0 :                 }
     682           1 :                 seqNum++
     683           1 :                 return nil
     684             :         }
     685             : 
     686             :         // Shared sstables are required to be sorted by level ascending. We then
     687             :         // iterate the shared sstables in reverse, assigning the lower sequence
     688             :         // numbers to the shared sstables that will be ingested into the lower
     689             :         // (larger numbered) levels first. This ensures sequence number shadowing is
     690             :         // correct.
     691           1 :         for i := len(loadResult.sharedMeta) - 1; i >= 0; i-- {
     692           0 :                 if i-1 >= 0 && loadResult.sharedLevels[i-1] > loadResult.sharedLevels[i] {
     693           0 :                         panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.sharedMeta[i-1], loadResult.sharedMeta[i]))
     694             :                 }
     695           0 :                 if err := updateMetadata(loadResult.sharedMeta[i]); err != nil {
     696           0 :                         return err
     697           0 :                 }
     698             :         }
     699           1 :         for i := range loadResult.localMeta {
     700           1 :                 if err := updateMetadata(loadResult.localMeta[i]); err != nil {
     701           0 :                         return err
     702           0 :                 }
     703             :         }
     704           1 :         for i := range loadResult.externalMeta {
     705           0 :                 if err := updateMetadata(loadResult.externalMeta[i]); err != nil {
     706           0 :                         return err
     707           0 :                 }
     708             :         }
     709           1 :         return nil
     710             : }
     711             : 
     712             : // Denotes an internal key range. Smallest and largest are both inclusive.
     713             : type internalKeyRange struct {
     714             :         smallest, largest InternalKey
     715             : }
     716             : 
// overlapWithIterator reports whether keyRange overlaps any of:
//   - a point key surfaced by iter,
//   - a span surfaced by rkeyIter (skipped if rkeyIter is nil),
//   - a span surfaced by *rangeDelIter (skipped if the pointer or the
//     iterator it points to is nil).
//
// The checks are performed in that order and the function returns at the
// first overlap found. An error from any of the iterators is conservatively
// reported as an overlap.
//
// rangeDelIter is passed by pointer and is only dereferenced after iter has
// been seeked; callers such as ingestTargetLevel hand the same pointer to a
// levelIter (via initRangeDel) so that the seek sets it up for the target
// file.
func overlapWithIterator(
	iter internalIterator,
	rangeDelIter *keyspan.FragmentIterator,
	rkeyIter keyspan.FragmentIterator,
	keyRange internalKeyRange,
	cmp Compare,
) bool {
	// Check overlap with point operations.
	//
	// When using levelIter, it seeks to the SST whose boundaries
	// contain keyRange.smallest.UserKey(S).
	// It then tries to find a point in that SST that is >= S.
	// If there's no such point it means the SST ends in a tombstone in which case
	// levelIter.SeekGE generates a boundary range del sentinel.
	// The comparison of this boundary with keyRange.largest(L) below
	// is subtle but maintains correctness.
	// 1) boundary < L,
	//    since boundary is also > S (initial seek),
	//    whatever the boundary's start key may be, we're always overlapping.
	// 2) boundary > L,
	//    overlap with boundary cannot be determined since we don't know boundary's start key.
	//    We require checking for overlap with rangeDelIter.
	// 3) boundary == L and L is not sentinel,
	//    means boundary < L and hence is similar to 1).
	// 4) boundary == L and L is sentinel,
	//    we'll always overlap since for any values of i,j ranges [i, k) and [j, k) always overlap.
	key, _ := iter.SeekGE(keyRange.smallest.UserKey, base.SeekGEFlagsNone)
	if key != nil {
		// key is the first key at or after the start of the range; it
		// overlaps iff it does not sort after the end of the range.
		c := sstableKeyCompare(cmp, *key, keyRange.largest)
		if c <= 0 {
			return true
		}
	}
	// Assume overlap if iterator errored.
	if err := iter.Error(); err != nil {
		return true
	}

	// computeOverlapWithSpans reports whether any non-empty span surfaced by
	// rIter overlaps keyRange. An iterator error is reported as overlap.
	computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) bool {
		// NB: The spans surfaced by the fragment iterator are non-overlapping.
		//
		// Start from the last span that begins before the range's smallest
		// user key, since it may extend into the range. If there is no such
		// span, start from the iterator's next span instead.
		span := rIter.SeekLT(keyRange.smallest.UserKey)
		if span == nil {
			span = rIter.Next()
		}
		for ; span != nil; span = rIter.Next() {
			if span.Empty() {
				continue
			}
			key := span.SmallestKey()
			c := sstableKeyCompare(cmp, key, keyRange.largest)
			if c > 0 {
				// The start of the span is after the largest key in the
				// ingested table.
				return false
			}
			if cmp(span.End, keyRange.smallest.UserKey) > 0 {
				// The end of the span is greater than the smallest in the
				// table. Note that the span end key is exclusive, thus ">0"
				// instead of ">=0".
				return true
			}
		}
		// Assume overlap if iterator errored.
		if err := rIter.Error(); err != nil {
			return true
		}
		return false
	}

	// rkeyIter is either a range key level iter, or a range key iterator
	// over a single file.
	if rkeyIter != nil {
		if computeOverlapWithSpans(rkeyIter) {
			return true
		}
	}

	// Check overlap with range deletions.
	if rangeDelIter == nil || *rangeDelIter == nil {
		return false
	}
	return computeOverlapWithSpans(*rangeDelIter)
}
     800             : 
     801             : // ingestTargetLevel returns the target level for a file being ingested.
     802             : // If suggestSplit is true, it accounts for ingest-time splitting as part of
     803             : // its target level calculation, and if a split candidate is found, that file
     804             : // is returned as the splitFile.
     805             : func ingestTargetLevel(
     806             :         newIters tableNewIters,
     807             :         newRangeKeyIter keyspan.TableNewSpanIter,
     808             :         iterOps IterOptions,
     809             :         comparer *Comparer,
     810             :         v *version,
     811             :         baseLevel int,
     812             :         compactions map[*compaction]struct{},
     813             :         meta *fileMetadata,
     814             :         suggestSplit bool,
     815           1 : ) (targetLevel int, splitFile *fileMetadata, err error) {
     816           1 :         // Find the lowest level which does not have any files which overlap meta. We
     817           1 :         // search from L0 to L6 looking for whether there are any files in the level
     818           1 :         // which overlap meta. We want the "lowest" level (where lower means
     819           1 :         // increasing level number) in order to reduce write amplification.
     820           1 :         //
     821           1 :         // There are 2 kinds of overlap we need to check for: file boundary overlap
     822           1 :         // and data overlap. Data overlap implies file boundary overlap. Note that it
     823           1 :         // is always possible to ingest into L0.
     824           1 :         //
     825           1 :         // To place meta at level i where i > 0:
     826           1 :         // - there must not be any data overlap with levels <= i, since that will
     827           1 :         //   violate the sequence number invariant.
     828           1 :         // - no file boundary overlap with level i, since that will violate the
     829           1 :         //   invariant that files do not overlap in levels i > 0.
     830           1 :         //   - if there is only a file overlap at a given level, and no data overlap,
     831           1 :         //     we can still slot a file at that level. We return the fileMetadata with
     832           1 :         //     which we have file boundary overlap (must be only one file, as sstable
     833           1 :         //     bounds are usually tight on user keys) and the caller is expected to split
     834           1 :         //     that sstable into two virtual sstables, allowing this file to go into that
     835           1 :         //     level. Note that if we have file boundary overlap with two files, which
     836           1 :         //     should only happen on rare occasions, we treat it as data overlap and
     837           1 :         //     don't use this optimization.
     838           1 :         //
     839           1 :         // The file boundary overlap check is simpler to conceptualize. Consider the
     840           1 :         // following example, in which the ingested file lies completely before or
     841           1 :         // after the file being considered.
     842           1 :         //
     843           1 :         //   |--|           |--|  ingested file: [a,b] or [f,g]
     844           1 :         //         |-----|        existing file: [c,e]
     845           1 :         //  _____________________
     846           1 :         //   a  b  c  d  e  f  g
     847           1 :         //
     848           1 :         // In both cases the ingested file can move to considering the next level.
     849           1 :         //
     850           1 :         // File boundary overlap does not necessarily imply data overlap. The check
     851           1 :         // for data overlap is a little more nuanced. Consider the following examples:
     852           1 :         //
     853           1 :         //  1. No data overlap:
     854           1 :         //
     855           1 :         //          |-|   |--|    ingested file: [cc-d] or [ee-ff]
     856           1 :         //  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
     857           1 :         //  _____________________
     858           1 :         //   a  b  c  d  e  f  g
     859           1 :         //
     860           1 :         // In this case the ingested files can "fall through" this level. The checks
     861           1 :         // continue at the next level.
     862           1 :         //
     863           1 :         //  2. Data overlap:
     864           1 :         //
     865           1 :         //            |--|        ingested file: [d-e]
     866           1 :         //  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
     867           1 :         //  _____________________
     868           1 :         //   a  b  c  d  e  f  g
     869           1 :         //
     870           1 :         // In this case the file cannot be ingested into this level as the point 'dd'
     871           1 :         // is in the way.
     872           1 :         //
     873           1 :         // It is worth noting that the check for data overlap is only approximate. In
     874           1 :         // the previous example, the ingested table [d-e] could contain only the
     875           1 :         // points 'd' and 'e', in which case the table would be eligible for
     876           1 :         // considering lower levels. However, such a fine-grained check would need to
     877           1 :         // be exhaustive (comparing points and ranges in both the ingested existing
     878           1 :         // tables) and such a check is prohibitively expensive. Thus Pebble treats any
     879           1 :         // existing point that falls within the ingested table bounds as being "data
     880           1 :         // overlap".
     881           1 : 
     882           1 :         // This assertion implicitly checks that we have the current version of
     883           1 :         // the metadata.
     884           1 :         if v.L0Sublevels == nil {
     885           0 :                 return 0, nil, errors.AssertionFailedf("could not read L0 sublevels")
     886           0 :         }
     887           1 :         iterOps.CategoryAndQoS = sstable.CategoryAndQoS{
     888           1 :                 Category: "pebble-ingest",
     889           1 :                 QoSLevel: sstable.LatencySensitiveQoSLevel,
     890           1 :         }
     891           1 :         // Check for overlap over the keys of L0 by iterating over the sublevels.
     892           1 :         for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ {
     893           1 :                 iter := newLevelIter(context.Background(),
     894           1 :                         iterOps, comparer, newIters, v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{})
     895           1 : 
     896           1 :                 var rangeDelIter keyspan.FragmentIterator
     897           1 :                 // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
     898           1 :                 // sets it up for the target file.
     899           1 :                 iter.initRangeDel(&rangeDelIter)
     900           1 : 
     901           1 :                 levelIter := keyspan.LevelIter{}
     902           1 :                 levelIter.Init(
     903           1 :                         keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
     904           1 :                         v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange,
     905           1 :                 )
     906           1 : 
     907           1 :                 kr := internalKeyRange{
     908           1 :                         smallest: meta.Smallest,
     909           1 :                         largest:  meta.Largest,
     910           1 :                 }
     911           1 :                 overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, kr, comparer.Compare)
     912           1 :                 err := iter.Close() // Closes range del iter as well.
     913           1 :                 err = firstError(err, levelIter.Close())
     914           1 :                 if err != nil {
     915           0 :                         return 0, nil, err
     916           0 :                 }
     917           1 :                 if overlap {
     918           1 :                         return targetLevel, nil, nil
     919           1 :                 }
     920             :         }
     921             : 
     922           1 :         level := baseLevel
     923           1 :         for ; level < numLevels; level++ {
     924           1 :                 levelIter := newLevelIter(context.Background(),
     925           1 :                         iterOps, comparer, newIters, v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
     926           1 :                 var rangeDelIter keyspan.FragmentIterator
     927           1 :                 // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
     928           1 :                 // sets it up for the target file.
     929           1 :                 levelIter.initRangeDel(&rangeDelIter)
     930           1 : 
     931           1 :                 rkeyLevelIter := &keyspan.LevelIter{}
     932           1 :                 rkeyLevelIter.Init(
     933           1 :                         keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
     934           1 :                         v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange,
     935           1 :                 )
     936           1 : 
     937           1 :                 kr := internalKeyRange{
     938           1 :                         smallest: meta.Smallest,
     939           1 :                         largest:  meta.Largest,
     940           1 :                 }
     941           1 :                 overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, kr, comparer.Compare)
     942           1 :                 err := levelIter.Close() // Closes range del iter as well.
     943           1 :                 err = firstError(err, rkeyLevelIter.Close())
     944           1 :                 if err != nil {
     945           0 :                         return 0, nil, err
     946           0 :                 }
     947           1 :                 if overlap {
     948           1 :                         return targetLevel, splitFile, nil
     949           1 :                 }
     950             : 
     951             :                 // Check boundary overlap.
     952           1 :                 var candidateSplitFile *fileMetadata
     953           1 :                 boundaryOverlaps := v.Overlaps(level, comparer.Compare, meta.Smallest.UserKey,
     954           1 :                         meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
     955           1 :                 if !boundaryOverlaps.Empty() {
     956           1 :                         // We are already guaranteed to not have any data overlaps with files
     957           1 :                         // in boundaryOverlaps, otherwise we'd have returned in the above if
     958           1 :                         // statements. Use this, plus boundaryOverlaps.Len() == 1 to detect for
     959           1 :                         // the case where we can slot this file into the current level despite
     960           1 :                         // a boundary overlap, by splitting one existing file into two virtual
     961           1 :                         // sstables.
     962           1 :                         if suggestSplit && boundaryOverlaps.Len() == 1 {
     963           1 :                                 iter := boundaryOverlaps.Iter()
     964           1 :                                 candidateSplitFile = iter.First()
     965           1 :                         } else {
     966           1 :                                 // We either don't want to suggest ingest-time splits (i.e.
     967           1 :                                 // !suggestSplit), or we boundary-overlapped with more than one file.
     968           1 :                                 continue
     969             :                         }
     970             :                 }
     971             : 
     972             :                 // Check boundary overlap with any ongoing compactions. We consider an
     973             :                 // overlapping compaction that's writing files to an output level as
     974             :                 // equivalent to boundary overlap with files in that output level.
     975             :                 //
     976             :                 // We cannot check for data overlap with the new SSTs compaction will produce
     977             :                 // since compaction hasn't been done yet. However, there's no need to check
     978             :                 // since all keys in them will be from levels in [c.startLevel,
     979             :                 // c.outputLevel], and all those levels have already had their data overlap
     980             :                 // tested negative (else we'd have returned earlier).
     981             :                 //
     982             :                 // An alternative approach would be to cancel these compactions and proceed
     983             :                 // with an ingest-time split on this level if necessary. However, compaction
     984             :                 // cancellation can result in significant wasted effort and is best avoided
     985             :                 // unless necessary.
     986           1 :                 overlaps := false
     987           1 :                 for c := range compactions {
     988           1 :                         if c.outputLevel == nil || level != c.outputLevel.level {
     989           1 :                                 continue
     990             :                         }
     991           1 :                         if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
     992           1 :                                 comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
     993           1 :                                 overlaps = true
     994           1 :                                 break
     995             :                         }
     996             :                 }
     997           1 :                 if !overlaps {
     998           1 :                         targetLevel = level
     999           1 :                         splitFile = candidateSplitFile
    1000           1 :                 }
    1001             :         }
    1002           1 :         return targetLevel, splitFile, nil
    1003             : }
    1004             : 
    1005             : // Ingest ingests a set of sstables into the DB. Ingestion of the files is
    1006             : // atomic and semantically equivalent to creating a single batch containing all
    1007             : // of the mutations in the sstables. Ingestion may require the memtable to be
    1008             : // flushed. The ingested sstable files are moved into the DB and must reside on
    1009             : // the same filesystem as the DB. Sstables can be created for ingestion using
    1010             : // sstable.Writer. On success, Ingest removes the input paths.
    1011             : //
    1012             : // Two types of sstables are accepted for ingestion: one is sstables present
    1013             : // in the instance's vfs.FS and can be referenced locally. The other is sstables
    1014             : // present in remote.Storage, referred to as shared or foreign sstables. These
    1015             : // shared sstables can be linked through objstorageprovider.Provider, and do not
    1016             : // need to already be present on the local vfs.FS. Foreign sstables must all fit
    1017             : // in an excise span, and are destined for a level specified in SharedSSTMeta.
    1018             : //
    1019             : // All sstables *must* be Sync()'d by the caller after all bytes are written
    1020             : // and before its file handle is closed; failure to do so could violate
    1021             : // durability or lead to corrupted on-disk state. This method cannot, in a
    1022             : // platform-and-FS-agnostic way, ensure that all sstables in the input are
    1023             : // properly synced to disk. Opening new file handles and Sync()-ing them
    1024             : // does not always guarantee durability; see the discussion here on that:
    1025             : // https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
    1026             : //
    1027             : // Ingestion loads each sstable into the lowest level of the LSM which it
    1028             : // doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
    1029             : // ingestion forces the memtable to flush, and then waits for the flush to
    1030             : // occur. In some cases, such as with no foreign sstables and no excise span,
    1031             : // ingestion that gets blocked on a memtable can join the flushable queue and
    1032             : // finish even before the memtable has been flushed.
    1033             : //
    1034             : // The steps for ingestion are:
    1035             : //
    1036             : //  1. Allocate file numbers for every sstable being ingested.
    1037             : //  2. Load the metadata for all sstables being ingested.
    1038             : //  3. Sort the sstables by smallest key, verifying non-overlap (for local
    1039             : //     sstables).
    1040             : //  4. Hard link (or copy) the local sstables into the DB directory.
    1041             : //  5. Allocate a sequence number to use for all of the entries in the
    1042             : //     local sstables. This is the step where overlap with memtables is
    1043             : //     determined. If there is overlap, we remember the most recent memtable
    1044             : //     that overlaps.
    1045             : //  6. Update the sequence number in the ingested local sstables. (Remote
    1046             : //     sstables get fixed sequence numbers that were determined at load time.)
    1047             : //  7. Wait for the most recent memtable that overlaps to flush (if any).
    1048             : //  8. Add the ingested sstables to the version (DB.ingestApply).
    1049             : //     8.1.  If an excise span was specified, figure out what sstables in the
    1050             : //     current version overlap with the excise span, and create new virtual
    1051             : //     sstables out of those sstables that exclude the excised span (DB.excise).
    1052             : //  9. Publish the ingestion sequence number.
    1053             : //
    1054             : // Note that if the mutable memtable overlaps with ingestion, a flush of the
    1055             : // memtable is forced, equivalent to DB.Flush. Additionally, subsequent
    1056             : // mutations that get sequence numbers larger than the ingestion sequence
    1057             : // number get queued up behind the ingestion waiting for it to complete. This
    1058             : // can produce a noticeable hiccup in performance. See
    1059             : // https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix
    1060             : // this hiccup. Ingest panics if d.closed holds an error, and returns ErrReadOnly if d.opts.ReadOnly is set.
    1061           1 : func (d *DB) Ingest(paths []string) error {
    1062           1 :         if err := d.closed.Load(); err != nil {
    1063           0 :                 panic(err)
    1064             :         }
    1065           1 :         if d.opts.ReadOnly {
    1066           0 :                 return ErrReadOnly
    1067           0 :         }
    1068           1 :         _, err := d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
    1069           1 :         return err
    1070             : }
    1071             : 
    1072             : // IngestOperationStats provides some information about where in the LSM the
    1073             : // bytes were ingested. It is returned by IngestWithStats, IngestExternalFiles and IngestAndExcise.
    1074             : type IngestOperationStats struct {
    1075             :         // Bytes is the total number of bytes in the ingested sstables.
    1076             :         Bytes uint64
    1077             :         // ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
    1078             :         // into L0. This value is approximate when flushable ingests are active and
    1079             :         // an ingest overlaps an entry in the flushable queue. Currently, this
    1080             :         // approximation is very rough, only including tables that overlapped the
    1081             :         // memtable. This estimate may be improved with issue #2112.
    1082             :         ApproxIngestedIntoL0Bytes uint64
    1083             :         // MemtableOverlappingFiles is the count of ingested sstables
    1084             :         // that overlapped keys in the memtables.
    1085             :         MemtableOverlappingFiles int
    1086             : }
    1087             : 
    1088             : // ExternalFile describes an external sstable that can be referenced through
    1089             : // objProvider and ingested as a remote file that will not be refcounted or
    1090             : // cleaned up. For use with online restore. Note that the underlying sstable
    1091             : // could contain keys outside the [SmallestUserKey, LargestUserKey) bounds;
    1092             : // however, Pebble is expected to only read the keys within those bounds.
    1093             : type ExternalFile struct {
    1094             :         // Locator is the remote.Locator that can be used with objProvider to
    1095             :         // resolve a reference to this external sstable.
    1096             :         Locator remote.Locator
    1097             :         // ObjName is the unique name of this sstable on Locator.
    1098             :         ObjName string
    1099             :         // Size is the size of the referenced portion of the virtualized sstable. An
    1100             :         // estimate is acceptable in lieu of the backing file size.
    1101             :         Size uint64
    1102             :         // SmallestUserKey and LargestUserKey are the [smallest,largest) user key
    1103             :         // bounds of the sstable. Both of these bounds are loose, i.e. it's possible
    1104             :         // for the sstable to not span the entirety of this range. However, multiple
    1105             :         // ExternalFiles in one ingestion must all have non-overlapping
    1106             :         // [smallest, largest) spans. Note that the LargestUserKey bound is exclusive.
    1107             :         SmallestUserKey, LargestUserKey []byte
    1108             :         // HasPointKey and HasRangeKey denote whether this file contains point keys
    1109             :         // or range keys. If both are false, an error is returned during
    1110             :         // ingestion.
    1111             :         HasPointKey, HasRangeKey bool
    1112             : }
    1113             : 
    1114             : // IngestWithStats does the same as Ingest, and additionally returns
    1115             : // IngestOperationStats. It panics if d.closed holds an error, and returns ErrReadOnly if d.opts.ReadOnly is set.
    1116           0 : func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
    1117           0 :         if err := d.closed.Load(); err != nil {
    1118           0 :                 panic(err)
    1119             :         }
    1120           0 :         if d.opts.ReadOnly {
    1121           0 :                 return IngestOperationStats{}, ErrReadOnly
    1122           0 :         }
    1123           0 :         return d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
    1124             : }
    1125             : 
    1126             : // IngestExternalFiles does the same as IngestWithStats, and additionally
    1127             : // accepts external files (with locator info that can be resolved using
    1128             : // d.opts.Experimental.RemoteStorage; an error is returned if that is nil).
    1129             : // These files must be non-overlapping with each other, and resolvable through d.objProvider.
    1130           0 : func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error) {
    1131           0 :         if err := d.closed.Load(); err != nil {
    1132           0 :                 panic(err)
    1133             :         }
    1134             : 
    1135           0 :         if d.opts.ReadOnly {
    1136           0 :                 return IngestOperationStats{}, ErrReadOnly
    1137           0 :         }
    1138           0 :         if d.opts.Experimental.RemoteStorage == nil {
    1139           0 :                 return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured")
    1140           0 :         }
    1141           0 :         return d.ingest(nil, ingestTargetLevel, nil /* shared */, KeyRange{}, external)
    1142             : }
    1143             : 
    1144             : // IngestAndExcise does the same as IngestWithStats, and additionally accepts a
    1145             : // list of shared files to ingest that can be read from a remote.Storage through
    1146             : // a Provider. All the shared files must live within exciseSpan, and any existing
    1147             : // keys in exciseSpan are deleted by turning existing sstables into virtual
    1148             : // sstables (if not virtual already) and shrinking their spans to exclude
    1149             : // exciseSpan. See the comment at Ingest for a more complete picture of the
    1150             : // ingestion process.
    1151             : //
    1152             : // Panics if shared sstables are present but d.opts.Experimental.RemoteStorage is nil. Returns an error
    1153             : // if exciseSpan is valid or shared sstables are present while the format major version is below FormatVirtualSSTables.
    1154             : func (d *DB) IngestAndExcise(
    1155             :         paths []string, shared []SharedSSTMeta, exciseSpan KeyRange,
    1156           0 : ) (IngestOperationStats, error) {
    1157           0 :         if err := d.closed.Load(); err != nil {
    1158           0 :                 panic(err)
    1159             :         }
    1160           0 :         if d.opts.ReadOnly {
    1161           0 :                 return IngestOperationStats{}, ErrReadOnly
    1162           0 :         }
    1163           0 :         return d.ingest(paths, ingestTargetLevel, shared, exciseSpan, nil /* external */)
    1164             : }
    1165             : 
    1166             : // newIngestedFlushableEntry stamps seqNum onto meta and wraps the files in a flushableEntry for logNum. Both DB.mu and commitPipeline.mu must be held while this is called.
    1167             : func (d *DB) newIngestedFlushableEntry(
    1168             :         meta []*fileMetadata, seqNum uint64, logNum base.DiskFileNum,
    1169           1 : ) (*flushableEntry, error) {
    1170           1 :         // Update the sequence number for all of the sstables in the
    1171           1 :         // metadata. Writing the metadata to the manifest when the
    1172           1 :         // version edit is applied is the mechanism that persists the
    1173           1 :         // sequence number. The sstables themselves are left unmodified.
    1174           1 :         // In this case, a version edit will only be written to the manifest
    1175           1 :         // when the flushable is eventually flushed. If Pebble restarts in that
    1176           1 :         // time, then we'll lose the ingest sequence number information. But this
    1177           1 :         // information will also be reconstructed on node restart.
    1178           1 :         if err := ingestUpdateSeqNum(
    1179           1 :                 d.cmp, d.opts.Comparer.FormatKey, seqNum, ingestLoadResult{localMeta: meta},
    1180           1 :         ); err != nil {
    1181           0 :                 return nil, err
    1182           0 :         }
    1183             : 
    1184           1 :         f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter)
    1185           1 : 
    1186           1 :         // NB: The logNum/seqNum are the WAL number which we're writing this entry
    1187           1 :         // to and the sequence number within the WAL which we'll write this entry
    1188           1 :         // to.
    1189           1 :         entry := d.newFlushableEntry(f, logNum, seqNum)
    1190           1 :         // The flushable entry starts off with a single reader ref, so increment
    1191           1 :         // the FileMetadata.Refs. entry.unrefFiles drops those refs again and returns the backings of files whose Unref() reached 0.
    1192           1 :         for _, file := range f.files {
    1193           1 :                 file.Ref()
    1194           1 :         }
    1195           1 :         entry.unrefFiles = func() []*fileBacking {
    1196           1 :                 var obsolete []*fileBacking
    1197           1 :                 for _, file := range f.files {
    1198           1 :                         if file.Unref() == 0 {
    1199           1 :                                 obsolete = append(obsolete, file.FileMetadata.FileBacking)
    1200           1 :                         }
    1201             :                 }
    1202           1 :                 return obsolete
    1203             :         }
    1204             : 
    1205           1 :         entry.flushForced = true
    1206           1 :         entry.releaseMemAccounting = func() {}
    1207           1 :         return entry, nil
    1208             : }
    1209             : 
    1210             : // handleIngestAsFlushable appends the ingested sstables in meta to the
    1211             : // flushable queue. Both DB.mu and commitPipeline.mu must be held while this is
    1212             : // called. Since we're holding both locks, the order in which we rotate the memtable or recycle
    1213             : // the WAL here is irrelevant as long as the correct log numbers are assigned to the appropriate flushable.
    1214           1 : func (d *DB) handleIngestAsFlushable(meta []*fileMetadata, seqNum uint64) error {
    1215           1 :         b := d.NewBatch()
    1216           1 :         for _, m := range meta {
    1217           1 :                 b.ingestSST(m.FileNum)
    1218           1 :         }
    1219           1 :         b.setSeqNum(seqNum)
    1220           1 : 
    1221           1 :         // If the WAL is disabled, then the logNum used to create the flushable
    1222           1 :         // entry doesn't matter. We just use the logNum assigned to the current
    1223           1 :         // mutable memtable. If the WAL is enabled, then this logNum will be
    1224           1 :         // overwritten by the logNum of the log which will contain the log entry
    1225           1 :         // for the ingestedFlushable.
    1226           1 :         logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
    1227           1 :         if !d.opts.DisableWAL {
    1228           1 :                 // We create a new WAL for the flushable instead of reusing the end of
    1229           1 :                 // the previous WAL. This simplifies the increment of the minimum
    1230           1 :                 // unflushed log number, and also simplifies WAL replay. d.mu is released around d.commit.directWrite(b).
    1231           1 :                 logNum, _ = d.recycleWAL()
    1232           1 :                 d.mu.Unlock()
    1233           1 :                 err := d.commit.directWrite(b)
    1234           1 :                 if err != nil {
    1235           0 :                         d.opts.Logger.Fatalf("%v", err)
    1236           0 :                 }
    1237           1 :                 d.mu.Lock()
    1238             :         }
    1239             : 
    1240           1 :         entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum)
    1241           1 :         if err != nil {
    1242           0 :                 return err
    1243           0 :         }
    1244           1 :         nextSeqNum := seqNum + uint64(b.Count())
    1245           1 : 
    1246           1 :         // Set newLogNum to the logNum of the previous flushable. This value is
    1247           1 :         // irrelevant if the WAL is disabled. If the WAL is enabled, then we set
    1248           1 :         // the appropriate value below.
    1249           1 :         newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
    1250           1 :         if !d.opts.DisableWAL {
    1251           1 :                 // This is the WAL num of the next mutable memtable, which comes after the
    1252           1 :                 // ingestedFlushable in the flushable queue and will be created below.
    1253           1 :                 // NOTE(review): the err tested below is the one already checked after newIngestedFlushableEntry (recycleWAL's second result is discarded), so this check looks dead — confirm.
    1254           1 :                 newLogNum, _ = d.recycleWAL()
    1255           1 :                 if err != nil {
    1256           0 :                         return err
    1257           0 :                 }
    1258             :         }
    1259             : 
    1260           1 :         currMem := d.mu.mem.mutable
    1261           1 :         // NB: Placing ingested sstables above the current memtables
    1262           1 :         // requires rotating of the existing memtables/WAL. There is
    1263           1 :         // some concern of churning through tiny memtables due to
    1264           1 :         // ingested sstables being placed on top of them, but those
    1265           1 :         // memtables would have to be flushed anyways.
    1266           1 :         d.mu.mem.queue = append(d.mu.mem.queue, entry)
    1267           1 :         d.rotateMemtable(newLogNum, nextSeqNum, currMem)
    1268           1 :         d.updateReadStateLocked(d.opts.DebugCheck)
    1269           1 :         d.maybeScheduleFlush()
    1270           1 :         return nil
    1271             : }
    1272             : 
    1273             : // See comment at Ingest() for details on how this works.
    1274             : func (d *DB) ingest(
    1275             :         paths []string,
    1276             :         targetLevelFunc ingestTargetLevelFunc,
    1277             :         shared []SharedSSTMeta,
    1278             :         exciseSpan KeyRange,
    1279             :         external []ExternalFile,
    1280           1 : ) (IngestOperationStats, error) {
    1281           1 :         if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil {
    1282           0 :                 panic("cannot ingest shared sstables with nil SharedStorage")
    1283             :         }
    1284           1 :         if (exciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables {
    1285           0 :                 return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion")
    1286           0 :         }
    1287             :         // Allocate file numbers for all of the files being ingested and mark them as
    1288             :         // pending in order to prevent them from being deleted. Note that this causes
    1289             :         // the file number ordering to be out of alignment with sequence number
    1290             :         // ordering. The sorting of L0 tables by sequence number avoids relying on
    1291             :         // that (busted) invariant.
    1292           1 :         d.mu.Lock()
    1293           1 :         pendingOutputs := make([]base.DiskFileNum, len(paths)+len(shared)+len(external))
    1294           1 :         for i := 0; i < len(paths)+len(shared)+len(external); i++ {
    1295           1 :                 pendingOutputs[i] = d.mu.versions.getNextDiskFileNum()
    1296           1 :         }
    1297             : 
    1298           1 :         jobID := d.mu.nextJobID
    1299           1 :         d.mu.nextJobID++
    1300           1 :         d.mu.Unlock()
    1301           1 : 
    1302           1 :         // Load the metadata for all the files being ingested. This step detects
    1303           1 :         // and elides empty sstables.
    1304           1 :         loadResult, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheID, pendingOutputs, d.objProvider, jobID)
    1305           1 :         if err != nil {
    1306           0 :                 return IngestOperationStats{}, err
    1307           0 :         }
    1308             : 
    1309           1 :         if loadResult.fileCount == 0 {
    1310           1 :                 // All of the sstables to be ingested were empty. Nothing to do.
    1311           1 :                 return IngestOperationStats{}, nil
    1312           1 :         }
    1313             : 
    1314             :         // Verify the sstables do not overlap.
    1315           1 :         if err := ingestSortAndVerify(d.cmp, loadResult, exciseSpan); err != nil {
    1316           1 :                 return IngestOperationStats{}, err
    1317           1 :         }
    1318             : 
    1319             :         // Hard link the sstables into the DB directory. Since the sstables aren't
    1320             :         // referenced by a version, they won't be used. If the hard linking fails
    1321             :         // (e.g. because the files reside on a different filesystem), ingestLink will
    1322             :         // fall back to copying, and if that fails we undo our work and return an
    1323             :         // error.
    1324           1 :         if err := ingestLink(jobID, d.opts, d.objProvider, loadResult, shared); err != nil {
    1325           0 :                 return IngestOperationStats{}, err
    1326           0 :         }
    1327             : 
    1328             :         // Make the new tables durable. We need to do this at some point before we
    1329             :         // update the MANIFEST (via logAndApply), otherwise a crash can have the
    1330             :         // tables referenced in the MANIFEST, but not present in the provider.
    1331           1 :         if err := d.objProvider.Sync(); err != nil {
    1332           0 :                 return IngestOperationStats{}, err
    1333           0 :         }
    1334             : 
    1335             :         // metaFlushableOverlaps is a slice parallel to meta indicating which of the
    1336             :         // ingested sstables overlap some table in the flushable queue. It's used to
    1337             :         // approximate ingest-into-L0 stats when using flushable ingests.
    1338           1 :         metaFlushableOverlaps := make([]bool, loadResult.fileCount)
    1339           1 :         var mem *flushableEntry
    1340           1 :         var mut *memTable
    1341           1 :         // asFlushable indicates whether the sstable was ingested as a flushable.
    1342           1 :         var asFlushable bool
    1343           1 :         iterOps := IterOptions{
    1344           1 :                 CategoryAndQoS: sstable.CategoryAndQoS{
    1345           1 :                         Category: "pebble-ingest",
    1346           1 :                         QoSLevel: sstable.LatencySensitiveQoSLevel,
    1347           1 :                 },
    1348           1 :         }
    1349           1 :         prepare := func(seqNum uint64) {
    1350           1 :                 // Note that d.commit.mu is held by commitPipeline when calling prepare.
    1351           1 : 
    1352           1 :                 d.mu.Lock()
    1353           1 :                 defer d.mu.Unlock()
    1354           1 : 
    1355           1 :                 // Check to see if any files overlap with any of the memtables. The queue
    1356           1 :                 // is ordered from oldest to newest with the mutable memtable being the
    1357           1 :                 // last element in the slice. We want to wait for the newest table that
    1358           1 :                 // overlaps.
    1359           1 : 
    1360           1 :                 for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
    1361           1 :                         m := d.mu.mem.queue[i]
    1362           1 :                         iter := m.newIter(&iterOps)
    1363           1 :                         rangeDelIter := m.newRangeDelIter(&iterOps)
    1364           1 :                         rkeyIter := m.newRangeKeyIter(&iterOps)
    1365           1 : 
    1366           1 :                         checkForOverlap := func(i int, meta *fileMetadata) {
    1367           1 :                                 if metaFlushableOverlaps[i] {
    1368           1 :                                         // This table already overlapped a more recent flushable.
    1369           1 :                                         return
    1370           1 :                                 }
    1371           1 :                                 kr := internalKeyRange{
    1372           1 :                                         smallest: meta.Smallest,
    1373           1 :                                         largest:  meta.Largest,
    1374           1 :                                 }
    1375           1 :                                 if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
    1376           1 :                                         // If this is the first table to overlap a flushable, save
    1377           1 :                                         // the flushable. This ingest must be ingested or flushed
    1378           1 :                                         // after it.
    1379           1 :                                         if mem == nil {
    1380           1 :                                                 mem = m
    1381           1 :                                         }
    1382           1 :                                         metaFlushableOverlaps[i] = true
    1383             :                                 }
    1384             :                         }
    1385           1 :                         for i := range loadResult.localMeta {
    1386           1 :                                 checkForOverlap(i, loadResult.localMeta[i])
    1387           1 :                         }
    1388           1 :                         for i := range loadResult.sharedMeta {
    1389           0 :                                 checkForOverlap(len(loadResult.localMeta)+i, loadResult.sharedMeta[i])
    1390           0 :                         }
    1391           1 :                         for i := range loadResult.externalMeta {
    1392           0 :                                 checkForOverlap(len(loadResult.localMeta)+len(loadResult.sharedMeta)+i, loadResult.externalMeta[i])
    1393           0 :                         }
    1394           1 :                         if exciseSpan.Valid() {
    1395           0 :                                 kr := internalKeyRange{
    1396           0 :                                         smallest: base.MakeInternalKey(exciseSpan.Start, InternalKeySeqNumMax, InternalKeyKindMax),
    1397           0 :                                         largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, exciseSpan.End),
    1398           0 :                                 }
    1399           0 :                                 if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
    1400           0 :                                         if mem == nil {
    1401           0 :                                                 mem = m
    1402           0 :                                         }
    1403             :                                 }
    1404             :                         }
    1405           1 :                         err := iter.Close()
    1406           1 :                         if rangeDelIter != nil {
    1407           1 :                                 err = firstError(err, rangeDelIter.Close())
    1408           1 :                         }
    1409           1 :                         if rkeyIter != nil {
    1410           1 :                                 err = firstError(err, rkeyIter.Close())
    1411           1 :                         }
    1412           1 :                         if err != nil {
    1413           0 :                                 d.opts.Logger.Errorf("ingest error reading flushable for log %s: %s", m.logNum, err)
    1414           0 :                         }
    1415             :                 }
    1416             : 
    1417           1 :                 if mem == nil {
    1418           1 :                         // No overlap with any of the queued flushables, so no need to queue
    1419           1 :                         // after them.
    1420           1 : 
    1421           1 :                         // New writes with higher sequence numbers may be concurrently
    1422           1 :                         // committed. We must ensure they don't flush before this ingest
    1423           1 :                         // completes. To do that, we ref the mutable memtable as a writer,
    1424           1 :                         // preventing its flushing (and the flushing of all subsequent
    1425           1 :                         // flushables in the queue). Once we've acquired the manifest lock
    1426           1 :                         // to add the ingested sstables to the LSM, we can unref as we're
    1427           1 :                         // guaranteed that the flush won't edit the LSM before this ingest.
    1428           1 :                         mut = d.mu.mem.mutable
    1429           1 :                         mut.writerRef()
    1430           1 :                         return
    1431           1 :                 }
    1432             :                 // The ingestion overlaps with some entry in the flushable queue.
    1433           1 :                 if d.FormatMajorVersion() < FormatFlushableIngest ||
    1434           1 :                         d.opts.Experimental.DisableIngestAsFlushable() ||
    1435           1 :                         len(shared) > 0 || exciseSpan.Valid() || len(external) > 0 ||
    1436           1 :                         (len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) {
    1437           1 :                         // We're not able to ingest as a flushable,
    1438           1 :                         // so we must synchronously flush.
    1439           1 :                         //
    1440           1 :                         // TODO(bilal): Currently, if any of the files being ingested are shared or
    1441           1 :                         // there's an excise span present, we cannot use flushable ingests and need
    1442           1 :                         // to wait synchronously. Either remove this caveat by fleshing out
    1443           1 :                         // flushable ingest logic to also account for these cases, or remove this
    1444           1 :                         // comment. Tracking issue: https://github.com/cockroachdb/pebble/issues/2676
    1445           1 :                         if mem.flushable == d.mu.mem.mutable {
    1446           1 :                                 err = d.makeRoomForWrite(nil)
    1447           1 :                         }
    1448             :                         // New writes with higher sequence numbers may be concurrently
    1449             :                         // committed. We must ensure they don't flush before this ingest
    1450             :                         // completes. To do that, we ref the mutable memtable as a writer,
    1451             :                         // preventing its flushing (and the flushing of all subsequent
    1452             :                         // flushables in the queue). Once we've acquired the manifest lock
    1453             :                         // to add the ingested sstables to the LSM, we can unref as we're
    1454             :                         // guaranteed that the flush won't edit the LSM before this ingest.
    1455           1 :                         mut = d.mu.mem.mutable
    1456           1 :                         mut.writerRef()
    1457           1 :                         mem.flushForced = true
    1458           1 :                         d.maybeScheduleFlush()
    1459           1 :                         return
    1460             :                 }
    1461             :                 // Since there aren't too many memtables already queued up, we can
    1462             :                 // slide the ingested sstables on top of the existing memtables.
    1463           1 :                 asFlushable = true
    1464           1 :                 err = d.handleIngestAsFlushable(loadResult.localMeta, seqNum)
    1465             :         }
    1466             : 
    1467           1 :         var ve *versionEdit
    1468           1 :         apply := func(seqNum uint64) {
    1469           1 :                 if err != nil || asFlushable {
    1470           1 :                         // Prepare failed, or the sstables were ingested as a flushable.
    1471           1 :                         if mut != nil {
    1472           0 :                                 if mut.writerUnref() {
    1473           0 :                                         d.mu.Lock()
    1474           0 :                                         d.maybeScheduleFlush()
    1475           0 :                                         d.mu.Unlock()
    1476           0 :                                 }
    1477             :                         }
    1478           1 :                         return
    1479             :                 }
    1480             : 
    1481             :                 // Update the sequence numbers for all ingested sstables'
    1482             :                 // metadata. When the version edit is applied, the metadata is
    1483             :                 // written to the manifest, persisting the sequence number.
    1484             :                 // The sstables themselves are left unmodified.
    1485           1 :                 if err = ingestUpdateSeqNum(
    1486           1 :                         d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult,
    1487           1 :                 ); err != nil {
    1488           0 :                         if mut != nil {
    1489           0 :                                 if mut.writerUnref() {
    1490           0 :                                         d.mu.Lock()
    1491           0 :                                         d.maybeScheduleFlush()
    1492           0 :                                         d.mu.Unlock()
    1493           0 :                                 }
    1494             :                         }
    1495           0 :                         return
    1496             :                 }
    1497             : 
    1498             :                 // If we overlapped with a memtable in prepare wait for the flush to
    1499             :                 // finish.
    1500           1 :                 if mem != nil {
    1501           1 :                         <-mem.flushed
    1502           1 :                 }
    1503             : 
    1504             :                 // Assign the sstables to the correct level in the LSM and apply the
    1505             :                 // version edit.
    1506           1 :                 ve, err = d.ingestApply(jobID, loadResult, targetLevelFunc, mut, exciseSpan)
    1507             :         }
    1508             : 
    1509             :         // Only one ingest can occur at a time because if not, one would block waiting
    1510             :         // for the other to finish applying. This blocking would happen while holding
    1511             :         // the commit mutex which would prevent unrelated batches from writing their
    1512             :         // changes to the WAL and memtable. This will cause a bigger commit hiccup
    1513             :         // during ingestion.
    1514           1 :         d.commit.ingestSem <- struct{}{}
    1515           1 :         d.commit.AllocateSeqNum(loadResult.fileCount, prepare, apply)
    1516           1 :         <-d.commit.ingestSem
    1517           1 : 
    1518           1 :         if err != nil {
    1519           0 :                 if err2 := ingestCleanup(d.objProvider, loadResult.localMeta); err2 != nil {
    1520           0 :                         d.opts.Logger.Errorf("ingest cleanup failed: %v", err2)
    1521           0 :                 }
    1522           1 :         } else {
    1523           1 :                 // Since we either created a hard link to the ingesting files, or copied
    1524           1 :                 // them over, it is safe to remove the original paths.
    1525           1 :                 for _, path := range loadResult.localPaths {
    1526           1 :                         if err2 := d.opts.FS.Remove(path); err2 != nil {
    1527           0 :                                 d.opts.Logger.Errorf("ingest failed to remove original file: %s", err2)
    1528           0 :                         }
    1529             :                 }
    1530             :         }
    1531             : 
    1532           1 :         info := TableIngestInfo{
    1533           1 :                 JobID:     jobID,
    1534           1 :                 Err:       err,
    1535           1 :                 flushable: asFlushable,
    1536           1 :         }
    1537           1 :         if len(loadResult.localMeta) > 0 {
    1538           1 :                 info.GlobalSeqNum = loadResult.localMeta[0].SmallestSeqNum
    1539           1 :         } else if len(loadResult.sharedMeta) > 0 {
    1540           0 :                 info.GlobalSeqNum = loadResult.sharedMeta[0].SmallestSeqNum
    1541           0 :         } else {
    1542           0 :                 info.GlobalSeqNum = loadResult.externalMeta[0].SmallestSeqNum
    1543           0 :         }
    1544           1 :         var stats IngestOperationStats
    1545           1 :         if ve != nil {
    1546           1 :                 info.Tables = make([]struct {
    1547           1 :                         TableInfo
    1548           1 :                         Level int
    1549           1 :                 }, len(ve.NewFiles))
    1550           1 :                 for i := range ve.NewFiles {
    1551           1 :                         e := &ve.NewFiles[i]
    1552           1 :                         info.Tables[i].Level = e.Level
    1553           1 :                         info.Tables[i].TableInfo = e.Meta.TableInfo()
    1554           1 :                         stats.Bytes += e.Meta.Size
    1555           1 :                         if e.Level == 0 {
    1556           1 :                                 stats.ApproxIngestedIntoL0Bytes += e.Meta.Size
    1557           1 :                         }
    1558           1 :                         if i < len(metaFlushableOverlaps) && metaFlushableOverlaps[i] {
    1559           1 :                                 stats.MemtableOverlappingFiles++
    1560           1 :                         }
    1561             :                 }
    1562           1 :         } else if asFlushable {
    1563           1 :                 // NB: If asFlushable == true, there are no shared or external sstables.
    1564           1 :                 info.Tables = make([]struct {
    1565           1 :                         TableInfo
    1566           1 :                         Level int
    1567           1 :                 }, len(loadResult.localMeta))
    1568           1 :                 for i, f := range loadResult.localMeta {
    1569           1 :                         info.Tables[i].Level = -1
    1570           1 :                         info.Tables[i].TableInfo = f.TableInfo()
    1571           1 :                         stats.Bytes += f.Size
    1572           1 :                         // We don't have exact stats on which files will be ingested into
    1573           1 :                         // L0, because actual ingestion into the LSM has been deferred until
    1574           1 :                         // flush time. Instead, we infer based on memtable overlap.
    1575           1 :                         //
    1576           1 :                         // TODO(jackson): If we optimistically compute data overlap (#2112)
    1577           1 :                         // before entering the commit pipeline, we can use that overlap to
    1578           1 :                         // improve our approximation by incorporating overlap with L0, not
    1579           1 :                         // just memtables.
    1580           1 :                         if metaFlushableOverlaps[i] {
    1581           1 :                                 stats.ApproxIngestedIntoL0Bytes += f.Size
    1582           1 :                                 stats.MemtableOverlappingFiles++
    1583           1 :                         }
    1584             :                 }
    1585             :         }
    1586           1 :         d.opts.EventListener.TableIngested(info)
    1587           1 : 
    1588           1 :         return stats, err
    1589             : }
    1590             : 
    1591             : // excise updates ve to include a replacement of the file m with new virtual
    1592             : // sstables that exclude exciseSpan, returning a slice of newly-created files if
    1593             : // any. If the entirety of m is deleted by exciseSpan, no new sstables are added
    1594             : // and m is deleted. Note that ve is updated in-place.
    1595             : //
    1596             : // The manifest lock must be held when calling this method.
    1597             : func (d *DB) excise(
    1598             :         exciseSpan KeyRange, m *fileMetadata, ve *versionEdit, level int,
    1599           1 : ) ([]manifest.NewFileEntry, error) {
    1600           1 :         numCreatedFiles := 0
    1601           1 :         // Check if there's actually an overlap between m and exciseSpan.
    1602           1 :         if !exciseSpan.Overlaps(d.cmp, m) {
    1603           0 :                 return nil, nil
    1604           0 :         }
    1605           1 :         ve.DeletedFiles[deletedFileEntry{
    1606           1 :                 Level:   level,
    1607           1 :                 FileNum: m.FileNum,
    1608           1 :         }] = m
    1609           1 :         // Fast path: m sits entirely within the exciseSpan, so just delete it.
    1610           1 :         if exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) {
    1611           0 :                 return nil, nil
    1612           0 :         }
    1613           1 :         var iter internalIterator
    1614           1 :         var rangeDelIter keyspan.FragmentIterator
    1615           1 :         var rangeKeyIter keyspan.FragmentIterator
    1616           1 :         needsBacking := false
    1617           1 :         // Create a file to the left of the excise span, if necessary.
    1618           1 :         // The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)].
    1619           1 :         //
    1620           1 :         // We create bounds that are tight on user keys, and we make the effort to find
    1621           1 :         // the last key in the original sstable that's smaller than exciseSpan.Start
    1622           1 :         // even though it requires some sstable reads. We could choose to create
    1623           1 :         // virtual sstables on loose userKey bounds, in which case we could just set
    1624           1 :         // leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest
    1625           1 :         // issue with that approach would be that it'd lead to lots of small virtual
    1626           1 :         // sstables in the LSM that have no guarantee on containing even a single user
    1627           1 :         // key within the file bounds. This has the potential to increase both read and
    1628           1 :         // write-amp as we will be opening up these sstables only to find no relevant
    1629           1 :         // keys in the read path, and compacting sstables on top of them instead of
    1630           1 :         // directly into the space occupied by them. We choose to incur the cost of
    1631           1 :         // calculating tight bounds at this time instead of creating more work in the
    1632           1 :         // future.
    1633           1 :         //
    1634           1 :         // TODO(bilal): Some of this work can happen without grabbing the manifest
    1635           1 :         // lock; we could grab one currentVersion, release the lock, calculate excised
    1636           1 :         // files, then grab the lock again and recalculate for just the files that
    1637           1 :         // have changed since our previous calculation. Do this optimization as part of
    1638           1 :         // https://github.com/cockroachdb/pebble/issues/2112 .
    1639           1 :         if d.cmp(m.Smallest.UserKey, exciseSpan.Start) < 0 {
    1640           1 :                 leftFile := &fileMetadata{
    1641           1 :                         Virtual:     true,
    1642           1 :                         FileBacking: m.FileBacking,
    1643           1 :                         FileNum:     d.mu.versions.getNextFileNum(),
    1644           1 :                         // Note that these are loose bounds for smallest/largest seqnums, but they're
    1645           1 :                         // sufficient for maintaining correctness.
    1646           1 :                         SmallestSeqNum: m.SmallestSeqNum,
    1647           1 :                         LargestSeqNum:  m.LargestSeqNum,
    1648           1 :                 }
    1649           1 :                 if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.SmallestPointKey) {
    1650           1 :                         // This file will contain point keys
    1651           1 :                         smallestPointKey := m.SmallestPointKey
    1652           1 :                         var err error
    1653           1 :                         iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{
    1654           1 :                                 CategoryAndQoS: sstable.CategoryAndQoS{
    1655           1 :                                         Category: "pebble-ingest",
    1656           1 :                                         QoSLevel: sstable.LatencySensitiveQoSLevel,
    1657           1 :                                 },
    1658           1 :                                 level: manifest.Level(level),
    1659           1 :                         }, internalIterOpts{})
    1660           1 :                         if err != nil {
    1661           0 :                                 return nil, err
    1662           0 :                         }
    1663           1 :                         var key *InternalKey
    1664           1 :                         if iter != nil {
    1665           1 :                                 defer iter.Close()
    1666           1 :                                 key, _ = iter.SeekLT(exciseSpan.Start, base.SeekLTFlagsNone)
    1667           1 :                         } else {
    1668           0 :                                 iter = emptyIter
    1669           0 :                         }
    1670           1 :                         if key != nil {
    1671           1 :                                 leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, key.Clone())
    1672           1 :                         }
    1673             :                         // Store the min of (exciseSpan.Start, rdel.End) in lastRangeDel. This
    1674             :                         // needs to be a copy if the key is owned by the range del iter.
    1675           1 :                         var lastRangeDel []byte
    1676           1 :                         if rangeDelIter != nil {
    1677           1 :                                 defer rangeDelIter.Close()
    1678           1 :                                 rdel := rangeDelIter.SeekLT(exciseSpan.Start)
    1679           1 :                                 if rdel != nil {
    1680           1 :                                         lastRangeDel = append(lastRangeDel[:0], rdel.End...)
    1681           1 :                                         if d.cmp(lastRangeDel, exciseSpan.Start) > 0 {
    1682           0 :                                                 lastRangeDel = exciseSpan.Start
    1683           0 :                                         }
    1684             :                                 }
    1685           1 :                         } else {
    1686           1 :                                 rangeDelIter = emptyKeyspanIter
    1687           1 :                         }
    1688           1 :                         if lastRangeDel != nil {
    1689           1 :                                 leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel))
    1690           1 :                         }
    1691             :                 }
    1692           1 :                 if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.SmallestRangeKey) {
    1693           1 :                         // This file will contain range keys
    1694           1 :                         var err error
    1695           1 :                         smallestRangeKey := m.SmallestRangeKey
    1696           1 :                         rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{})
    1697           1 :                         if err != nil {
    1698           0 :                                 return nil, err
    1699           0 :                         }
    1700             :                         // Store the min of (exciseSpan.Start, rkey.End) in lastRangeKey. This
    1701             :                         // needs to be a copy if the key is owned by the range key iter.
    1702           1 :                         var lastRangeKey []byte
    1703           1 :                         var lastRangeKeyKind InternalKeyKind
    1704           1 :                         defer rangeKeyIter.Close()
    1705           1 :                         rkey := rangeKeyIter.SeekLT(exciseSpan.Start)
    1706           1 :                         if rkey != nil {
    1707           1 :                                 lastRangeKey = append(lastRangeKey[:0], rkey.End...)
    1708           1 :                                 if d.cmp(lastRangeKey, exciseSpan.Start) > 0 {
    1709           0 :                                         lastRangeKey = exciseSpan.Start
    1710           0 :                                 }
    1711           1 :                                 lastRangeKeyKind = rkey.Keys[0].Kind()
    1712             :                         }
    1713           1 :                         if lastRangeKey != nil {
    1714           1 :                                 leftFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, base.MakeExclusiveSentinelKey(lastRangeKeyKind, lastRangeKey))
    1715           1 :                         }
    1716             :                 }
    1717           1 :                 if leftFile.HasRangeKeys || leftFile.HasPointKeys {
    1718           1 :                         var err error
    1719           1 :                         leftFile.Size, err = d.tableCache.estimateSize(m, leftFile.Smallest.UserKey, leftFile.Largest.UserKey)
    1720           1 :                         if err != nil {
    1721           0 :                                 return nil, err
    1722           0 :                         }
    1723           1 :                         if leftFile.Size == 0 {
    1724           1 :                                 // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size,
    1725           1 :                                 // such as if the excised file only has range keys/dels and no point
    1726           1 :                                 // keys. This can cause panics in places where we divide by file sizes.
    1727           1 :                                 // Correct for it here.
    1728           1 :                                 leftFile.Size = 1
    1729           1 :                         }
    1730           1 :                         if err := leftFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
    1731           0 :                                 return nil, err
    1732           0 :                         }
    1733           1 :                         leftFile.ValidateVirtual(m)
    1734           1 :                         ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: leftFile})
    1735           1 :                         needsBacking = true
    1736           1 :                         numCreatedFiles++
    1737             :                 }
    1738             :         }
    1739             :         // Create a file to the right, if necessary.
    1740           1 :         if exciseSpan.Contains(d.cmp, m.Largest) {
    1741           0 :                 // No key exists to the right of the excise span in this file.
    1742           0 :                 if needsBacking && !m.Virtual {
    1743           0 :                         // If m is virtual, then its file backing is already known to the manifest.
    1744           0 :                         // We don't need to create another file backing. Note that there must be
    1745           0 :                         // only one CreatedBackingTables entry per backing sstable. This is
    1746           0 :                         // indicated by the VersionEdit.CreatedBackingTables invariant.
    1747           0 :                         ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
    1748           0 :                 }
    1749           0 :                 return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil
    1750             :         }
    1751             :         // Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest].
    1752             :         //
    1753             :         // See comment before the definition of leftFile for the motivation behind
    1754             :         // calculating tight user-key bounds.
    1755           1 :         rightFile := &fileMetadata{
    1756           1 :                 Virtual:     true,
    1757           1 :                 FileBacking: m.FileBacking,
    1758           1 :                 FileNum:     d.mu.versions.getNextFileNum(),
    1759           1 :                 // Note that these are loose bounds for smallest/largest seqnums, but they're
    1760           1 :                 // sufficient for maintaining correctness.
    1761           1 :                 SmallestSeqNum: m.SmallestSeqNum,
    1762           1 :                 LargestSeqNum:  m.LargestSeqNum,
    1763           1 :         }
    1764           1 :         if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.LargestPointKey) {
    1765           1 :                 // This file will contain point keys
    1766           1 :                 largestPointKey := m.LargestPointKey
    1767           1 :                 var err error
    1768           1 :                 if iter == nil && rangeDelIter == nil {
    1769           0 :                         iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{
    1770           0 :                                 CategoryAndQoS: sstable.CategoryAndQoS{
    1771           0 :                                         Category: "pebble-ingest",
    1772           0 :                                         QoSLevel: sstable.LatencySensitiveQoSLevel,
    1773           0 :                                 },
    1774           0 :                                 level: manifest.Level(level),
    1775           0 :                         }, internalIterOpts{})
    1776           0 :                         if err != nil {
    1777           0 :                                 return nil, err
    1778           0 :                         }
    1779           0 :                         if iter != nil {
    1780           0 :                                 defer iter.Close()
    1781           0 :                         } else {
    1782           0 :                                 iter = emptyIter
    1783           0 :                         }
    1784           0 :                         if rangeDelIter != nil {
    1785           0 :                                 defer rangeDelIter.Close()
    1786           0 :                         } else {
    1787           0 :                                 rangeDelIter = emptyKeyspanIter
    1788           0 :                         }
    1789             :                 }
    1790           1 :                 key, _ := iter.SeekGE(exciseSpan.End, base.SeekGEFlagsNone)
    1791           1 :                 if key != nil {
    1792           1 :                         rightFile.ExtendPointKeyBounds(d.cmp, key.Clone(), largestPointKey)
    1793           1 :                 }
    1794             :                 // Store the max of (exciseSpan.End, rdel.Start) in firstRangeDel. This
    1795             :                 // needs to be a copy if the key is owned by the range del iter.
    1796           1 :                 var firstRangeDel []byte
    1797           1 :                 rdel := rangeDelIter.SeekGE(exciseSpan.End)
    1798           1 :                 if rdel != nil {
    1799           1 :                         firstRangeDel = append(firstRangeDel[:0], rdel.Start...)
    1800           1 :                         if d.cmp(firstRangeDel, exciseSpan.End) < 0 {
    1801           0 :                                 firstRangeDel = exciseSpan.End
    1802           0 :                         }
    1803             :                 }
    1804           1 :                 if firstRangeDel != nil {
    1805           1 :                         smallestPointKey := rdel.SmallestKey()
    1806           1 :                         smallestPointKey.UserKey = firstRangeDel
    1807           1 :                         rightFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, largestPointKey)
    1808           1 :                 }
    1809             :         }
    1810           1 :         if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.LargestRangeKey) {
    1811           1 :                 // This file will contain range keys.
    1812           1 :                 largestRangeKey := m.LargestRangeKey
    1813           1 :                 if rangeKeyIter == nil {
    1814           0 :                         var err error
    1815           0 :                         rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{})
    1816           0 :                         if err != nil {
    1817           0 :                                 return nil, err
    1818           0 :                         }
    1819           0 :                         defer rangeKeyIter.Close()
    1820             :                 }
    1821             :                 // Store the max of (exciseSpan.End, rkey.Start) in firstRangeKey. This
    1822             :                 // needs to be a copy if the key is owned by the range key iter.
    1823           1 :                 var firstRangeKey []byte
    1824           1 :                 rkey := rangeKeyIter.SeekGE(exciseSpan.End)
    1825           1 :                 if rkey != nil {
    1826           1 :                         firstRangeKey = append(firstRangeKey[:0], rkey.Start...)
    1827           1 :                         if d.cmp(firstRangeKey, exciseSpan.End) < 0 {
    1828           0 :                                 firstRangeKey = exciseSpan.End
    1829           0 :                         }
    1830             :                 }
    1831           1 :                 if firstRangeKey != nil {
    1832           1 :                         smallestRangeKey := rkey.SmallestKey()
    1833           1 :                         smallestRangeKey.UserKey = firstRangeKey
    1834           1 :                         // We call ExtendRangeKeyBounds so any internal boundType fields are
    1835           1 :                         // set correctly. Note that this is mildly wasteful as we'll be comparing
    1836           1 :                         // rightFile.{Smallest,Largest}RangeKey with themselves, which can be
    1837           1 :                         // avoided if we exported ExtendOverallKeyBounds or so.
    1838           1 :                         rightFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, largestRangeKey)
    1839           1 :                 }
    1840             :         }
    1841           1 :         if rightFile.HasRangeKeys || rightFile.HasPointKeys {
    1842           1 :                 var err error
    1843           1 :                 rightFile.Size, err = d.tableCache.estimateSize(m, rightFile.Smallest.UserKey, rightFile.Largest.UserKey)
    1844           1 :                 if err != nil {
    1845           0 :                         return nil, err
    1846           0 :                 }
    1847           1 :                 if rightFile.Size == 0 {
    1848           1 :                         // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size,
    1849           1 :                         // such as if the excised file only has range keys/dels and no point keys.
    1850           1 :                         // This can cause panics in places where we divide by file sizes. Correct
    1851           1 :                         // for it here.
    1852           1 :                         rightFile.Size = 1
    1853           1 :                 }
    1854           1 :                 rightFile.ValidateVirtual(m)
    1855           1 :                 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: rightFile})
    1856           1 :                 needsBacking = true
    1857           1 :                 numCreatedFiles++
    1858             :         }
    1859             : 
    1860           1 :         if needsBacking && !m.Virtual {
    1861           1 :                 // If m is virtual, then its file backing is already known to the manifest.
    1862           1 :                 // We don't need to create another file backing. Note that there must be
    1863           1 :                 // only one CreatedBackingTables entry per backing sstable. This is
    1864           1 :                 // indicated by the VersionEdit.CreatedBackingTables invariant.
    1865           1 :                 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
    1866           1 :         }
    1867             : 
    1868           1 :         if err := rightFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
    1869           0 :                 return nil, err
    1870           0 :         }
    1871           1 :         return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil
    1872             : }
    1873             : 
    1874             : type ingestTargetLevelFunc func(
    1875             :         newIters tableNewIters,
    1876             :         newRangeKeyIter keyspan.TableNewSpanIter,
    1877             :         iterOps IterOptions,
    1878             :         comparer *Comparer,
    1879             :         v *version,
    1880             :         baseLevel int,
    1881             :         compactions map[*compaction]struct{},
    1882             :         meta *fileMetadata,
    1883             :         suggestSplit bool,
    1884             : ) (int, *fileMetadata, error)
    1885             : 
    1886             : type ingestSplitFile struct {
    1887             :         // ingestFile is the file being ingested.
    1888             :         ingestFile *fileMetadata
    1889             :         // splitFile is the file that needs to be split to allow ingestFile to slot
    1890             :         // into `level` level.
    1891             :         splitFile *fileMetadata
    1892             :         // The level where ingestFile will go (and where splitFile already is).
    1893             :         level int
    1894             : }
    1895             : 
    1896             : // ingestSplit splits files specified in `files` and updates ve in-place to
    1897             : // account for existing files getting split into two virtual sstables. The map
    1898             : // `replacedFiles` contains an in-progress map of all files that have been
    1899             : // replaced with new virtual sstables in this version edit so far, which is also
    1900             : // updated in-place.
    1901             : //
    1902             : // d.mu as well as the manifest lock must be held when calling this method.
    1903             : func (d *DB) ingestSplit(
    1904             :         ve *versionEdit,
    1905             :         updateMetrics func(*fileMetadata, int, []newFileEntry),
    1906             :         files []ingestSplitFile,
    1907             :         replacedFiles map[base.FileNum][]newFileEntry,
    1908           1 : ) error {
    1909           1 :         for _, s := range files {
    1910           1 :                 // replacedFiles can be thought of as a tree, where we start iterating with
    1911           1 :                 // s.splitFile and run its fileNum through replacedFiles, then find which of
    1912           1 :                 // the replaced files overlaps with s.ingestFile, which becomes the new
    1913           1 :                 // splitFile, then we check splitFile's replacements in replacedFiles again
    1914           1 :                 // for overlap with s.ingestFile, and so on until we either can't find the
    1915           1 :                 // current splitFile in replacedFiles (i.e. that's the file that now needs to
    1916           1 :                 // be split), or we don't find a file that overlaps with s.ingestFile, which
    1917           1 :                 // means a prior ingest split already produced enough room for s.ingestFile
    1918           1 :                 // to go into this level without necessitating another ingest split.
    1919           1 :                 splitFile := s.splitFile
    1920           1 :                 for splitFile != nil {
    1921           1 :                         replaced, ok := replacedFiles[splitFile.FileNum]
    1922           1 :                         if !ok {
    1923           1 :                                 break
    1924             :                         }
    1925           0 :                         updatedSplitFile := false
    1926           0 :                         for i := range replaced {
    1927           0 :                                 if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) {
    1928           0 :                                         if updatedSplitFile {
    1929           0 :                                                 // This should never happen because the earlier ingestTargetLevel
    1930           0 :                                                 // function only finds split file candidates that are guaranteed to
    1931           0 :                                                 // have no data overlap, only boundary overlap. See the comments
    1932           0 :                                                 // in that method to see the definitions of data vs boundary
    1933           0 :                                                 // overlap. That, plus the fact that files in `replaced` are
    1934           0 :                                                 // guaranteed to have file bounds that are tight on user keys
    1935           0 :                                                 // (as that's what `d.excise` produces), means that the only case
    1936           0 :                                                 // where we overlap with two or more files in `replaced` is if we
    1937           0 :                                                 // actually had data overlap all along, or if the ingestion files
    1938           0 :                                                 // were overlapping, either of which is an invariant violation.
    1939           0 :                                                 panic("updated with two files in ingestSplit")
    1940             :                                         }
    1941           0 :                                         splitFile = replaced[i].Meta
    1942           0 :                                         updatedSplitFile = true
    1943             :                                 }
    1944             :                         }
    1945           0 :                         if !updatedSplitFile {
    1946           0 :                                 // None of the replaced files overlapped with the file being ingested.
    1947           0 :                                 // This can happen if we've already excised a span overlapping with
    1948           0 :                                 // this file, or if we have consecutive ingested files that can slide
    1949           0 :                                 // within the same gap between keys in an existing file. For instance,
    1950           0 :                                 // if an existing file has keys a and g and we're ingesting b-c, d-e,
    1951           0 :                                 // the first loop iteration will split the existing file into one that
    1952           0 :                                 // ends in a and another that starts at g, and the second iteration will
    1953           0 :                                 // fall into this case and require no splitting.
    1954           0 :                                 //
    1955           0 :                                 // No splitting necessary.
    1956           0 :                                 splitFile = nil
    1957           0 :                         }
    1958             :                 }
    1959           1 :                 if splitFile == nil {
    1960           0 :                         continue
    1961             :                 }
    1962             :                 // NB: excise operates on [start, end). We're splitting at [start, end]
    1963             :                 // (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation
    1964             :                 // of exclusive vs inclusive end bounds should not make a difference here
    1965             :                 // as we're guaranteed to not have any data overlap between splitFile and
    1966             :                 // s.ingestFile, so panic if we do see a newly added file with an endKey
    1967             :                 // equalling s.ingestFile.Largest, and !s.ingestFile.Largest.IsExclusiveSentinel().
    1968           1 :                 added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level)
    1969           1 :                 if err != nil {
    1970           0 :                         return err
    1971           0 :                 }
    1972           1 :                 if _, ok := ve.DeletedFiles[deletedFileEntry{
    1973           1 :                         Level:   s.level,
    1974           1 :                         FileNum: splitFile.FileNum,
    1975           1 :                 }]; !ok {
    1976           0 :                         panic("did not split file that was expected to be split")
    1977             :                 }
    1978           1 :                 replacedFiles[splitFile.FileNum] = added
    1979           1 :                 for i := range added {
    1980           1 :                         if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) {
    1981           0 :                                 panic("ingest-time split produced a file that overlaps with ingested file")
    1982             :                         }
    1983             :                 }
    1984           1 :                 updateMetrics(splitFile, s.level, added)
    1985             :         }
    1986             :         // Flatten the version edit by removing any entries from ve.NewFiles that
    1987             :         // are also in ve.DeletedFiles.
    1988           1 :         newNewFiles := ve.NewFiles[:0]
    1989           1 :         for i := range ve.NewFiles {
    1990           1 :                 fn := ve.NewFiles[i].Meta.FileNum
    1991           1 :                 deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn}
    1992           1 :                 if _, ok := ve.DeletedFiles[deEntry]; ok {
    1993           0 :                         delete(ve.DeletedFiles, deEntry)
    1994           1 :                 } else {
    1995           1 :                         newNewFiles = append(newNewFiles, ve.NewFiles[i])
    1996           1 :                 }
    1997             :         }
    1998           1 :         ve.NewFiles = newNewFiles
    1999           1 :         return nil
    2000             : }
    2001             : 
    2002             : func (d *DB) ingestApply(
    2003             :         jobID int,
    2004             :         lr ingestLoadResult,
    2005             :         findTargetLevel ingestTargetLevelFunc,
    2006             :         mut *memTable,
    2007             :         exciseSpan KeyRange,
    2008           1 : ) (*versionEdit, error) {
    2009           1 :         d.mu.Lock()
    2010           1 :         defer d.mu.Unlock()
    2011           1 : 
    2012           1 :         ve := &versionEdit{
    2013           1 :                 NewFiles: make([]newFileEntry, lr.fileCount),
    2014           1 :         }
    2015           1 :         if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) {
    2016           1 :                 ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{}
    2017           1 :         }
    2018           1 :         metrics := make(map[int]*LevelMetrics)
    2019           1 : 
    2020           1 :         // Lock the manifest for writing before we use the current version to
    2021           1 :         // determine the target level. This prevents two concurrent ingestion jobs
    2022           1 :         // from using the same version to determine the target level, and also
    2023           1 :         // provides serialization with concurrent compaction and flush jobs.
    2024           1 :         // logAndApply unconditionally releases the manifest lock, but any earlier
    2025           1 :         // returns must unlock the manifest.
    2026           1 :         d.mu.versions.logLock()
    2027           1 : 
    2028           1 :         if mut != nil {
    2029           1 :                 // Unref the mutable memtable to allow its flush to proceed. Now that we've
    2030           1 :                 // acquired the manifest lock, we can be certain that if the mutable
    2031           1 :                 // memtable has received more recent conflicting writes, the flush won't
    2032           1 :                 // beat us to applying to the manifest resulting in sequence number
    2033           1 :                 // inversion. Even though we call maybeScheduleFlush right now, this flush
    2034           1 :                 // will apply after our ingestion.
    2035           1 :                 if mut.writerUnref() {
    2036           0 :                         d.maybeScheduleFlush()
    2037           0 :                 }
    2038             :         }
    2039             : 
    2040           1 :         shouldIngestSplit := d.opts.Experimental.IngestSplit != nil &&
    2041           1 :                 d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables
    2042           1 :         current := d.mu.versions.currentVersion()
    2043           1 :         baseLevel := d.mu.versions.picker.getBaseLevel()
    2044           1 :         iterOps := IterOptions{logger: d.opts.Logger}
    2045           1 :         // filesToSplit is a list where each element is a pair consisting of a file
    2046           1 :         // being ingested and a file being split to make room for an ingestion into
    2047           1 :         // that level. Each ingested file will appear at most once in this list. It
    2048           1 :         // is possible for split files to appear twice in this list.
    2049           1 :         filesToSplit := make([]ingestSplitFile, 0)
    2050           1 :         checkCompactions := false
    2051           1 :         for i := 0; i < lr.fileCount; i++ {
    2052           1 :                 // Determine the lowest level in the LSM for which the sstable doesn't
    2053           1 :                 // overlap any existing files in the level.
    2054           1 :                 var m *fileMetadata
    2055           1 :                 sharedIdx := -1
    2056           1 :                 sharedLevel := -1
    2057           1 :                 externalFile := false
    2058           1 :                 if i < len(lr.localMeta) {
    2059           1 :                         // local file.
    2060           1 :                         m = lr.localMeta[i]
    2061           1 :                 } else if (i - len(lr.localMeta)) < len(lr.sharedMeta) {
    2062           0 :                         // shared file.
    2063           0 :                         sharedIdx = i - len(lr.localMeta)
    2064           0 :                         m = lr.sharedMeta[sharedIdx]
    2065           0 :                         sharedLevel = int(lr.sharedLevels[sharedIdx])
    2066           0 :                 } else {
    2067           0 :                         // external file.
    2068           0 :                         externalFile = true
    2069           0 :                         m = lr.externalMeta[i-(len(lr.localMeta)+len(lr.sharedMeta))]
    2070           0 :                 }
    2071           1 :                 f := &ve.NewFiles[i]
    2072           1 :                 var err error
    2073           1 :                 if sharedIdx >= 0 {
    2074           0 :                         f.Level = sharedLevel
    2075           0 :                         if f.Level < sharedLevelsStart {
    2076           0 :                                 panic("cannot slot a shared file higher than the highest shared level")
    2077             :                         }
    2078           0 :                         ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
    2079           1 :                 } else {
    2080           1 :                         if externalFile {
    2081           0 :                                 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
    2082           0 :                         }
    2083           1 :                         var splitFile *fileMetadata
    2084           1 :                         if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) {
    2085           0 :                                 // This file fits perfectly within the excise span. We can slot it at
    2086           0 :                                 // L6, or sharedLevelsStart - 1 if we have shared files.
    2087           0 :                                 if len(lr.sharedMeta) > 0 {
    2088           0 :                                         f.Level = sharedLevelsStart - 1
    2089           0 :                                         if baseLevel > f.Level {
    2090           0 :                                                 f.Level = 0
    2091           0 :                                         }
    2092           0 :                                 } else {
    2093           0 :                                         f.Level = 6
    2094           0 :                                 }
    2095           1 :                         } else {
    2096           1 :                                 // TODO(bilal): findTargetLevel does disk IO (reading files for data
    2097           1 :                                 // overlap) even though we're holding onto d.mu. Consider unlocking
    2098           1 :                                 // d.mu while we do this. We already hold versions.logLock so we should
    2099           1 :                                 // not see any version applications while we're at this. The one
    2100           1 :                                 // complication here would be pulling out the mu.compact.inProgress
    2101           1 :                                 // check from findTargetLevel, as that requires d.mu to be held.
    2102           1 :                                 f.Level, splitFile, err = findTargetLevel(
    2103           1 :                                         d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit)
    2104           1 :                         }
    2105             : 
    2106           1 :                         if splitFile != nil {
    2107           1 :                                 if invariants.Enabled {
    2108           1 :                                         if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil {
    2109           0 :                                                 panic("splitFile returned is not in level it should be")
    2110             :                                         }
    2111             :                                 }
    2112             :                                 // We take advantage of the fact that we won't drop the db mutex
    2113             :                                 // between now and the call to logAndApply. So, no files should
    2114             :                                 // get added to a new in-progress compaction at this point. We can
    2115             :                                 // avoid having to iterate on in-progress compactions to cancel them
    2116             :                                 // if none of the files being split have a compacting state.
    2117           1 :                                 if splitFile.IsCompacting() {
    2118           0 :                                         checkCompactions = true
    2119           0 :                                 }
    2120           1 :                                 filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level})
    2121             :                         }
    2122             :                 }
    2123           1 :                 if err != nil {
    2124           0 :                         d.mu.versions.logUnlock()
    2125           0 :                         return nil, err
    2126           0 :                 }
    2127           1 :                 f.Meta = m
    2128           1 :                 levelMetrics := metrics[f.Level]
    2129           1 :                 if levelMetrics == nil {
    2130           1 :                         levelMetrics = &LevelMetrics{}
    2131           1 :                         metrics[f.Level] = levelMetrics
    2132           1 :                 }
    2133           1 :                 levelMetrics.NumFiles++
    2134           1 :                 levelMetrics.Size += int64(m.Size)
    2135           1 :                 levelMetrics.BytesIngested += m.Size
    2136           1 :                 levelMetrics.TablesIngested++
    2137             :         }
    2138             :         // replacedFiles maps files excised due to exciseSpan (or splitFiles returned
    2139             :         // by ingestTargetLevel) to the files that were created to replace them. This map
    2140             :         // is used to resolve references to split files in filesToSplit, as it is
    2141             :         // possible for a file that we want to split to no longer exist or have a
    2142             :         // newer fileMetadata due to a split induced by another ingestion file, or an
    2143             :         // excise.
    2144           1 :         replacedFiles := make(map[base.FileNum][]newFileEntry)
    2145           1 :         updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) {
    2146           1 :                 levelMetrics := metrics[level]
    2147           1 :                 if levelMetrics == nil {
    2148           0 :                         levelMetrics = &LevelMetrics{}
    2149           0 :                         metrics[level] = levelMetrics
    2150           0 :                 }
    2151           1 :                 levelMetrics.NumFiles--
    2152           1 :                 levelMetrics.Size -= int64(m.Size)
    2153           1 :                 for i := range added {
    2154           1 :                         levelMetrics.NumFiles++
    2155           1 :                         levelMetrics.Size += int64(added[i].Meta.Size)
    2156           1 :                 }
    2157             :         }
    2158           1 :         if exciseSpan.Valid() {
    2159           0 :                 // Iterate through all levels and find files that intersect with exciseSpan.
    2160           0 :                 //
    2161           0 :                 // TODO(bilal): We could drop the DB mutex here as we don't need it for
    2162           0 :                 // excises; we only need to hold the version lock which we already are
    2163           0 :                 // holding. However releasing the DB mutex could mess with the
    2164           0 :                 // ingestTargetLevel calculation that happened above, as it assumed that it
    2165           0 :                 // had a complete view of in-progress compactions that wouldn't change
    2166           0 :                 // until logAndApply is called. If we were to drop the mutex now, we could
    2167           0 :                 // schedule another in-progress compaction that would go into the chosen target
    2168           0 :                 // level and lead to file overlap within level (which would panic in
    2169           0 :                 // logAndApply). We should drop the db mutex here, do the excise, then
    2170           0 :                 // re-grab the DB mutex and rerun just the in-progress compaction check to
    2171           0 :                 // see if any new compactions are conflicting with our chosen target levels
    2172           0 :                 // for files, and if they are, we should signal those compactions to error
    2173           0 :                 // out.
    2174           0 :                 for level := range current.Levels {
    2175           0 :                         overlaps := current.Overlaps(level, d.cmp, exciseSpan.Start, exciseSpan.End, true /* exclusiveEnd */)
    2176           0 :                         iter := overlaps.Iter()
    2177           0 : 
    2178           0 :                         for m := iter.First(); m != nil; m = iter.Next() {
    2179           0 :                                 newFiles, err := d.excise(exciseSpan, m, ve, level)
    2180           0 :                                 if err != nil {
    2181           0 :                                         return nil, err
    2182           0 :                                 }
    2183             : 
    2184           0 :                                 if _, ok := ve.DeletedFiles[deletedFileEntry{
    2185           0 :                                         Level:   level,
    2186           0 :                                         FileNum: m.FileNum,
    2187           0 :                                 }]; !ok {
    2188           0 :                                         // We did not excise this file.
    2189           0 :                                         continue
    2190             :                                 }
    2191           0 :                                 replacedFiles[m.FileNum] = newFiles
    2192           0 :                                 updateLevelMetricsOnExcise(m, level, newFiles)
    2193             :                         }
    2194             :                 }
    2195             :         }
    2196           1 :         if len(filesToSplit) > 0 {
    2197           1 :                 // For the same reasons as the above call to excise, we hold the db mutex
    2198           1 :                 // while calling this method.
    2199           1 :                 if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil {
    2200           0 :                         return nil, err
    2201           0 :                 }
    2202             :         }
    2203           1 :         if len(filesToSplit) > 0 || exciseSpan.Valid() {
    2204           1 :                 for c := range d.mu.compact.inProgress {
    2205           1 :                         if c.versionEditApplied {
    2206           0 :                                 continue
    2207             :                         }
    2208             :                         // Check if this compaction overlaps with the excise span. Note that just
    2209             :                         // checking if the inputs individually overlap with the excise span
    2210             :                         // isn't sufficient; for instance, a compaction could have [a,b] and [e,f]
    2211             :                         // as inputs and write it all out as [a,b,e,f] in one sstable. If we're
    2212             :                         // doing a [c,d) excise at the same time as this compaction, we will have
    2213             :                         // to error out the whole compaction as we can't guarantee it hasn't/won't
    2214             :                         // write a file overlapping with the excise span.
    2215           1 :                         if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) {
    2216           0 :                                 c.cancel.Store(true)
    2217           0 :                         }
    2218             :                         // Check if this compaction's inputs have been replaced due to an
    2219             :                         // ingest-time split. In that case, cancel the compaction as a newly picked
    2220             :                         // compaction would need to include any new files that slid in between
    2221             :                         // previously-existing files. Note that we cancel any compaction that has a
    2222             :                         // file that was ingest-split as an input, even if it started before this
    2223             :                         // ingestion.
    2224           1 :                         if checkCompactions {
    2225           0 :                                 for i := range c.inputs {
    2226           0 :                                         iter := c.inputs[i].files.Iter()
    2227           0 :                                         for f := iter.First(); f != nil; f = iter.Next() {
    2228           0 :                                                 if _, ok := replacedFiles[f.FileNum]; ok {
    2229           0 :                                                         c.cancel.Store(true)
    2230           0 :                                                         break
    2231             :                                                 }
    2232             :                                         }
    2233             :                                 }
    2234             :                         }
    2235             :                 }
    2236             :                 // Check for any EventuallyFileOnlySnapshots that could be watching for
    2237             :                 // an excise on this span.
    2238           1 :                 if exciseSpan.Valid() {
    2239           0 :                         for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next {
    2240           0 :                                 if s.efos == nil {
    2241           0 :                                         continue
    2242             :                                 }
    2243           0 :                                 efos := s.efos
    2244           0 :                                 // TODO(bilal): We can make this faster by taking advantage of the sorted
    2245           0 :                                 // nature of protectedRanges to do a sort.Search, or even maintaining a
    2246           0 :                                 // global list of all protected ranges instead of having to peer into every
    2247           0 :                                 // snapshot.
    2248           0 :                                 for i := range efos.protectedRanges {
    2249           0 :                                         if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) {
    2250           0 :                                                 efos.excised.Store(true)
    2251           0 :                                                 break
    2252             :                                         }
    2253             :                                 }
    2254             :                         }
    2255             :                 }
    2256             :         }
    2257           1 :         if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo {
    2258           1 :                 return d.getInProgressCompactionInfoLocked(nil)
    2259           1 :         }); err != nil {
    2260           0 :                 return nil, err
    2261           0 :         }
    2262             : 
    2263           1 :         d.mu.versions.metrics.Ingest.Count++
    2264           1 : 
    2265           1 :         d.updateReadStateLocked(d.opts.DebugCheck)
    2266           1 :         // updateReadStateLocked could have generated obsolete tables, schedule a
    2267           1 :         // cleanup job if necessary.
    2268           1 :         d.deleteObsoleteFiles(jobID)
    2269           1 :         d.updateTableStatsLocked(ve.NewFiles)
    2270           1 :         // The ingestion may have pushed a level over the threshold for compaction,
    2271           1 :         // so check to see if one is necessary and schedule it.
    2272           1 :         d.maybeScheduleCompaction()
    2273           1 :         var toValidate []manifest.NewFileEntry
    2274           1 :         dedup := make(map[base.DiskFileNum]struct{})
    2275           1 :         for _, entry := range ve.NewFiles {
    2276           1 :                 if _, ok := dedup[entry.Meta.FileBacking.DiskFileNum]; !ok {
    2277           1 :                         toValidate = append(toValidate, entry)
    2278           1 :                         dedup[entry.Meta.FileBacking.DiskFileNum] = struct{}{}
    2279           1 :                 }
    2280             :         }
    2281           1 :         d.maybeValidateSSTablesLocked(toValidate)
    2282           1 :         return ve, nil
    2283             : }
    2284             : 
    2285             : // maybeValidateSSTablesLocked is a no-op unless Experimental.ValidateOnIngest is
    2286             : // set. Otherwise it appends newFiles to the pending validation queue and starts
    2287             : // a validateSSTables goroutine if shouldValidateSSTablesLocked reports true.
    2288             : //
    2289             : // Entries are not deduplicated here: if two entries with the same backing file
    2290             : // are added, the block checksums for that backing file are validated twice.
    2291             : // DB.mu must be locked when calling.
    2292           1 : func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) {
    2293           1 :         // Validation on ingest is opt-in; leave the queue untouched when it is off.
    2294           1 :         if !d.opts.Experimental.ValidateOnIngest {
    2295           1 :                 return
    2296           1 :         }
    2297             : 
    2298           1 :         d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...)
    2299           1 :         if d.shouldValidateSSTablesLocked() {
    2300           1 :                 go d.validateSSTables()
    2301           1 :         }
    2302             : }
    2303             : 
    2304             : // shouldValidateSSTablesLocked reports whether a new round of SSTable validation
    2305             : // should be started; all four conditions below must hold. DB.mu must be locked.
    2306           1 : func (d *DB) shouldValidateSSTablesLocked() bool {
    2307           1 :         return !d.mu.tableValidation.validating && // no round currently marked as running
    2308           1 :                 d.closed.Load() == nil && // d.closed holds nothing (presumably: DB still open)
    2309           1 :                 d.opts.Experimental.ValidateOnIngest && // feature is enabled
    2310           1 :                 len(d.mu.tableValidation.pending) > 0 // there is something to validate
    2311           1 : }
    2312             : 
    2313             : // validateSSTables runs one round of block-checksum validation over the tables in
    2314             : // the pending queue. It takes DB.mu itself and releases it while performing IO.
    2315           1 : func (d *DB) validateSSTables() {
    2316           1 :         d.mu.Lock()
    2317           1 :         if !d.shouldValidateSSTablesLocked() {
    2318           1 :                 d.mu.Unlock()
    2319           1 :                 return
    2320           1 :         }
    2321             : 
    2322           1 :         pending := d.mu.tableValidation.pending // take ownership of the current queue
    2323           1 :         d.mu.tableValidation.pending = nil
    2324           1 :         d.mu.tableValidation.validating = true // makes shouldValidateSSTablesLocked return false until cleared
    2325           1 :         jobID := d.mu.nextJobID
    2326           1 :         d.mu.nextJobID++
    2327           1 :         rs := d.loadReadState() // used for the Contains checks below; unref'd after the loop
    2328           1 : 
    2329           1 :         // Drop DB.mu before performing IO.
    2330           1 :         d.mu.Unlock()
    2331           1 : 
    2332           1 :         // Validate all tables in the pending queue. This could lead to a situation
    2333           1 :         // where we are starving IO from other tasks due to having to page through
    2334           1 :         // all the blocks in all the sstables in the queue.
    2335           1 :         // TODO(travers): Add some form of pacing to avoid IO starvation.
    2336           1 : 
    2337           1 :         // If we fail to validate any files due to reasons other than uncovered
    2338           1 :         // corruption, accumulate them and re-queue them for another attempt.
    2339           1 :         var retry []manifest.NewFileEntry
    2340           1 : 
    2341           1 :         for _, f := range pending {
    2342           1 :                 // The file may have been moved or deleted since it was ingested, in
    2343           1 :                 // which case we skip.
    2344           1 :                 if !rs.current.Contains(f.Level, d.cmp, f.Meta) {
    2345           1 :                         // Assume the file was moved to a lower level. It is rare enough
    2346           1 :                         // that a table is moved or deleted between the time it was ingested
    2347           1 :                         // and the time the validation routine runs that the overall cost of
    2348           1 :                         // this inner loop is tolerably low, when amortized over all
    2349           1 :                         // ingested tables.
    2350           1 :                         found := false
    2351           1 :                         for i := f.Level + 1; i < numLevels; i++ {
    2352           1 :                                 if rs.current.Contains(i, d.cmp, f.Meta) {
    2353           1 :                                         found = true
    2354           1 :                                         break
    2355             :                                 }
    2356             :                         }
    2357           1 :                         if !found {
    2358           1 :                                 continue // not at f.Level nor any higher-numbered level: skip it
    2359             :                         }
    2360             :                 }
    2361             : 
    2362           1 :                 var err error
    2363           1 :                 if f.Meta.Virtual {
    2364           1 :                         err = d.tableCache.withVirtualReader(
    2365           1 :                                 f.Meta.VirtualMeta(), func(v sstable.VirtualReader) error {
    2366           1 :                                         return v.ValidateBlockChecksumsOnBacking()
    2367           1 :                                 })
    2368           1 :                 } else {
    2369           1 :                         err = d.tableCache.withReader(
    2370           1 :                                 f.Meta.PhysicalMeta(), func(r *sstable.Reader) error {
    2371           1 :                                         return r.ValidateBlockChecksums()
    2372           1 :                                 })
    2373             :                 }
    2374             : 
    2375           1 :                 if err != nil {
    2376           0 :                         if IsCorruptionError(err) {
    2377           0 :                                 // TODO(travers): Hook into the corruption reporting pipeline, once
    2378           0 :                                 // available. See pebble#1192.
    2379           0 :                                 d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err) // NOTE(review): if Fatalf ever returns, control reaches TableValidated below — confirm it cannot
    2380           0 :                         } else {
    2381           0 :                                 // If there was some other, possibly transient, error that
    2382           0 :                                 // caused table validation to fail inform the EventListener and
    2383           0 :                                 // move on. We remember the table so that we can retry it in a
    2384           0 :                                 // subsequent table validation job.
    2385           0 :                                 //
    2386           0 :                                 // TODO(jackson): If the error is not transient, this will retry
    2387           0 :                                 // validation indefinitely. While not great, it's the same
    2388           0 :                                 // behavior as erroring flushes and compactions. We should
    2389           0 :                                 // address this as a part of #270.
    2390           0 :                                 d.opts.EventListener.BackgroundError(err)
    2391           0 :                                 retry = append(retry, f)
    2392           0 :                                 continue
    2393             :                         }
    2394             :                 }
    2395             : 
    2396           1 :                 d.opts.EventListener.TableValidated(TableValidatedInfo{
    2397           1 :                         JobID: jobID,
    2398           1 :                         Meta:  f.Meta,
    2399           1 :                 })
    2400             :         }
    2401           1 :         rs.unref()
    2402           1 :         d.mu.Lock()
    2403           1 :         defer d.mu.Unlock()
    2404           1 :         d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, retry...) // re-queue files that failed with a non-corruption error
    2405           1 :         d.mu.tableValidation.validating = false
    2406           1 :         d.mu.tableValidation.cond.Broadcast() // notify anything waiting on tableValidation.cond that this round finished
    2407           1 :         if d.shouldValidateSSTablesLocked() { // queue is non-empty again (new or retried entries): start another round
    2408           1 :                 go d.validateSSTables()
    2409           1 :         }
    2410             : }

Generated by: LCOV version 1.14