LCOV - code coverage report
Current view: top level - pebble - snapshot.go (source / functions) Hit Total Coverage
Test: 2024-12-09 08:17Z 0f9bf63d - meta test only.lcov Lines: 158 280 56.4 %
Date: 2024-12-09 08:18:14 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package pebble
       6             : 
       7             : import (
       8             :         "context"
       9             :         "io"
      10             :         "math"
      11             :         "sync"
      12             :         "time"
      13             : 
      14             :         "github.com/cockroachdb/pebble/internal/base"
      15             :         "github.com/cockroachdb/pebble/internal/invariants"
      16             :         "github.com/cockroachdb/pebble/rangekey"
      17             :         "github.com/cockroachdb/pebble/sstable"
      18             : )
      19             : 
      20             : // Snapshot provides a read-only point-in-time view of the DB state.
      21             : type Snapshot struct {
      22             :         // The db the snapshot was created from.
      23             :         db     *DB
      24             :         seqNum base.SeqNum
      25             : 
      26             :         // Set if part of an EventuallyFileOnlySnapshot.
      27             :         efos *EventuallyFileOnlySnapshot
      28             : 
      29             :         // The list the snapshot is linked into.
      30             :         list *snapshotList
      31             : 
      32             :         // The next/prev link for the snapshotList doubly-linked list of snapshots.
      33             :         prev, next *Snapshot
      34             : }
      35             : 
      36             : var _ Reader = (*Snapshot)(nil)
      37             : 
      38             : // Get gets the value for the given key. It returns ErrNotFound if the Snapshot
      39             : // does not contain the key.
      40             : //
      41             : // The caller should not modify the contents of the returned slice, but it is
      42             : // safe to modify the contents of the argument after Get returns. The returned
      43             : // slice will remain valid until the returned Closer is closed. On success, the
      44             : // caller MUST call closer.Close() or a memory leak will occur.
      45           1 : func (s *Snapshot) Get(key []byte) ([]byte, io.Closer, error) {
      46           1 :         if s.db == nil {
      47           0 :                 panic(ErrClosed)
      48             :         }
      49           1 :         return s.db.getInternal(key, nil /* batch */, s)
      50             : }
      51             : 
      52             : // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
      53             : // return false). The iterator can be positioned via a call to SeekGE,
      54             : // SeekLT, First or Last.
      55           1 : func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error) {
      56           1 :         return s.NewIterWithContext(context.Background(), o)
      57           1 : }
      58             : 
      59             : // NewIterWithContext is like NewIter, and additionally accepts a context for
      60             : // tracing.
      61           1 : func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) {
      62           1 :         if s.db == nil {
      63           0 :                 panic(ErrClosed)
      64             :         }
      65           1 :         return s.db.newIter(ctx, nil /* batch */, newIterOpts{
      66           1 :                 snapshot: snapshotIterOpts{seqNum: s.seqNum},
      67           1 :         }, o), nil
      68             : }
      69             : 
      70             : // ScanInternal scans all internal keys within the specified bounds, truncating
      71             : // any rangedels and rangekeys to those bounds. For use when an external user
      72             : // needs to be aware of all internal keys that make up a key range.
      73             : //
      74             : // See comment on db.ScanInternal for the behaviour that can be expected of
      75             : // point keys deleted by range dels and keys masked by range keys.
      76             : func (s *Snapshot) ScanInternal(
      77             :         ctx context.Context,
      78             :         category sstable.Category,
      79             :         lower, upper []byte,
      80             :         visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
      81             :         visitRangeDel func(start, end []byte, seqNum base.SeqNum) error,
      82             :         visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
      83             :         visitSharedFile func(sst *SharedSSTMeta) error,
      84             :         visitExternalFile func(sst *ExternalFile) error,
      85           0 : ) error {
      86           0 :         if s.db == nil {
      87           0 :                 panic(ErrClosed)
      88             :         }
      89           0 :         scanInternalOpts := &scanInternalOptions{
      90           0 :                 category:          category,
      91           0 :                 visitPointKey:     visitPointKey,
      92           0 :                 visitRangeDel:     visitRangeDel,
      93           0 :                 visitRangeKey:     visitRangeKey,
      94           0 :                 visitSharedFile:   visitSharedFile,
      95           0 :                 visitExternalFile: visitExternalFile,
      96           0 :                 IterOptions: IterOptions{
      97           0 :                         KeyTypes:   IterKeyTypePointsAndRanges,
      98           0 :                         LowerBound: lower,
      99           0 :                         UpperBound: upper,
     100           0 :                 },
     101           0 :         }
     102           0 : 
     103           0 :         iter, err := s.db.newInternalIter(ctx, snapshotIterOpts{seqNum: s.seqNum}, scanInternalOpts)
     104           0 :         if err != nil {
     105           0 :                 return err
     106           0 :         }
     107           0 :         defer iter.close()
     108           0 : 
     109           0 :         return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts)
     110             : }
     111             : 
     112             : // closeLocked is similar to Close(), except it requires that db.mu be held
     113             : // by the caller.
     114           1 : func (s *Snapshot) closeLocked() error {
     115           1 :         s.db.mu.snapshots.remove(s)
     116           1 : 
     117           1 :         // If s was the previous earliest snapshot, we might be able to reclaim
     118           1 :         // disk space by dropping obsolete records that were pinned by s.
     119           1 :         if e := s.db.mu.snapshots.earliest(); e > s.seqNum {
     120           1 :                 s.db.maybeScheduleCompactionPicker(pickElisionOnly)
     121           1 :         }
     122           1 :         s.db = nil
     123           1 :         return nil
     124             : }
     125             : 
     126             : // Close closes the snapshot, releasing its resources. Close must be called.
     127             : // Failure to do so will result in a tiny memory leak and a large leak of
     128             : // resources on disk due to the entries the snapshot is preventing from being
     129             : // deleted.
     130             : //
     131             : // d.mu must NOT be held by the caller.
     132           1 : func (s *Snapshot) Close() error {
     133           1 :         db := s.db
     134           1 :         if db == nil {
     135           0 :                 panic(ErrClosed)
     136             :         }
     137           1 :         db.mu.Lock()
     138           1 :         defer db.mu.Unlock()
     139           1 :         return s.closeLocked()
     140             : }
     141             : 
     142             : type snapshotList struct {
     143             :         root Snapshot
     144             : }
     145             : 
     146           1 : func (l *snapshotList) init() {
     147           1 :         l.root.next = &l.root
     148           1 :         l.root.prev = &l.root
     149           1 : }
     150             : 
     151           1 : func (l *snapshotList) empty() bool {
     152           1 :         return l.root.next == &l.root
     153           1 : }
     154             : 
     155           1 : func (l *snapshotList) count() int {
     156           1 :         if l.empty() {
     157           1 :                 return 0
     158           1 :         }
     159           0 :         var count int
     160           0 :         for i := l.root.next; i != &l.root; i = i.next {
     161           0 :                 count++
     162           0 :         }
     163           0 :         return count
     164             : }
     165             : 
     166           1 : func (l *snapshotList) earliest() base.SeqNum {
     167           1 :         v := base.SeqNum(math.MaxUint64)
     168           1 :         if !l.empty() {
     169           1 :                 v = l.root.next.seqNum
     170           1 :         }
     171           1 :         return v
     172             : }
     173             : 
     174           1 : func (l *snapshotList) toSlice() []base.SeqNum {
     175           1 :         if l.empty() {
     176           1 :                 return nil
     177           1 :         }
     178           1 :         var results []base.SeqNum
     179           1 :         for i := l.root.next; i != &l.root; i = i.next {
     180           1 :                 results = append(results, i.seqNum)
     181           1 :         }
     182           1 :         return results
     183             : }
     184             : 
     185           1 : func (l *snapshotList) pushBack(s *Snapshot) {
     186           1 :         if s.list != nil || s.prev != nil || s.next != nil {
     187           0 :                 panic("pebble: snapshot list is inconsistent")
     188             :         }
     189           1 :         s.prev = l.root.prev
     190           1 :         s.prev.next = s
     191           1 :         s.next = &l.root
     192           1 :         s.next.prev = s
     193           1 :         s.list = l
     194             : }
     195             : 
     196           1 : func (l *snapshotList) remove(s *Snapshot) {
     197           1 :         if s == &l.root {
     198           0 :                 panic("pebble: cannot remove snapshot list root node")
     199             :         }
     200           1 :         if s.list != l {
     201           0 :                 panic("pebble: snapshot list is inconsistent")
     202             :         }
     203           1 :         s.prev.next = s.next
     204           1 :         s.next.prev = s.prev
     205           1 :         s.next = nil // avoid memory leaks
     206           1 :         s.prev = nil // avoid memory leaks
     207           1 :         s.list = nil // avoid memory leaks
     208             : }
     209             : 
     210             : // EventuallyFileOnlySnapshot (aka EFOS) provides a read-only point-in-time view
     211             : // of the database state, similar to Snapshot. An EventuallyFileOnlySnapshot
     212             : // induces less write amplification than Snapshot, at the cost of increased space
     213             : // amplification. While a Snapshot may increase write amplification across all
     214             : // flushes and compactions for the duration of its lifetime, an
     215             : // EventuallyFileOnlySnapshot only incurs that cost for flushes/compactions if
     216             : // memtables at the time of EFOS instantiation contained keys that the EFOS is
     217             : // interested in (i.e. its protectedRanges). In that case, the EFOS prevents
     218             : // elision of keys visible to it, similar to a Snapshot, until those memtables
     219             : // are flushed, and once that happens, the "EventuallyFileOnlySnapshot"
     220             : // transitions to a file-only snapshot state in which it pins zombies sstables
     221             : // like an open Iterator would, without pinning any memtables. Callers that can
     222             : // tolerate the increased space amplification of pinning zombie sstables until
     223             : // the snapshot is closed may prefer EventuallyFileOnlySnapshots for their
     224             : // reduced write amplification. Callers that desire the benefits of the file-only
     225             : // state that requires no pinning of memtables should call
     226             : // `WaitForFileOnlySnapshot()` before relying on the EFOS to keep producing iterators
     227             : // with zero write-amp and zero pinning of memtables in memory.
     228             : //
     229             : // EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in
     230             : // subtle ways. The IngestAndExcise can force the transition of an EFOS to a
     231             : // file-only snapshot if an excise overlaps with the EFOS bounds.
     232             : type EventuallyFileOnlySnapshot struct {
     233             :         mu struct {
     234             :                 // NB: If both this mutex and db.mu are being grabbed, db.mu should be
     235             :                 // grabbed _before_ grabbing this one.
     236             :                 sync.Mutex
     237             : 
     238             :                 // Either the snap field is set below, or the version is set at any given
     239             :                 // point of time. If a snapshot is referenced, this is not a file-only
     240             :                 // snapshot yet, and if a version is set (and ref'd) this is a file-only
     241             :                 // snapshot.
     242             : 
     243             :                 // The wrapped regular snapshot, if not a file-only snapshot yet.
     244             :                 snap *Snapshot
     245             :                 // The wrapped version reference, if a file-only snapshot.
     246             :                 vers *version
     247             :         }
     248             : 
     249             :         // Key ranges to watch for an excise on.
     250             :         protectedRanges []KeyRange
     251             : 
     252             :         // The db the snapshot was created from.
     253             :         db     *DB
     254             :         seqNum base.SeqNum
     255             :         closed chan struct{}
     256             : }
     257             : 
     258           1 : func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot {
     259           1 :         isFileOnly := true
     260           1 : 
     261           1 :         d.mu.Lock()
     262           1 :         defer d.mu.Unlock()
     263           1 :         seqNum := d.mu.versions.visibleSeqNum.Load()
     264           1 :         // Check if any of the keyRanges overlap with a memtable.
     265           1 :         for i := range d.mu.mem.queue {
     266           1 :                 d.mu.mem.queue[i].computePossibleOverlaps(func(bounded) shouldContinue {
     267           1 :                         isFileOnly = false
     268           1 :                         return stopIteration
     269           1 :                 }, sliceAsBounded(keyRanges)...)
     270             :         }
     271           1 :         es := &EventuallyFileOnlySnapshot{
     272           1 :                 db:              d,
     273           1 :                 seqNum:          seqNum,
     274           1 :                 protectedRanges: keyRanges,
     275           1 :                 closed:          make(chan struct{}),
     276           1 :         }
     277           1 :         if isFileOnly {
     278           1 :                 es.mu.vers = d.mu.versions.currentVersion()
     279           1 :                 es.mu.vers.Ref()
     280           1 :         } else {
     281           1 :                 s := &Snapshot{
     282           1 :                         db:     d,
     283           1 :                         seqNum: seqNum,
     284           1 :                 }
     285           1 :                 s.efos = es
     286           1 :                 es.mu.snap = s
     287           1 :                 d.mu.snapshots.pushBack(s)
     288           1 :         }
     289           1 :         return es
     290             : }
     291             : 
     292             : // Transitions this EventuallyFileOnlySnapshot to a file-only snapshot. Requires
     293             : // earliestUnflushedSeqNum and vers to correspond to the same Version from the
     294             : // current or a past acquisition of db.mu. vers must have been Ref()'d before
     295             : // that mutex was released, if it was released.
     296             : //
     297             : // NB: The caller is expected to check for es.excised before making this
     298             : // call.
     299             : //
     300             : // d.mu must be held when calling this method.
     301           1 : func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *version) error {
     302           1 :         es.mu.Lock()
     303           1 :         select {
     304           1 :         case <-es.closed:
     305           1 :                 vers.UnrefLocked()
     306           1 :                 es.mu.Unlock()
     307           1 :                 return ErrClosed
     308           1 :         default:
     309             :         }
     310           1 :         if es.mu.snap == nil {
     311           0 :                 es.mu.Unlock()
     312           0 :                 panic("pebble: tried to transition an eventually-file-only-snapshot twice")
     313             :         }
     314             :         // The caller has already called Ref() on vers.
     315           1 :         es.mu.vers = vers
     316           1 :         // NB: The callers should have already done a check of es.excised.
     317           1 :         oldSnap := es.mu.snap
     318           1 :         es.mu.snap = nil
     319           1 :         es.mu.Unlock()
     320           1 :         return oldSnap.closeLocked()
     321             : }
     322             : 
     323             : // hasTransitioned returns true if this EFOS has transitioned to a file-only
     324             : // snapshot.
     325           1 : func (es *EventuallyFileOnlySnapshot) hasTransitioned() bool {
     326           1 :         es.mu.Lock()
     327           1 :         defer es.mu.Unlock()
     328           1 :         return es.mu.vers != nil
     329           1 : }
     330             : 
     331             : // waitForFlush waits for a flush on any memtables that need to be flushed
     332             : // before this EFOS can transition to a file-only snapshot. If this EFOS is
     333             : // waiting on a flush of the mutable memtable, it forces a rotation within
     334             : // `dur` duration. For immutable memtables, it schedules a flush and waits for
     335             : // it to finish.
     336           0 : func (es *EventuallyFileOnlySnapshot) waitForFlush(ctx context.Context, dur time.Duration) error {
     337           0 :         es.db.mu.Lock()
     338           0 :         defer es.db.mu.Unlock()
     339           0 : 
     340           0 :         earliestUnflushedSeqNum := es.db.getEarliestUnflushedSeqNumLocked()
     341           0 :         for earliestUnflushedSeqNum < es.seqNum {
     342           0 :                 select {
     343           0 :                 case <-es.closed:
     344           0 :                         return ErrClosed
     345           0 :                 case <-ctx.Done():
     346           0 :                         return ctx.Err()
     347           0 :                 default:
     348             :                 }
     349             :                 // Check if the current mutable memtable contains keys less than seqNum.
     350             :                 // If so, rotate it.
     351           0 :                 if es.db.mu.mem.mutable.logSeqNum < es.seqNum && dur.Nanoseconds() > 0 {
     352           0 :                         es.db.maybeScheduleDelayedFlush(es.db.mu.mem.mutable, dur)
     353           0 :                 } else {
     354           0 :                         // Find the last memtable that contains seqNums less than es.seqNum,
     355           0 :                         // and force a flush on it.
     356           0 :                         var mem *flushableEntry
     357           0 :                         for i := range es.db.mu.mem.queue {
     358           0 :                                 if es.db.mu.mem.queue[i].logSeqNum < es.seqNum {
     359           0 :                                         mem = es.db.mu.mem.queue[i]
     360           0 :                                 }
     361             :                         }
     362           0 :                         mem.flushForced = true
     363           0 :                         es.db.maybeScheduleFlush()
     364             :                 }
     365           0 :                 es.db.mu.compact.cond.Wait()
     366           0 : 
     367           0 :                 earliestUnflushedSeqNum = es.db.getEarliestUnflushedSeqNumLocked()
     368             :         }
     369           0 :         return nil
     370             : }
     371             : 
     372             : // WaitForFileOnlySnapshot blocks the calling goroutine until this snapshot
     373             : // has been converted into a file-only snapshot (i.e. all memtables containing
     374             : // keys < seqNum are flushed). A duration can be passed in, and if nonzero,
     375             : // a delayed flush will be scheduled at that duration if necessary.
     376             : //
     377             : // Idempotent; can be called multiple times with no side effects.
     378             : func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot(
     379             :         ctx context.Context, dur time.Duration,
     380           0 : ) error {
     381           0 :         if es.hasTransitioned() {
     382           0 :                 return nil
     383           0 :         }
     384             : 
     385           0 :         if err := es.waitForFlush(ctx, dur); err != nil {
     386           0 :                 return err
     387           0 :         }
     388             : 
     389           0 :         if invariants.Enabled {
     390           0 :                 // Since we aren't returning an error, we _must_ have transitioned to a
     391           0 :                 // file-only snapshot by now.
     392           0 :                 if !es.hasTransitioned() {
     393           0 :                         panic("expected EFOS to have transitioned to file-only snapshot after flush")
     394             :                 }
     395             :         }
     396           0 :         return nil
     397             : }
     398             : 
     399             : // Close closes the file-only snapshot and releases all referenced resources.
     400             : // Not idempotent.
     401           1 : func (es *EventuallyFileOnlySnapshot) Close() error {
     402           1 :         close(es.closed)
     403           1 :         es.db.mu.Lock()
     404           1 :         defer es.db.mu.Unlock()
     405           1 :         es.mu.Lock()
     406           1 :         defer es.mu.Unlock()
     407           1 : 
     408           1 :         if es.mu.snap != nil {
     409           1 :                 if err := es.mu.snap.closeLocked(); err != nil {
     410           0 :                         return err
     411           0 :                 }
     412             :         }
     413           1 :         if es.mu.vers != nil {
     414           1 :                 es.mu.vers.UnrefLocked()
     415           1 :         }
     416           1 :         return nil
     417             : }
     418             : 
     419             : // Get implements the Reader interface.
     420           1 : func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.Closer, err error) {
     421           1 :         // TODO(jackson): Use getInternal.
     422           1 :         iter, err := es.NewIter(nil)
     423           1 :         if err != nil {
     424           0 :                 return nil, nil, err
     425           0 :         }
     426           1 :         if !iter.SeekPrefixGE(key) {
     427           1 :                 if err = firstError(iter.Error(), iter.Close()); err != nil {
     428           0 :                         return nil, nil, err
     429           0 :                 }
     430           1 :                 return nil, nil, ErrNotFound
     431             :         }
     432           1 :         if !es.db.equal(iter.Key(), key) {
     433           1 :                 return nil, nil, firstError(iter.Close(), ErrNotFound)
     434           1 :         }
     435           1 :         return iter.Value(), iter, nil
     436             : }
     437             : 
     438             : // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
     439             : // return false). The iterator can be positioned via a call to SeekGE,
     440             : // SeekLT, First or Last.
     441           1 : func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error) {
     442           1 :         return es.NewIterWithContext(context.Background(), o)
     443           1 : }
     444             : 
     445             : // NewIterWithContext is like NewIter, and additionally accepts a context for
     446             : // tracing.
     447             : func (es *EventuallyFileOnlySnapshot) NewIterWithContext(
     448             :         ctx context.Context, o *IterOptions,
     449           1 : ) (*Iterator, error) {
     450           1 :         select {
     451           0 :         case <-es.closed:
     452           0 :                 panic(ErrClosed)
     453           1 :         default:
     454             :         }
     455             : 
     456           1 :         es.mu.Lock()
     457           1 :         defer es.mu.Unlock()
     458           1 :         if es.mu.vers != nil {
     459           1 :                 sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers}
     460           1 :                 return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil
     461           1 :         }
     462             : 
     463           1 :         sOpts := snapshotIterOpts{seqNum: es.seqNum}
     464           1 :         iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o)
     465           1 :         return iter, nil
     466             : }
     467             : 
     468             : // ScanInternal scans all internal keys within the specified bounds, truncating
     469             : // any rangedels and rangekeys to those bounds. For use when an external user
     470             : // needs to be aware of all internal keys that make up a key range.
     471             : //
     472             : // See comment on db.ScanInternal for the behaviour that can be expected of
     473             : // point keys deleted by range dels and keys masked by range keys.
     474             : func (es *EventuallyFileOnlySnapshot) ScanInternal(
     475             :         ctx context.Context,
     476             :         category sstable.Category,
     477             :         lower, upper []byte,
     478             :         visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
     479             :         visitRangeDel func(start, end []byte, seqNum base.SeqNum) error,
     480             :         visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
     481             :         visitSharedFile func(sst *SharedSSTMeta) error,
     482             :         visitExternalFile func(sst *ExternalFile) error,
     483           0 : ) error {
     484           0 :         if es.db == nil {
     485           0 :                 panic(ErrClosed)
     486             :         }
     487           0 :         var sOpts snapshotIterOpts
     488           0 :         opts := &scanInternalOptions{
     489           0 :                 category: category,
     490           0 :                 IterOptions: IterOptions{
     491           0 :                         KeyTypes:   IterKeyTypePointsAndRanges,
     492           0 :                         LowerBound: lower,
     493           0 :                         UpperBound: upper,
     494           0 :                 },
     495           0 :                 visitPointKey:     visitPointKey,
     496           0 :                 visitRangeDel:     visitRangeDel,
     497           0 :                 visitRangeKey:     visitRangeKey,
     498           0 :                 visitSharedFile:   visitSharedFile,
     499           0 :                 visitExternalFile: visitExternalFile,
     500           0 :         }
     501           0 :         es.mu.Lock()
     502           0 :         if es.mu.vers != nil {
     503           0 :                 sOpts = snapshotIterOpts{
     504           0 :                         seqNum: es.seqNum,
     505           0 :                         vers:   es.mu.vers,
     506           0 :                 }
     507           0 :         } else {
     508           0 :                 sOpts = snapshotIterOpts{
     509           0 :                         seqNum: es.seqNum,
     510           0 :                 }
     511           0 :         }
     512           0 :         es.mu.Unlock()
     513           0 :         iter, err := es.db.newInternalIter(ctx, sOpts, opts)
     514           0 :         if err != nil {
     515           0 :                 return err
     516           0 :         }
     517           0 :         defer iter.close()
     518           0 : 
     519           0 :         return scanInternalImpl(ctx, lower, upper, iter, opts)
     520             : }

Generated by: LCOV version 1.14