LCOV - code coverage report
Current view: top level - pebble - snapshot.go (source / functions) Coverage Total Hit
Test: 2025-02-28 08:17Z 9af14eed - meta test only.lcov Lines: 56.4 % 280 158
Test Date: 2025-02-28 08:18:29 Functions: - 0 0

            Line data    Source code
       1              : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2              : // of this source code is governed by a BSD-style license that can be found in
       3              : // the LICENSE file.
       4              : 
       5              : package pebble
       6              : 
       7              : import (
       8              :         "context"
       9              :         "io"
      10              :         "math"
      11              :         "sync"
      12              :         "time"
      13              : 
      14              :         "github.com/cockroachdb/pebble/internal/base"
      15              :         "github.com/cockroachdb/pebble/internal/invariants"
      16              :         "github.com/cockroachdb/pebble/rangekey"
      17              :         "github.com/cockroachdb/pebble/sstable/block"
      18              : )
      19              : 
      20              : // Snapshot provides a read-only point-in-time view of the DB state.
      21              : type Snapshot struct {
      22              :         // The db the snapshot was created from.
      23              :         db     *DB
      24              :         seqNum base.SeqNum
      25              : 
      26              :         // Set if part of an EventuallyFileOnlySnapshot.
      27              :         efos *EventuallyFileOnlySnapshot
      28              : 
      29              :         // The list the snapshot is linked into.
      30              :         list *snapshotList
      31              : 
      32              :         // The next/prev link for the snapshotList doubly-linked list of snapshots.
      33              :         prev, next *Snapshot
      34              : }
      35              : 
      36              : var _ Reader = (*Snapshot)(nil)
      37              : 
      38              : // Get gets the value for the given key. It returns ErrNotFound if the Snapshot
      39              : // does not contain the key.
      40              : //
      41              : // The caller should not modify the contents of the returned slice, but it is
      42              : // safe to modify the contents of the argument after Get returns. The returned
      43              : // slice will remain valid until the returned Closer is closed. On success, the
      44              : // caller MUST call closer.Close() or a memory leak will occur.
      45            1 : func (s *Snapshot) Get(key []byte) ([]byte, io.Closer, error) {
      46            1 :         if s.db == nil {
      47            0 :                 panic(ErrClosed)
      48              :         }
      49            1 :         return s.db.getInternal(key, nil /* batch */, s)
      50              : }
      51              : 
      52              : // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
      53              : // return false). The iterator can be positioned via a call to SeekGE,
      54              : // SeekLT, First or Last.
      55            1 : func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error) {
      56            1 :         return s.NewIterWithContext(context.Background(), o)
      57            1 : }
      58              : 
      59              : // NewIterWithContext is like NewIter, and additionally accepts a context for
      60              : // tracing.
      61            1 : func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) {
      62            1 :         if s.db == nil {
      63            0 :                 panic(ErrClosed)
      64              :         }
      65            1 :         return s.db.newIter(ctx, nil /* batch */, newIterOpts{
      66            1 :                 snapshot: snapshotIterOpts{seqNum: s.seqNum},
      67            1 :         }, o), nil
      68              : }
      69              : 
      70              : // ScanInternal scans all internal keys within the specified bounds, truncating
      71              : // any rangedels and rangekeys to those bounds. For use when an external user
      72              : // needs to be aware of all internal keys that make up a key range.
      73              : //
      74              : // See comment on db.ScanInternal for the behaviour that can be expected of
      75              : // point keys deleted by range dels and keys masked by range keys.
      76              : func (s *Snapshot) ScanInternal(
      77              :         ctx context.Context,
      78              :         category block.Category,
      79              :         lower, upper []byte,
      80              :         visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
      81              :         visitRangeDel func(start, end []byte, seqNum base.SeqNum) error,
      82              :         visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
      83              :         visitSharedFile func(sst *SharedSSTMeta) error,
      84              :         visitExternalFile func(sst *ExternalFile) error,
      85            0 : ) error {
      86            0 :         if s.db == nil {
      87            0 :                 panic(ErrClosed)
      88              :         }
      89            0 :         scanInternalOpts := &scanInternalOptions{
      90            0 :                 category:          category,
      91            0 :                 visitPointKey:     visitPointKey,
      92            0 :                 visitRangeDel:     visitRangeDel,
      93            0 :                 visitRangeKey:     visitRangeKey,
      94            0 :                 visitSharedFile:   visitSharedFile,
      95            0 :                 visitExternalFile: visitExternalFile,
      96            0 :                 IterOptions: IterOptions{
      97            0 :                         KeyTypes:   IterKeyTypePointsAndRanges,
      98            0 :                         LowerBound: lower,
      99            0 :                         UpperBound: upper,
     100            0 :                 },
     101            0 :         }
     102            0 : 
     103            0 :         iter, err := s.db.newInternalIter(ctx, snapshotIterOpts{seqNum: s.seqNum}, scanInternalOpts)
     104            0 :         if err != nil {
     105            0 :                 return err
     106            0 :         }
     107            0 :         defer iter.close()
     108            0 : 
     109            0 :         return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts)
     110              : }
     111              : 
     112              : // closeLocked is similar to Close(), except it requires that db.mu be held
     113              : // by the caller.
     114            1 : func (s *Snapshot) closeLocked() error {
     115            1 :         s.db.mu.snapshots.remove(s)
     116            1 : 
     117            1 :         // If s was the previous earliest snapshot, we might be able to reclaim
     118            1 :         // disk space by dropping obsolete records that were pinned by s.
     119            1 :         if e := s.db.mu.snapshots.earliest(); e > s.seqNum {
     120            1 :                 s.db.maybeScheduleCompactionPicker(pickElisionOnly)
     121            1 :         }
     122            1 :         s.db = nil
     123            1 :         return nil
     124              : }
     125              : 
     126              : // Close closes the snapshot, releasing its resources. Close must be called.
     127              : // Failure to do so will result in a tiny memory leak and a large leak of
     128              : // resources on disk due to the entries the snapshot is preventing from being
     129              : // deleted.
     130              : //
     131              : // d.mu must NOT be held by the caller.
     132            1 : func (s *Snapshot) Close() error {
     133            1 :         db := s.db
     134            1 :         if db == nil {
     135            0 :                 panic(ErrClosed)
     136              :         }
     137            1 :         db.mu.Lock()
     138            1 :         defer db.mu.Unlock()
     139            1 :         return s.closeLocked()
     140              : }
     141              : 
     142              : type snapshotList struct {
     143              :         root Snapshot
     144              : }
     145              : 
     146            1 : func (l *snapshotList) init() {
     147            1 :         l.root.next = &l.root
     148            1 :         l.root.prev = &l.root
     149            1 : }
     150              : 
     151            1 : func (l *snapshotList) empty() bool {
     152            1 :         return l.root.next == &l.root
     153            1 : }
     154              : 
     155            1 : func (l *snapshotList) count() int {
     156            1 :         if l.empty() {
     157            1 :                 return 0
     158            1 :         }
     159            0 :         var count int
     160            0 :         for i := l.root.next; i != &l.root; i = i.next {
     161            0 :                 count++
     162            0 :         }
     163            0 :         return count
     164              : }
     165              : 
     166            1 : func (l *snapshotList) earliest() base.SeqNum {
     167            1 :         v := base.SeqNum(math.MaxUint64)
     168            1 :         if !l.empty() {
     169            1 :                 v = l.root.next.seqNum
     170            1 :         }
     171            1 :         return v
     172              : }
     173              : 
     174            1 : func (l *snapshotList) toSlice() []base.SeqNum {
     175            1 :         if l.empty() {
     176            1 :                 return nil
     177            1 :         }
     178            1 :         var results []base.SeqNum
     179            1 :         for i := l.root.next; i != &l.root; i = i.next {
     180            1 :                 results = append(results, i.seqNum)
     181            1 :         }
     182            1 :         return results
     183              : }
     184              : 
     185            1 : func (l *snapshotList) pushBack(s *Snapshot) {
     186            1 :         if s.list != nil || s.prev != nil || s.next != nil {
     187            0 :                 panic("pebble: snapshot list is inconsistent")
     188              :         }
     189            1 :         s.prev = l.root.prev
     190            1 :         s.prev.next = s
     191            1 :         s.next = &l.root
     192            1 :         s.next.prev = s
     193            1 :         s.list = l
     194              : }
     195              : 
     196            1 : func (l *snapshotList) remove(s *Snapshot) {
     197            1 :         if s == &l.root {
     198            0 :                 panic("pebble: cannot remove snapshot list root node")
     199              :         }
     200            1 :         if s.list != l {
     201            0 :                 panic("pebble: snapshot list is inconsistent")
     202              :         }
     203            1 :         s.prev.next = s.next
     204            1 :         s.next.prev = s.prev
     205            1 :         s.next = nil // avoid memory leaks
     206            1 :         s.prev = nil // avoid memory leaks
     207            1 :         s.list = nil // avoid memory leaks
     208              : }
     209              : 
     210              : // EventuallyFileOnlySnapshot (aka EFOS) provides a read-only point-in-time view
     211              : // of the database state, similar to Snapshot. An EventuallyFileOnlySnapshot
     212              : // induces less write amplification than Snapshot, at the cost of increased space
     213              : // amplification. While a Snapshot may increase write amplification across all
     214              : // flushes and compactions for the duration of its lifetime, an
     215              : // EventuallyFileOnlySnapshot only incurs that cost for flushes/compactions if
     216              : // memtables at the time of EFOS instantiation contained keys that the EFOS is
     217              : // interested in (i.e. its protectedRanges). In that case, the EFOS prevents
     218              : // elision of keys visible to it, similar to a Snapshot, until those memtables
     219              : // are flushed, and once that happens, the "EventuallyFileOnlySnapshot"
     220              : // transitions to a file-only snapshot state in which it pins zombies sstables
     221              : // like an open Iterator would, without pinning any memtables. Callers that can
     222              : // tolerate the increased space amplification of pinning zombie sstables until
     223              : // the snapshot is closed may prefer EventuallyFileOnlySnapshots for their
     224              : // reduced write amplification. Callers that desire the benefits of the file-only
     225              : // state that requires no pinning of memtables should call
     226              : // `WaitForFileOnlySnapshot()` before relying on the EFOS to keep producing iterators
     227              : // with zero write-amp and zero pinning of memtables in memory.
     228              : //
     229              : // EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in
     230              : // subtle ways. The IngestAndExcise can force the transition of an EFOS to a
     231              : // file-only snapshot if an excise overlaps with the EFOS bounds.
     232              : type EventuallyFileOnlySnapshot struct {
     233              :         mu struct {
     234              :                 // NB: If both this mutex and db.mu are being grabbed, db.mu should be
     235              :                 // grabbed _before_ grabbing this one.
     236              :                 sync.Mutex
     237              : 
     238              :                 // Either the snap field is set below, or the version is set at any given
     239              :                 // point of time. If a snapshot is referenced, this is not a file-only
     240              :                 // snapshot yet, and if a version is set (and ref'd) this is a file-only
     241              :                 // snapshot.
     242              : 
     243              :                 // The wrapped regular snapshot, if not a file-only snapshot yet.
     244              :                 snap *Snapshot
     245              :                 // The wrapped version reference, if a file-only snapshot.
     246              :                 vers *version
     247              :         }
     248              : 
     249              :         // Key ranges to watch for an excise on.
     250              :         protectedRanges []KeyRange
     251              : 
     252              :         // The db the snapshot was created from.
     253              :         db     *DB
     254              :         seqNum base.SeqNum
     255              :         closed chan struct{}
     256              : }
     257              : 
     258            1 : func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot {
     259            1 :         isFileOnly := true
     260            1 : 
     261            1 :         d.mu.Lock()
     262            1 :         defer d.mu.Unlock()
     263            1 :         seqNum := d.mu.versions.visibleSeqNum.Load()
     264            1 :         // Check if any of the keyRanges overlap with a memtable.
     265            1 :         for i := range d.mu.mem.queue {
     266            1 :                 d.mu.mem.queue[i].computePossibleOverlaps(func(bounded) shouldContinue {
     267            1 :                         isFileOnly = false
     268            1 :                         return stopIteration
     269            1 :                 }, sliceAsBounded(keyRanges)...)
     270              :         }
     271            1 :         es := &EventuallyFileOnlySnapshot{
     272            1 :                 db:              d,
     273            1 :                 seqNum:          seqNum,
     274            1 :                 protectedRanges: keyRanges,
     275            1 :                 closed:          make(chan struct{}),
     276            1 :         }
     277            1 :         if isFileOnly {
     278            1 :                 es.mu.vers = d.mu.versions.currentVersion()
     279            1 :                 es.mu.vers.Ref()
     280            1 :         } else {
     281            1 :                 s := &Snapshot{
     282            1 :                         db:     d,
     283            1 :                         seqNum: seqNum,
     284            1 :                 }
     285            1 :                 s.efos = es
     286            1 :                 es.mu.snap = s
     287            1 :                 d.mu.snapshots.pushBack(s)
     288            1 :         }
     289            1 :         return es
     290              : }
     291              : 
     292              : // Transitions this EventuallyFileOnlySnapshot to a file-only snapshot. Requires
     293              : // earliestUnflushedSeqNum and vers to correspond to the same Version from the
     294              : // current or a past acquisition of db.mu. vers must have been Ref()'d before
     295              : // that mutex was released, if it was released.
     296              : //
     297              : // NB: The caller is expected to check for es.excised before making this
     298              : // call.
     299              : //
     300              : // d.mu must be held when calling this method.
     301            1 : func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *version) error {
     302            1 :         es.mu.Lock()
     303            1 :         select {
     304            1 :         case <-es.closed:
     305            1 :                 vers.UnrefLocked()
     306            1 :                 es.mu.Unlock()
     307            1 :                 return ErrClosed
     308            1 :         default:
     309              :         }
     310            1 :         if es.mu.snap == nil {
     311            0 :                 es.mu.Unlock()
     312            0 :                 panic("pebble: tried to transition an eventually-file-only-snapshot twice")
     313              :         }
     314              :         // The caller has already called Ref() on vers.
     315            1 :         es.mu.vers = vers
     316            1 :         // NB: The callers should have already done a check of es.excised.
     317            1 :         oldSnap := es.mu.snap
     318            1 :         es.mu.snap = nil
     319            1 :         es.mu.Unlock()
     320            1 :         return oldSnap.closeLocked()
     321              : }
     322              : 
     323              : // hasTransitioned returns true if this EFOS has transitioned to a file-only
     324              : // snapshot.
     325            1 : func (es *EventuallyFileOnlySnapshot) hasTransitioned() bool {
     326            1 :         es.mu.Lock()
     327            1 :         defer es.mu.Unlock()
     328            1 :         return es.mu.vers != nil
     329            1 : }
     330              : 
     331              : // waitForFlush waits for a flush on any memtables that need to be flushed
     332              : // before this EFOS can transition to a file-only snapshot. If this EFOS is
     333              : // waiting on a flush of the mutable memtable, it forces a rotation within
     334              : // `dur` duration. For immutable memtables, it schedules a flush and waits for
     335              : // it to finish.
     336            0 : func (es *EventuallyFileOnlySnapshot) waitForFlush(ctx context.Context, dur time.Duration) error {
     337            0 :         es.db.mu.Lock()
     338            0 :         defer es.db.mu.Unlock()
     339            0 : 
     340            0 :         earliestUnflushedSeqNum := es.db.getEarliestUnflushedSeqNumLocked()
     341            0 :         for earliestUnflushedSeqNum < es.seqNum {
     342            0 :                 select {
     343            0 :                 case <-es.closed:
     344            0 :                         return ErrClosed
     345            0 :                 case <-ctx.Done():
     346            0 :                         return ctx.Err()
     347            0 :                 default:
     348              :                 }
     349              :                 // Check if the current mutable memtable contains keys less than seqNum.
     350              :                 // If so, rotate it.
     351            0 :                 if es.db.mu.mem.mutable.logSeqNum < es.seqNum && dur.Nanoseconds() > 0 {
     352            0 :                         es.db.maybeScheduleDelayedFlush(es.db.mu.mem.mutable, dur)
     353            0 :                 } else {
     354            0 :                         // Find the last memtable that contains seqNums less than es.seqNum,
     355            0 :                         // and force a flush on it.
     356            0 :                         var mem *flushableEntry
     357            0 :                         for i := range es.db.mu.mem.queue {
     358            0 :                                 if es.db.mu.mem.queue[i].logSeqNum < es.seqNum {
     359            0 :                                         mem = es.db.mu.mem.queue[i]
     360            0 :                                 }
     361              :                         }
     362            0 :                         mem.flushForced = true
     363            0 :                         es.db.maybeScheduleFlush()
     364              :                 }
     365            0 :                 es.db.mu.compact.cond.Wait()
     366            0 : 
     367            0 :                 earliestUnflushedSeqNum = es.db.getEarliestUnflushedSeqNumLocked()
     368              :         }
     369            0 :         return nil
     370              : }
     371              : 
     372              : // WaitForFileOnlySnapshot blocks the calling goroutine until this snapshot
     373              : // has been converted into a file-only snapshot (i.e. all memtables containing
     374              : // keys < seqNum are flushed). A duration can be passed in, and if nonzero,
     375              : // a delayed flush will be scheduled at that duration if necessary.
     376              : //
     377              : // Idempotent; can be called multiple times with no side effects.
     378              : func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot(
     379              :         ctx context.Context, dur time.Duration,
     380            0 : ) error {
     381            0 :         if es.hasTransitioned() {
     382            0 :                 return nil
     383            0 :         }
     384              : 
     385            0 :         if err := es.waitForFlush(ctx, dur); err != nil {
     386            0 :                 return err
     387            0 :         }
     388              : 
     389            0 :         if invariants.Enabled {
     390            0 :                 // Since we aren't returning an error, we _must_ have transitioned to a
     391            0 :                 // file-only snapshot by now.
     392            0 :                 if !es.hasTransitioned() {
     393            0 :                         panic("expected EFOS to have transitioned to file-only snapshot after flush")
     394              :                 }
     395              :         }
     396            0 :         return nil
     397              : }
     398              : 
     399              : // Close closes the file-only snapshot and releases all referenced resources.
     400              : // Not idempotent.
     401            1 : func (es *EventuallyFileOnlySnapshot) Close() error {
     402            1 :         close(es.closed)
     403            1 :         es.db.mu.Lock()
     404            1 :         defer es.db.mu.Unlock()
     405            1 :         es.mu.Lock()
     406            1 :         defer es.mu.Unlock()
     407            1 : 
     408            1 :         if es.mu.snap != nil {
     409            1 :                 if err := es.mu.snap.closeLocked(); err != nil {
     410            0 :                         return err
     411            0 :                 }
     412              :         }
     413            1 :         if es.mu.vers != nil {
     414            1 :                 es.mu.vers.UnrefLocked()
     415            1 :         }
     416            1 :         return nil
     417              : }
     418              : 
     419              : // Get implements the Reader interface.
     420            1 : func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.Closer, err error) {
     421            1 :         // TODO(jackson): Use getInternal.
     422            1 :         iter, err := es.NewIter(nil)
     423            1 :         if err != nil {
     424            0 :                 return nil, nil, err
     425            0 :         }
     426            1 :         if !iter.SeekPrefixGE(key) {
     427            1 :                 if err = firstError(iter.Error(), iter.Close()); err != nil {
     428            0 :                         return nil, nil, err
     429            0 :                 }
     430            1 :                 return nil, nil, ErrNotFound
     431              :         }
     432            1 :         if !es.db.equal(iter.Key(), key) {
     433            1 :                 return nil, nil, firstError(iter.Close(), ErrNotFound)
     434            1 :         }
     435            1 :         return iter.Value(), iter, nil
     436              : }
     437              : 
     438              : // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
     439              : // return false). The iterator can be positioned via a call to SeekGE,
     440              : // SeekLT, First or Last.
     441            1 : func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error) {
     442            1 :         return es.NewIterWithContext(context.Background(), o)
     443            1 : }
     444              : 
     445              : // NewIterWithContext is like NewIter, and additionally accepts a context for
     446              : // tracing.
     447              : func (es *EventuallyFileOnlySnapshot) NewIterWithContext(
     448              :         ctx context.Context, o *IterOptions,
     449            1 : ) (*Iterator, error) {
     450            1 :         select {
     451            0 :         case <-es.closed:
     452            0 :                 panic(ErrClosed)
     453            1 :         default:
     454              :         }
     455              : 
     456            1 :         es.mu.Lock()
     457            1 :         defer es.mu.Unlock()
     458            1 :         if es.mu.vers != nil {
     459            1 :                 sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers}
     460            1 :                 return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil
     461            1 :         }
     462              : 
     463            1 :         sOpts := snapshotIterOpts{seqNum: es.seqNum}
     464            1 :         iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o)
     465            1 :         return iter, nil
     466              : }
     467              : 
     468              : // ScanInternal scans all internal keys within the specified bounds, truncating
     469              : // any rangedels and rangekeys to those bounds. For use when an external user
     470              : // needs to be aware of all internal keys that make up a key range.
     471              : //
     472              : // See comment on db.ScanInternal for the behaviour that can be expected of
     473              : // point keys deleted by range dels and keys masked by range keys.
     474              : func (es *EventuallyFileOnlySnapshot) ScanInternal(
     475              :         ctx context.Context,
     476              :         category block.Category,
     477              :         lower, upper []byte,
     478              :         visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
     479              :         visitRangeDel func(start, end []byte, seqNum base.SeqNum) error,
     480              :         visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
     481              :         visitSharedFile func(sst *SharedSSTMeta) error,
     482              :         visitExternalFile func(sst *ExternalFile) error,
     483            0 : ) error {
     484            0 :         if es.db == nil {
     485            0 :                 panic(ErrClosed)
     486              :         }
     487            0 :         var sOpts snapshotIterOpts
     488            0 :         opts := &scanInternalOptions{
     489            0 :                 category: category,
     490            0 :                 IterOptions: IterOptions{
     491            0 :                         KeyTypes:   IterKeyTypePointsAndRanges,
     492            0 :                         LowerBound: lower,
     493            0 :                         UpperBound: upper,
     494            0 :                 },
     495            0 :                 visitPointKey:     visitPointKey,
     496            0 :                 visitRangeDel:     visitRangeDel,
     497            0 :                 visitRangeKey:     visitRangeKey,
     498            0 :                 visitSharedFile:   visitSharedFile,
     499            0 :                 visitExternalFile: visitExternalFile,
     500            0 :         }
     501            0 :         es.mu.Lock()
     502            0 :         if es.mu.vers != nil {
     503            0 :                 sOpts = snapshotIterOpts{
     504            0 :                         seqNum: es.seqNum,
     505            0 :                         vers:   es.mu.vers,
     506            0 :                 }
     507            0 :         } else {
     508            0 :                 sOpts = snapshotIterOpts{
     509            0 :                         seqNum: es.seqNum,
     510            0 :                 }
     511            0 :         }
     512            0 :         es.mu.Unlock()
     513            0 :         iter, err := es.db.newInternalIter(ctx, sOpts, opts)
     514            0 :         if err != nil {
     515            0 :                 return err
     516            0 :         }
     517            0 :         defer iter.close()
     518            0 : 
     519            0 :         return scanInternalImpl(ctx, lower, upper, iter, opts)
     520              : }
        

Generated by: LCOV version 2.0-1