LCOV - code coverage report
Current view: top level - pebble - snapshot.go (source / functions) Coverage Total Hit
Test: 2025-09-20 08:18Z 489b133b - tests only.lcov Lines: 83.5 % 255 213
Test Date: 2025-09-20 08:19:46 Functions: - 0 0

            Line data    Source code
       1              : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2              : // of this source code is governed by a BSD-style license that can be found in
       3              : // the LICENSE file.
       4              : 
       5              : package pebble
       6              : 
       7              : import (
       8              :         "context"
       9              :         "io"
      10              :         "math"
      11              :         "sync"
      12              :         "time"
      13              : 
      14              :         "github.com/cockroachdb/errors"
      15              :         "github.com/cockroachdb/pebble/internal/base"
      16              :         "github.com/cockroachdb/pebble/internal/invariants"
      17              :         "github.com/cockroachdb/pebble/internal/manifest"
      18              : )
      19              : 
      20              : // Snapshot provides a read-only point-in-time view of the DB state.
      21              : type Snapshot struct {
      22              :         // The db the snapshot was created from.
      23              :         db     *DB
      24              :         seqNum base.SeqNum
      25              : 
      26              :         // Set if part of an EventuallyFileOnlySnapshot.
      27              :         efos *EventuallyFileOnlySnapshot
      28              : 
      29              :         // The list the snapshot is linked into.
      30              :         list *snapshotList
      31              : 
      32              :         // The next/prev link for the snapshotList doubly-linked list of snapshots.
      33              :         prev, next *Snapshot
      34              : }
      35              : 
      36              : var _ Reader = (*Snapshot)(nil)
      37              : 
      38              : // Get gets the value for the given key. It returns ErrNotFound if the Snapshot
      39              : // does not contain the key.
      40              : //
      41              : // The caller should not modify the contents of the returned slice, but it is
      42              : // safe to modify the contents of the argument after Get returns. The returned
      43              : // slice will remain valid until the returned Closer is closed. On success, the
      44              : // caller MUST call closer.Close() or a memory leak will occur.
      45            1 : func (s *Snapshot) Get(key []byte) ([]byte, io.Closer, error) {
      46            1 :         if s.db == nil {
      47            1 :                 panic(ErrClosed)
      48              :         }
      49            1 :         return s.db.getInternal(key, nil /* batch */, s)
      50              : }
      51              : 
      52              : // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
      53              : // return false). The iterator can be positioned via a call to SeekGE,
      54              : // SeekLT, First or Last.
      55            1 : func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error) {
      56            1 :         return s.NewIterWithContext(context.Background(), o)
      57            1 : }
      58              : 
      59              : // NewIterWithContext is like NewIter, and additionally accepts a context for
      60              : // tracing.
      61            1 : func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) {
      62            1 :         if s.db == nil {
      63            1 :                 panic(ErrClosed)
      64              :         }
      65            1 :         return s.db.newIter(ctx, nil /* batch */, newIterOpts{
      66            1 :                 snapshot: snapshotIterOpts{seqNum: s.seqNum},
      67            1 :         }, o), nil
      68              : }
      69              : 
      70              : // ScanInternal scans all internal keys within the specified bounds, truncating
      71              : // any rangedels and rangekeys to those bounds. For use when an external user
      72              : // needs to be aware of all internal keys that make up a key range.
      73              : //
      74              : // See comment on db.ScanInternal for the behaviour that can be expected of
      75              : // point keys deleted by range dels and keys masked by range keys.
      76            1 : func (s *Snapshot) ScanInternal(ctx context.Context, opts ScanInternalOptions) (err error) {
      77            1 :         if s.db == nil {
      78            0 :                 panic(ErrClosed)
      79              :         }
      80            1 :         var iter *scanInternalIterator
      81            1 :         iter, err = s.db.newInternalIter(ctx, snapshotIterOpts{seqNum: s.seqNum}, &opts)
      82            1 :         if err != nil {
      83            0 :                 return err
      84            0 :         }
      85            1 :         defer func() { err = errors.CombineErrors(err, iter.Close()) }()
      86            1 :         return scanInternalImpl(ctx, iter, &opts)
      87              : }
      88              : 
      89              : // closeLocked is similar to Close(), except it requires that db.mu be held
      90              : // by the caller.
      91            1 : func (s *Snapshot) closeLocked() error {
      92            1 :         s.db.mu.snapshots.remove(s)
      93            1 : 
      94            1 :         // If s was the previous earliest snapshot, we might be able to reclaim
      95            1 :         // disk space by dropping obsolete records that were pinned by s.
      96            1 :         if e := s.db.mu.snapshots.earliest(); e > s.seqNum {
      97            1 :                 // NB: maybeScheduleCompaction also picks elision-only compactions.
      98            1 :                 s.db.maybeScheduleCompaction()
      99            1 :         }
     100            1 :         s.db = nil
     101            1 :         return nil
     102              : }
     103              : 
     104              : // Close closes the snapshot, releasing its resources. Close must be called.
     105              : // Failure to do so will result in a tiny memory leak and a large leak of
     106              : // resources on disk due to the entries the snapshot is preventing from being
     107              : // deleted.
     108              : //
     109              : // d.mu must NOT be held by the caller.
     110            1 : func (s *Snapshot) Close() error {
     111            1 :         db := s.db
     112            1 :         if db == nil {
     113            1 :                 panic(ErrClosed)
     114              :         }
     115            1 :         db.mu.Lock()
     116            1 :         defer db.mu.Unlock()
     117            1 :         return s.closeLocked()
     118              : }
     119              : 
     120              : type snapshotList struct {
     121              :         root Snapshot
     122              : }
     123              : 
     124            1 : func (l *snapshotList) init() {
     125            1 :         l.root.next = &l.root
     126            1 :         l.root.prev = &l.root
     127            1 : }
     128              : 
     129            1 : func (l *snapshotList) empty() bool {
     130            1 :         return l.root.next == &l.root
     131            1 : }
     132              : 
     133            1 : func (l *snapshotList) count() int {
     134            1 :         if l.empty() {
     135            1 :                 return 0
     136            1 :         }
     137            0 :         var count int
     138            0 :         for i := l.root.next; i != &l.root; i = i.next {
     139            0 :                 count++
     140            0 :         }
     141            0 :         return count
     142              : }
     143              : 
     144            1 : func (l *snapshotList) earliest() base.SeqNum {
     145            1 :         v := base.SeqNum(math.MaxUint64)
     146            1 :         if !l.empty() {
     147            1 :                 v = l.root.next.seqNum
     148            1 :         }
     149            1 :         return v
     150              : }
     151              : 
     152            1 : func (l *snapshotList) toSlice() []base.SeqNum {
     153            1 :         if l.empty() {
     154            1 :                 return nil
     155            1 :         }
     156            1 :         var results []base.SeqNum
     157            1 :         for i := l.root.next; i != &l.root; i = i.next {
     158            1 :                 results = append(results, i.seqNum)
     159            1 :         }
     160            1 :         return results
     161              : }
     162              : 
     163            1 : func (l *snapshotList) pushBack(s *Snapshot) {
     164            1 :         if s.list != nil || s.prev != nil || s.next != nil {
     165            0 :                 panic("pebble: snapshot list is inconsistent")
     166              :         }
     167            1 :         s.prev = l.root.prev
     168            1 :         s.prev.next = s
     169            1 :         s.next = &l.root
     170            1 :         s.next.prev = s
     171            1 :         s.list = l
     172              : }
     173              : 
     174            1 : func (l *snapshotList) remove(s *Snapshot) {
     175            1 :         if s == &l.root {
     176            0 :                 panic("pebble: cannot remove snapshot list root node")
     177              :         }
     178            1 :         if s.list != l {
     179            0 :                 panic("pebble: snapshot list is inconsistent")
     180              :         }
     181            1 :         s.prev.next = s.next
     182            1 :         s.next.prev = s.prev
     183            1 :         s.next = nil // avoid memory leaks
     184            1 :         s.prev = nil // avoid memory leaks
     185            1 :         s.list = nil // avoid memory leaks
     186              : }
     187              : 
     188              : // EventuallyFileOnlySnapshot (aka EFOS) provides a read-only point-in-time view
     189              : // of the database state, similar to Snapshot. An EventuallyFileOnlySnapshot
     190              : // induces less write amplification than Snapshot, at the cost of increased space
     191              : // amplification. While a Snapshot may increase write amplification across all
     192              : // flushes and compactions for the duration of its lifetime, an
     193              : // EventuallyFileOnlySnapshot only incurs that cost for flushes/compactions if
     194              : // memtables at the time of EFOS instantiation contained keys that the EFOS is
     195              : // interested in (i.e. its protectedRanges). In that case, the EFOS prevents
     196              : // elision of keys visible to it, similar to a Snapshot, until those memtables
     197              : // are flushed, and once that happens, the "EventuallyFileOnlySnapshot"
     198              : // transitions to a file-only snapshot state in which it pins zombies sstables
     199              : // like an open Iterator would, without pinning any memtables. Callers that can
     200              : // tolerate the increased space amplification of pinning zombie sstables until
     201              : // the snapshot is closed may prefer EventuallyFileOnlySnapshots for their
     202              : // reduced write amplification. Callers that desire the benefits of the file-only
     203              : // state that requires no pinning of memtables should call
     204              : // `WaitForFileOnlySnapshot()` before relying on the EFOS to keep producing iterators
     205              : // with zero write-amp and zero pinning of memtables in memory.
     206              : //
     207              : // EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in
     208              : // subtle ways. The IngestAndExcise can force the transition of an EFOS to a
     209              : // file-only snapshot if an excise overlaps with the EFOS bounds.
     210              : type EventuallyFileOnlySnapshot struct {
     211              :         mu struct {
     212              :                 // NB: If both this mutex and db.mu are being grabbed, db.mu should be
     213              :                 // grabbed _before_ grabbing this one.
     214              :                 sync.Mutex
     215              : 
     216              :                 // Either the snap field is set below, or the version is set at any given
     217              :                 // point of time. If a snapshot is referenced, this is not a file-only
     218              :                 // snapshot yet, and if a version is set (and ref'd) this is a file-only
     219              :                 // snapshot.
     220              : 
     221              :                 // The wrapped regular snapshot, if not a file-only snapshot yet.
     222              :                 snap *Snapshot
     223              :                 // The wrapped version reference, if a file-only snapshot.
     224              :                 vers *manifest.Version
     225              :         }
     226              : 
     227              :         // Key ranges to watch for an excise on.
     228              :         protectedRanges []KeyRange
     229              : 
     230              :         // The db the snapshot was created from.
     231              :         db     *DB
     232              :         seqNum base.SeqNum
     233              :         closed chan struct{}
     234              : }
     235              : 
     236            1 : func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot {
     237            1 :         isFileOnly := true
     238            1 : 
     239            1 :         d.mu.Lock()
     240            1 :         defer d.mu.Unlock()
     241            1 :         seqNum := d.mu.versions.visibleSeqNum.Load()
     242            1 :         // Check if any of the keyRanges overlap with a memtable.
     243            1 :         for i := range d.mu.mem.queue {
     244            1 :                 d.mu.mem.queue[i].computePossibleOverlaps(func(bounded) shouldContinue {
     245            1 :                         isFileOnly = false
     246            1 :                         return stopIteration
     247            1 :                 }, sliceAsBounded(keyRanges)...)
     248              :         }
     249            1 :         es := &EventuallyFileOnlySnapshot{
     250            1 :                 db:              d,
     251            1 :                 seqNum:          seqNum,
     252            1 :                 protectedRanges: keyRanges,
     253            1 :                 closed:          make(chan struct{}),
     254            1 :         }
     255            1 :         if isFileOnly {
     256            1 :                 es.mu.vers = d.mu.versions.currentVersion()
     257            1 :                 es.mu.vers.Ref()
     258            1 :         } else {
     259            1 :                 s := &Snapshot{
     260            1 :                         db:     d,
     261            1 :                         seqNum: seqNum,
     262            1 :                 }
     263            1 :                 s.efos = es
     264            1 :                 es.mu.snap = s
     265            1 :                 d.mu.snapshots.pushBack(s)
     266            1 :         }
     267            1 :         return es
     268              : }
     269              : 
     270              : // Transitions this EventuallyFileOnlySnapshot to a file-only snapshot. Requires
     271              : // earliestUnflushedSeqNum and vers to correspond to the same Version from the
     272              : // current or a past acquisition of db.mu. vers must have been Ref()'d before
     273              : // that mutex was released, if it was released.
     274              : //
     275              : // NB: The caller is expected to check for es.excised before making this
     276              : // call.
     277              : //
     278              : // d.mu must be held when calling this method.
     279            1 : func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *manifest.Version) error {
     280            1 :         es.mu.Lock()
     281            1 :         select {
     282            0 :         case <-es.closed:
     283            0 :                 vers.UnrefLocked()
     284            0 :                 es.mu.Unlock()
     285            0 :                 return ErrClosed
     286            1 :         default:
     287              :         }
     288            1 :         if es.mu.snap == nil {
     289            0 :                 es.mu.Unlock()
     290            0 :                 panic("pebble: tried to transition an eventually-file-only-snapshot twice")
     291              :         }
     292              :         // The caller has already called Ref() on vers.
     293            1 :         es.mu.vers = vers
     294            1 :         // NB: The callers should have already done a check of es.excised.
     295            1 :         oldSnap := es.mu.snap
     296            1 :         es.mu.snap = nil
     297            1 :         es.mu.Unlock()
     298            1 :         return oldSnap.closeLocked()
     299              : }
     300              : 
     301              : // hasTransitioned returns true if this EFOS has transitioned to a file-only
     302              : // snapshot.
     303            1 : func (es *EventuallyFileOnlySnapshot) hasTransitioned() bool {
     304            1 :         es.mu.Lock()
     305            1 :         defer es.mu.Unlock()
     306            1 :         return es.mu.vers != nil
     307            1 : }
     308              : 
     309              : // waitForFlush waits for a flush on any memtables that need to be flushed
     310              : // before this EFOS can transition to a file-only snapshot. If this EFOS is
     311              : // waiting on a flush of the mutable memtable, it forces a rotation within
     312              : // `dur` duration. For immutable memtables, it schedules a flush and waits for
     313              : // it to finish.
     314            1 : func (es *EventuallyFileOnlySnapshot) waitForFlush(ctx context.Context, dur time.Duration) error {
     315            1 :         es.db.mu.Lock()
     316            1 :         defer es.db.mu.Unlock()
     317            1 : 
     318            1 :         earliestUnflushedSeqNum := es.db.getEarliestUnflushedSeqNumLocked()
     319            1 :         for earliestUnflushedSeqNum < es.seqNum {
     320            1 :                 select {
     321            0 :                 case <-es.closed:
     322            0 :                         return ErrClosed
     323            0 :                 case <-ctx.Done():
     324            0 :                         return ctx.Err()
     325            1 :                 default:
     326              :                 }
     327              :                 // Check if the current mutable memtable contains keys less than seqNum.
     328              :                 // If so, rotate it.
     329            1 :                 if es.db.mu.mem.mutable.logSeqNum < es.seqNum && dur.Nanoseconds() > 0 {
     330            1 :                         es.db.maybeScheduleDelayedFlush(es.db.mu.mem.mutable, dur)
     331            1 :                 } else {
     332            0 :                         // Find the last memtable that contains seqNums less than es.seqNum,
     333            0 :                         // and force a flush on it.
     334            0 :                         var mem *flushableEntry
     335            0 :                         for i := range es.db.mu.mem.queue {
     336            0 :                                 if es.db.mu.mem.queue[i].logSeqNum < es.seqNum {
     337            0 :                                         mem = es.db.mu.mem.queue[i]
     338            0 :                                 }
     339              :                         }
     340            0 :                         mem.flushForced = true
     341            0 :                         es.db.maybeScheduleFlush()
     342              :                 }
     343            1 :                 es.db.mu.compact.cond.Wait()
     344            1 : 
     345            1 :                 earliestUnflushedSeqNum = es.db.getEarliestUnflushedSeqNumLocked()
     346              :         }
     347            1 :         return nil
     348              : }
     349              : 
     350              : // WaitForFileOnlySnapshot blocks the calling goroutine until this snapshot
     351              : // has been converted into a file-only snapshot (i.e. all memtables containing
     352              : // keys < seqNum are flushed). A duration can be passed in, and if nonzero,
     353              : // a delayed flush will be scheduled at that duration if necessary.
     354              : //
     355              : // Idempotent; can be called multiple times with no side effects.
     356              : func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot(
     357              :         ctx context.Context, dur time.Duration,
     358            1 : ) error {
     359            1 :         if es.hasTransitioned() {
     360            1 :                 return nil
     361            1 :         }
     362              : 
     363            1 :         if err := es.waitForFlush(ctx, dur); err != nil {
     364            0 :                 return err
     365            0 :         }
     366              : 
     367            1 :         if invariants.Enabled {
     368            1 :                 // Since we aren't returning an error, we _must_ have transitioned to a
     369            1 :                 // file-only snapshot by now.
     370            1 :                 if !es.hasTransitioned() {
     371            0 :                         panic("expected EFOS to have transitioned to file-only snapshot after flush")
     372              :                 }
     373              :         }
     374            1 :         return nil
     375              : }
     376              : 
     377              : // Close closes the file-only snapshot and releases all referenced resources.
     378              : // Not idempotent.
     379            1 : func (es *EventuallyFileOnlySnapshot) Close() error {
     380            1 :         close(es.closed)
     381            1 :         es.db.mu.Lock()
     382            1 :         defer es.db.mu.Unlock()
     383            1 :         es.mu.Lock()
     384            1 :         defer es.mu.Unlock()
     385            1 : 
     386            1 :         if es.mu.snap != nil {
     387            1 :                 if err := es.mu.snap.closeLocked(); err != nil {
     388            0 :                         return err
     389            0 :                 }
     390              :         }
     391            1 :         if es.mu.vers != nil {
     392            1 :                 es.mu.vers.UnrefLocked()
     393            1 :         }
     394            1 :         return nil
     395              : }
     396              : 
     397              : // Get implements the Reader interface.
     398            1 : func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.Closer, err error) {
     399            1 :         // TODO(jackson): Use getInternal.
     400            1 :         iter, err := es.NewIter(nil)
     401            1 :         if err != nil {
     402            0 :                 return nil, nil, err
     403            0 :         }
     404            1 :         if !iter.SeekPrefixGE(key) {
     405            1 :                 if err = firstError(iter.Error(), iter.Close()); err != nil {
     406            1 :                         return nil, nil, err
     407            1 :                 }
     408            1 :                 return nil, nil, ErrNotFound
     409              :         }
     410            1 :         if !es.db.equal(iter.Key(), key) {
     411            1 :                 return nil, nil, firstError(iter.Close(), ErrNotFound)
     412            1 :         }
     413            1 :         return iter.Value(), iter, nil
     414              : }
     415              : 
     416              : // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
     417              : // return false). The iterator can be positioned via a call to SeekGE,
     418              : // SeekLT, First or Last.
     419            1 : func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error) {
     420            1 :         return es.NewIterWithContext(context.Background(), o)
     421            1 : }
     422              : 
     423              : // NewIterWithContext is like NewIter, and additionally accepts a context for
     424              : // tracing.
     425              : func (es *EventuallyFileOnlySnapshot) NewIterWithContext(
     426              :         ctx context.Context, o *IterOptions,
     427            1 : ) (*Iterator, error) {
     428            1 :         select {
     429            0 :         case <-es.closed:
     430            0 :                 panic(ErrClosed)
     431            1 :         default:
     432              :         }
     433              : 
     434            1 :         es.mu.Lock()
     435            1 :         defer es.mu.Unlock()
     436            1 :         if es.mu.vers != nil {
     437            1 :                 sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers}
     438            1 :                 return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil
     439            1 :         }
     440              : 
     441            1 :         sOpts := snapshotIterOpts{seqNum: es.seqNum}
     442            1 :         iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o)
     443            1 :         return iter, nil
     444              : }
     445              : 
     446              : // ScanInternal scans all internal keys within the specified bounds, truncating
     447              : // any rangedels and rangekeys to those bounds. For use when an external user
     448              : // needs to be aware of all internal keys that make up a key range.
     449              : //
     450              : // See comment on db.ScanInternal for the behaviour that can be expected of
     451              : // point keys deleted by range dels and keys masked by range keys.
     452              : func (es *EventuallyFileOnlySnapshot) ScanInternal(
     453              :         ctx context.Context, opts ScanInternalOptions,
     454            1 : ) (err error) {
     455            1 :         if es.db == nil {
     456            0 :                 panic(ErrClosed)
     457              :         }
     458            1 :         var sOpts snapshotIterOpts
     459            1 : 
     460            1 :         es.mu.Lock()
     461            1 :         if es.mu.vers != nil {
     462            1 :                 sOpts = snapshotIterOpts{
     463            1 :                         seqNum: es.seqNum,
     464            1 :                         vers:   es.mu.vers,
     465            1 :                 }
     466            1 :         } else {
     467            1 :                 sOpts = snapshotIterOpts{
     468            1 :                         seqNum: es.seqNum,
     469            1 :                 }
     470            1 :         }
     471            1 :         es.mu.Unlock()
     472            1 :         var iter *scanInternalIterator
     473            1 :         iter, err = es.db.newInternalIter(ctx, sOpts, &opts)
     474            1 :         if err != nil {
     475            0 :                 return err
     476            0 :         }
     477            1 :         defer func() { err = errors.CombineErrors(err, iter.Close()) }()
     478            1 :         return scanInternalImpl(ctx, iter, &opts)
     479              : }
        

Generated by: LCOV version 2.0-1