LCOV - code coverage report
Current view: top level - pebble/objstorage - objstorage.go (source / functions) Hit Total Coverage
Test: 2024-06-05 08:15Z 907d8652 - meta test only.lcov Lines: 42 55 76.4 %
Date: 2024-06-05 08:16:31 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package objstorage
       6             : 
       7             : import (
       8             :         "context"
       9             :         "fmt"
      10             : 
      11             :         "github.com/cockroachdb/errors"
      12             :         "github.com/cockroachdb/pebble/internal/base"
      13             :         "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
      14             :         "github.com/cockroachdb/pebble/objstorage/remote"
      15             :         "github.com/cockroachdb/pebble/vfs"
      16             :         "github.com/cockroachdb/redact"
      17             : )
      18             : 
      19             : // Readable is the handle for an object that is open for reading.
      20             : type Readable interface {
      21             :         // ReadAt reads len(p) bytes into p starting at offset off.
      22             :         //
      23             :         // Does not return partial results; if off + len(p) is past the end of the
      24             :         // object, an error is returned.
      25             :         //
      26             :         // Clients of ReadAt can execute parallel ReadAt calls on the
      27             :         // same Readable.
      28             :         ReadAt(ctx context.Context, p []byte, off int64) error
      29             : 
      30             :         Close() error
      31             : 
      32             :         // Size returns the size of the object.
      33             :         Size() int64
      34             : 
      35             :         // NewReadHandle creates a read handle for ReadAt requests that are related
      36             :         // and can benefit from optimizations like read-ahead.
      37             :         //
      38             :         // The ReadHandle must be closed before the Readable is closed.
      39             :         //
      40             :         // Multiple separate ReadHandles can be used.
      41             :         NewReadHandle(ctx context.Context, readBeforeSize ReadBeforeSize) ReadHandle
      42             : }
      43             : 
      44             : // ReadBeforeSize specifies whether the first read should read additional
      45             : // bytes before the offset, and how big the overall read should be. This is
      46             : // just a suggestion that the callee can ignore (and does ignore in
      47             : // fileReadable).
      48             : //
      49             : // When 0, the first read will only read what it is asked to read, say n
      50             : // bytes. When it is a value b > 0, if b > n, then the read will be padded by
      51             : // an additional b-n bytes to the left, resulting in an overall read size of
      52             : // b. This behavior is akin to what the read-ahead implementation does -- when
      53             : // the n bytes are not buffered, and there is read-ahead of b > n, the read
      54             : // length is b bytes.
      55             : type ReadBeforeSize int64
      56             : 
      57             : const (
      58             :         // NoReadBefore specifies no read-before.
      59             :         NoReadBefore ReadBeforeSize = 0
      60             :         // ReadBeforeForNewReader is used for a new Reader reading the footer,
      61             :         // metaindex, properties. 32KB is unnecessarily large, but it is still small
      62             :         // when considering remote object storage.
      63             :         ReadBeforeForNewReader = 32 * 1024
      64             :         // ReadBeforeForIndexAndFilter is used for an iterator reading the top-level
      65             :         // index, filter and second-level index blocks.
      66             :         //
      67             :         // Consider a 128MB sstable with 32KB blocks, so 4K blocks. Say keys are
      68             :         // ~100 bytes, then the size of the index blocks is ~400KB. 512KB is a bit
      69             :         // bigger, and not too large to be a memory concern.
      70             :         ReadBeforeForIndexAndFilter = 512 * 1024
      71             : )
      72             : 
      73             : // ReadHandle is used to perform reads that are related and might benefit from
      74             : // optimizations like read-ahead.
      75             : type ReadHandle interface {
      76             :         // ReadAt reads len(p) bytes into p starting at offset off.
      77             :         //
      78             :         // Does not return partial results; if off + len(p) is past the end of the
      79             :         // object, an error is returned.
      80             :         //
      81             :         // Parallel ReadAt calls on the same ReadHandle are not allowed.
      82             :         ReadAt(ctx context.Context, p []byte, off int64) error
      83             : 
      84             :         Close() error
      85             : 
      86             :         // SetupForCompaction informs the implementation that the read handle will
      87             :         // be used to read data blocks for a compaction. The implementation can expect
      88             :         // sequential reads, and can decide to not retain data in any caches.
      89             :         SetupForCompaction()
      90             : 
      91             :         // RecordCacheHit informs the implementation that we were able to retrieve a
      92             :         // block from cache. This is useful for example when the implementation is
      93             :         // trying to detect a sequential reading pattern.
      94             :         RecordCacheHit(ctx context.Context, offset, size int64)
      95             : }
      96             : 
      97             : // Writable is the handle for an object that is open for writing.
      98             : // Either Finish or Abort must be called.
      99             : type Writable interface {
     100             :         // Write writes len(p) bytes from p to the underlying object. The data is not
     101             :         // guaranteed to be durable until Finish is called.
     102             :         //
     103             :         // Note that Write *is* allowed to modify the slice passed in, whether
     104             :         // temporarily or permanently. Callers of Write need to take this into
     105             :         // account.
     106             :         Write(p []byte) error
     107             : 
     108             :         // Finish completes the object and makes the data durable.
     109             :         // No further calls are allowed after calling Finish.
     110             :         Finish() error
     111             : 
     112             :         // Abort gives up on finishing the object. There is no guarantee about whether
     113             :         // the object exists after calling Abort.
     114             :         // No further calls are allowed after calling Abort.
     115             :         Abort()
     116             : }
     117             : 
     118             : // ObjectMetadata contains the metadata required to be able to access an object.
     119             : type ObjectMetadata struct {
     120             :         DiskFileNum base.DiskFileNum
     121             :         FileType    base.FileType
     122             : 
     123             :         // The fields below are only set if the object is on remote storage.
     124             :         Remote struct {
     125             :                 // CreatorID identifies the DB instance that originally created the object.
     126             :                 //
     127             :                 // Only used when CustomObjectName is not set.
     128             :                 CreatorID CreatorID
     129             :                 // CreatorFileNum is the identifier for the object within the context of the
     130             :                 // DB instance that originally created the object.
     131             :                 //
     132             :                 // Only used when CustomObjectName is not set.
     133             :                 CreatorFileNum base.DiskFileNum
     134             :                 // CustomObjectName (if it is set) overrides the object name that is normally
     135             :                 // derived from the CreatorID and CreatorFileNum.
     136             :                 CustomObjectName string
     137             :                 // CleanupMethod indicates the method for cleaning up unused shared objects.
     138             :                 CleanupMethod SharedCleanupMethod
     139             :                 // Locator identifies the remote.Storage implementation for this object.
     140             :                 Locator remote.Locator
     141             :                 // Storage is the remote.Storage object corresponding to the Locator. Used
     142             :                 // to avoid lookups in hot paths.
     143             :                 Storage remote.Storage
     144             :         }
     145             : }
     146             : 
     147             : // IsRemote returns true if the object is on remote storage.
     148           1 : func (meta *ObjectMetadata) IsRemote() bool {
     149           1 :         return meta.IsShared() || meta.IsExternal()
     150           1 : }
     151             : 
     152             : // IsExternal returns true if the object is on remote storage but is not owned
     153             : // by any Pebble instances in the cluster.
     154           1 : func (meta *ObjectMetadata) IsExternal() bool {
     155           1 :         return meta.Remote.CustomObjectName != ""
     156           1 : }
     157             : 
     158             : // IsShared returns true if the object is on remote storage and is owned by a
     159             : // Pebble instance in the cluster (potentially shared between multiple
     160             : // instances).
     161           1 : func (meta *ObjectMetadata) IsShared() bool {
     162           1 :         return meta.Remote.CreatorID.IsSet()
     163           1 : }
     164             : 
     165             : // AssertValid checks that the metadata is sane.
     166           1 : func (meta *ObjectMetadata) AssertValid() {
     167           1 :         if !meta.IsRemote() {
     168           1 :                 // Verify all Remote fields are empty.
     169           1 :                 if meta.Remote != (ObjectMetadata{}).Remote {
     170           0 :                         panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote))
     171             :                 }
     172           1 :         } else {
     173           1 :                 if meta.Remote.CustomObjectName == "" {
     174           1 :                         if meta.Remote.CreatorID == 0 {
     175           0 :                                 panic(errors.AssertionFailedf("CreatorID not set"))
     176             :                         }
     177           1 :                         if meta.Remote.CreatorFileNum == 0 {
     178           0 :                                 panic(errors.AssertionFailedf("CreatorFileNum not set"))
     179             :                         }
     180             :                 }
     181           1 :                 if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking {
     182           0 :                         panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod))
     183             :                 }
     184           1 :                 if meta.Remote.Storage == nil {
     185           0 :                         panic(errors.AssertionFailedf("Storage not set"))
     186             :                 }
     187             :         }
     188             : }
     189             : 
     190             : // CreatorID identifies the DB instance that originally created a shared object.
     191             : // This ID is incorporated in backing object names.
     192             : // Must be non-zero.
     193             : type CreatorID uint64
     194             : 
     195             : // IsSet returns true if the CreatorID is not zero.
     196           1 : func (c CreatorID) IsSet() bool { return c != 0 }
     197             : 
     198           1 : func (c CreatorID) String() string { return fmt.Sprintf("%d", c) }
     199             : 
     200             : // SafeFormat implements redact.SafeFormatter.
     201           0 : func (c CreatorID) SafeFormat(w redact.SafePrinter, _ rune) {
     202           0 :         w.Printf("%d", redact.SafeUint(c))
     203           0 : }
     204             : 
     205             : // SharedCleanupMethod indicates the method for cleaning up unused shared objects.
     206             : type SharedCleanupMethod uint8
     207             : 
     208             : const (
     209             :         // SharedRefTracking is used for shared objects for which objstorage providers
     210             :         // keep track of references via reference marker objects.
     211             :         SharedRefTracking SharedCleanupMethod = iota
     212             : 
     213             :         // SharedNoCleanup is used for remote objects that are managed externally; the
     214             :         // objstorage provider never deletes such objects.
     215             :         SharedNoCleanup
     216             : )
     217             : 
     218             : // OpenOptions contains optional arguments for OpenForReading.
     219             : type OpenOptions struct {
     220             :         // MustExist triggers a fatal error if the file does not exist. The fatal
     221             :         // error message contains extra information helpful for debugging.
     222             :         MustExist bool
     223             : }
     224             : 
     225             : // CreateOptions contains optional arguments for Create.
     226             : type CreateOptions struct {
     227             :         // PreferSharedStorage causes the object to be created on shared storage if
     228             :         // the provider has shared storage configured.
     229             :         PreferSharedStorage bool
     230             : 
     231             :         // SharedCleanupMethod is used for the object when it is created on shared storage.
     232             :         // The default (zero) value is SharedRefTracking.
     233             :         SharedCleanupMethod SharedCleanupMethod
     234             : 
     235             :         // WriteCategory is used for the object when it is created on local storage
     236             :         // to collect aggregated write metrics for each write source.
     237             :         WriteCategory vfs.DiskWriteCategory
     238             : }
     239             : 
     240             : // Provider is a singleton object used to access and manage objects.
     241             : //
     242             : // An object is conceptually like a large immutable file. The main use of
     243             : // objects is for storing sstables; in the future it could also be used for blob
     244             : // storage.
     245             : //
     246             : // The Provider can only manage objects that it knows about - either objects
     247             : // created by the provider, or existing objects the Provider was informed about
     248             : // via AddObjects.
     249             : //
     250             : // Objects are currently backed by a vfs.File or a remote.Storage object.
     251             : type Provider interface {
     252             :         // OpenForReading opens an existing object.
     253             :         OpenForReading(
     254             :                 ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions,
     255             :         ) (Readable, error)
     256             : 
     257             :         // Create creates a new object and opens it for writing.
     258             :         //
     259             :         // The object is not guaranteed to be durable (accessible in case of crashes)
     260             :         // until Sync is called.
     261             :         Create(
     262             :                 ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions,
     263             :         ) (w Writable, meta ObjectMetadata, err error)
     264             : 
     265             :         // Remove removes an object.
     266             :         //
     267             :         // The object is not guaranteed to be durably removed until Sync is called.
     268             :         Remove(fileType base.FileType, FileNum base.DiskFileNum) error
     269             : 
     270             :         // Sync flushes the metadata from creation or removal of objects since the last Sync.
     271             :         // This includes objects that have been Created but for which
     272             :         // Writable.Finish() has not yet been called.
     273             :         Sync() error
     274             : 
     275             :         // LinkOrCopyFromLocal creates a new object that is either a copy of a given
     276             :         // local file or a hard link (if the new object is created on the same FS, and
     277             :         // if the FS supports it).
     278             :         //
     279             :         // The object is not guaranteed to be durable (accessible in case of crashes)
     280             :         // until Sync is called.
     281             :         LinkOrCopyFromLocal(
     282             :                 ctx context.Context,
     283             :                 srcFS vfs.FS,
     284             :                 srcFilePath string,
     285             :                 dstFileType base.FileType,
     286             :                 dstFileNum base.DiskFileNum,
     287             :                 opts CreateOptions,
     288             :         ) (ObjectMetadata, error)
     289             : 
     290             :         // Lookup returns the metadata of an object that is already known to the Provider.
     291             :         // Does not perform any I/O.
     292             :         Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error)
     293             : 
     294             :         // Path returns an internal, implementation-dependent path for the object. It is
     295             :         // meant to be used for informational purposes (like logging).
     296             :         Path(meta ObjectMetadata) string
     297             : 
     298             :         // Size returns the size of the object.
     299             :         Size(meta ObjectMetadata) (int64, error)
     300             : 
     301             :         // List returns the objects currently known to the provider. Does not perform any I/O.
     302             :         List() []ObjectMetadata
     303             : 
     304             :         // SetCreatorID sets the CreatorID which is needed in order to use shared
     305             :         // objects. Remote object usage is disabled until this method is called the
     306             :         // first time. Once set, the Creator ID is persisted and cannot change.
     307             :         //
     308             :         // Cannot be called if shared storage is not configured for the provider.
     309             :         SetCreatorID(creatorID CreatorID) error
     310             : 
     311             :         // IsSharedForeign returns whether this object is owned by a different node.
     312             :         IsSharedForeign(meta ObjectMetadata) bool
     313             : 
     314             :         // RemoteObjectBacking encodes the remote object metadata for the given object.
     315             :         RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error)
     316             : 
     317             :         // CreateExternalObjectBacking creates a backing for an existing object with a
     318             :         // custom object name. The object is considered to be managed outside of
     319             :         // Pebble and will never be removed by Pebble.
     320             :         CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error)
     321             : 
     322             :         // GetExternalObjects returns a list of DiskFileNums corresponding to all
     323             :         // objects that are backed by the given external object.
     324             :         GetExternalObjects(locator remote.Locator, objName string) []base.DiskFileNum
     325             : 
     326             :         // AttachRemoteObjects registers existing remote objects with this provider.
     327             :         //
     328             :         // The objects are not guaranteed to be durable (accessible in case of
     329             :         // crashes) until Sync is called.
     330             :         AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error)
     331             : 
     332             :         Close() error
     333             : 
     334             :         // IsNotExistError indicates whether the error is known to report that a file or
     335             :         // directory does not exist.
     336             :         IsNotExistError(err error) bool
     337             : 
     338             :         // CheckpointState saves any saved state on local disk to the specified
     339             :         // directory on the specified VFS. A new Pebble instance instantiated at that
     340             :         // path should be able to resolve references to the specified files.
     341             :         CheckpointState(fs vfs.FS, dir string, fileType base.FileType, fileNums []base.DiskFileNum) error
     342             : 
     343             :         // Metrics returns metrics about objstorage. Currently, it only returns metrics
     344             :         // about the shared cache.
     345             :         Metrics() sharedcache.Metrics
     346             : }
     347             : 
     348             : // RemoteObjectBacking encodes the metadata necessary to incorporate a shared
     349             : // object into a different Pebble instance. The encoding is specific to a given
     350             : // Provider implementation.
     351             : type RemoteObjectBacking []byte
     352             : 
     353             : // RemoteObjectBackingHandle is a container for a RemoteObjectBacking which
     354             : // ensures that the backing stays valid. A backing can otherwise become invalid
     355             : // if this provider unrefs the shared object. The RemoteObjectBackingHandle
     356             : // delays any unref until Close.
     357             : type RemoteObjectBackingHandle interface {
     358             :         // Get returns the backing. The backing is only guaranteed to be valid until
     359             :         // Close is called (or until the Provider is closed). If Close was already
     360             :         // called, returns an error.
     361             :         Get() (RemoteObjectBacking, error)
     362             :         Close()
     363             : }
     364             : 
     365             : // RemoteObjectToAttach contains the arguments needed to attach an existing remote object.
     366             : type RemoteObjectToAttach struct {
     367             :         // FileNum is the file number that will be used to refer to this object (in
     368             :         // the context of this instance).
     369             :         FileNum  base.DiskFileNum
     370             :         FileType base.FileType
     371             :         // Backing contains the metadata for the remote object backing (normally
     372             :         // generated from a different instance, but using the same Provider
     373             :         // implementation).
     374             :         Backing RemoteObjectBacking
     375             : }
     376             : 
     377             : // Copy copies the specified range from the input to the output.
     378           1 : func Copy(ctx context.Context, in Readable, out Writable, offset, length uint64) error {
     379           1 :         r := in.NewReadHandle(ctx, NoReadBefore)
     380           1 :         r.SetupForCompaction()
     381           1 :         buf := make([]byte, 256<<10)
     382           1 :         end := offset + length
     383           1 :         for offset < end {
     384           1 :                 n := min(end-offset, uint64(len(buf)))
     385           1 :                 if n == 0 {
     386           0 :                         break
     387             :                 }
     388           1 :                 readErr := r.ReadAt(ctx, buf[:n], int64(offset))
     389           1 :                 if readErr != nil {
     390           0 :                         return readErr
     391           0 :                 }
     392           1 :                 offset += n
     393           1 :                 if err := out.Write(buf[:n]); err != nil {
     394           0 :                         return err
     395           0 :                 }
     396             :         }
     397           1 :         return nil
     398             : }
     399             : 
     400             : // IsLocalTable returns true if a table with the given fileNum exists and is
     401             : // local.
     402           1 : func IsLocalTable(provider Provider, fileNum base.DiskFileNum) bool {
     403           1 :         meta, err := provider.Lookup(base.FileTypeTable, fileNum)
     404           1 :         return err == nil && !meta.IsRemote()
     405           1 : }
     406             : 
     407             : // IsExternalTable returns true if a table with the given fileNum exists and is
     408             : // external.
     409           1 : func IsExternalTable(provider Provider, fileNum base.DiskFileNum) bool {
     410           1 :         meta, err := provider.Lookup(base.FileTypeTable, fileNum)
     411           1 :         return err == nil && meta.IsExternal()
     412           1 : }

Generated by: LCOV version 1.14