Line data Source code
1 : // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 : // of this source code is governed by a BSD-style license that can be found in 3 : // the LICENSE file. 4 : 5 : package objstorage 6 : 7 : import ( 8 : "context" 9 : "fmt" 10 : 11 : "github.com/cockroachdb/errors" 12 : "github.com/cockroachdb/pebble/internal/base" 13 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" 14 : "github.com/cockroachdb/pebble/objstorage/remote" 15 : "github.com/cockroachdb/pebble/vfs" 16 : ) 17 : 18 : // Readable is the handle for an object that is open for reading. 19 : type Readable interface { 20 : // ReadAt reads len(p) bytes into p starting at offset off. 21 : // 22 : // Does not return partial results; if off + len(p) is past the end of the 23 : // object, an error is returned. 24 : // 25 : // Clients of ReadAt can execute parallel ReadAt calls on the 26 : // same Readable. 27 : ReadAt(ctx context.Context, p []byte, off int64) error 28 : 29 : Close() error 30 : 31 : // Size returns the size of the object. 32 : Size() int64 33 : 34 : // NewReadHandle creates a read handle for ReadAt requests that are related 35 : // and can benefit from optimizations like read-ahead. 36 : // 37 : // The ReadHandle must be closed before the Readable is closed. 38 : // 39 : // Multiple separate ReadHandles can be used. 40 : NewReadHandle(ctx context.Context) ReadHandle 41 : } 42 : 43 : // ReadHandle is used to perform reads that are related and might benefit from 44 : // optimizations like read-ahead. 45 : type ReadHandle interface { 46 : // ReadAt reads len(p) bytes into p starting at offset off. 47 : // 48 : // Does not return partial results; if off + len(p) is past the end of the 49 : // object, an error is returned. 50 : // 51 : // Parallel ReadAt calls on the same ReadHandle are not allowed. 52 : ReadAt(ctx context.Context, p []byte, off int64) error 53 : 54 : Close() error 55 : 56 : // SetupForCompaction informs the implementation that the read handle will 57 : // be used to read data blocks for a compaction. The implementation can expect 58 : // sequential reads, and can decide to not retain data in any caches. 59 : SetupForCompaction() 60 : 61 : // RecordCacheHit informs the implementation that we were able to retrieve a 62 : // block from cache. This is useful for example when the implementation is 63 : // trying to detect a sequential reading pattern. 64 : RecordCacheHit(ctx context.Context, offset, size int64) 65 : } 66 : 67 : // Writable is the handle for an object that is open for writing. 68 : // Either Finish or Abort must be called. 69 : type Writable interface { 70 : // Write writes len(p) bytes from p to the underlying object. The data is not 71 : // guaranteed to be durable until Finish is called. 72 : // 73 : // Note that Write *is* allowed to modify the slice passed in, whether 74 : // temporarily or permanently. Callers of Write need to take this into 75 : // account. 76 : Write(p []byte) error 77 : 78 : // Finish completes the object and makes the data durable. 79 : // No further calls are allowed after calling Finish. 80 : Finish() error 81 : 82 : // Abort gives up on finishing the object. There is no guarantee about whether 83 : // the object exists after calling Abort. 84 : // No further calls are allowed after calling Abort. 85 : Abort() 86 : } 87 : 88 : // ObjectMetadata contains the metadata required to be able to access an object. 89 : type ObjectMetadata struct { 90 : DiskFileNum base.DiskFileNum 91 : FileType base.FileType 92 : 93 : // The fields below are only set if the object is on remote storage. 94 : Remote struct { 95 : // CreatorID identifies the DB instance that originally created the object. 96 : // 97 : // Only used when CustomObjectName is not set. 98 : CreatorID CreatorID 99 : // CreatorFileNum is the identifier for the object within the context of the 100 : // DB instance that originally created the object. 101 : // 102 : // Only used when CustomObjectName is not set. 103 : CreatorFileNum base.DiskFileNum 104 : // CustomObjectName (if it is set) overrides the object name that is normally 105 : // derived from the CreatorID and CreatorFileNum. 106 : CustomObjectName string 107 : // CleanupMethod indicates the method for cleaning up unused shared objects. 108 : CleanupMethod SharedCleanupMethod 109 : // Locator identifies the remote.Storage implementation for this object. 110 : Locator remote.Locator 111 : // Storage is the remote.Storage object corresponding to the Locator. Used 112 : // to avoid lookups in hot paths. 113 : Storage remote.Storage 114 : } 115 : } 116 : 117 : // IsRemote returns true if the object is on remote storage. 118 1 : func (meta *ObjectMetadata) IsRemote() bool { 119 1 : return meta.IsShared() || meta.IsExternal() 120 1 : } 121 : 122 : // IsExternal returns true if the object is on remote storage but is not owned 123 : // by any Pebble instances in the cluster. 124 1 : func (meta *ObjectMetadata) IsExternal() bool { 125 1 : return meta.Remote.CustomObjectName != "" 126 1 : } 127 : 128 : // IsShared returns true if the object is on remote storage and is owned by a 129 : // Pebble instance in the cluster (potentially shared between multiple 130 : // instances). 131 1 : func (meta *ObjectMetadata) IsShared() bool { 132 1 : return meta.Remote.CreatorID.IsSet() 133 1 : } 134 : 135 : // AssertValid checks that the metadata is sane. 136 0 : func (meta *ObjectMetadata) AssertValid() { 137 0 : if !meta.IsRemote() { 138 0 : // Verify all Remote fields are empty. 139 0 : if meta.Remote != (ObjectMetadata{}).Remote { 140 0 : panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote)) 141 : } 142 0 : } else { 143 0 : if meta.Remote.CustomObjectName != "" { 144 0 : if meta.Remote.CreatorID == 0 { 145 0 : panic(errors.AssertionFailedf("CreatorID not set")) 146 : } 147 0 : if meta.Remote.CreatorFileNum == base.FileNum(0).DiskFileNum() { 148 0 : panic(errors.AssertionFailedf("CreatorFileNum not set")) 149 : } 150 : } 151 0 : if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking { 152 0 : panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod)) 153 : } 154 0 : if meta.Remote.Storage == nil { 155 0 : panic(errors.AssertionFailedf("Storage not set")) 156 : } 157 : } 158 : } 159 : 160 : // CreatorID identifies the DB instance that originally created a shared object. 161 : // This ID is incorporated in backing object names. 162 : // Must be non-zero. 163 : type CreatorID uint64 164 : 165 : // IsSet returns true if the CreatorID is not zero. 166 1 : func (c CreatorID) IsSet() bool { return c != 0 } 167 : 168 1 : func (c CreatorID) String() string { return fmt.Sprintf("%d", c) } 169 : 170 : // SharedCleanupMethod indicates the method for cleaning up unused shared objects. 171 : type SharedCleanupMethod uint8 172 : 173 : const ( 174 : // SharedRefTracking is used for shared objects for which objstorage providers 175 : // keep track of references via reference marker objects. 176 : SharedRefTracking SharedCleanupMethod = iota 177 : 178 : // SharedNoCleanup is used for remote objects that are managed externally; the 179 : // objstorage provider never deletes such objects. 180 : SharedNoCleanup 181 : ) 182 : 183 : // OpenOptions contains optional arguments for OpenForReading. 184 : type OpenOptions struct { 185 : // MustExist triggers a fatal error if the file does not exist. The fatal 186 : // error message contains extra information helpful for debugging. 187 : MustExist bool 188 : } 189 : 190 : // CreateOptions contains optional arguments for Create. 191 : type CreateOptions struct { 192 : // PreferSharedStorage causes the object to be created on shared storage if 193 : // the provider has shared storage configured. 194 : PreferSharedStorage bool 195 : 196 : // SharedCleanupMethod is used for the object when it is created on shared storage. 197 : // The default (zero) value is SharedRefTracking. 198 : SharedCleanupMethod SharedCleanupMethod 199 : } 200 : 201 : // Provider is a singleton object used to access and manage objects. 202 : // 203 : // An object is conceptually like a large immutable file. The main use of 204 : // objects is for storing sstables; in the future it could also be used for blob 205 : // storage. 206 : // 207 : // The Provider can only manage objects that it knows about - either objects 208 : // created by the provider, or existing objects the Provider was informed about 209 : // via AddObjects. 210 : // 211 : // Objects are currently backed by a vfs.File or a remote.Storage object. 212 : type Provider interface { 213 : // OpenForReading opens an existing object. 214 : OpenForReading( 215 : ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions, 216 : ) (Readable, error) 217 : 218 : // Create creates a new object and opens it for writing. 219 : // 220 : // The object is not guaranteed to be durable (accessible in case of crashes) 221 : // until Sync is called. 222 : Create( 223 : ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions, 224 : ) (w Writable, meta ObjectMetadata, err error) 225 : 226 : // Remove removes an object. 227 : // 228 : // The object is not guaranteed to be durably removed until Sync is called. 229 : Remove(fileType base.FileType, FileNum base.DiskFileNum) error 230 : 231 : // Sync flushes the metadata from creation or removal of objects since the last Sync. 232 : // This includes objects that have been Created but for which 233 : // Writable.Finish() has not yet been called. 234 : Sync() error 235 : 236 : // LinkOrCopyFromLocal creates a new object that is either a copy of a given 237 : // local file or a hard link (if the new object is created on the same FS, and 238 : // if the FS supports it). 239 : // 240 : // The object is not guaranteed to be durable (accessible in case of crashes) 241 : // until Sync is called. 242 : LinkOrCopyFromLocal( 243 : ctx context.Context, 244 : srcFS vfs.FS, 245 : srcFilePath string, 246 : dstFileType base.FileType, 247 : dstFileNum base.DiskFileNum, 248 : opts CreateOptions, 249 : ) (ObjectMetadata, error) 250 : 251 : // Lookup returns the metadata of an object that is already known to the Provider. 252 : // Does not perform any I/O. 253 : Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error) 254 : 255 : // Path returns an internal, implementation-dependent path for the object. It is 256 : // meant to be used for informational purposes (like logging). 257 : Path(meta ObjectMetadata) string 258 : 259 : // Size returns the size of the object. 260 : Size(meta ObjectMetadata) (int64, error) 261 : 262 : // List returns the objects currently known to the provider. Does not perform any I/O. 263 : List() []ObjectMetadata 264 : 265 : // SetCreatorID sets the CreatorID which is needed in order to use shared 266 : // objects. Remote object usage is disabled until this method is called the 267 : // first time. Once set, the Creator ID is persisted and cannot change. 268 : // 269 : // Cannot be called if shared storage is not configured for the provider. 270 : SetCreatorID(creatorID CreatorID) error 271 : 272 : // IsSharedForeign returns whether this object is owned by a different node. 273 : IsSharedForeign(meta ObjectMetadata) bool 274 : 275 : // RemoteObjectBacking encodes the remote object metadata for the given object. 276 : RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error) 277 : 278 : // CreateExternalObjectBacking creates a backing for an existing object with a 279 : // custom object name. The object is considered to be managed outside of 280 : // Pebble and will never be removed by Pebble. 281 : CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error) 282 : 283 : // AttachRemoteObjects registers existing remote objects with this provider. 284 : AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error) 285 : 286 : Close() error 287 : 288 : // IsNotExistError indicates whether the error is known to report that a file or 289 : // directory does not exist. 290 : IsNotExistError(err error) bool 291 : 292 : // Metrics returns metrics about objstorage. Currently, it only returns metrics 293 : // about the shared cache. 294 : Metrics() sharedcache.Metrics 295 : } 296 : 297 : // RemoteObjectBacking encodes the metadata necessary to incorporate a shared 298 : // object into a different Pebble instance. The encoding is specific to a given 299 : // Provider implementation. 300 : type RemoteObjectBacking []byte 301 : 302 : // RemoteObjectBackingHandle is a container for a RemoteObjectBacking which 303 : // ensures that the backing stays valid. A backing can otherwise become invalid 304 : // if this provider unrefs the shared object. The RemoteObjectBackingHandle 305 : // delays any unref until Close. 306 : type RemoteObjectBackingHandle interface { 307 : // Get returns the backing. The backing is only guaranteed to be valid until 308 : // Close is called (or until the Provider is closed). If Close was already 309 : // called, returns an error. 310 : Get() (RemoteObjectBacking, error) 311 : Close() 312 : } 313 : 314 : // RemoteObjectToAttach contains the arguments needed to attach an existing remote object. 315 : type RemoteObjectToAttach struct { 316 : // FileNum is the file number that will be used to refer to this object (in 317 : // the context of this instance). 318 : FileNum base.DiskFileNum 319 : FileType base.FileType 320 : // Backing contains the metadata for the remote object backing (normally 321 : // generated from a different instance, but using the same Provider 322 : // implementation). 323 : Backing RemoteObjectBacking 324 : }