// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package objstorage

import (
	"context"
	"fmt"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/redact"
)

// Readable is the handle for an object that is open for reading.
type Readable interface {
	// ReadAt reads len(p) bytes into p starting at offset off.
	//
	// Does not return partial results; if off + len(p) is past the end of the
	// object, an error is returned.
	//
	// Clients of ReadAt can execute parallel ReadAt calls on the
	// same Readable.
	ReadAt(ctx context.Context, p []byte, off int64) error

	Close() error

	// Size returns the size of the object.
	Size() int64

	// NewReadHandle creates a read handle for ReadAt requests that are related
	// and can benefit from optimizations like read-ahead.
	//
	// The ReadHandle must be closed before the Readable is closed.
	//
	// Multiple separate ReadHandles can be used.
	NewReadHandle(ctx context.Context) ReadHandle
}

// ReadHandle is used to perform reads that are related and might benefit from
// optimizations like read-ahead.
type ReadHandle interface {
	// ReadAt reads len(p) bytes into p starting at offset off.
	//
	// Does not return partial results; if off + len(p) is past the end of the
	// object, an error is returned.
	//
	// Parallel ReadAt calls on the same ReadHandle are not allowed.
	ReadAt(ctx context.Context, p []byte, off int64) error

	Close() error

	// SetupForCompaction informs the implementation that the read handle will
	// be used to read data blocks for a compaction. The implementation can expect
	// sequential reads, and can decide to not retain data in any caches.
	SetupForCompaction()

	// RecordCacheHit informs the implementation that we were able to retrieve a
	// block from cache. This is useful for example when the implementation is
	// trying to detect a sequential reading pattern.
	RecordCacheHit(ctx context.Context, offset, size int64)
}

// Writable is the handle for an object that is open for writing.
// Either Finish or Abort must be called.
type Writable interface {
	// Write writes len(p) bytes from p to the underlying object. The data is not
	// guaranteed to be durable until Finish is called.
	//
	// Note that Write *is* allowed to modify the slice passed in, whether
	// temporarily or permanently. Callers of Write need to take this into
	// account.
	Write(p []byte) error

	// Finish completes the object and makes the data durable.
	// No further calls are allowed after calling Finish.
	Finish() error

	// Abort gives up on finishing the object. There is no guarantee about whether
	// the object exists after calling Abort.
	// No further calls are allowed after calling Abort.
	Abort()
}
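
// writeAll is an illustrative sketch, not part of the package API: it shows
// the intended Writable contract, where every byte goes through Write and then
// exactly one of Finish or Abort is called.
func writeAll(w Writable, chunks [][]byte) error {
	for _, c := range chunks {
		// Write may modify or retain c, so callers should not reuse the slice.
		if err := w.Write(c); err != nil {
			// Give up on the object; it may or may not exist afterwards.
			w.Abort()
			return err
		}
	}
	// Finish makes the data durable; no further calls are allowed.
	return w.Finish()
}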

// ObjectMetadata contains the metadata required to be able to access an object.
type ObjectMetadata struct {
	DiskFileNum base.DiskFileNum
	FileType    base.FileType

	// The fields below are only set if the object is on remote storage.
	Remote struct {
		// CreatorID identifies the DB instance that originally created the object.
		//
		// Only used when CustomObjectName is not set.
		CreatorID CreatorID
		// CreatorFileNum is the identifier for the object within the context of the
		// DB instance that originally created the object.
		//
		// Only used when CustomObjectName is not set.
		CreatorFileNum base.DiskFileNum
		// CustomObjectName (if it is set) overrides the object name that is normally
		// derived from the CreatorID and CreatorFileNum.
		CustomObjectName string
		// CleanupMethod indicates the method for cleaning up unused shared objects.
		CleanupMethod SharedCleanupMethod
		// Locator identifies the remote.Storage implementation for this object.
		Locator remote.Locator
		// Storage is the remote.Storage object corresponding to the Locator. Used
		// to avoid lookups in hot paths.
		Storage remote.Storage
	}
}

// IsRemote returns true if the object is on remote storage.
func (meta *ObjectMetadata) IsRemote() bool {
	return meta.IsShared() || meta.IsExternal()
}

// IsExternal returns true if the object is on remote storage but is not owned
// by any Pebble instances in the cluster.
func (meta *ObjectMetadata) IsExternal() bool {
	return meta.Remote.CustomObjectName != ""
}

// IsShared returns true if the object is on remote storage and is owned by a
// Pebble instance in the cluster (potentially shared between multiple
// instances).
func (meta *ObjectMetadata) IsShared() bool {
	return meta.Remote.CreatorID.IsSet()
}

// AssertValid checks that the metadata is sane.
func (meta *ObjectMetadata) AssertValid() {
	if !meta.IsRemote() {
		// Verify all Remote fields are empty.
		if meta.Remote != (ObjectMetadata{}).Remote {
			panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote))
		}
	} else {
		if meta.Remote.CustomObjectName == "" {
			if meta.Remote.CreatorID == 0 {
				panic(errors.AssertionFailedf("CreatorID not set"))
			}
			if meta.Remote.CreatorFileNum == base.FileNum(0).DiskFileNum() {
				panic(errors.AssertionFailedf("CreatorFileNum not set"))
			}
		}
		if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking {
			panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod))
		}
		if meta.Remote.Storage == nil {
			panic(errors.AssertionFailedf("Storage not set"))
		}
	}
}

// CreatorID identifies the DB instance that originally created a shared object.
// This ID is incorporated in backing object names.
// Must be non-zero.
type CreatorID uint64

// IsSet returns true if the CreatorID is not zero.
func (c CreatorID) IsSet() bool { return c != 0 }

func (c CreatorID) String() string { return fmt.Sprintf("%d", c) }
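
// placementLabel is an illustrative sketch (hypothetical helper, not part of
// the package): the accessors above partition objects into three placements.
// The string labels are made up for the example.
func placementLabel(meta *ObjectMetadata) string {
	switch {
	case meta.IsExternal():
		// Remote, named via CustomObjectName, managed outside Pebble.
		return "external"
	case meta.IsShared():
		// Remote and owned by some Pebble instance (CreatorID is set).
		return "shared"
	default:
		// Local, backed by a vfs.File.
		return "local"
	}
}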

// SafeFormat implements redact.SafeFormatter.
func (c CreatorID) SafeFormat(w redact.SafePrinter, _ rune) {
	w.Printf("%d", redact.SafeUint(c))
}

// SharedCleanupMethod indicates the method for cleaning up unused shared objects.
type SharedCleanupMethod uint8

const (
	// SharedRefTracking is used for shared objects for which objstorage providers
	// keep track of references via reference marker objects.
	SharedRefTracking SharedCleanupMethod = iota

	// SharedNoCleanup is used for remote objects that are managed externally; the
	// objstorage provider never deletes such objects.
	SharedNoCleanup
)

// OpenOptions contains optional arguments for OpenForReading.
type OpenOptions struct {
	// MustExist triggers a fatal error if the file does not exist. The fatal
	// error message contains extra information helpful for debugging.
	MustExist bool
}

// CreateOptions contains optional arguments for Create.
type CreateOptions struct {
	// PreferSharedStorage causes the object to be created on shared storage if
	// the provider has shared storage configured.
	PreferSharedStorage bool

	// SharedCleanupMethod is used for the object when it is created on shared storage.
	// The default (zero) value is SharedRefTracking.
	SharedCleanupMethod SharedCleanupMethod
}
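
// createDurably is an illustrative sketch (hypothetical helper, assuming a
// configured Provider as defined below): create an object with the options
// above, write its contents, and make both the data and the provider metadata
// durable.
func createDurably(
	ctx context.Context, p Provider, ft base.FileType, num base.DiskFileNum, data []byte,
) (ObjectMetadata, error) {
	w, meta, err := p.Create(ctx, ft, num, CreateOptions{PreferSharedStorage: true})
	if err != nil {
		return ObjectMetadata{}, err
	}
	if err := w.Write(data); err != nil {
		w.Abort()
		return ObjectMetadata{}, err
	}
	if err := w.Finish(); err != nil {
		return ObjectMetadata{}, err
	}
	// Sync flushes the provider metadata so the object remains known after a crash.
	return meta, p.Sync()
}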

// Provider is a singleton object used to access and manage objects.
//
// An object is conceptually like a large immutable file. The main use of
// objects is for storing sstables; in the future it could also be used for blob
// storage.
//
// The Provider can only manage objects that it knows about - either objects
// created by the provider, or existing objects the Provider was informed about
// via AttachRemoteObjects.
//
// Objects are currently backed by a vfs.File or a remote.Storage object.
type Provider interface {
	// OpenForReading opens an existing object.
	OpenForReading(
		ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions,
	) (Readable, error)

	// Create creates a new object and opens it for writing.
	//
	// The object is not guaranteed to be durable (accessible in case of crashes)
	// until Sync is called.
	Create(
		ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions,
	) (w Writable, meta ObjectMetadata, err error)

	// Remove removes an object.
	//
	// The object is not guaranteed to be durably removed until Sync is called.
	Remove(fileType base.FileType, FileNum base.DiskFileNum) error

	// Sync flushes the metadata from creation or removal of objects since the last Sync.
	// This includes objects that have been Created but for which
	// Writable.Finish() has not yet been called.
	Sync() error

	// LinkOrCopyFromLocal creates a new object that is either a copy of a given
	// local file or a hard link (if the new object is created on the same FS, and
	// if the FS supports it).
	//
	// The object is not guaranteed to be durable (accessible in case of crashes)
	// until Sync is called.
	LinkOrCopyFromLocal(
		ctx context.Context,
		srcFS vfs.FS,
		srcFilePath string,
		dstFileType base.FileType,
		dstFileNum base.DiskFileNum,
		opts CreateOptions,
	) (ObjectMetadata, error)

	// Lookup returns the metadata of an object that is already known to the Provider.
	// Does not perform any I/O.
	Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error)

	// Path returns an internal, implementation-dependent path for the object. It is
	// meant to be used for informational purposes (like logging).
	Path(meta ObjectMetadata) string

	// Size returns the size of the object.
	Size(meta ObjectMetadata) (int64, error)

	// List returns the objects currently known to the provider. Does not perform any I/O.
	List() []ObjectMetadata

	// SetCreatorID sets the CreatorID which is needed in order to use shared
	// objects. Remote object usage is disabled until this method is called the
	// first time. Once set, the Creator ID is persisted and cannot change.
	//
	// Cannot be called if shared storage is not configured for the provider.
	SetCreatorID(creatorID CreatorID) error

	// IsSharedForeign returns whether this object is owned by a different node.
	IsSharedForeign(meta ObjectMetadata) bool

	// RemoteObjectBacking encodes the remote object metadata for the given object.
	RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error)

	// CreateExternalObjectBacking creates a backing for an existing object with a
	// custom object name. The object is considered to be managed outside of
	// Pebble and will never be removed by Pebble.
	CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error)

	// AttachRemoteObjects registers existing remote objects with this provider.
	AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error)

	Close() error

	// IsNotExistError indicates whether the error is known to report that a file or
	// directory does not exist.
	IsNotExistError(err error) bool

	// Metrics returns metrics about objstorage. Currently, it only returns metrics
	// about the shared cache.
	Metrics() sharedcache.Metrics
}

// RemoteObjectBacking encodes the metadata necessary to incorporate a shared
// object into a different Pebble instance. The encoding is specific to a given
// Provider implementation.
type RemoteObjectBacking []byte

// RemoteObjectBackingHandle is a container for a RemoteObjectBacking which
// ensures that the backing stays valid. A backing can otherwise become invalid
// if this provider unrefs the shared object. The RemoteObjectBackingHandle
// delays any unref until Close.
type RemoteObjectBackingHandle interface {
	// Get returns the backing. The backing is only guaranteed to be valid until
	// Close is called (or until the Provider is closed). If Close was already
	// called, returns an error.
	Get() (RemoteObjectBacking, error)
	Close()
}
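
// copyBacking is an illustrative sketch (hypothetical helper, not part of the
// package): the handle keeps the backing valid only until Close, so the bytes
// are copied out before releasing it.
func copyBacking(p Provider, meta *ObjectMetadata) (RemoteObjectBacking, error) {
	h, err := p.RemoteObjectBacking(meta)
	if err != nil {
		return nil, err
	}
	defer h.Close()
	b, err := h.Get()
	if err != nil {
		return nil, err
	}
	// Copy into a fresh slice so the result outlives the handle.
	return append(RemoteObjectBacking(nil), b...), nil
}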

// RemoteObjectToAttach contains the arguments needed to attach an existing remote object.
type RemoteObjectToAttach struct {
	// FileNum is the file number that will be used to refer to this object (in
	// the context of this instance).
	FileNum  base.DiskFileNum
	FileType base.FileType
	// Backing contains the metadata for the remote object backing (normally
	// generated from a different instance, but using the same Provider
	// implementation).
	Backing RemoteObjectBacking
}
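
// attachBackings is an illustrative sketch (hypothetical helper, not part of
// the package): it pairs caller-assigned file numbers with backings produced
// by another instance's Provider and registers them via AttachRemoteObjects.
// nums and backings are assumed to have the same length.
func attachBackings(
	p Provider, fileType base.FileType, nums []base.DiskFileNum, backings []RemoteObjectBacking,
) ([]ObjectMetadata, error) {
	objs := make([]RemoteObjectToAttach, len(backings))
	for i, b := range backings {
		objs[i] = RemoteObjectToAttach{
			FileNum:  nums[i],
			FileType: fileType,
			Backing:  b,
		}
	}
	return p.AttachRemoteObjects(objs)
}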