Line data Source code
1 : // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package objstorage
6 :
7 : import (
8 : "context"
9 : "fmt"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
14 : "github.com/cockroachdb/pebble/objstorage/remote"
15 : "github.com/cockroachdb/pebble/vfs"
16 : "github.com/cockroachdb/redact"
17 : )
18 :
19 : // Readable is the handle for an object that is open for reading.
// It supports direct positional reads (ReadAt) and can hand out ReadHandles
// for groups of related reads (NewReadHandle).
20 : type Readable interface {
21 : // ReadAt reads exactly len(p) bytes into p, starting at offset off.
22 : //
23 : // Does not return partial results; if off + len(p) is past the end of the
24 : // object, an error is returned.
25 : //
26 : // Concurrent ReadAt calls on the same Readable are allowed (unlike
27 : // ReadHandle.ReadAt, which does not allow parallel calls).
28 : ReadAt(ctx context.Context, p []byte, off int64) error
29 :
// Close releases the Readable. Every ReadHandle created through
// NewReadHandle must have been closed before Close is called (see
// NewReadHandle).
30 : Close() error
31 :
32 : // Size returns the size of the object.
33 : Size() int64
34 :
35 : // NewReadHandle creates a read handle for ReadAt requests that are related
36 : // and can benefit from optimizations like read-ahead.
37 : //
// readBeforeSize is a suggestion for how the first read through the handle
// should be padded; see ReadBeforeSize.
//
38 : // The ReadHandle must be closed before the Readable is closed.
39 : //
40 : // Multiple separate ReadHandles can be used.
41 : NewReadHandle(ctx context.Context, readBeforeSize ReadBeforeSize) ReadHandle
42 : }
43 :
44 : // ReadBeforeSize specifies whether the first read should read additional
45 : // bytes before the offset, and how big the overall read should be. This is
46 : // just a suggestion that the callee can ignore (and does ignore in
47 : // fileReadable). It is passed to Readable.NewReadHandle.
48 : //
49 : // When 0, the first read will only read what it is asked to read, say n
50 : // bytes. When it is a value b > 0, if b > n, then the read will be padded by
51 : // an additional b-n bytes to the left (i.e. before the requested offset),
52 : // resulting in an overall read size of b. This behavior is akin to what the
53 : // read-ahead implementation does -- when the n bytes are not buffered, and
54 : // there is read-ahead of b > n, the read length is b bytes.
//
// NOTE(review): the b <= n case is not spelled out here; presumably the read
// is then left unpadded -- confirm against the implementations.
55 : type ReadBeforeSize int64
56 :
// Suggested ReadBeforeSize values.
//
// Only NoReadBefore is declared with an explicit ReadBeforeSize type; the
// other two are untyped integer constants, which can still be used directly
// wherever a ReadBeforeSize is expected.
57 : const (
58 : // NoReadBefore specifies no read-before.
59 : NoReadBefore ReadBeforeSize = 0
60 : // ReadBeforeForNewReader is used for a new Reader reading the footer,
61 : // metaindex, properties. 32KB is unnecessarily large, but it is still small
62 : // when considering remote object storage.
63 : ReadBeforeForNewReader = 32 * 1024
64 : // ReadBeforeForIndexAndFilter is used for an iterator reading the top-level
65 : // index, filter and second-level index blocks.
66 : //
67 : // Consider a 128MB sstable with 32KB blocks, so 4K blocks. Say keys are
68 : // ~100 bytes, then the size of the index blocks is ~400KB (4K entries of
69 : // ~100 bytes each). 512KB is a bit bigger, and not too large to be a memory
// concern.
70 : ReadBeforeForIndexAndFilter = 512 * 1024
71 : )
72 :
73 : // ReadHandle is used to perform reads that are related and might benefit from
74 : // optimizations like read-ahead.
//
// ReadHandles are created with Readable.NewReadHandle and must be closed
// before the Readable they came from is closed.
75 : type ReadHandle interface {
76 : // ReadAt reads exactly len(p) bytes into p, starting at offset off.
77 : //
78 : // Does not return partial results; if off + len(p) is past the end of the
79 : // object, an error is returned.
80 : //
81 : // Parallel ReadAt calls on the same ReadHandle are not allowed.
82 : ReadAt(ctx context.Context, p []byte, off int64) error
83 :
// Close releases the read handle. It must be called before the Readable
// that created the handle is closed.
84 : Close() error
85 :
86 : // SetupForCompaction informs the implementation that the read handle will
87 : // be used to read data blocks for a compaction. The implementation can expect
88 : // sequential reads, and can decide to not retain data in any caches.
89 : SetupForCompaction()
90 :
91 : // RecordCacheHit informs the implementation that we were able to retrieve a
92 : // block from cache. This is useful for example when the implementation is
93 : // trying to detect a sequential reading pattern.
//
// NOTE(review): offset and size presumably describe the block that was
// served from cache, in the same units as ReadAt offsets -- confirm.
94 : RecordCacheHit(ctx context.Context, offset, size int64)
95 : }
96 :
97 : // Writable is the handle for an object that is open for writing.
98 : // Either Finish or Abort must be called, and whichever is called is the last
// call made on the Writable.
99 : type Writable interface {
100 : // Write writes len(p) bytes from p to the underlying object. The data is not
101 : // guaranteed to be durable until Finish is called.
102 : //
103 : // Note that Write *is* allowed to modify the slice passed in, whether
104 : // temporarily or permanently. Callers of Write need to take this into
105 : // account (i.e. the contents of p must be treated as undefined once Write
// has been called).
106 : Write(p []byte) error
107 :
108 : // Finish completes the object and makes the data durable.
109 : // No further calls are allowed after calling Finish.
110 : Finish() error
111 :
112 : // Abort gives up on finishing the object. There is no guarantee about whether
113 : // the object exists after calling Abort.
114 : // No further calls are allowed after calling Abort.
115 : Abort()
116 : }
117 :
118 : // ObjectMetadata contains the metadata required to be able to access an object.
119 : type ObjectMetadata struct {
// DiskFileNum and FileType identify the object.
// NOTE(review): presumably the same (fileType, fileNum) pair that
// Provider.Lookup is called with -- confirm.
120 : DiskFileNum base.DiskFileNum
121 : FileType base.FileType
122 :
123 : // The fields below are only set if the object is on remote storage.
// AssertValid enforces this: for an object that is not remote, Remote must
// equal its zero value.
124 : Remote struct {
125 : // CreatorID identifies the DB instance that originally created the object.
// IsShared reports true exactly when this field is set (non-zero).
126 : //
127 : // Only used when CustomObjectName is not set.
128 : CreatorID CreatorID
129 : // CreatorFileNum is the identifier for the object within the context of the
130 : // DB instance that originally created the object.
131 : //
132 : // Only used when CustomObjectName is not set.
133 : CreatorFileNum base.DiskFileNum
134 : // CustomObjectName (if it is set) overrides the object name that is normally
135 : // derived from the CreatorID and CreatorFileNum.
// IsExternal reports true exactly when this field is non-empty.
136 : CustomObjectName string
137 : // CleanupMethod indicates the method for cleaning up unused shared objects.
138 : CleanupMethod SharedCleanupMethod
139 : // Locator identifies the remote.Storage implementation for this object.
140 : Locator remote.Locator
141 : // Storage is the remote.Storage object corresponding to the Locator. Used
142 : // to avoid lookups in hot paths.
// AssertValid requires it to be non-nil for remote objects.
143 : Storage remote.Storage
144 : }
145 : }
146 :
147 : // IsRemote returns true if the object is on remote storage.
148 1 : func (meta *ObjectMetadata) IsRemote() bool {
149 1 : return meta.IsShared() || meta.IsExternal()
150 1 : }
151 :
152 : // IsExternal returns true if the object is on remote storage but is not owned
153 : // by any Pebble instances in the cluster.
154 1 : func (meta *ObjectMetadata) IsExternal() bool {
155 1 : return meta.Remote.CustomObjectName != ""
156 1 : }
157 :
158 : // IsShared returns true if the object is on remote storage and is owned by a
159 : // Pebble instance in the cluster (potentially shared between multiple
160 : // instances).
161 1 : func (meta *ObjectMetadata) IsShared() bool {
162 1 : return meta.Remote.CreatorID.IsSet()
163 1 : }
164 :
165 : // AssertValid checks that the metadata is sane.
166 1 : func (meta *ObjectMetadata) AssertValid() {
167 1 : if !meta.IsRemote() {
168 1 : // Verify all Remote fields are empty.
169 1 : if meta.Remote != (ObjectMetadata{}).Remote {
170 0 : panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote))
171 : }
172 1 : } else {
173 1 : if meta.Remote.CustomObjectName == "" {
174 1 : if meta.Remote.CreatorID == 0 {
175 0 : panic(errors.AssertionFailedf("CreatorID not set"))
176 : }
177 1 : if meta.Remote.CreatorFileNum == 0 {
178 0 : panic(errors.AssertionFailedf("CreatorFileNum not set"))
179 : }
180 : }
181 1 : if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking {
182 0 : panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod))
183 : }
184 1 : if meta.Remote.Storage == nil {
185 0 : panic(errors.AssertionFailedf("Storage not set"))
186 : }
187 : }
188 : }
189 :
// CreatorID identifies the DB instance that originally created a shared
// object. The ID is incorporated in backing object names.
//
// A valid CreatorID is non-zero; the zero value means "not set".
type CreatorID uint64

// IsSet reports whether the CreatorID holds a non-zero value.
func (c CreatorID) IsSet() bool {
	return c > 0
}

// String returns the ID formatted as a decimal number.
func (c CreatorID) String() string {
	// Format the underlying integer explicitly so that it is obvious that
	// this method does not end up calling itself through fmt.
	return fmt.Sprintf("%d", uint64(c))
}
199 :
200 : // SafeFormat implements redact.SafeFormatter.
201 0 : func (c CreatorID) SafeFormat(w redact.SafePrinter, _ rune) {
202 0 : w.Printf("%d", redact.SafeUint(c))
203 0 : }
204 :
205 : // SharedCleanupMethod indicates the method for cleaning up unused shared objects.
206 : type SharedCleanupMethod uint8
207 :
208 : const (
209 : // SharedRefTracking is used for shared objects for which objstorage providers
210 : // keep track of references via reference marker objects.
// It is the zero value of SharedCleanupMethod (iota starts at 0), and
// therefore what an unset CreateOptions.SharedCleanupMethod means.
211 : SharedRefTracking SharedCleanupMethod = iota
212 :
213 : // SharedNoCleanup is used for remote objects that are managed externally; the
214 : // objstorage provider never deletes such objects.
215 : SharedNoCleanup
216 : )
217 :
218 : // OpenOptions contains optional arguments for Provider.OpenForReading.
// The zero value is a valid set of options with every option disabled.
219 : type OpenOptions struct {
220 : // MustExist triggers a fatal error if the file does not exist. The fatal
221 : // error message contains extra information helpful for debugging.
222 : MustExist bool
223 : }
224 :
225 : // CreateOptions contains optional arguments for Provider.Create. The same
// options are also accepted by Provider.LinkOrCopyFromLocal.
226 : type CreateOptions struct {
227 : // PreferSharedStorage causes the object to be created on shared storage if
228 : // the provider has shared storage configured.
229 : PreferSharedStorage bool
230 :
231 : // SharedCleanupMethod is used for the object when it is created on shared storage.
232 : // The default (zero) value is SharedRefTracking.
233 : SharedCleanupMethod SharedCleanupMethod
234 :
235 : // WriteCategory is used for the object when it is created on local storage
236 : // to collect aggregated write metrics for each write source.
237 : WriteCategory vfs.DiskWriteCategory
238 : }
239 :
240 : // Provider is a singleton object used to access and manage objects.
241 : //
242 : // An object is conceptually like a large immutable file. The main use of
243 : // objects is for storing sstables; in the future it could also be used for blob
244 : // storage.
245 : //
246 : // The Provider can only manage objects that it knows about - either objects
247 : // created by the provider, or existing objects the Provider was informed about
248 : // via AddObjects.
249 : //
250 : // Objects are currently backed by a vfs.File or a remote.Storage object.
251 : type Provider interface {
252 : // OpenForReading opens an existing object.
253 : OpenForReading(
254 : ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions,
255 : ) (Readable, error)
256 :
257 : // Create creates a new object and opens it for writing.
258 : //
259 : // The object is not guaranteed to be durable (accessible in case of crashes)
260 : // until Sync is called.
261 : Create(
262 : ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions,
263 : ) (w Writable, meta ObjectMetadata, err error)
264 :
265 : // Remove removes an object.
266 : //
267 : // The object is not guaranteed to be durably removed until Sync is called.
268 : Remove(fileType base.FileType, FileNum base.DiskFileNum) error
269 :
270 : // Sync flushes the metadata from creation or removal of objects since the last Sync.
271 : // This includes objects that have been Created but for which
272 : // Writable.Finish() has not yet been called.
273 : Sync() error
274 :
275 : // LinkOrCopyFromLocal creates a new object that is either a copy of a given
276 : // local file or a hard link (if the new object is created on the same FS, and
277 : // if the FS supports it).
278 : //
279 : // The object is not guaranteed to be durable (accessible in case of crashes)
280 : // until Sync is called.
281 : LinkOrCopyFromLocal(
282 : ctx context.Context,
283 : srcFS vfs.FS,
284 : srcFilePath string,
285 : dstFileType base.FileType,
286 : dstFileNum base.DiskFileNum,
287 : opts CreateOptions,
288 : ) (ObjectMetadata, error)
289 :
290 : // Lookup returns the metadata of an object that is already known to the Provider.
291 : // Does not perform any I/O.
292 : Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error)
293 :
294 : // Path returns an internal, implementation-dependent path for the object. It is
295 : // meant to be used for informational purposes (like logging).
296 : Path(meta ObjectMetadata) string
297 :
298 : // Size returns the size of the object.
299 : Size(meta ObjectMetadata) (int64, error)
300 :
301 : // List returns the objects currently known to the provider. Does not perform any I/O.
302 : List() []ObjectMetadata
303 :
304 : // SetCreatorID sets the CreatorID which is needed in order to use shared
305 : // objects. Remote object usage is disabled until this method is called the
306 : // first time. Once set, the Creator ID is persisted and cannot change.
307 : //
308 : // Cannot be called if shared storage is not configured for the provider.
309 : SetCreatorID(creatorID CreatorID) error
310 :
311 : // IsSharedForeign returns whether this object is owned by a different node.
312 : IsSharedForeign(meta ObjectMetadata) bool
313 :
314 : // RemoteObjectBacking encodes the remote object metadata for the given object.
315 : RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error)
316 :
317 : // CreateExternalObjectBacking creates a backing for an existing object with a
318 : // custom object name. The object is considered to be managed outside of
319 : // Pebble and will never be removed by Pebble.
320 : CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error)
321 :
322 : // GetExternalObjects returns a list of DiskFileNums corresponding to all
323 : // objects that are backed by the given external object.
324 : GetExternalObjects(locator remote.Locator, objName string) []base.DiskFileNum
325 :
326 : // AttachRemoteObjects registers existing remote objects with this provider.
327 : //
328 : // The objects are not guaranteed to be durable (accessible in case of
329 : // crashes) until Sync is called.
330 : AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error)
331 :
332 : Close() error
333 :
334 : // IsNotExistError indicates whether the error is known to report that a file or
335 : // directory does not exist.
336 : IsNotExistError(err error) bool
337 :
338 : // CheckpointState saves any saved state on local disk to the specified
339 : // directory on the specified VFS. A new Pebble instance instantiated at that
340 : // path should be able to resolve references to the specified files.
341 : CheckpointState(fs vfs.FS, dir string, fileType base.FileType, fileNums []base.DiskFileNum) error
342 :
343 : // Metrics returns metrics about objstorage. Currently, it only returns metrics
344 : // about the shared cache.
345 : Metrics() sharedcache.Metrics
346 : }
347 :
348 : // RemoteObjectBacking encodes the metadata necessary to incorporate a shared
349 : // object into a different Pebble instance. The encoding is specific to a given
350 : // Provider implementation.
//
// A backing is obtained through a RemoteObjectBackingHandle (see
// Provider.RemoteObjectBacking) or from Provider.CreateExternalObjectBacking,
// and is consumed via RemoteObjectToAttach.
351 : type RemoteObjectBacking []byte
352 :
353 : // RemoteObjectBackingHandle is a container for a RemoteObjectBacking which
354 : // ensures that the backing stays valid. A backing can otherwise become invalid
355 : // if this provider unrefs the shared object. The RemoteObjectBackingHandle
356 : // delays any unref until Close.
357 : type RemoteObjectBackingHandle interface {
358 : // Get returns the backing. The backing is only guaranteed to be valid until
359 : // Close is called (or until the Provider is closed). If Close was already
360 : // called, returns an error.
361 : Get() (RemoteObjectBacking, error)
// Close releases the handle. Any unref that was delayed by the handle may
// take effect from this point on, so a backing previously returned by Get is
// no longer guaranteed to be valid.
362 : Close()
363 : }
364 :
365 : // RemoteObjectToAttach contains the arguments needed to attach an existing remote object.
// It is the element type of the slice passed to Provider.AttachRemoteObjects.
366 : type RemoteObjectToAttach struct {
367 : // FileNum is the file number that will be used to refer to this object (in
368 : // the context of this instance).
369 : FileNum base.DiskFileNum
// FileType is the type of the object being attached.
// NOTE(review): presumably this becomes ObjectMetadata.FileType for the
// attached object -- confirm.
370 : FileType base.FileType
371 : // Backing contains the metadata for the remote object backing (normally
372 : // generated from a different instance, but using the same Provider
373 : // implementation).
374 : Backing RemoteObjectBacking
375 : }
376 :
377 : // Copy copies the specified range from the input to the output.
378 1 : func Copy(ctx context.Context, in Readable, out Writable, offset, length uint64) error {
379 1 : r := in.NewReadHandle(ctx, NoReadBefore)
380 1 : r.SetupForCompaction()
381 1 : buf := make([]byte, 256<<10)
382 1 : end := offset + length
383 1 : for offset < end {
384 1 : n := min(end-offset, uint64(len(buf)))
385 1 : if n == 0 {
386 0 : break
387 : }
388 1 : readErr := r.ReadAt(ctx, buf[:n], int64(offset))
389 1 : if readErr != nil {
390 0 : return readErr
391 0 : }
392 1 : offset += n
393 1 : if err := out.Write(buf[:n]); err != nil {
394 0 : return err
395 0 : }
396 : }
397 1 : return nil
398 : }
399 :
400 : // IsLocalTable returns true if a table with the given fileNum exists and is
401 : // local.
402 1 : func IsLocalTable(provider Provider, fileNum base.DiskFileNum) bool {
403 1 : meta, err := provider.Lookup(base.FileTypeTable, fileNum)
404 1 : return err == nil && !meta.IsRemote()
405 1 : }
406 :
407 : // IsExternalTable returns true if a table with the given fileNum exists and is
408 : // external.
409 1 : func IsExternalTable(provider Provider, fileNum base.DiskFileNum) bool {
410 1 : meta, err := provider.Lookup(base.FileTypeTable, fileNum)
411 1 : return err == nil && meta.IsExternal()
412 1 : }
|