Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "fmt"
9 : "strconv"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/manifest"
13 : "github.com/cockroachdb/pebble/objstorage/remote"
14 : "github.com/cockroachdb/pebble/sstable"
15 : "github.com/cockroachdb/pebble/sstable/blob"
16 : "github.com/cockroachdb/pebble/vfs"
17 : "github.com/cockroachdb/pebble/vfs/atomicfs"
18 : )
19 :
// FormatMajorVersion identifies the format of the data a database persists.
// Any backwards-incompatible change to a durable format is introduced behind
// a new format major version.
//
// A database's format major version may be raised at any time. Once it has
// been raised, earlier versions of Pebble refuse to open the database.
//
// The zero value is the FormatDefault constant. The concrete version that
// FormatDefault resolves to may change over time.
type FormatMajorVersion uint64

// SafeValue implements redact.SafeValue.
func (FormatMajorVersion) SafeValue() {}

// String implements fmt.Stringer.
//
// NB: The output must not change: it is used as the value of the on-disk
// version marker file. Specifically, it must always parse as a base 10
// integer that fits in a uint64. Today the value is zero-padded to three
// digits, but the amount of padding may change.
func (v FormatMajorVersion) String() string {
	const markerFormat = "%03d"
	return fmt.Sprintf(markerFormat, uint64(v))
}
45 :
// The format major versions, in ascending order. Values are assigned with
// iota, so every entry in this block (including the blank `_` placeholders
// left behind by retired versions) occupies one numeric slot. The
// placeholders must stay in place so that the remaining constants keep their
// numeric values.
const (
	// FormatDefault leaves the format version unspecified. When used to create a
	// new store, Pebble will choose the earliest format version it supports.
	FormatDefault FormatMajorVersion = iota

	// 21.2 versions.

	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	// Deprecated.
	_ // FormatMostCompatible

	// formatVersionedManifestMarker is the first
	// backwards-incompatible change made to Pebble, introducing the
	// format-version marker file for handling backwards-incompatible
	// changes more broadly, and replacing the `CURRENT` file with a
	// marker file.
	//
	// This format version is intended as an intermediary version state.
	// It is deliberately unexported to discourage direct use of this
	// format major version. Clients should use FormatVersioned which
	// also ensures earlier versions of Pebble fail to open a database
	// written in a future format major version.
	// Deprecated.
	_ // formatVersionedManifestMarker

	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	// Deprecated.
	_ // FormatVersioned

	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	// Deprecated.
	_ // FormatSetWithDelete

	// 22.1 versions.

	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	// Deprecated.
	_ // FormatBlockPropertyCollector

	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	// Deprecated.
	_ // FormatSplitUserKeysMarked

	// 22.2 versions.

	// FormatSplitUserKeysMarkedCompacted is a format major version that
	// guarantees that all files explicitly marked for compaction in the manifest
	// have been compacted. Combined with the FormatSplitUserKeysMarked format
	// major version, this version guarantees that there are no user keys split
	// across multiple files within a level L1+. Ratcheting to this format version
	// will block (without holding mutexes) until all necessary compactions for
	// files marked for compaction are complete.
	// Deprecated.
	_ // FormatSplitUserKeysMarkedCompacted

	// FormatRangeKeys is a format major version that introduces range keys.
	// Deprecated.
	_ // FormatRangeKeys

	// FormatMinTableFormatPebblev1 is a format major version that guarantees that
	// tables created by or ingested into the DB at or above this format major
	// version will have a table format version of at least Pebblev1 (Block
	// Properties).
	// Deprecated.
	_ // FormatMinTableFormatPebblev1

	// FormatPrePebblev1Marked is a format major version that guarantees that all
	// sstables with a table format version pre-Pebblev1 (i.e. those that are
	// guaranteed to not contain block properties) are marked for compaction in
	// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
	// holding mutexes) until the scan of the LSM is complete and the manifest has
	// been rotated.
	// Deprecated.
	_ // FormatPrePebblev1Marked

	// 23.1 versions.

	// formatUnusedPrePebblev1MarkedCompacted is an unused format major version.
	// This format major version was originally intended to ship in the 23.1
	// release. It was later decided that this should be deferred until a
	// subsequent release. The original ordering is preserved so as not to
	// introduce breaking changes in Cockroach.
	_ // formatUnusedPrePebblev1MarkedCompacted

	// FormatSSTableValueBlocks is a format major version that adds support for
	// storing values in value blocks in the sstable. Value block support is not
	// necessarily enabled when writing sstables, when running with this format
	// major version.
	_ // FormatSSTableValueBlocks

	// FormatFlushableIngest is a format major version that enables lazy
	// addition of ingested sstables into the LSM structure. When an ingest
	// overlaps with a memtable, a record of the ingest is written to the WAL
	// without waiting for a flush. Subsequent reads treat the ingested files as
	// a level above the overlapping memtable. Once the memtable is flushed, the
	// ingested files are moved into the lowest possible levels.
	//
	// This feature is behind a format major version because it required
	// breaking changes to the WAL format.
	FormatFlushableIngest

	// 23.2 versions.

	// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
	// that all sstables explicitly marked for compaction in the manifest (see
	// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
	// version will block (without holding mutexes) until all necessary
	// compactions for files marked for compaction are complete.
	FormatPrePebblev1MarkedCompacted

	// FormatDeleteSizedAndObsolete is a format major version that adds support
	// for deletion tombstones that encode the size of the value they're
	// expected to delete. This format major version is required before the
	// associated key kind may be committed through batch applications or
	// ingests. It also adds support for keys that are marked obsolete (see
	// sstable/format.go for details).
	FormatDeleteSizedAndObsolete

	// FormatVirtualSSTables is a format major version that adds support for
	// virtual sstables that can reference a sub-range of keys in an underlying
	// physical sstable. This information is persisted through new,
	// backward-incompatible fields in the Manifest, and therefore requires
	// a format major version.
	FormatVirtualSSTables

	// FormatSyntheticPrefixSuffix is a format major version that adds support for
	// sstables to have their content exposed in a different prefix or suffix of
	// keyspace than the actual prefix/suffix persisted in the keys in such
	// sstables. The prefix and suffix replacement information is stored in new
	// fields in the Manifest and thus requires a format major version.
	FormatSyntheticPrefixSuffix

	// FormatFlushableIngestExcises is a format major version that adds support for
	// having excises unconditionally being written as flushable ingestions. This
	// is implemented through adding a new key kind that can go in the same batches
	// as flushable ingested sstables.
	FormatFlushableIngestExcises

	// FormatColumnarBlocks is a format major version enabling use of the
	// TableFormatPebblev5 table format, that encodes sstable data blocks, index
	// blocks and keyspan blocks by organizing the KVs into columns within the
	// block.
	FormatColumnarBlocks

	// FormatWALSyncChunks is a format major version enabling the writing of
	// WAL sync chunks. These new chunks are used to disambiguate between corruption
	// and logical EOF during WAL replay. This is implemented by adding a new
	// chunk wire format that encodes an additional "Synced Offset" field which acts
	// as a commitment that the WAL should have been synced up until the offset.
	FormatWALSyncChunks

	// FormatTableFormatV6 is a format major version enabling the sstable table
	// format TableFormatPebblev6.
	//
	// The TableFormatPebblev6 sstable format introduces a checksum within the
	// sstable footer, allows inclusion of blob handle references within the
	// value column of a sstable block, and supports columnar meta index +
	// properties blocks.
	//
	// This format major version does not yet enable use of value separation.
	FormatTableFormatV6

	// formatDeprecatedExperimentalValueSeparation was used to enable an
	// experimental version of value separation, separating values into external
	// blob files that do not participate in every compaction.
	//
	// Value separation now depends on TableFormatPebblev7 which this format
	// major version precedes. This format major version is deprecated and
	// unexported, and value separation now requires FormatValueSeparation.
	formatDeprecatedExperimentalValueSeparation

	// formatFooterAttributes is a format major version that adds support for
	// writing sstable.Attributes in the footer of sstables.
	formatFooterAttributes

	// FormatValueSeparation is a format major version that adds support for
	// value separation, separating values into external blob files that do not
	// participate in every compaction.
	FormatValueSeparation

	// FormatExciseBoundsRecord is a format major version that adds support for
	// persisting excise bounds records in the manifest (VersionEdit).
	FormatExciseBoundsRecord

	// FormatV2BlobFiles is a format major version that adds support for V2 blob
	// file format (which adds compression statistics).
	FormatV2BlobFiles

	// -- Add new versions here --

	// FormatNewest is the most recent format major version.
	//
	// This entry occupies an iota slot of its own, one past the last version
	// declared above; subtracting 1 makes it an alias for that last version.
	FormatNewest FormatMajorVersion = iota - 1

	// Experimental versions, which are excluded by FormatNewest (but can be used
	// in tests) can be defined here.

	// -- Add experimental versions here --

	// internalFormatNewest is the most recent, possibly experimental format major
	// version.
	//
	// This entry and FormatNewest each occupy an iota slot without introducing
	// a new version, hence the subtraction of 2. With no experimental versions
	// declared, internalFormatNewest equals FormatNewest.
	internalFormatNewest FormatMajorVersion = iota - 2
)
260 :
// FormatMinSupported is the minimum format version that is supported by this
// Pebble version.
const FormatMinSupported = FormatFlushableIngest

// FormatMinForSharedObjects is the minimum format version that supports shared
// objects (see CreateOnShared option).
const FormatMinForSharedObjects = FormatVirtualSSTables
268 :
269 : // resolveDefault asserts that the given version is supported, and returns the
270 : // given version, replacing FormatDefault with FormatMinSupported.
271 2 : func (v FormatMajorVersion) resolveDefault() FormatMajorVersion {
272 2 : if v == FormatDefault {
273 1 : return FormatMinSupported
274 1 : }
275 2 : if v < FormatMinSupported || v > internalFormatNewest {
276 1 : panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
277 : }
278 2 : return v
279 : }
280 :
281 : // MaxTableFormat returns the maximum sstable.TableFormat that can be used at
282 : // this FormatMajorVersion.
283 2 : func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
284 2 : v = v.resolveDefault()
285 2 : switch {
286 2 : case v >= formatFooterAttributes:
287 2 : return sstable.TableFormatPebblev7
288 2 : case v >= FormatTableFormatV6:
289 2 : return sstable.TableFormatPebblev6
290 2 : case v >= FormatColumnarBlocks:
291 2 : return sstable.TableFormatPebblev5
292 2 : case v >= FormatDeleteSizedAndObsolete:
293 2 : return sstable.TableFormatPebblev4
294 2 : default:
295 2 : return sstable.TableFormatPebblev3
296 : }
297 : }
298 :
299 : // MinTableFormat returns the minimum sstable.TableFormat that can be used at
300 : // this FormatMajorVersion.
301 2 : func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
302 2 : _ = v.resolveDefault()
303 2 : return sstable.TableFormatPebblev1
304 2 : }
305 :
306 : // MaxBlobFileFormat returns the maximum blob.FileFormat that can be used at
307 : // this FormatMajorVersion. It can only be used on versions that support value
308 : // separation.
309 2 : func (v FormatMajorVersion) MaxBlobFileFormat() blob.FileFormat {
310 2 : v = v.resolveDefault()
311 2 : switch {
312 2 : case v >= FormatV2BlobFiles:
313 2 : return blob.FileFormatV2
314 2 : case v >= FormatValueSeparation:
315 2 : return blob.FileFormatV1
316 1 : default:
317 1 : panic(fmt.Sprintf("pebble: format major version %s does not support blob files", v))
318 : }
319 : }
320 :
// formatMajorVersionMigrations defines the migrations from one format
// major version to the next. Each migration is defined as a closure
// which will be invoked on the database before the new format major
// version is committed. Migrations must be idempotent. Migrations are
// invoked with d.mu locked.
//
// Each migration is responsible for invoking finalizeFormatVersUpgrade
// to set the new format major version. RatchetFormatMajorVersion
// terminates the process (via Logger.Fatalf) if a migration returns a nil
// error but fails to finalize the new format major version.
//
// The map is keyed by the version being migrated to; the ratchet loop looks
// up each intermediate version in turn.
var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
	// NOTE(review): unlike every other entry, this migration does not call
	// finalizeFormatVersUpgrade. FormatFlushableIngest is FormatMinSupported,
	// so presumably no database ever ratchets *to* it — confirm before
	// relying on this entry.
	FormatFlushableIngest: func(d *DB) error { return nil },
	FormatPrePebblev1MarkedCompacted: func(d *DB) error {
		// Before finalizing the format major version, rewrite any sstables
		// still marked for compaction. Note all format major versions
		// migrations are invoked with DB.mu locked.
		if err := d.compactMarkedFilesLocked(); err != nil {
			return err
		}
		return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted)
	},
	// The remaining migrations require no data rewrite: they only finalize
	// the new version.
	FormatDeleteSizedAndObsolete: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete)
	},
	FormatVirtualSSTables: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatVirtualSSTables)
	},
	FormatSyntheticPrefixSuffix: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatSyntheticPrefixSuffix)
	},
	FormatFlushableIngestExcises: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatFlushableIngestExcises)
	},
	FormatColumnarBlocks: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatColumnarBlocks)
	},
	FormatWALSyncChunks: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatWALSyncChunks)
	},
	FormatTableFormatV6: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatTableFormatV6)
	},
	formatDeprecatedExperimentalValueSeparation: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(formatDeprecatedExperimentalValueSeparation)
	},
	formatFooterAttributes: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(formatFooterAttributes)
	},
	FormatValueSeparation: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatValueSeparation)
	},
	FormatExciseBoundsRecord: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatExciseBoundsRecord)
	},
	FormatV2BlobFiles: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatV2BlobFiles)
	},
}
379 :
// formatVersionMarkerName is the marker name under which the database's
// format major version is recorded; lookupFormatMajorVersion passes it to
// atomicfs.LocateMarkerInListing.
const formatVersionMarkerName = `format-version`
381 :
382 : // lookupFormatMajorVersion retrieves the format version from the format version
383 : // marker file.
384 : //
385 : // If such a file does not exist, returns FormatDefault. Note that this case is
386 : // only acceptable if we are creating a new store (we no longer support
387 : // FormatMostCompatible which is the only one with no version marker file).
388 : func lookupFormatMajorVersion(
389 : fs vfs.FS, dirname string, ls []string,
390 2 : ) (FormatMajorVersion, *atomicfs.Marker, error) {
391 2 : m, versString, err := atomicfs.LocateMarkerInListing(fs, dirname, formatVersionMarkerName, ls)
392 2 : if err != nil {
393 1 : return 0, nil, err
394 1 : }
395 2 : if versString == "" {
396 2 : return FormatDefault, m, nil
397 2 : }
398 2 : v, err := strconv.ParseUint(versString, 10, 64)
399 2 : if err != nil {
400 0 : return 0, nil, errors.Wrap(err, "parsing format major version")
401 0 : }
402 2 : vers := FormatMajorVersion(v)
403 2 : if vers == FormatDefault {
404 0 : return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers)
405 0 : }
406 2 : if vers > internalFormatNewest {
407 1 : return 0, nil, errors.Newf("pebble: database %q written in unknown format major version %d", dirname, vers)
408 1 : }
409 2 : if vers < FormatMinSupported {
410 0 : return 0, nil, errors.Newf("pebble: database %q written in format major version %d which is no longer supported", dirname, vers)
411 0 : }
412 2 : return vers, m, nil
413 : }
414 :
415 : // FormatMajorVersion returns the database's active format major
416 : // version. The format major version may be higher than the one
417 : // provided in Options when the database was opened if the existing
418 : // database was written with a higher format version.
419 2 : func (d *DB) FormatMajorVersion() FormatMajorVersion {
420 2 : return FormatMajorVersion(d.mu.formatVers.vers.Load())
421 2 : }
422 :
423 : // TableFormat returns the TableFormat that the database is currently using when
424 : // writing sstables. The table format is determined by the database's format
425 : // major version, as well as experimental settings like EnableValueBlocks and
426 : // EnableColumnarBlocks.
427 2 : func (d *DB) TableFormat() sstable.TableFormat {
428 2 : // The table is typically written at the maximum allowable format implied by
429 2 : // the current format major version of the DB.
430 2 : f := d.FormatMajorVersion().MaxTableFormat()
431 2 : if f == sstable.TableFormatPebblev3 {
432 2 : // In format major versions with maximum table formats of Pebblev3,
433 2 : // value blocks were conditional on an experimental setting. In format
434 2 : // major versions with maximum table formats of Pebblev4 and higher,
435 2 : // value blocks are always enabled.
436 2 : if d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks() {
437 2 : f = sstable.TableFormatPebblev2
438 2 : }
439 : }
440 2 : return f
441 : }
442 :
443 : // BlobFileFormat returns the blob.FileFormat that the database is currently
444 : // using when writing blob files.
445 2 : func (d *DB) BlobFileFormat() blob.FileFormat {
446 2 : return d.FormatMajorVersion().MaxBlobFileFormat()
447 2 : }
448 :
449 : // shouldCreateShared returns true if the database should use shared objects
450 : // when creating new objects on the given level.
451 2 : func (d *DB) shouldCreateShared(targetLevel int) bool {
452 2 : return remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, targetLevel) &&
453 2 : d.FormatMajorVersion() >= FormatMinForSharedObjects
454 2 : }
455 :
456 : // RatchetFormatMajorVersion ratchets the opened database's format major
457 : // version to the provided version. It errors if the provided format
458 : // major version is below the database's current version. Once a
459 : // database's format major version is upgraded, previous Pebble versions
460 : // that do not know of the format version will be unable to open the
461 : // database.
462 2 : func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
463 2 : if err := d.closed.Load(); err != nil {
464 1 : panic(err)
465 : }
466 :
467 2 : d.mu.Lock()
468 2 : defer d.mu.Unlock()
469 2 : return d.ratchetFormatMajorVersionLocked(fmv)
470 : }
471 :
472 2 : func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
473 2 : if d.opts.ReadOnly {
474 0 : return ErrReadOnly
475 0 : }
476 2 : if formatVers > internalFormatNewest {
477 0 : // Guard against accidentally forgetting to update internalFormatNewest.
478 0 : return errors.Errorf("pebble: unknown format version %d", formatVers)
479 0 : }
480 2 : if currentVers := d.FormatMajorVersion(); currentVers > formatVers {
481 0 : return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
482 0 : currentVers, formatVers)
483 0 : }
484 2 : if d.mu.formatVers.ratcheting {
485 0 : return errors.Newf("pebble: database format major version upgrade is in-progress")
486 0 : }
487 2 : d.mu.formatVers.ratcheting = true
488 2 : defer func() { d.mu.formatVers.ratcheting = false }()
489 :
490 2 : for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ {
491 2 : if err := formatMajorVersionMigrations[nextVers](d); err != nil {
492 0 : return errors.Wrapf(err, "migrating to version %d", nextVers)
493 0 : }
494 :
495 : // NB: The migration is responsible for calling
496 : // finalizeFormatVersUpgrade to finalize the upgrade. This
497 : // structure is necessary because some migrations may need to
498 : // update in-memory state (without ever dropping locks) after
499 : // the upgrade is finalized. Here we assert that the upgrade
500 : // did occur.
501 2 : if d.FormatMajorVersion() != nextVers {
502 0 : d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers)
503 0 : }
504 : }
505 2 : return nil
506 : }
507 :
508 : // finalizeFormatVersUpgrade is typically only be called from within a
509 : // format major version migration.
510 : //
511 : // See formatMajorVersionMigrations.
512 2 : func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
513 2 : if err := d.writeFormatVersionMarker(formatVers); err != nil {
514 0 : return err
515 0 : }
516 2 : d.mu.formatVers.vers.Store(uint64(formatVers))
517 2 : d.opts.EventListener.FormatUpgrade(formatVers)
518 2 : return nil
519 : }
520 :
521 2 : func (d *DB) writeFormatVersionMarker(formatVers FormatMajorVersion) error {
522 2 : // We use the marker to encode the active format version in the
523 2 : // marker filename. Unlike other uses of the atomic marker, there is
524 2 : // no file with the filename `formatVers.String()` on the
525 2 : // filesystem.
526 2 : return d.mu.formatVers.marker.Move(formatVers.String())
527 2 : }
528 :
// compactMarkedFilesLocked performs a migration that schedules rewrite
// compactions to compact away any sstables marked for compaction.
// compactMarkedFilesLocked is run while ratcheting the database's format major
// version to FormatPrePebblev1MarkedCompacted (see
// formatMajorVersionMigrations).
//
// Note that while this method is called with the DB.mu held, and will not
// return until all marked files have been compacted, the mutex is dropped while
// waiting for compactions to complete (or for slots to free up).
//
// It returns nil once the current version reports no files marked for
// compaction, or the stored close error if the database is found to be closed
// while waiting.
func (d *DB) compactMarkedFilesLocked() error {
	curr := d.mu.versions.currentVersion()
	// Fast path: nothing is marked, so there is nothing to wait for.
	if curr.Stats.MarkedForCompaction == 0 {
		return nil
	}
	// Attempt to schedule a compaction to rewrite a file marked for compaction.
	// We simply call maybeScheduleCompaction since it also picks rewrite
	// compactions. Note that we don't need to call this repeatedly in the for
	// loop below since the completion of a compaction either starts a new one
	// or ensures a compaction is queued for scheduling. By calling
	// maybeScheduleCompaction here we are simply kicking off this behavior.
	d.maybeScheduleCompaction()

	// The above attempt might succeed and schedule a rewrite compaction. Or
	// there might not be available compaction concurrency to schedule the
	// compaction. Or compaction of the file might have already been in
	// progress. In any scenario, wait until there's some change in the
	// state of active compactions.
	for curr.Stats.MarkedForCompaction > 0 {
		// Before waiting, check that the database hasn't been closed. Trying to
		// schedule the compaction may have dropped d.mu while waiting for a
		// manifest write to complete. In that dropped interim, the database may
		// have been closed.
		if err := d.closed.Load(); err != nil {
			return err.(error)
		}

		// Some flush or compaction may have scheduled or completed while we waited
		// for the manifest lock in maybeScheduleCompactionPicker. Get the latest
		// Version before waiting on a compaction.
		curr = d.mu.versions.currentVersion()

		// Only wait on compactions if there are files still marked for compaction.
		// NB: Waiting on this condition variable drops d.mu while blocked.
		if curr.Stats.MarkedForCompaction > 0 {
			// NB: we cannot assert that d.mu.compact.compactingCount > 0, since
			// with a CompactionScheduler a DB may not have even one ongoing
			// compaction (if other competing activities are being preferred by the
			// scheduler).
			d.mu.compact.cond.Wait()
			// Refresh the current version again, so that the loop condition is
			// evaluated against the state after the wake-up.
			curr = d.mu.versions.currentVersion()
		}
	}
	return nil
}
583 :
// findFilesFunc scans the LSM for files, returning true if at least one
// file was found. The returned array contains the matched files, if any, per
// level. A non-nil error causes markFilesLocked to return that error without
// marking anything.
type findFilesFunc func(v *manifest.Version) (found bool, files [numLevels][]*manifest.TableMetadata, _ error)
588 :
// markFilesLocked is not used currently, but it will be useful the next time
// we need to mark files for compaction. The blank assignment below references
// the method, presumably so that unused-code checks do not flag it.
var _ = (*DB)(nil).markFilesLocked
592 :
// markFilesLocked durably marks the files that match the given findFilesFunc for
// compaction.
//
// It is entered with d.mu held. The mutex is released while findFn scans the
// LSM and is re-acquired before the matched files are marked. The marks are
// made durable by forcing a rotation to a new MANIFEST.
//
// It returns the error from findFn or from the version update, and nil when
// findFn reports that no files were found.
func (d *DB) markFilesLocked(findFn findFilesFunc) error {
	jobID := d.newJobIDLocked()

	// Acquire a read state to have a view of the LSM and a guarantee that none
	// of the referenced files will be deleted until we've unreferenced the read
	// state. Some findFilesFuncs may read the files, requiring they not be
	// deleted.
	rs := d.loadReadState()
	var (
		found bool
		files [numLevels][]*manifest.TableMetadata
		err   error
	)
	func() {
		// Deferred first, so it runs last: after the deferred d.mu.Lock()
		// below has re-acquired the mutex.
		defer rs.unrefLocked()
		// Note the unusual locking: unlock, defer Lock(). The scan of the files in
		// the version does not need to block other operations that require the
		// DB.mu. Drop it for the scan, before re-acquiring it.
		d.mu.Unlock()
		defer d.mu.Lock()
		found, files, err = findFn(rs.current)
	}()
	if err != nil {
		return err
	}

	// The database lock has been acquired again by the defer within the above
	// anonymous function.
	if !found {
		// Nothing to do.
		return nil
	}

	// After scanning, if we found files to mark, we fetch the current state of
	// the LSM (which may have changed) and set MarkedForCompaction on the files,
	// and update the version's Stats.MarkedForCompaction count, which are both
	// protected by d.mu.

	// Lock the manifest for a coherent view of the LSM. The database lock has
	// been re-acquired by the defer within the above anonymous function.
	_, err = d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) {
		vers := d.mu.versions.currentVersion()
		for l, filesToMark := range files {
			// Skip levels with no matches; their annotations stay valid.
			if len(filesToMark) == 0 {
				continue
			}
			for _, f := range filesToMark {
				// Ignore files to be marked that have already been compacted or marked.
				if f.CompactionState == manifest.CompactionStateCompacted ||
					f.MarkedForCompaction {
					continue
				}
				// Else, mark the file for compaction in this version.
				vers.Stats.MarkedForCompaction++
				f.MarkedForCompaction = true
			}
			// The compaction picker uses the markedForCompactionAnnotator to
			// quickly find files marked for compaction, or to quickly determine
			// that there are no such files marked for compaction within a level.
			// A b-tree node may be annotated with an annotation recording that
			// there are no files marked for compaction within the node's subtree,
			// based on the assumption that it's static.
			//
			// Since we're marking files for compaction, these b-tree nodes'
			// annotations will be out of date. Clear the compaction-picking
			// annotation, so that it's recomputed the next time the compaction
			// picker looks for a file marked for compaction.
			markedForCompactionAnnotator.InvalidateLevelAnnotation(vers.Levels[l])
		}
		// The 'marked-for-compaction' bit is persisted in the MANIFEST file
		// metadata. We've already modified the in-memory table metadata, but the
		// manifest hasn't been updated. Force rotation to a new MANIFEST file,
		// which will write every table metadata to the new manifest file and ensure
		// that the now marked-for-compaction table metadata are persisted as marked.
		// The version edit itself is empty: only the rotation is needed.
		return versionUpdate{
			VE:                      &manifest.VersionEdit{},
			JobID:                   jobID,
			ForceManifestRotation:   true,
			InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) },
		}, nil
	})
	return err
}
|