Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "fmt"
9 : "strconv"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/manifest"
13 : "github.com/cockroachdb/pebble/objstorage/remote"
14 : "github.com/cockroachdb/pebble/sstable"
15 : "github.com/cockroachdb/pebble/vfs"
16 : "github.com/cockroachdb/pebble/vfs/atomicfs"
17 : )
18 :
// FormatMajorVersion is a constant controlling the format of persisted
// data. Backwards incompatible changes to durable formats are gated
// behind new format major versions.
//
// At any point, a database's format major version may be bumped.
// However, once a database's format major version is increased,
// previous versions of Pebble will refuse to open the database.
//
// The zero value format is the FormatDefault constant. The exact
// FormatMajorVersion that the default corresponds to may change with time.
//
// The underlying representation is a uint64; the String method renders it in
// base 10 for use as the value of the on-disk version marker.
type FormatMajorVersion uint64
30 :
// SafeValue implements redact.SafeValue. The method body is intentionally
// empty: it exists only so that FormatMajorVersion satisfies the interface.
func (v FormatMajorVersion) SafeValue() {}
33 :
34 : // String implements fmt.Stringer.
35 1 : func (v FormatMajorVersion) String() string {
36 1 : // NB: This must not change. It's used as the value for the on-disk
37 1 : // version marker file.
38 1 : //
39 1 : // Specifically, this value must always parse as a base 10 integer
40 1 : // that fits in a uint64. We format it as zero-padded, 3-digit
41 1 : // number today, but the padding may change.
42 1 : return fmt.Sprintf("%03d", v)
43 1 : }
44 :
// The format major versions, in ascending order. The numeric value of each
// constant is its position in this block (assigned through iota), so entries
// must never be reordered or removed. Versions that are no longer declared as
// named constants are kept as blank-identifier placeholders, with their old
// name in a trailing comment, so that every later constant keeps its value.
const (
	// FormatDefault leaves the format version unspecified. When used to create a
	// new store, Pebble will choose the earliest format version it supports.
	FormatDefault FormatMajorVersion = iota

	// 21.2 versions.

	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	// Deprecated.
	_ // FormatMostCompatible

	// formatVersionedManifestMarker is the first
	// backwards-incompatible change made to Pebble, introducing the
	// format-version marker file for handling backwards-incompatible
	// changes more broadly, and replacing the `CURRENT` file with a
	// marker file.
	//
	// This format version is intended as an intermediary version state.
	// It is deliberately unexported to discourage direct use of this
	// format major version. Clients should use FormatVersioned which
	// also ensures earlier versions of Pebble fail to open a database
	// written in a future format major version.
	// Deprecated.
	_ // formatVersionedManifestMarker

	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	// Deprecated.
	_ // FormatVersioned

	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	// Deprecated.
	_ // FormatSetWithDelete

	// 22.1 versions.

	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	// Deprecated.
	_ // FormatBlockPropertyCollector

	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	// Deprecated.
	_ // FormatSplitUserKeysMarked

	// 22.2 versions.

	// FormatSplitUserKeysMarkedCompacted is a format major version that
	// guarantees that all files explicitly marked for compaction in the manifest
	// have been compacted. Combined with the FormatSplitUserKeysMarked format
	// major version, this version guarantees that there are no user keys split
	// across multiple files within a level L1+. Ratcheting to this format version
	// will block (without holding mutexes) until all necessary compactions for
	// files marked for compaction are complete.
	// Deprecated.
	_ // FormatSplitUserKeysMarkedCompacted

	// FormatRangeKeys is a format major version that introduces range keys.
	// Deprecated.
	_ // FormatRangeKeys

	// FormatMinTableFormatPebblev1 is a format major version that guarantees that
	// tables created by or ingested into the DB at or above this format major
	// version will have a table format version of at least Pebblev1 (Block
	// Properties).
	// Deprecated.
	_ // FormatMinTableFormatPebblev1

	// FormatPrePebblev1Marked is a format major version that guarantees that all
	// sstables with a table format version pre-Pebblev1 (i.e. those that are
	// guaranteed to not contain block properties) are marked for compaction in
	// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
	// holding mutexes) until the scan of the LSM is complete and the manifest has
	// been rotated.
	// Deprecated.
	_ // FormatPrePebblev1Marked

	// 23.1 versions.

	// formatUnusedPrePebblev1MarkedCompacted is an unused format major version.
	// This format major version was originally intended to ship in the 23.1
	// release. It was later decided that this should be deferred until a
	// subsequent release. The original ordering is preserved so as not to
	// introduce breaking changes in Cockroach.
	_ // formatUnusedPrePebblev1MarkedCompacted

	// FormatSSTableValueBlocks is a format major version that adds support for
	// storing values in value blocks in the sstable. Value block support is not
	// necessarily enabled when writing sstables, when running with this format
	// major version.
	//
	// Like the versions above, it is retained only as a placeholder.
	_ // FormatSSTableValueBlocks

	// FormatFlushableIngest is a format major version that enables lazy
	// addition of ingested sstables into the LSM structure. When an ingest
	// overlaps with a memtable, a record of the ingest is written to the WAL
	// without waiting for a flush. Subsequent reads treat the ingested files as
	// a level above the overlapping memtable. Once the memtable is flushed, the
	// ingested files are moved into the lowest possible levels.
	//
	// This feature is behind a format major version because it required
	// breaking changes to the WAL format.
	//
	// This is the first version in this block that is still declared as a
	// named constant.
	FormatFlushableIngest

	// 23.2 versions.

	// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
	// that all sstables explicitly marked for compaction in the manifest (see
	// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
	// version will block (without holding mutexes) until all necessary
	// compactions for files marked for compaction are complete.
	FormatPrePebblev1MarkedCompacted

	// FormatDeleteSizedAndObsolete is a format major version that adds support
	// for deletion tombstones that encode the size of the value they're
	// expected to delete. This format major version is required before the
	// associated key kind may be committed through batch applications or
	// ingests. It also adds support for keys that are marked obsolete (see
	// sstable/format.go for details).
	FormatDeleteSizedAndObsolete

	// FormatVirtualSSTables is a format major version that adds support for
	// virtual sstables that can reference a sub-range of keys in an underlying
	// physical sstable. This information is persisted through new,
	// backward-incompatible fields in the Manifest, and therefore requires
	// a format major version.
	FormatVirtualSSTables

	// FormatSyntheticPrefixSuffix is a format major version that adds support for
	// sstables to have their content exposed in a different prefix or suffix of
	// keyspace than the actual prefix/suffix persisted in the keys in such
	// sstables. The prefix and suffix replacement information is stored in new
	// fields in the Manifest and thus requires a format major version.
	FormatSyntheticPrefixSuffix

	// FormatFlushableIngestExcises is a format major version that adds support for
	// having excises unconditionally being written as flushable ingestions. This
	// is implemented through adding a new key kind that can go in the same batches
	// as flushable ingested sstables.
	FormatFlushableIngestExcises

	// FormatColumnarBlocks is a format major version enabling use of the
	// TableFormatPebblev5 table format, that encodes sstable data blocks, index
	// blocks and keyspan blocks by organizing the KVs into columns within the
	// block.
	FormatColumnarBlocks

	// FormatWALSyncChunks is a format major version enabling the writing of
	// WAL sync chunks. These new chunks are used to disambiguate between corruption
	// and logical EOF during WAL replay. This is implemented by adding a new
	// chunk wire format that encodes an additional "Synced Offset" field which acts
	// as a commitment that the WAL should have been synced up until the offset.
	FormatWALSyncChunks

	// FormatTableFormatV6 is a format major version enabling the sstable table
	// format TableFormatPebblev6.
	//
	// The TableFormatPebblev6 sstable format introduces a checksum within the
	// sstable footer, allows inclusion of blob handle references within the
	// value column of a sstable block, and supports columnar meta index +
	// properties blocks.
	//
	// This format major version does not yet enable use of value separation.
	FormatTableFormatV6

	// formatDeprecatedExperimentalValueSeparation was used to enable an
	// experimental version of value separation, separating values into external
	// blob files that do not participate in every compaction.
	//
	// Value separation now depends on TableFormatPebblev7 which this format
	// major version precedes. This format major version is deprecated and
	// unexported, and value separation now requires FormatValueSeparation.
	formatDeprecatedExperimentalValueSeparation

	// formatFooterAttributes is a format major version that adds support for
	// writing sstable.Attributes in the footer of sstables.
	formatFooterAttributes

	// FormatValueSeparation is a format major version that adds support for
	// value separation, separating values into external blob files that do not
	// participate in every compaction.
	FormatValueSeparation

	// -- Add new versions here --

	// FormatNewest is the most recent format major version.
	//
	// On this line iota is one greater than the value of the last version
	// declared above, so iota - 1 makes FormatNewest an alias for that version.
	FormatNewest FormatMajorVersion = iota - 1

	// Experimental versions, which are excluded by FormatNewest (but can be used
	// in tests) can be defined here.

	// -- Add experimental versions here --

	// internalFormatNewest is the most recent, possibly experimental format major
	// version.
	//
	// The FormatNewest line above consumes one iota value of its own, hence
	// iota - 2: with no experimental versions declared this equals FormatNewest,
	// and each experimental version added above raises it by one.
	internalFormatNewest FormatMajorVersion = iota - 2
)
251 :
// FormatMinSupported is the minimum format version that is supported by this
// Pebble version. A store whose persisted format major version is lower than
// this is rejected by lookupFormatMajorVersion.
const FormatMinSupported = FormatFlushableIngest
255 :
// FormatMinForSharedObjects is the minimum format version that supports shared
// objects (see CreateOnShared option).
const FormatMinForSharedObjects = FormatVirtualSSTables
259 :
260 : // IsSupported returns true if the version is supported by the current Pebble
261 : // version.
262 0 : func (v FormatMajorVersion) IsSupported() bool {
263 0 : return v == FormatDefault && v >= FormatMinSupported && v <= internalFormatNewest
264 0 : }
265 :
266 : // MaxTableFormat returns the maximum sstable.TableFormat that can be used at
267 : // this FormatMajorVersion.
268 1 : func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
269 1 : switch v {
270 1 : case FormatDefault, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted:
271 1 : return sstable.TableFormatPebblev3
272 : case FormatDeleteSizedAndObsolete, FormatVirtualSSTables, FormatSyntheticPrefixSuffix,
273 1 : FormatFlushableIngestExcises:
274 1 : return sstable.TableFormatPebblev4
275 1 : case FormatColumnarBlocks, FormatWALSyncChunks:
276 1 : return sstable.TableFormatPebblev5
277 1 : case FormatTableFormatV6, formatDeprecatedExperimentalValueSeparation:
278 1 : return sstable.TableFormatPebblev6
279 1 : case formatFooterAttributes, FormatValueSeparation:
280 1 : return sstable.TableFormatPebblev7
281 1 : default:
282 1 : panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
283 : }
284 : }
285 :
286 : // MinTableFormat returns the minimum sstable.TableFormat that can be used at
287 : // this FormatMajorVersion.
288 1 : func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
289 1 : switch v {
290 : case FormatDefault, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted,
291 : FormatDeleteSizedAndObsolete, FormatVirtualSSTables, FormatSyntheticPrefixSuffix,
292 : FormatFlushableIngestExcises, FormatColumnarBlocks, FormatWALSyncChunks,
293 : FormatTableFormatV6, formatDeprecatedExperimentalValueSeparation, formatFooterAttributes,
294 1 : FormatValueSeparation:
295 1 : return sstable.TableFormatPebblev1
296 1 : default:
297 1 : panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
298 : }
299 : }
300 :
301 : // formatMajorVersionMigrations defines the migrations from one format
302 : // major version to the next. Each migration is defined as a closure
303 : // which will be invoked on the database before the new format major
304 : // version is committed. Migrations must be idempotent. Migrations are
305 : // invoked with d.mu locked.
306 : //
307 : // Each migration is responsible for invoking finalizeFormatVersUpgrade
308 : // to set the new format major version. RatchetFormatMajorVersion will
309 : // panic if a migration returns a nil error but fails to finalize the
310 : // new format major version.
311 : var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
312 0 : FormatFlushableIngest: func(d *DB) error { return nil },
313 1 : FormatPrePebblev1MarkedCompacted: func(d *DB) error {
314 1 : // Before finalizing the format major version, rewrite any sstables
315 1 : // still marked for compaction. Note all format major versions
316 1 : // migrations are invoked with DB.mu locked.
317 1 : if err := d.compactMarkedFilesLocked(); err != nil {
318 0 : return err
319 0 : }
320 1 : return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted)
321 : },
322 1 : FormatDeleteSizedAndObsolete: func(d *DB) error {
323 1 : return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete)
324 1 : },
325 1 : FormatVirtualSSTables: func(d *DB) error {
326 1 : return d.finalizeFormatVersUpgrade(FormatVirtualSSTables)
327 1 : },
328 1 : FormatSyntheticPrefixSuffix: func(d *DB) error {
329 1 : return d.finalizeFormatVersUpgrade(FormatSyntheticPrefixSuffix)
330 1 : },
331 1 : FormatFlushableIngestExcises: func(d *DB) error {
332 1 : return d.finalizeFormatVersUpgrade(FormatFlushableIngestExcises)
333 1 : },
334 1 : FormatColumnarBlocks: func(d *DB) error {
335 1 : return d.finalizeFormatVersUpgrade(FormatColumnarBlocks)
336 1 : },
337 1 : FormatWALSyncChunks: func(d *DB) error {
338 1 : return d.finalizeFormatVersUpgrade(FormatWALSyncChunks)
339 1 : },
340 1 : FormatTableFormatV6: func(d *DB) error {
341 1 : return d.finalizeFormatVersUpgrade(FormatTableFormatV6)
342 1 : },
343 1 : formatDeprecatedExperimentalValueSeparation: func(d *DB) error {
344 1 : return d.finalizeFormatVersUpgrade(formatDeprecatedExperimentalValueSeparation)
345 1 : },
346 1 : formatFooterAttributes: func(d *DB) error {
347 1 : return d.finalizeFormatVersUpgrade(formatFooterAttributes)
348 1 : },
349 1 : FormatValueSeparation: func(d *DB) error {
350 1 : return d.finalizeFormatVersUpgrade(FormatValueSeparation)
351 1 : },
352 : }
353 :
// formatVersionMarkerName is the marker name under which the format major
// version is looked up in the store directory (see lookupFormatMajorVersion).
const formatVersionMarkerName = `format-version`
355 :
356 : // lookupFormatMajorVersion retrieves the format version from the format version
357 : // marker file.
358 : //
359 : // If such a file does not exist, returns FormatDefault. Note that this case is
360 : // only acceptable if we are creating a new store (we no longer support
361 : // FormatMostCompatible which is the only one with no version marker file).
362 : func lookupFormatMajorVersion(
363 : fs vfs.FS, dirname string, ls []string,
364 1 : ) (FormatMajorVersion, *atomicfs.Marker, error) {
365 1 : m, versString, err := atomicfs.LocateMarkerInListing(fs, dirname, formatVersionMarkerName, ls)
366 1 : if err != nil {
367 1 : return 0, nil, err
368 1 : }
369 1 : if versString == "" {
370 1 : return FormatDefault, m, nil
371 1 : }
372 1 : v, err := strconv.ParseUint(versString, 10, 64)
373 1 : if err != nil {
374 0 : return 0, nil, errors.Wrap(err, "parsing format major version")
375 0 : }
376 1 : vers := FormatMajorVersion(v)
377 1 : if vers == FormatDefault {
378 0 : return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers)
379 0 : }
380 1 : if vers > internalFormatNewest {
381 1 : return 0, nil, errors.Newf("pebble: database %q written in unknown format major version %d", dirname, vers)
382 1 : }
383 1 : if vers < FormatMinSupported {
384 0 : return 0, nil, errors.Newf("pebble: database %q written in format major version %d which is no longer supported", dirname, vers)
385 0 : }
386 1 : return vers, m, nil
387 : }
388 :
389 : // FormatMajorVersion returns the database's active format major
390 : // version. The format major version may be higher than the one
391 : // provided in Options when the database was opened if the existing
392 : // database was written with a higher format version.
393 1 : func (d *DB) FormatMajorVersion() FormatMajorVersion {
394 1 : return FormatMajorVersion(d.mu.formatVers.vers.Load())
395 1 : }
396 :
397 : // TableFormat returns the TableFormat that the database is currently using when
398 : // writing sstables. The table format is determined by the database's format
399 : // major version, as well as experimental settings like EnableValueBlocks and
400 : // EnableColumnarBlocks.
401 1 : func (d *DB) TableFormat() sstable.TableFormat {
402 1 : // The table is typically written at the maximum allowable format implied by
403 1 : // the current format major version of the DB.
404 1 : f := d.FormatMajorVersion().MaxTableFormat()
405 1 : switch f {
406 1 : case sstable.TableFormatPebblev3:
407 1 : // In format major versions with maximum table formats of Pebblev3,
408 1 : // value blocks were conditional on an experimental setting. In format
409 1 : // major versions with maximum table formats of Pebblev4 and higher,
410 1 : // value blocks are always enabled.
411 1 : if d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks() {
412 1 : f = sstable.TableFormatPebblev2
413 1 : }
414 1 : default:
415 1 : if f.BlockColumnar() && (d.opts.Experimental.EnableColumnarBlocks == nil ||
416 1 : !d.opts.Experimental.EnableColumnarBlocks()) {
417 1 : f = sstable.TableFormatPebblev4
418 1 : }
419 : }
420 1 : return f
421 : }
422 :
423 : // shouldCreateShared returns true if the database should use shared objects
424 : // when creating new objects on the given level.
425 1 : func (d *DB) shouldCreateShared(targetLevel int) bool {
426 1 : return remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, targetLevel) &&
427 1 : d.FormatMajorVersion() >= FormatMinForSharedObjects
428 1 : }
429 :
430 : // RatchetFormatMajorVersion ratchets the opened database's format major
431 : // version to the provided version. It errors if the provided format
432 : // major version is below the database's current version. Once a
433 : // database's format major version is upgraded, previous Pebble versions
434 : // that do not know of the format version will be unable to open the
435 : // database.
436 1 : func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
437 1 : if err := d.closed.Load(); err != nil {
438 1 : panic(err)
439 : }
440 :
441 1 : d.mu.Lock()
442 1 : defer d.mu.Unlock()
443 1 : return d.ratchetFormatMajorVersionLocked(fmv)
444 : }
445 :
446 1 : func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
447 1 : if d.opts.ReadOnly {
448 0 : return ErrReadOnly
449 0 : }
450 1 : if formatVers > internalFormatNewest {
451 0 : // Guard against accidentally forgetting to update internalFormatNewest.
452 0 : return errors.Errorf("pebble: unknown format version %d", formatVers)
453 0 : }
454 1 : if currentVers := d.FormatMajorVersion(); currentVers > formatVers {
455 0 : return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
456 0 : currentVers, formatVers)
457 0 : }
458 1 : if d.mu.formatVers.ratcheting {
459 0 : return errors.Newf("pebble: database format major version upgrade is in-progress")
460 0 : }
461 1 : d.mu.formatVers.ratcheting = true
462 1 : defer func() { d.mu.formatVers.ratcheting = false }()
463 :
464 1 : for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ {
465 1 : if err := formatMajorVersionMigrations[nextVers](d); err != nil {
466 0 : return errors.Wrapf(err, "migrating to version %d", nextVers)
467 0 : }
468 :
469 : // NB: The migration is responsible for calling
470 : // finalizeFormatVersUpgrade to finalize the upgrade. This
471 : // structure is necessary because some migrations may need to
472 : // update in-memory state (without ever dropping locks) after
473 : // the upgrade is finalized. Here we assert that the upgrade
474 : // did occur.
475 1 : if d.FormatMajorVersion() != nextVers {
476 0 : d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers)
477 0 : }
478 : }
479 1 : return nil
480 : }
481 :
482 : // finalizeFormatVersUpgrade is typically only be called from within a
483 : // format major version migration.
484 : //
485 : // See formatMajorVersionMigrations.
486 1 : func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
487 1 : if err := d.writeFormatVersionMarker(formatVers); err != nil {
488 0 : return err
489 0 : }
490 1 : d.mu.formatVers.vers.Store(uint64(formatVers))
491 1 : d.opts.EventListener.FormatUpgrade(formatVers)
492 1 : return nil
493 : }
494 :
495 1 : func (d *DB) writeFormatVersionMarker(formatVers FormatMajorVersion) error {
496 1 : // We use the marker to encode the active format version in the
497 1 : // marker filename. Unlike other uses of the atomic marker, there is
498 1 : // no file with the filename `formatVers.String()` on the
499 1 : // filesystem.
500 1 : return d.mu.formatVers.marker.Move(formatVers.String())
501 1 : }
502 :
503 : // compactMarkedFilesLocked performs a migration that schedules rewrite
504 : // compactions to compact away any sstables marked for compaction.
505 : // compactMarkedFilesLocked is run while ratcheting the database's format major
506 : // version to FormatSplitUserKeysMarkedCompacted.
507 : //
508 : // Note that while this method is called with the DB.mu held, and will not
509 : // return until all marked files have been compacted, the mutex is dropped while
510 : // waiting for compactions to complete (or for slots to free up).
511 1 : func (d *DB) compactMarkedFilesLocked() error {
512 1 : curr := d.mu.versions.currentVersion()
513 1 : if curr.Stats.MarkedForCompaction == 0 {
514 1 : return nil
515 1 : }
516 : // Attempt to schedule a compaction to rewrite a file marked for compaction.
517 : // We simply call maybeScheduleCompaction since it also picks rewrite
518 : // compactions. Note that we don't need to call this repeatedly in the for
519 : // loop below since the completion of a compaction either starts a new one
520 : // or ensures a compaction is queued for scheduling. By calling
521 : // maybeScheduleCompaction here we are simply kicking off this behavior.
522 0 : d.maybeScheduleCompaction()
523 0 :
524 0 : // The above attempt might succeed and schedule a rewrite compaction. Or
525 0 : // there might not be available compaction concurrency to schedule the
526 0 : // compaction. Or compaction of the file might have already been in
527 0 : // progress. In any scenario, wait until there's some change in the
528 0 : // state of active compactions.
529 0 : for curr.Stats.MarkedForCompaction > 0 {
530 0 : // Before waiting, check that the database hasn't been closed. Trying to
531 0 : // schedule the compaction may have dropped d.mu while waiting for a
532 0 : // manifest write to complete. In that dropped interim, the database may
533 0 : // have been closed.
534 0 : if err := d.closed.Load(); err != nil {
535 0 : return err.(error)
536 0 : }
537 :
538 : // Some flush or compaction may have scheduled or completed while we waited
539 : // for the manifest lock in maybeScheduleCompactionPicker. Get the latest
540 : // Version before waiting on a compaction.
541 0 : curr = d.mu.versions.currentVersion()
542 0 :
543 0 : // Only wait on compactions if there are files still marked for compaction.
544 0 : // NB: Waiting on this condition variable drops d.mu while blocked.
545 0 : if curr.Stats.MarkedForCompaction > 0 {
546 0 : // NB: we cannot assert that d.mu.compact.compactingCount > 0, since
547 0 : // with a CompactionScheduler a DB may not have even one ongoing
548 0 : // compaction (if other competing activities are being preferred by the
549 0 : // scheduler).
550 0 : d.mu.compact.cond.Wait()
551 0 : // Refresh the current version again.
552 0 : curr = d.mu.versions.currentVersion()
553 0 : }
554 : }
555 0 : return nil
556 : }
557 :
// findFilesFunc scans the LSM for files, returning true if at least one
// file was found. The returned array contains the matched files, if any, per
// level (indexed by level number). A non-nil error aborts the caller's
// marking operation (see markFilesLocked).
type findFilesFunc func(v *manifest.Version) (found bool, files [numLevels][]*manifest.TableMetadata, _ error)
562 :
// This method is not used currently, but it will be useful the next time we need
// to mark files for compaction. Assigning the method expression to the blank
// identifier references the method without calling it.
var _ = (*DB)(nil).markFilesLocked
566 :
567 : // markFilesLocked durably marks the files that match the given findFilesFunc for
568 : // compaction.
569 0 : func (d *DB) markFilesLocked(findFn findFilesFunc) error {
570 0 : jobID := d.newJobIDLocked()
571 0 :
572 0 : // Acquire a read state to have a view of the LSM and a guarantee that none
573 0 : // of the referenced files will be deleted until we've unreferenced the read
574 0 : // state. Some findFilesFuncs may read the files, requiring they not be
575 0 : // deleted.
576 0 : rs := d.loadReadState()
577 0 : var (
578 0 : found bool
579 0 : files [numLevels][]*manifest.TableMetadata
580 0 : err error
581 0 : )
582 0 : func() {
583 0 : defer rs.unrefLocked()
584 0 : // Note the unusual locking: unlock, defer Lock(). The scan of the files in
585 0 : // the version does not need to block other operations that require the
586 0 : // DB.mu. Drop it for the scan, before re-acquiring it.
587 0 : d.mu.Unlock()
588 0 : defer d.mu.Lock()
589 0 : found, files, err = findFn(rs.current)
590 0 : }()
591 0 : if err != nil {
592 0 : return err
593 0 : }
594 :
595 : // The database lock has been acquired again by the defer within the above
596 : // anonymous function.
597 0 : if !found {
598 0 : // Nothing to do.
599 0 : return nil
600 0 : }
601 :
602 : // After scanning, if we found files to mark, we fetch the current state of
603 : // the LSM (which may have changed) and set MarkedForCompaction on the files,
604 : // and update the version's Stats.MarkedForCompaction count, which are both
605 : // protected by d.mu.
606 :
607 : // Lock the manifest for a coherent view of the LSM. The database lock has
608 : // been re-acquired by the defer within the above anonymous function.
609 0 : return d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) {
610 0 : vers := d.mu.versions.currentVersion()
611 0 : for l, filesToMark := range files {
612 0 : if len(filesToMark) == 0 {
613 0 : continue
614 : }
615 0 : for _, f := range filesToMark {
616 0 : // Ignore files to be marked that have already been compacted or marked.
617 0 : if f.CompactionState == manifest.CompactionStateCompacted ||
618 0 : f.MarkedForCompaction {
619 0 : continue
620 : }
621 : // Else, mark the file for compaction in this version.
622 0 : vers.Stats.MarkedForCompaction++
623 0 : f.MarkedForCompaction = true
624 : }
625 : // The compaction picker uses the markedForCompactionAnnotator to
626 : // quickly find files marked for compaction, or to quickly determine
627 : // that there are no such files marked for compaction within a level.
628 : // A b-tree node may be annotated with an annotation recording that
629 : // there are no files marked for compaction within the node's subtree,
630 : // based on the assumption that it's static.
631 : //
632 : // Since we're marking files for compaction, these b-tree nodes'
633 : // annotations will be out of date. Clear the compaction-picking
634 : // annotation, so that it's recomputed the next time the compaction
635 : // picker looks for a file marked for compaction.
636 0 : markedForCompactionAnnotator.InvalidateLevelAnnotation(vers.Levels[l])
637 : }
638 : // The 'marked-for-compaction' bit is persisted in the MANIFEST file
639 : // metadata. We've already modified the in-memory table metadata, but the
640 : // manifest hasn't been updated. Force rotation to a new MANIFEST file,
641 : // which will write every table metadata to the new manifest file and ensure
642 : // that the now marked-for-compaction table metadata are persisted as marked.
643 0 : return versionUpdate{
644 0 : VE: &manifest.VersionEdit{},
645 0 : JobID: jobID,
646 0 : ForceManifestRotation: true,
647 0 : InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) },
648 : }, nil
649 : })
650 : }
|