Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "fmt"
9 : "strconv"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/manifest"
13 : "github.com/cockroachdb/pebble/sstable"
14 : "github.com/cockroachdb/pebble/vfs"
15 : "github.com/cockroachdb/pebble/vfs/atomicfs"
16 : )
17 :
// FormatMajorVersion is a constant controlling the format of persisted
// data. Backwards incompatible changes to durable formats are gated
// behind new format major versions.
//
// At any point, a database's format major version may be bumped.
// However, once a database's format major version is increased,
// previous versions of Pebble will refuse to open the database.
//
// The zero value format is the FormatDefault constant. The exact
// FormatMajorVersion that the default corresponds to may change with time.
type FormatMajorVersion uint64
29 :
// SafeValue implements redact.SafeValue. The method has an empty body; it
// exists only so that FormatMajorVersion satisfies that interface.
func (v FormatMajorVersion) SafeValue() {}
32 :
33 : // String implements fmt.Stringer.
34 2 : func (v FormatMajorVersion) String() string {
35 2 : // NB: This must not change. It's used as the value for the on-disk
36 2 : // version marker file.
37 2 : //
38 2 : // Specifically, this value must always parse as a base 10 integer
39 2 : // that fits in a uint64. We format it as zero-padded, 3-digit
40 2 : // number today, but the padding may change.
41 2 : return fmt.Sprintf("%03d", v)
42 2 : }
43 :
// The numeric value of each constant below is what String formats and what
// writeFormatVersionMarker records in the format-version marker, so existing
// entries must never be reordered or removed. Versions that are no longer
// usable are kept as blank identifiers purely to preserve the numbering of
// the versions that follow them.
const (
	// FormatDefault leaves the format version unspecified. When used to create a
	// new store, Pebble will choose the earliest format version it supports.
	FormatDefault FormatMajorVersion = iota

	// 21.2 versions.

	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	// Deprecated.
	_ // FormatMostCompatible

	// formatVersionedManifestMarker is the first
	// backwards-incompatible change made to Pebble, introducing the
	// format-version marker file for handling backwards-incompatible
	// changes more broadly, and replacing the `CURRENT` file with a
	// marker file.
	//
	// This format version is intended as an intermediary version state.
	// It is deliberately unexported to discourage direct use of this
	// format major version. Clients should use FormatVersioned which
	// also ensures earlier versions of Pebble fail to open a database
	// written in a future format major version.
	// Deprecated.
	_ // formatVersionedManifestMarker

	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	// Deprecated.
	_ // FormatVersioned

	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	// Deprecated.
	_ // FormatSetWithDelete

	// 22.1 versions.

	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	// Deprecated.
	_ // FormatBlockPropertyCollector

	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	// Deprecated.
	_ // FormatSplitUserKeysMarked

	// 22.2 versions.

	// FormatSplitUserKeysMarkedCompacted is a format major version that
	// guarantees that all files explicitly marked for compaction in the manifest
	// have been compacted. Combined with the FormatSplitUserKeysMarked format
	// major version, this version guarantees that there are no user keys split
	// across multiple files within a level L1+. Ratcheting to this format version
	// will block (without holding mutexes) until all necessary compactions for
	// files marked for compaction are complete.
	// Deprecated.
	_ // FormatSplitUserKeysMarkedCompacted

	// FormatRangeKeys is a format major version that introduces range keys.
	// Deprecated.
	_ // FormatRangeKeys

	// FormatMinTableFormatPebblev1 is a format major version that guarantees that
	// tables created by or ingested into the DB at or above this format major
	// version will have a table format version of at least Pebblev1 (Block
	// Properties).
	// Deprecated.
	_ // FormatMinTableFormatPebblev1

	// FormatPrePebblev1Marked is a format major version that guarantees that all
	// sstables with a table format version pre-Pebblev1 (i.e. those that are
	// guaranteed to not contain block properties) are marked for compaction in
	// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
	// holding mutexes) until the scan of the LSM is complete and the manifest has
	// been rotated.
	// Deprecated.
	_ // FormatPrePebblev1Marked

	// 23.1 versions.

	// formatUnusedPrePebblev1MarkedCompacted is an unused format major version.
	// This format major version was originally intended to ship in the 23.1
	// release. It was later decided that this should be deferred until a
	// subsequent release. The original ordering is preserved so as not to
	// introduce breaking changes in Cockroach.
	_ // formatUnusedPrePebblev1MarkedCompacted

	// FormatSSTableValueBlocks is a format major version that adds support for
	// storing values in value blocks in the sstable. Value block support is not
	// necessarily enabled when writing sstables, when running with this format
	// major version.
	_ // FormatSSTableValueBlocks

	// FormatFlushableIngest is a format major version that enables lazy
	// addition of ingested sstables into the LSM structure. When an ingest
	// overlaps with a memtable, a record of the ingest is written to the WAL
	// without waiting for a flush. Subsequent reads treat the ingested files as
	// a level above the overlapping memtable. Once the memtable is flushed, the
	// ingested files are moved into the lowest possible levels.
	//
	// This feature is behind a format major version because it required
	// breaking changes to the WAL format.
	FormatFlushableIngest

	// 23.2 versions.

	// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
	// that all sstables explicitly marked for compaction in the manifest (see
	// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
	// version will block (without holding mutexes) until all necessary
	// compactions for files marked for compaction are complete.
	FormatPrePebblev1MarkedCompacted

	// FormatDeleteSizedAndObsolete is a format major version that adds support
	// for deletion tombstones that encode the size of the value they're
	// expected to delete. This format major version is required before the
	// associated key kind may be committed through batch applications or
	// ingests. It also adds support for keys that are marked obsolete (see
	// sstable/format.go for details).
	FormatDeleteSizedAndObsolete

	// FormatVirtualSSTables is a format major version that adds support for
	// virtual sstables that can reference a sub-range of keys in an underlying
	// physical sstable. This information is persisted through new,
	// backward-incompatible fields in the Manifest, and therefore requires
	// a format major version.
	FormatVirtualSSTables

	// FormatSyntheticPrefixSuffix is a format major version that adds support for
	// sstables to have their content exposed in a different prefix or suffix of
	// keyspace than the actual prefix/suffix persisted in the keys in such
	// sstables. The prefix and suffix replacement information is stored in new
	// fields in the Manifest and thus requires a format major version.
	FormatSyntheticPrefixSuffix

	// FormatFlushableIngestExcises is a format major version that adds support for
	// having excises unconditionally being written as flushable ingestions. This
	// is implemented through adding a new key kind that can go in the same batches
	// as flushable ingested sstables.
	FormatFlushableIngestExcises

	// FormatColumnarBlocks is a format major version enabling use of the
	// TableFormatPebblev5 table format, that encodes sstable data blocks, index
	// blocks and keyspan blocks by organizing the KVs into columns within the
	// block.
	FormatColumnarBlocks

	// FormatWALSyncChunks is a format major version enabling the writing of
	// WAL sync chunks. These new chunks are used to disambiguate between corruption
	// and logical EOF during WAL replay. This is implemented by adding a new
	// chunk wire format that encodes an additional "Synced Offset" field which acts
	// as a commitment that the WAL should have been synced up until the offset.
	FormatWALSyncChunks

	// FormatTableFormatV6 is a format major version enabling the sstable table
	// format TableFormatPebblev6.
	//
	// The TableFormatPebblev6 sstable format introduces a checksum within the
	// sstable footer, and allows inclusion of blob handle references within the
	// value column of a sstable block.
	//
	// This format major version does not yet enable use of value separation.
	FormatTableFormatV6

	// -- Add new versions here --

	// FormatNewest is the most recent format major version.
	//
	// iota has already advanced one past the last stable version by the time
	// this line is reached, hence the "- 1".
	FormatNewest FormatMajorVersion = iota - 1

	// Experimental versions, which are excluded by FormatNewest (but can be used
	// in tests) can be defined here.

	// -- Add experimental versions here --

	// internalFormatNewest is the most recent, possibly experimental format major
	// version.
	//
	// The "- 2" discounts both this line and the FormatNewest line above. With
	// no experimental versions defined, internalFormatNewest equals
	// FormatNewest.
	internalFormatNewest FormatMajorVersion = iota - 2
)
231 :
// FormatMinSupported is the minimum format version that is supported by this
// Pebble version. lookupFormatMajorVersion rejects databases whose persisted
// version is lower than this.
const FormatMinSupported = FormatFlushableIngest
235 :
// FormatMinForSharedObjects is the minimum format version that supports shared
// objects (see CreateOnShared option).
const FormatMinForSharedObjects = FormatVirtualSSTables
239 :
240 : // IsSupported returns true if the version is supported by the current Pebble
241 : // version.
242 0 : func (v FormatMajorVersion) IsSupported() bool {
243 0 : return v == FormatDefault && v >= FormatMinSupported && v <= internalFormatNewest
244 0 : }
245 :
246 : // MaxTableFormat returns the maximum sstable.TableFormat that can be used at
247 : // this FormatMajorVersion.
248 2 : func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
249 2 : switch v {
250 2 : case FormatDefault, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted:
251 2 : return sstable.TableFormatPebblev3
252 : case FormatDeleteSizedAndObsolete, FormatVirtualSSTables, FormatSyntheticPrefixSuffix,
253 2 : FormatFlushableIngestExcises:
254 2 : return sstable.TableFormatPebblev4
255 2 : case FormatColumnarBlocks, FormatWALSyncChunks:
256 2 : return sstable.TableFormatPebblev5
257 2 : case FormatTableFormatV6:
258 2 : return sstable.TableFormatPebblev6
259 1 : default:
260 1 : panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
261 : }
262 : }
263 :
264 : // MinTableFormat returns the minimum sstable.TableFormat that can be used at
265 : // this FormatMajorVersion.
266 2 : func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
267 2 : switch v {
268 : case FormatDefault, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted,
269 : FormatDeleteSizedAndObsolete, FormatVirtualSSTables, FormatSyntheticPrefixSuffix,
270 : FormatFlushableIngestExcises, FormatColumnarBlocks, FormatWALSyncChunks,
271 2 : FormatTableFormatV6:
272 2 : return sstable.TableFormatPebblev1
273 1 : default:
274 1 : panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
275 : }
276 : }
277 :
278 : // formatMajorVersionMigrations defines the migrations from one format
279 : // major version to the next. Each migration is defined as a closure
280 : // which will be invoked on the database before the new format major
281 : // version is committed. Migrations must be idempotent. Migrations are
282 : // invoked with d.mu locked.
283 : //
284 : // Each migration is responsible for invoking finalizeFormatVersUpgrade
285 : // to set the new format major version. RatchetFormatMajorVersion will
286 : // panic if a migration returns a nil error but fails to finalize the
287 : // new format major version.
288 : var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
289 0 : FormatFlushableIngest: func(d *DB) error { return nil },
290 2 : FormatPrePebblev1MarkedCompacted: func(d *DB) error {
291 2 : // Before finalizing the format major version, rewrite any sstables
292 2 : // still marked for compaction. Note all format major versions
293 2 : // migrations are invoked with DB.mu locked.
294 2 : if err := d.compactMarkedFilesLocked(); err != nil {
295 0 : return err
296 0 : }
297 2 : return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted)
298 : },
299 2 : FormatDeleteSizedAndObsolete: func(d *DB) error {
300 2 : return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete)
301 2 : },
302 2 : FormatVirtualSSTables: func(d *DB) error {
303 2 : return d.finalizeFormatVersUpgrade(FormatVirtualSSTables)
304 2 : },
305 2 : FormatSyntheticPrefixSuffix: func(d *DB) error {
306 2 : return d.finalizeFormatVersUpgrade(FormatSyntheticPrefixSuffix)
307 2 : },
308 2 : FormatFlushableIngestExcises: func(d *DB) error {
309 2 : return d.finalizeFormatVersUpgrade(FormatFlushableIngestExcises)
310 2 : },
311 2 : FormatColumnarBlocks: func(d *DB) error {
312 2 : return d.finalizeFormatVersUpgrade(FormatColumnarBlocks)
313 2 : },
314 2 : FormatWALSyncChunks: func(d *DB) error {
315 2 : return d.finalizeFormatVersUpgrade(FormatWALSyncChunks)
316 2 : },
317 2 : FormatTableFormatV6: func(d *DB) error {
318 2 : return d.finalizeFormatVersUpgrade(FormatTableFormatV6)
319 2 : },
320 : }
321 :
// formatVersionMarkerName is the marker name that lookupFormatMajorVersion
// passes to atomicfs.LocateMarkerInListing to find the format version marker.
const formatVersionMarkerName = `format-version`
323 :
324 : // lookupFormatMajorVersion retrieves the format version from the format version
325 : // marker file.
326 : //
327 : // If such a file does not exist, returns FormatDefault. Note that this case is
328 : // only acceptable if we are creating a new store (we no longer support
329 : // FormatMostCompatible which is the only one with no version marker file).
330 : func lookupFormatMajorVersion(
331 : fs vfs.FS, dirname string, ls []string,
332 2 : ) (FormatMajorVersion, *atomicfs.Marker, error) {
333 2 : m, versString, err := atomicfs.LocateMarkerInListing(fs, dirname, formatVersionMarkerName, ls)
334 2 : if err != nil {
335 1 : return 0, nil, err
336 1 : }
337 2 : if versString == "" {
338 2 : return FormatDefault, m, nil
339 2 : }
340 2 : v, err := strconv.ParseUint(versString, 10, 64)
341 2 : if err != nil {
342 0 : return 0, nil, errors.Wrap(err, "parsing format major version")
343 0 : }
344 2 : vers := FormatMajorVersion(v)
345 2 : if vers == FormatDefault {
346 0 : return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers)
347 0 : }
348 2 : if vers > internalFormatNewest {
349 1 : return 0, nil, errors.Newf("pebble: database %q written in unknown format major version %d", dirname, vers)
350 1 : }
351 2 : if vers < FormatMinSupported {
352 0 : return 0, nil, errors.Newf("pebble: database %q written in format major version %d which is no longer supported", dirname, vers)
353 0 : }
354 2 : return vers, m, nil
355 : }
356 :
357 : // FormatMajorVersion returns the database's active format major
358 : // version. The format major version may be higher than the one
359 : // provided in Options when the database was opened if the existing
360 : // database was written with a higher format version.
361 2 : func (d *DB) FormatMajorVersion() FormatMajorVersion {
362 2 : return FormatMajorVersion(d.mu.formatVers.vers.Load())
363 2 : }
364 :
365 : // TableFormat returns the TableFormat that the database is currently using when
366 : // writing sstables. The table format is determined by the database's format
367 : // major version, as well as experimental settings like EnableValueBlocks and
368 : // EnableColumnarBlocks.
369 2 : func (d *DB) TableFormat() sstable.TableFormat {
370 2 : // The table is typically written at the maximum allowable format implied by
371 2 : // the current format major version of the DB.
372 2 : f := d.FormatMajorVersion().MaxTableFormat()
373 2 : switch f {
374 2 : case sstable.TableFormatPebblev3:
375 2 : // In format major versions with maximum table formats of Pebblev3,
376 2 : // value blocks were conditional on an experimental setting. In format
377 2 : // major versions with maximum table formats of Pebblev4 and higher,
378 2 : // value blocks are always enabled.
379 2 : if d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks() {
380 2 : f = sstable.TableFormatPebblev2
381 2 : }
382 2 : case sstable.TableFormatPebblev5:
383 2 : if d.opts.Experimental.EnableColumnarBlocks == nil || !d.opts.Experimental.EnableColumnarBlocks() {
384 1 : f = sstable.TableFormatPebblev4
385 1 : }
386 : }
387 2 : return f
388 : }
389 :
390 : // RatchetFormatMajorVersion ratchets the opened database's format major
391 : // version to the provided version. It errors if the provided format
392 : // major version is below the database's current version. Once a
393 : // database's format major version is upgraded, previous Pebble versions
394 : // that do not know of the format version will be unable to open the
395 : // database.
396 2 : func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
397 2 : if err := d.closed.Load(); err != nil {
398 1 : panic(err)
399 : }
400 :
401 2 : d.mu.Lock()
402 2 : defer d.mu.Unlock()
403 2 : return d.ratchetFormatMajorVersionLocked(fmv)
404 : }
405 :
406 2 : func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
407 2 : if d.opts.ReadOnly {
408 0 : return ErrReadOnly
409 0 : }
410 2 : if formatVers > internalFormatNewest {
411 0 : // Guard against accidentally forgetting to update internalFormatNewest.
412 0 : return errors.Errorf("pebble: unknown format version %d", formatVers)
413 0 : }
414 2 : if currentVers := d.FormatMajorVersion(); currentVers > formatVers {
415 0 : return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
416 0 : currentVers, formatVers)
417 0 : }
418 2 : if d.mu.formatVers.ratcheting {
419 0 : return errors.Newf("pebble: database format major version upgrade is in-progress")
420 0 : }
421 2 : d.mu.formatVers.ratcheting = true
422 2 : defer func() { d.mu.formatVers.ratcheting = false }()
423 :
424 2 : for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ {
425 2 : if err := formatMajorVersionMigrations[nextVers](d); err != nil {
426 0 : return errors.Wrapf(err, "migrating to version %d", nextVers)
427 0 : }
428 :
429 : // NB: The migration is responsible for calling
430 : // finalizeFormatVersUpgrade to finalize the upgrade. This
431 : // structure is necessary because some migrations may need to
432 : // update in-memory state (without ever dropping locks) after
433 : // the upgrade is finalized. Here we assert that the upgrade
434 : // did occur.
435 2 : if d.FormatMajorVersion() != nextVers {
436 0 : d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers)
437 0 : }
438 : }
439 2 : return nil
440 : }
441 :
442 : // finalizeFormatVersUpgrade is typically only be called from within a
443 : // format major version migration.
444 : //
445 : // See formatMajorVersionMigrations.
446 2 : func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
447 2 : if err := d.writeFormatVersionMarker(formatVers); err != nil {
448 0 : return err
449 0 : }
450 2 : d.mu.formatVers.vers.Store(uint64(formatVers))
451 2 : d.opts.EventListener.FormatUpgrade(formatVers)
452 2 : return nil
453 : }
454 :
455 2 : func (d *DB) writeFormatVersionMarker(formatVers FormatMajorVersion) error {
456 2 : // We use the marker to encode the active format version in the
457 2 : // marker filename. Unlike other uses of the atomic marker, there is
458 2 : // no file with the filename `formatVers.String()` on the
459 2 : // filesystem.
460 2 : return d.mu.formatVers.marker.Move(formatVers.String())
461 2 : }
462 :
// compactMarkedFilesLocked performs a migration that schedules rewrite
// compactions to compact away any sstables marked for compaction.
// compactMarkedFilesLocked is run while ratcheting the database's format major
// version to FormatPrePebblev1MarkedCompacted (see
// formatMajorVersionMigrations).
//
// Note that while this method is called with the DB.mu held, and will not
// return until all marked files have been compacted, the mutex is dropped while
// waiting for compactions to complete (or for slots to free up).
//
// It returns nil once the current version reports no files marked for
// compaction, or the stored close error if the database is found to be closed
// while waiting.
func (d *DB) compactMarkedFilesLocked() error {
	curr := d.mu.versions.currentVersion()
	if curr.Stats.MarkedForCompaction == 0 {
		// Nothing is marked; there is nothing to wait for.
		return nil
	}
	// Attempt to schedule a compaction to rewrite a file marked for compaction.
	// We simply call maybeScheduleCompaction since it also picks rewrite
	// compactions. Note that we don't need to call this repeatedly in the for
	// loop below since the completion of a compaction either starts a new one
	// or ensures a compaction is queued for scheduling. By calling
	// maybeScheduleCompaction here we are simply kicking off this behavior.
	d.maybeScheduleCompaction()

	// The above attempt might succeed and schedule a rewrite compaction. Or
	// there might not be available compaction concurrency to schedule the
	// compaction. Or compaction of the file might have already been in
	// progress. In any scenario, wait until there's some change in the
	// state of active compactions.
	for curr.Stats.MarkedForCompaction > 0 {
		// Before waiting, check that the database hasn't been closed. Trying to
		// schedule the compaction may have dropped d.mu while waiting for a
		// manifest write to complete. In that dropped interim, the database may
		// have been closed.
		if err := d.closed.Load(); err != nil {
			return err.(error)
		}

		// Some flush or compaction may have scheduled or completed while we waited
		// for the manifest lock in maybeScheduleCompactionPicker. Get the latest
		// Version before waiting on a compaction.
		curr = d.mu.versions.currentVersion()

		// Only wait on compactions if there are files still marked for compaction.
		// NB: Waiting on this condition variable drops d.mu while blocked.
		if curr.Stats.MarkedForCompaction > 0 {
			// NB: we cannot assert that d.mu.compact.compactingCount > 0, since
			// with a CompactionScheduler a DB may not have even one ongoing
			// compaction (if other competing activities are being preferred by the
			// scheduler).
			d.mu.compact.cond.Wait()
			// Refresh the current version again, so that the loop condition
			// is evaluated against the state after the wake-up.
			curr = d.mu.versions.currentVersion()
		}
	}
	return nil
}
517 :
// findFilesFunc scans the LSM for files, returning true if at least one
// file was found. The returned array contains the matched files, if any, per
// level. A non-nil error aborts the caller (see markFilesLocked).
type findFilesFunc func(v *version) (found bool, files [numLevels][]*tableMetadata, _ error)
522 :
// markFilesLocked is not used currently, but it will be useful the next time
// we need to mark files for compaction. This blank assignment references the
// method so that it still counts as used.
var _ = (*DB)(nil).markFilesLocked
526 :
// markFilesLocked durably marks the files that match the given findFilesFunc for
// compaction.
//
// It is called with d.mu held. The mutex is released while findFn scans the
// LSM and is re-acquired before the results are applied. If findFn returns an
// error, that error is returned; if it finds nothing, nil is returned and no
// state is modified. Otherwise the matched files are marked in memory and the
// marks are persisted by forcing a MANIFEST rotation.
func (d *DB) markFilesLocked(findFn findFilesFunc) error {
	jobID := d.newJobIDLocked()

	// Acquire a read state to have a view of the LSM and a guarantee that none
	// of the referenced files will be deleted until we've unreferenced the read
	// state. Some findFilesFuncs may read the files, requiring they not be
	// deleted.
	rs := d.loadReadState()
	var (
		found bool
		files [numLevels][]*tableMetadata
		err   error
	)
	func() {
		// Deferred calls run in reverse order, so d.mu is re-locked (below)
		// before rs.unrefLocked runs.
		defer rs.unrefLocked()
		// Note the unusual locking: unlock, defer Lock(). The scan of the files in
		// the version does not need to block other operations that require the
		// DB.mu. Drop it for the scan, before re-acquiring it.
		d.mu.Unlock()
		defer d.mu.Lock()
		found, files, err = findFn(rs.current)
	}()
	if err != nil {
		return err
	}

	// The database lock has been acquired again by the defer within the above
	// anonymous function.
	if !found {
		// Nothing to do.
		return nil
	}

	// After scanning, if we found files to mark, we fetch the current state of
	// the LSM (which may have changed) and set MarkedForCompaction on the files,
	// and update the version's Stats.MarkedForCompaction count, which are both
	// protected by d.mu.

	// Lock the manifest for a coherent view of the LSM. The database lock has
	// been re-acquired by the defer within the above anonymous function.
	d.mu.versions.logLock()
	vers := d.mu.versions.currentVersion()
	for l, filesToMark := range files {
		if len(filesToMark) == 0 {
			// No matches on this level, so its annotations remain valid.
			continue
		}
		for _, f := range filesToMark {
			// Ignore files to be marked that have already been compacted or marked.
			if f.CompactionState == manifest.CompactionStateCompacted ||
				f.MarkedForCompaction {
				continue
			}
			// Else, mark the file for compaction in this version.
			vers.Stats.MarkedForCompaction++
			f.MarkedForCompaction = true
		}
		// The compaction picker uses the markedForCompactionAnnotator to
		// quickly find files marked for compaction, or to quickly determine
		// that there are no such files marked for compaction within a level.
		// A b-tree node may be annotated with an annotation recording that
		// there are no files marked for compaction within the node's subtree,
		// based on the assumption that it's static.
		//
		// Since we're marking files for compaction, these b-tree nodes'
		// annotations will be out of date. Clear the compaction-picking
		// annotation, so that it's recomputed the next time the compaction
		// picker looks for a file marked for compaction.
		markedForCompactionAnnotator.InvalidateLevelAnnotation(vers.Levels[l])
	}

	// The 'marked-for-compaction' bit is persisted in the MANIFEST file
	// metadata. We've already modified the in-memory table metadata, but the
	// manifest hasn't been updated. Force rotation to a new MANIFEST file,
	// which will write every table metadata to the new manifest file and ensure
	// that the now marked-for-compaction table metadata are persisted as marked.
	// NB: This call to logAndApply will unlock the MANIFEST, which we locked up
	// above before obtaining `vers`.
	return d.mu.versions.logAndApply(
		jobID,
		&manifest.VersionEdit{},
		map[int]*LevelMetrics{},
		true, /* forceRotation */
		func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) })
}
|