Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "fmt"
9 : "strconv"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/manifest"
14 : "github.com/cockroachdb/pebble/sstable"
15 : "github.com/cockroachdb/pebble/vfs"
16 : "github.com/cockroachdb/pebble/vfs/atomicfs"
17 : )
18 :
// FormatMajorVersion identifies the format in which a database's persisted
// data is written. Any backwards-incompatible change to a durable format is
// introduced behind a new, higher format major version.
//
// A database's format major version may be raised at any time. Once it has
// been raised, earlier versions of Pebble will refuse to open the database.
//
// The zero value is the FormatDefault constant. The concrete format major
// version that FormatDefault stands for may change over time.
type FormatMajorVersion uint64

// SafeValue implements redact.SafeValue.
func (v FormatMajorVersion) SafeValue() {}

// String implements fmt.Stringer.
func (v FormatMajorVersion) String() string {
	// NB: The output of this method must remain stable, because it is used
	// as the value of the on-disk version marker file.
	//
	// The only hard requirement is that the result always parses as a
	// base-10 integer that fits in a uint64. Today it is rendered as a
	// zero-padded number of at least three digits; the amount of padding
	// may change.
	const markerLayout = "%03d"
	return fmt.Sprintf(markerLayout, uint64(v))
}
44 :
// The format major versions known to this build, in ascending order. The
// numeric value of each constant comes from its position in this block (via
// iota), so constants must only ever be appended, never reordered or removed.
const (
	// 21.2 versions.

	// FormatDefault leaves the format version unspecified. The
	// FormatDefault constant may be ratcheted upwards over time.
	FormatDefault FormatMajorVersion = iota
	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	FormatMostCompatible
	// formatVersionedManifestMarker is the first
	// backwards-incompatible change made to Pebble, introducing the
	// format-version marker file for handling backwards-incompatible
	// changes more broadly, and replacing the `CURRENT` file with a
	// marker file.
	//
	// This format version is intended as an intermediary version state.
	// It is deliberately unexported to discourage direct use of this
	// format major version. Clients should use FormatVersioned which
	// also ensures earlier versions of Pebble fail to open a database
	// written in a future format major version.
	formatVersionedManifestMarker
	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	FormatVersioned
	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	FormatSetWithDelete

	// 22.1 versions.

	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	FormatBlockPropertyCollector
	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	FormatSplitUserKeysMarked

	// 22.2 versions.

	// FormatSplitUserKeysMarkedCompacted is a format major version that
	// guarantees that all files explicitly marked for compaction in the manifest
	// have been compacted. Combined with the FormatSplitUserKeysMarked format
	// major version, this version guarantees that there are no user keys split
	// across multiple files within a level L1+. Ratcheting to this format version
	// will block (without holding mutexes) until all necessary compactions for
	// files marked for compaction are complete.
	FormatSplitUserKeysMarkedCompacted
	// FormatRangeKeys is a format major version that introduces range keys.
	FormatRangeKeys
	// FormatMinTableFormatPebblev1 is a format major version that guarantees that
	// tables created by or ingested into the DB at or above this format major
	// version will have a table format version of at least Pebblev1 (Block
	// Properties).
	FormatMinTableFormatPebblev1
	// FormatPrePebblev1Marked is a format major version that guarantees that all
	// sstables with a table format version pre-Pebblev1 (i.e. those that are
	// guaranteed to not contain block properties) are marked for compaction in
	// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
	// holding mutexes) until the scan of the LSM is complete and the manifest has
	// been rotated.
	FormatPrePebblev1Marked

	// 23.1 versions.

	// FormatUnusedPrePebblev1MarkedCompacted is an unused format major version.
	// This format major version was originally intended to ship in the 23.1
	// release. It was later decided that this should be deferred until a
	// subsequent release. The original ordering is preserved so as not to
	// introduce breaking changes in Cockroach.
	FormatUnusedPrePebblev1MarkedCompacted

	// FormatSSTableValueBlocks is a format major version that adds support for
	// storing values in value blocks in the sstable. Value block support is not
	// necessarily enabled when writing sstables, when running with this format
	// major version.
	//
	// WARNING: In development, so no production code should upgrade to this
	// format, since a DB with this format major version will not actually
	// interoperate correctly with another DB with the same format major
	// version. This format major version is introduced so that tests can start
	// being executed up to this version. Note that these tests succeed despite
	// the incomplete support since they do not enable value blocks and use
	// TableFormatPebblev2.
	FormatSSTableValueBlocks

	// FormatFlushableIngest is a format major version that enables lazy
	// addition of ingested sstables into the LSM structure. When an ingest
	// overlaps with a memtable, a record of the ingest is written to the WAL
	// without waiting for a flush. Subsequent reads treat the ingested files as
	// a level above the overlapping memtable. Once the memtable is flushed, the
	// ingested files are moved into the lowest possible levels.
	//
	// This feature is behind a format major version because it required
	// breaking changes to the WAL format.
	FormatFlushableIngest

	// 23.2 versions.

	// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
	// that all sstables explicitly marked for compaction in the manifest (see
	// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
	// version will block (without holding mutexes) until all necessary
	// compactions for files marked for compaction are complete.
	FormatPrePebblev1MarkedCompacted

	// FormatDeleteSizedAndObsolete is a format major version that adds support
	// for deletion tombstones that encode the size of the value they're
	// expected to delete. This format major version is required before the
	// associated key kind may be committed through batch applications or
	// ingests. It also adds support for keys that are marked obsolete (see
	// sstable/format.go for details).
	FormatDeleteSizedAndObsolete

	// FormatVirtualSSTables is a format major version that adds support for
	// virtual sstables that can reference a sub-range of keys in an underlying
	// physical sstable. This information is persisted through new,
	// backward-incompatible fields in the Manifest, and therefore requires
	// a format major version.
	FormatVirtualSSTables

	// internalFormatNewest holds the newest format major version, including
	// experimental ones excluded from the exported FormatNewest constant until
	// they've stabilized. Used in tests.
	//
	// At this point iota is one greater than the value of the constant
	// declared immediately above, so `iota - 1` evaluates to that constant's
	// value. New format major versions must therefore be added directly
	// above this declaration.
	internalFormatNewest FormatMajorVersion = iota - 1

	// FormatNewest always contains the most recent format major version.
	FormatNewest FormatMajorVersion = internalFormatNewest
)
180 :
181 : // MaxTableFormat returns the maximum sstable.TableFormat that can be used at
182 : // this FormatMajorVersion.
183 1 : func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
184 1 : switch v {
185 : case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker,
186 1 : FormatVersioned, FormatSetWithDelete:
187 1 : return sstable.TableFormatRocksDBv2
188 : case FormatBlockPropertyCollector, FormatSplitUserKeysMarked,
189 1 : FormatSplitUserKeysMarkedCompacted:
190 1 : return sstable.TableFormatPebblev1
191 : case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked,
192 1 : FormatUnusedPrePebblev1MarkedCompacted:
193 1 : return sstable.TableFormatPebblev2
194 1 : case FormatSSTableValueBlocks, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted:
195 1 : return sstable.TableFormatPebblev3
196 1 : case FormatDeleteSizedAndObsolete, FormatVirtualSSTables:
197 1 : return sstable.TableFormatPebblev4
198 1 : default:
199 1 : panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
200 : }
201 : }
202 :
203 : // MinTableFormat returns the minimum sstable.TableFormat that can be used at
204 : // this FormatMajorVersion.
205 1 : func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
206 1 : switch v {
207 : case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker,
208 : FormatVersioned, FormatSetWithDelete, FormatBlockPropertyCollector,
209 : FormatSplitUserKeysMarked, FormatSplitUserKeysMarkedCompacted,
210 1 : FormatRangeKeys:
211 1 : return sstable.TableFormatLevelDB
212 : case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked,
213 : FormatUnusedPrePebblev1MarkedCompacted, FormatSSTableValueBlocks,
214 : FormatFlushableIngest, FormatPrePebblev1MarkedCompacted,
215 1 : FormatDeleteSizedAndObsolete, FormatVirtualSSTables:
216 1 : return sstable.TableFormatPebblev1
217 1 : default:
218 1 : panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
219 : }
220 : }
221 :
222 : // formatMajorVersionMigrations defines the migrations from one format
223 : // major version to the next. Each migration is defined as a closure
224 : // which will be invoked on the database before the new format major
225 : // version is committed. Migrations must be idempotent. Migrations are
226 : // invoked with d.mu locked.
227 : //
228 : // Each migration is responsible for invoking finalizeFormatVersUpgrade
229 : // to set the new format major version. RatchetFormatMajorVersion will
230 : // panic if a migration returns a nil error but fails to finalize the
231 : // new format major version.
232 : var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
233 0 : FormatMostCompatible: func(d *DB) error { return nil },
234 1 : formatVersionedManifestMarker: func(d *DB) error {
235 1 : // formatVersionedManifestMarker introduces the use of a marker
236 1 : // file for pointing to the current MANIFEST file.
237 1 :
238 1 : // Lock the manifest.
239 1 : d.mu.versions.logLock()
240 1 : defer d.mu.versions.logUnlock()
241 1 :
242 1 : // Construct the filename of the currently active manifest and
243 1 : // move the manifest marker to that filename. The marker is
244 1 : // guaranteed to exist, because we unconditionally locate it
245 1 : // during Open.
246 1 : manifestFileNum := d.mu.versions.manifestFileNum
247 1 : filename := base.MakeFilename(fileTypeManifest, manifestFileNum)
248 1 : if err := d.mu.versions.manifestMarker.Move(filename); err != nil {
249 0 : return errors.Wrap(err, "moving manifest marker")
250 0 : }
251 :
252 : // Now that we have a manifest marker file in place and pointing
253 : // to the current MANIFEST, finalize the upgrade. If we fail for
254 : // some reason, a retry of this migration is guaranteed to again
255 : // move the manifest marker file to the latest manifest. If
256 : // we're unable to finalize the upgrade, a subsequent call to
257 : // Open will ignore the manifest marker.
258 1 : if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil {
259 0 : return err
260 0 : }
261 :
262 : // We've finalized the upgrade. All subsequent Open calls will
263 : // ignore the CURRENT file and instead read the manifest marker.
264 : // Before we unlock the manifest, we need to update versionSet
265 : // to use the manifest marker on future rotations.
266 1 : d.mu.versions.setCurrent = setCurrentFuncMarker(
267 1 : d.mu.versions.manifestMarker,
268 1 : d.mu.versions.fs,
269 1 : d.mu.versions.dirname)
270 1 : return nil
271 : },
272 : // The FormatVersioned version is split into two, each with their
273 : // own migration to ensure the post-migration cleanup happens even
274 : // if there's a crash immediately after finalizing the version. Once
275 : // a new format major version is finalized, its migration will never
276 : // run again. Post-migration cleanup like the one in the migration
277 : // below must be performed in a separate migration or every time the
278 : // database opens.
279 1 : FormatVersioned: func(d *DB) error {
280 1 : // Replace the `CURRENT` file with one that points to the
281 1 : // nonexistent `MANIFEST-000000` file. If an earlier Pebble
282 1 : // version that does not know about format major versions
283 1 : // attempts to open the database, it will error avoiding
284 1 : // accidental corruption.
285 1 : if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, base.FileNum(0).DiskFileNum()); err != nil {
286 0 : return err
287 0 : }
288 1 : return d.finalizeFormatVersUpgrade(FormatVersioned)
289 : },
290 : // As SetWithDelete is a new key kind, there is nothing to migrate. We can
291 : // simply finalize the format version and we're done.
292 1 : FormatSetWithDelete: func(d *DB) error {
293 1 : return d.finalizeFormatVersUpgrade(FormatSetWithDelete)
294 1 : },
295 1 : FormatBlockPropertyCollector: func(d *DB) error {
296 1 : return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector)
297 1 : },
298 1 : FormatSplitUserKeysMarked: func(d *DB) error {
299 1 : // Mark any unmarked files with split-user keys. Note all format major
300 1 : // versions migrations are invoked with DB.mu locked.
301 1 : if err := d.markFilesLocked(markFilesWithSplitUserKeys(d.opts.Comparer.Equal)); err != nil {
302 0 : return err
303 0 : }
304 1 : return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked)
305 : },
306 1 : FormatSplitUserKeysMarkedCompacted: func(d *DB) error {
307 1 : // Before finalizing the format major version, rewrite any sstables
308 1 : // still marked for compaction. Note all format major versions
309 1 : // migrations are invoked with DB.mu locked.
310 1 : if err := d.compactMarkedFilesLocked(); err != nil {
311 0 : return err
312 0 : }
313 1 : return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarkedCompacted)
314 : },
315 1 : FormatRangeKeys: func(d *DB) error {
316 1 : return d.finalizeFormatVersUpgrade(FormatRangeKeys)
317 1 : },
318 1 : FormatMinTableFormatPebblev1: func(d *DB) error {
319 1 : return d.finalizeFormatVersUpgrade(FormatMinTableFormatPebblev1)
320 1 : },
321 1 : FormatPrePebblev1Marked: func(d *DB) error {
322 1 : // Mark any unmarked files that contain only table properties. Note all
323 1 : // format major versions migrations are invoked with DB.mu locked.
324 1 : if err := d.markFilesLocked(markFilesPrePebblev1(d.tableCache)); err != nil {
325 0 : return err
326 0 : }
327 1 : return d.finalizeFormatVersUpgrade(FormatPrePebblev1Marked)
328 : },
329 1 : FormatUnusedPrePebblev1MarkedCompacted: func(d *DB) error {
330 1 : // Intentional no-op.
331 1 : return d.finalizeFormatVersUpgrade(FormatUnusedPrePebblev1MarkedCompacted)
332 1 : },
333 1 : FormatSSTableValueBlocks: func(d *DB) error {
334 1 : return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks)
335 1 : },
336 1 : FormatFlushableIngest: func(d *DB) error {
337 1 : return d.finalizeFormatVersUpgrade(FormatFlushableIngest)
338 1 : },
339 1 : FormatPrePebblev1MarkedCompacted: func(d *DB) error {
340 1 : // Before finalizing the format major version, rewrite any sstables
341 1 : // still marked for compaction. Note all format major versions
342 1 : // migrations are invoked with DB.mu locked.
343 1 : if err := d.compactMarkedFilesLocked(); err != nil {
344 0 : return err
345 0 : }
346 1 : return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted)
347 : },
348 1 : FormatDeleteSizedAndObsolete: func(d *DB) error {
349 1 : return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete)
350 1 : },
351 1 : FormatVirtualSSTables: func(d *DB) error {
352 1 : return d.finalizeFormatVersUpgrade(FormatVirtualSSTables)
353 1 : },
354 : }
355 :
356 : const formatVersionMarkerName = `format-version`
357 :
358 : func lookupFormatMajorVersion(
359 : fs vfs.FS, dirname string,
360 1 : ) (FormatMajorVersion, *atomicfs.Marker, error) {
361 1 : m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName)
362 1 : if err != nil {
363 1 : return 0, nil, err
364 1 : }
365 1 : if versString == "" {
366 1 : return FormatMostCompatible, m, nil
367 1 : }
368 1 : v, err := strconv.ParseUint(versString, 10, 64)
369 1 : if err != nil {
370 0 : return 0, nil, errors.Wrap(err, "parsing format major version")
371 0 : }
372 1 : vers := FormatMajorVersion(v)
373 1 : if vers == FormatDefault {
374 0 : return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers)
375 0 : }
376 1 : if vers > internalFormatNewest {
377 1 : return 0, nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers)
378 1 : }
379 1 : return vers, m, nil
380 : }
381 :
382 : // FormatMajorVersion returns the database's active format major
383 : // version. The format major version may be higher than the one
384 : // provided in Options when the database was opened if the existing
385 : // database was written with a higher format version.
386 1 : func (d *DB) FormatMajorVersion() FormatMajorVersion {
387 1 : return FormatMajorVersion(d.mu.formatVers.vers.Load())
388 1 : }
389 :
390 : // RatchetFormatMajorVersion ratchets the opened database's format major
391 : // version to the provided version. It errors if the provided format
392 : // major version is below the database's current version. Once a
393 : // database's format major version is upgraded, previous Pebble versions
394 : // that do not know of the format version will be unable to open the
395 : // database.
396 1 : func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
397 1 : if err := d.closed.Load(); err != nil {
398 1 : panic(err)
399 : }
400 :
401 1 : d.mu.Lock()
402 1 : defer d.mu.Unlock()
403 1 : return d.ratchetFormatMajorVersionLocked(fmv)
404 : }
405 :
406 1 : func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
407 1 : if d.opts.ReadOnly {
408 0 : return ErrReadOnly
409 0 : }
410 1 : if formatVers > internalFormatNewest {
411 0 : // Guard against accidentally forgetting to update internalFormatNewest.
412 0 : return errors.Errorf("pebble: unknown format version %d", formatVers)
413 0 : }
414 1 : if currentVers := d.FormatMajorVersion(); currentVers > formatVers {
415 0 : return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
416 0 : currentVers, formatVers)
417 0 : }
418 1 : if d.mu.formatVers.ratcheting {
419 0 : return errors.Newf("pebble: database format major version upgrade is in-progress")
420 0 : }
421 1 : d.mu.formatVers.ratcheting = true
422 1 : defer func() { d.mu.formatVers.ratcheting = false }()
423 :
424 1 : for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ {
425 1 : if err := formatMajorVersionMigrations[nextVers](d); err != nil {
426 0 : return errors.Wrapf(err, "migrating to version %d", nextVers)
427 0 : }
428 :
429 : // NB: The migration is responsible for calling
430 : // finalizeFormatVersUpgrade to finalize the upgrade. This
431 : // structure is necessary because some migrations may need to
432 : // update in-memory state (without ever dropping locks) after
433 : // the upgrade is finalized. Here we assert that the upgrade
434 : // did occur.
435 1 : if d.FormatMajorVersion() != nextVers {
436 0 : d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers)
437 0 : }
438 : }
439 1 : return nil
440 : }
441 :
442 : // finalizeFormatVersUpgrade is typically only be called from within a
443 : // format major version migration.
444 : //
445 : // See formatMajorVersionMigrations.
446 1 : func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
447 1 : // We use the marker to encode the active format version in the
448 1 : // marker filename. Unlike other uses of the atomic marker, there is
449 1 : // no file with the filename `formatVers.String()` on the
450 1 : // filesystem.
451 1 : if err := d.mu.formatVers.marker.Move(formatVers.String()); err != nil {
452 0 : return err
453 0 : }
454 1 : d.mu.formatVers.vers.Store(uint64(formatVers))
455 1 : d.opts.EventListener.FormatUpgrade(formatVers)
456 1 : return nil
457 : }
458 :
459 : // compactMarkedFilesLocked performs a migration that schedules rewrite
460 : // compactions to compact away any sstables marked for compaction.
461 : // compactMarkedFilesLocked is run while ratcheting the database's format major
462 : // version to FormatSplitUserKeysMarkedCompacted.
463 : //
464 : // Note that while this method is called with the DB.mu held, and will not
465 : // return until all marked files have been compacted, the mutex is dropped while
466 : // waiting for compactions to complete (or for slots to free up).
467 1 : func (d *DB) compactMarkedFilesLocked() error {
468 1 : curr := d.mu.versions.currentVersion()
469 1 : for curr.Stats.MarkedForCompaction > 0 {
470 1 : // Attempt to schedule a compaction to rewrite a file marked for
471 1 : // compaction.
472 1 : d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction {
473 1 : return picker.pickRewriteCompaction(env)
474 1 : })
475 :
476 : // The above attempt might succeed and schedule a rewrite compaction. Or
477 : // there might not be available compaction concurrency to schedule the
478 : // compaction. Or compaction of the file might have already been in
479 : // progress. In any scenario, wait until there's some change in the
480 : // state of active compactions.
481 :
482 : // Before waiting, check that the database hasn't been closed. Trying to
483 : // schedule the compaction may have dropped d.mu while waiting for a
484 : // manifest write to complete. In that dropped interim, the database may
485 : // have been closed.
486 1 : if err := d.closed.Load(); err != nil {
487 0 : return err.(error)
488 0 : }
489 :
490 : // Some flush or compaction may have scheduled or completed while we waited
491 : // for the manifest lock in maybeScheduleCompactionPicker. Get the latest
492 : // Version before waiting on a compaction.
493 1 : curr = d.mu.versions.currentVersion()
494 1 :
495 1 : // Only wait on compactions if there are files still marked for compaction.
496 1 : // NB: Waiting on this condition variable drops d.mu while blocked.
497 1 : if curr.Stats.MarkedForCompaction > 0 {
498 1 : if d.mu.compact.compactingCount == 0 {
499 0 : panic("expected a compaction of marked files in progress")
500 : }
501 1 : d.mu.compact.cond.Wait()
502 1 : // Refresh the current version again.
503 1 : curr = d.mu.versions.currentVersion()
504 : }
505 : }
506 1 : return nil
507 : }
508 :
// findFilesFunc scans the LSM for files, returning true if at least one
// file was found. The returned array contains the matched files, if any, per
// level (indexed by level number). A non-nil error reports that the scan
// itself failed.
type findFilesFunc func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error)
513 :
514 : // markFilesWithSplitUserKeys scans the LSM's levels 1 through 6 for adjacent
515 : // files that contain the same user key. Such arrangements of files were
516 : // permitted in RocksDB and in Pebble up to SHA a860bbad.
517 1 : var markFilesWithSplitUserKeys = func(equal Equal) findFilesFunc {
518 1 : return func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) {
519 1 : // Files with split user keys are expected to be rare and performing key
520 1 : // comparisons for every file within the LSM is expensive, so drop the
521 1 : // database lock while scanning the file metadata.
522 1 : for l := numLevels - 1; l > 0; l-- {
523 1 : iter := v.Levels[l].Iter()
524 1 : var prevFile *fileMetadata
525 1 : var prevUserKey []byte
526 1 : for f := iter.First(); f != nil; f = iter.Next() {
527 1 : if prevUserKey != nil && equal(prevUserKey, f.Smallest.UserKey) {
528 1 : // NB: We may append a file twice, once as prevFile and once
529 1 : // as f. That's okay, and handled below.
530 1 : files[l] = append(files[l], prevFile, f)
531 1 : found = true
532 1 : }
533 1 : if f.Largest.IsExclusiveSentinel() {
534 0 : prevUserKey = nil
535 0 : prevFile = nil
536 1 : } else {
537 1 : prevUserKey = f.Largest.UserKey
538 1 : prevFile = f
539 1 : }
540 : }
541 : }
542 1 : return
543 : }
544 : }
545 :
546 : // markFilesPrePebblev1 scans the LSM for files that do not support block
547 : // properties (i.e. a table format version pre-Pebblev1).
548 1 : var markFilesPrePebblev1 = func(tc *tableCacheContainer) findFilesFunc {
549 1 : return func(v *version) (found bool, files [numLevels][]*fileMetadata, err error) {
550 1 : for l := numLevels - 1; l > 0; l-- {
551 1 : iter := v.Levels[l].Iter()
552 1 : for f := iter.First(); f != nil; f = iter.Next() {
553 1 : if f.Virtual {
554 0 : // Any physical sstable which has been virtualized must
555 0 : // have already undergone this migration, and we don't
556 0 : // need to worry about the virtual sstable themselves.
557 0 : panic("pebble: unexpected virtual sstable during migration")
558 : }
559 1 : err = tc.withReader(
560 1 : f.PhysicalMeta(), func(r *sstable.Reader) error {
561 1 : tf, err := r.TableFormat()
562 1 : if err != nil {
563 0 : return err
564 0 : }
565 1 : if tf < sstable.TableFormatPebblev1 {
566 1 : found = true
567 1 : files[l] = append(files[l], f)
568 1 : }
569 1 : return nil
570 : })
571 1 : if err != nil {
572 0 : return
573 0 : }
574 : }
575 : }
576 1 : return
577 : }
578 : }
579 :
580 : // markFilesLock durably marks the files that match the given findFilesFunc for
581 : // compaction.
582 1 : func (d *DB) markFilesLocked(findFn findFilesFunc) error {
583 1 : jobID := d.mu.nextJobID
584 1 : d.mu.nextJobID++
585 1 :
586 1 : // Acquire a read state to have a view of the LSM and a guarantee that none
587 1 : // of the referenced files will be deleted until we've unreferenced the read
588 1 : // state. Some findFilesFuncs may read the files, requiring they not be
589 1 : // deleted.
590 1 : rs := d.loadReadState()
591 1 : var (
592 1 : found bool
593 1 : files [numLevels][]*fileMetadata
594 1 : err error
595 1 : )
596 1 : func() {
597 1 : defer rs.unrefLocked()
598 1 : // Note the unusual locking: unlock, defer Lock(). The scan of the files in
599 1 : // the version does not need to block other operations that require the
600 1 : // DB.mu. Drop it for the scan, before re-acquiring it.
601 1 : d.mu.Unlock()
602 1 : defer d.mu.Lock()
603 1 : found, files, err = findFn(rs.current)
604 1 : }()
605 1 : if err != nil {
606 0 : return err
607 0 : }
608 :
609 : // The database lock has been acquired again by the defer within the above
610 : // anonymous function.
611 1 : if !found {
612 1 : // Nothing to do.
613 1 : return nil
614 1 : }
615 :
616 : // After scanning, if we found files to mark, we fetch the current state of
617 : // the LSM (which may have changed) and set MarkedForCompaction on the files,
618 : // and update the version's Stats.MarkedForCompaction count, which are both
619 : // protected by d.mu.
620 :
621 : // Lock the manifest for a coherent view of the LSM. The database lock has
622 : // been re-acquired by the defer within the above anonymous function.
623 1 : d.mu.versions.logLock()
624 1 : vers := d.mu.versions.currentVersion()
625 1 : for l, filesToMark := range files {
626 1 : if len(filesToMark) == 0 {
627 1 : continue
628 : }
629 1 : for _, f := range filesToMark {
630 1 : // Ignore files to be marked that have already been compacted or marked.
631 1 : if f.CompactionState == manifest.CompactionStateCompacted ||
632 1 : f.MarkedForCompaction {
633 0 : continue
634 : }
635 : // Else, mark the file for compaction in this version.
636 1 : vers.Stats.MarkedForCompaction++
637 1 : f.MarkedForCompaction = true
638 : }
639 : // The compaction picker uses the markedForCompactionAnnotator to
640 : // quickly find files marked for compaction, or to quickly determine
641 : // that there are no such files marked for compaction within a level.
642 : // A b-tree node may be annotated with an annotation recording that
643 : // there are no files marked for compaction within the node's subtree,
644 : // based on the assumption that it's static.
645 : //
646 : // Since we're marking files for compaction, these b-tree nodes'
647 : // annotations will be out of date. Clear the compaction-picking
648 : // annotation, so that it's recomputed the next time the compaction
649 : // picker looks for a file marked for compaction.
650 1 : vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{})
651 : }
652 :
653 : // The 'marked-for-compaction' bit is persisted in the MANIFEST file
654 : // metadata. We've already modified the in-memory file metadata, but the
655 : // manifest hasn't been updated. Force rotation to a new MANIFEST file,
656 : // which will write every file metadata to the new manifest file and ensure
657 : // that the now marked-for-compaction file metadata are persisted as marked.
658 : // NB: This call to logAndApply will unlockthe MANIFEST, which we locked up
659 : // above before obtaining `vers`.
660 1 : return d.mu.versions.logAndApply(
661 1 : jobID,
662 1 : &manifest.VersionEdit{},
663 1 : map[int]*LevelMetrics{},
664 1 : true, /* forceRotation */
665 1 : func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) })
666 : }
|