Line data Source code
1 : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package manifest
6 :
7 : import (
8 : "bytes"
9 : stdcmp "cmp"
10 : "fmt"
11 : "slices"
12 : "sort"
13 : "strings"
14 : "sync"
15 : "sync/atomic"
16 :
17 : "github.com/cockroachdb/errors"
18 : "github.com/cockroachdb/pebble/internal/base"
19 : "github.com/cockroachdb/pebble/internal/invariants"
20 : "github.com/cockroachdb/pebble/sstable"
21 : "github.com/cockroachdb/pebble/sstable/block"
22 : )
23 :
24 : // Compare exports the base.Compare type.
25 : type Compare = base.Compare
26 :
27 : // InternalKey exports the base.InternalKey type.
28 : type InternalKey = base.InternalKey
29 :
30 : // TableInfo contains the common information for table-related events.
31 : type TableInfo struct {
32 : // FileNum is the internal DB identifier for the table.
33 : FileNum base.FileNum
34 : // Size is the size of the file in bytes.
35 : Size uint64
36 : // Smallest is the smallest internal key in the table.
37 : Smallest InternalKey
38 : // Largest is the largest internal key in the table.
39 : Largest InternalKey
40 : // SmallestSeqNum is the smallest sequence number in the table.
41 : SmallestSeqNum base.SeqNum
42 : // LargestSeqNum is the largest sequence number in the table.
43 : LargestSeqNum base.SeqNum
44 : }
45 :
46 : // TableStats contains statistics on a table used for compaction heuristics,
47 : // and for export via Metrics.
48 : type TableStats struct {
49 : // The total number of entries in the table.
50 : NumEntries uint64
51 : // The number of point and range deletion entries in the table.
52 : NumDeletions uint64
53 : // NumRangeKeySets is the total number of range key sets in the table.
54 : //
55 : // NB: If there's a chance that the sstable contains any range key sets,
56 : // then NumRangeKeySets must be > 0.
57 : NumRangeKeySets uint64
58 : // Estimate of the total disk space that may be dropped by this table's
59 : // point deletions by compacting them.
60 : PointDeletionsBytesEstimate uint64
61 : // Estimate of the total disk space that may be dropped by this table's
62 : // range deletions by compacting them. This estimate is at data-block
63 : // granularity and is not updated if compactions beneath the table reduce
64 : // the amount of reclaimable disk space. It also does not account for
65 : // overlapping data in L0 and ignores L0 sublevels, but the error that
66 : // introduces is expected to be small.
67 : //
68 : // Tables in the bottommost level of the LSM may have a nonzero estimate if
69 : // snapshots or move compactions prevented the elision of their range
70 : // tombstones. A table in the bottommost level that was ingested into L6
71 : // will have a zero estimate, because the file's sequence numbers indicate
72 : // that the tombstone cannot drop any data contained within the file itself.
73 : RangeDeletionsBytesEstimate uint64
74 : // Total size of value blocks and value index block.
75 : ValueBlocksSize uint64
76 : // CompressionType is the compression type of the table.
77 : CompressionType block.Compression
78 : // TombstoneDenseBlocksRatio is the ratio of data blocks in this table that
79 : // fulfill at least one of the following:
80 : // 1. The block contains at least options.Experimental.NumDeletionsThreshold
81 : // point tombstones.
82 : // 2. The ratio of the uncompressed size of point tombstones to the
83 : // uncompressed size of the block is at least
84 : // options.Experimental.DeletionSizeRatioThreshold.
85 : // This statistic is used to determine eligibility for a tombstone density
86 : // compaction.
87 : TombstoneDenseBlocksRatio float64
88 : }
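
The TombstoneDenseBlocksRatio criterion above can be made concrete with a small standalone sketch. The blockInfo type and the threshold values are hypothetical stand-ins for per-block counters and for options.Experimental.NumDeletionsThreshold / DeletionSizeRatioThreshold; this illustrates the criterion, not Pebble's actual stats collector.

    // blockInfo is a hypothetical per-data-block summary.
    type blockInfo struct {
        numTombstones  int    // point tombstones in the block
        tombstoneBytes uint64 // uncompressed bytes occupied by those tombstones
        blockBytes     uint64 // uncompressed size of the block
    }

    // tombstoneDenseBlocksRatio counts blocks satisfying either criterion
    // and divides by the total block count.
    func tombstoneDenseBlocksRatio(blocks []blockInfo, numThreshold int, sizeRatio float64) float64 {
        if len(blocks) == 0 {
            return 0
        }
        dense := 0
        for _, b := range blocks {
            if b.numTombstones >= numThreshold ||
                float64(b.tombstoneBytes) >= sizeRatio*float64(b.blockBytes) {
                dense++
            }
        }
        return float64(dense) / float64(len(blocks))
    }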
89 :
90 : // boundType represents the type of key (point or range) present as the smallest
91 : // and largest keys.
92 : type boundType uint8
93 :
94 : const (
95 : boundTypePointKey boundType = iota + 1
96 : boundTypeRangeKey
97 : )
98 :
99 : // CompactionState is the compaction state of a file.
100 : //
101 : // The following shows the valid state transitions:
102 : //
103 : // NotCompacting --> Compacting --> Compacted
104 : //       ^               |
105 : //       |               |
106 : //       +-------<-------+
107 : //
108 : // Input files to a compaction transition to Compacting when a compaction is
109 : // picked. A file that has finished compacting typically transitions into the
110 : // Compacted state, at which point it is effectively obsolete ("zombied") and
111 : // will eventually be removed from the LSM. A file that has been move-compacted
112 : // will transition from Compacting back into the NotCompacting state, signaling
113 : // that the file may be selected for a subsequent compaction. A failed
114 : // compaction will result in all input tables transitioning from Compacting to
115 : // NotCompacting.
116 : //
117 : // This state is in-memory only. It is not persisted to the manifest.
118 : type CompactionState uint8
119 :
120 : // CompactionStates.
121 : const (
122 : CompactionStateNotCompacting CompactionState = iota
123 : CompactionStateCompacting
124 : CompactionStateCompacted
125 : )
126 :
127 : // String implements fmt.Stringer.
128 0 : func (s CompactionState) String() string {
129 0 : switch s {
130 0 : case CompactionStateNotCompacting:
131 0 : return "NotCompacting"
132 0 : case CompactionStateCompacting:
133 0 : return "Compacting"
134 0 : case CompactionStateCompacted:
135 0 : return "Compacted"
136 0 : default:
137 0 : panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
138 : }
139 : }
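
The transition diagram above reduces to a small validity predicate. A minimal sketch (validTransition is hypothetical, not part of the package), mirroring the checks that SetCompactionState performs under invariants builds later in this file:

    // validTransition reports whether a compaction state transition is legal
    // per the diagram above; CompactionStateCompacted is terminal.
    func validTransition(from, to CompactionState) bool {
        switch from {
        case CompactionStateNotCompacting:
            return to == CompactionStateCompacting
        case CompactionStateCompacting:
            return to == CompactionStateCompacted || to == CompactionStateNotCompacting
        default:
            return false
        }
    }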
140 :
141 : // FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
142 : // some version. FileMetadata does not contain the actual level of the sst,
143 : // since such leveled-ssts can move across levels in different versions, while
144 : // sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
145 : // and virtual. Underlying both leveled-ssts is a backing-sst, for which the
146 : // only state is FileBacking. A backing-sst is level-less. It is possible for a
147 : // backing-sst to be referred to by a physical sst in one version and by one or
148 : // more virtual ssts in one or more versions. A backing-sst becomes obsolete
149 : // and can be deleted once it is no longer required by any physical or virtual
150 : // sst in any version.
151 : //
152 : // We maintain some invariants:
153 : //
154 : // 1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
155 : // and there will be exactly one FileMetadata associated with the FileNum.
156 : //
157 : // 2. Within a version, a backing-sst is either only referred to by one
158 : // physical sst or one or more virtual ssts.
159 : //
160 : // 3. Once a backing-sst is referred to by a virtual sst in the latest version,
161 : // it cannot go back to being referred to by a physical sst in any future
162 : // version.
163 : //
164 : // Once a physical sst is no longer needed by any version, we will no longer
165 : // maintain the file metadata associated with it. We will still maintain the
166 : // FileBacking associated with the physical sst if the backing sst is required
167 : // by any virtual ssts in any version.
168 : type FileMetadata struct {
169 : // AllowedSeeks is used to determine if a file should be picked for a
170 : // read-triggered compaction. It is decremented by read sampling in
171 : // pebble.Iterator after every positioning operation that returns a user
172 : // key (e.g. Next, Prev, SeekGE, SeekLT).
173 : AllowedSeeks atomic.Int64
174 :
175 : // statsValid indicates if stats have been loaded for the table. The
176 : // TableStats structure is populated only if valid is true.
177 : statsValid atomic.Bool
178 :
179 : // FileBacking is the state that backs either a physical or a virtual
180 : // sstable.
181 : FileBacking *FileBacking
182 :
183 : // InitAllowedSeeks is the initial value of AllowedSeeks. It is used to
184 : // reset AllowedSeeks on a file once it hits 0.
185 : InitAllowedSeeks int64
186 : // FileNum is the file number.
187 : //
188 : // INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
189 : FileNum base.FileNum
190 : // Size is the size of the file, in bytes. Size is an approximate value for
191 : // virtual sstables.
192 : //
193 : // INVARIANTS:
194 : // - When !FileMetadata.Virtual, Size == FileBacking.Size.
195 : // - Size should be non-zero. Size 0 virtual sstables must not be created.
196 : Size uint64
197 : // File creation time in seconds since the epoch (1970-01-01 00:00:00
198 : // UTC). For ingested sstables, this corresponds to the time the file was
199 : // ingested. For virtual sstables, this corresponds to the wall clock time
200 : // when the FileMetadata for the virtual sstable was first created.
201 : CreationTime int64
202 : // LargestSeqNumAbsolute is an upper bound for the largest sequence number
203 : // in the table. This upper bound is guaranteed to be higher than any
204 : // sequence number any of the table's keys have held at any point in time
205 : // while the database has been open. Specifically, if the table contains
206 : // keys that have had their sequence numbers zeroed during a compaction,
207 : // LargestSeqNumAbsolute will be at least as high as the pre-zeroing
208 : // sequence number. LargestSeqNumAbsolute is NOT durably persisted, so after
209 : // a database restart it takes on the value of LargestSeqNum.
210 : LargestSeqNumAbsolute base.SeqNum
211 : // Lower and upper bounds for the smallest and largest sequence numbers in
212 : // the table, across both point and range keys. For physical sstables, these
213 : // values are tight bounds. For virtual sstables, there is no guarantee that
214 : // there will be keys with SmallestSeqNum or LargestSeqNum within virtual
215 : // sstable bounds.
216 : SmallestSeqNum base.SeqNum
217 : LargestSeqNum base.SeqNum
218 : // SmallestPointKey and LargestPointKey are the inclusive bounds for the
219 : // internal point keys stored in the table. This includes RANGEDELs, which
220 : // alter point keys.
221 : // NB: these fields should be set using ExtendPointKeyBounds. They are left
222 : // exported for reads as an optimization.
223 : SmallestPointKey InternalKey
224 : LargestPointKey InternalKey
225 : // SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
226 : // internal range keys stored in the table.
227 : // NB: these fields should be set using ExtendRangeKeyBounds. They are left
228 : // exported for reads as an optimization.
229 : SmallestRangeKey InternalKey
230 : LargestRangeKey InternalKey
231 : // Smallest and Largest are the inclusive bounds for the internal keys stored
232 : // in the table, across both point and range keys.
233 : // NB: these fields are derived from their point and range key equivalents,
234 : // and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
235 : Smallest InternalKey
236 : Largest InternalKey
237 : // Stats describe table statistics. Protected by DB.mu.
238 : //
239 : // For virtual sstables, set stats upon virtual sstable creation as
240 : // asynchronous computation of stats is not currently supported.
241 : //
242 : // TODO(bananabrick): To support manifest replay for virtual sstables, we
243 : // probably need to compute virtual sstable stats asynchronously. Otherwise,
244 : // we'd have to write virtual sstable stats to the version edit.
245 : Stats TableStats
246 :
247 : // For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
248 : // pick L0 compactions. Only accurate for the most recent Version.
249 : SubLevel int
250 : L0Index int
251 : minIntervalIndex int
252 : maxIntervalIndex int
253 :
254 : // NB: the alignment of this struct is 8 bytes. We pack all the bools to
255 : // ensure an optimal packing.
256 :
257 : // IsIntraL0Compacting is set to true if this file is part of an intra-L0
258 : // compaction. When it's true, IsCompacting must also return true. If
259 : // Compacting is true and IsIntraL0Compacting is false for an L0 file, the
260 : // file must be part of a compaction to Lbase.
261 : IsIntraL0Compacting bool
262 : CompactionState CompactionState
263 : // True if compaction of this file has been explicitly requested.
264 : // Previously, RocksDB and earlier versions of Pebble allowed this
265 : // flag to be set by a user table property collector. Some earlier
266 : // versions of Pebble respected this flag, while other more recent
267 : // versions ignored this flag.
268 : //
269 : // More recently this flag has been repurposed to facilitate the
270 : // compaction of 'atomic compaction units'. Files marked for
271 : // compaction are compacted in a rewrite compaction at the lowest
272 : // possible compaction priority.
273 : //
274 : // NB: A count of files marked for compaction is maintained on
275 : // Version, and compaction picking reads cached annotations
276 : // determined by this field.
277 : //
278 : // Protected by DB.mu.
279 : MarkedForCompaction bool
280 : // HasPointKeys tracks whether the table contains point keys (including
281 : // RANGEDELs). If a table contains only range deletions, HasPointKeys is
282 : // still true.
283 : HasPointKeys bool
284 : // HasRangeKeys tracks whether the table contains any range keys.
285 : HasRangeKeys bool
286 : // boundsSet tracks whether the overall bounds have been set.
287 : boundsSet bool
288 : // boundTypeSmallest and boundTypeLargest provide an indication as to which
289 : // key type (point or range) corresponds to the smallest and largest overall
290 : // table bounds.
291 : boundTypeSmallest, boundTypeLargest boundType
292 : // Virtual is true if the FileMetadata belongs to a virtual sstable.
293 : Virtual bool
294 :
295 : // SyntheticPrefixAndSuffix is used to prepend a prefix to all keys and/or
296 : // override all suffixes in a table; used for some virtual tables.
297 : SyntheticPrefixAndSuffix sstable.SyntheticPrefixAndSuffix
298 : }
299 :
300 : // InternalKeyBounds returns the set of overall table bounds.
301 0 : func (m *FileMetadata) InternalKeyBounds() (InternalKey, InternalKey) {
302 0 : return m.Smallest, m.Largest
303 0 : }
304 :
305 : // UserKeyBounds returns the user key bounds that correspond to m.Smallest and
306 : // Largest. Because we do not allow split user keys, the user key bounds of
307 : // files within a level do not overlap.
308 1 : func (m *FileMetadata) UserKeyBounds() base.UserKeyBounds {
309 1 : return base.UserKeyBoundsFromInternal(m.Smallest, m.Largest)
310 1 : }
311 :
312 : // UserKeyBoundsByType returns the user key bounds for the given key types.
313 : // Note that the returned bounds are invalid when requesting KeyTypePoint but
314 : // HasPointKeys is false, or when requesting KeyTypeRange and HasRangeKeys is
315 : // false.
316 1 : func (m *FileMetadata) UserKeyBoundsByType(keyType KeyType) base.UserKeyBounds {
317 1 : switch keyType {
318 1 : case KeyTypePoint:
319 1 : return base.UserKeyBoundsFromInternal(m.SmallestPointKey, m.LargestPointKey)
320 1 : case KeyTypeRange:
321 1 : return base.UserKeyBoundsFromInternal(m.SmallestRangeKey, m.LargestRangeKey)
322 0 : default:
323 0 : return base.UserKeyBoundsFromInternal(m.Smallest, m.Largest)
324 : }
325 : }
326 :
327 : // SyntheticSeqNum returns a SyntheticSeqNum which is set when SmallestSeqNum
328 : // equals LargestSeqNum.
329 1 : func (m *FileMetadata) SyntheticSeqNum() sstable.SyntheticSeqNum {
330 1 : if m.SmallestSeqNum == m.LargestSeqNum {
331 1 : return sstable.SyntheticSeqNum(m.SmallestSeqNum)
332 1 : }
333 1 : return sstable.NoSyntheticSeqNum
334 : }
335 :
336 : // IterTransforms returns an sstable.IterTransforms populated according to the
337 : // file.
338 1 : func (m *FileMetadata) IterTransforms() sstable.IterTransforms {
339 1 : return sstable.IterTransforms{
340 1 : SyntheticSeqNum: m.SyntheticSeqNum(),
341 1 : SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix,
342 1 : }
343 1 : }
344 :
345 : // FragmentIterTransforms returns an sstable.FragmentIterTransforms populated
346 : // according to the file.
347 1 : func (m *FileMetadata) FragmentIterTransforms() sstable.FragmentIterTransforms {
348 1 : return sstable.FragmentIterTransforms{
349 1 : SyntheticSeqNum: m.SyntheticSeqNum(),
350 1 : SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix,
351 1 : }
352 1 : }
353 :
354 : // PhysicalFileMeta is used by functions which want a guarantee that their input
355 : // belongs to a physical sst and not a virtual sst.
356 : //
357 : // NB: This type should only be constructed by calling
358 : // FileMetadata.PhysicalMeta.
359 : type PhysicalFileMeta struct {
360 : *FileMetadata
361 : }
362 :
363 : // VirtualFileMeta is used by functions which want a guarantee that their input
364 : // belongs to a virtual sst and not a physical sst.
365 : //
366 : // A VirtualFileMeta inherits all the same fields as a FileMetadata. These
367 : // fields have additional invariants imposed on them, and/or slightly varying
368 : // meanings:
369 : // - Smallest and Largest (and their counterparts
370 : // {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
371 : // key at that exact bound. We make the effort to determine the next smallest
372 : // or largest key in an sstable after virtualizing it, to maintain this
373 : // tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
374 : // could mean that a rangedel or range key ends at that user key, or has been
375 : // truncated to that user key.
376 : // - One invariant is that if a rangedel or range key is truncated on its
377 : // upper bound, the virtual sstable *must* have a rangedel or range key
378 : // sentinel key as its upper bound. This is because truncation yields
379 : // an exclusive upper bound for the rangedel/rangekey, and if there are
380 : // any points at that exclusive upper bound within the same virtual
381 : // sstable, those could get uncovered by this truncation. We enforce this
382 : // invariant in calls to keyspan.Truncate.
383 : // - Size is an estimate of the size of the virtualized portion of this sstable.
384 : // The underlying file's size is stored in FileBacking.Size, though it could
385 : // also be estimated or could correspond to just the referenced portion of
386 : // a file (eg. if the file originated on another node).
387 : // - Size must be > 0.
388 : // - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
389 : // This means that all keys in the virtual sstable must have seqnums within
390 : // [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
391 : // a key with a seqnum at either of the bounds. Calculating tight seqnum
392 : // bounds would be too expensive and deliver little value.
393 : //
394 : // NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
395 : type VirtualFileMeta struct {
396 : *FileMetadata
397 : }
398 :
399 : // VirtualReaderParams fills in the parameters necessary to create a virtual
400 : // sstable reader.
401 1 : func (m VirtualFileMeta) VirtualReaderParams(isShared bool) sstable.VirtualReaderParams {
402 1 : return sstable.VirtualReaderParams{
403 1 : Lower: m.Smallest,
404 1 : Upper: m.Largest,
405 1 : FileNum: m.FileNum,
406 1 : IsSharedIngested: isShared && m.SyntheticSeqNum() != 0,
407 1 : Size: m.Size,
408 1 : BackingSize: m.FileBacking.Size,
409 1 : }
410 1 : }
411 :
412 : // PhysicalMeta should be the only source of creating the PhysicalFileMeta
413 : // wrapper type.
414 1 : func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
415 1 : if m.Virtual {
416 0 : panic("pebble: file metadata does not belong to a physical sstable")
417 : }
418 1 : return PhysicalFileMeta{
419 1 : m,
420 1 : }
421 : }
422 :
423 : // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
424 : // type.
425 1 : func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
426 1 : if !m.Virtual {
427 0 : panic("pebble: file metadata does not belong to a virtual sstable")
428 : }
429 1 : return VirtualFileMeta{
430 1 : m,
431 1 : }
432 : }
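
A minimal usage sketch of the wrapper types: a function that must only ever see physical sstables takes PhysicalFileMeta, so the guarantee is established once at construction (via the panics above) rather than rechecked at every call site. backingSizeOfPhysical is hypothetical, written as if inside package manifest.

    // backingSizeOfPhysical relies on the invariant that, for physical
    // sstables, Size == FileBacking.Size.
    func backingSizeOfPhysical(meta PhysicalFileMeta) uint64 {
        return meta.FileBacking.Size
    }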
433 :
434 : // FileBacking either backs a single physical sstable, or one or more virtual
435 : // sstables.
436 : //
437 : // See the comment above the FileMetadata type for sstable terminology.
438 : type FileBacking struct {
439 : DiskFileNum base.DiskFileNum
440 : Size uint64
441 :
442 : // Reference count for the backing file, used to determine when a backing file
443 : // is obsolete and can be removed.
444 : //
445 : // The reference count is at least the number of distinct tables that use this
446 : // backing across all versions that have a non-zero reference count. The tables
447 : // in each version are maintained in a copy-on-write B-tree and each B-tree node
448 : // keeps a reference on the respective backings.
449 : //
450 : // In addition, a reference count is taken for every backing in the latest
451 : // version's VirtualBackings (necessary to support Protect/Unprotect).
452 : refs atomic.Int32
453 : }
454 :
455 : // MustHaveRefs asserts that the backing has a positive refcount.
456 1 : func (b *FileBacking) MustHaveRefs() {
457 1 : if refs := b.refs.Load(); refs <= 0 {
458 0 : panic(errors.AssertionFailedf("backing %s must have positive refcount (refs=%d)",
459 0 : b.DiskFileNum, refs))
460 : }
461 : }
462 :
463 : // Ref increments the backing's ref count.
464 1 : func (b *FileBacking) Ref() {
465 1 : b.refs.Add(1)
466 1 : }
467 :
468 : // IsUnused returns whether the backing is not in use by any tables in a
469 : // version or B-tree.
470 1 : func (b *FileBacking) IsUnused() bool {
471 1 : return b.refs.Load() == 0
472 1 : }
473 :
474 : // Unref decrements the backing's ref count (and returns the new count).
475 1 : func (b *FileBacking) Unref() int32 {
476 1 : v := b.refs.Add(-1)
477 1 : if invariants.Enabled && v < 0 {
478 0 : panic("pebble: invalid FileMetadata refcounting")
479 : }
480 1 : return v
481 : }
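
A minimal sketch of the refcount lifecycle, as if written inside package manifest: B-tree nodes and the latest version's VirtualBackings hold references, and the on-disk file becomes deletable only once the count reaches zero. releaseBacking and the deleteObsolete callback are hypothetical.

    func releaseBacking(b *FileBacking, deleteObsolete func(base.DiskFileNum)) {
        if b.Unref() == 0 {
            // No version, B-tree node, or VirtualBackings entry references
            // the backing any longer; the on-disk file is obsolete.
            deleteObsolete(b.DiskFileNum)
        }
    }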
482 :
483 : // InitPhysicalBacking allocates and sets the FileBacking which is required by a
484 : // physical sstable FileMetadata.
485 : //
486 : // Ensure that the state required by FileBacking, such as the FileNum, is
487 : // already set on the FileMetadata before InitPhysicalBacking is called.
488 : // This ordering is not required in tests that don't rely on FileBacking.
490 1 : func (m *FileMetadata) InitPhysicalBacking() {
491 1 : if m.Virtual {
492 0 : panic("pebble: virtual sstables should use a pre-existing FileBacking")
493 : }
494 1 : if m.FileBacking == nil {
495 1 : m.FileBacking = &FileBacking{
496 1 : DiskFileNum: base.PhysicalTableDiskFileNum(m.FileNum),
497 1 : Size: m.Size,
498 1 : }
499 1 : }
500 : }
501 :
502 : // InitProviderBacking creates a new FileBacking for a file backed by
503 : // an objstorage.Provider.
504 1 : func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum, size uint64) {
505 1 : if !m.Virtual {
506 0 : panic("pebble: provider-backed sstables must be virtual")
507 : }
508 1 : if m.FileBacking == nil {
509 1 : m.FileBacking = &FileBacking{DiskFileNum: fileNum}
510 1 : }
511 1 : m.FileBacking.Size = size
512 : }
513 :
514 : // ValidateVirtual should be called once the FileMetadata for a virtual sstable
515 : // is created to verify that the fields of the virtual sstable are sound.
516 1 : func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
517 1 : switch {
518 0 : case !m.Virtual:
519 0 : panic("pebble: invalid virtual sstable")
520 0 : case createdFrom.SmallestSeqNum != m.SmallestSeqNum:
521 0 : panic("pebble: invalid smallest sequence number for virtual sstable")
522 0 : case createdFrom.LargestSeqNum != m.LargestSeqNum:
523 0 : panic("pebble: invalid largest sequence number for virtual sstable")
524 0 : case createdFrom.LargestSeqNumAbsolute != m.LargestSeqNumAbsolute:
525 0 : panic("pebble: invalid largest absolute sequence number for virtual sstable")
526 0 : case createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking:
527 0 : panic("pebble: invalid physical sstable state for virtual sstable")
528 0 : case m.Size == 0:
529 0 : panic("pebble: virtual sstable size must be set upon creation")
530 : }
531 : }
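
A minimal sketch, as if in package manifest, of deriving a virtual sstable's metadata from the physical file it virtualizes: the virtual file shares the parent's FileBacking and sequence number bounds and must be given a nonzero size estimate, which ValidateVirtual then checks. Key bounds would still be set separately via the Extend*KeyBounds methods; newVirtualFrom is hypothetical.

    func newVirtualFrom(parent *FileMetadata, fileNum base.FileNum, size uint64) *FileMetadata {
        m := &FileMetadata{
            FileNum:               fileNum,
            Size:                  size, // estimate of the virtualized portion; must be > 0
            SmallestSeqNum:        parent.SmallestSeqNum,
            LargestSeqNum:         parent.LargestSeqNum,
            LargestSeqNumAbsolute: parent.LargestSeqNumAbsolute,
            Virtual:               true,
            FileBacking:           parent.FileBacking,
        }
        m.ValidateVirtual(parent)
        return m
    }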
532 :
533 : // SetCompactionState transitions this file's compaction state to the given
534 : // state. Protected by DB.mu.
535 1 : func (m *FileMetadata) SetCompactionState(to CompactionState) {
536 1 : if invariants.Enabled {
537 1 : transitionErr := func() error {
538 0 : return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
539 0 : }
540 1 : switch m.CompactionState {
541 1 : case CompactionStateNotCompacting:
542 1 : if to != CompactionStateCompacting {
543 0 : panic(transitionErr())
544 : }
545 1 : case CompactionStateCompacting:
546 1 : if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
547 0 : panic(transitionErr())
548 : }
549 0 : case CompactionStateCompacted:
550 0 : panic(transitionErr())
551 0 : default:
552 0 : panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
553 : }
554 : }
555 1 : m.CompactionState = to
556 : }
557 :
558 : // IsCompacting returns true if this file's compaction state is
559 : // CompactionStateCompacting. Protected by DB.mu.
560 1 : func (m *FileMetadata) IsCompacting() bool {
561 1 : return m.CompactionState == CompactionStateCompacting
562 1 : }
563 :
564 : // StatsValid returns true if the table stats have been populated. If StatsValid
565 : // returns true, the Stats field may be read (with or without holding the
566 : // database mutex).
567 1 : func (m *FileMetadata) StatsValid() bool {
568 1 : return m.statsValid.Load()
569 1 : }
570 :
571 : // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
572 : // while populating TableStats and calling StatsMarkValid. Once stats are
573 : // populated, they must not be mutated.
574 1 : func (m *FileMetadata) StatsMarkValid() {
575 1 : m.statsValid.Store(true)
576 1 : }
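
A minimal sketch of the publication protocol described above: stats are populated and marked valid while holding DB.mu, after which readers may consult them without the mutex. d.mu and computedStats are hypothetical stand-ins for pebble's DB.mu and a computed TableStats value.

    d.mu.Lock()
    m.Stats = computedStats // must not be mutated after publication
    m.StatsMarkValid()
    d.mu.Unlock()

    if m.StatsValid() {
        _ = m.Stats.NumEntries // safe to read without holding DB.mu
    }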
577 :
578 : // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
579 : // and overall table bounds with the given smallest and largest keys. The
580 : // smallest and largest bounds may not be extended if the table already has a
581 : // bound that is smaller or larger, respectively. The receiver is returned.
582 : // NB: calling this method should be preferred to manually setting the bounds by
583 : // manipulating the fields directly, to maintain certain invariants.
584 : func (m *FileMetadata) ExtendPointKeyBounds(
585 : cmp Compare, smallest, largest InternalKey,
586 1 : ) *FileMetadata {
587 1 : // Update the point key bounds.
588 1 : if !m.HasPointKeys {
589 1 : m.SmallestPointKey, m.LargestPointKey = smallest, largest
590 1 : m.HasPointKeys = true
591 1 : } else {
592 1 : if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
593 1 : m.SmallestPointKey = smallest
594 1 : }
595 1 : if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
596 1 : m.LargestPointKey = largest
597 1 : }
598 : }
599 : // Update the overall bounds.
600 1 : m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
601 1 : return m
602 : }
603 :
604 : // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
605 : // and overall table bounds with the given smallest and largest keys. The
606 : // smallest and largest bounds may not be extended if the table already has a
607 : // bound that is smaller or larger, respectively. The receiver is returned.
608 : // NB: calling this method should be preferred to manually setting the bounds by
609 : // manipulating the fields directly, to maintain certain invariants.
610 : func (m *FileMetadata) ExtendRangeKeyBounds(
611 : cmp Compare, smallest, largest InternalKey,
612 1 : ) *FileMetadata {
613 1 : // Update the range key bounds.
614 1 : if !m.HasRangeKeys {
615 1 : m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
616 1 : m.HasRangeKeys = true
617 1 : } else {
618 0 : if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
619 0 : m.SmallestRangeKey = smallest
620 0 : }
621 0 : if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
622 0 : m.LargestRangeKey = largest
623 0 : }
624 : }
625 : // Update the overall bounds.
626 1 : m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
627 1 : return m
628 : }
629 :
630 : // extendOverallBounds attempts to extend the overall table lower and upper
631 : // bounds. The given bounds may not be used if a lower or upper bound already
632 : // exists that is smaller or larger than the given keys, respectively. The given
633 : // boundType will be used if the bounds are updated.
634 : func (m *FileMetadata) extendOverallBounds(
635 : cmp Compare, smallest, largest InternalKey, bTyp boundType,
636 1 : ) {
637 1 : if !m.boundsSet {
638 1 : m.Smallest, m.Largest = smallest, largest
639 1 : m.boundsSet = true
640 1 : m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
641 1 : } else {
642 1 : if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
643 1 : m.Smallest = smallest
644 1 : m.boundTypeSmallest = bTyp
645 1 : }
646 1 : if base.InternalCompare(cmp, largest, m.Largest) > 0 {
647 1 : m.Largest = largest
648 1 : m.boundTypeLargest = bTyp
649 1 : }
650 : }
651 : }
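
A minimal usage sketch of the bounds-extension methods above: starting from a fresh FileMetadata, each call widens the per-type bounds and the overall bounds together, keeping the boundType markers in sync. The comparer and keys are illustrative.

    cmp := base.DefaultComparer.Compare
    m := &FileMetadata{FileNum: 1, Size: 1}
    m.ExtendPointKeyBounds(cmp,
        base.MakeInternalKey([]byte("a"), 5, base.InternalKeyKindSet),
        base.MakeInternalKey([]byte("m"), 3, base.InternalKeyKindSet),
    )
    m.ExtendRangeKeyBounds(cmp,
        base.MakeInternalKey([]byte("k"), 4, base.InternalKeyKindRangeKeySet),
        base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeySet, []byte("z")),
    )
    // m.Smallest is now a#5,SET (a point key) and m.Largest is the exclusive
    // range key sentinel at z.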
652 :
653 : // Overlaps returns true if the file key range overlaps with the given user key bounds.
654 1 : func (m *FileMetadata) Overlaps(cmp Compare, bounds *base.UserKeyBounds) bool {
655 1 : b := m.UserKeyBounds()
656 1 : return b.Overlaps(cmp, bounds)
657 1 : }
658 :
659 : // ContainedWithinSpan returns true if the file's key range is completely
660 : // contained within the given range ("end" is assumed to be exclusive).
661 0 : func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
662 0 : lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
663 0 : return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
664 0 : }
665 :
666 : // ContainsKeyType returns whether or not the file contains keys of the provided
667 : // type.
668 1 : func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
669 1 : switch kt {
670 1 : case KeyTypePointAndRange:
671 1 : return true
672 1 : case KeyTypePoint:
673 1 : return m.HasPointKeys
674 1 : case KeyTypeRange:
675 1 : return m.HasRangeKeys
676 0 : default:
677 0 : panic("unrecognized key type")
678 : }
679 : }
680 :
681 : // SmallestBound returns the file's smallest bound of the key type. It returns a
682 : // false second return value if the file does not contain any keys of the key
683 : // type.
684 1 : func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
685 1 : switch kt {
686 0 : case KeyTypePointAndRange:
687 0 : return &m.Smallest, true
688 1 : case KeyTypePoint:
689 1 : return &m.SmallestPointKey, m.HasPointKeys
690 1 : case KeyTypeRange:
691 1 : return &m.SmallestRangeKey, m.HasRangeKeys
692 0 : default:
693 0 : panic("unrecognized key type")
694 : }
695 : }
696 :
697 : // LargestBound returns the file's largest bound of the key type. It returns a
698 : // false second return value if the file does not contain any keys of the key
699 : // type.
700 1 : func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
701 1 : switch kt {
702 0 : case KeyTypePointAndRange:
703 0 : return &m.Largest, true
704 1 : case KeyTypePoint:
705 1 : return &m.LargestPointKey, m.HasPointKeys
706 1 : case KeyTypeRange:
707 1 : return &m.LargestRangeKey, m.HasRangeKeys
708 0 : default:
709 0 : panic("unrecognized key type")
710 : }
711 : }
712 :
713 : const (
714 : maskContainsPointKeys = 1 << 0
715 : maskSmallest = 1 << 1
716 : maskLargest = 1 << 2
717 : )
718 :
719 : // boundsMarker returns a marker byte whose bits encode the following
720 : // information (in order from least significant bit):
721 : // - if the table contains point keys
722 : // - if the table's smallest key is a point key
723 : // - if the table's largest key is a point key
724 1 : func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
725 1 : if m.HasPointKeys {
726 1 : sentinel |= maskContainsPointKeys
727 1 : }
728 1 : switch m.boundTypeSmallest {
729 1 : case boundTypePointKey:
730 1 : sentinel |= maskSmallest
731 1 : case boundTypeRangeKey:
732 : // No op - leave bit unset.
733 0 : default:
734 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
735 : }
736 1 : switch m.boundTypeLargest {
737 1 : case boundTypePointKey:
738 1 : sentinel |= maskLargest
739 1 : case boundTypeRangeKey:
740 : // No op - leave bit unset.
741 0 : default:
742 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
743 : }
744 1 : return
745 : }
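
A hypothetical inverse of boundsMarker, showing how the three mask bits defined above would be read back:

    func decodeBoundsMarker(marker uint8) (hasPointKeys, smallestIsPoint, largestIsPoint bool) {
        hasPointKeys = marker&maskContainsPointKeys != 0
        smallestIsPoint = marker&maskSmallest != 0
        largestIsPoint = marker&maskLargest != 0
        return hasPointKeys, smallestIsPoint, largestIsPoint
    }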
746 :
747 : // String implements fmt.Stringer, printing the file number and the overall
748 : // table bounds.
749 1 : func (m *FileMetadata) String() string {
750 1 : return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
751 1 : }
752 :
753 : // DebugString returns a verbose representation of FileMetadata, typically for
754 : // use in tests and debugging, returning the file number and the point, range
755 : // and overall bounds for the table.
756 1 : func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
757 1 : var b bytes.Buffer
758 1 : if m.Virtual {
759 1 : fmt.Fprintf(&b, "%s(%s):[%s-%s]",
760 1 : m.FileNum, m.FileBacking.DiskFileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
761 1 : } else {
762 1 : fmt.Fprintf(&b, "%s:[%s-%s]",
763 1 : m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
764 1 : }
765 1 : if !verbose {
766 1 : return b.String()
767 1 : }
768 0 : fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
769 0 : if m.HasPointKeys {
770 0 : fmt.Fprintf(&b, " points:[%s-%s]",
771 0 : m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
772 0 : }
773 0 : if m.HasRangeKeys {
774 0 : fmt.Fprintf(&b, " ranges:[%s-%s]",
775 0 : m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
776 0 : }
777 0 : if m.Size != 0 {
778 0 : fmt.Fprintf(&b, " size:%d", m.Size)
779 0 : }
780 0 : return b.String()
781 : }
782 :
783 : // ParseFileMetadataDebug parses a FileMetadata from its DebugString
784 : // representation.
785 0 : func ParseFileMetadataDebug(s string) (_ *FileMetadata, err error) {
786 0 : defer func() {
787 0 : if r := recover(); r != nil {
788 0 : err = errors.CombineErrors(err, errFromPanic(r))
789 0 : }
790 : }()
791 :
792 : // Input format:
793 : // 000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...] size:5
794 0 : m := &FileMetadata{}
795 0 : p := makeDebugParser(s)
796 0 : m.FileNum = p.FileNum()
797 0 : var backingNum base.DiskFileNum
798 0 : if p.Peek() == "(" {
799 0 : p.Expect("(")
800 0 : backingNum = p.DiskFileNum()
801 0 : p.Expect(")")
802 0 : }
803 0 : p.Expect(":", "[")
804 0 : m.Smallest = p.InternalKey()
805 0 : p.Expect("-")
806 0 : m.Largest = p.InternalKey()
807 0 : p.Expect("]")
808 0 :
809 0 : for !p.Done() {
810 0 : field := p.Next()
811 0 : p.Expect(":")
812 0 : switch field {
813 0 : case "seqnums":
814 0 : p.Expect("[")
815 0 : m.SmallestSeqNum = p.SeqNum()
816 0 : p.Expect("-")
817 0 : m.LargestSeqNum = p.SeqNum()
818 0 : p.Expect("]")
819 0 : m.LargestSeqNumAbsolute = m.LargestSeqNum
820 :
821 0 : case "points":
822 0 : p.Expect("[")
823 0 : m.SmallestPointKey = p.InternalKey()
824 0 : p.Expect("-")
825 0 : m.LargestPointKey = p.InternalKey()
826 0 : m.HasPointKeys = true
827 0 : p.Expect("]")
828 :
829 0 : case "ranges":
830 0 : p.Expect("[")
831 0 : m.SmallestRangeKey = p.InternalKey()
832 0 : p.Expect("-")
833 0 : m.LargestRangeKey = p.InternalKey()
834 0 : m.HasRangeKeys = true
835 0 : p.Expect("]")
836 :
837 0 : case "size":
838 0 : m.Size = p.Uint64()
839 :
840 0 : default:
841 0 : p.Errf("unknown field %q", field)
842 : }
843 : }
844 :
845 : // By default, when the parser sees just the overall bounds, we set the point
846 : // keys. This preserves backwards compatibility with existing test cases that
847 : // specify only the overall bounds.
848 0 : if !m.HasPointKeys && !m.HasRangeKeys {
849 0 : m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
850 0 : m.HasPointKeys = true
851 0 : }
852 0 : if backingNum == 0 {
853 0 : m.InitPhysicalBacking()
854 0 : } else {
855 0 : m.Virtual = true
856 0 : m.InitProviderBacking(backingNum, 0 /* size */)
857 0 : }
858 0 : return m, nil
859 : }
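
A minimal usage sketch of the round trip between DebugString and ParseFileMetadataDebug, using the input format documented above. The literal values are illustrative, and manifest is an internal package, so this only compiles inside the pebble module.

    m, err := manifest.ParseFileMetadataDebug("000001:[a#5,SET-z#7,SET] seqnums:[5-7] size:1024")
    if err != nil {
        return err
    }
    // Only the overall bounds were given, so the parser also set the point
    // key bounds and initialized a physical backing.
    fmt.Println(m.DebugString(base.DefaultComparer.FormatKey, true))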
860 :
861 : // Validate validates the metadata for consistency with itself, returning an
862 : // error if inconsistent.
863 1 : func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
864 1 : // Combined range and point key validation.
865 1 :
866 1 : if !m.HasPointKeys && !m.HasRangeKeys {
867 0 : return base.CorruptionErrorf("file %s has neither point nor range keys",
868 0 : errors.Safe(m.FileNum))
869 0 : }
870 1 : if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
871 0 : return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
872 0 : errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
873 0 : m.Largest.Pretty(formatKey))
874 0 : }
875 1 : if m.SmallestSeqNum > m.LargestSeqNum {
876 0 : return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
877 0 : errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
878 0 : }
879 1 : if m.LargestSeqNumAbsolute < m.LargestSeqNum {
880 0 : return base.CorruptionErrorf("file %s has inconsistent absolute largest seqnum bounds: %d vs %d",
881 0 : errors.Safe(m.FileNum), m.LargestSeqNumAbsolute, m.LargestSeqNum)
882 0 : }
883 :
884 : // Point key validation.
885 :
886 1 : if m.HasPointKeys {
887 1 : if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
888 0 : return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
889 0 : errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
890 0 : m.LargestPointKey.Pretty(formatKey))
891 0 : }
892 1 : if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
893 1 : base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
894 0 : return base.CorruptionErrorf(
895 0 : "file %s has inconsistent point key bounds relative to overall bounds: "+
896 0 : "overall = [%s-%s], point keys = [%s-%s]",
897 0 : errors.Safe(m.FileNum),
898 0 : m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
899 0 : m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
900 0 : )
901 0 : }
902 1 : if !isValidPointBoundKeyKind[m.SmallestPointKey.Kind()] {
903 0 : return base.CorruptionErrorf("file %s has invalid smallest point key kind", m)
904 0 : }
905 1 : if !isValidPointBoundKeyKind[m.LargestPointKey.Kind()] {
906 0 : return base.CorruptionErrorf("file %s has invalid largest point key kind", m)
907 0 : }
908 : }
909 :
910 : // Range key validation.
911 :
912 1 : if m.HasRangeKeys {
913 1 : if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
914 0 : return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
915 0 : errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
916 0 : m.LargestRangeKey.Pretty(formatKey))
917 0 : }
918 1 : if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
919 1 : base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
920 0 : return base.CorruptionErrorf(
921 0 : "file %s has inconsistent range key bounds relative to overall bounds: "+
922 0 : "overall = [%s-%s], range keys = [%s-%s]",
923 0 : errors.Safe(m.FileNum),
924 0 : m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
925 0 : m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
926 0 : )
927 0 : }
928 1 : if !isValidRangeKeyBoundKeyKind[m.SmallestRangeKey.Kind()] {
929 0 : return base.CorruptionErrorf("file %s has invalid smallest range key kind", m)
930 0 : }
931 1 : if !isValidRangeKeyBoundKeyKind[m.LargestRangeKey.Kind()] {
932 0 : return base.CorruptionErrorf("file %s has invalid largest range key kind", m)
933 0 : }
934 : }
935 :
936 : // Ensure that FileMetadata.Init was called.
937 1 : if m.FileBacking == nil {
938 0 : return base.CorruptionErrorf("file metadata FileBacking not set")
939 0 : }
940 :
941 1 : if m.SyntheticPrefixAndSuffix.HasPrefix() {
942 1 : if !m.Virtual {
943 0 : return base.CorruptionErrorf("non-virtual file with synthetic prefix")
944 0 : }
945 1 : if !bytes.HasPrefix(m.Smallest.UserKey, m.SyntheticPrefixAndSuffix.Prefix()) {
946 0 : return base.CorruptionErrorf("virtual file with synthetic prefix has smallest key with a different prefix: %s", m.Smallest.Pretty(formatKey))
947 0 : }
948 1 : if !bytes.HasPrefix(m.Largest.UserKey, m.SyntheticPrefixAndSuffix.Prefix()) {
949 0 : return base.CorruptionErrorf("virtual file with synthetic prefix has largest key with a different prefix: %s", m.Largest.Pretty(formatKey))
950 0 : }
951 : }
952 :
953 1 : if m.SyntheticPrefixAndSuffix.HasSuffix() {
954 1 : if !m.Virtual {
955 0 : return base.CorruptionErrorf("non-virtual file with synthetic suffix")
956 0 : }
957 : }
958 :
959 1 : return nil
960 : }
961 :
962 : var (
963 : isValidPointBoundKeyKind = [base.InternalKeyKindMax + 1]bool{
964 : base.InternalKeyKindDelete: true,
965 : base.InternalKeyKindSet: true,
966 : base.InternalKeyKindMerge: true,
967 : base.InternalKeyKindSingleDelete: true,
968 : base.InternalKeyKindRangeDelete: true,
969 : base.InternalKeyKindSetWithDelete: true,
970 : base.InternalKeyKindDeleteSized: true,
971 : }
972 : isValidRangeKeyBoundKeyKind = [base.InternalKeyKindMax + 1]bool{
973 : base.InternalKeyKindRangeKeySet: true,
974 : base.InternalKeyKindRangeKeyUnset: true,
975 : base.InternalKeyKindRangeKeyDelete: true,
976 : }
977 : )
978 :
979 : // TableInfo returns a subset of the FileMetadata state formatted as a
980 : // TableInfo.
981 1 : func (m *FileMetadata) TableInfo() TableInfo {
982 1 : return TableInfo{
983 1 : FileNum: m.FileNum,
984 1 : Size: m.Size,
985 1 : Smallest: m.Smallest,
986 1 : Largest: m.Largest,
987 1 : SmallestSeqNum: m.SmallestSeqNum,
988 1 : LargestSeqNum: m.LargestSeqNum,
989 1 : }
990 1 : }
991 :
992 1 : func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
993 1 : // NB: This is the same ordering that RocksDB uses for L0 files.
994 1 :
995 1 : // Sort first by largest sequence number.
996 1 : if v := stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum); v != 0 {
997 1 : return v
998 1 : }
999 : // Then by smallest sequence number.
1000 1 : if v := stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum); v != 0 {
1001 1 : return v
1002 1 : }
1003 : // Break ties by file number.
1004 1 : return stdcmp.Compare(m.FileNum, b.FileNum)
1005 : }
1006 :
1007 1 : func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
1008 1 : return m.cmpSeqNum(b) < 0
1009 1 : }
1010 :
1011 1 : func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
1012 1 : return base.InternalCompare(cmp, m.Smallest, b.Smallest)
1013 1 : }
1014 :
1015 : // KeyRange returns the minimum smallest and maximum largest InternalKey for
1016 : // all the FileMetadata in iters.
1017 1 : func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
1018 1 : first := true
1019 1 : for _, iter := range iters {
1020 1 : for meta := iter.First(); meta != nil; meta = iter.Next() {
1021 1 : if first {
1022 1 : first = false
1023 1 : smallest, largest = meta.Smallest, meta.Largest
1024 1 : continue
1025 : }
1026 1 : if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
1027 1 : smallest = meta.Smallest
1028 1 : }
1029 1 : if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
1030 1 : largest = meta.Largest
1031 1 : }
1032 : }
1033 : }
1034 1 : return smallest, largest
1035 : }
1036 :
1037 : type bySeqNum []*FileMetadata
1038 :
1039 1 : func (b bySeqNum) Len() int { return len(b) }
1040 1 : func (b bySeqNum) Less(i, j int) bool {
1041 1 : return b[i].lessSeqNum(b[j])
1042 1 : }
1043 1 : func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
1044 :
1045 : // SortBySeqNum sorts the specified files by increasing sequence number.
1046 1 : func SortBySeqNum(files []*FileMetadata) {
1047 1 : sort.Sort(bySeqNum(files))
1048 1 : }
1049 :
1050 : type bySmallest struct {
1051 : files []*FileMetadata
1052 : cmp Compare
1053 : }
1054 :
1055 0 : func (b bySmallest) Len() int { return len(b.files) }
1056 0 : func (b bySmallest) Less(i, j int) bool {
1057 0 : return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
1058 0 : }
1059 0 : func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }
1060 :
1061 : // SortBySmallest sorts the specified files by smallest key using the supplied
1062 : // comparison function to order user keys.
1063 0 : func SortBySmallest(files []*FileMetadata, cmp Compare) {
1064 0 : sort.Sort(bySmallest{files, cmp})
1065 0 : }
1066 :
1067 : // NumLevels is the number of levels a Version contains.
1068 : const NumLevels = 7
1069 :
1070 : // NewVersion constructs a new Version with the provided files. It requires
1071 : // that the provided files are already well-ordered. It's intended for testing.
1072 : func NewVersion(
1073 : comparer *base.Comparer, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
1074 0 : ) *Version {
1075 0 : v := &Version{
1076 0 : cmp: comparer,
1077 0 : }
1078 0 : for l := range files {
1079 0 : // NB: We specifically insert `files` into the B-Tree in the order
1080 0 : // they appear within `files`. Some tests depend on this behavior in
1081 0 : // order to test consistency checking, etc. Once we've constructed the
1082 0 : // initial B-Tree, we swap out the btreeCmp for the correct one.
1083 0 : // TODO(jackson): Adjust or remove the tests and remove this.
1084 0 : v.Levels[l].tree, _ = makeBTree(comparer.Compare, btreeCmpSpecificOrder(files[l]), files[l])
1085 0 : v.Levels[l].level = l
1086 0 : if l == 0 {
1087 0 : v.Levels[l].tree.bcmp = btreeCmpSeqNum
1088 0 : } else {
1089 0 : v.Levels[l].tree.bcmp = btreeCmpSmallestKey(comparer.Compare)
1090 0 : }
1091 0 : for _, f := range files[l] {
1092 0 : v.Levels[l].totalSize += f.Size
1093 0 : }
1094 : }
1095 0 : if err := v.InitL0Sublevels(flushSplitBytes); err != nil {
1096 0 : panic(err)
1097 : }
1098 0 : return v
1099 : }
1100 :
1101 : // TestingNewVersion returns a blank Version, used for tests.
1102 0 : func TestingNewVersion(comparer *base.Comparer) *Version {
1103 0 : return &Version{
1104 0 : cmp: comparer,
1105 0 : }
1106 0 : }
1107 :
1108 : // Version is a collection of file metadata for on-disk tables at various
1109 : // levels. In-memory DBs are written to level-0 tables, and compactions
1110 : // migrate data from level N to level N+1. The tables map internal keys (which
1111 : // are a user key, a delete or set bit, and a sequence number) to user values.
1112 : //
1113 : // The tables at level 0 are sorted by largest sequence number. Due to file
1114 : // ingestion, there may be overlap in the ranges of sequence numbers contained in
1115 : // level 0 sstables. In particular, it is valid for one level 0 sstable to have
1116 : // the seqnum range [1,100] while an adjacent sstable has the seqnum range
1117 : // [50,50]. This occurs when the [50,50] table was ingested and given a global
1118 : // seqnum. The ingestion code will have ensured that the [50,50] sstable will
1119 : // not have any keys that overlap with the [1,100] in the seqnum range
1120 : // [1,49]. The range of internal keys [fileMetadata.smallest,
1121 : // fileMetadata.largest] in each level 0 table may overlap.
1122 : //
1123 : // The tables at any non-0 level are sorted by their internal key range and any
1124 : // two tables at the same non-0 level do not overlap.
1125 : //
1126 : // The internal key ranges of two tables at different levels X and Y may
1127 : // overlap, for any X != Y.
1128 : //
1129 : // Finally, for every internal key in a table at level X, there is no internal
1130 : // key in a higher level table that has both the same user key and a higher
1131 : // sequence number.
1132 : type Version struct {
1133 : refs atomic.Int32
1134 :
1135 : // The level 0 sstables are organized in a series of sublevels. Similar to
1136 : // the seqnum invariant in normal levels, there is no internal key in a
1137 : // higher level table that has both the same user key and a higher sequence
1138 : // number. Within a sublevel, tables are sorted by their internal key range
1139 : // and any two tables at the same sublevel do not overlap. Unlike the normal
1140 : // levels, sublevel n contains older tables (lower sequence numbers) than
1141 : // sublevel n+1.
1142 : //
1143 : // The L0Sublevels struct is mostly used for compaction picking. As most
1144 : // internal data structures in it are only necessary for compaction picking
1145 : // and not for iterator creation, the reference to L0Sublevels is nil'd
1146 : // after this version becomes the non-newest version, to reduce memory
1147 : // usage.
1148 : //
1149 : // L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
1150 : // in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
1151 : // a reference to that slice, as that slice is necessary for iterator
1152 : // creation and needs to outlast L0Sublevels.
1153 : L0Sublevels *L0Sublevels
1154 : L0SublevelFiles []LevelSlice
1155 :
1156 : Levels [NumLevels]LevelMetadata
1157 :
1158 : // RangeKeyLevels holds a subset of the same files as Levels that contain range
1159 : // keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
1160 : // duplication should be minimal, as range keys are expected to be rare.
1161 : RangeKeyLevels [NumLevels]LevelMetadata
1162 :
1163 : // The callback to invoke when the last reference to a version is
1164 : // removed. Will be called with list.mu held.
1165 : Deleted func(obsolete []*FileBacking)
1166 :
1167 : // Stats holds aggregated stats about the version maintained from
1168 : // version to version.
1169 : Stats struct {
1170 : // MarkedForCompaction records the count of files marked for
1171 : // compaction within the version.
1172 : MarkedForCompaction int
1173 : }
1174 :
1175 : cmp *base.Comparer
1176 :
1177 : // The list the version is linked into.
1178 : list *VersionList
1179 :
1180 : // The next/prev link for the versionList doubly-linked list of versions.
1181 : prev, next *Version
1182 : }
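
A minimal sketch, as if in package manifest, of walking a Version level by level with the iterator pattern that the string method below also uses; totalTableCount is hypothetical.

    func totalTableCount(v *Version) int {
        n := 0
        for level := 0; level < NumLevels; level++ {
            iter := v.Levels[level].Iter()
            for f := iter.First(); f != nil; f = iter.Next() {
                n++
                _ = f.FileNum
            }
        }
        return n
    }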
1183 :
1184 : // String implements fmt.Stringer, printing the FileMetadata for each level in
1185 : // the Version.
1186 0 : func (v *Version) String() string {
1187 0 : return v.string(false)
1188 0 : }
1189 :
1190 : // DebugString returns an alternative format to String() which includes sequence
1191 : // number and kind information for the sstable boundaries.
1192 0 : func (v *Version) DebugString() string {
1193 0 : return v.string(true)
1194 0 : }
1195 :
1196 1 : func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
1197 1 : var buf bytes.Buffer
1198 1 : for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
1199 1 : fmt.Fprintf(&buf, "L0.%d:\n", sublevel)
1200 1 : sublevels[sublevel].Each(func(f *FileMetadata) {
1201 1 : fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose))
1202 1 : })
1203 : }
1204 1 : return buf.String()
1205 : }
1206 :
1207 0 : func (v *Version) string(verbose bool) string {
1208 0 : var buf bytes.Buffer
1209 0 : if len(v.L0SublevelFiles) > 0 {
1210 0 : fmt.Fprintf(&buf, "%s", describeSublevels(v.cmp.FormatKey, verbose, v.L0SublevelFiles))
1211 0 : }
1212 0 : for level := 1; level < NumLevels; level++ {
1213 0 : if v.Levels[level].Empty() {
1214 0 : continue
1215 : }
1216 0 : fmt.Fprintf(&buf, "L%d:\n", level)
1217 0 : iter := v.Levels[level].Iter()
1218 0 : for f := iter.First(); f != nil; f = iter.Next() {
1219 0 : fmt.Fprintf(&buf, " %s\n", f.DebugString(v.cmp.FormatKey, verbose))
1220 0 : }
1221 : }
1222 0 : return buf.String()
1223 : }
1224 :
1225 : // ParseVersionDebug parses a Version from its DebugString output.
1226 0 : func ParseVersionDebug(comparer *base.Comparer, flushSplitBytes int64, s string) (*Version, error) {
1227 0 : var files [NumLevels][]*FileMetadata
1228 0 : level := -1
1229 0 : for _, l := range strings.Split(s, "\n") {
1230 0 : if l == "" {
1231 0 : continue
1232 : }
1233 0 : p := makeDebugParser(l)
1234 0 : if l, ok := p.TryLevel(); ok {
1235 0 : level = l
1236 0 : continue
1237 : }
1238 :
1239 0 : if level == -1 {
1240 0 : return nil, errors.Errorf("version string must start with a level")
1241 0 : }
1242 0 : m, err := ParseFileMetadataDebug(l)
1243 0 : if err != nil {
1244 0 : return nil, err
1245 0 : }
1246 0 : files[level] = append(files[level], m)
1247 : }
1248 : // L0 files are printed from higher sublevel to lower, which means in a
1249 : // partial order that represents newest to oldest. Reverse the order of L0
1250 : // files to ensure we construct the same sublevels.
1251 0 : slices.Reverse(files[0])
1252 0 : v := NewVersion(comparer, flushSplitBytes, files)
1253 0 : if err := v.CheckOrdering(); err != nil {
1254 0 : return nil, err
1255 0 : }
1256 0 : return v, nil
1257 : }
1258 :
1259 : // Refs returns the number of references to the version.
1260 1 : func (v *Version) Refs() int32 {
1261 1 : return v.refs.Load()
1262 1 : }
1263 :
1264 : // Ref increments the version refcount.
1265 1 : func (v *Version) Ref() {
1266 1 : v.refs.Add(1)
1267 1 : }
1268 :
1269 : // Unref decrements the version refcount. If the last reference to the version
1270 : // was removed, the version is removed from the list of versions and the
1271 : // Deleted callback is invoked. Requires that the VersionList mutex is NOT
1272 : // locked.
1273 1 : func (v *Version) Unref() {
1274 1 : if v.refs.Add(-1) == 0 {
1275 1 : l := v.list
1276 1 : l.mu.Lock()
1277 1 : l.Remove(v)
1278 1 : v.Deleted(v.unrefFiles())
1279 1 : l.mu.Unlock()
1280 1 : }
1281 : }
1282 :
1283 : // UnrefLocked decrements the version refcount. If the last reference to the
1284 : // version was removed, the version is removed from the list of versions and
1285 : // the Deleted callback is invoked. Requires that the VersionList mutex is
1286 : // already locked.
1287 1 : func (v *Version) UnrefLocked() {
1288 1 : if v.refs.Add(-1) == 0 {
1289 1 : v.list.Remove(v)
1290 1 : v.Deleted(v.unrefFiles())
1291 1 : }
1292 : }
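
A minimal usage sketch of the version refcount: a reader pins the version for the lifetime of its iterators and releases it when done. When the final reference drops, Unref removes the version from its VersionList and passes the now-obsolete FileBackings to the Deleted callback.

    v.Ref()
    defer v.Unref() // may remove v from its list and invoke v.Deleted(obsolete)
    // ... read from v.Levels while the reference is held ...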
1293 :
1294 1 : func (v *Version) unrefFiles() []*FileBacking {
1295 1 : var obsolete []*FileBacking
1296 1 : for _, lm := range v.Levels {
1297 1 : obsolete = append(obsolete, lm.release()...)
1298 1 : }
1299 1 : for _, lm := range v.RangeKeyLevels {
1300 1 : obsolete = append(obsolete, lm.release()...)
1301 1 : }
1302 1 : return obsolete
1303 : }
1304 :
1305 : // Next returns the next version in the list of versions.
1306 1 : func (v *Version) Next() *Version {
1307 1 : return v.next
1308 1 : }
1309 :
1310 : // InitL0Sublevels initializes v.L0Sublevels and v.L0SublevelFiles.
1311 1 : func (v *Version) InitL0Sublevels(flushSplitBytes int64) error {
1312 1 : var err error
1313 1 : v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], v.cmp.Compare, v.cmp.FormatKey, flushSplitBytes)
1314 1 : if err == nil && v.L0Sublevels != nil {
1315 1 : v.L0SublevelFiles = v.L0Sublevels.Levels
1316 1 : }
1317 1 : return err
1318 : }
1319 :
1320 : // CalculateInuseKeyRanges examines file metadata in levels [level, maxLevel]
1321 : // within bounds [smallest,largest], returning an ordered slice of key ranges
1322 : // that include all keys that exist within levels [level, maxLevel] and within
1323 : // [smallest,largest].
1324 : func (v *Version) CalculateInuseKeyRanges(
1325 : level, maxLevel int, smallest, largest []byte,
1326 1 : ) []base.UserKeyBounds {
1327 1 : // Use two slices, alternating which one is input and which one is output
1328 1 : // as we descend the LSM.
1329 1 : var input, output []base.UserKeyBounds
1330 1 :
1331 1 : // L0 requires special treatment, since sstables within L0 may overlap.
1332 1 : // We use the L0 Sublevels structure to efficiently calculate the merged
1333 1 : // in-use key ranges.
1334 1 : if level == 0 {
1335 1 : output = v.L0Sublevels.InUseKeyRanges(smallest, largest)
1336 1 : level++
1337 1 : }
1338 :
1339 : // NB: We always treat `largest` as inclusive for simplicity, because
1340 : // there's little consequence to calculating slightly broader in-use key
1341 : // ranges.
1342 1 : bounds := base.UserKeyBoundsInclusive(smallest, largest)
1343 1 : for ; level <= maxLevel; level++ {
1344 1 : overlaps := v.Overlaps(level, bounds)
1345 1 : iter := overlaps.Iter()
1346 1 :
1347 1 : // We may already have in-use key ranges from higher levels. Iterate
1348 1 : // through both our accumulated in-use key ranges and this level's
1349 1 : // files, merging the two.
1350 1 : //
1351 1 : // Tables higher within the LSM have broader key spaces. We use this
1352 1 : // when possible to seek past a level's files that are contained by
1353 1 : // our current accumulated in-use key ranges. This helps avoid
1354 1 : // per-sstable work during flushes or compactions in high levels which
1355 1 : // overlap the majority of the LSM's sstables.
1356 1 : input, output = output, input
1357 1 : output = output[:0]
1358 1 :
1359 1 : cmp := v.cmp.Compare
1360 1 : inputIdx := 0
1361 1 : var currFile *FileMetadata
1362 1 : // If we have an accumulated key range and its start is ≤ smallest,
1363 1 : // we can seek to the accumulated range's end. Otherwise, we need to
1364 1 : // start at the first overlapping file within the level.
1365 1 : if len(input) > 0 && cmp(input[0].Start, smallest) <= 0 {
1366 1 : currFile = seekGT(&iter, cmp, input[0].End)
1367 1 : } else {
1368 1 : currFile = iter.First()
1369 1 : }
1370 :
1371 1 : for currFile != nil && inputIdx < len(input) {
1372 1 : // Invariant: Neither currFile nor input[inputIdx] overlaps any earlier
1373 1 : // ranges.
1374 1 : switch {
1375 1 : case cmp(currFile.Largest.UserKey, input[inputIdx].Start) < 0:
1376 1 : // File is completely before input range.
1377 1 : output = append(output, currFile.UserKeyBounds())
1378 1 : currFile = iter.Next()
1379 :
1380 1 : case cmp(input[inputIdx].End.Key, currFile.Smallest.UserKey) < 0:
1381 1 : // Input range is completely before the next file.
1382 1 : output = append(output, input[inputIdx])
1383 1 : inputIdx++
1384 :
1385 1 : default:
1386 1 : // Input range and file range overlap or touch. We will maximally extend
1387 1 : // the range with more overlapping inputs and files.
1388 1 : currAccum := currFile.UserKeyBounds()
1389 1 : if cmp(input[inputIdx].Start, currAccum.Start) < 0 {
1390 1 : currAccum.Start = input[inputIdx].Start
1391 1 : }
1392 1 : currFile = iter.Next()
1393 1 :
1394 1 : // Extend currAccum with any overlapping (or touching) input intervals or
1395 1 : // files. Note that we will always consume at least input[inputIdx].
1396 1 : for {
1397 1 : if inputIdx < len(input) && cmp(input[inputIdx].Start, currAccum.End.Key) <= 0 {
1398 1 : if currAccum.End.CompareUpperBounds(cmp, input[inputIdx].End) < 0 {
1399 1 : currAccum.End = input[inputIdx].End
1400 1 : // Skip over files that are entirely inside this newly extended
1401 1 : // accumulated range; we expect ranges to be wider in levels that
1402 1 : // are higher up so this might skip over a non-trivial number of
1403 1 : // files.
1404 1 : currFile = seekGT(&iter, cmp, currAccum.End)
1405 1 : }
1406 1 : inputIdx++
1407 1 : } else if currFile != nil && cmp(currFile.Smallest.UserKey, currAccum.End.Key) <= 0 {
1408 1 : if b := currFile.UserKeyBounds(); currAccum.End.CompareUpperBounds(cmp, b.End) < 0 {
1409 1 : currAccum.End = b.End
1410 1 : }
1411 1 : currFile = iter.Next()
1412 1 : } else {
1413 1 : // No overlaps remaining.
1414 1 : break
1415 : }
1416 : }
1417 1 : output = append(output, currAccum)
1418 : }
1419 : }
1420 : // If we have either files or input ranges left over, add them to the
1421 : // output.
1422 1 : output = append(output, input[inputIdx:]...)
1423 1 : for ; currFile != nil; currFile = iter.Next() {
1424 1 : output = append(output, currFile.UserKeyBounds())
1425 1 : }
1426 : }
1427 1 : return output
1428 : }
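
CalculateInuseKeyRanges repeatedly merges the accumulated in-use ranges with a level's sorted, non-overlapping files. The sketch below shows the same merge on integer intervals with inclusive bounds; span and mergeInUse are hypothetical names, and touching intervals coalesce, mirroring the "overlap or touch" case above.

package main

import "fmt"

type span struct{ start, end int }

// mergeInUse merges two sorted, internally non-overlapping interval slices
// into their minimal union, the way each level is folded into the
// accumulated in-use ranges.
func mergeInUse(a, b []span) []span {
	var out []span
	i, j := 0, 0
	for i < len(a) || j < len(b) {
		// Take whichever remaining span starts first.
		var cur span
		if j >= len(b) || (i < len(a) && a[i].start <= b[j].start) {
			cur, i = a[i], i+1
		} else {
			cur, j = b[j], j+1
		}
		// Absorb every span that starts at or before cur.end, extending the
		// accumulated range, mirroring the inner extension loop above.
		for {
			if i < len(a) && a[i].start <= cur.end {
				if a[i].end > cur.end {
					cur.end = a[i].end
				}
				i++
			} else if j < len(b) && b[j].start <= cur.end {
				if b[j].end > cur.end {
					cur.end = b[j].end
				}
				j++
			} else {
				break
			}
		}
		out = append(out, cur)
	}
	return out
}

func main() {
	a := []span{{1, 3}, {10, 12}}
	b := []span{{2, 5}, {8, 9}, {12, 15}}
	fmt.Println(mergeInUse(a, b)) // [{1 5} {8 9} {10 15}]
}
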
1429 :
1430 : // seekGT seeks to the first file that ends with a boundary that is after the
1431 : // given boundary. Specifically:
1432 : // - if boundary is inclusive, the returned file's end boundary is strictly
1433 : // greater than boundary.Key
1434 : // - if boundary is exclusive, the returned file's end boundary is either
1435 : // greater than boundary.Key, or it's inclusive at boundary.Key.
1436 1 : func seekGT(iter *LevelIterator, cmp base.Compare, boundary base.UserKeyBoundary) *FileMetadata {
1437 1 : f := iter.SeekGE(cmp, boundary.Key)
1438 1 : if f == nil {
1439 1 : return nil
1440 1 : }
1441 : // If boundary is inclusive, or the file's largest boundary is exclusive, we
1442 : // do not tolerate an equal largest key.
1443 : // Note: we know f.Largest.UserKey >= boundary.Key, so this condition is
1444 : // equivalent to boundary.IsUpperBoundForInternalKey(cmp, f.Largest).
1445 1 : if (boundary.Kind == base.Inclusive || f.Largest.IsExclusiveSentinel()) && cmp(boundary.Key, f.Largest.UserKey) == 0 {
1446 1 : return iter.Next()
1447 1 : }
1448 1 : return f
1449 : }
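
To make the inclusive/exclusive rule above concrete, here is a toy sketch over a sorted slice of file end boundaries; fileEnd and this simplified seekGT are illustrative stand-ins, not the package's types.

package main

import "fmt"

type fileEnd struct {
	key       string
	exclusive bool // true if the file's end boundary is an exclusive sentinel
}

// seekGT returns the index of the first end boundary that is after the given
// search boundary, or -1.
func seekGT(ends []fileEnd, key string, inclusive bool) int {
	for i, e := range ends {
		if e.key < key {
			continue
		}
		// Equal keys only qualify when the search boundary is exclusive and
		// the file's end boundary is inclusive.
		if e.key == key && (inclusive || e.exclusive) {
			continue
		}
		return i
	}
	return -1
}

func main() {
	ends := []fileEnd{{"b", false}, {"d", true}, {"d", false}, {"f", false}}
	fmt.Println(seekGT(ends, "d", true))  // 3: an inclusive boundary skips every file ending at "d"
	fmt.Println(seekGT(ends, "d", false)) // 2: an exclusive boundary accepts the inclusive end at "d"
}
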
1450 :
1451 : // Contains returns a boolean indicating whether the provided file exists in
1452 : // the version at the given level. If level is non-zero then Contains binary
1453 : // searches among the files. If level is zero, Contains scans the entire
1454 : // level.
1455 1 : func (v *Version) Contains(level int, m *FileMetadata) bool {
1456 1 : iter := v.Levels[level].Iter()
1457 1 : if level > 0 {
1458 1 : overlaps := v.Overlaps(level, m.UserKeyBounds())
1459 1 : iter = overlaps.Iter()
1460 1 : }
1461 1 : for f := iter.First(); f != nil; f = iter.Next() {
1462 1 : if f == m {
1463 1 : return true
1464 1 : }
1465 : }
1466 1 : return false
1467 : }
1468 :
1469 : // Overlaps returns all elements of v.files[level] whose user key range
1470 : // intersects the given bounds. If level is non-zero then the user key bounds of
1471 : // v.files[level] are assumed to not overlap (although they may touch). If level
1472 : // is zero then that assumption cannot be made, and the given bounds are
1473 : // expanded to the union of those matching bounds so far and the computation is
1474 : // repeated until the bounds stabilize.
1475 : // The returned files are a subsequence of the input files, i.e., the ordering
1476 : // is not changed.
1477 1 : func (v *Version) Overlaps(level int, bounds base.UserKeyBounds) LevelSlice {
1478 1 : if level == 0 {
1479 1 : // Indices that have been selected as overlapping.
1480 1 : l0 := v.Levels[level]
1481 1 : l0Iter := l0.Iter()
1482 1 : selectedIndices := make([]bool, l0.Len())
1483 1 : numSelected := 0
1484 1 : var slice LevelSlice
1485 1 : for {
1486 1 : restart := false
1487 1 : for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
1488 1 : selected := selectedIndices[i]
1489 1 : if selected {
1490 1 : continue
1491 : }
1492 1 : if !meta.Overlaps(v.cmp.Compare, &bounds) {
1493 1 : // meta is completely outside the specified range; skip it.
1494 1 : continue
1495 : }
1496 : // Overlaps.
1497 1 : selectedIndices[i] = true
1498 1 : numSelected++
1499 1 :
1500 1 : // Since this is L0, check if the newly added FileMetadata has expanded
1501 1 : // the range. We expand the range immediately for files we have
1502 1 : // remaining to check in this loop. All already checked and unselected
1503 1 : // files will need to be rechecked via the restart below.
1504 1 : if v.cmp.Compare(meta.Smallest.UserKey, bounds.Start) < 0 {
1505 1 : bounds.Start = meta.Smallest.UserKey
1506 1 : restart = true
1507 1 : }
1508 1 : if !bounds.End.IsUpperBoundForInternalKey(v.cmp.Compare, meta.Largest) {
1509 1 : bounds.End = base.UserKeyExclusiveIf(meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
1510 1 : restart = true
1511 1 : }
1512 : }
1513 :
1514 1 : if !restart {
1515 1 : // Construct a B-Tree containing only the matching items.
1516 1 : var tr btree
1517 1 : tr.bcmp = v.Levels[level].tree.bcmp
1518 1 : for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
1519 1 : if selectedIndices[i] {
1520 1 : err := tr.Insert(meta)
1521 1 : if err != nil {
1522 0 : panic(err)
1523 : }
1524 : }
1525 : }
1526 1 : slice = newLevelSlice(tr.Iter())
1527 1 : // TODO(jackson): Avoid the oddity of constructing and
1528 1 : // immediately releasing a B-Tree. Make LevelSlice an
1529 1 : // interface?
1530 1 : tr.Release()
1531 1 : break
1532 : }
1533 : // Continue looping to retry the files that were not selected.
1534 : }
1535 1 : return slice
1536 : }
1537 :
1538 1 : return v.Levels[level].Slice().Overlaps(v.cmp.Compare, bounds)
1539 : }
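
The L0 branch of Overlaps is a fixpoint computation: selecting a file can widen the bounds, which can pull in files that were previously rejected, so the scan restarts until the bounds stabilize. Below is a simplified standalone sketch using integer intervals with inclusive bounds in place of user keys; file and overlapsL0 are illustrative names.

package main

import "fmt"

type file struct{ smallest, largest int }

// overlapsL0 selects every file that transitively overlaps [start, end],
// restarting the scan whenever a selected file widens the bounds.
func overlapsL0(files []file, start, end int) []file {
	selected := make([]bool, len(files))
	for {
		restart := false
		for i, f := range files {
			if selected[i] || f.largest < start || f.smallest > end {
				continue
			}
			selected[i] = true
			if f.smallest < start {
				start, restart = f.smallest, true
			}
			if f.largest > end {
				end, restart = f.largest, true
			}
		}
		if !restart {
			break
		}
	}
	var out []file
	for i, f := range files {
		if selected[i] {
			out = append(out, f)
		}
	}
	return out
}

func main() {
	files := []file{{0, 2}, {3, 6}, {5, 9}, {20, 25}}
	// Querying [4,5] selects {3,6}, which widens the bounds to [3,9] and
	// pulls in {5,9}; {0,2} and {20,25} remain outside.
	fmt.Println(overlapsL0(files, 4, 5)) // [{3 6} {5 9}]
}
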
1540 :
1541 : // IterAllLevelsAndSublevels calls fn with an iterator for each L0 sublevel
1542 : // (from top to bottom), then once for each level below L0.
1543 1 : func (v *Version) IterAllLevelsAndSublevels(fn func(it LevelIterator, level Layer)) {
1544 1 : for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
1545 1 : fn(v.L0SublevelFiles[sublevel].Iter(), L0Sublevel(sublevel))
1546 1 : }
1547 1 : for level := 1; level < NumLevels; level++ {
1548 1 : fn(v.Levels[level].Iter(), Level(level))
1549 1 : }
1550 : }
1551 :
1552 : // CheckOrdering checks that the files are consistent with respect to
1553 : // increasing seqnums (for L0 files) and increasing, non-overlapping
1554 : // internal key ranges (for non-L0 files).
1555 0 : func (v *Version) CheckOrdering() error {
1556 0 : for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
1557 0 : sublevelIter := v.L0SublevelFiles[sublevel].Iter()
1558 0 : if err := CheckOrdering(v.cmp.Compare, v.cmp.FormatKey, L0Sublevel(sublevel), sublevelIter); err != nil {
1559 0 : return base.CorruptionErrorf("%s\n%s", err, v.DebugString())
1560 0 : }
1561 : }
1562 :
1563 0 : for level, lm := range v.Levels {
1564 0 : if err := CheckOrdering(v.cmp.Compare, v.cmp.FormatKey, Level(level), lm.Iter()); err != nil {
1565 0 : return base.CorruptionErrorf("%s\n%s", err, v.DebugString())
1566 0 : }
1567 : }
1568 0 : return nil
1569 : }
1570 :
1571 : // VersionList holds a list of versions. The versions are ordered from oldest
1572 : // to newest.
1573 : type VersionList struct {
1574 : mu *sync.Mutex
1575 : root Version
1576 : }
1577 :
1578 : // Init initializes the version list.
1579 1 : func (l *VersionList) Init(mu *sync.Mutex) {
1580 1 : l.mu = mu
1581 1 : l.root.next = &l.root
1582 1 : l.root.prev = &l.root
1583 1 : }
1584 :
1585 : // Empty returns true if the list is empty, and false otherwise.
1586 1 : func (l *VersionList) Empty() bool {
1587 1 : return l.root.next == &l.root
1588 1 : }
1589 :
1590 : // Front returns the oldest version in the list. Note that this version is only
1591 : // valid if Empty() returns false.
1592 1 : func (l *VersionList) Front() *Version {
1593 1 : return l.root.next
1594 1 : }
1595 :
1596 : // Back returns the newest version in the list. Note that this version is only
1597 : // valid if Empty() returns false.
1598 1 : func (l *VersionList) Back() *Version {
1599 1 : return l.root.prev
1600 1 : }
1601 :
1602 : // PushBack adds a new version to the back of the list. This new version
1603 : // becomes the "newest" version in the list.
1604 1 : func (l *VersionList) PushBack(v *Version) {
1605 1 : if v.list != nil || v.prev != nil || v.next != nil {
1606 0 : panic("pebble: version list is inconsistent")
1607 : }
1608 1 : v.prev = l.root.prev
1609 1 : v.prev.next = v
1610 1 : v.next = &l.root
1611 1 : v.next.prev = v
1612 1 : v.list = l
1613 1 : // Let L0Sublevels on the second newest version get GC'd, as it is no longer
1614 1 : // necessary. See the comment in Version.
1615 1 : v.prev.L0Sublevels = nil
1616 : }
1617 :
1618 : // Remove removes the specified version from the list.
1619 1 : func (l *VersionList) Remove(v *Version) {
1620 1 : if v == &l.root {
1621 0 : panic("pebble: cannot remove version list root node")
1622 : }
1623 1 : if v.list != l {
1624 0 : panic("pebble: version list is inconsistent")
1625 : }
1626 1 : v.prev.next = v.next
1627 1 : v.next.prev = v.prev
1628 1 : v.next = nil // avoid memory leaks
1629 1 : v.prev = nil // avoid memory leaks
1630 1 : v.list = nil // avoid memory leaks
1631 : }
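
PushBack and Remove above implement a circular doubly-linked list with a sentinel root: when the list is empty, the root points at itself, so neither operation needs nil checks on head or tail pointers. A minimal standalone sketch of the same technique follows; elem and ring are illustrative names.

package main

import "fmt"

type elem struct {
	name       string
	prev, next *elem
}

// ring is a circular doubly-linked list with a sentinel root node.
type ring struct{ root elem }

func (r *ring) init() {
	r.root.next = &r.root
	r.root.prev = &r.root
}

func (r *ring) empty() bool { return r.root.next == &r.root }

// pushBack splices e in just before the root, making it the newest element.
func (r *ring) pushBack(e *elem) {
	e.prev = r.root.prev
	e.prev.next = e
	e.next = &r.root
	e.next.prev = e
}

func (r *ring) remove(e *elem) {
	e.prev.next = e.next
	e.next.prev = e.prev
	e.next, e.prev = nil, nil // avoid accidental reuse
}

func main() {
	var r ring
	r.init()
	a, b := &elem{name: "a"}, &elem{name: "b"}
	r.pushBack(a)
	r.pushBack(b)
	r.remove(a)
	for e := r.root.next; e != &r.root; e = e.next {
		fmt.Println(e.name) // prints "b"
	}
	fmt.Println(r.empty()) // false
}
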
1632 :
1633 : // CheckOrdering checks that the files are consistent with respect to
1634 : // seqnums (for L0 files -- see the detailed comment below) and increasing,
1635 : // non-overlapping internal key ranges (for non-L0 files).
1636 1 : func CheckOrdering(cmp Compare, format base.FormatKey, level Layer, files LevelIterator) error {
1637 1 : // The invariants to check for L0 sublevels are the same as the ones to
1638 1 : // check for all other levels. However, if L0 is not organized into
1639 1 : // sublevels, or if all L0 files are being passed in, we do the legacy L0
1640 1 : // checks, defined in the detailed comment below.
1641 1 : if level == Level(0) {
1642 1 : // We have 2 kinds of files:
1643 1 : // - Files with exactly one sequence number: these could be either ingested files
1644 1 : // or flushed files. We cannot tell the difference between them based on FileMetadata,
1645 1 : // so our consistency checking here uses the weaker checks assuming it is a narrow
1646 1 : // flushed file. We cannot error on ingested files whose sequence numbers coincide
1647 1 : // with flushed files: the seemingly ingested file could just be a flushed file
1648 1 : // containing a single key, a truncated range tombstone that shares sequence numbers
1649 1 : // with other files in the same flush.
1650 1 : // - Files with multiple sequence numbers: these are necessarily flushed files.
1651 1 : //
1652 1 : // Three cases of overlapping sequence numbers:
1653 1 : // Case 1:
1654 1 : // An ingested file contained in the sequence numbers of the flushed file -- it must be
1655 1 : // fully contained (not coincident with either end of the flushed file) since the memtable
1656 1 : // must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
1657 1 : // num b, and the memtable got a subsequent update that was given sequence num b+1, before
1658 1 : // being flushed.
1659 1 : //
1660 1 : // So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
1661 1 : // third file are inconsistent with each other. So comparing adjacent files is insufficient
1662 1 : // for consistency checking.
1663 1 : //
1664 1 : // Visually we have something like
1665 1 : // x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
1666 1 : // y y y y (y's represent ingested files)
1667 1 : // And these are ordered in increasing order of y. Note that y's must be unique.
1668 1 : //
1669 1 : // Case 2:
1670 1 : // A flushed file that did not overlap in keys with any file in any level, but does overlap
1671 1 : // in the file key intervals. This file is placed in L0 since it overlaps in the file
1672 1 : // key intervals but since it has no overlapping data, it is assigned a sequence number
1673 1 : // of 0 in RocksDB. We handle this case for compatibility with RocksDB.
1674 1 : //
1675 1 : // Case 3:
1676 1 : // A sequence of flushed files that overlap in sequence numbers with one another,
1677 1 : // but do not overlap in keys inside the sstables. These files correspond to
1678 1 : // partitioned flushes or the results of intra-L0 compactions of partitioned
1679 1 : // flushes.
1680 1 : //
1681 1 : // Since these types of SSTables violate most other sequence number
1682 1 : // overlap invariants, and handling this case is important for compatibility
1683 1 : // with future versions of pebble, this method relaxes most L0 invariant
1684 1 : // checks.
1685 1 :
1686 1 : var prev *FileMetadata
1687 1 : for f := files.First(); f != nil; f, prev = files.Next(), f {
1688 1 : if prev == nil {
1689 1 : continue
1690 : }
1691 : // Validate that the sorting is sane.
1692 1 : if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
1693 0 : // Multiple files satisfying case 2 mentioned above.
1694 1 : } else if !prev.lessSeqNum(f) {
1695 0 : return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
1696 0 : errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
1697 0 : errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
1698 0 : errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
1699 0 : }
1700 : }
1701 1 : } else {
1702 1 : var prev *FileMetadata
1703 1 : for f := files.First(); f != nil; f, prev = files.Next(), f {
1704 1 : if err := f.Validate(cmp, format); err != nil {
1705 0 : return errors.Wrapf(err, "%s ", level)
1706 0 : }
1707 1 : if prev != nil {
1708 1 : if prev.cmpSmallestKey(f, cmp) >= 0 {
1709 0 : return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
1710 0 : errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
1711 0 : prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
1712 0 : f.Smallest.Pretty(format), f.Largest.Pretty(format))
1713 0 : }
1714 :
1715 : // In all supported format major versions, split user keys are
1716 : // prohibited, so the two files cannot both contain keys with the same
1717 : // user key. If the bounds share a user key, the previous file's
1718 : // boundary must have an InternalKeyTrailer indicating that it's exclusive.
1719 1 : if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) {
1720 0 : return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
1721 0 : errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
1722 0 : prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
1723 0 : f.Smallest.Pretty(format), f.Largest.Pretty(format))
1724 0 : }
1725 : }
1726 : }
1727 : }
1728 1 : return nil
1729 : }
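
The L0 branch above only compares adjacent files, which, as the comment notes, cannot catch every seqnum-containment violation. Below is a simplified sketch of that relaxed pairwise check on the invalid sequence from the comment; meta and this lessSeqNum are assumed simplifications of FileMetadata and its ordering, which may involve further tiebreaks.

package main

import "fmt"

type meta struct{ smallestSeqNum, largestSeqNum uint64 }

// lessSeqNum orders files by largest, then smallest, sequence number.
func lessSeqNum(a, b meta) bool {
	if a.largestSeqNum != b.largestSeqNum {
		return a.largestSeqNum < b.largestSeqNum
	}
	return a.smallestSeqNum < b.smallestSeqNum
}

// checkL0 validates only that adjacent files are properly ordered, tolerating
// ties among seqnum-0 files (Case 2 above).
func checkL0(files []meta) error {
	for i := 1; i < len(files); i++ {
		prev, f := files[i-1], files[i]
		if prev.largestSeqNum == 0 && f.largestSeqNum == 0 {
			continue // multiple seqnum-0 files are tolerated
		}
		if !lessSeqNum(prev, f) {
			return fmt.Errorf("L0 files out of order: <%d-%d> vs <%d-%d>",
				prev.smallestSeqNum, prev.largestSeqNum,
				f.smallestSeqNum, f.largestSeqNum)
		}
	}
	return nil
}

func main() {
	// The invalid sequence from the comment: the third file's seqnums contain
	// the first file's, yet every adjacent comparison passes, so a pairwise
	// check alone cannot reject it -- exactly why the comment calls adjacent
	// comparison insufficient for full consistency checking.
	files := []meta{{1000, 1000}, {1002, 1002}, {1000, 2000}}
	fmt.Println(checkL0(files)) // <nil>
}
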