Line data Source code
1 : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package manifest
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "sort"
11 : "strconv"
12 : "strings"
13 : "sync"
14 : "sync/atomic"
15 : "unicode"
16 :
17 : "github.com/cockroachdb/errors"
18 : "github.com/cockroachdb/pebble/internal/base"
19 : "github.com/cockroachdb/pebble/internal/invariants"
20 : )
21 :
// Compare exports the base.Compare type, a key-comparison function.
type Compare = base.Compare

// InternalKey exports the base.InternalKey type, a user key plus trailer
// (sequence number and kind).
type InternalKey = base.InternalKey
27 :
// TableInfo contains the common information for table related events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}
43 :
// TableStats contains statistics on a table used for compaction heuristics,
// and export via Metrics.
type TableStats struct {
	// NumEntries is the total number of entries in the table.
	NumEntries uint64
	// NumDeletions is the number of point and range deletion entries in the
	// table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	NumRangeKeySets uint64
	// PointDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's range deletions by compacting them. This
	// estimate is at data-block granularity and is not updated if compactions
	// beneath the table reduce the amount of reclaimable disk space. It also
	// does not account for overlapping data in L0 and ignores L0 sublevels,
	// but the error that introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate if
	// snapshots or move compactions prevented the elision of their range
	// tombstones. A table in the bottommost level that was ingested into L6
	// will have a zero estimate, because the file's sequence numbers indicate
	// that the tombstone cannot drop any data contained within the file itself.
	RangeDeletionsBytesEstimate uint64
	// ValueBlocksSize is the total size of value blocks and the value index
	// block.
	ValueBlocksSize uint64
}
72 :
// boundType represents the type of key (point or range) present as the smallest
// and largest keys. The zero value is reserved to mean "unset".
type boundType uint8

const (
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)
81 :
// CompactionState is the compaction state of a file.
//
// The following shows the valid state transitions:
//
//	NotCompacting --> Compacting --> Compacted
//	      ^               |
//	      |               |
//	      +-------<-------+
//
// Input files to a compaction transition to Compacting when a compaction is
// picked. A file that has finished compacting typically transitions into the
// Compacted state, at which point it is effectively obsolete ("zombied") and
// will eventually be removed from the LSM. A file that has been move-compacted
// will transition from Compacting back into the NotCompacting state, signaling
// that the file may be selected for a subsequent compaction. A failed
// compaction will result in all input tables transitioning from Compacting to
// NotCompacting.
//
// This state is in-memory only. It is not persisted to the manifest.
type CompactionState uint8

// CompactionStates.
const (
	CompactionStateNotCompacting CompactionState = iota
	CompactionStateCompacting
	CompactionStateCompacted
)
109 :
110 : // String implements fmt.Stringer.
111 0 : func (s CompactionState) String() string {
112 0 : switch s {
113 0 : case CompactionStateNotCompacting:
114 0 : return "NotCompacting"
115 0 : case CompactionStateCompacting:
116 0 : return "Compacting"
117 0 : case CompactionStateCompacted:
118 0 : return "Compacted"
119 0 : default:
120 0 : panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
121 : }
122 : }
123 :
// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is FileBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//
//  3. Once a backing-sst is referred to by a virtual sst in the latest
//     version, it cannot go back to being referred to by a physical sst in
//     any future version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// FileBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented when read sampling
	// in pebble.Iterator after every positioning operation that returns
	// a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64

	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if valid is true.
	statsValid atomic.Bool

	// FileBacking is the state which backs either a physical or virtual
	// sstable.
	FileBacking *FileBacking

	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANT: when !FileMetadata.Virtual, Size == FileBacking.Size.
	//
	// TODO(bananabrick): Size is currently used in metrics, and for many key
	// Pebble level heuristics. Make sure that the heuristics will still work
	// appropriately with an approximate value of size.
	Size uint64
	// CreationTime is the file creation time in seconds since the epoch
	// (1970-01-01 00:00:00 UTC). For ingested sstables, this corresponds to
	// the time the file was ingested. For virtual sstables, this corresponds
	// to the wall clock time when the FileMetadata for the virtual sstable was
	// first created.
	CreationTime int64
	// SmallestSeqNum and LargestSeqNum are lower and upper bounds for the
	// smallest and largest sequence numbers in the table, across both point
	// and range keys. For physical sstables, these values are tight bounds.
	// For virtual sstables, there is no guarantee that there will be keys with
	// SmallestSeqNum or LargestSeqNum within virtual sstable bounds.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey  InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest  InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	// CompactionState holds the in-memory compaction state of the file; see
	// the CompactionState type for the valid transitions.
	CompactionState CompactionState
	// MarkedForCompaction is true if compaction of this file has been
	// explicitly requested. Previously, RocksDB and earlier versions of
	// Pebble allowed this flag to be set by a user table property collector.
	// Some earlier versions of Pebble respected this flag, while other more
	// recent versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds (Smallest/Largest) have been
	// set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
}
271 :
// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta.
type PhysicalFileMeta struct {
	*FileMetadata
}
280 :
// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// A VirtualFileMeta inherits all the same fields as a FileMetadata. These
// fields have additional invariants imposed on them, and/or slightly varying
// meanings:
//   - Smallest and Largest (and their counterparts
//     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
//     The underlying file's size is stored in FileBacking.Size, though it could
//     also be estimated or could correspond to just the referenced portion of
//     a file (eg. if the file originated on another node).
//   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
//     This means that all keys in the virtual sstable must have seqnums within
//     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
//     a key with a seqnum at either of the bounds. Calculating tight seqnum
//     bounds would be too expensive and deliver little value.
//
// NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
type VirtualFileMeta struct {
	*FileMetadata
}
315 :
316 : // PhysicalMeta should be the only source of creating the PhysicalFileMeta
317 : // wrapper type.
318 2 : func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
319 2 : if m.Virtual {
320 0 : panic("pebble: file metadata does not belong to a physical sstable")
321 : }
322 2 : return PhysicalFileMeta{
323 2 : m,
324 2 : }
325 : }
326 :
327 : // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
328 : // type.
329 1 : func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
330 1 : if !m.Virtual {
331 0 : panic("pebble: file metadata does not belong to a virtual sstable")
332 : }
333 1 : return VirtualFileMeta{
334 1 : m,
335 1 : }
336 : }
337 :
// FileBacking either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the FileMetadata type for sstable terminology.
type FileBacking struct {
	// refs is the reference count for the backing file on disk: incremented
	// when a physical or virtual sstable which is backed by the FileBacking is
	// added to a version and decremented when the version is unreferenced.
	// We ref count in order to determine when it is safe to delete a
	// backing sst file from disk. The backing file is obsolete when the
	// reference count falls to zero.
	refs atomic.Int32
	// latestVersionRefs are the references to the FileBacking in the
	// latest version. This reference can be through a single physical
	// sstable in the latest version, or one or more virtual sstables in the
	// latest version.
	//
	// INVARIANT: latestVersionRefs <= refs.
	latestVersionRefs atomic.Int32
	// VirtualizedSize is set iff the backing sst is only referred to by
	// virtual ssts in the latest version. VirtualizedSize is the sum of the
	// virtual sstable sizes of all of the virtual sstables in the latest
	// version which are backed by the physical sstable. When a virtual
	// sstable is removed from the latest version, we will decrement the
	// VirtualizedSize. During compaction picking, we'll compensate a
	// virtual sstable file size by
	// (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs.
	// The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize
	// is high, then the space amplification due to virtual sstables is
	// high, and we should pick the virtual sstable with a higher priority.
	//
	// TODO(bananabrick): Compensate the virtual sstable file size using
	// the VirtualizedSize during compaction picking and test.
	VirtualizedSize atomic.Uint64
	// DiskFileNum is the file number of the backing file on disk.
	DiskFileNum base.DiskFileNum
	// Size is the size of the backing file on disk, in bytes.
	Size uint64
}
375 :
376 : // InitPhysicalBacking allocates and sets the FileBacking which is required by a
377 : // physical sstable FileMetadata.
378 : //
379 : // Ensure that the state required by FileBacking, such as the FileNum, is
380 : // already set on the FileMetadata before InitPhysicalBacking is called.
381 : // Calling InitPhysicalBacking only after the relevant state has been set in the
382 : // FileMetadata is not necessary in tests which don't rely on FileBacking.
383 2 : func (m *FileMetadata) InitPhysicalBacking() {
384 2 : if m.Virtual {
385 0 : panic("pebble: virtual sstables should use a pre-existing FileBacking")
386 : }
387 2 : if m.FileBacking == nil {
388 2 : m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()}
389 2 : }
390 : }
391 :
392 : // InitProviderBacking creates a new FileBacking for a file backed by
393 : // an objstorage.Provider.
394 1 : func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) {
395 1 : if !m.Virtual {
396 0 : panic("pebble: provider-backed sstables must be virtual")
397 : }
398 1 : if m.FileBacking == nil {
399 1 : m.FileBacking = &FileBacking{DiskFileNum: fileNum}
400 1 : }
401 : }
402 :
403 : // ValidateVirtual should be called once the FileMetadata for a virtual sstable
404 : // is created to verify that the fields of the virtual sstable are sound.
405 1 : func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
406 1 : if !m.Virtual {
407 0 : panic("pebble: invalid virtual sstable")
408 : }
409 :
410 1 : if createdFrom.SmallestSeqNum != m.SmallestSeqNum {
411 0 : panic("pebble: invalid smallest sequence number for virtual sstable")
412 : }
413 :
414 1 : if createdFrom.LargestSeqNum != m.LargestSeqNum {
415 0 : panic("pebble: invalid largest sequence number for virtual sstable")
416 : }
417 :
418 1 : if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking {
419 0 : panic("pebble: invalid physical sstable state for virtual sstable")
420 : }
421 :
422 1 : if m.Size == 0 {
423 0 : panic("pebble: virtual sstable size must be set upon creation")
424 : }
425 : }
426 :
427 : // Refs returns the refcount of backing sstable.
428 2 : func (m *FileMetadata) Refs() int32 {
429 2 : return m.FileBacking.refs.Load()
430 2 : }
431 :
432 : // Ref increments the ref count associated with the backing sstable.
433 2 : func (m *FileMetadata) Ref() {
434 2 : m.FileBacking.refs.Add(1)
435 2 : }
436 :
437 : // Unref decrements the ref count associated with the backing sstable.
438 2 : func (m *FileMetadata) Unref() int32 {
439 2 : v := m.FileBacking.refs.Add(-1)
440 2 : if invariants.Enabled && v < 0 {
441 0 : panic("pebble: invalid FileMetadata refcounting")
442 : }
443 2 : return v
444 : }
445 :
446 : // LatestRef increments the latest ref count associated with the backing
447 : // sstable.
448 2 : func (m *FileMetadata) LatestRef() {
449 2 : m.FileBacking.latestVersionRefs.Add(1)
450 2 :
451 2 : if m.Virtual {
452 1 : m.FileBacking.VirtualizedSize.Add(m.Size)
453 1 : }
454 : }
455 :
456 : // LatestUnref decrements the latest ref count associated with the backing
457 : // sstable.
458 2 : func (m *FileMetadata) LatestUnref() int32 {
459 2 : if m.Virtual {
460 1 : m.FileBacking.VirtualizedSize.Add(-m.Size)
461 1 : }
462 :
463 2 : v := m.FileBacking.latestVersionRefs.Add(-1)
464 2 : if invariants.Enabled && v < 0 {
465 0 : panic("pebble: invalid FileMetadata latest refcounting")
466 : }
467 2 : return v
468 : }
469 :
470 : // LatestRefs returns the latest ref count associated with the backing sstable.
471 2 : func (m *FileMetadata) LatestRefs() int32 {
472 2 : return m.FileBacking.latestVersionRefs.Load()
473 2 : }
474 :
475 : // SetCompactionState transitions this file's compaction state to the given
476 : // state. Protected by DB.mu.
477 2 : func (m *FileMetadata) SetCompactionState(to CompactionState) {
478 2 : if invariants.Enabled {
479 2 : transitionErr := func() error {
480 0 : return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
481 0 : }
482 2 : switch m.CompactionState {
483 2 : case CompactionStateNotCompacting:
484 2 : if to != CompactionStateCompacting {
485 0 : panic(transitionErr())
486 : }
487 2 : case CompactionStateCompacting:
488 2 : if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
489 0 : panic(transitionErr())
490 : }
491 0 : case CompactionStateCompacted:
492 0 : panic(transitionErr())
493 0 : default:
494 0 : panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
495 : }
496 : }
497 2 : m.CompactionState = to
498 : }
499 :
500 : // IsCompacting returns true if this file's compaction state is
501 : // CompactionStateCompacting. Protected by DB.mu.
502 2 : func (m *FileMetadata) IsCompacting() bool {
503 2 : return m.CompactionState == CompactionStateCompacting
504 2 : }
505 :
506 : // StatsValid returns true if the table stats have been populated. If StatValid
507 : // returns true, the Stats field may be read (with or without holding the
508 : // database mutex).
509 2 : func (m *FileMetadata) StatsValid() bool {
510 2 : return m.statsValid.Load()
511 2 : }
512 :
513 : // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
514 : // while populating TableStats and calling StatsMarkValud. Once stats are
515 : // populated, they must not be mutated.
516 2 : func (m *FileMetadata) StatsMarkValid() {
517 2 : m.statsValid.Store(true)
518 2 : }
519 :
520 : // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
521 : // and overall table bounds with the given smallest and largest keys. The
522 : // smallest and largest bounds may not be extended if the table already has a
523 : // bound that is smaller or larger, respectively. The receiver is returned.
524 : // NB: calling this method should be preferred to manually setting the bounds by
525 : // manipulating the fields directly, to maintain certain invariants.
526 : func (m *FileMetadata) ExtendPointKeyBounds(
527 : cmp Compare, smallest, largest InternalKey,
528 2 : ) *FileMetadata {
529 2 : // Update the point key bounds.
530 2 : if !m.HasPointKeys {
531 2 : m.SmallestPointKey, m.LargestPointKey = smallest, largest
532 2 : m.HasPointKeys = true
533 2 : } else {
534 2 : if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
535 2 : m.SmallestPointKey = smallest
536 2 : }
537 2 : if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
538 2 : m.LargestPointKey = largest
539 2 : }
540 : }
541 : // Update the overall bounds.
542 2 : m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
543 2 : return m
544 : }
545 :
546 : // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
547 : // and overall table bounds with the given smallest and largest keys. The
548 : // smallest and largest bounds may not be extended if the table already has a
549 : // bound that is smaller or larger, respectively. The receiver is returned.
550 : // NB: calling this method should be preferred to manually setting the bounds by
551 : // manipulating the fields directly, to maintain certain invariants.
552 : func (m *FileMetadata) ExtendRangeKeyBounds(
553 : cmp Compare, smallest, largest InternalKey,
554 2 : ) *FileMetadata {
555 2 : // Update the range key bounds.
556 2 : if !m.HasRangeKeys {
557 2 : m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
558 2 : m.HasRangeKeys = true
559 2 : } else {
560 1 : if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
561 0 : m.SmallestRangeKey = smallest
562 0 : }
563 1 : if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
564 1 : m.LargestRangeKey = largest
565 1 : }
566 : }
567 : // Update the overall bounds.
568 2 : m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
569 2 : return m
570 : }
571 :
572 : // extendOverallBounds attempts to extend the overall table lower and upper
573 : // bounds. The given bounds may not be used if a lower or upper bound already
574 : // exists that is smaller or larger than the given keys, respectively. The given
575 : // boundType will be used if the bounds are updated.
576 : func (m *FileMetadata) extendOverallBounds(
577 : cmp Compare, smallest, largest InternalKey, bTyp boundType,
578 2 : ) {
579 2 : if !m.boundsSet {
580 2 : m.Smallest, m.Largest = smallest, largest
581 2 : m.boundsSet = true
582 2 : m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
583 2 : } else {
584 2 : if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
585 2 : m.Smallest = smallest
586 2 : m.boundTypeSmallest = bTyp
587 2 : }
588 2 : if base.InternalCompare(cmp, largest, m.Largest) > 0 {
589 2 : m.Largest = largest
590 2 : m.boundTypeLargest = bTyp
591 2 : }
592 : }
593 : }
594 :
595 : // Overlaps returns true if the file key range overlaps with the given range.
596 2 : func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
597 2 : if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
598 2 : // f is completely before the specified range; no overlap.
599 2 : return false
600 2 : }
601 2 : if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) {
602 2 : // f is completely after the specified range; no overlap.
603 2 : return false
604 2 : }
605 2 : return true
606 : }
607 :
608 : // ContainedWithinSpan returns true if the file key range completely overlaps with the
609 : // given range ("end" is assumed to exclusive).
610 1 : func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
611 1 : lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
612 1 : return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
613 1 : }
614 :
615 : // ContainsKeyType returns whether or not the file contains keys of the provided
616 : // type.
617 2 : func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
618 2 : switch kt {
619 2 : case KeyTypePointAndRange:
620 2 : return true
621 2 : case KeyTypePoint:
622 2 : return m.HasPointKeys
623 2 : case KeyTypeRange:
624 2 : return m.HasRangeKeys
625 0 : default:
626 0 : panic("unrecognized key type")
627 : }
628 : }
629 :
630 : // SmallestBound returns the file's smallest bound of the key type. It returns a
631 : // false second return value if the file does not contain any keys of the key
632 : // type.
633 2 : func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
634 2 : switch kt {
635 0 : case KeyTypePointAndRange:
636 0 : return &m.Smallest, true
637 2 : case KeyTypePoint:
638 2 : return &m.SmallestPointKey, m.HasPointKeys
639 2 : case KeyTypeRange:
640 2 : return &m.SmallestRangeKey, m.HasRangeKeys
641 0 : default:
642 0 : panic("unrecognized key type")
643 : }
644 : }
645 :
646 : // LargestBound returns the file's largest bound of the key type. It returns a
647 : // false second return value if the file does not contain any keys of the key
648 : // type.
649 2 : func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
650 2 : switch kt {
651 0 : case KeyTypePointAndRange:
652 0 : return &m.Largest, true
653 2 : case KeyTypePoint:
654 2 : return &m.LargestPointKey, m.HasPointKeys
655 2 : case KeyTypeRange:
656 2 : return &m.LargestRangeKey, m.HasRangeKeys
657 0 : default:
658 0 : panic("unrecognized key type")
659 : }
660 : }
661 :
// Bit masks used by boundsMarker to encode bound information in a single byte.
const (
	maskContainsPointKeys = 1 << 0 // table contains point keys
	maskSmallest          = 1 << 1 // smallest overall bound is a point key
	maskLargest           = 1 << 2 // largest overall bound is a point key
)
667 :
668 : // boundsMarker returns a marker byte whose bits encode the following
669 : // information (in order from least significant bit):
670 : // - if the table contains point keys
671 : // - if the table's smallest key is a point key
672 : // - if the table's largest key is a point key
673 2 : func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
674 2 : if m.HasPointKeys {
675 2 : sentinel |= maskContainsPointKeys
676 2 : }
677 2 : switch m.boundTypeSmallest {
678 2 : case boundTypePointKey:
679 2 : sentinel |= maskSmallest
680 2 : case boundTypeRangeKey:
681 : // No op - leave bit unset.
682 0 : default:
683 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
684 : }
685 2 : switch m.boundTypeLargest {
686 2 : case boundTypePointKey:
687 2 : sentinel |= maskLargest
688 2 : case boundTypeRangeKey:
689 : // No op - leave bit unset.
690 0 : default:
691 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
692 : }
693 2 : return
694 : }
695 :
696 : // String implements fmt.Stringer, printing the file number and the overall
697 : // table bounds.
698 2 : func (m *FileMetadata) String() string {
699 2 : return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
700 2 : }
701 :
702 : // DebugString returns a verbose representation of FileMetadata, typically for
703 : // use in tests and debugging, returning the file number and the point, range
704 : // and overall bounds for the table.
705 2 : func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
706 2 : var b bytes.Buffer
707 2 : fmt.Fprintf(&b, "%s:[%s-%s]",
708 2 : m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
709 2 : if !verbose {
710 2 : return b.String()
711 2 : }
712 1 : fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
713 1 : if m.HasPointKeys {
714 1 : fmt.Fprintf(&b, " points:[%s-%s]",
715 1 : m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
716 1 : }
717 1 : if m.HasRangeKeys {
718 1 : fmt.Fprintf(&b, " ranges:[%s-%s]",
719 1 : m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
720 1 : }
721 1 : return b.String()
722 : }
723 :
// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation.
//
// The parser consumes fields in groups of three, so the input must tokenize
// into a multiple of three fields; the loop below relies on that exact
// consumption order.
func ParseFileMetadataDebug(s string) (*FileMetadata, error) {
	// Split lines of the form:
	//  000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
	// The separators ':', '[', '-', ']' and whitespace are all dropped, so each
	// "prefix:[a-b]" group contributes exactly three fields.
	fields := strings.FieldsFunc(s, func(c rune) bool {
		switch c {
		case ':', '[', '-', ']':
			return true
		default:
			return unicode.IsSpace(c) // NB: also trim whitespace padding.
		}
	})
	if len(fields)%3 != 0 {
		return nil, errors.Newf("malformed input: %s", s)
	}
	m := &FileMetadata{}
	for len(fields) > 0 {
		prefix := fields[0]
		// seqnums carries two integers rather than two internal keys, so it is
		// handled before the generic key-parsing path below.
		if prefix == "seqnums" {
			smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum
			fields = fields[3:]
			continue
		}
		smallest := base.ParsePrettyInternalKey(fields[1])
		largest := base.ParsePrettyInternalKey(fields[2])
		switch prefix {
		case "points":
			m.SmallestPointKey, m.LargestPointKey = smallest, largest
			m.HasPointKeys = true
		case "ranges":
			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
			m.HasRangeKeys = true
		default:
			// Any other prefix is the file number followed by the overall
			// table bounds.
			fileNum, err := strconv.ParseUint(prefix, 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.FileNum = base.FileNum(fileNum)
			m.Smallest, m.Largest = smallest, largest
			m.boundsSet = true
		}
		fields = fields[3:]
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	m.InitPhysicalBacking()
	return m, nil
}
786 :
// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent. It checks that: the file has at least one of point
// or range keys; each bound pair is ordered; the point/range key bounds fall
// within the overall bounds; and that FileMetadata.Init was called (a
// FileBacking is set).
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.

	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}

	// Point key validation.

	if m.HasPointKeys {
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		// The point key bounds must be contained within the overall bounds.
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}

	// Range key validation.

	if m.HasRangeKeys {
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		// The range key bounds must be contained within the overall bounds.
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}

	// Ensure that FileMetadata.Init was called.
	if m.FileBacking == nil {
		return base.CorruptionErrorf("file metadata FileBacking not set")
	}

	return nil
}
853 :
854 : // TableInfo returns a subset of the FileMetadata state formatted as a
855 : // TableInfo.
856 2 : func (m *FileMetadata) TableInfo() TableInfo {
857 2 : return TableInfo{
858 2 : FileNum: m.FileNum,
859 2 : Size: m.Size,
860 2 : Smallest: m.Smallest,
861 2 : Largest: m.Largest,
862 2 : SmallestSeqNum: m.SmallestSeqNum,
863 2 : LargestSeqNum: m.LargestSeqNum,
864 2 : }
865 2 : }
866 :
// cmpUint64 three-way compares a and b, returning -1, 0 or +1.
func cmpUint64(a, b uint64) int {
	if a < b {
		return -1
	}
	if a > b {
		return +1
	}
	return 0
}
877 :
878 2 : func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
879 2 : // NB: This is the same ordering that RocksDB uses for L0 files.
880 2 :
881 2 : // Sort first by largest sequence number.
882 2 : if m.LargestSeqNum != b.LargestSeqNum {
883 2 : return cmpUint64(m.LargestSeqNum, b.LargestSeqNum)
884 2 : }
885 : // Then by smallest sequence number.
886 2 : if m.SmallestSeqNum != b.SmallestSeqNum {
887 2 : return cmpUint64(m.SmallestSeqNum, b.SmallestSeqNum)
888 2 : }
889 : // Break ties by file number.
890 2 : return cmpUint64(uint64(m.FileNum), uint64(b.FileNum))
891 : }
892 :
// lessSeqNum reports whether m orders before b under the seqnum ordering
// defined by cmpSeqNum.
func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
	return m.cmpSeqNum(b) < 0
}
896 :
// cmpSmallestKey three-way compares two files by their smallest internal key.
func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
}
900 :
901 : // KeyRange returns the minimum smallest and maximum largest internalKey for
902 : // all the FileMetadata in iters.
903 2 : func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
904 2 : first := true
905 2 : for _, iter := range iters {
906 2 : for meta := iter.First(); meta != nil; meta = iter.Next() {
907 2 : if first {
908 2 : first = false
909 2 : smallest, largest = meta.Smallest, meta.Largest
910 2 : continue
911 : }
912 2 : if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
913 2 : smallest = meta.Smallest
914 2 : }
915 2 : if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
916 2 : largest = meta.Largest
917 2 : }
918 : }
919 : }
920 2 : return smallest, largest
921 : }
922 :
// bySeqNum adapts a slice of files to sort.Interface, ordering by the seqnum
// ordering defined by FileMetadata.lessSeqNum.
type bySeqNum []*FileMetadata

func (b bySeqNum) Len() int { return len(b) }
func (b bySeqNum) Less(i, j int) bool {
	return b[i].lessSeqNum(b[j])
}
func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// SortBySeqNum sorts the specified files by increasing sequence number.
func SortBySeqNum(files []*FileMetadata) {
	sort.Sort(bySeqNum(files))
}
935 :
// bySmallest adapts a slice of files to sort.Interface, ordering by smallest
// internal key under the supplied user-key comparator.
type bySmallest struct {
	files []*FileMetadata
	cmp   Compare
}

func (b bySmallest) Len() int { return len(b.files) }
func (b bySmallest) Less(i, j int) bool {
	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
}
func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }

// SortBySmallest sorts the specified files by smallest key using the supplied
// comparison function to order user keys.
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}
952 :
// overlaps returns the slice of files in iter whose user-key ranges intersect
// [start, end] (or [start, end) when exclusiveEnd is true). It positions two
// clones of iter at the first and last overlapping files — the positioning is
// the side effect that matters, which is why the file returns below are
// deliberately discarded — and returns a LevelSlice bounded by those
// iterators. Callers must ensure the files in iter are non-overlapping and
// sorted by key (i.e. a non-L0 level).
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
	startIter := iter.Clone()
	{
		startIterFile := startIter.SeekGE(cmp, start)
		// SeekGE compares user keys. The user key `start` may be equal to the
		// f.Largest because f.Largest is a range deletion sentinel, indicating
		// that the user key `start` is NOT contained within the file f. If
		// that's the case, we can narrow the overlapping bounds to exclude the
		// file with the sentinel.
		if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
			cmp(startIterFile.Largest.UserKey, start) == 0 {
			startIterFile = startIter.Next()
		}
		_ = startIterFile // Ignore unused assignment.
	}

	endIter := iter.Clone()
	{
		endIterFile := endIter.SeekGE(cmp, end)

		if !exclusiveEnd {
			// endIter is now pointing at the *first* file with a largest key >= end.
			// If there are multiple files including the user key `end`, we want all
			// of them, so move forward.
			for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
				endIterFile = endIter.Next()
			}
		}

		// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
		// or nexted too far because Largest.UserKey equaled `end`, go back.
		//
		// Consider !exclusiveEnd and end = 'f', with the following file bounds:
		//
		//     [b,d] [e, f] [f, f] [g, h]
		//
		// the above for loop will Next until it arrives at [g, h]. We need to
		// observe that g > f, and Prev to the file with bounds [f, f].
		if endIterFile == nil {
			endIterFile = endIter.Prev()
		} else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
			endIterFile = endIter.Prev()
		}
		_ = endIterFile // Ignore unused assignment.
	}
	return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
}
1000 :
// NumLevels is the number of levels a Version contains (L0 through L6).
const NumLevels = 7
1003 :
// NewVersion constructs a new Version with the provided files. It requires
// the provided files are already well-ordered. It's intended for testing.
// It panics if initializing the L0 sublevels fails (e.g. the provided L0
// files violate the sublevel invariants).
func NewVersion(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
) *Version {
	var v Version
	for l := range files {
		// NB: We specifically insert `files` into the B-Tree in the order
		// they appear within `files`. Some tests depend on this behavior in
		// order to test consistency checking, etc. Once we've constructed the
		// initial B-Tree, we swap out the btreeCmp for the correct one.
		// TODO(jackson): Adjust or remove the tests and remove this.
		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
		v.Levels[l].level = l
		// L0 is ordered by seqnum; every other level by smallest key.
		if l == 0 {
			v.Levels[l].tree.cmp = btreeCmpSeqNum
		} else {
			v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
		}
		for _, f := range files[l] {
			v.Levels[l].totalSize += f.Size
		}
	}
	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
		panic(err)
	}
	return &v
}
1032 :
// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contain in
// level 0 sstables. In particular, it is valid for one level 0 sstable to have
// the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	// refs counts the references held on this version; when it drops to
	// zero the version's file references are released (see Unref).
	refs atomic.Int32

	// The level 0 sstables are organized in a series of sublevels. Similar to
	// the seqnum invariant in normal levels, there is no internal key in a
	// higher level table that has both the same user key and a higher sequence
	// number. Within a sublevel, tables are sorted by their internal key range
	// and any two tables at the same sublevel do not overlap. Unlike the normal
	// levels, sublevel n contains older tables (lower sequence numbers) than
	// sublevel n+1.
	//
	// The L0Sublevels struct is mostly used for compaction picking. As most
	// internal data structures in it are only necessary for compaction picking
	// and not for iterator creation, the reference to L0Sublevels is nil'd
	// after this version becomes the non-newest version, to reduce memory
	// usage.
	//
	// L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
	// in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
	// a reference to that slice, as that slice is necessary for iterator
	// creation and needs to outlast L0Sublevels.
	L0Sublevels     *L0Sublevels
	L0SublevelFiles []LevelSlice

	// Levels holds the file metadata for each of the NumLevels levels of the
	// LSM.
	Levels [NumLevels]LevelMetadata

	// RangeKeyLevels holds a subset of the same files as Levels that contain range
	// keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
	// duplication should be minimal, as range keys are expected to be rare.
	RangeKeyLevels [NumLevels]LevelMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held.
	Deleted func(obsolete []*FileBacking)

	// Stats holds aggregated stats about the version maintained from
	// version to version.
	Stats struct {
		// MarkedForCompaction records the count of files marked for
		// compaction within the version.
		MarkedForCompaction int
	}

	// The list the version is linked into.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}
1106 :
// String implements fmt.Stringer, printing the FileMetadata for each level in
// the Version.
func (v *Version) String() string {
	return v.string(base.DefaultFormatter, false)
}
1112 :
// DebugString returns an alternative format to String() which includes sequence
// number and kind information for the sstable boundaries.
func (v *Version) DebugString(format base.FormatKey) string {
	return v.string(format, true)
}
1118 :
1119 2 : func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
1120 2 : var buf bytes.Buffer
1121 2 : for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
1122 2 : fmt.Fprintf(&buf, "0.%d:\n", sublevel)
1123 2 : sublevels[sublevel].Each(func(f *FileMetadata) {
1124 2 : fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose))
1125 2 : })
1126 : }
1127 2 : return buf.String()
1128 : }
1129 :
1130 1 : func (v *Version) string(format base.FormatKey, verbose bool) string {
1131 1 : var buf bytes.Buffer
1132 1 : if len(v.L0SublevelFiles) > 0 {
1133 1 : fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles))
1134 1 : }
1135 1 : for level := 1; level < NumLevels; level++ {
1136 1 : if v.Levels[level].Empty() {
1137 1 : continue
1138 : }
1139 1 : fmt.Fprintf(&buf, "%d:\n", level)
1140 1 : iter := v.Levels[level].Iter()
1141 1 : for f := iter.First(); f != nil; f = iter.Next() {
1142 1 : fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose))
1143 1 : }
1144 : }
1145 1 : return buf.String()
1146 : }
1147 :
1148 : // ParseVersionDebug parses a Version from its DebugString output.
1149 : func ParseVersionDebug(
1150 : cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string,
1151 1 : ) (*Version, error) {
1152 1 : var level int
1153 1 : var files [NumLevels][]*FileMetadata
1154 1 : for _, l := range strings.Split(s, "\n") {
1155 1 : l = strings.TrimSpace(l)
1156 1 :
1157 1 : switch l[:2] {
1158 1 : case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:":
1159 1 : var err error
1160 1 : level, err = strconv.Atoi(l[:1])
1161 1 : if err != nil {
1162 0 : return nil, err
1163 0 : }
1164 1 : default:
1165 1 : m, err := ParseFileMetadataDebug(l)
1166 1 : if err != nil {
1167 0 : return nil, err
1168 0 : }
1169 : // If we only parsed overall bounds, default to setting the point bounds.
1170 1 : if !m.HasPointKeys && !m.HasRangeKeys {
1171 0 : m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
1172 0 : m.HasPointKeys = true
1173 0 : }
1174 1 : files[level] = append(files[level], m)
1175 : }
1176 : }
1177 : // Reverse the order of L0 files. This ensures we construct the same
1178 : // sublevels. (They're printed from higher sublevel to lower, which means in
1179 : // a partial order that represents newest to oldest).
1180 1 : for i := 0; i < len(files[0])/2; i++ {
1181 1 : files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i]
1182 1 : }
1183 1 : return NewVersion(cmp, formatKey, flushSplitBytes, files), nil
1184 : }
1185 :
// Refs returns the number of references to the version.
func (v *Version) Refs() int32 {
	return v.refs.Load()
}
1190 :
// Ref increments the version refcount.
func (v *Version) Ref() {
	v.refs.Add(1)
}
1195 :
// Unref decrements the version refcount. If the last reference to the version
// was removed, the version is removed from the list of versions and the
// Deleted callback is invoked. Requires that the VersionList mutex is NOT
// locked.
func (v *Version) Unref() {
	if v.refs.Add(-1) == 0 {
		l := v.list
		l.mu.Lock()
		l.Remove(v)
		obsolete := v.unrefFiles()
		// Hand the backings of the now-obsolete files to the Deleted
		// callback, which is invoked with the list mutex held.
		fileBacking := make([]*FileBacking, len(obsolete))
		for i, f := range obsolete {
			fileBacking[i] = f.FileBacking
		}
		v.Deleted(fileBacking)
		l.mu.Unlock()
	}
}
1214 :
// UnrefLocked decrements the version refcount. If the last reference to the
// version was removed, the version is removed from the list of versions and
// the Deleted callback is invoked. Requires that the VersionList mutex is
// already locked.
func (v *Version) UnrefLocked() {
	if v.refs.Add(-1) == 0 {
		v.list.Remove(v)
		obsolete := v.unrefFiles()
		// Hand the backings of the now-obsolete files to the Deleted
		// callback (the list mutex is already held by the caller).
		fileBacking := make([]*FileBacking, len(obsolete))
		for i, f := range obsolete {
			fileBacking[i] = f.FileBacking
		}
		v.Deleted(fileBacking)
	}
}
1230 :
// unrefFiles releases this version's reference on every file in every level
// (including the RangeKeyLevels duplicates) and returns the files whose last
// reference was just dropped.
// NOTE(review): the range copies each LevelMetadata value and release() is
// invoked on the copy — presumably safe because the refcounted state is
// behind pointers inside LevelMetadata; confirm before restructuring.
func (v *Version) unrefFiles() []*FileMetadata {
	var obsolete []*FileMetadata
	for _, lm := range v.Levels {
		obsolete = append(obsolete, lm.release()...)
	}
	for _, lm := range v.RangeKeyLevels {
		obsolete = append(obsolete, lm.release()...)
	}
	return obsolete
}
1241 :
// Next returns the next version in the list of versions.
func (v *Version) Next() *Version {
	return v.next
}
1246 :
1247 : // InitL0Sublevels initializes the L0Sublevels
1248 : func (v *Version) InitL0Sublevels(
1249 : cmp Compare, formatKey base.FormatKey, flushSplitBytes int64,
1250 2 : ) error {
1251 2 : var err error
1252 2 : v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
1253 2 : if err == nil && v.L0Sublevels != nil {
1254 2 : v.L0SublevelFiles = v.L0Sublevels.Levels
1255 2 : }
1256 2 : return err
1257 : }
1258 :
1259 : // Contains returns a boolean indicating whether the provided file exists in
1260 : // the version at the given level. If level is non-zero then Contains binary
1261 : // searches among the files. If level is zero, Contains scans the entire
1262 : // level.
1263 2 : func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
1264 2 : iter := v.Levels[level].Iter()
1265 2 : if level > 0 {
1266 2 : overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
1267 2 : m.Largest.IsExclusiveSentinel())
1268 2 : iter = overlaps.Iter()
1269 2 : }
1270 2 : for f := iter.First(); f != nil; f = iter.Next() {
1271 2 : if f == m {
1272 2 : return true
1273 2 : }
1274 : }
1275 2 : return false
1276 : }
1277 :
// Overlaps returns all elements of v.files[level] whose user key range
// intersects the given range. If level is non-zero then the user key ranges of
// v.files[level] are assumed to not overlap (although they may touch). If level
// is zero then that assumption cannot be made, and the [start, end] range is
// expanded to the union of those matching ranges so far and the computation is
// repeated until [start, end] stabilizes.
// The returned files are a subsequence of the input files, i.e., the ordering
// is not changed.
func (v *Version) Overlaps(
	level int, cmp Compare, start, end []byte, exclusiveEnd bool,
) LevelSlice {
	if level == 0 {
		// Indices that have been selected as overlapping.
		l0 := v.Levels[level]
		l0Iter := l0.Iter()
		selectedIndices := make([]bool, l0.Len())
		numSelected := 0
		var slice LevelSlice
		// Fixpoint loop: keep rescanning until a full pass selects no new
		// files and does not expand [start, end].
		for {
			restart := false
			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
				selected := selectedIndices[i]
				if selected {
					continue
				}
				if !meta.Overlaps(cmp, start, end, exclusiveEnd) {
					// meta is completely outside the specified range; skip it.
					continue
				}
				// Overlaps.
				selectedIndices[i] = true
				numSelected++

				smallest := meta.Smallest.UserKey
				largest := meta.Largest.UserKey
				// Since level == 0, check if the newly added fileMetadata has
				// expanded the range. We expand the range immediately for files
				// we have remaining to check in this loop. All already checked
				// and unselected files will need to be rechecked via the
				// restart below.
				if cmp(smallest, start) < 0 {
					start = smallest
					restart = true
				}
				if v := cmp(largest, end); v > 0 {
					end = largest
					exclusiveEnd = meta.Largest.IsExclusiveSentinel()
					restart = true
				} else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
					// Only update the exclusivity of our existing `end`
					// bound.
					exclusiveEnd = false
					restart = true
				}
			}

			if !restart {
				// Construct a B-Tree containing only the matching items.
				var tr btree
				tr.cmp = v.Levels[level].tree.cmp
				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
					if selectedIndices[i] {
						err := tr.Insert(meta)
						if err != nil {
							panic(err)
						}
					}
				}
				slice = newLevelSlice(tr.Iter())
				// TODO(jackson): Avoid the oddity of constructing and
				// immediately releasing a B-Tree. Make LevelSlice an
				// interface?
				tr.Release()
				break
			}
			// Continue looping to retry the files that were not selected.
		}
		return slice
	}

	// Non-L0 levels are sorted and non-overlapping; delegate to the simple
	// iterator-based search.
	return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
}
1360 :
// CheckOrdering checks that the files are consistent with respect to
// seqnum ordering (for level 0 sublevels and files) and increasing and non-
// overlapping internal key ranges (for level non-0 files). It delegates the
// per-level invariants to the free function CheckOrdering, appending the
// version's DebugString to any error for context.
func (v *Version) CheckOrdering(cmp Compare, format base.FormatKey) error {
	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
		if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}

	for level, lm := range v.Levels {
		if err := CheckOrdering(cmp, format, Level(level), lm.Iter()); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}
	return nil
}
1379 :
// VersionList holds a list of versions. The versions are ordered from oldest
// to newest. It is implemented as a circular doubly-linked list with a
// sentinel root node.
type VersionList struct {
	mu   *sync.Mutex
	root Version
}
1386 :
// Init initializes the version list, linking the sentinel root to itself so
// the list starts empty.
func (l *VersionList) Init(mu *sync.Mutex) {
	l.mu = mu
	l.root.next = &l.root
	l.root.prev = &l.root
}
1393 :
// Empty returns true if the list is empty, and false otherwise.
func (l *VersionList) Empty() bool {
	return l.root.next == &l.root
}
1398 :
// Front returns the oldest version in the list. Note that this version is only
// valid if Empty() returns false (with an empty list, Front returns the
// sentinel root node).
func (l *VersionList) Front() *Version {
	return l.root.next
}
1404 :
// Back returns the newest version in the list. Note that this version is only
// valid if Empty() returns false (with an empty list, Back returns the
// sentinel root node).
func (l *VersionList) Back() *Version {
	return l.root.prev
}
1410 :
// PushBack adds a new version to the back of the list. This new version
// becomes the "newest" version in the list. Panics if v is already linked
// into a list.
func (l *VersionList) PushBack(v *Version) {
	if v.list != nil || v.prev != nil || v.next != nil {
		panic("pebble: version list is inconsistent")
	}
	// Splice v in just before the sentinel root (the back of the list).
	v.prev = l.root.prev
	v.prev.next = v
	v.next = &l.root
	v.next.prev = v
	v.list = l
	// Let L0Sublevels on the second newest version get GC'd, as it is no longer
	// necessary. See the comment in Version.
	v.prev.L0Sublevels = nil
}
1426 :
// Remove removes the specified version from the list. Panics if v is the
// sentinel root or belongs to a different list.
func (l *VersionList) Remove(v *Version) {
	if v == &l.root {
		panic("pebble: cannot remove version list root node")
	}
	if v.list != l {
		panic("pebble: version list is inconsistent")
	}
	v.prev.next = v.next
	v.next.prev = v.prev
	v.next = nil // avoid memory leaks
	v.prev = nil // avoid memory leaks
	v.list = nil // avoid memory leaks
}
1441 :
// CheckOrdering checks that the files are consistent with respect to
// seqnums (for level 0 files -- see detailed comment below) and increasing and non-
// overlapping internal key ranges (for non-level 0 files). Returns a
// corruption error describing the first violated invariant, or nil.
func CheckOrdering(cmp Compare, format base.FormatKey, level Level, files LevelIterator) error {
	// The invariants to check for L0 sublevels are the same as the ones to
	// check for all other levels. However, if L0 is not organized into
	// sublevels, or if all L0 files are being passed in, we do the legacy L0
	// checks, defined in the detailed comment below.
	if level == Level(0) {
		// We have 2 kinds of files:
		// - Files with exactly one sequence number: these could be either ingested files
		//   or flushed files. We cannot tell the difference between them based on FileMetadata,
		//   so our consistency checking here uses the weaker checks assuming it is a narrow
		//   flushed file. We cannot error on ingested files having sequence numbers coincident
		//   with flushed files as the seemingly ingested file could just be a flushed file
		//   with just one key in it which is a truncated range tombstone sharing sequence numbers
		//   with other files in the same flush.
		// - Files with multiple sequence numbers: these are necessarily flushed files.
		//
		// Three cases of overlapping sequence numbers:
		// Case 1:
		// An ingested file contained in the sequence numbers of the flushed file -- it must be
		// fully contained (not coincident with either end of the flushed file) since the memtable
		// must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
		// num b, and the memtable got a subsequent update that was given sequence num b+1, before
		// being flushed.
		//
		// So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
		// third file are inconsistent with each other. So comparing adjacent files is insufficient
		// for consistency checking.
		//
		// Visually we have something like
		// x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
		//     y       y  y        y             (y's represent ingested files)
		// And these are ordered in increasing order of y. Note that y's must be unique.
		//
		// Case 2:
		// A flushed file that did not overlap in keys with any file in any level, but does overlap
		// in the file key intervals. This file is placed in L0 since it overlaps in the file
		// key intervals but since it has no overlapping data, it is assigned a sequence number
		// of 0 in RocksDB. We handle this case for compatibility with RocksDB.
		//
		// Case 3:
		// A sequence of flushed files that overlap in sequence numbers with one another,
		// but do not overlap in keys inside the sstables. These files correspond to
		// partitioned flushes or the results of intra-L0 compactions of partitioned
		// flushes.
		//
		// Since these types of SSTables violate most other sequence number
		// overlap invariants, and handling this case is important for compatibility
		// with future versions of pebble, this method relaxes most L0 invariant
		// checks.

		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if prev == nil {
				continue
			}
			// Validate that the sorting is sane.
			if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
				// Multiple files satisfying case 2 mentioned above.
			} else if !prev.lessSeqNum(f) {
				return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
					errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
					errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
					errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
			}
		}
	} else {
		// Non-L0 levels: each file must be internally consistent, files must
		// be ordered by smallest key, and adjacent files must not overlap.
		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if err := f.Validate(cmp, format); err != nil {
				return errors.Wrapf(err, "%s ", level)
			}
			if prev != nil {
				if prev.cmpSmallestKey(f, cmp) >= 0 {
					return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
						f.Smallest.Pretty(format), f.Largest.Pretty(format))
				}
				if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
					return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
						f.Smallest.Pretty(format), f.Largest.Pretty(format))
				}
			}
		}
	}
	return nil
}
|