Line data Source code
1 : // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package manifest
6 :
7 : import (
8 : "bytes"
9 : stdcmp "cmp"
10 : "fmt"
11 : "sort"
12 : "strconv"
13 : "strings"
14 : "sync"
15 : "sync/atomic"
16 : "unicode"
17 :
18 : "github.com/cockroachdb/errors"
19 : "github.com/cockroachdb/pebble/internal/base"
20 : "github.com/cockroachdb/pebble/internal/invariants"
21 : )
22 :
23 : // Compare exports the base.Compare type.
24 : type Compare = base.Compare
25 :
26 : // InternalKey exports the base.InternalKey type.
27 : type InternalKey = base.InternalKey
28 :
29 : // TableInfo contains the common information for table related events.
30 : type TableInfo struct {
31 : // FileNum is the internal DB identifier for the table.
32 : FileNum base.FileNum
33 : // Size is the size of the file in bytes.
34 : Size uint64
35 : // Smallest is the smallest internal key in the table.
36 : Smallest InternalKey
37 : // Largest is the largest internal key in the table.
38 : Largest InternalKey
39 : // SmallestSeqNum is the smallest sequence number in the table.
40 : SmallestSeqNum uint64
41 : // LargestSeqNum is the largest sequence number in the table.
42 : LargestSeqNum uint64
43 : }
44 :
45 : // TableStats contains statistics on a table used for compaction heuristics,
46 : // and export via Metrics.
47 : type TableStats struct {
48 : // The total number of entries in the table.
49 : NumEntries uint64
50 : // The number of point and range deletion entries in the table.
51 : NumDeletions uint64
52 : // NumRangeKeySets is the total number of range key sets in the table.
53 : //
54 : // NB: If there's a chance that the sstable contains any range key sets,
55 : // then NumRangeKeySets must be > 0.
56 : NumRangeKeySets uint64
57 : // Estimate of the total disk space that may be dropped by this table's
58 : // point deletions by compacting them.
59 : PointDeletionsBytesEstimate uint64
60 : // Estimate of the total disk space that may be dropped by this table's
61 : // range deletions by compacting them. This estimate is at data-block
62 : // granularity and is not updated if compactions beneath the table reduce
63 : // the amount of reclaimable disk space. It also does not account for
64 : // overlapping data in L0 and ignores L0 sublevels, but the error that
65 : // introduces is expected to be small.
66 : //
67 : // Tables in the bottommost level of the LSM may have a nonzero estimate if
68 : // snapshots or move compactions prevented the elision of their range
69 : // tombstones. A table in the bottommost level that was ingested into L6
70 : // will have a zero estimate, because the file's sequence numbers indicate
71 : // that the tombstone cannot drop any data contained within the file itself.
72 : RangeDeletionsBytesEstimate uint64
73 : // Total size of value blocks and value index block.
74 : ValueBlocksSize uint64
75 : }
76 :
77 : // boundType represents the type of key (point or range) present as the smallest
78 : // and largest keys.
79 : type boundType uint8
80 :
81 : const (
82 : boundTypePointKey boundType = iota + 1
83 : boundTypeRangeKey
84 : )
85 :
86 : // CompactionState is the compaction state of a file.
87 : //
88 : // The following shows the valid state transitions:
89 : //
90 : // NotCompacting --> Compacting --> Compacted
91 : // ^ |
92 : // | |
93 : // +-------<-------+
94 : //
95 : // Input files to a compaction transition to Compacting when a compaction is
96 : // picked. A file that has finished compacting typically transitions into the
97 : // Compacted state, at which point it is effectively obsolete ("zombied") and
98 : // will eventually be removed from the LSM. A file that has been move-compacted
99 : // will transition from Compacting back into the NotCompacting state, signaling
100 : // that the file may be selected for a subsequent compaction. A failed
101 : // compaction will result in all input tables transitioning from Compacting to
102 : // NotCompacting.
103 : //
104 : // This state is in-memory only. It is not persisted to the manifest.
105 : type CompactionState uint8
106 :
107 : // CompactionStates.
108 : const (
109 : CompactionStateNotCompacting CompactionState = iota
110 : CompactionStateCompacting
111 : CompactionStateCompacted
112 : )
113 :
114 : // String implements fmt.Stringer.
115 0 : func (s CompactionState) String() string {
116 0 : switch s {
117 0 : case CompactionStateNotCompacting:
118 0 : return "NotCompacting"
119 0 : case CompactionStateCompacting:
120 0 : return "Compacting"
121 0 : case CompactionStateCompacted:
122 0 : return "Compacted"
123 0 : default:
124 0 : panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
125 : }
126 : }
127 :
128 : // FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
129 : // some version. FileMetadata does not contain the actual level of the sst,
130 : // since such leveled-ssts can move across levels in different versions, while
131 : // sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
132 : // and virtual. Underlying both leveled-ssts is a backing-sst, for which the
133 : // only state is FileBacking. A backing-sst is level-less. It is possible for a
134 : // backing-sst to be referred to by a physical sst in one version and by one or
135 : // more virtual ssts in one or more versions. A backing-sst becomes obsolete
136 : // and can be deleted once it is no longer required by any physical or virtual
137 : // sst in any version.
138 : //
139 : // We maintain some invariants:
140 : //
141 : // 1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
142 : // and there will be exactly one FileMetadata associated with the FileNum.
143 : //
144 : // 2. Within a version, a backing-sst is either only referred to by one
145 : // physical sst or one or more virtual ssts.
146 : //
147 : // 3. Once a backing-sst is referred to by a virtual sst in the latest version,
148 : // it cannot go back to being referred to by a physical sst in any future
149 : // version.
150 : //
151 : // Once a physical sst is no longer needed by any version, we will no longer
152 : // maintain the file metadata associated with it. We will still maintain the
153 : // FileBacking associated with the physical sst if the backing sst is required
154 : // by any virtual ssts in any version.
155 : type FileMetadata struct {
156 : // AllowedSeeks is used to determine if a file should be picked for
157 : // a read triggered compaction. It is decremented when read sampling
158 : // in pebble.Iterator after every after every positioning operation
159 : // that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
160 : AllowedSeeks atomic.Int64
161 :
162 : // statsValid indicates if stats have been loaded for the table. The
163 : // TableStats structure is populated only if valid is true.
164 : statsValid atomic.Bool
165 :
166 : // FileBacking is the state which backs either a physical or virtual
167 : // sstables.
168 : FileBacking *FileBacking
169 :
170 : // InitAllowedSeeks is the inital value of allowed seeks. This is used
171 : // to re-set allowed seeks on a file once it hits 0.
172 : InitAllowedSeeks int64
173 : // FileNum is the file number.
174 : //
175 : // INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
176 : FileNum base.FileNum
177 : // Size is the size of the file, in bytes. Size is an approximate value for
178 : // virtual sstables.
179 : //
180 : // INVARIANTS:
181 : // - When !FileMetadata.Virtual, Size == FileBacking.Size.
182 : // - Size should be non-zero. Size 0 virtual sstables must not be created.
183 : Size uint64
184 : // File creation time in seconds since the epoch (1970-01-01 00:00:00
185 : // UTC). For ingested sstables, this corresponds to the time the file was
186 : // ingested. For virtual sstables, this corresponds to the wall clock time
187 : // when the FileMetadata for the virtual sstable was first created.
188 : CreationTime int64
189 : // Lower and upper bounds for the smallest and largest sequence numbers in
190 : // the table, across both point and range keys. For physical sstables, these
191 : // values are tight bounds. For virtual sstables, there is no guarantee that
192 : // there will be keys with SmallestSeqNum or LargestSeqNum within virtual
193 : // sstable bounds.
194 : SmallestSeqNum uint64
195 : LargestSeqNum uint64
196 : // SmallestPointKey and LargestPointKey are the inclusive bounds for the
197 : // internal point keys stored in the table. This includes RANGEDELs, which
198 : // alter point keys.
199 : // NB: these field should be set using ExtendPointKeyBounds. They are left
200 : // exported for reads as an optimization.
201 : SmallestPointKey InternalKey
202 : LargestPointKey InternalKey
203 : // SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
204 : // internal range keys stored in the table.
205 : // NB: these field should be set using ExtendRangeKeyBounds. They are left
206 : // exported for reads as an optimization.
207 : SmallestRangeKey InternalKey
208 : LargestRangeKey InternalKey
209 : // Smallest and Largest are the inclusive bounds for the internal keys stored
210 : // in the table, across both point and range keys.
211 : // NB: these fields are derived from their point and range key equivalents,
212 : // and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
213 : Smallest InternalKey
214 : Largest InternalKey
215 : // Stats describe table statistics. Protected by DB.mu.
216 : //
217 : // For virtual sstables, set stats upon virtual sstable creation as
218 : // asynchronous computation of stats is not currently supported.
219 : //
220 : // TODO(bananabrick): To support manifest replay for virtual sstables, we
221 : // probably need to compute virtual sstable stats asynchronously. Otherwise,
222 : // we'd have to write virtual sstable stats to the version edit.
223 : Stats TableStats
224 :
225 : // For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
226 : // pick L0 compactions. Only accurate for the most recent Version.
227 : SubLevel int
228 : L0Index int
229 : minIntervalIndex int
230 : maxIntervalIndex int
231 :
232 : // NB: the alignment of this struct is 8 bytes. We pack all the bools to
233 : // ensure an optimal packing.
234 :
235 : // IsIntraL0Compacting is set to True if this file is part of an intra-L0
236 : // compaction. When it's true, IsCompacting must also return true. If
237 : // Compacting is true and IsIntraL0Compacting is false for an L0 file, the
238 : // file must be part of a compaction to Lbase.
239 : IsIntraL0Compacting bool
240 : CompactionState CompactionState
241 : // True if compaction of this file has been explicitly requested.
242 : // Previously, RocksDB and earlier versions of Pebble allowed this
243 : // flag to be set by a user table property collector. Some earlier
244 : // versions of Pebble respected this flag, while other more recent
245 : // versions ignored this flag.
246 : //
247 : // More recently this flag has been repurposed to facilitate the
248 : // compaction of 'atomic compaction units'. Files marked for
249 : // compaction are compacted in a rewrite compaction at the lowest
250 : // possible compaction priority.
251 : //
252 : // NB: A count of files marked for compaction is maintained on
253 : // Version, and compaction picking reads cached annotations
254 : // determined by this field.
255 : //
256 : // Protected by DB.mu.
257 : MarkedForCompaction bool
258 : // HasPointKeys tracks whether the table contains point keys (including
259 : // RANGEDELs). If a table contains only range deletions, HasPointsKeys is
260 : // still true.
261 : HasPointKeys bool
262 : // HasRangeKeys tracks whether the table contains any range keys.
263 : HasRangeKeys bool
264 : // smallestSet and largestSet track whether the overall bounds have been set.
265 : boundsSet bool
266 : // boundTypeSmallest and boundTypeLargest provide an indication as to which
267 : // key type (point or range) corresponds to the smallest and largest overall
268 : // table bounds.
269 : boundTypeSmallest, boundTypeLargest boundType
270 : // Virtual is true if the FileMetadata belongs to a virtual sstable.
271 : Virtual bool
272 : }
273 :
274 : // PhysicalFileMeta is used by functions which want a guarantee that their input
275 : // belongs to a physical sst and not a virtual sst.
276 : //
277 : // NB: This type should only be constructed by calling
278 : // FileMetadata.PhysicalMeta.
279 : type PhysicalFileMeta struct {
280 : *FileMetadata
281 : }
282 :
283 : // VirtualFileMeta is used by functions which want a guarantee that their input
284 : // belongs to a virtual sst and not a physical sst.
285 : //
286 : // A VirtualFileMeta inherits all the same fields as a FileMetadata. These
287 : // fields have additional invariants imposed on them, and/or slightly varying
288 : // meanings:
289 : // - Smallest and Largest (and their counterparts
290 : // {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
291 : // key at that exact bound. We make the effort to determine the next smallest
292 : // or largest key in an sstable after virtualizing it, to maintain this
293 : // tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
294 : // could mean that a rangedel or range key ends at that user key, or has been
295 : // truncated to that user key.
296 : // - One invariant is that if a rangedel or range key is truncated on its
297 : // upper bound, the virtual sstable *must* have a rangedel or range key
298 : // sentinel key as its upper bound. This is because truncation yields
299 : // an exclusive upper bound for the rangedel/rangekey, and if there are
300 : // any points at that exclusive upper bound within the same virtual
301 : // sstable, those could get uncovered by this truncation. We enforce this
302 : // invariant in calls to keyspan.Truncate.
303 : // - Size is an estimate of the size of the virtualized portion of this sstable.
304 : // The underlying file's size is stored in FileBacking.Size, though it could
305 : // also be estimated or could correspond to just the referenced portion of
306 : // a file (eg. if the file originated on another node).
307 : // - Size must be > 0.
308 : // - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
309 : // This means that all keys in the virtual sstable must have seqnums within
310 : // [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
311 : // a key with a seqnum at either of the bounds. Calculating tight seqnum
312 : // bounds would be too expensive and deliver little value.
313 : //
314 : // NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
315 : type VirtualFileMeta struct {
316 : *FileMetadata
317 : }
318 :
319 : // PhysicalMeta should be the only source of creating the PhysicalFileMeta
320 : // wrapper type.
321 2 : func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
322 2 : if m.Virtual {
323 0 : panic("pebble: file metadata does not belong to a physical sstable")
324 : }
325 2 : return PhysicalFileMeta{
326 2 : m,
327 2 : }
328 : }
329 :
330 : // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
331 : // type.
332 2 : func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
333 2 : if !m.Virtual {
334 0 : panic("pebble: file metadata does not belong to a virtual sstable")
335 : }
336 2 : return VirtualFileMeta{
337 2 : m,
338 2 : }
339 : }
340 :
341 : // FileBacking either backs a single physical sstable, or one or more virtual
342 : // sstables.
343 : //
344 : // See the comment above the FileMetadata type for sstable terminology.
345 : type FileBacking struct {
346 : // Reference count for the backing file on disk: incremented when a
347 : // physical or virtual sstable which is backed by the FileBacking is
348 : // added to a version and decremented when the version is unreferenced.
349 : // We ref count in order to determine when it is safe to delete a
350 : // backing sst file from disk. The backing file is obsolete when the
351 : // reference count falls to zero.
352 : refs atomic.Int32
353 : // latestVersionRefs are the references to the FileBacking in the
354 : // latest version. This reference can be through a single physical
355 : // sstable in the latest version, or one or more virtual sstables in the
356 : // latest version.
357 : //
358 : // INVARIANT: latestVersionRefs <= refs.
359 : latestVersionRefs atomic.Int32
360 : // VirtualizedSize is set iff the backing sst is only referred to by
361 : // virtual ssts in the latest version. VirtualizedSize is the sum of the
362 : // virtual sstable sizes of all of the virtual sstables in the latest
363 : // version which are backed by the physical sstable. When a virtual
364 : // sstable is removed from the latest version, we will decrement the
365 : // VirtualizedSize. During compaction picking, we'll compensate a
366 : // virtual sstable file size by
367 : // (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs.
368 : // The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize
369 : // is high, then the space amplification due to virtual sstables is
370 : // high, and we should pick the virtual sstable with a higher priority.
371 : //
372 : // TODO(bananabrick): Compensate the virtual sstable file size using
373 : // the VirtualizedSize during compaction picking and test.
374 : VirtualizedSize atomic.Uint64
375 : DiskFileNum base.DiskFileNum
376 : Size uint64
377 : }
378 :
379 : // InitPhysicalBacking allocates and sets the FileBacking which is required by a
380 : // physical sstable FileMetadata.
381 : //
382 : // Ensure that the state required by FileBacking, such as the FileNum, is
383 : // already set on the FileMetadata before InitPhysicalBacking is called.
384 : // Calling InitPhysicalBacking only after the relevant state has been set in the
385 : // FileMetadata is not necessary in tests which don't rely on FileBacking.
386 2 : func (m *FileMetadata) InitPhysicalBacking() {
387 2 : if m.Virtual {
388 0 : panic("pebble: virtual sstables should use a pre-existing FileBacking")
389 : }
390 2 : if m.FileBacking == nil {
391 2 : m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()}
392 2 : }
393 : }
394 :
395 : // InitProviderBacking creates a new FileBacking for a file backed by
396 : // an objstorage.Provider.
397 1 : func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) {
398 1 : if !m.Virtual {
399 0 : panic("pebble: provider-backed sstables must be virtual")
400 : }
401 1 : if m.FileBacking == nil {
402 1 : m.FileBacking = &FileBacking{DiskFileNum: fileNum}
403 1 : }
404 : }
405 :
406 : // ValidateVirtual should be called once the FileMetadata for a virtual sstable
407 : // is created to verify that the fields of the virtual sstable are sound.
408 2 : func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
409 2 : if !m.Virtual {
410 0 : panic("pebble: invalid virtual sstable")
411 : }
412 :
413 2 : if createdFrom.SmallestSeqNum != m.SmallestSeqNum {
414 0 : panic("pebble: invalid smallest sequence number for virtual sstable")
415 : }
416 :
417 2 : if createdFrom.LargestSeqNum != m.LargestSeqNum {
418 0 : panic("pebble: invalid largest sequence number for virtual sstable")
419 : }
420 :
421 2 : if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking {
422 0 : panic("pebble: invalid physical sstable state for virtual sstable")
423 : }
424 :
425 2 : if m.Size == 0 {
426 0 : panic("pebble: virtual sstable size must be set upon creation")
427 : }
428 : }
429 :
430 : // Refs returns the refcount of backing sstable.
431 2 : func (m *FileMetadata) Refs() int32 {
432 2 : return m.FileBacking.refs.Load()
433 2 : }
434 :
435 : // Ref increments the ref count associated with the backing sstable.
436 2 : func (m *FileMetadata) Ref() {
437 2 : m.FileBacking.refs.Add(1)
438 2 : }
439 :
440 : // Unref decrements the ref count associated with the backing sstable.
441 2 : func (m *FileMetadata) Unref() int32 {
442 2 : v := m.FileBacking.refs.Add(-1)
443 2 : if invariants.Enabled && v < 0 {
444 0 : panic("pebble: invalid FileMetadata refcounting")
445 : }
446 2 : return v
447 : }
448 :
449 : // LatestRef increments the latest ref count associated with the backing
450 : // sstable.
451 2 : func (m *FileMetadata) LatestRef() {
452 2 : m.FileBacking.latestVersionRefs.Add(1)
453 2 :
454 2 : if m.Virtual {
455 2 : m.FileBacking.VirtualizedSize.Add(m.Size)
456 2 : }
457 : }
458 :
459 : // LatestUnref decrements the latest ref count associated with the backing
460 : // sstable.
461 2 : func (m *FileMetadata) LatestUnref() int32 {
462 2 : if m.Virtual {
463 2 : m.FileBacking.VirtualizedSize.Add(-m.Size)
464 2 : }
465 :
466 2 : v := m.FileBacking.latestVersionRefs.Add(-1)
467 2 : if invariants.Enabled && v < 0 {
468 0 : panic("pebble: invalid FileMetadata latest refcounting")
469 : }
470 2 : return v
471 : }
472 :
473 : // LatestRefs returns the latest ref count associated with the backing sstable.
474 2 : func (m *FileMetadata) LatestRefs() int32 {
475 2 : return m.FileBacking.latestVersionRefs.Load()
476 2 : }
477 :
478 : // SetCompactionState transitions this file's compaction state to the given
479 : // state. Protected by DB.mu.
480 2 : func (m *FileMetadata) SetCompactionState(to CompactionState) {
481 2 : if invariants.Enabled {
482 2 : transitionErr := func() error {
483 0 : return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
484 0 : }
485 2 : switch m.CompactionState {
486 2 : case CompactionStateNotCompacting:
487 2 : if to != CompactionStateCompacting {
488 0 : panic(transitionErr())
489 : }
490 2 : case CompactionStateCompacting:
491 2 : if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
492 0 : panic(transitionErr())
493 : }
494 0 : case CompactionStateCompacted:
495 0 : panic(transitionErr())
496 0 : default:
497 0 : panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
498 : }
499 : }
500 2 : m.CompactionState = to
501 : }
502 :
503 : // IsCompacting returns true if this file's compaction state is
504 : // CompactionStateCompacting. Protected by DB.mu.
505 2 : func (m *FileMetadata) IsCompacting() bool {
506 2 : return m.CompactionState == CompactionStateCompacting
507 2 : }
508 :
509 : // StatsValid returns true if the table stats have been populated. If StatValid
510 : // returns true, the Stats field may be read (with or without holding the
511 : // database mutex).
512 2 : func (m *FileMetadata) StatsValid() bool {
513 2 : return m.statsValid.Load()
514 2 : }
515 :
516 : // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
517 : // while populating TableStats and calling StatsMarkValud. Once stats are
518 : // populated, they must not be mutated.
519 2 : func (m *FileMetadata) StatsMarkValid() {
520 2 : m.statsValid.Store(true)
521 2 : }
522 :
523 : // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
524 : // and overall table bounds with the given smallest and largest keys. The
525 : // smallest and largest bounds may not be extended if the table already has a
526 : // bound that is smaller or larger, respectively. The receiver is returned.
527 : // NB: calling this method should be preferred to manually setting the bounds by
528 : // manipulating the fields directly, to maintain certain invariants.
529 : func (m *FileMetadata) ExtendPointKeyBounds(
530 : cmp Compare, smallest, largest InternalKey,
531 2 : ) *FileMetadata {
532 2 : // Update the point key bounds.
533 2 : if !m.HasPointKeys {
534 2 : m.SmallestPointKey, m.LargestPointKey = smallest, largest
535 2 : m.HasPointKeys = true
536 2 : } else {
537 2 : if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
538 2 : m.SmallestPointKey = smallest
539 2 : }
540 2 : if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
541 2 : m.LargestPointKey = largest
542 2 : }
543 : }
544 : // Update the overall bounds.
545 2 : m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
546 2 : return m
547 : }
548 :
549 : // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
550 : // and overall table bounds with the given smallest and largest keys. The
551 : // smallest and largest bounds may not be extended if the table already has a
552 : // bound that is smaller or larger, respectively. The receiver is returned.
553 : // NB: calling this method should be preferred to manually setting the bounds by
554 : // manipulating the fields directly, to maintain certain invariants.
555 : func (m *FileMetadata) ExtendRangeKeyBounds(
556 : cmp Compare, smallest, largest InternalKey,
557 2 : ) *FileMetadata {
558 2 : // Update the range key bounds.
559 2 : if !m.HasRangeKeys {
560 2 : m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
561 2 : m.HasRangeKeys = true
562 2 : } else {
563 1 : if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
564 0 : m.SmallestRangeKey = smallest
565 0 : }
566 1 : if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
567 1 : m.LargestRangeKey = largest
568 1 : }
569 : }
570 : // Update the overall bounds.
571 2 : m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
572 2 : return m
573 : }
574 :
575 : // extendOverallBounds attempts to extend the overall table lower and upper
576 : // bounds. The given bounds may not be used if a lower or upper bound already
577 : // exists that is smaller or larger than the given keys, respectively. The given
578 : // boundType will be used if the bounds are updated.
579 : func (m *FileMetadata) extendOverallBounds(
580 : cmp Compare, smallest, largest InternalKey, bTyp boundType,
581 2 : ) {
582 2 : if !m.boundsSet {
583 2 : m.Smallest, m.Largest = smallest, largest
584 2 : m.boundsSet = true
585 2 : m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
586 2 : } else {
587 2 : if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
588 2 : m.Smallest = smallest
589 2 : m.boundTypeSmallest = bTyp
590 2 : }
591 2 : if base.InternalCompare(cmp, largest, m.Largest) > 0 {
592 2 : m.Largest = largest
593 2 : m.boundTypeLargest = bTyp
594 2 : }
595 : }
596 : }
597 :
598 : // Overlaps returns true if the file key range overlaps with the given range.
599 2 : func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
600 2 : if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
601 2 : // f is completely before the specified range; no overlap.
602 2 : return false
603 2 : }
604 2 : if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) {
605 2 : // f is completely after the specified range; no overlap.
606 2 : return false
607 2 : }
608 2 : return true
609 : }
610 :
611 : // ContainedWithinSpan returns true if the file key range completely overlaps with the
612 : // given range ("end" is assumed to exclusive).
613 1 : func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
614 1 : lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
615 1 : return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
616 1 : }
617 :
618 : // ContainsKeyType returns whether or not the file contains keys of the provided
619 : // type.
620 2 : func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
621 2 : switch kt {
622 2 : case KeyTypePointAndRange:
623 2 : return true
624 2 : case KeyTypePoint:
625 2 : return m.HasPointKeys
626 2 : case KeyTypeRange:
627 2 : return m.HasRangeKeys
628 0 : default:
629 0 : panic("unrecognized key type")
630 : }
631 : }
632 :
633 : // SmallestBound returns the file's smallest bound of the key type. It returns a
634 : // false second return value if the file does not contain any keys of the key
635 : // type.
636 2 : func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
637 2 : switch kt {
638 0 : case KeyTypePointAndRange:
639 0 : return &m.Smallest, true
640 2 : case KeyTypePoint:
641 2 : return &m.SmallestPointKey, m.HasPointKeys
642 2 : case KeyTypeRange:
643 2 : return &m.SmallestRangeKey, m.HasRangeKeys
644 0 : default:
645 0 : panic("unrecognized key type")
646 : }
647 : }
648 :
649 : // LargestBound returns the file's largest bound of the key type. It returns a
650 : // false second return value if the file does not contain any keys of the key
651 : // type.
652 2 : func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
653 2 : switch kt {
654 0 : case KeyTypePointAndRange:
655 0 : return &m.Largest, true
656 2 : case KeyTypePoint:
657 2 : return &m.LargestPointKey, m.HasPointKeys
658 2 : case KeyTypeRange:
659 2 : return &m.LargestRangeKey, m.HasRangeKeys
660 0 : default:
661 0 : panic("unrecognized key type")
662 : }
663 : }
664 :
665 : const (
666 : maskContainsPointKeys = 1 << 0
667 : maskSmallest = 1 << 1
668 : maskLargest = 1 << 2
669 : )
670 :
671 : // boundsMarker returns a marker byte whose bits encode the following
672 : // information (in order from least significant bit):
673 : // - if the table contains point keys
674 : // - if the table's smallest key is a point key
675 : // - if the table's largest key is a point key
676 2 : func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
677 2 : if m.HasPointKeys {
678 2 : sentinel |= maskContainsPointKeys
679 2 : }
680 2 : switch m.boundTypeSmallest {
681 2 : case boundTypePointKey:
682 2 : sentinel |= maskSmallest
683 2 : case boundTypeRangeKey:
684 : // No op - leave bit unset.
685 0 : default:
686 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
687 : }
688 2 : switch m.boundTypeLargest {
689 2 : case boundTypePointKey:
690 2 : sentinel |= maskLargest
691 2 : case boundTypeRangeKey:
692 : // No op - leave bit unset.
693 0 : default:
694 0 : return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
695 : }
696 2 : return
697 : }
698 :
699 : // String implements fmt.Stringer, printing the file number and the overall
700 : // table bounds.
701 2 : func (m *FileMetadata) String() string {
702 2 : return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
703 2 : }
704 :
705 : // DebugString returns a verbose representation of FileMetadata, typically for
706 : // use in tests and debugging, returning the file number and the point, range
707 : // and overall bounds for the table.
708 2 : func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
709 2 : var b bytes.Buffer
710 2 : fmt.Fprintf(&b, "%s:[%s-%s]",
711 2 : m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
712 2 : if !verbose {
713 2 : return b.String()
714 2 : }
715 1 : fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
716 1 : if m.HasPointKeys {
717 1 : fmt.Fprintf(&b, " points:[%s-%s]",
718 1 : m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
719 1 : }
720 1 : if m.HasRangeKeys {
721 1 : fmt.Fprintf(&b, " ranges:[%s-%s]",
722 1 : m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
723 1 : }
724 1 : return b.String()
725 : }
726 :
727 : // ParseFileMetadataDebug parses a FileMetadata from its DebugString
728 : // representation.
729 1 : func ParseFileMetadataDebug(s string) (*FileMetadata, error) {
730 1 : // Split lines of the form:
731 1 : // 000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
732 1 : fields := strings.FieldsFunc(s, func(c rune) bool {
733 1 : switch c {
734 1 : case ':', '[', '-', ']':
735 1 : return true
736 1 : default:
737 1 : return unicode.IsSpace(c) // NB: also trim whitespace padding.
738 : }
739 : })
740 1 : if len(fields)%3 != 0 {
741 0 : return nil, errors.Newf("malformed input: %s", s)
742 0 : }
743 1 : m := &FileMetadata{}
744 1 : for len(fields) > 0 {
745 1 : prefix := fields[0]
746 1 : if prefix == "seqnums" {
747 1 : smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64)
748 1 : if err != nil {
749 0 : return m, errors.Newf("malformed input: %s: %s", s, err)
750 0 : }
751 1 : largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64)
752 1 : if err != nil {
753 0 : return m, errors.Newf("malformed input: %s: %s", s, err)
754 0 : }
755 1 : m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum
756 1 : fields = fields[3:]
757 1 : continue
758 : }
759 1 : smallest := base.ParsePrettyInternalKey(fields[1])
760 1 : largest := base.ParsePrettyInternalKey(fields[2])
761 1 : switch prefix {
762 1 : case "points":
763 1 : m.SmallestPointKey, m.LargestPointKey = smallest, largest
764 1 : m.HasPointKeys = true
765 1 : case "ranges":
766 1 : m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
767 1 : m.HasRangeKeys = true
768 1 : default:
769 1 : fileNum, err := strconv.ParseUint(prefix, 10, 64)
770 1 : if err != nil {
771 0 : return m, errors.Newf("malformed input: %s: %s", s, err)
772 0 : }
773 1 : m.FileNum = base.FileNum(fileNum)
774 1 : m.Smallest, m.Largest = smallest, largest
775 1 : m.boundsSet = true
776 : }
777 1 : fields = fields[3:]
778 : }
779 : // By default, when the parser sees just the overall bounds, we set the point
780 : // keys. This preserves backwards compatability with existing test cases that
781 : // specify only the overall bounds.
782 1 : if !m.HasPointKeys && !m.HasRangeKeys {
783 1 : m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
784 1 : m.HasPointKeys = true
785 1 : }
786 1 : m.InitPhysicalBacking()
787 1 : return m, nil
788 : }
789 :
790 : // Validate validates the metadata for consistency with itself, returning an
791 : // error if inconsistent.
792 2 : func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
793 2 : // Combined range and point key validation.
794 2 :
795 2 : if !m.HasPointKeys && !m.HasRangeKeys {
796 0 : return base.CorruptionErrorf("file %s has neither point nor range keys",
797 0 : errors.Safe(m.FileNum))
798 0 : }
799 2 : if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
800 1 : return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
801 1 : errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
802 1 : m.Largest.Pretty(formatKey))
803 1 : }
804 2 : if m.SmallestSeqNum > m.LargestSeqNum {
805 0 : return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
806 0 : errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
807 0 : }
808 :
809 : // Point key validation.
810 :
811 2 : if m.HasPointKeys {
812 2 : if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
813 0 : return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
814 0 : errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
815 0 : m.LargestPointKey.Pretty(formatKey))
816 0 : }
817 2 : if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
818 2 : base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
819 0 : return base.CorruptionErrorf(
820 0 : "file %s has inconsistent point key bounds relative to overall bounds: "+
821 0 : "overall = [%s-%s], point keys = [%s-%s]",
822 0 : errors.Safe(m.FileNum),
823 0 : m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
824 0 : m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
825 0 : )
826 0 : }
827 : }
828 :
829 : // Range key validation.
830 :
831 2 : if m.HasRangeKeys {
832 2 : if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
833 0 : return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
834 0 : errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
835 0 : m.LargestRangeKey.Pretty(formatKey))
836 0 : }
837 2 : if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
838 2 : base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
839 0 : return base.CorruptionErrorf(
840 0 : "file %s has inconsistent range key bounds relative to overall bounds: "+
841 0 : "overall = [%s-%s], range keys = [%s-%s]",
842 0 : errors.Safe(m.FileNum),
843 0 : m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
844 0 : m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
845 0 : )
846 0 : }
847 : }
848 :
849 : // Ensure that FileMetadata.Init was called.
850 2 : if m.FileBacking == nil {
851 0 : return base.CorruptionErrorf("file metadata FileBacking not set")
852 0 : }
853 :
854 2 : return nil
855 : }
856 :
857 : // TableInfo returns a subset of the FileMetadata state formatted as a
858 : // TableInfo.
859 2 : func (m *FileMetadata) TableInfo() TableInfo {
860 2 : return TableInfo{
861 2 : FileNum: m.FileNum,
862 2 : Size: m.Size,
863 2 : Smallest: m.Smallest,
864 2 : Largest: m.Largest,
865 2 : SmallestSeqNum: m.SmallestSeqNum,
866 2 : LargestSeqNum: m.LargestSeqNum,
867 2 : }
868 2 : }
869 :
870 2 : func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
871 2 : // NB: This is the same ordering that RocksDB uses for L0 files.
872 2 :
873 2 : // Sort first by largest sequence number.
874 2 : if v := stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum); v != 0 {
875 2 : return v
876 2 : }
877 : // Then by smallest sequence number.
878 2 : if v := stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum); v != 0 {
879 2 : return v
880 2 : }
881 : // Break ties by file number.
882 2 : return stdcmp.Compare(m.FileNum, b.FileNum)
883 : }
884 :
885 2 : func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
886 2 : return m.cmpSeqNum(b) < 0
887 2 : }
888 :
889 2 : func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
890 2 : return base.InternalCompare(cmp, m.Smallest, b.Smallest)
891 2 : }
892 :
893 : // KeyRange returns the minimum smallest and maximum largest internalKey for
894 : // all the FileMetadata in iters.
895 2 : func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
896 2 : first := true
897 2 : for _, iter := range iters {
898 2 : for meta := iter.First(); meta != nil; meta = iter.Next() {
899 2 : if first {
900 2 : first = false
901 2 : smallest, largest = meta.Smallest, meta.Largest
902 2 : continue
903 : }
904 2 : if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
905 2 : smallest = meta.Smallest
906 2 : }
907 2 : if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
908 2 : largest = meta.Largest
909 2 : }
910 : }
911 : }
912 2 : return smallest, largest
913 : }
914 :
915 : type bySeqNum []*FileMetadata
916 :
917 2 : func (b bySeqNum) Len() int { return len(b) }
918 2 : func (b bySeqNum) Less(i, j int) bool {
919 2 : return b[i].lessSeqNum(b[j])
920 2 : }
921 2 : func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
922 :
923 : // SortBySeqNum sorts the specified files by increasing sequence number.
924 2 : func SortBySeqNum(files []*FileMetadata) {
925 2 : sort.Sort(bySeqNum(files))
926 2 : }
927 :
928 : type bySmallest struct {
929 : files []*FileMetadata
930 : cmp Compare
931 : }
932 :
933 1 : func (b bySmallest) Len() int { return len(b.files) }
934 1 : func (b bySmallest) Less(i, j int) bool {
935 1 : return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
936 1 : }
937 0 : func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }
938 :
939 : // SortBySmallest sorts the specified files by smallest key using the supplied
940 : // comparison function to order user keys.
941 1 : func SortBySmallest(files []*FileMetadata, cmp Compare) {
942 1 : sort.Sort(bySmallest{files, cmp})
943 1 : }
944 :
945 2 : func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
946 2 : startIter := iter.Clone()
947 2 : {
948 2 : startIterFile := startIter.SeekGE(cmp, start)
949 2 : // SeekGE compares user keys. The user key `start` may be equal to the
950 2 : // f.Largest because f.Largest is a range deletion sentinel, indicating
951 2 : // that the user key `start` is NOT contained within the file f. If
952 2 : // that's the case, we can narrow the overlapping bounds to exclude the
953 2 : // file with the sentinel.
954 2 : if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
955 2 : cmp(startIterFile.Largest.UserKey, start) == 0 {
956 2 : startIterFile = startIter.Next()
957 2 : }
958 2 : _ = startIterFile // Ignore unused assignment.
959 : }
960 :
961 2 : endIter := iter.Clone()
962 2 : {
963 2 : endIterFile := endIter.SeekGE(cmp, end)
964 2 :
965 2 : if !exclusiveEnd {
966 2 : // endIter is now pointing at the *first* file with a largest key >= end.
967 2 : // If there are multiple files including the user key `end`, we want all
968 2 : // of them, so move forward.
969 2 : for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
970 2 : endIterFile = endIter.Next()
971 2 : }
972 : }
973 :
974 : // LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
975 : // or nexted too far because Largest.UserKey equaled `end`, go back.
976 : //
977 : // Consider !exclusiveEnd and end = 'f', with the following file bounds:
978 : //
979 : // [b,d] [e, f] [f, f] [g, h]
980 : //
981 : // the above for loop will Next until it arrives at [g, h]. We need to
982 : // observe that g > f, and Prev to the file with bounds [f, f].
983 2 : if endIterFile == nil {
984 2 : endIterFile = endIter.Prev()
985 2 : } else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
986 2 : endIterFile = endIter.Prev()
987 2 : }
988 2 : _ = endIterFile // Ignore unused assignment.
989 : }
990 2 : return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
991 : }
992 :
993 : // NumLevels is the number of levels a Version contains.
994 : const NumLevels = 7
995 :
996 : // NewVersion constructs a new Version with the provided files. It requires
997 : // the provided files are already well-ordered. It's intended for testing.
998 : func NewVersion(
999 : cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
1000 1 : ) *Version {
1001 1 : var v Version
1002 1 : for l := range files {
1003 1 : // NB: We specifically insert `files` into the B-Tree in the order
1004 1 : // they appear within `files`. Some tests depend on this behavior in
1005 1 : // order to test consistency checking, etc. Once we've constructed the
1006 1 : // initial B-Tree, we swap out the btreeCmp for the correct one.
1007 1 : // TODO(jackson): Adjust or remove the tests and remove this.
1008 1 : v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
1009 1 : v.Levels[l].level = l
1010 1 : if l == 0 {
1011 1 : v.Levels[l].tree.cmp = btreeCmpSeqNum
1012 1 : } else {
1013 1 : v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
1014 1 : }
1015 1 : for _, f := range files[l] {
1016 1 : v.Levels[l].totalSize += f.Size
1017 1 : }
1018 : }
1019 1 : if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
1020 0 : panic(err)
1021 : }
1022 1 : return &v
1023 : }
1024 :
1025 : // Version is a collection of file metadata for on-disk tables at various
1026 : // levels. In-memory DBs are written to level-0 tables, and compactions
1027 : // migrate data from level N to level N+1. The tables map internal keys (which
1028 : // are a user key, a delete or set bit, and a sequence number) to user values.
1029 : //
1030 : // The tables at level 0 are sorted by largest sequence number. Due to file
1031 : // ingestion, there may be overlap in the ranges of sequence numbers contain in
1032 : // level 0 sstables. In particular, it is valid for one level 0 sstable to have
1033 : // the seqnum range [1,100] while an adjacent sstable has the seqnum range
1034 : // [50,50]. This occurs when the [50,50] table was ingested and given a global
1035 : // seqnum. The ingestion code will have ensured that the [50,50] sstable will
1036 : // not have any keys that overlap with the [1,100] in the seqnum range
1037 : // [1,49]. The range of internal keys [fileMetadata.smallest,
1038 : // fileMetadata.largest] in each level 0 table may overlap.
1039 : //
1040 : // The tables at any non-0 level are sorted by their internal key range and any
1041 : // two tables at the same non-0 level do not overlap.
1042 : //
1043 : // The internal key ranges of two tables at different levels X and Y may
1044 : // overlap, for any X != Y.
1045 : //
1046 : // Finally, for every internal key in a table at level X, there is no internal
1047 : // key in a higher level table that has both the same user key and a higher
1048 : // sequence number.
1049 : type Version struct {
1050 : refs atomic.Int32
1051 :
1052 : // The level 0 sstables are organized in a series of sublevels. Similar to
1053 : // the seqnum invariant in normal levels, there is no internal key in a
1054 : // higher level table that has both the same user key and a higher sequence
1055 : // number. Within a sublevel, tables are sorted by their internal key range
1056 : // and any two tables at the same sublevel do not overlap. Unlike the normal
1057 : // levels, sublevel n contains older tables (lower sequence numbers) than
1058 : // sublevel n+1.
1059 : //
1060 : // The L0Sublevels struct is mostly used for compaction picking. As most
1061 : // internal data structures in it are only necessary for compaction picking
1062 : // and not for iterator creation, the reference to L0Sublevels is nil'd
1063 : // after this version becomes the non-newest version, to reduce memory
1064 : // usage.
1065 : //
1066 : // L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
1067 : // in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
1068 : // a reference to that slice, as that slice is necessary for iterator
1069 : // creation and needs to outlast L0Sublevels.
1070 : L0Sublevels *L0Sublevels
1071 : L0SublevelFiles []LevelSlice
1072 :
1073 : Levels [NumLevels]LevelMetadata
1074 :
1075 : // RangeKeyLevels holds a subset of the same files as Levels that contain range
1076 : // keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
1077 : // duplication should be minimal, as range keys are expected to be rare.
1078 : RangeKeyLevels [NumLevels]LevelMetadata
1079 :
1080 : // The callback to invoke when the last reference to a version is
1081 : // removed. Will be called with list.mu held.
1082 : Deleted func(obsolete []*FileBacking)
1083 :
1084 : // Stats holds aggregated stats about the version maintained from
1085 : // version to version.
1086 : Stats struct {
1087 : // MarkedForCompaction records the count of files marked for
1088 : // compaction within the version.
1089 : MarkedForCompaction int
1090 : }
1091 :
1092 : // The list the version is linked into.
1093 : list *VersionList
1094 :
1095 : // The next/prev link for the versionList doubly-linked list of versions.
1096 : prev, next *Version
1097 : }
1098 :
1099 : // String implements fmt.Stringer, printing the FileMetadata for each level in
1100 : // the Version.
1101 1 : func (v *Version) String() string {
1102 1 : return v.string(base.DefaultFormatter, false)
1103 1 : }
1104 :
1105 : // DebugString returns an alternative format to String() which includes sequence
1106 : // number and kind information for the sstable boundaries.
1107 1 : func (v *Version) DebugString(format base.FormatKey) string {
1108 1 : return v.string(format, true)
1109 1 : }
1110 :
1111 2 : func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
1112 2 : var buf bytes.Buffer
1113 2 : for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
1114 2 : fmt.Fprintf(&buf, "0.%d:\n", sublevel)
1115 2 : sublevels[sublevel].Each(func(f *FileMetadata) {
1116 2 : fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose))
1117 2 : })
1118 : }
1119 2 : return buf.String()
1120 : }
1121 :
1122 1 : func (v *Version) string(format base.FormatKey, verbose bool) string {
1123 1 : var buf bytes.Buffer
1124 1 : if len(v.L0SublevelFiles) > 0 {
1125 1 : fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles))
1126 1 : }
1127 1 : for level := 1; level < NumLevels; level++ {
1128 1 : if v.Levels[level].Empty() {
1129 1 : continue
1130 : }
1131 1 : fmt.Fprintf(&buf, "%d:\n", level)
1132 1 : iter := v.Levels[level].Iter()
1133 1 : for f := iter.First(); f != nil; f = iter.Next() {
1134 1 : fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose))
1135 1 : }
1136 : }
1137 1 : return buf.String()
1138 : }
1139 :
1140 : // ParseVersionDebug parses a Version from its DebugString output.
1141 : func ParseVersionDebug(
1142 : cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string,
1143 1 : ) (*Version, error) {
1144 1 : var level int
1145 1 : var files [NumLevels][]*FileMetadata
1146 1 : for _, l := range strings.Split(s, "\n") {
1147 1 : l = strings.TrimSpace(l)
1148 1 :
1149 1 : switch l[:2] {
1150 1 : case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:":
1151 1 : var err error
1152 1 : level, err = strconv.Atoi(l[:1])
1153 1 : if err != nil {
1154 0 : return nil, err
1155 0 : }
1156 1 : default:
1157 1 : m, err := ParseFileMetadataDebug(l)
1158 1 : if err != nil {
1159 0 : return nil, err
1160 0 : }
1161 : // If we only parsed overall bounds, default to setting the point bounds.
1162 1 : if !m.HasPointKeys && !m.HasRangeKeys {
1163 0 : m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
1164 0 : m.HasPointKeys = true
1165 0 : }
1166 1 : files[level] = append(files[level], m)
1167 : }
1168 : }
1169 : // Reverse the order of L0 files. This ensures we construct the same
1170 : // sublevels. (They're printed from higher sublevel to lower, which means in
1171 : // a partial order that represents newest to oldest).
1172 1 : for i := 0; i < len(files[0])/2; i++ {
1173 1 : files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i]
1174 1 : }
1175 1 : return NewVersion(cmp, formatKey, flushSplitBytes, files), nil
1176 : }
1177 :
1178 : // Refs returns the number of references to the version.
1179 2 : func (v *Version) Refs() int32 {
1180 2 : return v.refs.Load()
1181 2 : }
1182 :
1183 : // Ref increments the version refcount.
1184 2 : func (v *Version) Ref() {
1185 2 : v.refs.Add(1)
1186 2 : }
1187 :
1188 : // Unref decrements the version refcount. If the last reference to the version
1189 : // was removed, the version is removed from the list of versions and the
1190 : // Deleted callback is invoked. Requires that the VersionList mutex is NOT
1191 : // locked.
1192 2 : func (v *Version) Unref() {
1193 2 : if v.refs.Add(-1) == 0 {
1194 2 : l := v.list
1195 2 : l.mu.Lock()
1196 2 : l.Remove(v)
1197 2 : v.Deleted(v.unrefFiles())
1198 2 : l.mu.Unlock()
1199 2 : }
1200 : }
1201 :
1202 : // UnrefLocked decrements the version refcount. If the last reference to the
1203 : // version was removed, the version is removed from the list of versions and
1204 : // the Deleted callback is invoked. Requires that the VersionList mutex is
1205 : // already locked.
1206 2 : func (v *Version) UnrefLocked() {
1207 2 : if v.refs.Add(-1) == 0 {
1208 2 : v.list.Remove(v)
1209 2 : v.Deleted(v.unrefFiles())
1210 2 : }
1211 : }
1212 :
1213 2 : func (v *Version) unrefFiles() []*FileBacking {
1214 2 : var obsolete []*FileBacking
1215 2 : for _, lm := range v.Levels {
1216 2 : obsolete = append(obsolete, lm.release()...)
1217 2 : }
1218 2 : for _, lm := range v.RangeKeyLevels {
1219 2 : obsolete = append(obsolete, lm.release()...)
1220 2 : }
1221 2 : return obsolete
1222 : }
1223 :
1224 : // Next returns the next version in the list of versions.
1225 1 : func (v *Version) Next() *Version {
1226 1 : return v.next
1227 1 : }
1228 :
1229 : // InitL0Sublevels initializes the L0Sublevels
1230 : func (v *Version) InitL0Sublevels(
1231 : cmp Compare, formatKey base.FormatKey, flushSplitBytes int64,
1232 2 : ) error {
1233 2 : var err error
1234 2 : v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
1235 2 : if err == nil && v.L0Sublevels != nil {
1236 2 : v.L0SublevelFiles = v.L0Sublevels.Levels
1237 2 : }
1238 2 : return err
1239 : }
1240 :
1241 : // Contains returns a boolean indicating whether the provided file exists in
1242 : // the version at the given level. If level is non-zero then Contains binary
1243 : // searches among the files. If level is zero, Contains scans the entire
1244 : // level.
1245 2 : func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
1246 2 : iter := v.Levels[level].Iter()
1247 2 : if level > 0 {
1248 2 : overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
1249 2 : m.Largest.IsExclusiveSentinel())
1250 2 : iter = overlaps.Iter()
1251 2 : }
1252 2 : for f := iter.First(); f != nil; f = iter.Next() {
1253 2 : if f == m {
1254 2 : return true
1255 2 : }
1256 : }
1257 2 : return false
1258 : }
1259 :
1260 : // Overlaps returns all elements of v.files[level] whose user key range
1261 : // intersects the given range. If level is non-zero then the user key ranges of
1262 : // v.files[level] are assumed to not overlap (although they may touch). If level
1263 : // is zero then that assumption cannot be made, and the [start, end] range is
1264 : // expanded to the union of those matching ranges so far and the computation is
1265 : // repeated until [start, end] stabilizes.
1266 : // The returned files are a subsequence of the input files, i.e., the ordering
1267 : // is not changed.
1268 : func (v *Version) Overlaps(
1269 : level int, cmp Compare, start, end []byte, exclusiveEnd bool,
1270 2 : ) LevelSlice {
1271 2 : if level == 0 {
1272 2 : // Indices that have been selected as overlapping.
1273 2 : l0 := v.Levels[level]
1274 2 : l0Iter := l0.Iter()
1275 2 : selectedIndices := make([]bool, l0.Len())
1276 2 : numSelected := 0
1277 2 : var slice LevelSlice
1278 2 : for {
1279 2 : restart := false
1280 2 : for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
1281 2 : selected := selectedIndices[i]
1282 2 : if selected {
1283 2 : continue
1284 : }
1285 2 : if !meta.Overlaps(cmp, start, end, exclusiveEnd) {
1286 2 : // meta is completely outside the specified range; skip it.
1287 2 : continue
1288 : }
1289 : // Overlaps.
1290 2 : selectedIndices[i] = true
1291 2 : numSelected++
1292 2 :
1293 2 : smallest := meta.Smallest.UserKey
1294 2 : largest := meta.Largest.UserKey
1295 2 : // Since level == 0, check if the newly added fileMetadata has
1296 2 : // expanded the range. We expand the range immediately for files
1297 2 : // we have remaining to check in this loop. All already checked
1298 2 : // and unselected files will need to be rechecked via the
1299 2 : // restart below.
1300 2 : if cmp(smallest, start) < 0 {
1301 2 : start = smallest
1302 2 : restart = true
1303 2 : }
1304 2 : if v := cmp(largest, end); v > 0 {
1305 2 : end = largest
1306 2 : exclusiveEnd = meta.Largest.IsExclusiveSentinel()
1307 2 : restart = true
1308 2 : } else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
1309 2 : // Only update the exclusivity of our existing `end`
1310 2 : // bound.
1311 2 : exclusiveEnd = false
1312 2 : restart = true
1313 2 : }
1314 : }
1315 :
1316 2 : if !restart {
1317 2 : // Construct a B-Tree containing only the matching items.
1318 2 : var tr btree
1319 2 : tr.cmp = v.Levels[level].tree.cmp
1320 2 : for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
1321 2 : if selectedIndices[i] {
1322 2 : err := tr.Insert(meta)
1323 2 : if err != nil {
1324 0 : panic(err)
1325 : }
1326 : }
1327 : }
1328 2 : slice = newLevelSlice(tr.Iter())
1329 2 : // TODO(jackson): Avoid the oddity of constructing and
1330 2 : // immediately releasing a B-Tree. Make LevelSlice an
1331 2 : // interface?
1332 2 : tr.Release()
1333 2 : break
1334 : }
1335 : // Continue looping to retry the files that were not selected.
1336 : }
1337 2 : return slice
1338 : }
1339 :
1340 2 : return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
1341 : }
1342 :
1343 : // CheckOrdering checks that the files are consistent with respect to
1344 : // increasing file numbers (for level 0 files) and increasing and non-
1345 : // overlapping internal key ranges (for level non-0 files).
1346 1 : func (v *Version) CheckOrdering(cmp Compare, format base.FormatKey) error {
1347 1 : for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
1348 1 : sublevelIter := v.L0SublevelFiles[sublevel].Iter()
1349 1 : if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter); err != nil {
1350 0 : return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
1351 0 : }
1352 : }
1353 :
1354 1 : for level, lm := range v.Levels {
1355 1 : if err := CheckOrdering(cmp, format, Level(level), lm.Iter()); err != nil {
1356 1 : return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
1357 1 : }
1358 : }
1359 1 : return nil
1360 : }
1361 :
1362 : // VersionList holds a list of versions. The versions are ordered from oldest
1363 : // to newest.
1364 : type VersionList struct {
1365 : mu *sync.Mutex
1366 : root Version
1367 : }
1368 :
1369 : // Init initializes the version list.
1370 2 : func (l *VersionList) Init(mu *sync.Mutex) {
1371 2 : l.mu = mu
1372 2 : l.root.next = &l.root
1373 2 : l.root.prev = &l.root
1374 2 : }
1375 :
1376 : // Empty returns true if the list is empty, and false otherwise.
1377 2 : func (l *VersionList) Empty() bool {
1378 2 : return l.root.next == &l.root
1379 2 : }
1380 :
1381 : // Front returns the oldest version in the list. Note that this version is only
1382 : // valid if Empty() returns true.
1383 2 : func (l *VersionList) Front() *Version {
1384 2 : return l.root.next
1385 2 : }
1386 :
1387 : // Back returns the newest version in the list. Note that this version is only
1388 : // valid if Empty() returns true.
1389 2 : func (l *VersionList) Back() *Version {
1390 2 : return l.root.prev
1391 2 : }
1392 :
1393 : // PushBack adds a new version to the back of the list. This new version
1394 : // becomes the "newest" version in the list.
1395 2 : func (l *VersionList) PushBack(v *Version) {
1396 2 : if v.list != nil || v.prev != nil || v.next != nil {
1397 0 : panic("pebble: version list is inconsistent")
1398 : }
1399 2 : v.prev = l.root.prev
1400 2 : v.prev.next = v
1401 2 : v.next = &l.root
1402 2 : v.next.prev = v
1403 2 : v.list = l
1404 2 : // Let L0Sublevels on the second newest version get GC'd, as it is no longer
1405 2 : // necessary. See the comment in Version.
1406 2 : v.prev.L0Sublevels = nil
1407 : }
1408 :
1409 : // Remove removes the specified version from the list.
1410 2 : func (l *VersionList) Remove(v *Version) {
1411 2 : if v == &l.root {
1412 0 : panic("pebble: cannot remove version list root node")
1413 : }
1414 2 : if v.list != l {
1415 0 : panic("pebble: version list is inconsistent")
1416 : }
1417 2 : v.prev.next = v.next
1418 2 : v.next.prev = v.prev
1419 2 : v.next = nil // avoid memory leaks
1420 2 : v.prev = nil // avoid memory leaks
1421 2 : v.list = nil // avoid memory leaks
1422 : }
1423 :
1424 : // CheckOrdering checks that the files are consistent with respect to
1425 : // seqnums (for level 0 files -- see detailed comment below) and increasing and non-
1426 : // overlapping internal key ranges (for non-level 0 files).
1427 2 : func CheckOrdering(cmp Compare, format base.FormatKey, level Level, files LevelIterator) error {
1428 2 : // The invariants to check for L0 sublevels are the same as the ones to
1429 2 : // check for all other levels. However, if L0 is not organized into
1430 2 : // sublevels, or if all L0 files are being passed in, we do the legacy L0
1431 2 : // checks, defined in the detailed comment below.
1432 2 : if level == Level(0) {
1433 2 : // We have 2 kinds of files:
1434 2 : // - Files with exactly one sequence number: these could be either ingested files
1435 2 : // or flushed files. We cannot tell the difference between them based on FileMetadata,
1436 2 : // so our consistency checking here uses the weaker checks assuming it is a narrow
1437 2 : // flushed file. We cannot error on ingested files having sequence numbers coincident
1438 2 : // with flushed files as the seemingly ingested file could just be a flushed file
1439 2 : // with just one key in it which is a truncated range tombstone sharing sequence numbers
1440 2 : // with other files in the same flush.
1441 2 : // - Files with multiple sequence numbers: these are necessarily flushed files.
1442 2 : //
1443 2 : // Three cases of overlapping sequence numbers:
1444 2 : // Case 1:
1445 2 : // An ingested file contained in the sequence numbers of the flushed file -- it must be
1446 2 : // fully contained (not coincident with either end of the flushed file) since the memtable
1447 2 : // must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
1448 2 : // num b, and the memtable got a subsequent update that was given sequence num b+1, before
1449 2 : // being flushed.
1450 2 : //
1451 2 : // So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
1452 2 : // third file are inconsistent with each other. So comparing adjacent files is insufficient
1453 2 : // for consistency checking.
1454 2 : //
1455 2 : // Visually we have something like
1456 2 : // x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
1457 2 : // y y y y (y's represent ingested files)
1458 2 : // And these are ordered in increasing order of y. Note that y's must be unique.
1459 2 : //
1460 2 : // Case 2:
1461 2 : // A flushed file that did not overlap in keys with any file in any level, but does overlap
1462 2 : // in the file key intervals. This file is placed in L0 since it overlaps in the file
1463 2 : // key intervals but since it has no overlapping data, it is assigned a sequence number
1464 2 : // of 0 in RocksDB. We handle this case for compatibility with RocksDB.
1465 2 : //
1466 2 : // Case 3:
1467 2 : // A sequence of flushed files that overlap in sequence numbers with one another,
1468 2 : // but do not overlap in keys inside the sstables. These files correspond to
1469 2 : // partitioned flushes or the results of intra-L0 compactions of partitioned
1470 2 : // flushes.
1471 2 : //
1472 2 : // Since these types of SSTables violate most other sequence number
1473 2 : // overlap invariants, and handling this case is important for compatibility
1474 2 : // with future versions of pebble, this method relaxes most L0 invariant
1475 2 : // checks.
1476 2 :
1477 2 : var prev *FileMetadata
1478 2 : for f := files.First(); f != nil; f, prev = files.Next(), f {
1479 2 : if prev == nil {
1480 2 : continue
1481 : }
1482 : // Validate that the sorting is sane.
1483 2 : if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
1484 1 : // Multiple files satisfying case 2 mentioned above.
1485 2 : } else if !prev.lessSeqNum(f) {
1486 1 : return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
1487 1 : errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
1488 1 : errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
1489 1 : errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
1490 1 : }
1491 : }
1492 2 : } else {
1493 2 : var prev *FileMetadata
1494 2 : for f := files.First(); f != nil; f, prev = files.Next(), f {
1495 2 : if err := f.Validate(cmp, format); err != nil {
1496 1 : return errors.Wrapf(err, "%s ", level)
1497 1 : }
1498 2 : if prev != nil {
1499 2 : if prev.cmpSmallestKey(f, cmp) >= 0 {
1500 1 : return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
1501 1 : errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
1502 1 : prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
1503 1 : f.Smallest.Pretty(format), f.Largest.Pretty(format))
1504 1 : }
1505 2 : if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
1506 1 : return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
1507 1 : errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
1508 1 : prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
1509 1 : f.Smallest.Pretty(format), f.Largest.Pretty(format))
1510 1 : }
1511 : }
1512 : }
1513 : }
1514 2 : return nil
1515 : }
|