LCOV - code coverage report
Current view: top level - pebble/sstable - format.go (source / functions) Hit Total Coverage
Test: 2025-01-11 08:16Z ce9b648a - tests + meta.lcov Lines: 80 86 93.0 %
Date: 2025-01-11 08:17:30 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package sstable
       6             : 
       7             : import (
       8             :         "github.com/cockroachdb/errors"
       9             :         "github.com/cockroachdb/pebble/internal/base"
      10             :         "github.com/cockroachdb/pebble/sstable/block"
      11             :         "github.com/cockroachdb/pebble/sstable/colblk"
      12             :         "github.com/cockroachdb/pebble/sstable/rowblk"
      13             : )
      14             : 
      15             : // TableFormat specifies the format version for sstables. The legacy LevelDB
      16             : // format is format version 1.
      17             : type TableFormat uint32
      18             : 
      19             : // The available table formats, representing the tuple (magic number, version
      20             : // number). Note that these values are not (and should not) be serialized to
      21             : // disk. The ordering should follow the order the versions were introduced to
      22             : // Pebble (i.e. the history is linear).
      23             : const (
      24             :         TableFormatUnspecified TableFormat = iota
      25             :         TableFormatLevelDB
      26             :         TableFormatRocksDBv2
      27             :         TableFormatPebblev1 // Block properties.
      28             :         TableFormatPebblev2 // Range keys.
      29             :         TableFormatPebblev3 // Value blocks.
      30             :         TableFormatPebblev4 // DELSIZED tombstones.
      31             :         TableFormatPebblev5 // Columnar blocks.
      32             :         NumTableFormats
      33             : 
      34             :         TableFormatMax = NumTableFormats - 1
      35             : 
      36             :         // TableFormatMinSupported is the minimum format supported by Pebble.  This
      37             :         // package still supports older formats for uses outside of Pebble
      38             :         // (CockroachDB uses it to read data from backups that could be old).
      39             :         TableFormatMinSupported = TableFormatPebblev1
      40             : )
      41             : 
      42             : // TableFormatPebblev4, in addition to DELSIZED, introduces the use of
      43             : // InternalKeyKindSSTableInternalObsoleteBit.
      44             : //
      45             : // 1. Motivation
      46             : //
      47             : // We have various related problems caused by Pebble snapshots:
      48             : //
      49             : // - P1: RANGEDELs that delete points in the same sstable, but the points
      50             : //   happen to not get deleted during compactions because of an open snapshot.
      51             : //   This causes very expensive iteration, that has been observed in
      52             : //   production deployments
      53             : //
      54             : // - P2: When iterating over a foreign sstable (in disaggregated storage), we
      55             : //   need to do (a) point collapsing to expose at most one point per user key,
      56             : //   (b) apply RANGEDELs in the sstable to hide deleted points in the same
      57             : //   sstable. This per-sstable point collapsing iteration needs to be very
      58             : //   efficient (ideally as efficient from a CPU perspective as iteration over
      59             : //   regular sstables) since foreign sstables can be very long-lived -- one of
      60             : //   the goals of disaggregated storage is to scale compute and disk bandwidth
      61             : //   resources as a function of the hot (from a write perspective) data and
      62             : //   not the whole data, so we don't want to have to rewrite foreign sstables
      63             : //   solely to improve read performance.
      64             : //
      65             : // The ideal solution for P2 would allow user-facing reads to utilize the
      66             : // existing SST iterators (with slight modifications) and with no loss of
      67             : // efficiency. And for P1 and P2 we would like to skip whole blocks of
      68             : // overwritten/deleted points. Even when we can't skip whole blocks, avoiding
      69             : // key comparisons at iteration time to discover what points are deleted is
      70             : // very desirable, since keys can be long.
      71             : //
      72             : // We observe that:
      73             : //
      74             : // - Reads:
      75             : //   - All user-facing reads in CockroachDB use iterators over the DB, hence
      76             : //     have a higher read seqnum than all sstables (there are some rare cases
      77             : //     that can violate this, but those are not important from a performance
      78             : //     optimization perspective).
      79             : //
      80             : //   - Certain internal-facing reads in CockroachDB use snapshots, but the
      81             : //     snapshots are shortlived enough that most L5 and L6 sstables will have
      82             : //     all seqnums lower than the snapshot seqnum.
      83             : //
      84             : // - Writes:
      85             : //   - We already do key comparisons between points when writing the sstable
      86             : //     to ensure that the sstable invariant (monotonically increasing internal
      87             : //     keys) is not violated. So we know which points share the same userkey,
      88             : //     and thereby which points are obsolete because there is a more recent
      89             : //     point in the same sstable.
      90             : //
      91             : //   - The compactionIter knows which point id deleted by a RANGEDEL even if
      92             : //     the point does need to be written because of a snapshot.
      93             : //
      94             : //   So this known information can be encoded in the sstable at write time and
      95             : //   utilized for optimized reading.
      96             : //
      97             : // 2. Solution
      98             : //
      99             : // We primarily scope the solution to the following point kinds: SET,
     100             : // SETWITHDEL, DEL, DELSIZED, SINGLEDEL. These are the ones marked locally
     101             : // obsolete, i.e., obsolete within the sstable, and we can guarantee that at
     102             : // most one point will be exposed per user key. MERGE keys create more
     103             : // complexity: MERGE followed by MERGE causes multiple keys to not be
     104             : // obsolete. Same applies for MERGE followed by SET/SETWITHDEL/DEL*. Note
     105             : // that:
     106             : //
     107             : // - For regular sst iteration, the obsolete marking is a performance
     108             : //   optimization, and multiple keys for the same userkey can be handled by
     109             : //   higher layers in the iterator tree (specifically pebble.Iterator).
     110             : //
     111             : // - For foreign sst iteration, we disallow MERGEs to be written to such
     112             : //   shared ssts (details below).
     113             : //
     114             : // The key kinds are marked with an obsolete bit
     115             : // (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is
     116             : // obsolete. This marking is done within blockWriter, based on information
     117             : // passed to it by Writer. In turn, Writer uses a combination of key
     118             : // comparisons, and information provided by compactionIter to decide whether a
     119             : // key-value pair is obsolete. Additionally, a Pebble-internal
     120             : // BlockPropertyCollector (obsoleteKeyBlockPropertyCollector) is used to mark
     121             : // blocks where all key-value pairs are obsolete. Since the common case is
     122             : // non-obsolete blocks, this block property collector uses the empty byte
     123             : // slice to represent a non-obsolete block, which consumes no space in
     124             : // BlockHandleWithProperties.Props.
     125             : //
     126             : // At read time, the obsolete bit is only visible to the blockIter, which can
     127             : // be optionally configured to hide obsolete points. This hiding is only
     128             : // configured for data block iterators for sstables being read by user-facing
     129             : // iterators at a seqnum greater than the max seqnum in the sstable.
     130             : // Additionally, when this hiding is configured, a Pebble-internal block
     131             : // property filter (obsoleteKeyBlockPropertyFilter), is used to skip whole
     132             : // blocks that are obsolete.
     133             : //
     134             : // 2.1 Correctness
     135             : //
     136             : // Due to the level invariant, the sequence of seqnums for a user key in a
     137             : // sstable represents a contiguous subsequence of the seqnums for the userkey
     138             : // across the whole LSM, and is more recent than the seqnums in a sstable in a
     139             : // lower level. So exposing exactly one point from a sstable for a userkey
     140             : // will also mask the points for the userkey in lower levels. If we expose no
     141             : // point, because of RANGEDELs, that RANGEDEL will also mask the points in
     142             : // lower levels.
     143             : //
     144             : // Note that we do not need to do anything special at write time for
     145             : // SETWITHDEL and SINGLEDEL. This is because these key kinds are treated
     146             : // specially only by compactions, which typically do not hide obsolete points
     147             : // (see exception below). For regular reads, SETWITHDEL behaves the same as
     148             : // SET and SINGLEDEL behaves the same as DEL.
     149             : //
     150             : // 2.1.1 Compaction reads of a foreign sstable
     151             : //
     152             : // Compaction reads of a foreign sstable behave like regular reads in that
     153             : // only non-obsolete points are exposed. Consider a L5 foreign sstable with
     154             : // b.SINGLEDEL that is non-obsolete followed by obsolete b.DEL. And a L6
     155             : // foreign sstable with two b.SETs. The SINGLEDEL will be exposed, and not the
     156             : // DEL, but this is not a correctness issue since only one of the SETs in the
     157             : // L6 sstable will be exposed. However, this works only because we have
     158             : // limited the number of foreign sst levels to two, and is extremely fragile.
     159             : // For robust correctness, non-obsolete SINGLEDELs in foreign sstables should
     160             : // be exposed as DELs.
     161             : //
     162             : // Additionally, to avoid false positive accounting errors in DELSIZED, we
     163             : // should expose them as DEL.
     164             : //
     165             : // NB: as of writing this comment, we do not have end-to-end support for
     166             : // SINGLEDEL for disaggregated storage since pointCollapsingIterator (used by
     167             : // ScanInternal) does not support SINGLEDEL. So the disaggregated key spans
     168             : // are required to never have SINGLEDELs (which is fine for CockroachDB since
     169             : // only the MVCC key space uses disaggregated storage, and SINGLEDELs are only
     170             : // used for the non-MVCC locks and intents).
     171             : //
     172             : // 2.2 Strictness and MERGE
     173             : //
     174             : // Setting the obsolete bit on point keys is advanced usage, so we support two
     175             : // modes, both of which must be truthful when setting the obsolete bit, but
     176             : // vary in when they don't set the obsolete bit.
     177             : //
     178             : // - Non-strict: In this mode, the bit does not need to be set for keys that
     179             : //   are obsolete. Additionally, any sstable containing MERGE keys can only
     180             : //   use this mode. An iterator over such an sstable, when configured to
     181             : //   hideObsoletePoints, can expose multiple internal keys per user key, and
     182             : //   can expose keys that are deleted by rangedels in the same sstable. This
     183             : //   is the mode that non-advanced users should use. Pebble without
     184             : //   disaggregated storage will also use this mode and will best-effort set
     185             : //   the obsolete bit, to optimize iteration when snapshots have retained many
     186             : //   obsolete keys.
     187             : //
     188             : // - Strict: In this mode, every obsolete key must have the obsolete bit set,
     189             : //   and no MERGE keys are permitted. An iterator over such an sstable, when
     190             : //   configured to hideObsoletePoints satisfies two properties:
     191             : //   - S1: will expose at most one internal key per user key, which is the
     192             : //     most recent one.
     193             : //   - S2: will never expose keys that are deleted by rangedels in the same
     194             : //     sstable.
     195             : //
     196             : //   This is the mode for two use cases in disaggregated storage (which will
     197             : //   exclude parts of the key space that has MERGEs), for levels that contain
     198             : //   sstables that can become foreign sstables:
     199             : //   - Pebble compaction output to these levels that can become foreign
     200             : //     sstables.
     201             : //
     202             : //   - CockroachDB ingest operations that can ingest into the levels that can
     203             : //     become foreign sstables. Note, these are not sstables corresponding to
     204             : //     copied data for CockroachDB range snapshots. This case occurs for
     205             : //     operations like index backfills: these trivially satisfy the strictness
     206             : //     criteria since they only write one key per userkey.
     207             : //
     208             : //     TODO(sumeer): this latter case is not currently supported, since only
     209             : //     Writer.AddWithForceObsolete calls are permitted for writing strict
     210             : //     obsolete sstables. This is done to reduce the likelihood of bugs. One
     211             : //     simple way to lift this limitation would be to disallow adding any
     212             : //     RANGEDELs when a Pebble-external writer is trying to construct a strict
     213             : //     obsolete sstable.
     214             : 
     215             : // parseTableFormat parses the given magic bytes and version into its
     216             : // corresponding internal TableFormat.
     217           2 : func parseTableFormat(magic []byte, version uint32) (TableFormat, error) {
     218           2 :         switch string(magic) {
     219           1 :         case levelDBMagic:
     220           1 :                 return TableFormatLevelDB, nil
     221           1 :         case rocksDBMagic:
     222           1 :                 if version != rocksDBFormatVersion2 {
     223           1 :                         return TableFormatUnspecified, base.CorruptionErrorf(
     224           1 :                                 "(unsupported rocksdb format version %d)", errors.Safe(version))
     225           1 :                 }
     226           1 :                 return TableFormatRocksDBv2, nil
     227           2 :         case pebbleDBMagic:
     228           2 :                 switch version {
     229           1 :                 case 1:
     230           1 :                         return TableFormatPebblev1, nil
     231           2 :                 case 2:
     232           2 :                         return TableFormatPebblev2, nil
     233           2 :                 case 3:
     234           2 :                         return TableFormatPebblev3, nil
     235           2 :                 case 4:
     236           2 :                         return TableFormatPebblev4, nil
     237           2 :                 case 5:
     238           2 :                         return TableFormatPebblev5, nil
     239           1 :                 default:
     240           1 :                         return TableFormatUnspecified, base.CorruptionErrorf(
     241           1 :                                 "(unsupported pebble format version %d)", errors.Safe(version))
     242             :                 }
     243           1 :         default:
     244           1 :                 return TableFormatUnspecified, base.CorruptionErrorf(
     245           1 :                         "(bad magic number: 0x%x)", magic)
     246             :         }
     247             : }
     248             : 
     249             : // BlockColumnar returns true iff the table format uses the columnar format for
     250             : // data, index and keyspan blocks.
     251           2 : func (f TableFormat) BlockColumnar() bool {
     252           2 :         return f >= TableFormatPebblev5
     253           2 : }
     254             : 
     255           2 : func (f TableFormat) newIndexIter() block.IndexBlockIterator {
     256           2 :         if !f.BlockColumnar() {
     257           2 :                 return new(rowblk.IndexIter)
     258           2 :         }
     259           2 :         return new(colblk.IndexIter)
     260             : }
     261             : 
     262             : // AsTuple returns the TableFormat's (Magic String, Version) tuple.
     263           2 : func (f TableFormat) AsTuple() (string, uint32) {
     264           2 :         switch f {
     265           1 :         case TableFormatLevelDB:
     266           1 :                 return levelDBMagic, 0
     267           1 :         case TableFormatRocksDBv2:
     268           1 :                 return rocksDBMagic, 2
     269           1 :         case TableFormatPebblev1:
     270           1 :                 return pebbleDBMagic, 1
     271           2 :         case TableFormatPebblev2:
     272           2 :                 return pebbleDBMagic, 2
     273           2 :         case TableFormatPebblev3:
     274           2 :                 return pebbleDBMagic, 3
     275           2 :         case TableFormatPebblev4:
     276           2 :                 return pebbleDBMagic, 4
     277           2 :         case TableFormatPebblev5:
     278           2 :                 return pebbleDBMagic, 5
     279           0 :         default:
     280           0 :                 panic("sstable: unknown table format version tuple")
     281             :         }
     282             : }
     283             : 
     284             : // String returns the TableFormat (Magic String,Version) tuple.
     285           2 : func (f TableFormat) String() string {
     286           2 :         switch f {
     287           2 :         case TableFormatUnspecified:
     288           2 :                 return "unspecified"
     289           2 :         case TableFormatLevelDB:
     290           2 :                 return "(LevelDB)"
     291           2 :         case TableFormatRocksDBv2:
     292           2 :                 return "(RocksDB,v2)"
     293           2 :         case TableFormatPebblev1:
     294           2 :                 return "(Pebble,v1)"
     295           2 :         case TableFormatPebblev2:
     296           2 :                 return "(Pebble,v2)"
     297           2 :         case TableFormatPebblev3:
     298           2 :                 return "(Pebble,v3)"
     299           2 :         case TableFormatPebblev4:
     300           2 :                 return "(Pebble,v4)"
     301           2 :         case TableFormatPebblev5:
     302           2 :                 return "(Pebble,v5)"
     303           0 :         default:
     304           0 :                 panic("sstable: unknown table format version tuple")
     305             :         }
     306             : }
     307             : 
     308           2 : var tableFormatStrings = func() map[string]TableFormat {
     309           2 :         strs := make(map[string]TableFormat, NumTableFormats)
     310           2 :         for f := TableFormatUnspecified; f < NumTableFormats; f++ {
     311           2 :                 strs[f.String()] = f
     312           2 :         }
     313           2 :         return strs
     314             : }()
     315             : 
     316             : // ParseTableFormatString parses a TableFormat from its human-readable string
     317             : // representation.
     318           1 : func ParseTableFormatString(s string) (TableFormat, error) {
     319           1 :         f, ok := tableFormatStrings[s]
     320           1 :         if !ok {
     321           0 :                 return TableFormatUnspecified, errors.Errorf("unknown table format %q", s)
     322           0 :         }
     323           1 :         return f, nil
     324             : }

Generated by: LCOV version 1.14