LCOV - code coverage report
Current view: top level - pebble/sstable - format.go (source / functions) Hit Total Coverage
Test: 2024-06-20 08:16Z 71d59d67 - meta test only.lcov Lines: 18 61 29.5 %
Date: 2024-06-20 08:17:31 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package sstable
       6             : 
       7             : import (
       8             :         "github.com/cockroachdb/errors"
       9             :         "github.com/cockroachdb/pebble/internal/base"
      10             : )
      11             : 
      12             : // TableFormat specifies the format version for sstables. The legacy LevelDB
      13             : // format is format version 1.
      14             : type TableFormat uint32
      15             : 
      16             : // The available table formats, representing the tuple (magic number, version
      17             : // number). Note that these values are not (and should not) be serialized to
      18             : // disk. The ordering should follow the order the versions were introduced to
      19             : // Pebble (i.e. the history is linear).
      20             : const (
      21             :         TableFormatUnspecified TableFormat = iota
      22             :         TableFormatLevelDB
      23             :         TableFormatRocksDBv2
      24             :         TableFormatPebblev1 // Block properties.
      25             :         TableFormatPebblev2 // Range keys.
      26             :         TableFormatPebblev3 // Value blocks.
      27             :         TableFormatPebblev4 // DELSIZED tombstones.
      28             :         NumTableFormats
      29             : 
      30             :         TableFormatMax = NumTableFormats - 1
      31             : 
      32             :         // TableFormatMinSupported is the minimum format supported by Pebble.  This
      33             :         // package still supports older formats for uses outside of Pebble
      34             :         // (CockroachDB uses it to read data from backups that could be old).
      35             :         TableFormatMinSupported = TableFormatPebblev1
      36             : )
      37             : 
      38             : // TableFormatPebblev4, in addition to DELSIZED, introduces the use of
      39             : // InternalKeyKindSSTableInternalObsoleteBit.
      40             : //
      41             : // 1. Motivation
      42             : //
      43             : // We have various related problems caused by Pebble snapshots:
      44             : //
      45             : // - P1: RANGEDELs that delete points in the same sstable, but the points
      46             : //   happen to not get deleted during compactions because of an open snapshot.
      47             : //   This causes very expensive iteration, that has been observed in
      48             : //   production deployments
      49             : //
      50             : // - P2: When iterating over a foreign sstable (in disaggregated storage), we
      51             : //   need to do (a) point collapsing to expose at most one point per user key,
      52             : //   (b) apply RANGEDELs in the sstable to hide deleted points in the same
      53             : //   sstable. This per-sstable point collapsing iteration needs to be very
      54             : //   efficient (ideally as efficient from a CPU perspective as iteration over
      55             : //   regular sstables) since foreign sstables can be very long-lived -- one of
      56             : //   the goals of disaggregated storage is to scale compute and disk bandwidth
      57             : //   resources as a function of the hot (from a write perspective) data and
      58             : //   not the whole data, so we don't want to have to rewrite foreign sstables
      59             : //   solely to improve read performance.
      60             : //
      61             : // The ideal solution for P2 would allow user-facing reads to utilize the
      62             : // existing SST iterators (with slight modifications) and with no loss of
      63             : // efficiency. And for P1 and P2 we would like to skip whole blocks of
      64             : // overwritten/deleted points. Even when we can't skip whole blocks, avoiding
      65             : // key comparisons at iteration time to discover what points are deleted is
      66             : // very desirable, since keys can be long.
      67             : //
      68             : // We observe that:
      69             : //
      70             : // - Reads:
      71             : //   - All user-facing reads in CockroachDB use iterators over the DB, hence
      72             : //     have a higher read seqnum than all sstables (there are some rare cases
      73             : //     that can violate this, but those are not important from a performance
      74             : //     optimization perspective).
      75             : //
      76             : //   - Certain internal-facing reads in CockroachDB use snapshots, but the
      77             : //     snapshots are shortlived enough that most L5 and L6 sstables will have
      78             : //     all seqnums lower than the snapshot seqnum.
      79             : //
      80             : // - Writes:
      81             : //   - We already do key comparisons between points when writing the sstable
      82             : //     to ensure that the sstable invariant (monotonically increasing internal
      83             : //     keys) is not violated. So we know which points share the same userkey,
      84             : //     and thereby which points are obsolete because there is a more recent
      85             : //     point in the same sstable.
      86             : //
      87             : //   - The compactionIter knows which point id deleted by a RANGEDEL even if
      88             : //     the point does need to be written because of a snapshot.
      89             : //
      90             : //   So this known information can be encoded in the sstable at write time and
      91             : //   utilized for optimized reading.
      92             : //
      93             : // 2. Solution
      94             : //
      95             : // We primarily scope the solution to the following point kinds: SET,
      96             : // SETWITHDEL, DEL, DELSIZED, SINGLEDEL. These are the ones marked locally
      97             : // obsolete, i.e., obsolete within the sstable, and we can guarantee that at
      98             : // most one point will be exposed per user key. MERGE keys create more
      99             : // complexity: MERGE followed by MERGE causes multiple keys to not be
     100             : // obsolete. Same applies for MERGE followed by SET/SETWITHDEL/DEL*. Note
     101             : // that:
     102             : //
     103             : // - For regular sst iteration, the obsolete marking is a performance
     104             : //   optimization, and multiple keys for the same userkey can be handled by
     105             : //   higher layers in the iterator tree (specifically pebble.Iterator).
     106             : //
     107             : // - For foreign sst iteration, we disallow MERGEs to be written to such
     108             : //   shared ssts (details below).
     109             : //
     110             : // The key kinds are marked with an obsolete bit
     111             : // (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is
     112             : // obsolete. This marking is done within blockWriter, based on information
     113             : // passed to it by Writer. In turn, Writer uses a combination of key
     114             : // comparisons, and information provided by compactionIter to decide whether a
     115             : // key-value pair is obsolete. Additionally, a Pebble-internal
     116             : // BlockPropertyCollector (obsoleteKeyBlockPropertyCollector) is used to mark
     117             : // blocks where all key-value pairs are obsolete. Since the common case is
     118             : // non-obsolete blocks, this block property collector uses the empty byte
     119             : // slice to represent a non-obsolete block, which consumes no space in
     120             : // BlockHandleWithProperties.Props.
     121             : //
     122             : // At read time, the obsolete bit is only visible to the blockIter, which can
     123             : // be optionally configured to hide obsolete points. This hiding is only
     124             : // configured for data block iterators for sstables being read by user-facing
     125             : // iterators at a seqnum greater than the max seqnum in the sstable.
     126             : // Additionally, when this hiding is configured, a Pebble-internal block
     127             : // property filter (obsoleteKeyBlockPropertyFilter), is used to skip whole
     128             : // blocks that are obsolete.
     129             : //
     130             : // 2.1 Correctness
     131             : //
     132             : // Due to the level invariant, the sequence of seqnums for a user key in a
     133             : // sstable represents a contiguous subsequence of the seqnums for the userkey
     134             : // across the whole LSM, and is more recent than the seqnums in a sstable in a
     135             : // lower level. So exposing exactly one point from a sstable for a userkey
     136             : // will also mask the points for the userkey in lower levels. If we expose no
     137             : // point, because of RANGEDELs, that RANGEDEL will also mask the points in
     138             : // lower levels.
     139             : //
     140             : // Note that we do not need to do anything special at write time for
     141             : // SETWITHDEL and SINGLEDEL. This is because these key kinds are treated
     142             : // specially only by compactions, which typically do not hide obsolete points
     143             : // (see exception below). For regular reads, SETWITHDEL behaves the same as
     144             : // SET and SINGLEDEL behaves the same as DEL.
     145             : //
     146             : // 2.1.1 Compaction reads of a foreign sstable
     147             : //
     148             : // Compaction reads of a foreign sstable behave like regular reads in that
     149             : // only non-obsolete points are exposed. Consider a L5 foreign sstable with
     150             : // b.SINGLEDEL that is non-obsolete followed by obsolete b.DEL. And a L6
     151             : // foreign sstable with two b.SETs. The SINGLEDEL will be exposed, and not the
     152             : // DEL, but this is not a correctness issue since only one of the SETs in the
     153             : // L6 sstable will be exposed. However, this works only because we have
     154             : // limited the number of foreign sst levels to two, and is extremely fragile.
     155             : // For robust correctness, non-obsolete SINGLEDELs in foreign sstables should
     156             : // be exposed as DELs.
     157             : //
     158             : // Additionally, to avoid false positive accounting errors in DELSIZED, we
     159             : // should expose them as DEL.
     160             : //
     161             : // NB: as of writing this comment, we do not have end-to-end support for
     162             : // SINGLEDEL for disaggregated storage since pointCollapsingIterator (used by
     163             : // ScanInternal) does not support SINGLEDEL. So the disaggregated key spans
     164             : // are required to never have SINGLEDELs (which is fine for CockroachDB since
     165             : // only the MVCC key space uses disaggregated storage, and SINGLEDELs are only
     166             : // used for the non-MVCC locks and intents).
     167             : //
     168             : // 2.2 Strictness and MERGE
     169             : //
     170             : // Setting the obsolete bit on point keys is advanced usage, so we support two
     171             : // modes, both of which must be truthful when setting the obsolete bit, but
     172             : // vary in when they don't set the obsolete bit.
     173             : //
     174             : // - Non-strict: In this mode, the bit does not need to be set for keys that
     175             : //   are obsolete. Additionally, any sstable containing MERGE keys can only
     176             : //   use this mode. An iterator over such an sstable, when configured to
     177             : //   hideObsoletePoints, can expose multiple internal keys per user key, and
     178             : //   can expose keys that are deleted by rangedels in the same sstable. This
     179             : //   is the mode that non-advanced users should use. Pebble without
     180             : //   disaggregated storage will also use this mode and will best-effort set
     181             : //   the obsolete bit, to optimize iteration when snapshots have retained many
     182             : //   obsolete keys.
     183             : //
     184             : // - Strict: In this mode, every obsolete key must have the obsolete bit set,
     185             : //   and no MERGE keys are permitted. An iterator over such an sstable, when
     186             : //   configured to hideObsoletePoints satisfies two properties:
     187             : //   - S1: will expose at most one internal key per user key, which is the
     188             : //     most recent one.
     189             : //   - S2: will never expose keys that are deleted by rangedels in the same
     190             : //     sstable.
     191             : //
     192             : //   This is the mode for two use cases in disaggregated storage (which will
     193             : //   exclude parts of the key space that has MERGEs), for levels that contain
     194             : //   sstables that can become foreign sstables:
     195             : //   - Pebble compaction output to these levels that can become foreign
     196             : //     sstables.
     197             : //
     198             : //   - CockroachDB ingest operations that can ingest into the levels that can
     199             : //     become foreign sstables. Note, these are not sstables corresponding to
     200             : //     copied data for CockroachDB range snapshots. This case occurs for
     201             : //     operations like index backfills: these trivially satisfy the strictness
     202             : //     criteria since they only write one key per userkey.
     203             : //
     204             : //     TODO(sumeer): this latter case is not currently supported, since only
     205             : //     Writer.AddWithForceObsolete calls are permitted for writing strict
     206             : //     obsolete sstables. This is done to reduce the likelihood of bugs. One
     207             : //     simple way to lift this limitation would be to disallow adding any
     208             : //     RANGEDELs when a Pebble-external writer is trying to construct a strict
     209             : //     obsolete sstable.
     210             : 
     211             : // ParseTableFormat parses the given magic bytes and version into its
     212             : // corresponding internal TableFormat.
     213           1 : func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) {
     214           1 :         switch string(magic) {
     215           0 :         case levelDBMagic:
     216           0 :                 return TableFormatLevelDB, nil
     217           0 :         case rocksDBMagic:
     218           0 :                 if version != rocksDBFormatVersion2 {
     219           0 :                         return TableFormatUnspecified, base.CorruptionErrorf(
     220           0 :                                 "pebble/table: unsupported rocksdb format version %d", errors.Safe(version),
     221           0 :                         )
     222           0 :                 }
     223           0 :                 return TableFormatRocksDBv2, nil
     224           1 :         case pebbleDBMagic:
     225           1 :                 switch version {
     226           0 :                 case 1:
     227           0 :                         return TableFormatPebblev1, nil
     228           1 :                 case 2:
     229           1 :                         return TableFormatPebblev2, nil
     230           1 :                 case 3:
     231           1 :                         return TableFormatPebblev3, nil
     232           1 :                 case 4:
     233           1 :                         return TableFormatPebblev4, nil
     234           0 :                 default:
     235           0 :                         return TableFormatUnspecified, base.CorruptionErrorf(
     236           0 :                                 "pebble/table: unsupported pebble format version %d", errors.Safe(version),
     237           0 :                         )
     238             :                 }
     239           0 :         default:
     240           0 :                 return TableFormatUnspecified, base.CorruptionErrorf(
     241           0 :                         "pebble/table: invalid table (bad magic number: 0x%x)", magic,
     242           0 :                 )
     243             :         }
     244             : }
     245             : 
     246             : // AsTuple returns the TableFormat's (Magic String, Version) tuple.
     247           1 : func (f TableFormat) AsTuple() (string, uint32) {
     248           1 :         switch f {
     249           0 :         case TableFormatLevelDB:
     250           0 :                 return levelDBMagic, 0
     251           0 :         case TableFormatRocksDBv2:
     252           0 :                 return rocksDBMagic, 2
     253           0 :         case TableFormatPebblev1:
     254           0 :                 return pebbleDBMagic, 1
     255           1 :         case TableFormatPebblev2:
     256           1 :                 return pebbleDBMagic, 2
     257           1 :         case TableFormatPebblev3:
     258           1 :                 return pebbleDBMagic, 3
     259           1 :         case TableFormatPebblev4:
     260           1 :                 return pebbleDBMagic, 4
     261           0 :         default:
     262           0 :                 panic("sstable: unknown table format version tuple")
     263             :         }
     264             : }
     265             : 
     266             : // String returns the TableFormat (Magic String,Version) tuple.
     267           0 : func (f TableFormat) String() string {
     268           0 :         switch f {
     269           0 :         case TableFormatLevelDB:
     270           0 :                 return "(LevelDB)"
     271           0 :         case TableFormatRocksDBv2:
     272           0 :                 return "(RocksDB,v2)"
     273           0 :         case TableFormatPebblev1:
     274           0 :                 return "(Pebble,v1)"
     275           0 :         case TableFormatPebblev2:
     276           0 :                 return "(Pebble,v2)"
     277           0 :         case TableFormatPebblev3:
     278           0 :                 return "(Pebble,v3)"
     279           0 :         case TableFormatPebblev4:
     280           0 :                 return "(Pebble,v4)"
     281           0 :         default:
     282           0 :                 panic("sstable: unknown table format version tuple")
     283             :         }
     284             : }

Generated by: LCOV version 1.14