LCOV - code coverage report
Current view: top level - pebble/sstable - table.go (source / functions) Hit Total Coverage
Test: 2024-07-01 08:16Z a28d244d - tests only.lcov Lines: 104 131 79.4 %
Date: 2024-07-01 08:17:15 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : // Package sstable implements readers and writers of pebble tables.
       6             : //
       7             : // Tables are either opened for reading or created for writing but not both.
       8             : //
       9             : // A reader can create iterators, which allow seeking and next/prev
      10             : // iteration. There may be multiple key/value pairs that have the same key and
      11             : // different sequence numbers.
      12             : //
      13             : // A reader can be used concurrently. Multiple goroutines can call NewIter
      14             : // concurrently, and each iterator can run concurrently with other iterators.
      15             : // However, any particular iterator should not be used concurrently, and iterators
      16             : // should not be used once a reader is closed.
      17             : //
      18             : // A writer writes key/value pairs in increasing key order, and cannot be used
      19             : // concurrently. A table cannot be read until the writer has finished.
      20             : //
      21             : // Readers and writers can be created with various options. Passing a nil
      22             : // Options pointer is valid and means to use the default values.
      23             : //
      24             : // One such option is to define the 'less than' ordering for keys. The default
      25             : // Comparer uses the natural ordering consistent with bytes.Compare. The same
      26             : // ordering should be used for reading and writing a table.
      27             : //
      28             : // To return the value for a key:
      29             : //
      30             : //      r := table.NewReader(file, options)
      31             : //      defer r.Close()
      32             : //      i := r.NewIter(nil, nil)
      33             : //      defer i.Close()
      34             : //      ikey, value := r.SeekGE(key)
      35             : //      if options.Comparer.Compare(ikey.UserKey, key) != 0 {
      36             : //        // not found
      37             : //      } else {
      38             : //        // value is the first record containing key
      39             : //      }
      40             : //
      41             : // To count the number of entries in a table:
      42             : //
      43             : //      i, n := r.NewIter(nil, nil), 0
      44             : //      for key, value := i.First(); key != nil; key, value = i.Next() {
      45             : //              n++
      46             : //      }
      47             : //      if err := i.Close(); err != nil {
      48             : //              return 0, err
      49             : //      }
      50             : //      return n, nil
      51             : //
      52             : // To write a table with three entries:
      53             : //
      54             : //      w := table.NewWriter(file, options)
      55             : //      if err := w.Set([]byte("apple"), []byte("red")); err != nil {
      56             : //              w.Close()
      57             : //              return err
      58             : //      }
      59             : //      if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
      60             : //              w.Close()
      61             : //              return err
      62             : //      }
      63             : //      if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
      64             : //              w.Close()
      65             : //              return err
      66             : //      }
      67             : //      return w.Close()
      68             : package sstable // import "github.com/cockroachdb/pebble/sstable"
      69             : 
      70             : import (
      71             :         "context"
      72             :         "encoding/binary"
      73             : 
      74             :         "github.com/cockroachdb/errors"
      75             :         "github.com/cockroachdb/pebble/internal/base"
      76             :         "github.com/cockroachdb/pebble/objstorage"
      77             :         "github.com/cockroachdb/pebble/sstable/block"
      78             : )
      79             : 
      80             : /*
      81             : The table file format looks like:
      82             : 
      83             : <start_of_file>
      84             : [data block 0]
      85             : [data block 1]
      86             : ...
      87             : [data block N-1]
      88             : [meta filter block] (optional)
      89             : [index block] (for single level index)
      90             : [meta rangedel block] (optional)
      91             : [meta range key block] (optional)
      92             : [value block 0] (optional)
      93             : [value block M-1] (optional)
      94             : [meta value index block] (optional)
      95             : [meta properties block]
      96             : [metaindex block]
      97             : [footer]
      98             : <end_of_file>
      99             : 
     100             : A Reader eagerly loads the footer, metaindex block and meta properties block,
     101             : because the data contained in those blocks is needed on every read, and even
     102             : before reading. For example, the meta properties block is used to verify the
     103             : comparer and merger are compatible, and the metaindex block contains the
     104             : location of the meta properties (and other meta blocks). In situations where
     105             : file system locality matters, or one wants to minimize number of read
     106             : requests when eagerly loading these blocks, having these three as a suffix
     107             : of the file is convenient.
     108             : 
     109             : The interleaving of the index block(s) between the meta blocks is done to
     110             : match RocksDB/LevelDB behavior.
     111             : 
     112             : Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
     113             : 4 byte checksum. The checksum is computed over the compressed data and the first
     114             : byte of the trailer (i.e. the block type), and is serialized as little-endian.
     115             : The block type gives the per-block compression used; each block is compressed
     116             : independently. The checksum algorithm is described in the pebble/crc package.
     117             : 
     118             : Most blocks, other than the meta filter block, value blocks and meta value
     119             : index block, contain key/value pairs. The remainder of this comment refers to
     120             : the decompressed block, containing key/value pairs, which has its 5 byte
     121             : trailer stripped. The decompressed block data consists of a sequence of such
     122             : key/value entries followed by a block suffix. Each key is encoded as a shared
     123             : prefix length and a remainder string. For example, if two adjacent keys are
     124             : "tweedledee" and "tweedledum", then the second key would be encoded as {8,
     125             : "um"}. The shared prefix length is varint encoded. The remainder string and the
     126             : value are encoded as a varint-encoded length followed by the literal contents.
     127             : To continue the example, suppose that the key "tweedledum" mapped to the value
     128             : "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
     129             : 
     130             : Every block has a restart interval I. Every I'th key/value entry in that block
     131             : is called a restart point, and shares no key prefix with the previous entry.
     132             : Continuing the example above, if the key after "tweedledum" was "two", but was
     133             : part of a restart point, then that key would be encoded as {0, "two"} instead
     134             : of {2, "o"}. If a block has P restart points, then the block suffix consists
     135             : of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
     136             : uint32 values are the block offsets of each restart point. The final uint32
     137             : value is P itself. Thus, when seeking for a particular key, one can use binary
     138             : search to find the largest restart point whose key is <= the key sought.
     139             : 
     140             : An index block is a block with N key/value entries. The i'th value is the
     141             : encoded block handle of the i'th data block. The i'th key is a separator for
     142             : i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
     143             : is a key that is >= every key in block i and is < every key i block i+1. The
     144             : successor for the final block is a key that is >= every key in block N-1. The
     145             : index block restart interval is 1: every entry is a restart point.
     146             : 
     147             : A block handle is an offset, a length, and optional block properties (for data
     148             : blocks and first/lower level index blocks); the length does not include the 5
     149             : byte trailer. All numbers are varint-encoded, with no padding between the two
     150             : values. The maximum size of an encoded block handle without properties is 20
     151             : bytes. It is not advised to have properties that accumulate to be longer than
     152             : 100 bytes.
     153             : 
     154             : Instead of a single index block, the sstable can have a two-level index (this
     155             : is used to prevent a single huge index block). A two-level index consists of a
     156             : sequence of lower-level index blocks with block handles for data blocks
     157             : followed by a single top-level index block with block handles for the
     158             : lower-level index blocks.
     159             : 
     160             : The metaindex block also contains block handles as values, with keys being
     161             : the names of the meta blocks.
     162             : 
     163             : For a description of value blocks and the meta value index block, see
     164             : value_block.go.
     165             : 
     166             : Data blocks have some additional features:
     167             : - For TableFormatPebblev3 onwards:
     168             :   - For SETs, the value has a 1 byte value prefix, which indicates whether the
     169             :     value is inline, or in a separate value block, and indicates whether the
     170             :     prefix of the userkey (as defined by split) has changed or not. See
     171             :     value_block.go for details.
     172             :   - The most significant bit of the restart points is used to indicate whether
     173             :     userkey prefix has changed since the last restart point. See the detailed
     174             :     comment in blockWriter.
     175             :   - The maximum length of the "shared prefix" when encoding the key, is the
     176             :     length of the prefix of the userkey (as defined by split) of the previous
     177             :     key.
     178             : 
     179             : - For TableFormatPebblev4 onwards:
     180             :   - The key kinds may be altered to set the
     181             :     InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
     182             :     in the context of that sstable (for a reader that reads at a higher seqnum
     183             :     than the highest seqnum in the sstable). For details, see the comment in
     184             :     format.go.
     185             : */
     186             : 
     187             : const (
     188             :         blockHandleMaxLenWithoutProperties = 10 + 10
     189             :         // blockHandleLikelyMaxLen can be used for pre-allocating buffers to
     190             :         // reduce memory copies. It is not guaranteed that a block handle will not
     191             :         // exceed this length.
     192             :         blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100
     193             : 
     194             :         levelDBFooterLen   = 48
     195             :         levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
     196             :         levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
     197             : 
     198             :         rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
     199             :         rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
     200             :         rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
     201             :         rocksDBVersionOffset         = rocksDBMagicOffset - 4
     202             :         rocksDBExternalFormatVersion = 2
     203             : 
     204             :         pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳
     205             : 
     206             :         minFooterLen = levelDBFooterLen
     207             :         maxFooterLen = rocksDBFooterLen
     208             : 
     209             :         levelDBFormatVersion  = 0
     210             :         rocksDBFormatVersion2 = 2
     211             : 
     212             :         metaRangeKeyName   = "pebble.range_key"
     213             :         metaValueIndexName = "pebble.value_index"
     214             :         metaPropertiesName = "rocksdb.properties"
     215             :         metaRangeDelV1Name = "rocksdb.range_del"
     216             :         metaRangeDelV2Name = "rocksdb.range_del2"
     217             : 
     218             :         // Index Types.
     219             :         // A space efficient index block that is optimized for binary-search-based
     220             :         // index.
     221             :         binarySearchIndex = 0
     222             :         // hashSearchIndex               = 1
     223             :         // A two-level index implementation. Both levels are binary search indexes.
     224             :         twoLevelIndex = 2
     225             :         // binarySearchWithFirstKeyIndex = 3
     226             : 
     227             :         // RocksDB always includes this in the properties block. Since Pebble
     228             :         // doesn't use zstd compression, the string will always be the same.
     229             :         // This should be removed if we ever decide to diverge from the RocksDB
     230             :         // properties block.
     231             :         rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
     232             : )
     233             : 
     234             : type blockType byte
     235             : 
     236             : const (
     237             :         // The block type gives the per-block compression format.
     238             :         // These constants are part of the file format and should not be changed.
     239             :         // They are different from the Compression constants because the latter
     240             :         // are designed so that the zero value of the Compression type means to
     241             :         // use the default compression (which is snappy).
     242             :         // Not all compression types listed here are supported.
     243             :         noCompressionBlockType     blockType = 0
     244             :         snappyCompressionBlockType blockType = 1
     245             :         zlibCompressionBlockType   blockType = 2
     246             :         bzip2CompressionBlockType  blockType = 3
     247             :         lz4CompressionBlockType    blockType = 4
     248             :         lz4hcCompressionBlockType  blockType = 5
     249             :         xpressCompressionBlockType blockType = 6
     250             :         zstdCompressionBlockType   blockType = 7
     251             : )
     252             : 
     253             : // String implements fmt.Stringer.
     254           1 : func (t blockType) String() string {
     255           1 :         switch t {
     256           1 :         case 0:
     257           1 :                 return "none"
     258           1 :         case 1:
     259           1 :                 return "snappy"
     260           0 :         case 2:
     261           0 :                 return "zlib"
     262           0 :         case 3:
     263           0 :                 return "bzip2"
     264           0 :         case 4:
     265           0 :                 return "lz4"
     266           0 :         case 5:
     267           0 :                 return "lz4hc"
     268           0 :         case 6:
     269           0 :                 return "xpress"
     270           0 :         case 7:
     271           0 :                 return "zstd"
     272           0 :         default:
     273           0 :                 panic(errors.Newf("sstable: unknown block type: %d", t))
     274             :         }
     275             : }
     276             : 
     277             : // legacy (LevelDB) footer format:
     278             : //
     279             : //      metaindex handle (varint64 offset, varint64 size)
     280             : //      index handle     (varint64 offset, varint64 size)
     281             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
     282             : //      table_magic_number (8 bytes)
     283             : //
     284             : // new (RocksDB) footer format:
     285             : //
     286             : //      checksum type (char, 1 byte)
     287             : //      metaindex handle (varint64 offset, varint64 size)
     288             : //      index handle     (varint64 offset, varint64 size)
     289             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
     290             : //      footer version (4 bytes)
     291             : //      table_magic_number (8 bytes)
     292             : type footer struct {
     293             :         format      TableFormat
     294             :         checksum    block.ChecksumType
     295             :         metaindexBH block.Handle
     296             :         indexBH     block.Handle
     297             :         footerBH    block.Handle
     298             : }
     299             : 
     300             : // readHandle is optional.
     301             : func readFooter(
     302             :         ctx context.Context, f objstorage.Readable, readHandle objstorage.ReadHandle,
     303           1 : ) (footer, error) {
     304           1 :         var footer footer
     305           1 :         size := f.Size()
     306           1 :         if size < minFooterLen {
     307           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table (file size is too small)")
     308           1 :         }
     309             : 
     310           1 :         buf := make([]byte, maxFooterLen)
     311           1 :         off := size - maxFooterLen
     312           1 :         if off < 0 {
     313           1 :                 off = 0
     314           1 :                 buf = buf[:size]
     315           1 :         }
     316           1 :         var err error
     317           1 :         if readHandle != nil {
     318           1 :                 err = readHandle.ReadAt(ctx, buf, off)
     319           1 :         } else {
     320           1 :                 err = f.ReadAt(ctx, buf, off)
     321           1 :         }
     322           1 :         if err != nil {
     323           1 :                 return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
     324           1 :         }
     325             : 
     326           1 :         switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
     327           1 :         case levelDBMagic:
     328           1 :                 if len(buf) < levelDBFooterLen {
     329           0 :                         return footer, base.CorruptionErrorf(
     330           0 :                                 "pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
     331           0 :                 }
     332           1 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
     333           1 :                 buf = buf[len(buf)-levelDBFooterLen:]
     334           1 :                 footer.footerBH.Length = uint64(len(buf))
     335           1 :                 footer.format = TableFormatLevelDB
     336           1 :                 footer.checksum = block.ChecksumTypeCRC32c
     337             : 
     338           1 :         case rocksDBMagic, pebbleDBMagic:
     339           1 :                 // NOTE: The Pebble magic string implies the same footer format as that used
     340           1 :                 // by the RocksDBv2 table format.
     341           1 :                 if len(buf) < rocksDBFooterLen {
     342           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
     343           1 :                 }
     344           1 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
     345           1 :                 buf = buf[len(buf)-rocksDBFooterLen:]
     346           1 :                 footer.footerBH.Length = uint64(len(buf))
     347           1 :                 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
     348           1 : 
     349           1 :                 format, err := ParseTableFormat(magic, version)
     350           1 :                 if err != nil {
     351           0 :                         return footer, err
     352           0 :                 }
     353           1 :                 footer.format = format
     354           1 : 
     355           1 :                 switch block.ChecksumType(buf[0]) {
     356           1 :                 case block.ChecksumTypeCRC32c:
     357           1 :                         footer.checksum = block.ChecksumTypeCRC32c
     358           1 :                 case block.ChecksumTypeXXHash64:
     359           1 :                         footer.checksum = block.ChecksumTypeXXHash64
     360           1 :                 default:
     361           1 :                         return footer, base.CorruptionErrorf("pebble/table: unsupported checksum type %d", errors.Safe(footer.checksum))
     362             :                 }
     363           1 :                 buf = buf[1:]
     364             : 
     365           1 :         default:
     366           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table (bad magic number: 0x%x)", magic)
     367             :         }
     368             : 
     369           1 :         {
     370           1 :                 end := uint64(size)
     371           1 :                 var n int
     372           1 :                 footer.metaindexBH, n = decodeBlockHandle(buf)
     373           1 :                 if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
     374           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (bad metaindex block handle)")
     375           1 :                 }
     376           1 :                 buf = buf[n:]
     377           1 : 
     378           1 :                 footer.indexBH, n = decodeBlockHandle(buf)
     379           1 :                 if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
     380           0 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (bad index block handle)")
     381           0 :                 }
     382             :         }
     383             : 
     384           1 :         return footer, nil
     385             : }
     386             : 
     387           1 : func (f footer) encode(buf []byte) []byte {
     388           1 :         switch magic, version := f.format.AsTuple(); magic {
     389           1 :         case levelDBMagic:
     390           1 :                 buf = buf[:levelDBFooterLen]
     391           1 :                 clear(buf)
     392           1 :                 n := encodeBlockHandle(buf[0:], f.metaindexBH)
     393           1 :                 encodeBlockHandle(buf[n:], f.indexBH)
     394           1 :                 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
     395             : 
     396           1 :         case rocksDBMagic, pebbleDBMagic:
     397           1 :                 buf = buf[:rocksDBFooterLen]
     398           1 :                 clear(buf)
     399           1 :                 switch f.checksum {
     400           1 :                 case block.ChecksumTypeNone:
     401           1 :                         buf[0] = byte(block.ChecksumTypeNone)
     402           1 :                 case block.ChecksumTypeCRC32c:
     403           1 :                         buf[0] = byte(block.ChecksumTypeCRC32c)
     404           1 :                 case block.ChecksumTypeXXHash:
     405           1 :                         buf[0] = byte(block.ChecksumTypeXXHash)
     406           1 :                 case block.ChecksumTypeXXHash64:
     407           1 :                         buf[0] = byte(block.ChecksumTypeXXHash64)
     408           0 :                 default:
     409           0 :                         panic("unknown checksum type")
     410             :                 }
     411           1 :                 n := 1
     412           1 :                 n += encodeBlockHandle(buf[n:], f.metaindexBH)
     413           1 :                 encodeBlockHandle(buf[n:], f.indexBH)
     414           1 :                 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
     415           1 :                 copy(buf[len(buf)-len(rocksDBMagic):], magic)
     416             : 
     417           0 :         default:
     418           0 :                 panic("sstable: unspecified table format version")
     419             :         }
     420             : 
     421           1 :         return buf
     422             : }
     423             : 
     424           1 : func supportsTwoLevelIndex(format TableFormat) bool {
     425           1 :         switch format {
     426           1 :         case TableFormatLevelDB:
     427           1 :                 return false
     428           1 :         case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4:
     429           1 :                 return true
     430           0 :         default:
     431           0 :                 panic("sstable: unspecified table format version")
     432             :         }
     433             : }

Generated by: LCOV version 1.14