LCOV - code coverage report
Current view: top level - pebble/sstable - table.go (source / functions) Hit Total Coverage
Test: 2024-04-01 08:16Z 1c7bcd1c - tests + meta.lcov Lines: 102 137 74.5 %
Date: 2024-04-01 08:17:59 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : // Package sstable implements readers and writers of pebble tables.
       6             : //
       7             : // Tables are either opened for reading or created for writing but not both.
       8             : //
       9             : // A reader can create iterators, which allow seeking and next/prev
      10             : // iteration. There may be multiple key/value pairs that have the same key and
      11             : // different sequence numbers.
      12             : //
      13             : // A reader can be used concurrently. Multiple goroutines can call NewIter
      14             : // concurrently, and each iterator can run concurrently with other iterators.
      15             : // However, any particular iterator should not be used concurrently, and iterators
      16             : // should not be used once a reader is closed.
      17             : //
      18             : // A writer writes key/value pairs in increasing key order, and cannot be used
      19             : // concurrently. A table cannot be read until the writer has finished.
      20             : //
      21             : // Readers and writers can be created with various options. Passing a nil
      22             : // Options pointer is valid and means to use the default values.
      23             : //
      24             : // One such option is to define the 'less than' ordering for keys. The default
      25             : // Comparer uses the natural ordering consistent with bytes.Compare. The same
      26             : // ordering should be used for reading and writing a table.
      27             : //
      28             : // To return the value for a key:
      29             : //
      30             : //      r := table.NewReader(file, options)
      31             : //      defer r.Close()
      32             : //      i := r.NewIter(nil, nil)
      33             : //      defer i.Close()
      34             : //      ikey, value := r.SeekGE(key)
      35             : //      if options.Comparer.Compare(ikey.UserKey, key) != 0 {
      36             : //        // not found
      37             : //      } else {
      38             : //        // value is the first record containing key
      39             : //      }
      40             : //
      41             : // To count the number of entries in a table:
      42             : //
      43             : //      i, n := r.NewIter(nil, nil), 0
      44             : //      for key, value := i.First(); key != nil; key, value = i.Next() {
      45             : //              n++
      46             : //      }
      47             : //      if err := i.Close(); err != nil {
      48             : //              return 0, err
      49             : //      }
      50             : //      return n, nil
      51             : //
      52             : // To write a table with three entries:
      53             : //
      54             : //      w := table.NewWriter(file, options)
      55             : //      if err := w.Set([]byte("apple"), []byte("red")); err != nil {
      56             : //              w.Close()
      57             : //              return err
      58             : //      }
      59             : //      if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
      60             : //              w.Close()
      61             : //              return err
      62             : //      }
      63             : //      if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
      64             : //              w.Close()
      65             : //              return err
      66             : //      }
      67             : //      return w.Close()
      68             : package sstable // import "github.com/cockroachdb/pebble/sstable"
      69             : 
      70             : import (
      71             :         "context"
      72             :         "encoding/binary"
      73             : 
      74             :         "github.com/cockroachdb/errors"
      75             :         "github.com/cockroachdb/pebble/internal/base"
      76             :         "github.com/cockroachdb/pebble/objstorage"
      77             : )
      78             : 
      79             : /*
      80             : The table file format looks like:
      81             : 
      82             : <start_of_file>
      83             : [data block 0]
      84             : [data block 1]
      85             : ...
      86             : [data block N-1]
      87             : [meta filter block] (optional)
      88             : [index block] (for single level index)
      89             : [meta rangedel block] (optional)
      90             : [meta range key block] (optional)
      91             : [value block 0] (optional)
      92             : [value block M-1] (optional)
      93             : [meta value index block] (optional)
      94             : [meta properties block]
      95             : [metaindex block]
      96             : [footer]
      97             : <end_of_file>
      98             : 
      99             : A Reader eagerly loads the footer, metaindex block and meta properties block,
     100             : because the data contained in those blocks is needed on every read, and even
     101             : before reading. For example, the meta properties block is used to verify the
     102             : comparer and merger are compatible, and the metaindex block contains the
     103             : location of the meta properties (and other meta blocks). In situations where
     104             : file system locality matters, or one wants to minimize number of read
     105             : requests when eagerly loading these blocks, having these three as a suffix
     106             : of the file is convenient.
     107             : 
     108             : The interleaving of the index block(s) between the meta blocks is done to
     109             : match RocksDB/LevelDB behavior.
     110             : 
     111             : Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
     112             : 4 byte checksum. The checksum is computed over the compressed data and the first
     113             : byte of the trailer (i.e. the block type), and is serialized as little-endian.
     114             : The block type gives the per-block compression used; each block is compressed
     115             : independently. The checksum algorithm is described in the pebble/crc package.
     116             : 
     117             : Most blocks, other than the meta filter block, value blocks and meta value
     118             : index block, contain key/value pairs. The remainder of this comment refers to
     119             : the decompressed block, containing key/value pairs, which has its 5 byte
     120             : trailer stripped. The decompressed block data consists of a sequence of such
     121             : key/value entries followed by a block suffix. Each key is encoded as a shared
     122             : prefix length and a remainder string. For example, if two adjacent keys are
     123             : "tweedledee" and "tweedledum", then the second key would be encoded as {8,
     124             : "um"}. The shared prefix length is varint encoded. The remainder string and the
     125             : value are encoded as a varint-encoded length followed by the literal contents.
     126             : To continue the example, suppose that the key "tweedledum" mapped to the value
     127             : "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
     128             : 
     129             : Every block has a restart interval I. Every I'th key/value entry in that block
     130             : is called a restart point, and shares no key prefix with the previous entry.
     131             : Continuing the example above, if the key after "tweedledum" was "two", but was
     132             : part of a restart point, then that key would be encoded as {0, "two"} instead
     133             : of {2, "o"}. If a block has P restart points, then the block suffix consists
     134             : of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
     135             : uint32 values are the block offsets of each restart point. The final uint32
     136             : value is P itself. Thus, when seeking for a particular key, one can use binary
     137             : search to find the largest restart point whose key is <= the key sought.
     138             : 
     139             : An index block is a block with N key/value entries. The i'th value is the
     140             : encoded block handle of the i'th data block. The i'th key is a separator for
     141             : i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
     142             : is a key that is >= every key in block i and is < every key i block i+1. The
     143             : successor for the final block is a key that is >= every key in block N-1. The
     144             : index block restart interval is 1: every entry is a restart point.
     145             : 
     146             : A block handle is an offset, a length, and optional block properties (for data
     147             : blocks and first/lower level index blocks); the length does not include the 5
     148             : byte trailer. All numbers are varint-encoded, with no padding between the two
     149             : values. The maximum size of an encoded block handle without properties is 20
     150             : bytes. It is not advised to have properties that accumulate to be longer than
     151             : 100 bytes.
     152             : 
     153             : Instead of a single index block, the sstable can have a two-level index (this
     154             : is used to prevent a single huge index block). A two-level index consists of a
     155             : sequence of lower-level index blocks with block handles for data blocks
     156             : followed by a single top-level index block with block handles for the
     157             : lower-level index blocks.
     158             : 
     159             : The metaindex block also contains block handles as values, with keys being
     160             : the names of the meta blocks.
     161             : 
     162             : For a description of value blocks and the meta value index block, see
     163             : value_block.go.
     164             : 
     165             : Data blocks have some additional features:
     166             : - For TableFormatPebblev3 onwards:
     167             :   - For SETs, the value has a 1 byte value prefix, which indicates whether the
     168             :     value is inline, or in a separate value block, and indicates whether the
     169             :     prefix of the userkey (as defined by split) has changed or not. See
     170             :     value_block.go for details.
     171             :   - The most significant bit of the restart points is used to indicate whether
     172             :     userkey prefix has changed since the last restart point. See the detailed
     173             :     comment in blockWriter.
     174             :   - The maximum length of the "shared prefix" when encoding the key, is the
     175             :     length of the prefix of the userkey (as defined by split) of the previous
     176             :     key.
     177             : 
     178             : - For TableFormatPebblev4 onwards:
     179             :   - The key kinds may be altered to set the
     180             :     InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
     181             :     in the context of that sstable (for a reader that reads at a higher seqnum
     182             :     than the highest seqnum in the sstable). For details, see the comment in
     183             :     format.go.
     184             : */
     185             : 
     186             : const (
     187             :         blockTrailerLen                    = 5
     188             :         blockHandleMaxLenWithoutProperties = 10 + 10
     189             :         // blockHandleLikelyMaxLen can be used for pre-allocating buffers to
     190             :         // reduce memory copies. It is not guaranteed that a block handle will not
     191             :         // exceed this length.
     192             :         blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100
     193             : 
     194             :         levelDBFooterLen   = 48
     195             :         levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
     196             :         levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
     197             : 
     198             :         rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
     199             :         rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
     200             :         rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
     201             :         rocksDBVersionOffset         = rocksDBMagicOffset - 4
     202             :         rocksDBExternalFormatVersion = 2
     203             : 
     204             :         pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳
     205             : 
     206             :         minFooterLen = levelDBFooterLen
     207             :         maxFooterLen = rocksDBFooterLen
     208             : 
     209             :         levelDBFormatVersion  = 0
     210             :         rocksDBFormatVersion2 = 2
     211             : 
     212             :         metaRangeKeyName   = "pebble.range_key"
     213             :         metaValueIndexName = "pebble.value_index"
     214             :         metaPropertiesName = "rocksdb.properties"
     215             :         metaRangeDelName   = "rocksdb.range_del"
     216             :         metaRangeDelV2Name = "rocksdb.range_del2"
     217             : 
     218             :         // Index Types.
     219             :         // A space efficient index block that is optimized for binary-search-based
     220             :         // index.
     221             :         binarySearchIndex = 0
     222             :         // hashSearchIndex               = 1
     223             :         // A two-level index implementation. Both levels are binary search indexes.
     224             :         twoLevelIndex = 2
     225             :         // binarySearchWithFirstKeyIndex = 3
     226             : 
     227             :         // RocksDB always includes this in the properties block. Since Pebble
     228             :         // doesn't use zstd compression, the string will always be the same.
     229             :         // This should be removed if we ever decide to diverge from the RocksDB
     230             :         // properties block.
     231             :         rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
     232             : )
     233             : 
     234             : // ChecksumType specifies the checksum used for blocks.
     235             : type ChecksumType byte
     236             : 
     237             : // The available checksum types.
     238             : const (
     239             :         ChecksumTypeNone     ChecksumType = 0
     240             :         ChecksumTypeCRC32c   ChecksumType = 1
     241             :         ChecksumTypeXXHash   ChecksumType = 2
     242             :         ChecksumTypeXXHash64 ChecksumType = 3
     243             : )
     244             : 
     245             : // String implements fmt.Stringer.
     246           1 : func (t ChecksumType) String() string {
     247           1 :         switch t {
     248           1 :         case ChecksumTypeCRC32c:
     249           1 :                 return "crc32c"
     250           0 :         case ChecksumTypeNone:
     251           0 :                 return "none"
     252           0 :         case ChecksumTypeXXHash:
     253           0 :                 return "xxhash"
     254           0 :         case ChecksumTypeXXHash64:
     255           0 :                 return "xxhash64"
     256           0 :         default:
     257           0 :                 panic(errors.Newf("sstable: unknown checksum type: %d", t))
     258             :         }
     259             : }
     260             : 
     261             : type blockType byte
     262             : 
     263             : const (
     264             :         // The block type gives the per-block compression format.
     265             :         // These constants are part of the file format and should not be changed.
     266             :         // They are different from the Compression constants because the latter
     267             :         // are designed so that the zero value of the Compression type means to
     268             :         // use the default compression (which is snappy).
     269             :         // Not all compression types listed here are supported.
     270             :         noCompressionBlockType     blockType = 0
     271             :         snappyCompressionBlockType blockType = 1
     272             :         zlibCompressionBlockType   blockType = 2
     273             :         bzip2CompressionBlockType  blockType = 3
     274             :         lz4CompressionBlockType    blockType = 4
     275             :         lz4hcCompressionBlockType  blockType = 5
     276             :         xpressCompressionBlockType blockType = 6
     277             :         zstdCompressionBlockType   blockType = 7
     278             : )
     279             : 
     280             : // String implements fmt.Stringer.
     281           1 : func (t blockType) String() string {
     282           1 :         switch t {
     283           1 :         case 0:
     284           1 :                 return "none"
     285           1 :         case 1:
     286           1 :                 return "snappy"
     287           0 :         case 2:
     288           0 :                 return "zlib"
     289           0 :         case 3:
     290           0 :                 return "bzip2"
     291           0 :         case 4:
     292           0 :                 return "lz4"
     293           0 :         case 5:
     294           0 :                 return "lz4hc"
     295           0 :         case 6:
     296           0 :                 return "xpress"
     297           0 :         case 7:
     298           0 :                 return "zstd"
     299           0 :         default:
     300           0 :                 panic(errors.Newf("sstable: unknown block type: %d", t))
     301             :         }
     302             : }
     303             : 
     304             : // legacy (LevelDB) footer format:
     305             : //
     306             : //      metaindex handle (varint64 offset, varint64 size)
     307             : //      index handle     (varint64 offset, varint64 size)
     308             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
     309             : //      table_magic_number (8 bytes)
     310             : //
     311             : // new (RocksDB) footer format:
     312             : //
     313             : //      checksum type (char, 1 byte)
     314             : //      metaindex handle (varint64 offset, varint64 size)
     315             : //      index handle     (varint64 offset, varint64 size)
     316             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
     317             : //      footer version (4 bytes)
     318             : //      table_magic_number (8 bytes)
     319             : type footer struct {
     320             :         format      TableFormat
     321             :         checksum    ChecksumType
     322             :         metaindexBH BlockHandle
     323             :         indexBH     BlockHandle
     324             :         footerBH    BlockHandle
     325             : }
     326             : 
     327           2 : func readFooter(f objstorage.Readable) (footer, error) {
     328           2 :         var footer footer
     329           2 :         size := f.Size()
     330           2 :         if size < minFooterLen {
     331           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table (file size is too small)")
     332           1 :         }
     333             : 
     334           2 :         buf := make([]byte, maxFooterLen)
     335           2 :         off := size - maxFooterLen
     336           2 :         if off < 0 {
     337           1 :                 off = 0
     338           1 :                 buf = buf[:size]
     339           1 :         }
     340           2 :         if err := f.ReadAt(context.TODO(), buf, off); err != nil {
     341           1 :                 return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
     342           1 :         }
     343             : 
     344           2 :         switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
     345           1 :         case levelDBMagic:
     346           1 :                 if len(buf) < levelDBFooterLen {
     347           0 :                         return footer, base.CorruptionErrorf(
     348           0 :                                 "pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
     349           0 :                 }
     350           1 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
     351           1 :                 buf = buf[len(buf)-levelDBFooterLen:]
     352           1 :                 footer.footerBH.Length = uint64(len(buf))
     353           1 :                 footer.format = TableFormatLevelDB
     354           1 :                 footer.checksum = ChecksumTypeCRC32c
     355             : 
     356           2 :         case rocksDBMagic, pebbleDBMagic:
     357           2 :                 // NOTE: The Pebble magic string implies the same footer format as that used
     358           2 :                 // by the RocksDBv2 table format.
     359           2 :                 if len(buf) < rocksDBFooterLen {
     360           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
     361           1 :                 }
     362           2 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
     363           2 :                 buf = buf[len(buf)-rocksDBFooterLen:]
     364           2 :                 footer.footerBH.Length = uint64(len(buf))
     365           2 :                 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
     366           2 : 
     367           2 :                 format, err := ParseTableFormat(magic, version)
     368           2 :                 if err != nil {
     369           0 :                         return footer, err
     370           0 :                 }
     371           2 :                 footer.format = format
     372           2 : 
     373           2 :                 switch ChecksumType(buf[0]) {
     374           2 :                 case ChecksumTypeCRC32c:
     375           2 :                         footer.checksum = ChecksumTypeCRC32c
     376           1 :                 case ChecksumTypeXXHash64:
     377           1 :                         footer.checksum = ChecksumTypeXXHash64
     378           1 :                 default:
     379           1 :                         return footer, base.CorruptionErrorf("pebble/table: unsupported checksum type %d", errors.Safe(footer.checksum))
     380             :                 }
     381           2 :                 buf = buf[1:]
     382             : 
     383           1 :         default:
     384           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table (bad magic number: 0x%x)", magic)
     385             :         }
     386             : 
     387           2 :         {
     388           2 :                 end := uint64(size)
     389           2 :                 var n int
     390           2 :                 footer.metaindexBH, n = decodeBlockHandle(buf)
     391           2 :                 if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
     392           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (bad metaindex block handle)")
     393           1 :                 }
     394           2 :                 buf = buf[n:]
     395           2 : 
     396           2 :                 footer.indexBH, n = decodeBlockHandle(buf)
     397           2 :                 if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
     398           0 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (bad index block handle)")
     399           0 :                 }
     400             :         }
     401             : 
     402           2 :         return footer, nil
     403             : }
     404             : 
     405           2 : func (f footer) encode(buf []byte) []byte {
     406           2 :         switch magic, version := f.format.AsTuple(); magic {
     407           1 :         case levelDBMagic:
     408           1 :                 buf = buf[:levelDBFooterLen]
     409           1 :                 clear(buf)
     410           1 :                 n := encodeBlockHandle(buf[0:], f.metaindexBH)
     411           1 :                 encodeBlockHandle(buf[n:], f.indexBH)
     412           1 :                 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
     413             : 
     414           2 :         case rocksDBMagic, pebbleDBMagic:
     415           2 :                 buf = buf[:rocksDBFooterLen]
     416           2 :                 clear(buf)
     417           2 :                 switch f.checksum {
     418           1 :                 case ChecksumTypeNone:
     419           1 :                         buf[0] = byte(ChecksumTypeNone)
     420           2 :                 case ChecksumTypeCRC32c:
     421           2 :                         buf[0] = byte(ChecksumTypeCRC32c)
     422           1 :                 case ChecksumTypeXXHash:
     423           1 :                         buf[0] = byte(ChecksumTypeXXHash)
     424           1 :                 case ChecksumTypeXXHash64:
     425           1 :                         buf[0] = byte(ChecksumTypeXXHash64)
     426           0 :                 default:
     427           0 :                         panic("unknown checksum type")
     428             :                 }
     429           2 :                 n := 1
     430           2 :                 n += encodeBlockHandle(buf[n:], f.metaindexBH)
     431           2 :                 encodeBlockHandle(buf[n:], f.indexBH)
     432           2 :                 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
     433           2 :                 copy(buf[len(buf)-len(rocksDBMagic):], magic)
     434             : 
     435           0 :         default:
     436           0 :                 panic("sstable: unspecified table format version")
     437             :         }
     438             : 
     439           2 :         return buf
     440             : }
     441             : 
     442           2 : func supportsTwoLevelIndex(format TableFormat) bool {
     443           2 :         switch format {
     444           1 :         case TableFormatLevelDB:
     445           1 :                 return false
     446           2 :         case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4:
     447           2 :                 return true
     448           0 :         default:
     449           0 :                 panic("sstable: unspecified table format version")
     450             :         }
     451             : }

Generated by: LCOV version 1.14