LCOV - code coverage report
Current view: top level - pebble/sstable - table.go (source / functions) Hit Total Coverage
Test: 2025-01-11 08:16Z ce9b648a - tests + meta.lcov Lines: 112 124 90.3 %
Date: 2025-01-11 08:17:30 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : // Package sstable implements readers and writers of pebble tables.
       6             : //
       7             : // Tables are either opened for reading or created for writing but not both.
       8             : //
       9             : // A reader can create iterators, which allow seeking and next/prev
      10             : // iteration. There may be multiple key/value pairs that have the same key and
      11             : // different sequence numbers.
      12             : //
      13             : // A reader can be used concurrently. Multiple goroutines can call NewIter
      14             : // concurrently, and each iterator can run concurrently with other iterators.
      15             : // However, any particular iterator should not be used concurrently, and iterators
      16             : // should not be used once a reader is closed.
      17             : //
      18             : // A writer writes key/value pairs in increasing key order, and cannot be used
      19             : // concurrently. A table cannot be read until the writer has finished.
      20             : //
      21             : // Readers and writers can be created with various options. Passing a nil
      22             : // Options pointer is valid and means to use the default values.
      23             : //
      24             : // One such option is to define the 'less than' ordering for keys. The default
      25             : // Comparer uses the natural ordering consistent with bytes.Compare. The same
      26             : // ordering should be used for reading and writing a table.
      27             : //
      28             : // To return the value for a key:
      29             : //
      30             : //      r := table.NewReader(file, options)
      31             : //      defer r.Close()
      32             : //      i := r.NewIter(nil, nil)
      33             : //      defer i.Close()
      34             : //      ikey, value := r.SeekGE(key)
      35             : //      if options.Comparer.Compare(ikey.UserKey, key) != 0 {
      36             : //        // not found
      37             : //      } else {
      38             : //        // value is the first record containing key
      39             : //      }
      40             : //
      41             : // To count the number of entries in a table:
      42             : //
      43             : //      i, n := r.NewIter(nil, nil), 0
      44             : //      for key, value := i.First(); key != nil; key, value = i.Next() {
      45             : //              n++
      46             : //      }
      47             : //      if err := i.Close(); err != nil {
      48             : //              return 0, err
      49             : //      }
      50             : //      return n, nil
      51             : //
      52             : // To write a table with three entries:
      53             : //
      54             : //      w := table.NewWriter(file, options)
      55             : //      if err := w.Set([]byte("apple"), []byte("red")); err != nil {
      56             : //              w.Close()
      57             : //              return err
      58             : //      }
      59             : //      if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
      60             : //              w.Close()
      61             : //              return err
      62             : //      }
      63             : //      if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
      64             : //              w.Close()
      65             : //              return err
      66             : //      }
      67             : //      return w.Close()
      68             : package sstable // import "github.com/cockroachdb/pebble/sstable"
      69             : 
      70             : import (
      71             :         "context"
      72             :         "encoding/binary"
      73             :         "time"
      74             : 
      75             :         "github.com/cockroachdb/errors"
      76             :         "github.com/cockroachdb/pebble/internal/base"
      77             :         "github.com/cockroachdb/pebble/objstorage"
      78             :         "github.com/cockroachdb/pebble/sstable/block"
      79             : )
      80             : 
      81             : /*
      82             : The table file format looks like:
      83             : 
      84             : <start_of_file>
      85             : [data block 0]
      86             : [data block 1]
      87             : ...
      88             : [data block N-1]
      89             : [meta filter block] (optional)
      90             : [index block] (for single level index)
      91             : [meta rangedel block] (optional)
      92             : [meta range key block] (optional)
      93             : [value block 0] (optional)
      94             : [value block M-1] (optional)
      95             : [meta value index block] (optional)
      96             : [meta properties block]
      97             : [metaindex block]
      98             : [footer]
      99             : <end_of_file>
     100             : 
     101             : A Reader eagerly loads the footer, metaindex block and meta properties block,
     102             : because the data contained in those blocks is needed on every read, and even
     103             : before reading. For example, the meta properties block is used to verify the
     104             : comparer and merger are compatible, and the metaindex block contains the
     105             : location of the meta properties (and other meta blocks). In situations where
     106             : file system locality matters, or one wants to minimize number of read
     107             : requests when eagerly loading these blocks, having these three as a suffix
     108             : of the file is convenient.
     109             : 
     110             : The interleaving of the index block(s) between the meta blocks is done to
     111             : match RocksDB/LevelDB behavior.
     112             : 
     113             : Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
     114             : 4 byte checksum. The checksum is computed over the compressed data and the first
     115             : byte of the trailer (i.e. the block type), and is serialized as little-endian.
     116             : The block type gives the per-block compression used; each block is compressed
     117             : independently. The checksum algorithm is described in the pebble/crc package.
     118             : 
     119             : Most blocks, other than the meta filter block, value blocks and meta value
     120             : index block, contain key/value pairs. The remainder of this comment refers to
     121             : the decompressed block, containing key/value pairs, which has its 5 byte
     122             : trailer stripped. The decompressed block data consists of a sequence of such
     123             : key/value entries followed by a block suffix. Each key is encoded as a shared
     124             : prefix length and a remainder string. For example, if two adjacent keys are
     125             : "tweedledee" and "tweedledum", then the second key would be encoded as {8,
     126             : "um"}. The shared prefix length is varint encoded. The remainder string and the
     127             : value are encoded as a varint-encoded length followed by the literal contents.
     128             : To continue the example, suppose that the key "tweedledum" mapped to the value
     129             : "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
     130             : 
     131             : Every block has a restart interval I. Every I'th key/value entry in that block
     132             : is called a restart point, and shares no key prefix with the previous entry.
     133             : Continuing the example above, if the key after "tweedledum" was "two", but was
     134             : part of a restart point, then that key would be encoded as {0, "two"} instead
     135             : of {2, "o"}. If a block has P restart points, then the block suffix consists
     136             : of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
     137             : uint32 values are the block offsets of each restart point. The final uint32
     138             : value is P itself. Thus, when seeking for a particular key, one can use binary
     139             : search to find the largest restart point whose key is <= the key sought.
     140             : 
     141             : An index block is a block with N key/value entries. The i'th value is the
     142             : encoded block handle of the i'th data block. The i'th key is a separator for
     143             : i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
     144             : is a key that is >= every key in block i and is < every key i block i+1. The
     145             : successor for the final block is a key that is >= every key in block N-1. The
     146             : index block restart interval is 1: every entry is a restart point.
     147             : 
     148             : A block handle is an offset, a length, and optional block properties (for data
     149             : blocks and first/lower level index blocks); the length does not include the 5
     150             : byte trailer. All numbers are varint-encoded, with no padding between the two
     151             : values. The maximum size of an encoded block handle without properties is 20
     152             : bytes. It is not advised to have properties that accumulate to be longer than
     153             : 100 bytes.
     154             : 
     155             : Instead of a single index block, the sstable can have a two-level index (this
     156             : is used to prevent a single huge index block). A two-level index consists of a
     157             : sequence of lower-level index blocks with block handles for data blocks
     158             : followed by a single top-level index block with block handles for the
     159             : lower-level index blocks.
     160             : 
     161             : The metaindex block also contains block handles as values, with keys being
     162             : the names of the meta blocks.
     163             : 
     164             : For a description of value blocks and the meta value index block, see
     165             : value_block.go.
     166             : 
     167             : Data blocks have some additional features:
     168             : - For TableFormatPebblev3 onwards:
     169             :   - For SETs, the value has a 1 byte value prefix, which indicates whether the
     170             :     value is inline, or in a separate value block, and indicates whether the
     171             :     prefix of the userkey (as defined by split) has changed or not. See
     172             :     value_block.go for details.
     173             :   - The most significant bit of the restart points is used to indicate whether
     174             :     userkey prefix has changed since the last restart point. See the detailed
     175             :     comment in blockWriter.
     176             :   - The maximum length of the "shared prefix" when encoding the key, is the
     177             :     length of the prefix of the userkey (as defined by split) of the previous
     178             :     key.
     179             : 
     180             : - For TableFormatPebblev4 onwards:
     181             :   - The key kinds may be altered to set the
     182             :     InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
     183             :     in the context of that sstable (for a reader that reads at a higher seqnum
     184             :     than the highest seqnum in the sstable). For details, see the comment in
     185             :     format.go.
     186             : */
     187             : 
     188             : const (
     189             :         blockHandleMaxLenWithoutProperties = 10 + 10
     190             :         // blockHandleLikelyMaxLen can be used for pre-allocating buffers to
     191             :         // reduce memory copies. It is not guaranteed that a block handle will not
     192             :         // exceed this length.
     193             :         blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100
     194             : 
     195             :         levelDBFooterLen   = 48
     196             :         levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
     197             :         levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
     198             : 
     199             :         rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
     200             :         rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
     201             :         rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
     202             :         rocksDBVersionOffset         = rocksDBMagicOffset - 4
     203             :         rocksDBExternalFormatVersion = 2
     204             : 
     205             :         pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳
     206             : 
     207             :         minFooterLen = levelDBFooterLen
     208             :         maxFooterLen = rocksDBFooterLen
     209             : 
     210             :         levelDBFormatVersion  = 0
     211             :         rocksDBFormatVersion2 = 2
     212             : 
     213             :         metaRangeKeyName   = "pebble.range_key"
     214             :         metaValueIndexName = "pebble.value_index"
     215             :         metaPropertiesName = "rocksdb.properties"
     216             :         metaRangeDelV1Name = "rocksdb.range_del"
     217             :         metaRangeDelV2Name = "rocksdb.range_del2"
     218             : 
     219             :         // Index Types.
     220             :         // A space efficient index block that is optimized for binary-search-based
     221             :         // index.
     222             :         binarySearchIndex = 0
     223             :         // hashSearchIndex               = 1
     224             :         // A two-level index implementation. Both levels are binary search indexes.
     225             :         twoLevelIndex = 2
     226             :         // binarySearchWithFirstKeyIndex = 3
     227             : 
     228             :         // RocksDB always includes this in the properties block. Since Pebble
     229             :         // doesn't use zstd compression, the string will always be the same.
     230             :         // This should be removed if we ever decide to diverge from the RocksDB
     231             :         // properties block.
     232             :         rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
     233             : )
     234             : 
     235             : // legacy (LevelDB) footer format:
     236             : //
     237             : //      metaindex handle (varint64 offset, varint64 size)
     238             : //      index handle     (varint64 offset, varint64 size)
     239             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
     240             : //      table_magic_number (8 bytes)
     241             : //
     242             : // new (RocksDB) footer format:
     243             : //
     244             : //      checksum type (char, 1 byte)
     245             : //      metaindex handle (varint64 offset, varint64 size)
     246             : //      index handle     (varint64 offset, varint64 size)
     247             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
     248             : //      footer version (4 bytes)
     249             : //      table_magic_number (8 bytes)
     250             : type footer struct {
     251             :         format      TableFormat
     252             :         checksum    block.ChecksumType
     253             :         metaindexBH block.Handle
     254             :         indexBH     block.Handle
     255             :         footerBH    block.Handle
     256             : }
     257             : 
     258             : // TODO(sumeer): should the threshold be configurable.
     259             : const slowReadTracingThreshold = 5 * time.Millisecond
     260             : 
     261             : // readHandle is optional.
     262             : func readFooter(
     263             :         ctx context.Context,
     264             :         f objstorage.Readable,
     265             :         readHandle objstorage.ReadHandle,
     266             :         logger base.LoggerAndTracer,
     267             :         fileNum base.DiskFileNum,
     268           2 : ) (footer, error) {
     269           2 :         size := f.Size()
     270           2 :         if size < minFooterLen {
     271           1 :                 return footer{}, base.CorruptionErrorf("pebble/table: invalid table %s (file size is too small)", errors.Safe(fileNum))
     272           1 :         }
     273             : 
     274           2 :         buf := make([]byte, maxFooterLen)
     275           2 :         off := size - maxFooterLen
     276           2 :         if off < 0 {
     277           1 :                 off = 0
     278           1 :                 buf = buf[:size]
     279           1 :         }
     280           2 :         readStopwatch := makeStopwatch()
     281           2 :         var err error
     282           2 :         if readHandle != nil {
     283           2 :                 err = readHandle.ReadAt(ctx, buf, off)
     284           2 :         } else {
     285           1 :                 err = f.ReadAt(ctx, buf, off)
     286           1 :         }
     287           2 :         readDuration := readStopwatch.stop()
     288           2 :         // Call IsTracingEnabled to avoid the allocations of boxing integers into an
     289           2 :         // interface{}, unless necessary.
     290           2 :         if readDuration >= slowReadTracingThreshold && logger.IsTracingEnabled(ctx) {
     291           1 :                 logger.Eventf(ctx, "reading footer of %d bytes took %s",
     292           1 :                         len(buf), readDuration.String())
     293           1 :         }
     294           2 :         if err != nil {
     295           1 :                 return footer{}, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
     296           1 :         }
     297           2 :         foot, err := parseFooter(buf, off, size)
     298           2 :         if err != nil {
     299           1 :                 return footer{}, errors.Wrapf(err, "pebble/table: invalid table %s", errors.Safe(fileNum))
     300           1 :         }
     301           2 :         return foot, nil
     302             : }
     303             : 
     304           2 : func parseFooter(buf []byte, off, size int64) (footer, error) {
     305           2 :         var footer footer
     306           2 :         switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
     307           1 :         case levelDBMagic:
     308           1 :                 if len(buf) < levelDBFooterLen {
     309           0 :                         return footer, base.CorruptionErrorf("(footer too short): %d", errors.Safe(len(buf)))
     310           0 :                 }
     311           1 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
     312           1 :                 buf = buf[len(buf)-levelDBFooterLen:]
     313           1 :                 footer.footerBH.Length = uint64(len(buf))
     314           1 :                 footer.format = TableFormatLevelDB
     315           1 :                 footer.checksum = block.ChecksumTypeCRC32c
     316             : 
     317           2 :         case rocksDBMagic, pebbleDBMagic:
     318           2 :                 // NOTE: The Pebble magic string implies the same footer format as that used
     319           2 :                 // by the RocksDBv2 table format.
     320           2 :                 if len(buf) < rocksDBFooterLen {
     321           1 :                         return footer, base.CorruptionErrorf("(footer too short): %d", errors.Safe(len(buf)))
     322           1 :                 }
     323           2 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
     324           2 :                 buf = buf[len(buf)-rocksDBFooterLen:]
     325           2 :                 footer.footerBH.Length = uint64(len(buf))
     326           2 :                 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
     327           2 : 
     328           2 :                 format, err := parseTableFormat(magic, version)
     329           2 :                 if err != nil {
     330           0 :                         return footer, err
     331           0 :                 }
     332           2 :                 footer.format = format
     333           2 : 
     334           2 :                 switch block.ChecksumType(buf[0]) {
     335           2 :                 case block.ChecksumTypeCRC32c:
     336           2 :                         footer.checksum = block.ChecksumTypeCRC32c
     337           1 :                 case block.ChecksumTypeXXHash64:
     338           1 :                         footer.checksum = block.ChecksumTypeXXHash64
     339           1 :                 default:
     340           1 :                         return footer, base.CorruptionErrorf("(unsupported checksum type %d)", errors.Safe(footer.checksum))
     341             :                 }
     342           2 :                 buf = buf[1:]
     343             : 
     344           1 :         default:
     345           1 :                 return footer, base.CorruptionErrorf("(bad magic number: 0x%x)", magic)
     346             :         }
     347             : 
     348           2 :         {
     349           2 :                 end := uint64(size)
     350           2 :                 var n int
     351           2 :                 footer.metaindexBH, n = block.DecodeHandle(buf)
     352           2 :                 if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
     353           1 :                         return footer, base.CorruptionErrorf("(bad metaindex block handle)")
     354           1 :                 }
     355           2 :                 buf = buf[n:]
     356           2 : 
     357           2 :                 footer.indexBH, n = block.DecodeHandle(buf)
     358           2 :                 if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
     359           0 :                         return footer, base.CorruptionErrorf("(bad index block handle)")
     360           0 :                 }
     361             :         }
     362             : 
     363           2 :         return footer, nil
     364             : }
     365             : 
     366           2 : func (f footer) encode(buf []byte) []byte {
     367           2 :         switch magic, version := f.format.AsTuple(); magic {
     368           1 :         case levelDBMagic:
     369           1 :                 buf = buf[:levelDBFooterLen]
     370           1 :                 clear(buf)
     371           1 :                 n := f.metaindexBH.EncodeVarints(buf[0:])
     372           1 :                 f.indexBH.EncodeVarints(buf[n:])
     373           1 :                 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
     374             : 
     375           2 :         case rocksDBMagic, pebbleDBMagic:
     376           2 :                 buf = buf[:rocksDBFooterLen]
     377           2 :                 clear(buf)
     378           2 :                 switch f.checksum {
     379           1 :                 case block.ChecksumTypeNone:
     380           1 :                         buf[0] = byte(block.ChecksumTypeNone)
     381           2 :                 case block.ChecksumTypeCRC32c:
     382           2 :                         buf[0] = byte(block.ChecksumTypeCRC32c)
     383           1 :                 case block.ChecksumTypeXXHash:
     384           1 :                         buf[0] = byte(block.ChecksumTypeXXHash)
     385           1 :                 case block.ChecksumTypeXXHash64:
     386           1 :                         buf[0] = byte(block.ChecksumTypeXXHash64)
     387           0 :                 default:
     388           0 :                         panic("unknown checksum type")
     389             :                 }
     390           2 :                 n := 1
     391           2 :                 n += f.metaindexBH.EncodeVarints(buf[n:])
     392           2 :                 n += f.indexBH.EncodeVarints(buf[n:])
     393           2 :                 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
     394           2 :                 copy(buf[len(buf)-len(rocksDBMagic):], magic)
     395             : 
     396           0 :         default:
     397           0 :                 panic("sstable: unspecified table format version")
     398             :         }
     399             : 
     400           2 :         return buf
     401             : }
     402             : 
     403           2 : func supportsTwoLevelIndex(format TableFormat) bool {
     404           2 :         switch format {
     405           1 :         case TableFormatLevelDB:
     406           1 :                 return false
     407           2 :         case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4, TableFormatPebblev5:
     408           2 :                 return true
     409           0 :         default:
     410           0 :                 panic("sstable: unspecified table format version")
     411             :         }
     412             : }

Generated by: LCOV version 1.14