LCOV - code coverage report
Current view: top level - pebble/sstable - table.go (source / functions) Hit Total Coverage
Test: 2024-08-03 08:16Z cda4471a - tests + meta.lcov Lines: 106 119 89.1 %
Date: 2024-08-03 08:17:19 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : // Package sstable implements readers and writers of pebble tables.
       6             : //
       7             : // Tables are either opened for reading or created for writing but not both.
       8             : //
       9             : // A reader can create iterators, which allow seeking and next/prev
      10             : // iteration. There may be multiple key/value pairs that have the same key and
      11             : // different sequence numbers.
      12             : //
      13             : // A reader can be used concurrently. Multiple goroutines can call NewIter
      14             : // concurrently, and each iterator can run concurrently with other iterators.
      15             : // However, any particular iterator should not be used concurrently, and iterators
      16             : // should not be used once a reader is closed.
      17             : //
      18             : // A writer writes key/value pairs in increasing key order, and cannot be used
      19             : // concurrently. A table cannot be read until the writer has finished.
      20             : //
      21             : // Readers and writers can be created with various options. Passing a nil
      22             : // Options pointer is valid and means to use the default values.
      23             : //
      24             : // One such option is to define the 'less than' ordering for keys. The default
      25             : // Comparer uses the natural ordering consistent with bytes.Compare. The same
      26             : // ordering should be used for reading and writing a table.
      27             : //
      28             : // To return the value for a key:
      29             : //
      30             : //      r := table.NewReader(file, options)
      31             : //      defer r.Close()
      32             : //      i := r.NewIter(nil, nil)
      33             : //      defer i.Close()
      34             : //      ikey, value := r.SeekGE(key)
      35             : //      if options.Comparer.Compare(ikey.UserKey, key) != 0 {
      36             : //        // not found
      37             : //      } else {
      38             : //        // value is the first record containing key
      39             : //      }
      40             : //
      41             : // To count the number of entries in a table:
      42             : //
      43             : //      i, n := r.NewIter(nil, nil), 0
      44             : //      for key, value := i.First(); key != nil; key, value = i.Next() {
      45             : //              n++
      46             : //      }
      47             : //      if err := i.Close(); err != nil {
      48             : //              return 0, err
      49             : //      }
      50             : //      return n, nil
      51             : //
      52             : // To write a table with three entries:
      53             : //
      54             : //      w := table.NewWriter(file, options)
      55             : //      if err := w.Set([]byte("apple"), []byte("red")); err != nil {
      56             : //              w.Close()
      57             : //              return err
      58             : //      }
      59             : //      if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
      60             : //              w.Close()
      61             : //              return err
      62             : //      }
      63             : //      if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
      64             : //              w.Close()
      65             : //              return err
      66             : //      }
      67             : //      return w.Close()
      68             : package sstable // import "github.com/cockroachdb/pebble/sstable"
      69             : 
      70             : import (
      71             :         "context"
      72             :         "encoding/binary"
      73             :         "time"
      74             : 
      75             :         "github.com/cockroachdb/errors"
      76             :         "github.com/cockroachdb/pebble/internal/base"
      77             :         "github.com/cockroachdb/pebble/objstorage"
      78             :         "github.com/cockroachdb/pebble/sstable/block"
      79             : )
      80             : 
      81             : /*
      82             : The table file format looks like:
      83             : 
      84             : <start_of_file>
      85             : [data block 0]
      86             : [data block 1]
      87             : ...
      88             : [data block N-1]
      89             : [meta filter block] (optional)
      90             : [index block] (for single level index)
      91             : [meta rangedel block] (optional)
      92             : [meta range key block] (optional)
      93             : [value block 0] (optional)
      94             : [value block M-1] (optional)
      95             : [meta value index block] (optional)
      96             : [meta properties block]
      97             : [metaindex block]
      98             : [footer]
      99             : <end_of_file>
     100             : 
     101             : A Reader eagerly loads the footer, metaindex block and meta properties block,
     102             : because the data contained in those blocks is needed on every read, and even
     103             : before reading. For example, the meta properties block is used to verify the
     104             : comparer and merger are compatible, and the metaindex block contains the
     105             : location of the meta properties (and other meta blocks). In situations where
     106             : file system locality matters, or one wants to minimize number of read
     107             : requests when eagerly loading these blocks, having these three as a suffix
     108             : of the file is convenient.
     109             : 
     110             : The interleaving of the index block(s) between the meta blocks is done to
     111             : match RocksDB/LevelDB behavior.
     112             : 
     113             : Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
     114             : 4 byte checksum. The checksum is computed over the compressed data and the first
     115             : byte of the trailer (i.e. the block type), and is serialized as little-endian.
     116             : The block type gives the per-block compression used; each block is compressed
     117             : independently. The checksum algorithm is described in the pebble/crc package.
     118             : 
     119             : Most blocks, other than the meta filter block, value blocks and meta value
     120             : index block, contain key/value pairs. The remainder of this comment refers to
     121             : the decompressed block, containing key/value pairs, which has its 5 byte
     122             : trailer stripped. The decompressed block data consists of a sequence of such
     123             : key/value entries followed by a block suffix. Each key is encoded as a shared
     124             : prefix length and a remainder string. For example, if two adjacent keys are
     125             : "tweedledee" and "tweedledum", then the second key would be encoded as {8,
     126             : "um"}. The shared prefix length is varint encoded. The remainder string and the
     127             : value are encoded as a varint-encoded length followed by the literal contents.
     128             : To continue the example, suppose that the key "tweedledum" mapped to the value
     129             : "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
     130             : 
     131             : Every block has a restart interval I. Every I'th key/value entry in that block
     132             : is called a restart point, and shares no key prefix with the previous entry.
     133             : Continuing the example above, if the key after "tweedledum" was "two", but was
     134             : part of a restart point, then that key would be encoded as {0, "two"} instead
     135             : of {2, "o"}. If a block has P restart points, then the block suffix consists
     136             : of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
     137             : uint32 values are the block offsets of each restart point. The final uint32
     138             : value is P itself. Thus, when seeking for a particular key, one can use binary
     139             : search to find the largest restart point whose key is <= the key sought.
     140             : 
     141             : An index block is a block with N key/value entries. The i'th value is the
     142             : encoded block handle of the i'th data block. The i'th key is a separator for
     143             : i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
     144             : is a key that is >= every key in block i and is < every key i block i+1. The
     145             : successor for the final block is a key that is >= every key in block N-1. The
     146             : index block restart interval is 1: every entry is a restart point.
     147             : 
     148             : A block handle is an offset, a length, and optional block properties (for data
     149             : blocks and first/lower level index blocks); the length does not include the 5
     150             : byte trailer. All numbers are varint-encoded, with no padding between the two
     151             : values. The maximum size of an encoded block handle without properties is 20
     152             : bytes. It is not advised to have properties that accumulate to be longer than
     153             : 100 bytes.
     154             : 
     155             : Instead of a single index block, the sstable can have a two-level index (this
     156             : is used to prevent a single huge index block). A two-level index consists of a
     157             : sequence of lower-level index blocks with block handles for data blocks
     158             : followed by a single top-level index block with block handles for the
     159             : lower-level index blocks.
     160             : 
     161             : The metaindex block also contains block handles as values, with keys being
     162             : the names of the meta blocks.
     163             : 
     164             : For a description of value blocks and the meta value index block, see
     165             : value_block.go.
     166             : 
     167             : Data blocks have some additional features:
     168             : - For TableFormatPebblev3 onwards:
     169             :   - For SETs, the value has a 1 byte value prefix, which indicates whether the
     170             :     value is inline, or in a separate value block, and indicates whether the
     171             :     prefix of the userkey (as defined by split) has changed or not. See
     172             :     value_block.go for details.
     173             :   - The most significant bit of the restart points is used to indicate whether
     174             :     userkey prefix has changed since the last restart point. See the detailed
     175             :     comment in blockWriter.
     176             :   - The maximum length of the "shared prefix" when encoding the key, is the
     177             :     length of the prefix of the userkey (as defined by split) of the previous
     178             :     key.
     179             : 
     180             : - For TableFormatPebblev4 onwards:
     181             :   - The key kinds may be altered to set the
     182             :     InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
     183             :     in the context of that sstable (for a reader that reads at a higher seqnum
     184             :     than the highest seqnum in the sstable). For details, see the comment in
     185             :     format.go.
     186             : */
     187             : 
     188             : const (
     189             :         blockHandleMaxLenWithoutProperties = 10 + 10
     190             :         // blockHandleLikelyMaxLen can be used for pre-allocating buffers to
     191             :         // reduce memory copies. It is not guaranteed that a block handle will not
     192             :         // exceed this length.
     193             :         blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100
     194             : 
     195             :         levelDBFooterLen   = 48
     196             :         levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
     197             :         levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
     198             : 
     199             :         rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
     200             :         rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
     201             :         rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
     202             :         rocksDBVersionOffset         = rocksDBMagicOffset - 4
     203             :         rocksDBExternalFormatVersion = 2
     204             : 
     205             :         pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳
     206             : 
     207             :         minFooterLen = levelDBFooterLen
     208             :         maxFooterLen = rocksDBFooterLen
     209             : 
     210             :         levelDBFormatVersion  = 0
     211             :         rocksDBFormatVersion2 = 2
     212             : 
     213             :         metaRangeKeyName   = "pebble.range_key"
     214             :         metaValueIndexName = "pebble.value_index"
     215             :         metaPropertiesName = "rocksdb.properties"
     216             :         metaRangeDelV1Name = "rocksdb.range_del"
     217             :         metaRangeDelV2Name = "rocksdb.range_del2"
     218             : 
     219             :         // Index Types.
     220             :         // A space efficient index block that is optimized for binary-search-based
     221             :         // index.
     222             :         binarySearchIndex = 0
     223             :         // hashSearchIndex               = 1
     224             :         // A two-level index implementation. Both levels are binary search indexes.
     225             :         twoLevelIndex = 2
     226             :         // binarySearchWithFirstKeyIndex = 3
     227             : 
     228             :         // RocksDB always includes this in the properties block. Since Pebble
     229             :         // doesn't use zstd compression, the string will always be the same.
     230             :         // This should be removed if we ever decide to diverge from the RocksDB
     231             :         // properties block.
     232             :         rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
     233             : )
     234             : 
     235             : // legacy (LevelDB) footer format:
     236             : //
     237             : //      metaindex handle (varint64 offset, varint64 size)
     238             : //      index handle     (varint64 offset, varint64 size)
     239             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
     240             : //      table_magic_number (8 bytes)
     241             : //
     242             : // new (RocksDB) footer format:
     243             : //
     244             : //      checksum type (char, 1 byte)
     245             : //      metaindex handle (varint64 offset, varint64 size)
     246             : //      index handle     (varint64 offset, varint64 size)
     247             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
     248             : //      footer version (4 bytes)
     249             : //      table_magic_number (8 bytes)
     250             : type footer struct {
     251             :         format      TableFormat
     252             :         checksum    block.ChecksumType
     253             :         metaindexBH block.Handle
     254             :         indexBH     block.Handle
     255             :         footerBH    block.Handle
     256             : }
     257             : 
     258             : // TODO(sumeer): should the threshold be configurable.
     259             : const slowReadTracingThreshold = 5 * time.Millisecond
     260             : 
     261             : // readHandle is optional.
     262             : func readFooter(
     263             :         ctx context.Context,
     264             :         f objstorage.Readable,
     265             :         readHandle objstorage.ReadHandle,
     266             :         logger base.LoggerAndTracer,
     267             :         fileNum base.DiskFileNum,
     268           2 : ) (footer, error) {
     269           2 :         var footer footer
     270           2 :         size := f.Size()
     271           2 :         if size < minFooterLen {
     272           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table %s (file size is too small)", errors.Safe(fileNum))
     273           1 :         }
     274             : 
     275           2 :         buf := make([]byte, maxFooterLen)
     276           2 :         off := size - maxFooterLen
     277           2 :         if off < 0 {
     278           1 :                 off = 0
     279           1 :                 buf = buf[:size]
     280           1 :         }
     281           2 :         readStopwatch := makeStopwatch()
     282           2 :         var err error
     283           2 :         if readHandle != nil {
     284           2 :                 err = readHandle.ReadAt(ctx, buf, off)
     285           2 :         } else {
     286           1 :                 err = f.ReadAt(ctx, buf, off)
     287           1 :         }
     288           2 :         readDuration := readStopwatch.stop()
     289           2 :         // Call IsTracingEnabled to avoid the allocations of boxing integers into an
     290           2 :         // interface{}, unless necessary.
     291           2 :         if readDuration >= slowReadTracingThreshold && logger.IsTracingEnabled(ctx) {
     292           1 :                 logger.Eventf(ctx, "reading footer of %d bytes took %s",
     293           1 :                         len(buf), readDuration.String())
     294           1 :         }
     295           2 :         if err != nil {
     296           1 :                 return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
     297           1 :         }
     298             : 
     299           2 :         switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
     300           1 :         case levelDBMagic:
     301           1 :                 if len(buf) < levelDBFooterLen {
     302           0 :                         return footer, base.CorruptionErrorf(
     303           0 :                                 "pebble/table: invalid table %s (footer too short): %d", errors.Safe(fileNum), errors.Safe(len(buf)))
     304           0 :                 }
     305           1 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
     306           1 :                 buf = buf[len(buf)-levelDBFooterLen:]
     307           1 :                 footer.footerBH.Length = uint64(len(buf))
     308           1 :                 footer.format = TableFormatLevelDB
     309           1 :                 footer.checksum = block.ChecksumTypeCRC32c
     310             : 
     311           2 :         case rocksDBMagic, pebbleDBMagic:
     312           2 :                 // NOTE: The Pebble magic string implies the same footer format as that used
     313           2 :                 // by the RocksDBv2 table format.
     314           2 :                 if len(buf) < rocksDBFooterLen {
     315           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table %s (footer too short): %d", errors.Safe(fileNum), errors.Safe(len(buf)))
     316           1 :                 }
     317           2 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
     318           2 :                 buf = buf[len(buf)-rocksDBFooterLen:]
     319           2 :                 footer.footerBH.Length = uint64(len(buf))
     320           2 :                 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
     321           2 : 
     322           2 :                 format, err := ParseTableFormat(magic, version, fileNum)
     323           2 :                 if err != nil {
     324           0 :                         return footer, err
     325           0 :                 }
     326           2 :                 footer.format = format
     327           2 : 
     328           2 :                 switch block.ChecksumType(buf[0]) {
     329           2 :                 case block.ChecksumTypeCRC32c:
     330           2 :                         footer.checksum = block.ChecksumTypeCRC32c
     331           1 :                 case block.ChecksumTypeXXHash64:
     332           1 :                         footer.checksum = block.ChecksumTypeXXHash64
     333           1 :                 default:
     334           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table %s (unsupported checksum type %d)", errors.Safe(fileNum), errors.Safe(footer.checksum))
     335             :                 }
     336           2 :                 buf = buf[1:]
     337             : 
     338           1 :         default:
     339           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table %s (bad magic number: 0x%x)", errors.Safe(fileNum), magic)
     340             :         }
     341             : 
     342           2 :         {
     343           2 :                 end := uint64(size)
     344           2 :                 var n int
     345           2 :                 footer.metaindexBH, n = decodeBlockHandle(buf)
     346           2 :                 if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
     347           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table %s (bad metaindex block handle)", errors.Safe(fileNum))
     348           1 :                 }
     349           2 :                 buf = buf[n:]
     350           2 : 
     351           2 :                 footer.indexBH, n = decodeBlockHandle(buf)
     352           2 :                 if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
     353           0 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table %s (bad index block handle)", errors.Safe(fileNum))
     354           0 :                 }
     355             :         }
     356             : 
     357           2 :         return footer, nil
     358             : }
     359             : 
     360           2 : func (f footer) encode(buf []byte) []byte {
     361           2 :         switch magic, version := f.format.AsTuple(); magic {
     362           1 :         case levelDBMagic:
     363           1 :                 buf = buf[:levelDBFooterLen]
     364           1 :                 clear(buf)
     365           1 :                 n := encodeBlockHandle(buf[0:], f.metaindexBH)
     366           1 :                 encodeBlockHandle(buf[n:], f.indexBH)
     367           1 :                 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
     368             : 
     369           2 :         case rocksDBMagic, pebbleDBMagic:
     370           2 :                 buf = buf[:rocksDBFooterLen]
     371           2 :                 clear(buf)
     372           2 :                 switch f.checksum {
     373           1 :                 case block.ChecksumTypeNone:
     374           1 :                         buf[0] = byte(block.ChecksumTypeNone)
     375           2 :                 case block.ChecksumTypeCRC32c:
     376           2 :                         buf[0] = byte(block.ChecksumTypeCRC32c)
     377           1 :                 case block.ChecksumTypeXXHash:
     378           1 :                         buf[0] = byte(block.ChecksumTypeXXHash)
     379           1 :                 case block.ChecksumTypeXXHash64:
     380           1 :                         buf[0] = byte(block.ChecksumTypeXXHash64)
     381           0 :                 default:
     382           0 :                         panic("unknown checksum type")
     383             :                 }
     384           2 :                 n := 1
     385           2 :                 n += encodeBlockHandle(buf[n:], f.metaindexBH)
     386           2 :                 encodeBlockHandle(buf[n:], f.indexBH)
     387           2 :                 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
     388           2 :                 copy(buf[len(buf)-len(rocksDBMagic):], magic)
     389             : 
     390           0 :         default:
     391           0 :                 panic("sstable: unspecified table format version")
     392             :         }
     393             : 
     394           2 :         return buf
     395             : }
     396             : 
     397           2 : func supportsTwoLevelIndex(format TableFormat) bool {
     398           2 :         switch format {
     399           1 :         case TableFormatLevelDB:
     400           1 :                 return false
     401           2 :         case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4:
     402           2 :                 return true
     403           0 :         default:
     404           0 :                 panic("sstable: unspecified table format version")
     405             :         }
     406             : }

Generated by: LCOV version 1.14