LCOV - code coverage report
Current view: top level - pebble/sstable - table.go (source / functions) Hit Total Coverage
Test: 2024-07-16 08:16Z b6c49f44 - tests only.lcov Lines: 112 139 80.6 %
Date: 2024-07-16 08:17:23 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : // Package sstable implements readers and writers of pebble tables.
       6             : //
       7             : // Tables are either opened for reading or created for writing but not both.
       8             : //
       9             : // A reader can create iterators, which allow seeking and next/prev
      10             : // iteration. There may be multiple key/value pairs that have the same key and
      11             : // different sequence numbers.
      12             : //
      13             : // A reader can be used concurrently. Multiple goroutines can call NewIter
      14             : // concurrently, and each iterator can run concurrently with other iterators.
      15             : // However, any particular iterator should not be used concurrently, and iterators
      16             : // should not be used once a reader is closed.
      17             : //
      18             : // A writer writes key/value pairs in increasing key order, and cannot be used
      19             : // concurrently. A table cannot be read until the writer has finished.
      20             : //
      21             : // Readers and writers can be created with various options. Passing a nil
      22             : // Options pointer is valid and means to use the default values.
      23             : //
      24             : // One such option is to define the 'less than' ordering for keys. The default
      25             : // Comparer uses the natural ordering consistent with bytes.Compare. The same
      26             : // ordering should be used for reading and writing a table.
      27             : //
      28             : // To return the value for a key:
      29             : //
      30             : //      r := table.NewReader(file, options)
      31             : //      defer r.Close()
      32             : //      i := r.NewIter(nil, nil)
      33             : //      defer i.Close()
      34             : //      ikey, value := r.SeekGE(key)
      35             : //      if options.Comparer.Compare(ikey.UserKey, key) != 0 {
      36             : //        // not found
      37             : //      } else {
      38             : //        // value is the first record containing key
      39             : //      }
      40             : //
      41             : // To count the number of entries in a table:
      42             : //
      43             : //      i, n := r.NewIter(nil, nil), 0
      44             : //      for key, value := i.First(); key != nil; key, value = i.Next() {
      45             : //              n++
      46             : //      }
      47             : //      if err := i.Close(); err != nil {
      48             : //              return 0, err
      49             : //      }
      50             : //      return n, nil
      51             : //
      52             : // To write a table with three entries:
      53             : //
      54             : //      w := table.NewWriter(file, options)
      55             : //      if err := w.Set([]byte("apple"), []byte("red")); err != nil {
      56             : //              w.Close()
      57             : //              return err
      58             : //      }
      59             : //      if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
      60             : //              w.Close()
      61             : //              return err
      62             : //      }
      63             : //      if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
      64             : //              w.Close()
      65             : //              return err
      66             : //      }
      67             : //      return w.Close()
      68             : package sstable // import "github.com/cockroachdb/pebble/sstable"
      69             : 
      70             : import (
      71             :         "context"
      72             :         "encoding/binary"
      73             :         "time"
      74             : 
      75             :         "github.com/cockroachdb/errors"
      76             :         "github.com/cockroachdb/pebble/internal/base"
      77             :         "github.com/cockroachdb/pebble/objstorage"
      78             :         "github.com/cockroachdb/pebble/sstable/block"
      79             : )
      80             : 
      81             : /*
      82             : The table file format looks like:
      83             : 
      84             : <start_of_file>
      85             : [data block 0]
      86             : [data block 1]
      87             : ...
      88             : [data block N-1]
      89             : [meta filter block] (optional)
      90             : [index block] (for single level index)
      91             : [meta rangedel block] (optional)
      92             : [meta range key block] (optional)
      93             : [value block 0] (optional)
      94             : [value block M-1] (optional)
      95             : [meta value index block] (optional)
      96             : [meta properties block]
      97             : [metaindex block]
      98             : [footer]
      99             : <end_of_file>
     100             : 
     101             : A Reader eagerly loads the footer, metaindex block and meta properties block,
     102             : because the data contained in those blocks is needed on every read, and even
     103             : before reading. For example, the meta properties block is used to verify the
     104             : comparer and merger are compatible, and the metaindex block contains the
     105             : location of the meta properties (and other meta blocks). In situations where
     106             : file system locality matters, or one wants to minimize number of read
     107             : requests when eagerly loading these blocks, having these three as a suffix
     108             : of the file is convenient.
     109             : 
     110             : The interleaving of the index block(s) between the meta blocks is done to
     111             : match RocksDB/LevelDB behavior.
     112             : 
     113             : Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
     114             : 4 byte checksum. The checksum is computed over the compressed data and the first
     115             : byte of the trailer (i.e. the block type), and is serialized as little-endian.
     116             : The block type gives the per-block compression used; each block is compressed
     117             : independently. The checksum algorithm is described in the pebble/crc package.
     118             : 
     119             : Most blocks, other than the meta filter block, value blocks and meta value
     120             : index block, contain key/value pairs. The remainder of this comment refers to
     121             : the decompressed block, containing key/value pairs, which has its 5 byte
     122             : trailer stripped. The decompressed block data consists of a sequence of such
     123             : key/value entries followed by a block suffix. Each key is encoded as a shared
     124             : prefix length and a remainder string. For example, if two adjacent keys are
     125             : "tweedledee" and "tweedledum", then the second key would be encoded as {8,
     126             : "um"}. The shared prefix length is varint encoded. The remainder string and the
     127             : value are encoded as a varint-encoded length followed by the literal contents.
     128             : To continue the example, suppose that the key "tweedledum" mapped to the value
     129             : "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
     130             : 
     131             : Every block has a restart interval I. Every I'th key/value entry in that block
     132             : is called a restart point, and shares no key prefix with the previous entry.
     133             : Continuing the example above, if the key after "tweedledum" was "two", but was
     134             : part of a restart point, then that key would be encoded as {0, "two"} instead
     135             : of {2, "o"}. If a block has P restart points, then the block suffix consists
     136             : of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
     137             : uint32 values are the block offsets of each restart point. The final uint32
     138             : value is P itself. Thus, when seeking for a particular key, one can use binary
     139             : search to find the largest restart point whose key is <= the key sought.
     140             : 
     141             : An index block is a block with N key/value entries. The i'th value is the
     142             : encoded block handle of the i'th data block. The i'th key is a separator for
     143             : i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
     144             : is a key that is >= every key in block i and is < every key i block i+1. The
     145             : successor for the final block is a key that is >= every key in block N-1. The
     146             : index block restart interval is 1: every entry is a restart point.
     147             : 
     148             : A block handle is an offset, a length, and optional block properties (for data
     149             : blocks and first/lower level index blocks); the length does not include the 5
     150             : byte trailer. All numbers are varint-encoded, with no padding between the two
     151             : values. The maximum size of an encoded block handle without properties is 20
     152             : bytes. It is not advised to have properties that accumulate to be longer than
     153             : 100 bytes.
     154             : 
     155             : Instead of a single index block, the sstable can have a two-level index (this
     156             : is used to prevent a single huge index block). A two-level index consists of a
     157             : sequence of lower-level index blocks with block handles for data blocks
     158             : followed by a single top-level index block with block handles for the
     159             : lower-level index blocks.
     160             : 
     161             : The metaindex block also contains block handles as values, with keys being
     162             : the names of the meta blocks.
     163             : 
     164             : For a description of value blocks and the meta value index block, see
     165             : value_block.go.
     166             : 
     167             : Data blocks have some additional features:
     168             : - For TableFormatPebblev3 onwards:
     169             :   - For SETs, the value has a 1 byte value prefix, which indicates whether the
     170             :     value is inline, or in a separate value block, and indicates whether the
     171             :     prefix of the userkey (as defined by split) has changed or not. See
     172             :     value_block.go for details.
     173             :   - The most significant bit of the restart points is used to indicate whether
     174             :     userkey prefix has changed since the last restart point. See the detailed
     175             :     comment in blockWriter.
     176             :   - The maximum length of the "shared prefix" when encoding the key, is the
     177             :     length of the prefix of the userkey (as defined by split) of the previous
     178             :     key.
     179             : 
     180             : - For TableFormatPebblev4 onwards:
     181             :   - The key kinds may be altered to set the
     182             :     InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
     183             :     in the context of that sstable (for a reader that reads at a higher seqnum
     184             :     than the highest seqnum in the sstable). For details, see the comment in
     185             :     format.go.
     186             : */
     187             : 
     188             : const (
     189             :         blockHandleMaxLenWithoutProperties = 10 + 10
     190             :         // blockHandleLikelyMaxLen can be used for pre-allocating buffers to
     191             :         // reduce memory copies. It is not guaranteed that a block handle will not
     192             :         // exceed this length.
     193             :         blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100
     194             : 
     195             :         levelDBFooterLen   = 48
     196             :         levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
     197             :         levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
     198             : 
     199             :         rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
     200             :         rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
     201             :         rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
     202             :         rocksDBVersionOffset         = rocksDBMagicOffset - 4
     203             :         rocksDBExternalFormatVersion = 2
     204             : 
     205             :         pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳
     206             : 
     207             :         minFooterLen = levelDBFooterLen
     208             :         maxFooterLen = rocksDBFooterLen
     209             : 
     210             :         levelDBFormatVersion  = 0
     211             :         rocksDBFormatVersion2 = 2
     212             : 
     213             :         metaRangeKeyName   = "pebble.range_key"
     214             :         metaValueIndexName = "pebble.value_index"
     215             :         metaPropertiesName = "rocksdb.properties"
     216             :         metaRangeDelV1Name = "rocksdb.range_del"
     217             :         metaRangeDelV2Name = "rocksdb.range_del2"
     218             : 
     219             :         // Index Types.
     220             :         // A space efficient index block that is optimized for binary-search-based
     221             :         // index.
     222             :         binarySearchIndex = 0
     223             :         // hashSearchIndex               = 1
     224             :         // A two-level index implementation. Both levels are binary search indexes.
     225             :         twoLevelIndex = 2
     226             :         // binarySearchWithFirstKeyIndex = 3
     227             : 
     228             :         // RocksDB always includes this in the properties block. Since Pebble
     229             :         // doesn't use zstd compression, the string will always be the same.
     230             :         // This should be removed if we ever decide to diverge from the RocksDB
     231             :         // properties block.
     232             :         rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
     233             : )
     234             : 
     235             : type blockType byte
     236             : 
     237             : const (
     238             :         // The block type gives the per-block compression format.
     239             :         // These constants are part of the file format and should not be changed.
     240             :         // They are different from the Compression constants because the latter
     241             :         // are designed so that the zero value of the Compression type means to
     242             :         // use the default compression (which is snappy).
     243             :         // Not all compression types listed here are supported.
     244             :         noCompressionBlockType     blockType = 0
     245             :         snappyCompressionBlockType blockType = 1
     246             :         zlibCompressionBlockType   blockType = 2
     247             :         bzip2CompressionBlockType  blockType = 3
     248             :         lz4CompressionBlockType    blockType = 4
     249             :         lz4hcCompressionBlockType  blockType = 5
     250             :         xpressCompressionBlockType blockType = 6
     251             :         zstdCompressionBlockType   blockType = 7
     252             : )
     253             : 
     254             : // String implements fmt.Stringer.
     255           1 : func (t blockType) String() string {
     256           1 :         switch t {
     257           1 :         case 0:
     258           1 :                 return "none"
     259           1 :         case 1:
     260           1 :                 return "snappy"
     261           0 :         case 2:
     262           0 :                 return "zlib"
     263           0 :         case 3:
     264           0 :                 return "bzip2"
     265           0 :         case 4:
     266           0 :                 return "lz4"
     267           0 :         case 5:
     268           0 :                 return "lz4hc"
     269           0 :         case 6:
     270           0 :                 return "xpress"
     271           0 :         case 7:
     272           0 :                 return "zstd"
     273           0 :         default:
     274           0 :                 panic(errors.Newf("sstable: unknown block type: %d", t))
     275             :         }
     276             : }
     277             : 
     278             : // legacy (LevelDB) footer format:
     279             : //
     280             : //      metaindex handle (varint64 offset, varint64 size)
     281             : //      index handle     (varint64 offset, varint64 size)
     282             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
     283             : //      table_magic_number (8 bytes)
     284             : //
     285             : // new (RocksDB) footer format:
     286             : //
     287             : //      checksum type (char, 1 byte)
     288             : //      metaindex handle (varint64 offset, varint64 size)
     289             : //      index handle     (varint64 offset, varint64 size)
     290             : //      <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
     291             : //      footer version (4 bytes)
     292             : //      table_magic_number (8 bytes)
     293             : type footer struct {
     294             :         format      TableFormat
     295             :         checksum    block.ChecksumType
     296             :         metaindexBH block.Handle
     297             :         indexBH     block.Handle
     298             :         footerBH    block.Handle
     299             : }
     300             : 
     301             : // TODO(sumeer): should the threshold be configurable.
     302             : const slowReadTracingThreshold = 5 * time.Millisecond
     303             : 
     304             : // readHandle is optional.
     305             : func readFooter(
     306             :         ctx context.Context,
     307             :         f objstorage.Readable,
     308             :         readHandle objstorage.ReadHandle,
     309             :         logger base.LoggerAndTracer,
     310           1 : ) (footer, error) {
     311           1 :         var footer footer
     312           1 :         size := f.Size()
     313           1 :         if size < minFooterLen {
     314           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table (file size is too small)")
     315           1 :         }
     316             : 
     317           1 :         buf := make([]byte, maxFooterLen)
     318           1 :         off := size - maxFooterLen
     319           1 :         if off < 0 {
     320           1 :                 off = 0
     321           1 :                 buf = buf[:size]
     322           1 :         }
     323           1 :         readStopwatch := makeStopwatch()
     324           1 :         var err error
     325           1 :         if readHandle != nil {
     326           1 :                 err = readHandle.ReadAt(ctx, buf, off)
     327           1 :         } else {
     328           1 :                 err = f.ReadAt(ctx, buf, off)
     329           1 :         }
     330           1 :         readDuration := readStopwatch.stop()
     331           1 :         // Call IsTracingEnabled to avoid the allocations of boxing integers into an
     332           1 :         // interface{}, unless necessary.
     333           1 :         if readDuration >= slowReadTracingThreshold && logger.IsTracingEnabled(ctx) {
     334           1 :                 logger.Eventf(ctx, "reading %d bytes took %s",
     335           1 :                         len(buf), readDuration.String())
     336           1 :         }
     337           1 :         if err != nil {
     338           1 :                 return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
     339           1 :         }
     340             : 
     341           1 :         switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
     342           1 :         case levelDBMagic:
     343           1 :                 if len(buf) < levelDBFooterLen {
     344           0 :                         return footer, base.CorruptionErrorf(
     345           0 :                                 "pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
     346           0 :                 }
     347           1 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
     348           1 :                 buf = buf[len(buf)-levelDBFooterLen:]
     349           1 :                 footer.footerBH.Length = uint64(len(buf))
     350           1 :                 footer.format = TableFormatLevelDB
     351           1 :                 footer.checksum = block.ChecksumTypeCRC32c
     352             : 
     353           1 :         case rocksDBMagic, pebbleDBMagic:
     354           1 :                 // NOTE: The Pebble magic string implies the same footer format as that used
     355           1 :                 // by the RocksDBv2 table format.
     356           1 :                 if len(buf) < rocksDBFooterLen {
     357           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
     358           1 :                 }
     359           1 :                 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
     360           1 :                 buf = buf[len(buf)-rocksDBFooterLen:]
     361           1 :                 footer.footerBH.Length = uint64(len(buf))
     362           1 :                 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
     363           1 : 
     364           1 :                 format, err := ParseTableFormat(magic, version)
     365           1 :                 if err != nil {
     366           0 :                         return footer, err
     367           0 :                 }
     368           1 :                 footer.format = format
     369           1 : 
     370           1 :                 switch block.ChecksumType(buf[0]) {
     371           1 :                 case block.ChecksumTypeCRC32c:
     372           1 :                         footer.checksum = block.ChecksumTypeCRC32c
     373           1 :                 case block.ChecksumTypeXXHash64:
     374           1 :                         footer.checksum = block.ChecksumTypeXXHash64
     375           1 :                 default:
     376           1 :                         return footer, base.CorruptionErrorf("pebble/table: unsupported checksum type %d", errors.Safe(footer.checksum))
     377             :                 }
     378           1 :                 buf = buf[1:]
     379             : 
     380           1 :         default:
     381           1 :                 return footer, base.CorruptionErrorf("pebble/table: invalid table (bad magic number: 0x%x)", magic)
     382             :         }
     383             : 
     384           1 :         {
     385           1 :                 end := uint64(size)
     386           1 :                 var n int
     387           1 :                 footer.metaindexBH, n = decodeBlockHandle(buf)
     388           1 :                 if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
     389           1 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (bad metaindex block handle)")
     390           1 :                 }
     391           1 :                 buf = buf[n:]
     392           1 : 
     393           1 :                 footer.indexBH, n = decodeBlockHandle(buf)
     394           1 :                 if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
     395           0 :                         return footer, base.CorruptionErrorf("pebble/table: invalid table (bad index block handle)")
     396           0 :                 }
     397             :         }
     398             : 
     399           1 :         return footer, nil
     400             : }
     401             : 
     402           1 : func (f footer) encode(buf []byte) []byte {
     403           1 :         switch magic, version := f.format.AsTuple(); magic {
     404           1 :         case levelDBMagic:
     405           1 :                 buf = buf[:levelDBFooterLen]
     406           1 :                 clear(buf)
     407           1 :                 n := encodeBlockHandle(buf[0:], f.metaindexBH)
     408           1 :                 encodeBlockHandle(buf[n:], f.indexBH)
     409           1 :                 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
     410             : 
     411           1 :         case rocksDBMagic, pebbleDBMagic:
     412           1 :                 buf = buf[:rocksDBFooterLen]
     413           1 :                 clear(buf)
     414           1 :                 switch f.checksum {
     415           1 :                 case block.ChecksumTypeNone:
     416           1 :                         buf[0] = byte(block.ChecksumTypeNone)
     417           1 :                 case block.ChecksumTypeCRC32c:
     418           1 :                         buf[0] = byte(block.ChecksumTypeCRC32c)
     419           1 :                 case block.ChecksumTypeXXHash:
     420           1 :                         buf[0] = byte(block.ChecksumTypeXXHash)
     421           1 :                 case block.ChecksumTypeXXHash64:
     422           1 :                         buf[0] = byte(block.ChecksumTypeXXHash64)
     423           0 :                 default:
     424           0 :                         panic("unknown checksum type")
     425             :                 }
     426           1 :                 n := 1
     427           1 :                 n += encodeBlockHandle(buf[n:], f.metaindexBH)
     428           1 :                 encodeBlockHandle(buf[n:], f.indexBH)
     429           1 :                 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
     430           1 :                 copy(buf[len(buf)-len(rocksDBMagic):], magic)
     431             : 
     432           0 :         default:
     433           0 :                 panic("sstable: unspecified table format version")
     434             :         }
     435             : 
     436           1 :         return buf
     437             : }
     438             : 
     439           1 : func supportsTwoLevelIndex(format TableFormat) bool {
     440           1 :         switch format {
     441           1 :         case TableFormatLevelDB:
     442           1 :                 return false
     443           1 :         case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4:
     444           1 :                 return true
     445           0 :         default:
     446           0 :                 panic("sstable: unspecified table format version")
     447             :         }
     448             : }

Generated by: LCOV version 1.14