Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : // Package sstable implements readers and writers of pebble tables.
6 : //
7 : // Tables are either opened for reading or created for writing but not both.
8 : //
9 : // A reader can create iterators, which allow seeking and next/prev
10 : // iteration. There may be multiple key/value pairs that have the same key and
11 : // different sequence numbers.
12 : //
13 : // A reader can be used concurrently. Multiple goroutines can call NewIter
14 : // concurrently, and each iterator can run concurrently with other iterators.
15 : // However, any particular iterator should not be used concurrently, and iterators
16 : // should not be used once a reader is closed.
17 : //
18 : // A writer writes key/value pairs in increasing key order, and cannot be used
19 : // concurrently. A table cannot be read until the writer has finished.
20 : //
21 : // Readers and writers can be created with various options. Passing a nil
22 : // Options pointer is valid and means to use the default values.
23 : //
24 : // One such option is to define the 'less than' ordering for keys. The default
25 : // Comparer uses the natural ordering consistent with bytes.Compare. The same
26 : // ordering should be used for reading and writing a table.
27 : //
28 : // To return the value for a key:
29 : //
30 : // r := table.NewReader(file, options)
31 : // defer r.Close()
32 : // i := r.NewIter(nil, nil)
33 : // defer i.Close()
34 : // ikey, value := r.SeekGE(key)
35 : // if options.Comparer.Compare(ikey.UserKey, key) != 0 {
36 : // // not found
37 : // } else {
38 : // // value is the first record containing key
39 : // }
40 : //
41 : // To count the number of entries in a table:
42 : //
43 : // i, n := r.NewIter(nil, nil), 0
44 : // for key, value := i.First(); key != nil; key, value = i.Next() {
45 : // n++
46 : // }
47 : // if err := i.Close(); err != nil {
48 : // return 0, err
49 : // }
50 : // return n, nil
51 : //
52 : // To write a table with three entries:
53 : //
54 : // w := table.NewWriter(file, options)
55 : // if err := w.Set([]byte("apple"), []byte("red")); err != nil {
56 : // w.Close()
57 : // return err
58 : // }
59 : // if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
60 : // w.Close()
61 : // return err
62 : // }
63 : // if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
64 : // w.Close()
65 : // return err
66 : // }
67 : // return w.Close()
68 : package sstable // import "github.com/cockroachdb/pebble/sstable"
69 :
70 : import (
71 : "context"
72 : "encoding/binary"
73 : "time"
74 :
75 : "github.com/cockroachdb/errors"
76 : "github.com/cockroachdb/pebble/internal/base"
77 : "github.com/cockroachdb/pebble/objstorage"
78 : "github.com/cockroachdb/pebble/sstable/block"
79 : )
80 :
81 : /*
82 : The table file format looks like:
83 :
84 : <start_of_file>
85 : [data block 0]
86 : [data block 1]
87 : ...
88 : [data block N-1]
89 : [meta filter block] (optional)
90 : [index block] (for single level index)
91 : [meta rangedel block] (optional)
92 : [meta range key block] (optional)
93 : [value block 0] (optional)
94 : [value block M-1] (optional)
95 : [meta value index block] (optional)
96 : [meta properties block]
97 : [metaindex block]
98 : [footer]
99 : <end_of_file>
100 :
101 : A Reader eagerly loads the footer, metaindex block and meta properties block,
102 : because the data contained in those blocks is needed on every read, and even
103 : before reading. For example, the meta properties block is used to verify the
104 : comparer and merger are compatible, and the metaindex block contains the
105 : location of the meta properties (and other meta blocks). In situations where
106 : file system locality matters, or one wants to minimize number of read
107 : requests when eagerly loading these blocks, having these three as a suffix
108 : of the file is convenient.
109 :
110 : The interleaving of the index block(s) between the meta blocks is done to
111 : match RocksDB/LevelDB behavior.
112 :
113 : Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
114 : 4 byte checksum. The checksum is computed over the compressed data and the first
115 : byte of the trailer (i.e. the block type), and is serialized as little-endian.
116 : The block type gives the per-block compression used; each block is compressed
117 : independently. The checksum algorithm is described in the pebble/crc package.
118 :
119 : Most blocks, other than the meta filter block, value blocks and meta value
120 : index block, contain key/value pairs. The remainder of this comment refers to
121 : the decompressed block, containing key/value pairs, which has its 5 byte
122 : trailer stripped. The decompressed block data consists of a sequence of such
123 : key/value entries followed by a block suffix. Each key is encoded as a shared
124 : prefix length and a remainder string. For example, if two adjacent keys are
125 : "tweedledee" and "tweedledum", then the second key would be encoded as {8,
126 : "um"}. The shared prefix length is varint encoded. The remainder string and the
127 : value are encoded as a varint-encoded length followed by the literal contents.
128 : To continue the example, suppose that the key "tweedledum" mapped to the value
129 : "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
130 :
131 : Every block has a restart interval I. Every I'th key/value entry in that block
132 : is called a restart point, and shares no key prefix with the previous entry.
133 : Continuing the example above, if the key after "tweedledum" was "two", but was
134 : part of a restart point, then that key would be encoded as {0, "two"} instead
135 : of {2, "o"}. If a block has P restart points, then the block suffix consists
136 : of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
137 : uint32 values are the block offsets of each restart point. The final uint32
138 : value is P itself. Thus, when seeking for a particular key, one can use binary
139 : search to find the largest restart point whose key is <= the key sought.
140 :
141 : An index block is a block with N key/value entries. The i'th value is the
142 : encoded block handle of the i'th data block. The i'th key is a separator for
143 : i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
144 : is a key that is >= every key in block i and is < every key i block i+1. The
145 : successor for the final block is a key that is >= every key in block N-1. The
146 : index block restart interval is 1: every entry is a restart point.
147 :
148 : A block handle is an offset, a length, and optional block properties (for data
149 : blocks and first/lower level index blocks); the length does not include the 5
150 : byte trailer. All numbers are varint-encoded, with no padding between the two
151 : values. The maximum size of an encoded block handle without properties is 20
152 : bytes. It is not advised to have properties that accumulate to be longer than
153 : 100 bytes.
154 :
155 : Instead of a single index block, the sstable can have a two-level index (this
156 : is used to prevent a single huge index block). A two-level index consists of a
157 : sequence of lower-level index blocks with block handles for data blocks
158 : followed by a single top-level index block with block handles for the
159 : lower-level index blocks.
160 :
161 : The metaindex block also contains block handles as values, with keys being
162 : the names of the meta blocks.
163 :
164 : For a description of value blocks and the meta value index block, see
165 : value_block.go.
166 :
167 : Data blocks have some additional features:
168 : - For TableFormatPebblev3 onwards:
169 : - For SETs, the value has a 1 byte value prefix, which indicates whether the
170 : value is inline, or in a separate value block, and indicates whether the
171 : prefix of the userkey (as defined by split) has changed or not. See
172 : value_block.go for details.
173 : - The most significant bit of the restart points is used to indicate whether
174 : userkey prefix has changed since the last restart point. See the detailed
175 : comment in blockWriter.
176 : - The maximum length of the "shared prefix" when encoding the key, is the
177 : length of the prefix of the userkey (as defined by split) of the previous
178 : key.
179 :
180 : - For TableFormatPebblev4 onwards:
181 : - The key kinds may be altered to set the
182 : InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
183 : in the context of that sstable (for a reader that reads at a higher seqnum
184 : than the highest seqnum in the sstable). For details, see the comment in
185 : format.go.
186 : */
187 :
188 : const (
189 : blockHandleMaxLenWithoutProperties = 10 + 10
190 : // blockHandleLikelyMaxLen can be used for pre-allocating buffers to
191 : // reduce memory copies. It is not guaranteed that a block handle will not
192 : // exceed this length.
193 : blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100
194 :
195 : levelDBFooterLen = 48
196 : levelDBMagic = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
197 : levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
198 :
199 : rocksDBFooterLen = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
200 : rocksDBMagic = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
201 : rocksDBMagicOffset = rocksDBFooterLen - len(rocksDBMagic)
202 : rocksDBVersionOffset = rocksDBMagicOffset - 4
203 : rocksDBExternalFormatVersion = 2
204 :
205 : pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳
206 :
207 : minFooterLen = levelDBFooterLen
208 : maxFooterLen = rocksDBFooterLen
209 :
210 : levelDBFormatVersion = 0
211 : rocksDBFormatVersion2 = 2
212 :
213 : metaRangeKeyName = "pebble.range_key"
214 : metaValueIndexName = "pebble.value_index"
215 : metaPropertiesName = "rocksdb.properties"
216 : metaRangeDelV1Name = "rocksdb.range_del"
217 : metaRangeDelV2Name = "rocksdb.range_del2"
218 :
219 : // Index Types.
220 : // A space efficient index block that is optimized for binary-search-based
221 : // index.
222 : binarySearchIndex = 0
223 : // hashSearchIndex = 1
224 : // A two-level index implementation. Both levels are binary search indexes.
225 : twoLevelIndex = 2
226 : // binarySearchWithFirstKeyIndex = 3
227 :
228 : // RocksDB always includes this in the properties block. Since Pebble
229 : // doesn't use zstd compression, the string will always be the same.
230 : // This should be removed if we ever decide to diverge from the RocksDB
231 : // properties block.
232 : rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
233 : )
234 :
235 : type blockType byte
236 :
237 : const (
238 : // The block type gives the per-block compression format.
239 : // These constants are part of the file format and should not be changed.
240 : // They are different from the Compression constants because the latter
241 : // are designed so that the zero value of the Compression type means to
242 : // use the default compression (which is snappy).
243 : // Not all compression types listed here are supported.
244 : noCompressionBlockType blockType = 0
245 : snappyCompressionBlockType blockType = 1
246 : zlibCompressionBlockType blockType = 2
247 : bzip2CompressionBlockType blockType = 3
248 : lz4CompressionBlockType blockType = 4
249 : lz4hcCompressionBlockType blockType = 5
250 : xpressCompressionBlockType blockType = 6
251 : zstdCompressionBlockType blockType = 7
252 : )
253 :
254 : // String implements fmt.Stringer.
255 1 : func (t blockType) String() string {
256 1 : switch t {
257 1 : case 0:
258 1 : return "none"
259 1 : case 1:
260 1 : return "snappy"
261 0 : case 2:
262 0 : return "zlib"
263 0 : case 3:
264 0 : return "bzip2"
265 0 : case 4:
266 0 : return "lz4"
267 0 : case 5:
268 0 : return "lz4hc"
269 0 : case 6:
270 0 : return "xpress"
271 0 : case 7:
272 0 : return "zstd"
273 0 : default:
274 0 : panic(errors.Newf("sstable: unknown block type: %d", t))
275 : }
276 : }
277 :
278 : // legacy (LevelDB) footer format:
279 : //
280 : // metaindex handle (varint64 offset, varint64 size)
281 : // index handle (varint64 offset, varint64 size)
282 : // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
283 : // table_magic_number (8 bytes)
284 : //
285 : // new (RocksDB) footer format:
286 : //
287 : // checksum type (char, 1 byte)
288 : // metaindex handle (varint64 offset, varint64 size)
289 : // index handle (varint64 offset, varint64 size)
290 : // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
291 : // footer version (4 bytes)
292 : // table_magic_number (8 bytes)
293 : type footer struct {
294 : format TableFormat
295 : checksum block.ChecksumType
296 : metaindexBH block.Handle
297 : indexBH block.Handle
298 : footerBH block.Handle
299 : }
300 :
301 : // TODO(sumeer): should the threshold be configurable.
302 : const slowReadTracingThreshold = 5 * time.Millisecond
303 :
304 : // readHandle is optional.
305 : func readFooter(
306 : ctx context.Context,
307 : f objstorage.Readable,
308 : readHandle objstorage.ReadHandle,
309 : logger base.LoggerAndTracer,
310 1 : ) (footer, error) {
311 1 : var footer footer
312 1 : size := f.Size()
313 1 : if size < minFooterLen {
314 1 : return footer, base.CorruptionErrorf("pebble/table: invalid table (file size is too small)")
315 1 : }
316 :
317 1 : buf := make([]byte, maxFooterLen)
318 1 : off := size - maxFooterLen
319 1 : if off < 0 {
320 1 : off = 0
321 1 : buf = buf[:size]
322 1 : }
323 1 : readStopwatch := makeStopwatch()
324 1 : var err error
325 1 : if readHandle != nil {
326 1 : err = readHandle.ReadAt(ctx, buf, off)
327 1 : } else {
328 1 : err = f.ReadAt(ctx, buf, off)
329 1 : }
330 1 : readDuration := readStopwatch.stop()
331 1 : // Call IsTracingEnabled to avoid the allocations of boxing integers into an
332 1 : // interface{}, unless necessary.
333 1 : if readDuration >= slowReadTracingThreshold && logger.IsTracingEnabled(ctx) {
334 1 : logger.Eventf(ctx, "reading %d bytes took %s",
335 1 : len(buf), readDuration.String())
336 1 : }
337 1 : if err != nil {
338 1 : return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
339 1 : }
340 :
341 1 : switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
342 1 : case levelDBMagic:
343 1 : if len(buf) < levelDBFooterLen {
344 0 : return footer, base.CorruptionErrorf(
345 0 : "pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
346 0 : }
347 1 : footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
348 1 : buf = buf[len(buf)-levelDBFooterLen:]
349 1 : footer.footerBH.Length = uint64(len(buf))
350 1 : footer.format = TableFormatLevelDB
351 1 : footer.checksum = block.ChecksumTypeCRC32c
352 :
353 1 : case rocksDBMagic, pebbleDBMagic:
354 1 : // NOTE: The Pebble magic string implies the same footer format as that used
355 1 : // by the RocksDBv2 table format.
356 1 : if len(buf) < rocksDBFooterLen {
357 1 : return footer, base.CorruptionErrorf("pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
358 1 : }
359 1 : footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
360 1 : buf = buf[len(buf)-rocksDBFooterLen:]
361 1 : footer.footerBH.Length = uint64(len(buf))
362 1 : version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
363 1 :
364 1 : format, err := ParseTableFormat(magic, version)
365 1 : if err != nil {
366 0 : return footer, err
367 0 : }
368 1 : footer.format = format
369 1 :
370 1 : switch block.ChecksumType(buf[0]) {
371 1 : case block.ChecksumTypeCRC32c:
372 1 : footer.checksum = block.ChecksumTypeCRC32c
373 1 : case block.ChecksumTypeXXHash64:
374 1 : footer.checksum = block.ChecksumTypeXXHash64
375 1 : default:
376 1 : return footer, base.CorruptionErrorf("pebble/table: unsupported checksum type %d", errors.Safe(footer.checksum))
377 : }
378 1 : buf = buf[1:]
379 :
380 1 : default:
381 1 : return footer, base.CorruptionErrorf("pebble/table: invalid table (bad magic number: 0x%x)", magic)
382 : }
383 :
384 1 : {
385 1 : end := uint64(size)
386 1 : var n int
387 1 : footer.metaindexBH, n = decodeBlockHandle(buf)
388 1 : if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
389 1 : return footer, base.CorruptionErrorf("pebble/table: invalid table (bad metaindex block handle)")
390 1 : }
391 1 : buf = buf[n:]
392 1 :
393 1 : footer.indexBH, n = decodeBlockHandle(buf)
394 1 : if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
395 0 : return footer, base.CorruptionErrorf("pebble/table: invalid table (bad index block handle)")
396 0 : }
397 : }
398 :
399 1 : return footer, nil
400 : }
401 :
402 1 : func (f footer) encode(buf []byte) []byte {
403 1 : switch magic, version := f.format.AsTuple(); magic {
404 1 : case levelDBMagic:
405 1 : buf = buf[:levelDBFooterLen]
406 1 : clear(buf)
407 1 : n := encodeBlockHandle(buf[0:], f.metaindexBH)
408 1 : encodeBlockHandle(buf[n:], f.indexBH)
409 1 : copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
410 :
411 1 : case rocksDBMagic, pebbleDBMagic:
412 1 : buf = buf[:rocksDBFooterLen]
413 1 : clear(buf)
414 1 : switch f.checksum {
415 1 : case block.ChecksumTypeNone:
416 1 : buf[0] = byte(block.ChecksumTypeNone)
417 1 : case block.ChecksumTypeCRC32c:
418 1 : buf[0] = byte(block.ChecksumTypeCRC32c)
419 1 : case block.ChecksumTypeXXHash:
420 1 : buf[0] = byte(block.ChecksumTypeXXHash)
421 1 : case block.ChecksumTypeXXHash64:
422 1 : buf[0] = byte(block.ChecksumTypeXXHash64)
423 0 : default:
424 0 : panic("unknown checksum type")
425 : }
426 1 : n := 1
427 1 : n += encodeBlockHandle(buf[n:], f.metaindexBH)
428 1 : encodeBlockHandle(buf[n:], f.indexBH)
429 1 : binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
430 1 : copy(buf[len(buf)-len(rocksDBMagic):], magic)
431 :
432 0 : default:
433 0 : panic("sstable: unspecified table format version")
434 : }
435 :
436 1 : return buf
437 : }
438 :
439 1 : func supportsTwoLevelIndex(format TableFormat) bool {
440 1 : switch format {
441 1 : case TableFormatLevelDB:
442 1 : return false
443 1 : case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4:
444 1 : return true
445 0 : default:
446 0 : panic("sstable: unspecified table format version")
447 : }
448 : }
|