Coverage Report

Created: 2026-05-16 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/table/format.h
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#pragma once
11
12
#include <array>
13
#include <cstdint>
14
#include <string>
15
16
#include "file/file_prefetch_buffer.h"
17
#include "file/random_access_file_reader.h"
18
#include "memory/memory_allocator_impl.h"
19
#include "options/cf_options.h"
20
#include "port/malloc.h"
21
#include "port/port.h"  // noexcept
22
#include "rocksdb/slice.h"
23
#include "rocksdb/status.h"
24
#include "rocksdb/table.h"
25
#include "util/hash.h"
26
27
namespace ROCKSDB_NAMESPACE {
28
29
class RandomAccessFile;
30
struct ReadOptions;
31
32
bool ShouldReportDetailedTime(Env* env, Statistics* stats);
33
34
// the length of the magic number in bytes.
35
constexpr uint32_t kMagicNumberLengthByte = 8;
36
37
extern const uint64_t kBlockBasedTableMagicNumber;
38
39
extern const uint64_t kLegacyPlainTableMagicNumber;
40
extern const uint64_t kPlainTableMagicNumber;
41
42
extern const uint64_t kCuckooTableMagicNumber;
43
44
// BlockHandle is a pointer to the extent of a file that stores a data
45
// block or a meta block.
46
class BlockHandle {
47
 public:
48
  // Creates a block handle with special values indicating "uninitialized,"
49
  // distinct from the "null" block handle.
50
  BlockHandle();
51
  BlockHandle(uint64_t offset, uint64_t size);
52
53
  // The offset of the block in the file.
54
1.23M
  uint64_t offset() const { return offset_; }
55
89.4k
  void set_offset(uint64_t _offset) { offset_ = _offset; }
56
57
  // The size of the stored block, this size does not include the block trailer.
58
552k
  uint64_t size() const { return size_; }
59
89.4k
  void set_size(uint64_t _size) { size_ = _size; }
60
61
  void EncodeTo(std::string* dst) const;
62
  char* EncodeTo(char* dst) const;
63
  Status DecodeFrom(Slice* input);
64
  Status DecodeSizeFrom(uint64_t offset, Slice* input);
65
66
  // Return a string that contains the copy of handle.
67
  std::string ToString(bool hex = true) const;
68
69
  // if the block handle's offset and size are both "0", we will view it
70
  // as a null block handle that points to no where.
71
458k
  bool IsNull() const { return offset_ == 0 && size_ == 0; }
72
73
451k
  static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
74
75
  // Maximum encoding length of a BlockHandle
76
  static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length;
77
78
0
  inline bool operator==(const BlockHandle& rhs) const {
79
0
    return offset_ == rhs.offset_ && size_ == rhs.size_;
80
0
  }
81
0
  inline bool operator!=(const BlockHandle& rhs) const {
82
0
    return !(*this == rhs);
83
0
  }
84
85
 private:
86
  uint64_t offset_;
87
  uint64_t size_;
88
89
  static const BlockHandle kNullBlockHandle;
90
};
91
92
struct EncodedBlockHandle {
93
0
  explicit EncodedBlockHandle(const BlockHandle& h) {
94
0
    auto end = h.EncodeTo(buffer.data());
95
0
    size = end - buffer.data();
96
0
  }
97
0
  Slice AsSlice() const { return Slice(buffer.data(), size); }
98
  std::array<char, BlockHandle::kMaxEncodedLength> buffer;
99
  size_t size;
100
};
101
102
// Value in block-based table file index.
103
//
104
// The index entry for block n is: y -> h, [x],
105
// where: y is some key between the last key of block n (inclusive) and the
106
// first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
107
// x, if present, is the first key of block n (unshortened).
108
// This struct represents the "h, [x]" part.
109
struct IndexValue {
110
  BlockHandle handle;
111
  // Empty means unknown.
112
  Slice first_internal_key;
113
114
67.2k
  IndexValue() = default;
115
  IndexValue(BlockHandle _handle, Slice _first_internal_key)
116
23.4k
      : handle(_handle), first_internal_key(_first_internal_key) {}
117
118
  // have_first_key indicates whether the `first_internal_key` is used.
119
  // If previous_handle is not null, delta encoding is used;
120
  // in this case, the two handles must point to consecutive blocks:
121
  // handle.offset() ==
122
  //     previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
123
  void EncodeTo(std::string* dst, bool have_first_key,
124
                const BlockHandle* previous_handle) const;
125
  Status DecodeFrom(Slice* input, bool have_first_key,
126
                    const BlockHandle* previous_handle);
127
128
  std::string ToString(bool hex, bool have_first_key) const;
129
};
130
131
// Given a file's base_context_checksum and an offset of a block within that
132
// file, choose a 32-bit value that is as unique as possible. This value will
133
// be added to the standard checksum to get a checksum "with context," or can
134
// be subtracted to "remove" context. Returns zero (no modifier) if feature is
135
// disabled with base_context_checksum == 0.
136
inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum,
137
671k
                                           uint64_t offset) {
138
  // To disable on base_context_checksum == 0, we could write
139
  // `if (base_context_checksum == 0) return 0;` but benchmarking shows
140
  // measurable performance penalty vs. this: compute the modifier
141
  // unconditionally and use an "all or nothing" bit mask to enable
142
  // or disable.
143
671k
  uint32_t all_or_nothing = uint32_t{0} - (base_context_checksum != 0);
144
145
  // Desired properties:
146
  // (call this function f(b, o) where b = base and o = offset)
147
  // 1. Fast
148
  // 2. f(b1, o) == f(b2, o) iff b1 == b2
149
  //    (Perfectly preserve base entropy)
150
  // 3. f(b, o1) == f(b, o2) only if o1 == o2 or |o1-o2| >= 4 billion
151
  //    (Guaranteed uniqueness for nearby offsets)
152
  // 3. f(b, o + j * 2**32) == f(b, o + k * 2**32) only if j == k
153
  //    (Upper bits matter, and *aligned* misplacement fails check)
154
  // 4. f(b1, o) == f(b2, o + x) then preferably not
155
  //    f(b1, o + y) == f(b2, o + x + y)
156
  //    (Avoid linearly correlated matches)
157
  // 5. f(b, o) == 0 depends on both b and o
158
  //    (No predictable overlap with non-context checksums)
159
671k
  uint32_t modifier =
160
671k
      base_context_checksum ^ (Lower32of64(offset) + Upper32of64(offset));
161
162
671k
  return modifier & all_or_nothing;
163
671k
}
164
165
constexpr uint32_t kLatestBbtFormatVersion = 7;
166
167
// Minimum format version supported for reading SST files in block-based format.
168
//
169
// When phasing out old format versions, first increase the write minimum,
170
// then later (>= 6 mo) increase the read minimum when removing the
171
// implementation for both read and write.
172
constexpr uint32_t kMinSupportedBbtFormatVersionForRead = 2;
173
174
// Minimum format version supported for writing new SST files in block-based
175
// format. This should be >= kMinSupportedFormatVersionForRead.
176
//
177
// When phasing out old format versions, first increase the write minimum,
178
// then later (>= 6 mo) increase the read minimum when removing the
179
// implementation for both read and write.
180
constexpr uint32_t kMinSupportedBbtFormatVersionForWrite = 2;
181
static_assert(kMinSupportedBbtFormatVersionForWrite >=
182
              kMinSupportedBbtFormatVersionForRead);
183
184
215k
inline bool IsSupportedFormatVersionForRead(uint64_t magic, uint32_t version) {
185
215k
  if (magic == kBlockBasedTableMagicNumber) {
186
215k
    return version >= kMinSupportedBbtFormatVersionForRead &&
187
216k
           version <= kLatestBbtFormatVersion;
188
18.4E
  } else if (magic == kPlainTableMagicNumber) {
189
0
    return version == 0;
190
18.4E
  } else if (magic == kCuckooTableMagicNumber) {
191
0
    return version == 1;
192
18.4E
  } else {
193
18.4E
    return false;
194
18.4E
  }
195
215k
}
196
197
68.2k
inline bool IsSupportedFormatVersionForWrite(uint64_t magic, uint32_t version) {
198
68.2k
  if (magic == kBlockBasedTableMagicNumber) {
199
68.2k
    return version >= kMinSupportedBbtFormatVersionForWrite &&
200
68.2k
           version <= kLatestBbtFormatVersion;
201
68.2k
  } else if (magic == kPlainTableMagicNumber) {
202
0
    return version == 0;
203
0
  } else if (magic == kCuckooTableMagicNumber) {
204
0
    return version == 1;
205
0
  } else {
206
0
    return false;
207
0
  }
208
68.2k
}
209
210
// Same as having a unique id in footer.
211
21.7k
inline bool FormatVersionUsesContextChecksum(uint32_t version) {
212
21.7k
  return version >= 6;
213
21.7k
}
214
215
130k
inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) {
216
130k
  return version < 6;
217
130k
}
218
219
152k
inline bool FormatVersionUsesCompressionManagerName(uint32_t version) {
220
152k
  return version >= 7;
221
152k
}
222
223
// Footer encapsulates the fixed information stored at the tail end of every
224
// SST file. In general, it should only include things that cannot go
225
// elsewhere under the metaindex block. For example, checksum_type is
226
// required for verifying metaindex block checksum (when applicable), but
227
// index block handle can easily go in metaindex block. See also FooterBuilder
228
// below.
229
class Footer {
230
 public:
231
  // Create empty. Populate using DecodeFrom.
232
217k
  Footer() {}
233
234
0
  void Reset() {
235
0
    table_magic_number_ = kNullTableMagicNumber;
236
0
    format_version_ = kInvalidFormatVersion;
237
0
    base_context_checksum_ = 0;
238
0
    metaindex_handle_ = BlockHandle::NullBlockHandle();
239
0
    index_handle_ = BlockHandle::NullBlockHandle();
240
0
    checksum_type_ = kInvalidChecksumType;
241
0
    block_trailer_size_ = 0;
242
0
  }
243
244
  // Deserialize a footer (populate fields) from `input` and check for various
245
  // corruptions. `input_offset` is the offset within the target file of
246
  // `input` buffer, which is needed for verifying format_version >= 6 footer.
247
  // If enforce_table_magic_number != 0, will return corruption if table magic
248
  // number is not equal to enforce_table_magic_number.
249
  Status DecodeFrom(Slice input, uint64_t input_offset,
250
                    uint64_t enforce_table_magic_number = 0);
251
252
  // Table magic number identifies file as RocksDB SST file and which kind of
253
  // SST format is use.
254
0
  uint64_t table_magic_number() const { return table_magic_number_; }
255
256
  // A version (footer and more) within a kind of SST. (It would add more
257
  // unnecessary complexity to separate footer versions and
258
  // BBTO::format_version.)
259
326k
  uint32_t format_version() const { return format_version_; }
260
261
  // See ChecksumModifierForContext()
262
350k
  uint32_t base_context_checksum() const { return base_context_checksum_; }
263
264
  // Block handle for metaindex block.
265
108k
  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
266
267
  // Block handle for (top-level) index block.
268
  // TODO? remove from this struct and only read on decode for legacy cases
269
0
  const BlockHandle& index_handle() const { return index_handle_; }
270
271
  // Checksum type used in the file, including footer for format version >= 6.
272
566k
  ChecksumType checksum_type() const {
273
566k
    return static_cast<ChecksumType>(checksum_type_);
274
566k
  }
275
276
  // Block trailer size used by file with this footer (e.g. 5 for block-based
277
  // table and 0 for plain table). This is inferred from magic number so
278
  // not in the serialized form.
279
1.02M
  inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
280
281
  // Convert this object to a human readable form
282
  std::string ToString() const;
283
284
  // Encoded lengths of Footers. Bytes for serialized Footer will always be
285
  // >= kMinEncodedLength and <= kMaxEncodedLength.
286
  //
287
  // Footer version 0 (legacy) will always occupy exactly this many bytes.
288
  // It consists of two block handles, padding, and a magic number.
289
  static constexpr uint32_t kVersion0EncodedLength =
290
      2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
291
  static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;
292
293
  // Footer of versions 1 and higher will always occupy exactly this many
294
  // bytes. It originally consisted of the checksum type, two block handles,
295
  // padding (to maximum handle encoding size), a format version number, and a
296
  // magic number.
297
  static constexpr uint32_t kNewVersionsEncodedLength =
298
      1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
299
  static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;
300
301
  static constexpr uint64_t kNullTableMagicNumber = 0;
302
303
  static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
304
305
 private:
306
  static constexpr int kInvalidChecksumType =
307
      (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;
308
309
  uint64_t table_magic_number_ = kNullTableMagicNumber;
310
  uint32_t format_version_ = kInvalidFormatVersion;
311
  uint32_t base_context_checksum_ = 0;
312
  BlockHandle metaindex_handle_;
313
  BlockHandle index_handle_;
314
  int checksum_type_ = kInvalidChecksumType;
315
  uint8_t block_trailer_size_ = 0;
316
};
317
318
// Builder for Footer
319
class FooterBuilder {
320
 public:
321
  // Run builder in inputs. This is a single step with lots of parameters for
322
  // efficiency (based on perf testing).
323
  // * table_magic_number identifies file as RocksDB SST file and which kind of
324
  // SST format is use.
325
  // * format_version is a version for the footer and can also apply to other
326
  // aspects of the SST file (see BlockBasedTableOptions::format_version).
327
  // NOTE: To save complexity in the caller, when format_version == 0 and
328
  // there is a corresponding legacy magic number to the one specified, the
329
  // legacy magic number will be written for forward compatibility.
330
  // * footer_offset is the file offset where the footer will be written
331
  // (for future use).
332
  // * checksum_type is for formats using block checksums.
333
  // * index_handle is optional for some SST kinds and (for caller convenience)
334
  // ignored when format_version >= 6. (Must be added to metaindex in that
335
  // case.)
336
  // * unique_id must be specified if format_vesion >= 6 and SST uses block
337
  // checksums with context. Otherwise, auto-generated if format_vesion >= 6.
338
  Status Build(uint64_t table_magic_number, uint32_t format_version,
339
               uint64_t footer_offset, ChecksumType checksum_type,
340
               const BlockHandle& metaindex_handle,
341
               const BlockHandle& index_handle = BlockHandle::NullBlockHandle(),
342
               uint32_t base_context_checksum = 0);
343
344
  // After Builder, get a Slice for the serialized Footer, backed by this
345
  // FooterBuilder.
346
63.3k
  const Slice& GetSlice() const {
347
63.3k
    assert(slice_.size());
348
63.3k
    return slice_;
349
63.3k
  }
350
351
 private:
352
  Slice slice_;
353
  std::array<char, Footer::kMaxEncodedLength> data_;
354
};
355
356
// Set to true to allow unit testing of writing unsupported block-based table
357
// format versions (to test read side)
358
bool& TEST_AllowUnsupportedFormatVersion();
359
360
// Read the footer from file
361
// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
362
// corruption if table_magic number is not equal to enforce_table_magic_number
363
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
364
                          FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
365
                          uint64_t file_size, Footer* footer,
366
                          uint64_t enforce_table_magic_number = 0,
367
                          Statistics* stats = nullptr);
368
369
// Computes a checksum using the given ChecksumType. Sometimes we need to
370
// include one more input byte logically at the end but not part of the main
371
// data buffer. If data_size >= 1, then
372
// ComputeBuiltinChecksum(type, data, size)
373
// ==
374
// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1])
375
uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
376
                                size_t size);
377
uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
378
                                            size_t size, char last_byte);
379
380
// Represents the contents of a block read from an SST file. Depending on how
381
// it's created, it may or may not own the actual block bytes. As an example,
382
// BlockContents objects representing data read from mmapped files only point
383
// into the mmapped region. Depending on context, it might be a serialized
384
// (potentially compressed) block, including a trailer beyond `size`, or an
385
// uncompressed block.
386
//
387
// Please try to use this terminology when dealing with blocks:
388
// * "Serialized block" - bytes that go into storage. For block-based table
389
// (usually the case) this includes the block trailer. Here the `size` does
390
// not include the trailer, but other places in code might include the trailer
391
// in the size.
392
// * "Maybe compressed block" - like a serialized block, but without the
393
// trailer (or no promise of including a trailer). Must be accompanied by a
394
// CompressionType in some other variable or field.
395
// * "Uncompressed block" - "payload" bytes that are either stored with no
396
// compression, used as input to compression function, or result of
397
// decompression function.
398
// * "Parsed block" - an in-memory form of a block in block cache, as it is
399
// used by the table reader. Different C++ types are used depending on the
400
// block type (see block_cache.h). Only trivially parsable block types
401
// use BlockContents as the parsed form.
402
//
403
struct BlockContents {
404
  // Points to block payload (without trailer)
405
  Slice data;
406
  CacheAllocationPtr allocation;
407
408
#ifndef NDEBUG
409
  // Whether there is a known trailer after what is pointed to by `data`.
410
  // See BlockBasedTable::GetCompressionType.
411
  bool has_trailer = false;
412
#endif  // NDEBUG
413
414
387k
  BlockContents() {}
415
416
  // Does not take ownership of the underlying data bytes.
417
0
  BlockContents(const Slice& _data) : data(_data) {}
418
419
  // Takes ownership of the underlying data bytes.
420
  BlockContents(CacheAllocationPtr&& _data, size_t _size)
421
353k
      : data(_data.get(), _size), allocation(std::move(_data)) {}
422
423
  // Takes ownership of the underlying data bytes.
424
  BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
425
0
      : data(_data.get(), _size) {
426
0
    allocation.reset(_data.release());
427
0
  }
428
429
  // Returns whether the object has ownership of the underlying data bytes.
430
26.9k
  bool own_bytes() const { return allocation.get() != nullptr; }
431
432
  // The additional memory space taken by the block data.
433
26.9k
  size_t usable_size() const {
434
    // FIXME: doesn't account for possible block trailer
435
26.9k
    if (allocation.get() != nullptr) {
436
26.9k
      auto allocator = allocation.get_deleter().allocator;
437
26.9k
      if (allocator) {
438
0
        return allocator->UsableSize(allocation.get(), data.size());
439
0
      }
440
26.9k
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
441
26.9k
      return malloc_usable_size(allocation.get());
442
#else
443
      return data.size();
444
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
445
26.9k
    } else {
446
0
      return 0;  // no extra memory is occupied by the data
447
0
    }
448
26.9k
  }
449
450
0
  size_t ApproximateMemoryUsage() const {
451
0
    return usable_size() + sizeof(*this);
452
0
  }
453
454
355k
  BlockContents(BlockContents&& other) noexcept { *this = std::move(other); }
455
456
724k
  BlockContents& operator=(BlockContents&& other) {
457
724k
    data = std::move(other.data);
458
724k
    allocation = std::move(other.allocation);
459
#ifndef NDEBUG
460
    has_trailer = other.has_trailer;
461
#endif  // NDEBUG
462
724k
    return *this;
463
724k
  }
464
};
465
466
// The `data` points to serialized block contents read in from file, which
467
// must be compressed and include a trailer beyond `size`. A new buffer is
468
// allocated with the given allocator (or default) and the uncompressed
469
// contents are returned in `out_contents`. Statistics updated.
470
Status DecompressSerializedBlock(const char* data, size_t size,
471
                                 CompressionType type,
472
                                 Decompressor& decompressor,
473
                                 BlockContents* out_contents,
474
                                 const ImmutableOptions& ioptions,
475
                                 MemoryAllocator* allocator = nullptr);
476
477
Status DecompressSerializedBlock(Decompressor::Args& args,
478
                                 Decompressor& decompressor,
479
                                 BlockContents* out_contents,
480
                                 const ImmutableOptions& ioptions,
481
                                 MemoryAllocator* allocator = nullptr);
482
483
// This is a variant of DecompressSerializedBlock that does not expect a
484
// block trailer beyond `size`. (CompressionType is passed in.)
485
Status DecompressBlockData(
486
    const char* data, size_t size, CompressionType type,
487
    Decompressor& decompressor, BlockContents* out_contents,
488
    const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr,
489
    Decompressor::ManagedWorkingArea* working_area = nullptr);
490
491
Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
492
                           BlockContents* out_contents,
493
                           const ImmutableOptions& ioptions,
494
                           MemoryAllocator* allocator = nullptr);
495
496
// Replace db_host_id contents with the real hostname if necessary
497
Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id);
498
499
// Implementation details follow.  Clients should ignore,
500
501
// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
502
// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
503
// uninitialized.
504
1.18M
inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {}
505
506
inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
507
1.29M
    : offset_(_offset), size_(_size) {}
508
509
}  // namespace ROCKSDB_NAMESPACE