Coverage Report

Created: 2026-02-14 06:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/table/format.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#include "table/format.h"
11
12
#include <cinttypes>
13
#include <cstdint>
14
#include <string>
15
16
#include "block_fetcher.h"
17
#include "file/random_access_file_reader.h"
18
#include "memory/memory_allocator_impl.h"
19
#include "monitoring/perf_context_imp.h"
20
#include "monitoring/statistics_impl.h"
21
#include "options/options_helper.h"
22
#include "port/likely.h"
23
#include "rocksdb/env.h"
24
#include "rocksdb/options.h"
25
#include "rocksdb/table.h"
26
#include "table/block_based/block.h"
27
#include "table/block_based/block_based_table_reader.h"
28
#include "table/persistent_cache_helper.h"
29
#include "unique_id_impl.h"
30
#include "util/cast_util.h"
31
#include "util/coding.h"
32
#include "util/compression.h"
33
#include "util/crc32c.h"
34
#include "util/hash.h"
35
#include "util/stop_watch.h"
36
#include "util/string_util.h"
37
#include "util/xxhash.h"
38
39
namespace ROCKSDB_NAMESPACE {
40
41
const char* kHostnameForDbHostId = "__hostname__";
42
43
8.71k
bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
44
8.71k
  return env != nullptr && stats != nullptr &&
45
0
         stats->get_stats_level() > kExceptDetailedTimers;
46
8.71k
}
47
48
23.8k
void BlockHandle::EncodeTo(std::string* dst) const {
49
  // Sanity check that all fields have been set
50
23.8k
  assert(offset_ != ~uint64_t{0});
51
23.8k
  assert(size_ != ~uint64_t{0});
52
23.8k
  PutVarint64Varint64(dst, offset_, size_);
53
23.8k
}
54
55
0
char* BlockHandle::EncodeTo(char* dst) const {
56
  // Sanity check that all fields have been set
57
0
  assert(offset_ != ~uint64_t{0});
58
0
  assert(size_ != ~uint64_t{0});
59
0
  char* cur = EncodeVarint64(dst, offset_);
60
0
  cur = EncodeVarint64(cur, size_);
61
0
  return cur;
62
0
}
63
64
80.3k
Status BlockHandle::DecodeFrom(Slice* input) {
65
80.5k
  if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
66
80.5k
    return Status::OK();
67
18.4E
  } else {
68
    // reset in case failure after partially decoding
69
18.4E
    offset_ = 0;
70
18.4E
    size_ = 0;
71
18.4E
    return Status::Corruption("bad block handle");
72
18.4E
  }
73
80.3k
}
74
75
0
Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
76
0
  if (GetVarint64(input, &size_)) {
77
0
    offset_ = _offset;
78
0
    return Status::OK();
79
0
  } else {
80
    // reset in case failure after partially decoding
81
0
    offset_ = 0;
82
0
    size_ = 0;
83
0
    return Status::Corruption("bad block handle");
84
0
  }
85
0
}
86
87
// Return a string that contains the copy of handle.
88
0
std::string BlockHandle::ToString(bool hex) const {
89
0
  std::string handle_str;
90
0
  EncodeTo(&handle_str);
91
0
  if (hex) {
92
0
    return Slice(handle_str).ToString(true);
93
0
  } else {
94
0
    return handle_str;
95
0
  }
96
0
}
97
98
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
99
100
void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
101
7.93k
                          const BlockHandle* previous_handle) const {
102
7.93k
  if (previous_handle) {
103
    // WART: this is specific to Block-based table
104
1
    assert(handle.offset() == previous_handle->offset() +
105
1
                                  previous_handle->size() +
106
1
                                  BlockBasedTable::kBlockTrailerSize);
107
1
    PutVarsignedint64(dst, handle.size() - previous_handle->size());
108
7.93k
  } else {
109
7.93k
    handle.EncodeTo(dst);
110
7.93k
  }
111
7.93k
  assert(dst->size() != 0);
112
113
7.93k
  if (have_first_key) {
114
0
    PutLengthPrefixedSlice(dst, first_internal_key);
115
0
  }
116
7.93k
}
117
118
Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
119
10.9k
                              const BlockHandle* previous_handle) {
120
10.9k
  if (previous_handle) {
121
0
    int64_t delta;
122
0
    if (!GetVarsignedint64(input, &delta)) {
123
0
      return Status::Corruption("bad delta-encoded index value");
124
0
    }
125
    // WART: this is specific to Block-based table
126
0
    handle = BlockHandle(previous_handle->offset() + previous_handle->size() +
127
0
                             BlockBasedTable::kBlockTrailerSize,
128
0
                         previous_handle->size() + delta);
129
10.9k
  } else {
130
10.9k
    Status s = handle.DecodeFrom(input);
131
10.9k
    if (!s.ok()) {
132
0
      return s;
133
0
    }
134
10.9k
  }
135
136
10.9k
  if (!have_first_key) {
137
10.9k
    first_internal_key = Slice();
138
10.9k
  } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
139
0
    return Status::Corruption("bad first key in block info");
140
0
  }
141
142
10.9k
  return Status::OK();
143
10.9k
}
144
145
0
std::string IndexValue::ToString(bool hex, bool have_first_key) const {
146
0
  std::string s;
147
0
  EncodeTo(&s, have_first_key, nullptr);
148
0
  if (hex) {
149
0
    return Slice(s).ToString(true);
150
0
  } else {
151
0
    return s;
152
0
  }
153
0
}
154
155
namespace {
156
34.3k
inline bool IsLegacyFooterFormat(uint64_t magic_number) {
157
34.3k
  return magic_number == kLegacyPlainTableMagicNumber;
158
34.3k
}
159
// Used when reading format_version=0 footers (plain tables)
160
0
inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
161
0
  if (magic_number == kLegacyPlainTableMagicNumber) {
162
0
    return kPlainTableMagicNumber;
163
0
  }
164
0
  assert(false);
165
0
  return magic_number;
166
0
}
167
// Used by plain tables to write format_version=0 footers
168
0
inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
169
0
  if (magic_number == kPlainTableMagicNumber) {
170
0
    return kLegacyPlainTableMagicNumber;
171
0
  }
172
0
  assert(false);
173
0
  return magic_number;
174
0
}
175
42.6k
inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) {
176
42.6k
  if (magic_number == kBlockBasedTableMagicNumber) {
177
42.6k
    return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
178
18.4E
  } else {
179
18.4E
    return 0;
180
18.4E
  }
181
42.6k
}
182
183
// NOTE: format_version 0 is still used by plain tables and format_version 1 by
184
// cuckoo table. For block-based tables, format_version < 2 is no longer
185
// supported for reading or writing. Legacy magic numbers on block-based tables
186
// are used only for good error reporting.
187
//
188
// Footer format, in three parts:
189
// * Part1
190
//   -> format_version == 0 (inferred from legacy magic number)
191
//      <empty> (0 bytes)
192
//   -> format_version >= 1
193
//      checksum type (char, 1 byte)
194
// * Part2
195
//   -> format_version <= 5
196
//      metaindex handle (varint64 offset, varint64 size)
197
//      index handle     (varint64 offset, varint64 size)
198
//      <zero padding> for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40
199
//        - This padding is unchecked/ignored
200
//   -> format_version >= 6
201
//      extended magic number (4 bytes) = 0x3e 0x00 0x7a 0x00
202
//        - Also surely invalid (size 0) handles if interpreted as older version
203
//        - (Helps ensure a corrupted format_version doesn't get us far with no
204
//           footer checksum.)
205
//      footer_checksum (uint32LE, 4 bytes)
206
//        - Checksum of above checksum type of whole footer, with this field
207
//          set to all zeros.
208
//      base_context_checksum (uint32LE, 4 bytes)
209
//      metaindex block size (uint32LE, 4 bytes)
210
//        - Assumed to be immediately before footer, < 4GB
211
//      <zero padding> (24 bytes, reserved for future use)
212
//        - Brings part2 size also to 40 bytes
213
//        - Checked that last eight bytes == 0, so reserved for a future
214
//          incompatible feature (but under format_version=6)
215
// * Part3
216
//   -> format_version == 0 (inferred from legacy magic number)
217
//      legacy magic number (8 bytes)
218
//   -> format_version >= 1 (inferred from NOT legacy magic number)
219
//      format_version (uint32LE, 4 bytes), also called "footer version"
220
//      newer magic number (8 bytes)
221
const std::array<char, 4> kExtendedMagic{{0x3e, 0x00, 0x7a, 0x00}};
222
constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength;
223
}  // namespace
224
225
Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
226
                            uint64_t footer_offset, ChecksumType checksum_type,
227
                            const BlockHandle& metaindex_handle,
228
                            const BlockHandle& index_handle,
229
7.93k
                            uint32_t base_context_checksum) {
230
7.93k
  assert(magic_number != Footer::kNullTableMagicNumber);
231
7.93k
  assert(IsSupportedFormatVersionForWrite(magic_number, format_version) ||
232
7.93k
         TEST_AllowUnsupportedFormatVersion());
233
234
7.93k
  char* part2;
235
7.93k
  char* part3;
236
7.93k
  if (format_version > 0) {
237
7.93k
    slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength);
238
    // Generate parts 1 and 3
239
7.93k
    char* cur = data_.data();
240
    // Part 1
241
7.93k
    *(cur++) = checksum_type;
242
    // Part 2
243
7.93k
    part2 = cur;
244
    // Skip over part 2 for now
245
7.93k
    cur += kFooterPart2Size;
246
    // Part 3
247
7.93k
    part3 = cur;
248
7.93k
    EncodeFixed32(cur, format_version);
249
7.93k
    cur += 4;
250
7.93k
    EncodeFixed64(cur, magic_number);
251
7.93k
    assert(cur + 8 == slice_.data() + slice_.size());
252
7.93k
  } else {
253
    // format_version == 0 is used by plain tables
254
0
    slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength);
255
    // Legacy SST files use kCRC32c checksum but it's not stored in footer.
256
0
    assert(checksum_type == kNoChecksum || checksum_type == kCRC32c);
257
    // Generate part 3 (part 1 empty, skip part 2 for now)
258
0
    part2 = data_.data();
259
0
    part3 = part2 + kFooterPart2Size;
260
0
    char* cur = part3;
261
    // Use legacy magic numbers to indicate format_version=0, for
262
    // compatibility. No other cases should use format_version=0.
263
0
    EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number));
264
0
    assert(cur + 8 == slice_.data() + slice_.size());
265
0
  }
266
267
7.93k
  if (format_version >= 6) {
268
7.93k
    if (BlockTrailerSizeForMagicNumber(magic_number) != 0) {
269
      // base context checksum required for table formats with block checksums
270
7.93k
      assert(base_context_checksum != 0);
271
7.93k
      assert(ChecksumModifierForContext(base_context_checksum, 0) != 0);
272
7.93k
    } else {
273
      // base context checksum not used
274
0
      assert(base_context_checksum == 0);
275
0
      assert(ChecksumModifierForContext(base_context_checksum, 0) == 0);
276
0
    }
277
278
    // Start populating Part 2
279
7.93k
    char* cur = data_.data() + /* part 1 size */ 1;
280
    // Set extended magic of part2
281
7.93k
    std::copy(kExtendedMagic.begin(), kExtendedMagic.end(), cur);
282
7.93k
    cur += kExtendedMagic.size();
283
    // Fill checksum data with zeros (for later computing checksum)
284
7.93k
    char* checksum_data = cur;
285
7.93k
    EncodeFixed32(cur, 0);
286
7.93k
    cur += 4;
287
    // Save base context checksum
288
7.93k
    EncodeFixed32(cur, base_context_checksum);
289
7.93k
    cur += 4;
290
    // Compute and save metaindex size
291
7.93k
    uint32_t metaindex_size = static_cast<uint32_t>(metaindex_handle.size());
292
7.93k
    if (metaindex_size != metaindex_handle.size()) {
293
0
      return Status::NotSupported("Metaindex block size > 4GB");
294
0
    }
295
    // Metaindex must be adjacent to footer
296
7.93k
    assert(metaindex_size == 0 ||
297
7.93k
           metaindex_handle.offset() + metaindex_handle.size() ==
298
7.93k
               footer_offset - BlockTrailerSizeForMagicNumber(magic_number));
299
7.93k
    EncodeFixed32(cur, metaindex_size);
300
7.93k
    cur += 4;
301
302
    // Zero pad remainder (for future use)
303
7.93k
    std::fill_n(cur, 24U, char{0});
304
7.93k
    assert(cur + 24 == part3);
305
306
    // Compute checksum, add context
307
7.93k
    uint32_t checksum = ComputeBuiltinChecksum(
308
7.93k
        checksum_type, data_.data(), Footer::kNewVersionsEncodedLength);
309
7.93k
    checksum +=
310
7.93k
        ChecksumModifierForContext(base_context_checksum, footer_offset);
311
    // Store it
312
7.93k
    EncodeFixed32(checksum_data, checksum);
313
7.93k
  } else {
314
    // Base context checksum not used
315
0
    assert(!FormatVersionUsesContextChecksum(format_version));
316
    // Should be left empty
317
0
    assert(base_context_checksum == 0);
318
0
    assert(ChecksumModifierForContext(base_context_checksum, 0) == 0);
319
320
    // Populate all of part 2
321
0
    char* cur = part2;
322
0
    cur = metaindex_handle.EncodeTo(cur);
323
0
    cur = index_handle.EncodeTo(cur);
324
    // Zero pad remainder
325
0
    std::fill(cur, part3, char{0});
326
0
  }
327
7.93k
  return Status::OK();
328
7.93k
}
329
330
Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
331
34.8k
                          uint64_t enforce_table_magic_number) {
332
  // Only decode to unused Footer
333
34.8k
  assert(table_magic_number_ == kNullTableMagicNumber);
334
34.8k
  assert(input != nullptr);
335
34.8k
  assert(input.size() >= kMinEncodedLength);
336
337
34.8k
  const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte;
338
34.8k
  uint64_t magic = DecodeFixed64(magic_ptr);
339
340
  // Legacy block-based tables (format_version < 2) are no longer supported.
341
  // (This constant is only used here and in the corresponding test.)
342
34.8k
  if (magic == 0xdb4775248b80fb57ull) {
343
0
    return Status::NotSupported(
344
0
        "Unsupported legacy magic number for block-based SST format. Load with "
345
0
        "RocksDB >= 4.6.0 and < 11.0.0 and run full compaction to upgrade.");
346
0
  }
347
348
  // Check for legacy formats
349
34.8k
  bool legacy = IsLegacyFooterFormat(magic);
350
34.8k
  if (legacy) {
351
    // Legacy plain tables are still supported - upconvert magic
352
0
    magic = UpconvertLegacyFooterFormat(magic);
353
0
  }
354
34.8k
  if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) {
355
0
    return Status::Corruption("Bad table magic number: expected " +
356
0
                              std::to_string(enforce_table_magic_number) +
357
0
                              ", found " + std::to_string(magic));
358
0
  }
359
34.8k
  table_magic_number_ = magic;
360
34.8k
  block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic);
361
362
  // Parse Part3
363
34.8k
  const char* part3_ptr = magic_ptr;
364
34.8k
  uint32_t computed_checksum = 0;
365
34.8k
  uint64_t footer_offset = 0;
366
34.8k
  if (legacy) {
367
    // Legacy format (format_version=0, used by plain tables)
368
    // The size is already asserted to be at least kMinEncodedLength
369
    // at the beginning of the function
370
0
    input.remove_prefix(input.size() - kVersion0EncodedLength);
371
0
    format_version_ = 0 /* legacy */;
372
0
    checksum_type_ = kCRC32c;
373
34.8k
  } else {
374
34.8k
    part3_ptr = magic_ptr - 4;
375
34.8k
    format_version_ = DecodeFixed32(part3_ptr);
376
34.8k
    if (UNLIKELY(!IsSupportedFormatVersionForRead(magic, format_version_) &&
377
34.8k
                 !TEST_AllowUnsupportedFormatVersion())) {
378
0
      return Status::Corruption("Corrupt or unsupported format_version " +
379
0
                                std::to_string(format_version_) +
380
0
                                " for magic " + std::to_string(magic));
381
0
    }
382
    // All known format versions >= 1 occupy exactly this many bytes.
383
34.8k
    if (UNLIKELY(input.size() < kNewVersionsEncodedLength)) {
384
0
      return Status::Corruption("Input is too short to be an SST file");
385
0
    }
386
34.8k
    uint64_t adjustment = input.size() - kNewVersionsEncodedLength;
387
34.8k
    input.remove_prefix(adjustment);
388
34.8k
    footer_offset = input_offset + adjustment;
389
390
    // Parse Part1
391
34.8k
    char chksum = input.data()[0];
392
34.8k
    checksum_type_ = lossless_cast<ChecksumType>(chksum);
393
34.8k
    if (UNLIKELY(!IsSupportedChecksumType(checksum_type()))) {
394
0
      return Status::Corruption("Corrupt or unsupported checksum type: " +
395
0
                                std::to_string(lossless_cast<uint8_t>(chksum)));
396
0
    }
397
    // This is the most convenient place to compute the checksum
398
34.8k
    if (checksum_type_ != kNoChecksum && format_version_ >= 6) {
399
34.4k
      std::array<char, kNewVersionsEncodedLength> copy_without_checksum;
400
34.4k
      std::copy_n(input.data(), kNewVersionsEncodedLength,
401
34.4k
                  copy_without_checksum.data());
402
34.4k
      EncodeFixed32(&copy_without_checksum[5], 0);  // Clear embedded checksum
403
34.4k
      computed_checksum =
404
34.4k
          ComputeBuiltinChecksum(checksum_type(), copy_without_checksum.data(),
405
34.4k
                                 kNewVersionsEncodedLength);
406
34.4k
    }
407
    // Consume checksum type field
408
34.8k
    input.remove_prefix(1);
409
34.8k
  }
410
411
  // Parse Part2
412
34.8k
  if (format_version_ >= 6) {
413
34.0k
    Slice ext_magic(input.data(), 4);
414
34.0k
    if (UNLIKELY(ext_magic.compare(Slice(kExtendedMagic.data(),
415
34.0k
                                         kExtendedMagic.size())) != 0)) {
416
0
      return Status::Corruption("Bad extended magic number: 0x" +
417
0
                                ext_magic.ToString(/*hex*/ true));
418
0
    }
419
34.0k
    input.remove_prefix(4);
420
34.0k
    uint32_t stored_checksum = 0, metaindex_size = 0;
421
34.0k
    bool success;
422
34.0k
    success = GetFixed32(&input, &stored_checksum);
423
34.0k
    assert(success);
424
34.0k
    success = GetFixed32(&input, &base_context_checksum_);
425
34.0k
    assert(success);
426
34.0k
    if (UNLIKELY(ChecksumModifierForContext(base_context_checksum_, 0) == 0)) {
427
0
      return Status::Corruption("Invalid base context checksum");
428
0
    }
429
34.0k
    computed_checksum +=
430
34.0k
        ChecksumModifierForContext(base_context_checksum_, footer_offset);
431
34.0k
    if (UNLIKELY(computed_checksum != stored_checksum)) {
432
0
      return Status::Corruption("Footer at " + std::to_string(footer_offset) +
433
0
                                " checksum mismatch");
434
0
    }
435
34.0k
    success = GetFixed32(&input, &metaindex_size);
436
34.0k
    assert(success);
437
34.0k
    (void)success;
438
34.0k
    uint64_t metaindex_end = footer_offset - GetBlockTrailerSize();
439
34.0k
    metaindex_handle_ =
440
34.0k
        BlockHandle(metaindex_end - metaindex_size, metaindex_size);
441
442
    // Mark unpopulated
443
34.0k
    index_handle_ = BlockHandle::NullBlockHandle();
444
445
    // 16 bytes of unchecked reserved padding
446
34.0k
    input.remove_prefix(16U);
447
448
    // 8 bytes of checked reserved padding (expected to be zero unless using a
449
    // future feature).
450
34.0k
    uint64_t reserved = 0;
451
34.0k
    success = GetFixed64(&input, &reserved);
452
34.0k
    assert(success);
453
34.0k
    if (UNLIKELY(reserved != 0)) {
454
0
      return Status::NotSupported(
455
0
          "File uses a future feature not supported in this version");
456
0
    }
457
    // End of part 2
458
34.0k
    assert(input.data() == part3_ptr);
459
34.0k
  } else {
460
    // format_version_ < 6
461
813
    Status result = metaindex_handle_.DecodeFrom(&input);
462
813
    if (result.ok()) {
463
0
      result = index_handle_.DecodeFrom(&input);
464
0
    }
465
813
    if (!result.ok()) {
466
0
      return result;
467
0
    }
468
    // Padding in part2 is ignored
469
813
  }
470
34.8k
  return Status::OK();
471
34.8k
}
472
473
0
std::string Footer::ToString() const {
474
0
  std::string result;
475
0
  result.reserve(1024);
476
477
0
  result.append("metaindex handle: " + metaindex_handle_.ToString() +
478
0
                " offset: " + std::to_string(metaindex_handle_.offset()) +
479
0
                " size: " + std::to_string(metaindex_handle_.size()) + "\n  ");
480
0
  result.append("index handle: " + index_handle_.ToString() +
481
0
                " offset: " + std::to_string(index_handle_.offset()) +
482
0
                " size: " + std::to_string(index_handle_.size()) + "\n  ");
483
0
  result.append("table_magic_number: " + std::to_string(table_magic_number_) +
484
0
                "\n  ");
485
0
  if (!IsLegacyFooterFormat(table_magic_number_)) {
486
0
    result.append("format version: " + std::to_string(format_version_) + "\n");
487
0
  }
488
0
  return result;
489
0
}
490
491
0
bool& TEST_AllowUnsupportedFormatVersion() {
492
0
  static bool allow = false;
493
0
  return allow;
494
0
}
495
496
static Status ReadFooterFromFileInternal(
497
    const IOOptions& opts, RandomAccessFileReader* file, FileSystem& fs,
498
    FilePrefetchBuffer* prefetch_buffer, uint64_t expected_file_size,
499
34.9k
    Footer* footer, uint64_t enforce_table_magic_number) {
500
34.9k
  uint64_t file_size_from_file_system = 0;
501
34.9k
  Status s;
502
  // Prefer the more efficient FSRandomAccessFile::GetFileSize when available
503
34.9k
  s = file->file()->GetFileSize(&file_size_from_file_system);
504
34.9k
  if (!s.ok()) {
505
    // Fall back on FileSystem::GetFileSize on failure
506
0
    s = fs.GetFileSize(file->file_name(), IOOptions(),
507
0
                       &file_size_from_file_system, nullptr);
508
0
    if (!s.ok()) {
509
0
      return s;
510
0
    }
511
0
  }
512
513
34.9k
  if (expected_file_size != file_size_from_file_system) {
514
    // When file is opened during DB Open, the expected file size is from
515
    // manifest. Otherwise it is not guaranteed.
516
0
    return Status::Corruption("Sst file size mismatch between expected " +
517
0
                              std::to_string(expected_file_size) +
518
0
                              " and file system " +
519
0
                              std::to_string(file_size_from_file_system) +
520
0
                              " sstable: " + file->file_name());
521
0
  }
522
523
34.9k
  if (expected_file_size < Footer::kMinEncodedLength) {
524
0
    return Status::Corruption("file is too short (" +
525
0
                              std::to_string(expected_file_size) +
526
0
                              " bytes) to be an "
527
0
                              "sstable: " +
528
0
                              file->file_name());
529
0
  }
530
531
34.9k
  std::array<char, Footer::kMaxEncodedLength + 1> footer_buf;
532
34.9k
  AlignedBuf internal_buf;
533
34.9k
  Slice footer_input;
534
34.9k
  uint64_t read_offset = (expected_file_size > Footer::kMaxEncodedLength)
535
34.9k
                             ? expected_file_size - Footer::kMaxEncodedLength
536
34.9k
                             : 0;
537
  // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now,
538
  // there is no readahead for point lookups, so TryReadFromCache will fail if
539
  // the required data is not in the prefetch buffer. Once deadline is enabled
540
  // for iterator, TryReadFromCache might do a readahead. Revisit to see if we
541
  // need to pass a timeout at that point
542
  // TODO: rate limit footer reads.
543
34.9k
  if (prefetch_buffer == nullptr ||
544
34.8k
      !prefetch_buffer->TryReadFromCache(opts, file, read_offset,
545
34.8k
                                         Footer::kMaxEncodedLength,
546
34.8k
                                         &footer_input, nullptr)) {
547
34.8k
    if (file->use_direct_io()) {
548
0
      s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
549
0
                     &footer_input, nullptr, &internal_buf);
550
34.8k
    } else {
551
34.8k
      s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
552
34.8k
                     &footer_input, footer_buf.data(), nullptr);
553
34.8k
    }
554
34.8k
    if (!s.ok()) {
555
0
      return s;
556
0
    }
557
34.8k
  }
558
559
34.9k
  TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input);
560
561
  // Check that we actually read the whole footer from the file.
562
34.9k
  if (footer_input.size() < Footer::kMinEncodedLength) {
563
0
    return Status::Corruption(
564
0
        "The number of bytes read for Footer input " +
565
0
        std::to_string(footer_input.size()) +
566
0
        " is smaller than minimum footer encoded length: " +
567
0
        std::to_string(Footer::kMinEncodedLength) + " for file " +
568
0
        file->file_name() + "\n");
569
0
  }
570
571
34.9k
  s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number);
572
34.9k
  if (!s.ok()) {
573
0
    s = Status::CopyAppendMessage(s, " in ", file->file_name());
574
0
    return s;
575
0
  }
576
34.9k
  return Status::OK();
577
34.9k
}
578
579
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
580
                          FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
581
                          uint64_t expected_file_size, Footer* footer,
582
                          uint64_t enforce_table_magic_number,
583
34.9k
                          Statistics* stats) {
584
34.9k
  Status s = ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer,
585
34.9k
                                        expected_file_size, footer,
586
34.9k
                                        enforce_table_magic_number);
587
34.9k
  if (s.IsCorruption() &&
588
0
      CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) {
589
0
    IOOptions new_opts = opts;
590
0
    new_opts.verify_and_reconstruct_read = true;
591
0
    footer->Reset();
592
0
    s = ReadFooterFromFileInternal(new_opts, file, fs,
593
0
                                   /*prefetch_buffer=*/nullptr,
594
0
                                   expected_file_size, footer,
595
0
                                   enforce_table_magic_number);
596
0
    RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT);
597
0
    if (s.ok()) {
598
0
      RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
599
0
    }
600
0
  }
601
34.9k
  return s;
602
34.9k
}
603
604
namespace {
605
// Custom handling for the last byte of a block, to avoid invoking streaming
606
// API to get an effective block checksum. This function is its own inverse
607
// because it uses xor.
608
182k
inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) {
609
  // This strategy bears some resemblance to extending a CRC checksum by one
610
  // more byte, except we don't need to re-mix the input checksum as long as
611
  // we do this step only once (per checksum).
612
182k
  const uint32_t kRandomPrime = 0x6b9083d9;
613
182k
  return checksum ^ lossless_cast<uint8_t>(last_byte) * kRandomPrime;
614
182k
}
615
}  // namespace
616
617
uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
618
151k
                                size_t data_size) {
619
151k
  switch (type) {
620
0
    case kCRC32c:
621
0
      return crc32c::Mask(crc32c::Value(data, data_size));
622
0
    case kxxHash:
623
0
      return XXH32(data, data_size, /*seed*/ 0);
624
0
    case kxxHash64:
625
0
      return Lower32of64(XXH64(data, data_size, /*seed*/ 0));
626
151k
    case kXXH3: {
627
151k
      if (data_size == 0) {
628
        // Special case because of special handling for last byte, not
629
        // present in this case. Can be any value different from other
630
        // small input size checksums.
631
0
        return 0;
632
151k
      } else {
633
        // See corresponding code in ComputeBuiltinChecksumWithLastByte
634
151k
        uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1));
635
151k
        return ModifyChecksumForLastByte(v, data[data_size - 1]);
636
151k
      }
637
151k
    }
638
0
    default:  // including kNoChecksum
639
0
      return 0;
640
151k
  }
641
151k
}
642
643
uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
644
31.7k
                                            size_t data_size, char last_byte) {
645
31.7k
  switch (type) {
646
0
    case kCRC32c: {
647
0
      uint32_t crc = crc32c::Value(data, data_size);
648
      // Extend to cover last byte (compression type)
649
0
      crc = crc32c::Extend(crc, &last_byte, 1);
650
0
      return crc32c::Mask(crc);
651
0
    }
652
0
    case kxxHash: {
653
0
      XXH32_state_t* const state = XXH32_createState();
654
0
      XXH32_reset(state, 0);
655
0
      XXH32_update(state, data, data_size);
656
      // Extend to cover last byte (compression type)
657
0
      XXH32_update(state, &last_byte, 1);
658
0
      uint32_t v = XXH32_digest(state);
659
0
      XXH32_freeState(state);
660
0
      return v;
661
0
    }
662
0
    case kxxHash64: {
663
0
      XXH64_state_t* const state = XXH64_createState();
664
0
      XXH64_reset(state, 0);
665
0
      XXH64_update(state, data, data_size);
666
      // Extend to cover last byte (compression type)
667
0
      XXH64_update(state, &last_byte, 1);
668
0
      uint32_t v = Lower32of64(XXH64_digest(state));
669
0
      XXH64_freeState(state);
670
0
      return v;
671
0
    }
672
31.7k
    case kXXH3: {
673
      // XXH3 is a complicated hash function that is extremely fast on
674
      // contiguous input, but that makes its streaming support rather
675
      // complex. It is worth custom handling of the last byte (`type`)
676
      // in order to avoid allocating a large state object and bringing
677
      // that code complexity into CPU working set.
678
31.7k
      uint32_t v = Lower32of64(XXH3_64bits(data, data_size));
679
31.7k
      return ModifyChecksumForLastByte(v, last_byte);
680
0
    }
681
0
    default:  // including kNoChecksum
682
0
      return 0;
683
31.7k
  }
684
31.7k
}
685
686
Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
687
                           BlockContents* out_contents,
688
                           const ImmutableOptions& ioptions,
689
0
                           MemoryAllocator* allocator) {
690
0
  assert(args.compression_type != kNoCompression && "Invalid compression type");
691
692
0
  StopWatchNano timer(ioptions.clock,
693
0
                      ShouldReportDetailedTime(ioptions.env, ioptions.stats));
694
695
0
  Status s = decompressor.ExtractUncompressedSize(args);
696
0
  if (UNLIKELY(!s.ok())) {
697
0
    return s;
698
0
  }
699
0
  CacheAllocationPtr ubuf = AllocateBlock(args.uncompressed_size, allocator);
700
0
  s = decompressor.DecompressBlock(args, ubuf.get());
701
0
  if (UNLIKELY(!s.ok())) {
702
0
    return s;
703
0
  }
704
705
0
  *out_contents = BlockContents(std::move(ubuf), args.uncompressed_size);
706
707
0
  if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) {
708
0
    RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS,
709
0
                          timer.ElapsedNanos());
710
0
  }
711
0
  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM,
712
0
             args.compressed_data.size());
713
0
  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size());
714
0
  RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED);
715
716
0
  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithReturnValue",
717
0
                           static_cast<void*>(&s));
718
0
  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithDecompressionOutput",
719
0
                           static_cast<void*>(out_contents));
720
721
0
  return s;
722
0
}
723
724
Status DecompressBlockData(const char* data, size_t size, CompressionType type,
725
                           Decompressor& decompressor,
726
                           BlockContents* out_contents,
727
                           const ImmutableOptions& ioptions,
728
                           MemoryAllocator* allocator,
729
0
                           Decompressor::ManagedWorkingArea* working_area) {
730
0
  Decompressor::Args args;
731
0
  args.compressed_data = Slice(data, size);
732
0
  args.compression_type = type;
733
0
  args.working_area = working_area;
734
0
  return DecompressBlockData(args, decompressor, out_contents, ioptions,
735
0
                             allocator);
736
0
}
737
738
Status DecompressSerializedBlock(const char* data, size_t size,
739
                                 CompressionType type,
740
                                 Decompressor& decompressor,
741
                                 BlockContents* out_contents,
742
                                 const ImmutableOptions& ioptions,
743
0
                                 MemoryAllocator* allocator) {
744
0
  assert(data[size] != kNoCompression);
745
0
  assert(data[size] == static_cast<char>(type));
746
0
  return DecompressBlockData(data, size, type, decompressor, out_contents,
747
0
                             ioptions, allocator);
748
0
}
749
750
Status DecompressSerializedBlock(Decompressor::Args& args,
751
                                 Decompressor& decompressor,
752
                                 BlockContents* out_contents,
753
                                 const ImmutableOptions& ioptions,
754
0
                                 MemoryAllocator* allocator) {
755
0
  assert(args.compressed_data.data()[args.compressed_data.size()] !=
756
0
         kNoCompression);
757
0
  assert(args.compressed_data.data()[args.compressed_data.size()] ==
758
0
         static_cast<char>(args.compression_type));
759
0
  return DecompressBlockData(args, decompressor, out_contents, ioptions,
760
0
                             allocator);
761
0
}
762
763
// Replace the contents of db_host_id with the actual hostname, if db_host_id
764
// matches the keyword kHostnameForDbHostId
765
8.06k
Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) {
766
8.06k
  assert(db_host_id);
767
8.06k
  if (*db_host_id == kHostnameForDbHostId) {
768
8.06k
    Status s = env->GetHostNameString(db_host_id);
769
8.06k
    if (!s.ok()) {
770
0
      db_host_id->clear();
771
0
    }
772
8.06k
    return s;
773
8.06k
  }
774
775
0
  return Status::OK();
776
8.06k
}
777
}  // namespace ROCKSDB_NAMESPACE