/src/rocksdb/table/format.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #include "table/format.h" |
11 | | |
12 | | #include <cinttypes> |
13 | | #include <cstdint> |
14 | | #include <string> |
15 | | |
16 | | #include "block_fetcher.h" |
17 | | #include "file/random_access_file_reader.h" |
18 | | #include "memory/memory_allocator_impl.h" |
19 | | #include "monitoring/perf_context_imp.h" |
20 | | #include "monitoring/statistics_impl.h" |
21 | | #include "options/options_helper.h" |
22 | | #include "port/likely.h" |
23 | | #include "rocksdb/env.h" |
24 | | #include "rocksdb/options.h" |
25 | | #include "rocksdb/table.h" |
26 | | #include "table/block_based/block.h" |
27 | | #include "table/block_based/block_based_table_reader.h" |
28 | | #include "table/persistent_cache_helper.h" |
29 | | #include "unique_id_impl.h" |
30 | | #include "util/cast_util.h" |
31 | | #include "util/coding.h" |
32 | | #include "util/compression.h" |
33 | | #include "util/crc32c.h" |
34 | | #include "util/hash.h" |
35 | | #include "util/stop_watch.h" |
36 | | #include "util/string_util.h" |
37 | | #include "util/xxhash.h" |
38 | | |
39 | | namespace ROCKSDB_NAMESPACE { |
40 | | |
// Keyword value for db_host_id; ReifyDbHostIdProperty() in this file compares
// against it and, on a match, substitutes the host name reported by the Env.
const char* kHostnameForDbHostId{"__hostname__"};
42 | | |
43 | 8.71k | bool ShouldReportDetailedTime(Env* env, Statistics* stats) { |
44 | 8.71k | return env != nullptr && stats != nullptr && |
45 | 0 | stats->get_stats_level() > kExceptDetailedTimers; |
46 | 8.71k | } |
47 | | |
48 | 23.8k | void BlockHandle::EncodeTo(std::string* dst) const { |
49 | | // Sanity check that all fields have been set |
50 | 23.8k | assert(offset_ != ~uint64_t{0}); |
51 | 23.8k | assert(size_ != ~uint64_t{0}); |
52 | 23.8k | PutVarint64Varint64(dst, offset_, size_); |
53 | 23.8k | } |
54 | | |
55 | 0 | char* BlockHandle::EncodeTo(char* dst) const { |
56 | | // Sanity check that all fields have been set |
57 | 0 | assert(offset_ != ~uint64_t{0}); |
58 | 0 | assert(size_ != ~uint64_t{0}); |
59 | 0 | char* cur = EncodeVarint64(dst, offset_); |
60 | 0 | cur = EncodeVarint64(cur, size_); |
61 | 0 | return cur; |
62 | 0 | } |
63 | | |
64 | 80.3k | Status BlockHandle::DecodeFrom(Slice* input) { |
65 | 80.5k | if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { |
66 | 80.5k | return Status::OK(); |
67 | 18.4E | } else { |
68 | | // reset in case failure after partially decoding |
69 | 18.4E | offset_ = 0; |
70 | 18.4E | size_ = 0; |
71 | 18.4E | return Status::Corruption("bad block handle"); |
72 | 18.4E | } |
73 | 80.3k | } |
74 | | |
75 | 0 | Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { |
76 | 0 | if (GetVarint64(input, &size_)) { |
77 | 0 | offset_ = _offset; |
78 | 0 | return Status::OK(); |
79 | 0 | } else { |
80 | | // reset in case failure after partially decoding |
81 | 0 | offset_ = 0; |
82 | 0 | size_ = 0; |
83 | 0 | return Status::Corruption("bad block handle"); |
84 | 0 | } |
85 | 0 | } |
86 | | |
87 | | // Return a string that contains the copy of handle. |
88 | 0 | std::string BlockHandle::ToString(bool hex) const { |
89 | 0 | std::string handle_str; |
90 | 0 | EncodeTo(&handle_str); |
91 | 0 | if (hex) { |
92 | 0 | return Slice(handle_str).ToString(true); |
93 | 0 | } else { |
94 | 0 | return handle_str; |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | | const BlockHandle BlockHandle::kNullBlockHandle(0, 0); |
99 | | |
100 | | void IndexValue::EncodeTo(std::string* dst, bool have_first_key, |
101 | 7.93k | const BlockHandle* previous_handle) const { |
102 | 7.93k | if (previous_handle) { |
103 | | // WART: this is specific to Block-based table |
104 | 1 | assert(handle.offset() == previous_handle->offset() + |
105 | 1 | previous_handle->size() + |
106 | 1 | BlockBasedTable::kBlockTrailerSize); |
107 | 1 | PutVarsignedint64(dst, handle.size() - previous_handle->size()); |
108 | 7.93k | } else { |
109 | 7.93k | handle.EncodeTo(dst); |
110 | 7.93k | } |
111 | 7.93k | assert(dst->size() != 0); |
112 | | |
113 | 7.93k | if (have_first_key) { |
114 | 0 | PutLengthPrefixedSlice(dst, first_internal_key); |
115 | 0 | } |
116 | 7.93k | } |
117 | | |
118 | | Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, |
119 | 10.9k | const BlockHandle* previous_handle) { |
120 | 10.9k | if (previous_handle) { |
121 | 0 | int64_t delta; |
122 | 0 | if (!GetVarsignedint64(input, &delta)) { |
123 | 0 | return Status::Corruption("bad delta-encoded index value"); |
124 | 0 | } |
125 | | // WART: this is specific to Block-based table |
126 | 0 | handle = BlockHandle(previous_handle->offset() + previous_handle->size() + |
127 | 0 | BlockBasedTable::kBlockTrailerSize, |
128 | 0 | previous_handle->size() + delta); |
129 | 10.9k | } else { |
130 | 10.9k | Status s = handle.DecodeFrom(input); |
131 | 10.9k | if (!s.ok()) { |
132 | 0 | return s; |
133 | 0 | } |
134 | 10.9k | } |
135 | | |
136 | 10.9k | if (!have_first_key) { |
137 | 10.9k | first_internal_key = Slice(); |
138 | 10.9k | } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { |
139 | 0 | return Status::Corruption("bad first key in block info"); |
140 | 0 | } |
141 | | |
142 | 10.9k | return Status::OK(); |
143 | 10.9k | } |
144 | | |
145 | 0 | std::string IndexValue::ToString(bool hex, bool have_first_key) const { |
146 | 0 | std::string s; |
147 | 0 | EncodeTo(&s, have_first_key, nullptr); |
148 | 0 | if (hex) { |
149 | 0 | return Slice(s).ToString(true); |
150 | 0 | } else { |
151 | 0 | return s; |
152 | 0 | } |
153 | 0 | } |
154 | | |
155 | | namespace { |
156 | 34.3k | inline bool IsLegacyFooterFormat(uint64_t magic_number) { |
157 | 34.3k | return magic_number == kLegacyPlainTableMagicNumber; |
158 | 34.3k | } |
159 | | // Used when reading format_version=0 footers (plain tables) |
160 | 0 | inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { |
161 | 0 | if (magic_number == kLegacyPlainTableMagicNumber) { |
162 | 0 | return kPlainTableMagicNumber; |
163 | 0 | } |
164 | 0 | assert(false); |
165 | 0 | return magic_number; |
166 | 0 | } |
167 | | // Used by plain tables to write format_version=0 footers |
168 | 0 | inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) { |
169 | 0 | if (magic_number == kPlainTableMagicNumber) { |
170 | 0 | return kLegacyPlainTableMagicNumber; |
171 | 0 | } |
172 | 0 | assert(false); |
173 | 0 | return magic_number; |
174 | 0 | } |
175 | 42.6k | inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) { |
176 | 42.6k | if (magic_number == kBlockBasedTableMagicNumber) { |
177 | 42.6k | return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize); |
178 | 18.4E | } else { |
179 | 18.4E | return 0; |
180 | 18.4E | } |
181 | 42.6k | } |
182 | | |
183 | | // NOTE: format_version 0 is still used by plain tables and format_version 1 by |
184 | | // cuckoo table. For block-based tables, format_version < 2 is no longer |
185 | | // supported for reading or writing. Legacy magic numbers on block-based tables |
186 | | // are used only for good error reporting. |
187 | | // |
188 | | // Footer format, in three parts: |
189 | | // * Part1 |
190 | | // -> format_version == 0 (inferred from legacy magic number) |
191 | | // <empty> (0 bytes) |
192 | | // -> format_version >= 1 |
193 | | // checksum type (char, 1 byte) |
194 | | // * Part2 |
195 | | // -> format_version <= 5 |
196 | | // metaindex handle (varint64 offset, varint64 size) |
197 | | // index handle (varint64 offset, varint64 size) |
198 | | // <zero padding> for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40 |
199 | | // - This padding is unchecked/ignored |
200 | | // -> format_version >= 6 |
201 | | // extended magic number (4 bytes) = 0x3e 0x00 0x7a 0x00 |
202 | | // - Also surely invalid (size 0) handles if interpreted as older version |
203 | | // - (Helps ensure a corrupted format_version doesn't get us far with no |
204 | | // footer checksum.) |
205 | | // footer_checksum (uint32LE, 4 bytes) |
206 | | // - Checksum of above checksum type of whole footer, with this field |
207 | | // set to all zeros. |
208 | | // base_context_checksum (uint32LE, 4 bytes) |
209 | | // metaindex block size (uint32LE, 4 bytes) |
210 | | // - Assumed to be immediately before footer, < 4GB |
211 | | // <zero padding> (24 bytes, reserved for future use) |
212 | | // - Brings part2 size also to 40 bytes |
213 | | // - Checked that last eight bytes == 0, so reserved for a future |
214 | | // incompatible feature (but under format_version=6) |
215 | | // * Part3 |
216 | | // -> format_version == 0 (inferred from legacy magic number) |
217 | | // legacy magic number (8 bytes) |
218 | | // -> format_version >= 1 (inferred from NOT legacy magic number) |
219 | | // format_version (uint32LE, 4 bytes), also called "footer version" |
220 | | // newer magic number (8 bytes) |
221 | | const std::array<char, 4> kExtendedMagic{{0x3e, 0x00, 0x7a, 0x00}}; |
222 | | constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength; |
223 | | } // namespace |
224 | | |
225 | | Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, |
226 | | uint64_t footer_offset, ChecksumType checksum_type, |
227 | | const BlockHandle& metaindex_handle, |
228 | | const BlockHandle& index_handle, |
229 | 7.93k | uint32_t base_context_checksum) { |
230 | 7.93k | assert(magic_number != Footer::kNullTableMagicNumber); |
231 | 7.93k | assert(IsSupportedFormatVersionForWrite(magic_number, format_version) || |
232 | 7.93k | TEST_AllowUnsupportedFormatVersion()); |
233 | | |
234 | 7.93k | char* part2; |
235 | 7.93k | char* part3; |
236 | 7.93k | if (format_version > 0) { |
237 | 7.93k | slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength); |
238 | | // Generate parts 1 and 3 |
239 | 7.93k | char* cur = data_.data(); |
240 | | // Part 1 |
241 | 7.93k | *(cur++) = checksum_type; |
242 | | // Part 2 |
243 | 7.93k | part2 = cur; |
244 | | // Skip over part 2 for now |
245 | 7.93k | cur += kFooterPart2Size; |
246 | | // Part 3 |
247 | 7.93k | part3 = cur; |
248 | 7.93k | EncodeFixed32(cur, format_version); |
249 | 7.93k | cur += 4; |
250 | 7.93k | EncodeFixed64(cur, magic_number); |
251 | 7.93k | assert(cur + 8 == slice_.data() + slice_.size()); |
252 | 7.93k | } else { |
253 | | // format_version == 0 is used by plain tables |
254 | 0 | slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength); |
255 | | // Legacy SST files use kCRC32c checksum but it's not stored in footer. |
256 | 0 | assert(checksum_type == kNoChecksum || checksum_type == kCRC32c); |
257 | | // Generate part 3 (part 1 empty, skip part 2 for now) |
258 | 0 | part2 = data_.data(); |
259 | 0 | part3 = part2 + kFooterPart2Size; |
260 | 0 | char* cur = part3; |
261 | | // Use legacy magic numbers to indicate format_version=0, for |
262 | | // compatibility. No other cases should use format_version=0. |
263 | 0 | EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number)); |
264 | 0 | assert(cur + 8 == slice_.data() + slice_.size()); |
265 | 0 | } |
266 | | |
267 | 7.93k | if (format_version >= 6) { |
268 | 7.93k | if (BlockTrailerSizeForMagicNumber(magic_number) != 0) { |
269 | | // base context checksum required for table formats with block checksums |
270 | 7.93k | assert(base_context_checksum != 0); |
271 | 7.93k | assert(ChecksumModifierForContext(base_context_checksum, 0) != 0); |
272 | 7.93k | } else { |
273 | | // base context checksum not used |
274 | 0 | assert(base_context_checksum == 0); |
275 | 0 | assert(ChecksumModifierForContext(base_context_checksum, 0) == 0); |
276 | 0 | } |
277 | | |
278 | | // Start populating Part 2 |
279 | 7.93k | char* cur = data_.data() + /* part 1 size */ 1; |
280 | | // Set extended magic of part2 |
281 | 7.93k | std::copy(kExtendedMagic.begin(), kExtendedMagic.end(), cur); |
282 | 7.93k | cur += kExtendedMagic.size(); |
283 | | // Fill checksum data with zeros (for later computing checksum) |
284 | 7.93k | char* checksum_data = cur; |
285 | 7.93k | EncodeFixed32(cur, 0); |
286 | 7.93k | cur += 4; |
287 | | // Save base context checksum |
288 | 7.93k | EncodeFixed32(cur, base_context_checksum); |
289 | 7.93k | cur += 4; |
290 | | // Compute and save metaindex size |
291 | 7.93k | uint32_t metaindex_size = static_cast<uint32_t>(metaindex_handle.size()); |
292 | 7.93k | if (metaindex_size != metaindex_handle.size()) { |
293 | 0 | return Status::NotSupported("Metaindex block size > 4GB"); |
294 | 0 | } |
295 | | // Metaindex must be adjacent to footer |
296 | 7.93k | assert(metaindex_size == 0 || |
297 | 7.93k | metaindex_handle.offset() + metaindex_handle.size() == |
298 | 7.93k | footer_offset - BlockTrailerSizeForMagicNumber(magic_number)); |
299 | 7.93k | EncodeFixed32(cur, metaindex_size); |
300 | 7.93k | cur += 4; |
301 | | |
302 | | // Zero pad remainder (for future use) |
303 | 7.93k | std::fill_n(cur, 24U, char{0}); |
304 | 7.93k | assert(cur + 24 == part3); |
305 | | |
306 | | // Compute checksum, add context |
307 | 7.93k | uint32_t checksum = ComputeBuiltinChecksum( |
308 | 7.93k | checksum_type, data_.data(), Footer::kNewVersionsEncodedLength); |
309 | 7.93k | checksum += |
310 | 7.93k | ChecksumModifierForContext(base_context_checksum, footer_offset); |
311 | | // Store it |
312 | 7.93k | EncodeFixed32(checksum_data, checksum); |
313 | 7.93k | } else { |
314 | | // Base context checksum not used |
315 | 0 | assert(!FormatVersionUsesContextChecksum(format_version)); |
316 | | // Should be left empty |
317 | 0 | assert(base_context_checksum == 0); |
318 | 0 | assert(ChecksumModifierForContext(base_context_checksum, 0) == 0); |
319 | | |
320 | | // Populate all of part 2 |
321 | 0 | char* cur = part2; |
322 | 0 | cur = metaindex_handle.EncodeTo(cur); |
323 | 0 | cur = index_handle.EncodeTo(cur); |
324 | | // Zero pad remainder |
325 | 0 | std::fill(cur, part3, char{0}); |
326 | 0 | } |
327 | 7.93k | return Status::OK(); |
328 | 7.93k | } |
329 | | |
330 | | Status Footer::DecodeFrom(Slice input, uint64_t input_offset, |
331 | 34.8k | uint64_t enforce_table_magic_number) { |
332 | | // Only decode to unused Footer |
333 | 34.8k | assert(table_magic_number_ == kNullTableMagicNumber); |
334 | 34.8k | assert(input != nullptr); |
335 | 34.8k | assert(input.size() >= kMinEncodedLength); |
336 | | |
337 | 34.8k | const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte; |
338 | 34.8k | uint64_t magic = DecodeFixed64(magic_ptr); |
339 | | |
340 | | // Legacy block-based tables (format_version < 2) are no longer supported. |
341 | | // (This constant is only used here and in the corresponding test.) |
342 | 34.8k | if (magic == 0xdb4775248b80fb57ull) { |
343 | 0 | return Status::NotSupported( |
344 | 0 | "Unsupported legacy magic number for block-based SST format. Load with " |
345 | 0 | "RocksDB >= 4.6.0 and < 11.0.0 and run full compaction to upgrade."); |
346 | 0 | } |
347 | | |
348 | | // Check for legacy formats |
349 | 34.8k | bool legacy = IsLegacyFooterFormat(magic); |
350 | 34.8k | if (legacy) { |
351 | | // Legacy plain tables are still supported - upconvert magic |
352 | 0 | magic = UpconvertLegacyFooterFormat(magic); |
353 | 0 | } |
354 | 34.8k | if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) { |
355 | 0 | return Status::Corruption("Bad table magic number: expected " + |
356 | 0 | std::to_string(enforce_table_magic_number) + |
357 | 0 | ", found " + std::to_string(magic)); |
358 | 0 | } |
359 | 34.8k | table_magic_number_ = magic; |
360 | 34.8k | block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic); |
361 | | |
362 | | // Parse Part3 |
363 | 34.8k | const char* part3_ptr = magic_ptr; |
364 | 34.8k | uint32_t computed_checksum = 0; |
365 | 34.8k | uint64_t footer_offset = 0; |
366 | 34.8k | if (legacy) { |
367 | | // Legacy format (format_version=0, used by plain tables) |
368 | | // The size is already asserted to be at least kMinEncodedLength |
369 | | // at the beginning of the function |
370 | 0 | input.remove_prefix(input.size() - kVersion0EncodedLength); |
371 | 0 | format_version_ = 0 /* legacy */; |
372 | 0 | checksum_type_ = kCRC32c; |
373 | 34.8k | } else { |
374 | 34.8k | part3_ptr = magic_ptr - 4; |
375 | 34.8k | format_version_ = DecodeFixed32(part3_ptr); |
376 | 34.8k | if (UNLIKELY(!IsSupportedFormatVersionForRead(magic, format_version_) && |
377 | 34.8k | !TEST_AllowUnsupportedFormatVersion())) { |
378 | 0 | return Status::Corruption("Corrupt or unsupported format_version " + |
379 | 0 | std::to_string(format_version_) + |
380 | 0 | " for magic " + std::to_string(magic)); |
381 | 0 | } |
382 | | // All known format versions >= 1 occupy exactly this many bytes. |
383 | 34.8k | if (UNLIKELY(input.size() < kNewVersionsEncodedLength)) { |
384 | 0 | return Status::Corruption("Input is too short to be an SST file"); |
385 | 0 | } |
386 | 34.8k | uint64_t adjustment = input.size() - kNewVersionsEncodedLength; |
387 | 34.8k | input.remove_prefix(adjustment); |
388 | 34.8k | footer_offset = input_offset + adjustment; |
389 | | |
390 | | // Parse Part1 |
391 | 34.8k | char chksum = input.data()[0]; |
392 | 34.8k | checksum_type_ = lossless_cast<ChecksumType>(chksum); |
393 | 34.8k | if (UNLIKELY(!IsSupportedChecksumType(checksum_type()))) { |
394 | 0 | return Status::Corruption("Corrupt or unsupported checksum type: " + |
395 | 0 | std::to_string(lossless_cast<uint8_t>(chksum))); |
396 | 0 | } |
397 | | // This is the most convenient place to compute the checksum |
398 | 34.8k | if (checksum_type_ != kNoChecksum && format_version_ >= 6) { |
399 | 34.4k | std::array<char, kNewVersionsEncodedLength> copy_without_checksum; |
400 | 34.4k | std::copy_n(input.data(), kNewVersionsEncodedLength, |
401 | 34.4k | copy_without_checksum.data()); |
402 | 34.4k | EncodeFixed32(©_without_checksum[5], 0); // Clear embedded checksum |
403 | 34.4k | computed_checksum = |
404 | 34.4k | ComputeBuiltinChecksum(checksum_type(), copy_without_checksum.data(), |
405 | 34.4k | kNewVersionsEncodedLength); |
406 | 34.4k | } |
407 | | // Consume checksum type field |
408 | 34.8k | input.remove_prefix(1); |
409 | 34.8k | } |
410 | | |
411 | | // Parse Part2 |
412 | 34.8k | if (format_version_ >= 6) { |
413 | 34.0k | Slice ext_magic(input.data(), 4); |
414 | 34.0k | if (UNLIKELY(ext_magic.compare(Slice(kExtendedMagic.data(), |
415 | 34.0k | kExtendedMagic.size())) != 0)) { |
416 | 0 | return Status::Corruption("Bad extended magic number: 0x" + |
417 | 0 | ext_magic.ToString(/*hex*/ true)); |
418 | 0 | } |
419 | 34.0k | input.remove_prefix(4); |
420 | 34.0k | uint32_t stored_checksum = 0, metaindex_size = 0; |
421 | 34.0k | bool success; |
422 | 34.0k | success = GetFixed32(&input, &stored_checksum); |
423 | 34.0k | assert(success); |
424 | 34.0k | success = GetFixed32(&input, &base_context_checksum_); |
425 | 34.0k | assert(success); |
426 | 34.0k | if (UNLIKELY(ChecksumModifierForContext(base_context_checksum_, 0) == 0)) { |
427 | 0 | return Status::Corruption("Invalid base context checksum"); |
428 | 0 | } |
429 | 34.0k | computed_checksum += |
430 | 34.0k | ChecksumModifierForContext(base_context_checksum_, footer_offset); |
431 | 34.0k | if (UNLIKELY(computed_checksum != stored_checksum)) { |
432 | 0 | return Status::Corruption("Footer at " + std::to_string(footer_offset) + |
433 | 0 | " checksum mismatch"); |
434 | 0 | } |
435 | 34.0k | success = GetFixed32(&input, &metaindex_size); |
436 | 34.0k | assert(success); |
437 | 34.0k | (void)success; |
438 | 34.0k | uint64_t metaindex_end = footer_offset - GetBlockTrailerSize(); |
439 | 34.0k | metaindex_handle_ = |
440 | 34.0k | BlockHandle(metaindex_end - metaindex_size, metaindex_size); |
441 | | |
442 | | // Mark unpopulated |
443 | 34.0k | index_handle_ = BlockHandle::NullBlockHandle(); |
444 | | |
445 | | // 16 bytes of unchecked reserved padding |
446 | 34.0k | input.remove_prefix(16U); |
447 | | |
448 | | // 8 bytes of checked reserved padding (expected to be zero unless using a |
449 | | // future feature). |
450 | 34.0k | uint64_t reserved = 0; |
451 | 34.0k | success = GetFixed64(&input, &reserved); |
452 | 34.0k | assert(success); |
453 | 34.0k | if (UNLIKELY(reserved != 0)) { |
454 | 0 | return Status::NotSupported( |
455 | 0 | "File uses a future feature not supported in this version"); |
456 | 0 | } |
457 | | // End of part 2 |
458 | 34.0k | assert(input.data() == part3_ptr); |
459 | 34.0k | } else { |
460 | | // format_version_ < 6 |
461 | 813 | Status result = metaindex_handle_.DecodeFrom(&input); |
462 | 813 | if (result.ok()) { |
463 | 0 | result = index_handle_.DecodeFrom(&input); |
464 | 0 | } |
465 | 813 | if (!result.ok()) { |
466 | 0 | return result; |
467 | 0 | } |
468 | | // Padding in part2 is ignored |
469 | 813 | } |
470 | 34.8k | return Status::OK(); |
471 | 34.8k | } |
472 | | |
473 | 0 | std::string Footer::ToString() const { |
474 | 0 | std::string result; |
475 | 0 | result.reserve(1024); |
476 | |
|
477 | 0 | result.append("metaindex handle: " + metaindex_handle_.ToString() + |
478 | 0 | " offset: " + std::to_string(metaindex_handle_.offset()) + |
479 | 0 | " size: " + std::to_string(metaindex_handle_.size()) + "\n "); |
480 | 0 | result.append("index handle: " + index_handle_.ToString() + |
481 | 0 | " offset: " + std::to_string(index_handle_.offset()) + |
482 | 0 | " size: " + std::to_string(index_handle_.size()) + "\n "); |
483 | 0 | result.append("table_magic_number: " + std::to_string(table_magic_number_) + |
484 | 0 | "\n "); |
485 | 0 | if (!IsLegacyFooterFormat(table_magic_number_)) { |
486 | 0 | result.append("format version: " + std::to_string(format_version_) + "\n"); |
487 | 0 | } |
488 | 0 | return result; |
489 | 0 | } |
490 | | |
// Returns a mutable reference to a function-local flag, initially false.
// The footer encode/decode paths in this file consult it to let otherwise
// unsupported format versions through.
bool& TEST_AllowUnsupportedFormatVersion() {
  static bool allow_unsupported{false};
  return allow_unsupported;
}
495 | | |
496 | | static Status ReadFooterFromFileInternal( |
497 | | const IOOptions& opts, RandomAccessFileReader* file, FileSystem& fs, |
498 | | FilePrefetchBuffer* prefetch_buffer, uint64_t expected_file_size, |
499 | 34.9k | Footer* footer, uint64_t enforce_table_magic_number) { |
500 | 34.9k | uint64_t file_size_from_file_system = 0; |
501 | 34.9k | Status s; |
502 | | // Prefer the more efficient FSRandomAccessFile::GetFileSize when available |
503 | 34.9k | s = file->file()->GetFileSize(&file_size_from_file_system); |
504 | 34.9k | if (!s.ok()) { |
505 | | // Fall back on FileSystem::GetFileSize on failure |
506 | 0 | s = fs.GetFileSize(file->file_name(), IOOptions(), |
507 | 0 | &file_size_from_file_system, nullptr); |
508 | 0 | if (!s.ok()) { |
509 | 0 | return s; |
510 | 0 | } |
511 | 0 | } |
512 | | |
513 | 34.9k | if (expected_file_size != file_size_from_file_system) { |
514 | | // When file is opened during DB Open, the expected file size is from |
515 | | // manifest. Otherwise it is not guaranteed. |
516 | 0 | return Status::Corruption("Sst file size mismatch between expected " + |
517 | 0 | std::to_string(expected_file_size) + |
518 | 0 | " and file system " + |
519 | 0 | std::to_string(file_size_from_file_system) + |
520 | 0 | " sstable: " + file->file_name()); |
521 | 0 | } |
522 | | |
523 | 34.9k | if (expected_file_size < Footer::kMinEncodedLength) { |
524 | 0 | return Status::Corruption("file is too short (" + |
525 | 0 | std::to_string(expected_file_size) + |
526 | 0 | " bytes) to be an " |
527 | 0 | "sstable: " + |
528 | 0 | file->file_name()); |
529 | 0 | } |
530 | | |
531 | 34.9k | std::array<char, Footer::kMaxEncodedLength + 1> footer_buf; |
532 | 34.9k | AlignedBuf internal_buf; |
533 | 34.9k | Slice footer_input; |
534 | 34.9k | uint64_t read_offset = (expected_file_size > Footer::kMaxEncodedLength) |
535 | 34.9k | ? expected_file_size - Footer::kMaxEncodedLength |
536 | 34.9k | : 0; |
537 | | // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now, |
538 | | // there is no readahead for point lookups, so TryReadFromCache will fail if |
539 | | // the required data is not in the prefetch buffer. Once deadline is enabled |
540 | | // for iterator, TryReadFromCache might do a readahead. Revisit to see if we |
541 | | // need to pass a timeout at that point |
542 | | // TODO: rate limit footer reads. |
543 | 34.9k | if (prefetch_buffer == nullptr || |
544 | 34.8k | !prefetch_buffer->TryReadFromCache(opts, file, read_offset, |
545 | 34.8k | Footer::kMaxEncodedLength, |
546 | 34.8k | &footer_input, nullptr)) { |
547 | 34.8k | if (file->use_direct_io()) { |
548 | 0 | s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, |
549 | 0 | &footer_input, nullptr, &internal_buf); |
550 | 34.8k | } else { |
551 | 34.8k | s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, |
552 | 34.8k | &footer_input, footer_buf.data(), nullptr); |
553 | 34.8k | } |
554 | 34.8k | if (!s.ok()) { |
555 | 0 | return s; |
556 | 0 | } |
557 | 34.8k | } |
558 | | |
559 | 34.9k | TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input); |
560 | | |
561 | | // Check that we actually read the whole footer from the file. |
562 | 34.9k | if (footer_input.size() < Footer::kMinEncodedLength) { |
563 | 0 | return Status::Corruption( |
564 | 0 | "The number of bytes read for Footer input " + |
565 | 0 | std::to_string(footer_input.size()) + |
566 | 0 | " is smaller than minimum footer encoded length: " + |
567 | 0 | std::to_string(Footer::kMinEncodedLength) + " for file " + |
568 | 0 | file->file_name() + "\n"); |
569 | 0 | } |
570 | | |
571 | 34.9k | s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number); |
572 | 34.9k | if (!s.ok()) { |
573 | 0 | s = Status::CopyAppendMessage(s, " in ", file->file_name()); |
574 | 0 | return s; |
575 | 0 | } |
576 | 34.9k | return Status::OK(); |
577 | 34.9k | } |
578 | | |
579 | | Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, |
580 | | FileSystem& fs, FilePrefetchBuffer* prefetch_buffer, |
581 | | uint64_t expected_file_size, Footer* footer, |
582 | | uint64_t enforce_table_magic_number, |
583 | 34.9k | Statistics* stats) { |
584 | 34.9k | Status s = ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, |
585 | 34.9k | expected_file_size, footer, |
586 | 34.9k | enforce_table_magic_number); |
587 | 34.9k | if (s.IsCorruption() && |
588 | 0 | CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) { |
589 | 0 | IOOptions new_opts = opts; |
590 | 0 | new_opts.verify_and_reconstruct_read = true; |
591 | 0 | footer->Reset(); |
592 | 0 | s = ReadFooterFromFileInternal(new_opts, file, fs, |
593 | 0 | /*prefetch_buffer=*/nullptr, |
594 | 0 | expected_file_size, footer, |
595 | 0 | enforce_table_magic_number); |
596 | 0 | RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT); |
597 | 0 | if (s.ok()) { |
598 | 0 | RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); |
599 | 0 | } |
600 | 0 | } |
601 | 34.9k | return s; |
602 | 34.9k | } |
603 | | |
604 | | namespace { |
605 | | // Custom handling for the last byte of a block, to avoid invoking streaming |
606 | | // API to get an effective block checksum. This function is its own inverse |
607 | | // because it uses xor. |
608 | 182k | inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) { |
609 | | // This strategy bears some resemblance to extending a CRC checksum by one |
610 | | // more byte, except we don't need to re-mix the input checksum as long as |
611 | | // we do this step only once (per checksum). |
612 | 182k | const uint32_t kRandomPrime = 0x6b9083d9; |
613 | 182k | return checksum ^ lossless_cast<uint8_t>(last_byte) * kRandomPrime; |
614 | 182k | } |
615 | | } // namespace |
616 | | |
617 | | uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, |
618 | 151k | size_t data_size) { |
619 | 151k | switch (type) { |
620 | 0 | case kCRC32c: |
621 | 0 | return crc32c::Mask(crc32c::Value(data, data_size)); |
622 | 0 | case kxxHash: |
623 | 0 | return XXH32(data, data_size, /*seed*/ 0); |
624 | 0 | case kxxHash64: |
625 | 0 | return Lower32of64(XXH64(data, data_size, /*seed*/ 0)); |
626 | 151k | case kXXH3: { |
627 | 151k | if (data_size == 0) { |
628 | | // Special case because of special handling for last byte, not |
629 | | // present in this case. Can be any value different from other |
630 | | // small input size checksums. |
631 | 0 | return 0; |
632 | 151k | } else { |
633 | | // See corresponding code in ComputeBuiltinChecksumWithLastByte |
634 | 151k | uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1)); |
635 | 151k | return ModifyChecksumForLastByte(v, data[data_size - 1]); |
636 | 151k | } |
637 | 151k | } |
638 | 0 | default: // including kNoChecksum |
639 | 0 | return 0; |
640 | 151k | } |
641 | 151k | } |
642 | | |
643 | | uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, |
644 | 31.7k | size_t data_size, char last_byte) { |
645 | 31.7k | switch (type) { |
646 | 0 | case kCRC32c: { |
647 | 0 | uint32_t crc = crc32c::Value(data, data_size); |
648 | | // Extend to cover last byte (compression type) |
649 | 0 | crc = crc32c::Extend(crc, &last_byte, 1); |
650 | 0 | return crc32c::Mask(crc); |
651 | 0 | } |
652 | 0 | case kxxHash: { |
653 | 0 | XXH32_state_t* const state = XXH32_createState(); |
654 | 0 | XXH32_reset(state, 0); |
655 | 0 | XXH32_update(state, data, data_size); |
656 | | // Extend to cover last byte (compression type) |
657 | 0 | XXH32_update(state, &last_byte, 1); |
658 | 0 | uint32_t v = XXH32_digest(state); |
659 | 0 | XXH32_freeState(state); |
660 | 0 | return v; |
661 | 0 | } |
662 | 0 | case kxxHash64: { |
663 | 0 | XXH64_state_t* const state = XXH64_createState(); |
664 | 0 | XXH64_reset(state, 0); |
665 | 0 | XXH64_update(state, data, data_size); |
666 | | // Extend to cover last byte (compression type) |
667 | 0 | XXH64_update(state, &last_byte, 1); |
668 | 0 | uint32_t v = Lower32of64(XXH64_digest(state)); |
669 | 0 | XXH64_freeState(state); |
670 | 0 | return v; |
671 | 0 | } |
672 | 31.7k | case kXXH3: { |
673 | | // XXH3 is a complicated hash function that is extremely fast on |
674 | | // contiguous input, but that makes its streaming support rather |
675 | | // complex. It is worth custom handling of the last byte (`type`) |
676 | | // in order to avoid allocating a large state object and bringing |
677 | | // that code complexity into CPU working set. |
678 | 31.7k | uint32_t v = Lower32of64(XXH3_64bits(data, data_size)); |
679 | 31.7k | return ModifyChecksumForLastByte(v, last_byte); |
680 | 0 | } |
681 | 0 | default: // including kNoChecksum |
682 | 0 | return 0; |
683 | 31.7k | } |
684 | 31.7k | } |
685 | | |
686 | | Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor, |
687 | | BlockContents* out_contents, |
688 | | const ImmutableOptions& ioptions, |
689 | 0 | MemoryAllocator* allocator) { |
690 | 0 | assert(args.compression_type != kNoCompression && "Invalid compression type"); |
691 | |
|
692 | 0 | StopWatchNano timer(ioptions.clock, |
693 | 0 | ShouldReportDetailedTime(ioptions.env, ioptions.stats)); |
694 | |
|
695 | 0 | Status s = decompressor.ExtractUncompressedSize(args); |
696 | 0 | if (UNLIKELY(!s.ok())) { |
697 | 0 | return s; |
698 | 0 | } |
699 | 0 | CacheAllocationPtr ubuf = AllocateBlock(args.uncompressed_size, allocator); |
700 | 0 | s = decompressor.DecompressBlock(args, ubuf.get()); |
701 | 0 | if (UNLIKELY(!s.ok())) { |
702 | 0 | return s; |
703 | 0 | } |
704 | | |
705 | 0 | *out_contents = BlockContents(std::move(ubuf), args.uncompressed_size); |
706 | |
|
707 | 0 | if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { |
708 | 0 | RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, |
709 | 0 | timer.ElapsedNanos()); |
710 | 0 | } |
711 | 0 | RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM, |
712 | 0 | args.compressed_data.size()); |
713 | 0 | RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size()); |
714 | 0 | RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); |
715 | |
|
716 | 0 | TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithReturnValue", |
717 | 0 | static_cast<void*>(&s)); |
718 | 0 | TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithDecompressionOutput", |
719 | 0 | static_cast<void*>(out_contents)); |
720 | |
|
721 | 0 | return s; |
722 | 0 | } |
723 | | |
724 | | Status DecompressBlockData(const char* data, size_t size, CompressionType type, |
725 | | Decompressor& decompressor, |
726 | | BlockContents* out_contents, |
727 | | const ImmutableOptions& ioptions, |
728 | | MemoryAllocator* allocator, |
729 | 0 | Decompressor::ManagedWorkingArea* working_area) { |
730 | 0 | Decompressor::Args args; |
731 | 0 | args.compressed_data = Slice(data, size); |
732 | 0 | args.compression_type = type; |
733 | 0 | args.working_area = working_area; |
734 | 0 | return DecompressBlockData(args, decompressor, out_contents, ioptions, |
735 | 0 | allocator); |
736 | 0 | } |
737 | | |
738 | | Status DecompressSerializedBlock(const char* data, size_t size, |
739 | | CompressionType type, |
740 | | Decompressor& decompressor, |
741 | | BlockContents* out_contents, |
742 | | const ImmutableOptions& ioptions, |
743 | 0 | MemoryAllocator* allocator) { |
744 | 0 | assert(data[size] != kNoCompression); |
745 | 0 | assert(data[size] == static_cast<char>(type)); |
746 | 0 | return DecompressBlockData(data, size, type, decompressor, out_contents, |
747 | 0 | ioptions, allocator); |
748 | 0 | } |
749 | | |
750 | | Status DecompressSerializedBlock(Decompressor::Args& args, |
751 | | Decompressor& decompressor, |
752 | | BlockContents* out_contents, |
753 | | const ImmutableOptions& ioptions, |
754 | 0 | MemoryAllocator* allocator) { |
755 | 0 | assert(args.compressed_data.data()[args.compressed_data.size()] != |
756 | 0 | kNoCompression); |
757 | 0 | assert(args.compressed_data.data()[args.compressed_data.size()] == |
758 | 0 | static_cast<char>(args.compression_type)); |
759 | 0 | return DecompressBlockData(args, decompressor, out_contents, ioptions, |
760 | 0 | allocator); |
761 | 0 | } |
762 | | |
763 | | // Replace the contents of db_host_id with the actual hostname, if db_host_id |
764 | | // matches the keyword kHostnameForDbHostId |
765 | 8.06k | Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { |
766 | 8.06k | assert(db_host_id); |
767 | 8.06k | if (*db_host_id == kHostnameForDbHostId) { |
768 | 8.06k | Status s = env->GetHostNameString(db_host_id); |
769 | 8.06k | if (!s.ok()) { |
770 | 0 | db_host_id->clear(); |
771 | 0 | } |
772 | 8.06k | return s; |
773 | 8.06k | } |
774 | | |
775 | 0 | return Status::OK(); |
776 | 8.06k | } |
777 | | } // namespace ROCKSDB_NAMESPACE |