/src/rocksdb/table/block_based/block.h
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <stddef.h>
#include <stdint.h>

#include <string>
#include <vector>

#include "db/kv_checksum.h"
#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_hash_index.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "test_util/sync_point.h"
#include "util/random.h"
namespace ROCKSDB_NAMESPACE {

struct BlockContents;
class Comparator;
template <class TValue>
class BlockIter;
class DataBlockIter;
class IndexBlockIter;
class MetaBlockIter;
class BlockPrefixIndex;
// BlockReadAmpBitmap maps the ROCKSDB_NAMESPACE::Block data bytes to a
// bitmap, one bit per bytes_per_bit bytes. Whenever we access a range of
// bytes in the Block we update the bitmap and increment
// READ_AMP_ESTIMATE_USEFUL_BYTES.
class BlockReadAmpBitmap {
 public:
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
                              Statistics* statistics)
      : bitmap_(nullptr),
        bytes_per_bit_pow_(0),
        statistics_(statistics),
        rnd_(Random::GetTLSInstance()->Uniform(
            static_cast<int>(bytes_per_bit))) {
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
    assert(block_size > 0 && bytes_per_bit > 0);

    // convert bytes_per_bit to be a power of 2
    while (bytes_per_bit >>= 1) {
      bytes_per_bit_pow_++;
    }

    // num_bits_needed = ceil(block_size / bytes_per_bit)
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
    assert(num_bits_needed > 0);

    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;

    // Create bitmap and set all the bits to 0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();

    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
  }

  ~BlockReadAmpBitmap() { delete[] bitmap_; }

  void Mark(uint32_t start_offset, uint32_t end_offset) {
    assert(end_offset >= start_offset);
    // Index of first bit in mask
    uint32_t start_bit =
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
        bytes_per_bit_pow_;
    // Index of last bit in mask + 1
    uint32_t exclusive_end_bit =
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
    if (start_bit >= exclusive_end_bit) {
      return;
    }
    assert(exclusive_end_bit > 0);

    if (GetAndSet(start_bit) == 0) {
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
                                  << bytes_per_bit_pow_;
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
                 new_useful_bytes);
    }
  }

  Statistics* GetStatistics() {
    return statistics_.load(std::memory_order_relaxed);
  }

  void SetStatistics(Statistics* stats) { statistics_.store(stats); }

  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }

  size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    return malloc_usable_size((void*)this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return sizeof(*this);
  }

 private:
  // Get the current value of bit at `bit_idx` and set it to 1
  inline bool GetAndSet(uint32_t bit_idx) {
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);

    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
           bit_mask;
  }

  const uint32_t kBytesPerEntry = sizeof(uint32_t);   // 4 bytes
  const uint32_t kBitsPerEntry = kBytesPerEntry * 8;  // 32 bits

  // Bitmap used to record the bytes that we read; atomics protect against
  // multiple threads updating the same bit
  std::atomic<uint32_t>* bitmap_;
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize
  // multiplication and division
  uint8_t bytes_per_bit_pow_;
  // Pointer to DB Statistics object. Since this bitmap may outlive the DB,
  // this pointer may be invalid, but the DB will update it to a valid pointer
  // by calling SetStatistics() before calling Mark()
  std::atomic<Statistics*> statistics_;
  uint32_t rnd_;
};
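
// Editor's illustration (not part of the original header): a minimal
// standalone sketch of BlockReadAmpBitmap's arithmetic, assuming rnd_ == 0
// and dropping the Statistics hooks and sync points. ToyReadAmpBitmap and
// ExampleReadAmp are hypothetical names, and <atomic>, <cassert>, <memory>
// are assumed included. It shows the power-of-two rounding of bytes_per_bit,
// the two ceiling divisions that size the bitmap, and the fetch_or-based
// marking that lets a range count as "useful" only once.
#if 0
class ToyReadAmpBitmap {
 public:
  ToyReadAmpBitmap(size_t block_size, size_t bytes_per_bit) {
    // Round bytes_per_bit down to a power of two, e.g. 32 -> pow_ = 5.
    while (bytes_per_bit >>= 1) {
      pow_++;
    }
    // bits = ceil(block_size / 2^pow_); words = ceil(bits / 32).
    size_t bits = ((block_size - 1) >> pow_) + 1;
    bitmap_.reset(new std::atomic<uint32_t>[(bits - 1) / 32 + 1]());
  }

  // Mark [start, end] as read. Returns the newly "useful" byte estimate,
  // or 0 if the leading bit of the range was already set.
  uint32_t Mark(uint32_t start, uint32_t end) {
    uint32_t start_bit = (start + (1u << pow_) - 1) >> pow_;
    uint32_t end_bit_excl = (end + (1u << pow_)) >> pow_;
    if (start_bit >= end_bit_excl) {
      return 0;
    }
    uint32_t mask = 1u << (start_bit % 32);
    bool was_set =
        bitmap_[start_bit / 32].fetch_or(mask, std::memory_order_relaxed) &
        mask;
    return was_set ? 0 : (end_bit_excl - start_bit) << pow_;
  }

 private:
  std::unique_ptr<std::atomic<uint32_t>[]> bitmap_;
  uint8_t pow_ = 0;
};

inline void ExampleReadAmp() {
  ToyReadAmpBitmap bm(/*block_size=*/4096, /*bytes_per_bit=*/32);
  uint32_t first = bm.Mark(0, 127);  // 128: bits 0..3 newly covered
  uint32_t again = bm.Mark(0, 127);  // 0: leading bit already set
  assert(first == 128 && again == 0);
}
#endif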

// class Block is the uncompressed and "parsed" form for blocks containing
// key-value pairs. (See BlockContents comments for more on terminology.)
// This includes the in-memory representation of data blocks, index blocks
// (including partitions), range deletion blocks, properties blocks, metaindex
// blocks, as well as the top level of the partitioned filter structure (which
// is actually an index of the filter partitions). It is NOT suitable for
// compressed blocks in general, filter blocks/partitions, or compression
// dictionaries.
//
// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
// for details of the format and the various block types.
//
// TODO: Rename to ParsedKvBlock?
class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0,
                 Statistics* statistics = nullptr);
  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;

  ~Block();

  size_t size() const { return size_; }
  const char* data() const { return data_; }
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
  bool own_bytes() const { return contents_.own_bytes(); }

  BlockBasedTableOptions::DataBlockIndexType IndexType() const;

  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
  // comparator.
  //
  // If iter is null, returns a new Iterator.
  // If iter is not null, updates it and returns it as an Iterator*.
  //
  // Updates read_amp_bitmap_ if it is not nullptr.
  //
  // If `block_contents_pinned` is true, the caller will guarantee that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds a cache handle or ownership of some
  // resource and releases them in a release function, or the caller is sure
  // that the data will not go away (for example, it's from an mmapped file
  // which will not be closed).
  //
  // `user_defined_timestamps_persisted` controls whether a min timestamp is
  // padded while the key is being parsed from the block.
  //
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
  // the iterator will simply be set as "invalid", rather than returning
  // the key that is just past the target key.
  DataBlockIter* NewDataIterator(const Comparator* raw_ucmp,
                                 SequenceNumber global_seqno,
                                 DataBlockIter* iter = nullptr,
                                 Statistics* stats = nullptr,
                                 bool block_contents_pinned = false,
                                 bool user_defined_timestamps_persisted = true);

  // Returns a MetaBlockIter for iterating over blocks containing metadata
  // (like Properties blocks). Unlike data blocks, the keys for these blocks
  // do not contain sequence numbers, do not use a user-defined comparator, and
  // do not track read amplification/statistics. Additionally, MetaBlocks will
  // not assert if the block is formatted improperly.
  //
  // If `block_contents_pinned` is true, the caller will guarantee that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds a cache handle or ownership of some
  // resource and releases them in a release function, or the caller is sure
  // that the data will not go away (for example, it's from an mmapped file
  // which will not be closed).
  MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false);

  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
  // comparator.
  //
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  //
  // If `prefix_index` is not nullptr this block will do hash lookup for the
  // key prefix. If total_order_seek is true, prefix_index_ is ignored.
  //
  // `have_first_key` controls whether IndexValue will contain
  // first_internal_key. It affects the data serialization format, so the same
  // `have_first_key` value must be used when writing and reading the index.
  // It is determined by the IndexType property of the table.
  // `user_defined_timestamps_persisted` controls whether a min timestamp is
  // padded while the key is being parsed from the block.
  IndexBlockIter* NewIndexIterator(
      const Comparator* raw_ucmp, SequenceNumber global_seqno,
      IndexBlockIter* iter, Statistics* stats, bool total_order_seek,
      bool have_first_key, bool key_includes_seq, bool value_is_full,
      bool block_contents_pinned = false,
      bool user_defined_timestamps_persisted = true,
      BlockPrefixIndex* prefix_index = nullptr);

  // Report an approximation of how much memory has been used.
  size_t ApproximateMemoryUsage() const;

  // For TypedCacheInterface
  const Slice& ContentSlice() const { return contents_.data; }

  // Initializes per key-value checksum protection.
  // After this method is called, each DataBlockIterator returned
  // by NewDataIterator will verify per key-value checksum for any key it read.
  void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
                                         const Comparator* raw_ucmp);

  // Initializes per key-value checksum protection.
  // After this method is called, each IndexBlockIterator returned
  // by NewIndexIterator will verify per key-value checksum for any key it read.
  // value_is_full and index_has_first_key are needed to be able to parse
  // the index block content and construct checksums.
  void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
                                          const Comparator* raw_ucmp,
                                          bool value_is_full,
                                          bool index_has_first_key);

  // Initializes per key-value checksum protection.
  // After this method is called, each MetaBlockIter returned
  // by NewMetaIterator will verify per key-value checksum for any key it read.
  void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key);

  static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len,
                                 const Slice& key, const Slice& value) {
    ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr);
  }

  const char* TEST_GetKVChecksum() const { return kv_checksum_; }

 private:
  BlockContents contents_;
  const char* data_;         // contents_.data.data()
  size_t size_;              // contents_.data.size()
  uint32_t restart_offset_;  // Offset in data_ of restart array
  uint32_t num_restarts_;
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
  char* kv_checksum_{nullptr};
  uint32_t checksum_size_{0};
  // Used by block iterators to calculate current key index within a block
  uint32_t block_restart_interval_{0};
  uint8_t protection_bytes_per_key_{0};
  DataBlockHashIndex data_block_hash_index_;
};
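
// Editor's illustration (not part of the original header): the typical read
// path over a data block using the API above. ExampleScanDataBlock is a
// hypothetical helper; it assumes `contents` holds an uncompressed block and
// uses BytewiseComparator() in place of the table's configured user
// comparator. The caller owns the iterator returned by NewDataIterator when
// no reusable `iter` is passed in.
#if 0
inline void ExampleScanDataBlock(BlockContents&& contents) {
  Block block(std::move(contents));
  std::unique_ptr<DataBlockIter> iter{block.NewDataIterator(
      BytewiseComparator(), kDisableGlobalSequenceNumber)};
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    Slice key = iter->key();      // internal key: user key + seqno/type tag
    Slice value = iter->value();  // value bytes, valid while the block lives
    (void)key;
    (void)value;
  }
  assert(iter->status().ok());  // distinguishes "end of block" from corruption
}
#endif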

// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The
// format of this data buffer is an uncompressed, sorted sequence of key-value
// pairs (see `Block` API for more details).
//
// Notably, the keys may either be in internal key format or user key format.
// Subclasses are responsible for configuring the key format.
//
// `BlockIter` intends to provide final overrides for all of
// `InternalIteratorBase` functions that can move the iterator. It does
// this to guarantee `UpdateKey()` is called exactly once after each key
// movement potentially visible to users. In this step, the key is prepared
// (e.g., serialized if global seqno is in effect) so it can be returned
// immediately when the user asks for it via calling `key() const`.
//
// For its subclasses, it provides protected variants of the above-mentioned
// final-overridden methods. They are named with the "Impl" suffix, e.g.,
// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
// "Impl" functions are responsible for positioning `raw_key_` but not
// invoking `UpdateKey()`.
//
// Per key-value checksum is enabled if relevant states are passed in during
// `InitializeBase()`. The checksum verification is done in each call to
// UpdateKey() for the current key. Each subclass is responsible for keeping
// track of cur_entry_idx_, the index of the current key within the block.
// BlockIter uses this index to get the corresponding checksum for the current
// key. Additional checksum verification may be done in subclasses if they read
// keys other than the key being processed in UpdateKey().
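
// Editor's illustration (not part of the original header): the final-override
// pattern described above, reduced to a toy. Movement entry points are final
// and bracket each move with exactly one UpdateKey(); subclasses customize
// positioning only through the *Impl() hooks. ToyIterBase and ToyIter are
// hypothetical names.
#if 0
class ToyIterBase {
 public:
  virtual ~ToyIterBase() = default;
  void Next() /* final in the real class */ {
    NextImpl();
    UpdateKey();  // runs exactly once per user-visible movement
  }

 protected:
  virtual void NextImpl() = 0;  // positions raw_key_, never prepares key_
  void UpdateKey() { /* serialize raw_key_ into key_, verify checksum */ }
};

class ToyIter : public ToyIterBase {
 protected:
  void NextImpl() override { /* advance within the block */ }
};
#endif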
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
 public:
  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
  // nothing. Calls cleanup functions.
  virtual void Invalidate(const Status& s) {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());

    data_ = nullptr;
    current_ = restarts_;
    status_ = s;

    // Call cleanup callbacks.
    Cleanable::Reset();
  }

  bool Valid() const override {
    // When status_ is not ok, iter should be invalid.
    assert(status_.ok() || current_ >= restarts_);
    return current_ < restarts_;
  }

  void SeekToFirst() override final {
#ifndef NDEBUG
    if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
#endif
    SeekToFirstImpl();
    UpdateKey();
  }

  void SeekToLast() override final {
    SeekToLastImpl();
    UpdateKey();
  }

  void Seek(const Slice& target) override final {
    SeekImpl(target);
    UpdateKey();
  }

  void SeekForPrev(const Slice& target) override final {
    SeekForPrevImpl(target);
    UpdateKey();
  }

  void Next() override final {
    NextImpl();
    UpdateKey();
  }

  bool NextAndGetResult(IterateResult* result) override final {
    // This does not need to call `UpdateKey()` as the parent class only has
    // access to the `UpdateKey()`-invoking functions.
    return InternalIteratorBase<TValue>::NextAndGetResult(result);
  }

  void Prev() override final {
    PrevImpl();
    UpdateKey();
  }

  Status status() const override { return status_; }

  Slice key() const override {
    assert(Valid());
    return key_;
  }

#ifndef NDEBUG
  ~BlockIter() override {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
    status_.PermitUncheckedError();
  }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }

  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;

  bool TEST_Corrupt_Callback(const std::string& sync_point) {
    bool corrupt = false;
    TEST_SYNC_POINT_CALLBACK(sync_point, static_cast<void*>(&corrupt));

    if (corrupt) {
      CorruptionError();
    }
    return corrupt;
  }
#endif

  bool IsKeyPinned() const override {
    return block_contents_pinned_ && key_pinned_;
  }

  bool IsValuePinned() const override { return block_contents_pinned_; }

  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }

  uint32_t ValueOffset() const {
    return static_cast<uint32_t>(value_.data() - data_);
  }

  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }

  Cache::Handle* cache_handle() { return cache_handle_; }

 protected:
  std::unique_ptr<InternalKeyComparator> icmp_;
  const char* data_;       // underlying block contents
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array

  // Index of restart block in which current_ or current_-1 falls
  uint32_t restart_index_;
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
  // current_ is offset in data_ of current entry. >= restarts_ if !Valid
  uint32_t current_;
  // Raw key from block.
  IterKey raw_key_;
  // Buffer for key data when global seqno assignment is enabled.
  IterKey key_buf_;
  Slice value_;
  Status status_;
  // Key to be exposed to users.
  Slice key_;
  SequenceNumber global_seqno_;
  // Size of the user-defined timestamp.
  size_t ts_sz_ = 0;
  // If user-defined timestamps are enabled but not persisted, a min timestamp
  // is padded to the key during key parsing where it applies, such as when
  // parsing keys from a data block or index block, or parsing the first
  // internal key from an IndexValue entry. Min timestamp padding differs
  // depending on whether `raw_key_` is a user key or an internal key.
  //
  // This only applies to data blocks and index blocks, including index blocks
  // for data blocks, index blocks for partitioned filter blocks, and index
  // blocks for partitioned index blocks. In summary, this only applies to
  // blocks whose keys are real user keys or internal keys created from user
  // keys.
  bool pad_min_timestamp_;

  // Per key-value checksum related states
  const char* kv_checksum_;
  int32_t cur_entry_idx_;
  uint32_t block_restart_interval_;
  uint8_t protection_bytes_per_key_;

  bool key_pinned_;
  // Whether the block data is guaranteed to outlive this iterator, and
  // as long as the cleanup functions are transferred to another class,
  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
  bool block_contents_pinned_;

  virtual void SeekToFirstImpl() = 0;
  virtual void SeekToLastImpl() = 0;
  virtual void SeekImpl(const Slice& target) = 0;
  virtual void SeekForPrevImpl(const Slice& target) = 0;
  virtual void NextImpl() = 0;
  virtual void PrevImpl() = 0;

  // Returns the restart interval of this block.
  // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized.
  virtual uint32_t GetRestartInterval() {
    if (num_restarts_ <= 1 || data_ == nullptr) {
      return 0;
    }
    SeekToFirstImpl();
    uint32_t end_index = GetRestartPoint(1);
    uint32_t count = 1;
    while (NextEntryOffset() < end_index && status_.ok()) {
      assert(Valid());
      NextImpl();
      ++count;
    }
    return count;
  }

  // Returns the number of keys in this block.
  virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) {
    if (num_restarts_ == 0 || data_ == nullptr) {
      return 0;
    }
    uint32_t count = (num_restarts_ - 1) * block_restart_interval;
    // Add number of keys from the last restart interval
    SeekToRestartPoint(num_restarts_ - 1);
    while (NextEntryOffset() < restarts_ && status_.ok()) {
      NextImpl();
      ++count;
    }
    return count;
  }

  // Stores whether the current key has shared bytes with the prev key in
  // *is_shared.
  // Sets raw_key_, value_ to the current parsed key and value.
  // Sets restart_index_ to point to the restart interval that contains
  // the current key.
  template <typename DecodeEntryFunc>
  inline bool ParseNextKey(bool* is_shared);

  // protection_bytes_per_key, kv_checksum, and block_restart_interval
  // are needed only for per kv checksum verification.
  void InitializeBase(const Comparator* raw_ucmp, const char* data,
                      uint32_t restarts, uint32_t num_restarts,
                      SequenceNumber global_seqno, bool block_contents_pinned,
                      bool user_defined_timestamp_persisted,
                      uint8_t protection_bytes_per_key, const char* kv_checksum,
                      uint32_t block_restart_interval) {
    assert(data_ == nullptr);  // Ensure it is called only once
    assert(num_restarts > 0);  // Ensure the param is valid

    icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp);
    data_ = data;
    restarts_ = restarts;
    num_restarts_ = num_restarts;
    current_ = restarts_;
    restart_index_ = num_restarts_;
    global_seqno_ = global_seqno;
    if (raw_ucmp != nullptr) {
      ts_sz_ = raw_ucmp->timestamp_size();
    }
    pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted;
    block_contents_pinned_ = block_contents_pinned;
    cache_handle_ = nullptr;
    cur_entry_idx_ = -1;
    protection_bytes_per_key_ = protection_bytes_per_key;
    kv_checksum_ = kv_checksum;
    block_restart_interval_ = block_restart_interval;
    // Checksum related states are either all 0/nullptr or all non-zero.
    // One exception is when num_restarts == 0, block_restart_interval can be 0
    // since we are not able to compute it.
    assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
           (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
            (block_restart_interval > 0 || num_restarts == 1)));
  }

  void CorruptionError(const std::string& error_msg = "bad entry in block") {
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::Corruption(error_msg);
    raw_key_.Clear();
    value_.clear();
  }

  void PerKVChecksumCorruptionError() {
    std::string error_msg{
        "Corrupted block entry: per key-value checksum verification "
        "failed."};
    error_msg.append(" Offset: " + std::to_string(current_) + ".");
    error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + ".");
    CorruptionError(error_msg);
  }

  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
    if (pad_min_timestamp_) {
      std::string buf;
      if (raw_key_.IsUserKey()) {
        AppendKeyWithMinTimestamp(&buf, key, ts_sz_);
      } else {
        PadInternalKeyWithMinTimestamp(&buf, key, ts_sz_);
      }
      raw_key_.SetKey(buf, true /* copy */);
    } else {
      raw_key_.SetKey(key, false /* copy */);
    }
  }

  // Must be called every time a key is found that needs to be returned to the
  // user, and may be called when no key is found (as a no-op). Updates `key_`,
  // `key_buf_`, and `key_pinned_` with info about the found key.
  // Per key-value checksum verification is done if available for the key to be
  // returned. Iterator is invalidated with corruption status if checksum
  // verification fails.
  void UpdateKey() {
    key_buf_.Clear();
    if (!Valid()) {
      return;
    }
    if (raw_key_.IsUserKey()) {
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
      key_ = raw_key_.GetUserKey();
      key_pinned_ = raw_key_.IsKeyPinned();
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
      key_ = raw_key_.GetInternalKey();
      key_pinned_ = raw_key_.IsKeyPinned();
    } else {
      key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
                              ExtractValueType(raw_key_.GetInternalKey()));
      key_ = key_buf_.GetInternalKey();
      key_pinned_ = false;
    }
    TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
                             (void*)value_.data());
    TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
                             &protection_bytes_per_key_);
    if (protection_bytes_per_key_ > 0) {
      if (!ProtectionInfo64()
               .ProtectKV(raw_key_.GetKey(), value_)
               .Verify(
                   protection_bytes_per_key_,
                   kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
        PerKVChecksumCorruptionError();
      }
    }
  }

  // Returns the result of `Comparator::Compare()`, where the appropriate
  // comparator is used for the block contents, the LHS argument is the current
  // key with global seqno applied, and the RHS argument is `other`.
  int CompareCurrentKey(const Slice& other) {
    if (raw_key_.IsUserKey()) {
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
      return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other);
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
      return icmp_->Compare(raw_key_.GetInternalKey(), other);
    }
    return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other,
                          kDisableGlobalSequenceNumber);
  }

 private:
  // Store the cache handle, if the block is cached. We need this since the
  // only other place the handle is stored is as an argument to the Cleanable
  // function callback, which is hard to retrieve. When multiple value
  // PinnableSlices reference the block, they need the cache handle in order
  // to bump up the ref count.
  Cache::Handle* cache_handle_;

 public:
  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    // NOTE: We don't support blocks bigger than 2GB
    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
  }

  uint32_t GetRestartPoint(uint32_t index) const {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }

  void SeekToRestartPoint(uint32_t index) {
    raw_key_.Clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

 protected:
  template <typename DecodeKeyFunc>
  inline bool BinarySeek(const Slice& target, uint32_t* index,
                         bool* is_index_key_result);

  // Find the first key in restart interval `index` that is >= `target`.
  // If there is no such key, iterator is positioned at the first key in
  // restart interval `index + 1`.
  // If is_index_key_result is true, it positions the iterator at the first key
  // in this restart interval.
  // Per key-value checksum verification is done for all keys scanned
  // up to but not including the last key (the key that current_ points to
  // when this function returns). This key's checksum is verified in
  // UpdateKey().
  void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
                              bool is_index_key_result);
};
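
// Editor's illustration (not part of the original header): the block layout
// the restart helpers above index into. A sketch, assuming the classic
// LevelDB-style trailer: entries, then num_restarts fixed32 restart offsets,
// then a trailing fixed32 restart count (RocksDB data blocks may pack index
// type flags into its high bits, ignored here). ExampleDecodeRestarts is a
// hypothetical helper; DecodeFixed32 is the same routine GetRestartPoint()
// uses.
#if 0
inline std::vector<uint32_t> ExampleDecodeRestarts(const Slice& block) {
  assert(block.size() >= sizeof(uint32_t));
  const char* data = block.data();
  uint32_t num_restarts =
      DecodeFixed32(data + block.size() - sizeof(uint32_t));
  // The restart array sits immediately before the trailing count, exactly
  // where BlockIter's restarts_ points.
  uint32_t restarts = static_cast<uint32_t>(
      block.size() - (1 + num_restarts) * sizeof(uint32_t));
  std::vector<uint32_t> offsets;
  for (uint32_t i = 0; i < num_restarts; ++i) {
    // Same computation as GetRestartPoint(i): each offset is the position in
    // the block where a fully-encoded (non-delta) key begins.
    offsets.push_back(DecodeFixed32(data + restarts + i * sizeof(uint32_t)));
  }
  return offsets;
}
#endif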

class DataBlockIter final : public BlockIter<Slice> {
 public:
  DataBlockIter()
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
  void Initialize(const Comparator* raw_ucmp, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
                  bool block_contents_pinned,
                  bool user_defined_timestamps_persisted,
                  DataBlockHashIndex* data_block_hash_index,
                  uint8_t protection_bytes_per_key, const char* kv_checksum,
                  uint32_t block_restart_interval) {
    InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned, user_defined_timestamps_persisted,
                   protection_bytes_per_key, kv_checksum,
                   block_restart_interval);
    raw_key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
    data_block_hash_index_ = data_block_hash_index;
  }

  Slice value() const override {
    assert(Valid());
    if (read_amp_bitmap_ && current_ < restarts_ &&
        current_ != last_bitmap_offset_) {
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
                             NextEntryOffset() - 1);
      last_bitmap_offset_ = current_;
    }
    return value_;
  }

  // Returns true if `target` may exist.
  inline bool SeekForGet(const Slice& target) {
#ifndef NDEBUG
    if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true;
#endif
    if (!data_block_hash_index_) {
      SeekImpl(target);
      UpdateKey();
      return true;
    }
    bool res = SeekForGetImpl(target);
    UpdateKey();
    return res;
  }

  void Invalidate(const Status& s) override {
    BlockIter::Invalidate(s);
    // Clear prev entries cache.
    prev_entries_keys_buff_.clear();
    prev_entries_.clear();
    prev_entries_idx_ = -1;
  }

 protected:
  friend Block;
  inline bool ParseNextDataKey(bool* is_shared);
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;
  void SeekImpl(const Slice& target) override;
  void SeekForPrevImpl(const Slice& target) override;
  void NextImpl() override;
  void PrevImpl() override;

 private:
  // read-amp bitmap
  BlockReadAmpBitmap* read_amp_bitmap_;
  // last `current_` value we report to the read-amp bitmap
  mutable uint32_t last_bitmap_offset_;
  struct CachedPrevEntry {
    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
                             size_t _key_offset, size_t _key_size, Slice _value)
        : offset(_offset),
          key_ptr(_key_ptr),
          key_offset(_key_offset),
          key_size(_key_size),
          value(_value) {}

    // offset of entry in block
    uint32_t offset;
    // Pointer to key data in block (nullptr if key is delta-encoded)
    const char* key_ptr;
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
    size_t key_offset;
    // size of key
    size_t key_size;
    // value slice pointing to data in block
    Slice value;
  };
  std::string prev_entries_keys_buff_;
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

  DataBlockHashIndex* data_block_hash_index_;

  bool SeekForGetImpl(const Slice& target);
};

// Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and
// are used to store Properties associated with the table. Meta blocks
// always store user keys (no sequence number) and always use the
// BytewiseComparator. Additionally, MetaBlock accesses are not recorded
// in the Statistics or for read amplification.
class MetaBlockIter final : public BlockIter<Slice> {
 public:
  MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
  void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
                  bool block_contents_pinned, uint8_t protection_bytes_per_key,
                  const char* kv_checksum, uint32_t block_restart_interval) {
    // Initializes the iterator with a BytewiseComparator and
    // the raw key being a user key.
    InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
                   kDisableGlobalSequenceNumber, block_contents_pinned,
                   /* user_defined_timestamps_persisted */ true,
                   protection_bytes_per_key, kv_checksum,
                   block_restart_interval);
    raw_key_.SetIsUserKey(true);
  }

  Slice value() const override {
    assert(Valid());
    return value_;
  }

 protected:
  friend Block;
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;
  void SeekImpl(const Slice& target) override;
  void SeekForPrevImpl(const Slice& target) override;
  void NextImpl() override;
  void PrevImpl() override;
  // Meta index block's restart interval is always 1. See
  // MetaIndexBuilder::MetaIndexBuilder() for the hard-coded restart interval.
  uint32_t GetRestartInterval() override { return 1; }
  uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; }
};
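
// Editor's illustration (not part of the original header): scanning a meta
// block (e.g., the metaindex) with the API above. ExampleScanMetaBlock is a
// hypothetical helper; keys here are plain user keys compared bytewise, so no
// comparator or global seqno needs to be supplied.
#if 0
inline void ExampleScanMetaBlock(Block& block) {
  std::unique_ptr<MetaBlockIter> iter{block.NewMetaIterator()};
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    Slice name = iter->key();      // e.g., "rocksdb.properties"
    Slice handle = iter->value();  // encoded handle of that meta block
    (void)name;
    (void)handle;
  }
  assert(iter->status().ok());
}
#endif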

class IndexBlockIter final : public BlockIter<IndexValue> {
 public:
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}

  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  void Initialize(const Comparator* raw_ucmp, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
                  bool have_first_key, bool key_includes_seq,
                  bool value_is_full, bool block_contents_pinned,
                  bool user_defined_timestamps_persisted,
                  uint8_t protection_bytes_per_key, const char* kv_checksum,
                  uint32_t block_restart_interval) {
    InitializeBase(raw_ucmp, data, restarts, num_restarts,
                   kDisableGlobalSequenceNumber, block_contents_pinned,
                   user_defined_timestamps_persisted, protection_bytes_per_key,
                   kv_checksum, block_restart_interval);
    raw_key_.SetIsUserKey(!key_includes_seq);
    prefix_index_ = prefix_index;
    value_delta_encoded_ = !value_is_full;
    have_first_key_ = have_first_key;
    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
    } else {
      global_seqno_state_.reset();
    }
  }

  Slice user_key() const override {
    assert(Valid());
    return raw_key_.GetUserKey();
  }

  IndexValue value() const override {
    assert(Valid());
    if (value_delta_encoded_ || global_seqno_state_ != nullptr ||
        pad_min_timestamp_) {
      return decoded_value_;
    } else {
      IndexValue entry;
      Slice v = value_;
      Status decode_s __attribute__((__unused__)) =
          entry.DecodeFrom(&v, have_first_key_, nullptr);
      assert(decode_s.ok());
      return entry;
    }
  }

  Slice raw_value() const {
    assert(Valid());
    return value_;
  }

  bool IsValuePinned() const override {
    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
  }

 protected:
  friend Block;
  // IndexBlockIter follows a different prefix-iterator contract from data
  // iterators.
  // If the prefix of the seek key `target` exists in the file, it must
  // return the same result as a total order seek.
  // If the prefix of `target` doesn't exist in the file, it can either
  // return the result of a total order seek, or set both Valid() = false
  // and status() = NotFound().
  void SeekImpl(const Slice& target) override;

  void SeekForPrevImpl(const Slice&) override {
    assert(false);
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::InvalidArgument(
        "RocksDB internal error: should never call SeekForPrev() on index "
        "blocks");
    raw_key_.Clear();
    value_.clear();
  }

  void PrevImpl() override;
  void NextImpl() override;
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;

 private:
  bool value_delta_encoded_;
  bool have_first_key_;  // value includes first_internal_key
  BlockPrefixIndex* prefix_index_;
  // Whether the value is delta encoded. In that case the value is assumed to
  // be a BlockHandle. The first value in each restart interval is the full
  // encoded BlockHandle; the rest encode only the size part of the
  // BlockHandle. The offset of delta encoded BlockHandles is computed by
  // adding the sizes of previous delta encoded values in the same restart
  // interval to the offset of the first value in that restart interval.
  IndexValue decoded_value_;

  // When sequence number overwriting is enabled, this struct contains the
  // seqno to overwrite with, and the current first_internal_key with the
  // overwritten seqno. This is rarely used, so we put it behind a pointer and
  // only allocate when needed.
  struct GlobalSeqnoState {
    // First internal key according to current index entry, but with sequence
    // number overwritten to global_seqno.
    IterKey first_internal_key;
    SequenceNumber global_seqno;

    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
  };

  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;

  // Buffers the `first_internal_key` referred to by `decoded_value_` when
  // `pad_min_timestamp_` is true.
  std::string first_internal_key_with_ts_;

  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as a total order Seek.
  bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as a total order seek.
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                            uint32_t left, uint32_t right, uint32_t* index,
                            bool* prefix_may_exist);
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);

  inline bool ParseNextIndexKey();

  // When value_delta_encoded_ is enabled it decodes the value, which is
  // assumed to be a BlockHandle, and puts it into decoded_value_.
  inline void DecodeCurrentValue(bool is_shared);
};
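
// Editor's illustration (not part of the original header): how the delta
// encoding described above reconstructs BlockHandle offsets. A sketch with
// hypothetical names (ToyHandle, ExampleExpandDeltas); the real decoding
// lives in DecodeCurrentValue(). Within one restart interval, only the first
// handle is stored in full; each later entry stores just its size, and its
// offset is recovered as the previous handle's offset plus its size (plus
// per-block trailer bytes, assuming data blocks are laid out back to back in
// the file).
#if 0
struct ToyHandle {
  uint64_t offset;
  uint64_t size;
};

// `sizes` are the decoded sizes of entries 1..n within one restart interval;
// `first` is the fully-encoded handle at the restart point.
inline std::vector<ToyHandle> ExampleExpandDeltas(
    ToyHandle first, const std::vector<uint64_t>& sizes,
    uint64_t block_trailer_size) {
  std::vector<ToyHandle> handles{first};
  for (uint64_t size : sizes) {
    const ToyHandle& prev = handles.back();
    // Next block starts right after the previous block and its trailer.
    handles.push_back(
        ToyHandle{prev.offset + prev.size + block_trailer_size, size});
  }
  return handles;
}
#endif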

}  // namespace ROCKSDB_NAMESPACE