/src/rocksdb/table/block_based/block.h
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #pragma once |
11 | | #include <stddef.h> |
12 | | #include <stdint.h> |
13 | | |
14 | | #include <string> |
15 | | #include <vector> |
16 | | |
17 | | #include "db/kv_checksum.h" |
18 | | #include "db/pinned_iterators_manager.h" |
19 | | #include "port/malloc.h" |
20 | | #include "rocksdb/advanced_cache.h" |
21 | | #include "rocksdb/iterator.h" |
22 | | #include "rocksdb/options.h" |
23 | | #include "rocksdb/statistics.h" |
24 | | #include "rocksdb/table.h" |
25 | | #include "table/block_based/block_prefix_index.h" |
26 | | #include "table/block_based/data_block_hash_index.h" |
27 | | #include "table/format.h" |
28 | | #include "table/internal_iterator.h" |
29 | | #include "test_util/sync_point.h" |
30 | | #include "util/random.h" |
31 | | |
32 | | namespace ROCKSDB_NAMESPACE { |
33 | | |
34 | | struct BlockContents; |
35 | | class Comparator; |
36 | | template <class TValue> |
37 | | class BlockIter; |
38 | | class DataBlockIter; |
39 | | class IndexBlockIter; |
40 | | class MetaBlockIter; |
41 | | class BlockPrefixIndex; |
42 | | |
43 | | // BlockReadAmpBitmap is a bitmap that maps the ROCKSDB_NAMESPACE::Block data |
44 | | // bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of |
45 | | // bytes in the Block we update the bitmap and increment |
46 | | // READ_AMP_ESTIMATE_USEFUL_BYTES. |
47 | | class BlockReadAmpBitmap { |
48 | | public: |
49 | | explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit, |
50 | | Statistics* statistics) |
51 | 0 | : bitmap_(nullptr), |
52 | 0 | bytes_per_bit_pow_(0), |
53 | 0 | statistics_(statistics), |
54 | 0 | rnd_(Random::GetTLSInstance()->Uniform( |
55 | 0 | static_cast<int>(bytes_per_bit))) { |
56 | 0 | TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_); |
57 | 0 | assert(block_size > 0 && bytes_per_bit > 0); |
58 | | |
59 | | // convert bytes_per_bit to be a power of 2 |
60 | 0 | while (bytes_per_bit >>= 1) { |
61 | 0 | bytes_per_bit_pow_++; |
62 | 0 | } |
63 | | |
64 | | // num_bits_needed = ceil(block_size / bytes_per_bit) |
65 | 0 | size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1; |
66 | 0 | assert(num_bits_needed > 0); |
67 | | |
68 | | // bitmap_size = ceil(num_bits_needed / kBitsPerEntry) |
69 | 0 | size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1; |
70 | | |
71 | | // Create bitmap and set all the bits to 0 |
72 | 0 | bitmap_ = new std::atomic<uint32_t>[bitmap_size](); |
73 | |
74 | 0 | RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size); |
75 | 0 | } |
76 | | |
77 | 0 | ~BlockReadAmpBitmap() { delete[] bitmap_; } |
78 | | |
79 | 0 | void Mark(uint32_t start_offset, uint32_t end_offset) { |
80 | 0 | assert(end_offset >= start_offset); |
81 | | // Index of first bit in mask |
82 | 0 | uint32_t start_bit = |
83 | 0 | (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >> |
84 | 0 | bytes_per_bit_pow_; |
85 | | // Index of last bit in mask + 1 |
86 | 0 | uint32_t exclusive_end_bit = |
87 | 0 | (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_; |
88 | 0 | if (start_bit >= exclusive_end_bit) { |
89 | 0 | return; |
90 | 0 | } |
91 | 0 | assert(exclusive_end_bit > 0); |
92 | |
93 | 0 | if (GetAndSet(start_bit) == 0) { |
94 | 0 | uint32_t new_useful_bytes = (exclusive_end_bit - start_bit) |
95 | 0 | << bytes_per_bit_pow_; |
96 | 0 | RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES, |
97 | 0 | new_useful_bytes); |
98 | 0 | } |
99 | 0 | } |
100 | | |
101 | 0 | Statistics* GetStatistics() { |
102 | 0 | return statistics_.load(std::memory_order_relaxed); |
103 | 0 | } |
104 | | |
105 | 0 | void SetStatistics(Statistics* stats) { statistics_.store(stats); } |
106 | | |
107 | 0 | uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; } |
108 | | |
109 | 0 | size_t ApproximateMemoryUsage() const { |
110 | 0 | #ifdef ROCKSDB_MALLOC_USABLE_SIZE |
111 | 0 | return malloc_usable_size((void*)this); |
112 | 0 | #endif // ROCKSDB_MALLOC_USABLE_SIZE |
113 | 0 | return sizeof(*this); |
114 | 0 | } |
115 | | |
116 | | private: |
117 | | // Get the current value of bit at `bit_idx` and set it to 1 |
118 | 0 | inline bool GetAndSet(uint32_t bit_idx) { |
119 | 0 | const uint32_t byte_idx = bit_idx / kBitsPerEntry; |
120 | 0 | const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry); |
121 | |
122 | 0 | return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) & |
123 | 0 | bit_mask; |
124 | 0 | } |
125 | | |
126 | | const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes |
127 | | const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits |
128 | | |
129 | | // Bitmap used to record the bytes that we read; atomics protect |
130 | | // against multiple threads updating the same bit. |
131 | | std::atomic<uint32_t>* bitmap_; |
132 | | // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize |
133 | | // multiplication and division |
134 | | uint8_t bytes_per_bit_pow_; |
135 | | // Pointer to the DB Statistics object. Since this bitmap may outlive the DB, |
136 | | // this pointer may be invalid, but the DB will update it to a valid pointer |
137 | | // by calling SetStatistics() before calling Mark() |
138 | | std::atomic<Statistics*> statistics_; |
139 | | uint32_t rnd_; |
140 | | }; |
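
The constructor above first rounds bytes_per_bit down to a power of two and then sizes the bitmap from it. A minimal standalone sketch of that sizing arithmetic (a hypothetical helper in plain C++, no RocksDB types):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirrors the sizing math in BlockReadAmpBitmap's constructor: round
// bytes_per_bit down to a power of two, then compute how many uint32_t
// words are needed to hold one bit per bytes_per_bit bytes of block data.
std::size_t ReadAmpBitmapWords(std::size_t block_size, std::size_t bytes_per_bit) {
  uint8_t bytes_per_bit_pow = 0;
  while (bytes_per_bit >>= 1) {
    ++bytes_per_bit_pow;
  }
  // num_bits_needed = ceil(block_size / 2^bytes_per_bit_pow)
  const std::size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow) + 1;
  // One uint32_t entry holds 32 bits.
  return (num_bits_needed - 1) / 32 + 1;
}

int main() {
  // A 4 KiB block tracked at 32 bytes per bit needs 128 bits -> 4 words.
  std::printf("%zu\n", ReadAmpBitmapWords(4096, 32));
  return 0;
}
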
141 | | |
142 | | // class Block is the uncompressed and "parsed" form for blocks containing |
143 | | // key-value pairs. (See BlockContents comments for more on terminology.) |
144 | | // This includes the in-memory representation of data blocks, index blocks |
145 | | // (including partitions), range deletion blocks, properties blocks, metaindex |
146 | | // blocks, as well as the top level of the partitioned filter structure (which |
147 | | // is actually an index of the filter partitions). It is NOT suitable for |
148 | | // compressed blocks in general, filter blocks/partitions, or compression |
149 | | // dictionaries. |
150 | | // |
151 | | // See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format |
152 | | // for details of the format and the various block types. |
153 | | // |
154 | | // TODO: Rename to ParsedKvBlock? |
155 | | class Block { |
156 | | public: |
157 | | // Initialize the block with the specified contents. |
158 | | // If restart_interval is provided (non-zero), it will be stored directly |
159 | | // instead of being calculated later. |
160 | | explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0, |
161 | | Statistics* statistics = nullptr, |
162 | | uint32_t restart_interval = 1); |
163 | | // No copying allowed |
164 | | Block(const Block&) = delete; |
165 | | void operator=(const Block&) = delete; |
166 | | |
167 | | ~Block(); |
168 | | |
169 | 263k | size_t size() const { return contents_.data.size(); } |
170 | 347k | const char* data() const { return contents_.data.data(); } |
171 | | // The additional memory space taken by the block data. |
172 | 24.9k | size_t usable_size() const { return contents_.usable_size(); } |
173 | 0 | uint32_t NumRestarts() const { return num_restarts_; } |
174 | 24.9k | bool own_bytes() const { return contents_.own_bytes(); } |
175 | 0 | bool IsUniform() const { return is_uniform_; } |
176 | | |
177 | | BlockBasedTableOptions::DataBlockIndexType IndexType() const; |
178 | | |
179 | | // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key |
180 | | // comparator. |
181 | | // |
182 | | // If iter is null, return new Iterator |
183 | | // If iter is not null, update this one and return it as Iterator* |
184 | | // |
185 | | // Updates read_amp_bitmap_ if it is not nullptr. |
186 | | // |
187 | | // If `block_contents_pinned` is true, the caller will guarantee that when |
188 | | // the cleanup functions are transferred from the iterator to other |
189 | | // classes, e.g. PinnableSlice, the pointer to the bytes will still be |
190 | | // valid. Either the iterator holds a cache handle or ownership of some resource |
191 | | // and releases them in a release function, or the caller is sure that the data |
192 | | // will not go away (for example, it's from mmapped file which will not be |
193 | | // closed). |
194 | | // |
195 | | // `user_defined_timestamps_persisted` controls whether a min timestamp is |
196 | | // padded while key is being parsed from the block. |
197 | | // |
198 | | // NOTE: for the hash based lookup, if a key prefix doesn't match any key, |
199 | | // the iterator will simply be set as "invalid", rather than returning |
200 | | // the key that is just past the target key. |
201 | | DataBlockIter* NewDataIterator(const Comparator* raw_ucmp, |
202 | | SequenceNumber global_seqno, |
203 | | DataBlockIter* iter = nullptr, |
204 | | Statistics* stats = nullptr, |
205 | | bool block_contents_pinned = false, |
206 | | bool user_defined_timestamps_persisted = true); |
207 | | |
208 | | // Returns a MetaBlockIter for iterating over blocks containing metadata |
209 | | // (like Properties blocks). Unlike data blocks, the keys for these blocks |
210 | | // do not contain sequence numbers, do not use a user-defined comparator, and |
211 | | // do not track read amplification/statistics. Additionally, MetaBlocks will |
212 | | // not assert if the block is formatted improperly. |
213 | | // |
214 | | // If `block_contents_pinned` is true, the caller will guarantee that when |
215 | | // the cleanup functions are transferred from the iterator to other |
216 | | // classes, e.g. PinnableSlice, the pointer to the bytes will still be |
217 | | // valid. Either the iterator holds a cache handle or ownership of some resource |
218 | | // and releases them in a release function, or the caller is sure that the data |
219 | | // will not go away (for example, it's from mmapped file which will not be |
220 | | // closed). |
221 | | MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false); |
222 | | |
223 | | // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key |
224 | | // comparator. |
225 | | // |
226 | | // key_includes_seq, default true, means that the keys are in internal key |
227 | | // format. |
228 | | // value_is_full, default true, means that no delta encoding is |
229 | | // applied to values. |
230 | | // |
231 | | // If `prefix_index` is not nullptr this block will do hash lookup for the key |
232 | | // prefix. If total_order_seek is true, prefix_index_ is ignored. |
233 | | // |
234 | | // `have_first_key` controls whether IndexValue will contain |
235 | | // first_internal_key. It affects the data serialization format, so the same |
236 | | // value of have_first_key must be used when writing and reading the index. |
237 | | // It is determined by IndexType property of the table. |
238 | | // `user_defined_timestamps_persisted` controls whether a min timestamp is |
239 | | // padded while key is being parsed from the block. |
240 | | // `index_block_search_type` controls which search algorithm to use when |
241 | | // reading the index block. kBinary uses binary search, while |
242 | | // kInterpolation uses interpolation search which can be faster |
243 | | // for uniformly distributed keys. |
244 | | IndexBlockIter* NewIndexIterator( |
245 | | const Comparator* raw_ucmp, SequenceNumber global_seqno, |
246 | | IndexBlockIter* iter, Statistics* stats, bool total_order_seek, |
247 | | bool have_first_key, bool key_includes_seq, bool value_is_full, |
248 | | bool block_contents_pinned = false, |
249 | | bool user_defined_timestamps_persisted = true, |
250 | | BlockPrefixIndex* prefix_index = nullptr, |
251 | | BlockBasedTableOptions::BlockSearchType index_block_search_type = |
252 | | BlockBasedTableOptions::kBinary); |
253 | | |
254 | | // Report an approximation of how much memory has been used. |
255 | | size_t ApproximateMemoryUsage() const; |
256 | | |
257 | | // For TypedCacheInterface |
258 | 0 | const Slice& ContentSlice() const { return contents_.data; } |
259 | | |
260 | | // Initializes per key-value checksum protection. |
261 | | // After this method is called, each DataBlockIterator returned |
262 | | // by NewDataIterator will verify per key-value checksum for any key it read. |
263 | | void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, |
264 | | const Comparator* raw_ucmp); |
265 | | |
266 | | // Initializes per key-value checksum protection. |
267 | | // After this method is called, each IndexBlockIterator returned |
268 | | // by NewIndexIterator will verify per key-value checksum for any key it read. |
269 | | // value_is_full and index_has_first_key are needed to be able to parse |
270 | | // the index block content and construct checksums. |
271 | | void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, |
272 | | const Comparator* raw_ucmp, |
273 | | bool value_is_full, |
274 | | bool index_has_first_key); |
275 | | |
276 | | // Initializes per key-value checksum protection. |
277 | | // After this method is called, each MetaBlockIter returned |
278 | | // by NewMetaIterator will verify per key-value checksum for any key it read. |
279 | | void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key); |
280 | | |
281 | | static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len, |
282 | 0 | const Slice& key, const Slice& value) { |
283 | 0 | ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr); |
284 | 0 | } |
285 | | |
286 | 0 | bool HasSeparatedKV() const { return values_section_ != nullptr; } |
287 | | |
288 | 0 | const char* TEST_GetKVChecksum() const { return kv_checksum_; } |
289 | | |
290 | | private: |
291 | | // Returns a detailed error status by re-processing the footer. |
292 | | // Should only be called when size() == 0 (error marker). |
293 | | Status GetCorruptionStatus() const; |
294 | | |
295 | | BlockContents contents_; |
296 | | // Normal state: offset in data_ of restart array. |
297 | | // Error state (size()==0): original data size if footer decode failed, |
298 | | // otherwise 0. Used by GetCorruptionStatus() to re-decode footer. |
299 | | uint32_t restart_offset_; |
300 | | uint32_t num_restarts_; |
301 | | bool is_uniform_{false}; |
302 | | std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_; |
303 | | char* kv_checksum_{nullptr}; |
304 | | uint32_t checksum_size_{0}; |
305 | | // Used by block iterators to calculate current key index within a block |
306 | | uint32_t block_restart_interval_{0}; |
307 | | uint8_t protection_bytes_per_key_{0}; |
308 | | DataBlockHashIndex data_block_hash_index_; |
309 | | |
310 | | // Pointer to values section, nullptr if not using separated KV |
311 | | const char* values_section_{nullptr}; |
312 | | }; |
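
For orientation, a hedged sketch of how a parsed Block is typically scanned via NewDataIterator. It assumes `contents` already holds a well-formed, uncompressed KV block body (in real code it comes from the block fetch/cache path), that this internal header is on the include path of an in-tree build, and `DumpBlock` is a hypothetical helper, not code from the RocksDB tree:

#include <iostream>
#include <memory>
#include <utility>

#include "rocksdb/comparator.h"
#include "table/block_based/block.h"

namespace ROCKSDB_NAMESPACE {

void DumpBlock(BlockContents&& contents) {
  // Parse the block; no read-amp tracking and no statistics for simplicity.
  Block block(std::move(contents), /*read_amp_bytes_per_bit=*/0);

  // iter == nullptr, so NewDataIterator heap-allocates a fresh DataBlockIter
  // that we own. Keys here are internal keys (user key + seqno + type).
  std::unique_ptr<DataBlockIter> it(block.NewDataIterator(
      BytewiseComparator(), kDisableGlobalSequenceNumber));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    std::cout << it->key().ToString(/*hex=*/true) << " -> "
              << it->value().ToString(/*hex=*/true) << "\n";
  }
}

}  // namespace ROCKSDB_NAMESPACE
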
313 | | |
314 | | // A `BlockIter` iterates over the entries in a `Block`'s data buffer. The |
315 | | // format of this data buffer is an uncompressed, sorted sequence of key-value |
316 | | // pairs (see `Block` API for more details). |
317 | | // |
318 | | // Notably, the keys may either be in internal key format or user key format. |
319 | | // Subclasses are responsible for configuring the key format. |
320 | | // |
321 | | // `BlockIter` intends to provide final overrides for all of |
322 | | // `InternalIteratorBase` functions that can move the iterator. It does |
323 | | // this to guarantee `UpdateKey()` is called exactly once after each key |
324 | | // movement potentially visible to users. In this step, the key is prepared |
325 | | // (e.g., serialized if global seqno is in effect) so it can be returned |
326 | | // immediately when the user asks for it via calling `key() const`. |
327 | | // |
328 | | // For its subclasses, it provides protected variants of the above-mentioned |
329 | | // final-overridden methods. They are named with the "Impl" suffix, e.g., |
330 | | // `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These |
331 | | // "Impl" functions are responsible for positioning `raw_key_` but not |
332 | | // invoking `UpdateKey()`. |
333 | | // |
334 | | // Per key-value checksum is enabled if relevant states are passed in during |
335 | | // `InitializeBase()`. The checksum verification is done in each call to |
336 | | // UpdateKey() for the current key. Each subclass is responsible for keeping |
337 | | // track of cur_entry_idx_, the index of the current key within the block. |
338 | | // BlockIter uses this index to get the corresponding checksum for current key. |
339 | | // Additional checksum verification may be done in subclasses if they read keys |
340 | | // other than the key being processed in UpdateKey(). |
341 | | template <class TValue> |
342 | | class BlockIter : public InternalIteratorBase<TValue> { |
343 | | public: |
344 | | // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do |
345 | | // nothing. Calls cleanup functions. |
346 | 25.2k | virtual void Invalidate(const Status& s) { |
347 | | // Assert that the BlockIter is never deleted while Pinning is Enabled. |
348 | 25.2k | assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); |
349 | | |
350 | 25.2k | data_ = nullptr; |
351 | 25.2k | current_ = GetKeysEndOffset(); |
352 | 25.2k | status_ = s; |
353 | | |
354 | | // Call cleanup callbacks. |
355 | 25.2k | Cleanable::Reset(); |
356 | 25.2k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::Invalidate(rocksdb::Status const&) = 25.2k; rocksdb::BlockIter<rocksdb::IndexValue>::Invalidate(rocksdb::Status const&) = unexecuted]
357 | | |
358 | 7.41M | bool Valid() const override { |
359 | | // When status_ is not ok, iter should be invalid. |
360 | 7.41M | auto key_end = GetKeysEndOffset(); |
361 | 7.41M | assert(status_.ok() || current_ >= key_end); |
362 | 7.41M | auto valid = current_ < key_end; |
363 | 7.41M | return valid; |
364 | 7.41M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::Valid() const = 7.29M; rocksdb::BlockIter<rocksdb::IndexValue>::Valid() const = 120k]
365 | | |
366 | 136k | void SeekToFirst() override final { |
367 | | #ifndef NDEBUG |
368 | | if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return; |
369 | | #endif |
370 | 136k | SeekToFirstImpl(); |
371 | 136k | UpdateKey(); |
372 | 136k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::SeekToFirst() = 114k; rocksdb::BlockIter<rocksdb::IndexValue>::SeekToFirst() = 22.0k]
373 | | |
374 | 3.34k | void SeekToLast() override final { |
375 | 3.34k | SeekToLastImpl(); |
376 | 3.34k | UpdateKey(); |
377 | 3.34k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::SeekToLast() = 1.60k; rocksdb::BlockIter<rocksdb::IndexValue>::SeekToLast() = 1.74k]
378 | | |
379 | 353k | void Seek(const Slice& target) override final { |
380 | 353k | SeekImpl(target); |
381 | 353k | UpdateKey(); |
382 | 353k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::Seek(rocksdb::Slice const&) = 342k; rocksdb::BlockIter<rocksdb::IndexValue>::Seek(rocksdb::Slice const&) = 11.0k]
383 | | |
384 | 5.54k | void SeekForPrev(const Slice& target) override final { |
385 | 5.54k | SeekForPrevImpl(target); |
386 | 5.54k | UpdateKey(); |
387 | 5.54k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::SeekForPrev(rocksdb::Slice const&) = 5.54k; rocksdb::BlockIter<rocksdb::IndexValue>::SeekForPrev(rocksdb::Slice const&) = unexecuted]
388 | | |
389 | 3.54M | void Next() override final { |
390 | 3.54M | NextImpl(); |
391 | 3.54M | UpdateKey(); |
392 | 3.54M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::Next() = 3.52M; rocksdb::BlockIter<rocksdb::IndexValue>::Next() = 18.2k]
393 | | |
394 | 0 | bool NextAndGetResult(IterateResult* result) override final { |
395 | | // This does not need to call `UpdateKey()` as the parent class only has |
396 | | // access to the `UpdateKey()`-invoking functions. |
397 | 0 | return InternalIteratorBase<TValue>::NextAndGetResult(result); |
398 | 0 | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::NextAndGetResult(rocksdb::IterateResult*) = unexecuted; rocksdb::BlockIter<rocksdb::IndexValue>::NextAndGetResult(rocksdb::IterateResult*) = unexecuted]
399 | | |
400 | 12.2k | void Prev() override final { |
401 | 12.2k | PrevImpl(); |
402 | 12.2k | UpdateKey(); |
403 | 12.2k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::Prev() = 5.20k; rocksdb::BlockIter<rocksdb::IndexValue>::Prev() = 7.05k]
404 | | |
405 | 3.63M | Status status() const override { return status_; } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::status() const = 3.56M; rocksdb::BlockIter<rocksdb::IndexValue>::status() const = 73.7k]
406 | | |
407 | 3.52M | Slice key() const override { |
408 | 3.52M | assert(Valid()); |
409 | 3.52M | return key_; |
410 | 3.52M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::key() const = 3.52M; rocksdb::BlockIter<rocksdb::IndexValue>::key() const = unexecuted]
411 | | |
412 | | #ifndef NDEBUG |
413 | | ~BlockIter() override { |
414 | | // Assert that the BlockIter is never deleted while Pinning is Enabled. |
415 | | assert(!pinned_iters_mgr_ || |
416 | | (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); |
417 | | status_.PermitUncheckedError(); |
418 | | } |
419 | | |
420 | | void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { |
421 | | pinned_iters_mgr_ = pinned_iters_mgr; |
422 | | } |
423 | | |
424 | | PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; |
425 | | |
426 | | bool TEST_Corrupt_Callback(const std::string& sync_point) { |
427 | | bool corrupt = false; |
428 | | TEST_SYNC_POINT_CALLBACK(sync_point, static_cast<void*>(&corrupt)); |
429 | | |
430 | | if (corrupt) { |
431 | | CorruptionError(); |
432 | | } |
433 | | return corrupt; |
434 | | } |
435 | | #endif |
436 | | |
437 | 142k | bool IsKeyPinned() const override { |
438 | 142k | return block_contents_pinned_ && key_pinned_; |
439 | 142k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::IsKeyPinned() const = 142k; rocksdb::BlockIter<rocksdb::IndexValue>::IsKeyPinned() const = unexecuted]
440 | | |
441 | 72.8k | bool IsValuePinned() const override { return block_contents_pinned_; } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::IsValuePinned() const = 72.8k; rocksdb::BlockIter<rocksdb::IndexValue>::IsValuePinned() const = unexecuted]
442 | | |
443 | | size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; } |
444 | | |
445 | 0 | uint32_t ValueOffset() const { |
446 | 0 | return static_cast<uint32_t>(value_.data() - data_); |
447 | 0 | } |
448 | | |
449 | 29.1k | void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::SetCacheHandle(rocksdb::Cache::Handle*) = 29.1k; rocksdb::BlockIter<rocksdb::IndexValue>::SetCacheHandle(rocksdb::Cache::Handle*) = unexecuted]
450 | | |
451 | | Cache::Handle* cache_handle() { return cache_handle_; } |
452 | | |
453 | | protected: |
454 | | InternalKeyComparator icmp_; |
455 | | const char* data_; // underlying block contents |
456 | | uint32_t num_restarts_; // Number of uint32_t entries in restart array |
457 | | |
458 | | const char* values_section_; |
459 | | // Slice of current entry in the data section. Does not contain value if |
460 | | // values_section_ exists |
461 | | Slice entry_; |
462 | | |
463 | | // Index of restart block in which current_ or current_-1 falls |
464 | | uint32_t restart_index_; |
465 | | uint32_t restarts_; // Offset of restart array (list of fixed32) |
466 | | // current_ is offset in data_ of current entry. >= restarts_ if !Valid |
467 | | uint32_t current_; |
468 | | // Raw key from block. |
469 | | IterKey raw_key_; |
470 | | // Buffer for key data when global seqno assignment is enabled. |
471 | | IterKey key_buf_; |
472 | | Slice value_; |
473 | | Status status_; |
474 | | // Key to be exposed to users. |
475 | | Slice key_; |
476 | | SequenceNumber global_seqno_; |
477 | | // Size of the user-defined timestamp. |
478 | | size_t ts_sz_ = 0; |
479 | | // If user-defined timestamp is enabled but not persisted. A min timestamp |
480 | | // will be padded to the key during key parsing where it applies. Such as when |
481 | | // parsing keys from data block, index block, parsing the first internal |
482 | | // key from IndexValue entry. Min timestamp padding is different for when |
483 | | // `raw_key_` is a user key vs is an internal key. |
484 | | // |
485 | | // This only applies to data block and index blocks including index block for |
486 | | // data blocks, index block for partitioned filter blocks, index block for |
487 | | // partitioned index blocks. In summary, this only applies to block whose key |
488 | | // are real user keys or internal keys created from user keys. |
489 | | bool pad_min_timestamp_; |
490 | | |
491 | | // Per key-value checksum related states |
492 | | const char* kv_checksum_; |
493 | | // Index of the next entry to be parsed (used for checksum verification |
494 | | // and to determine if we're at a restart point for separated KV storage) |
495 | | int32_t cur_entry_idx_; |
496 | | uint32_t block_restart_interval_; |
497 | | uint8_t protection_bytes_per_key_; |
498 | | |
499 | | bool key_pinned_; |
500 | | // Whether the block data is guaranteed to outlive this iterator, and |
501 | | // as long as the cleanup functions are transferred to another class, |
502 | | // e.g. PinnableSlice, the pointer to the bytes will still be valid. |
503 | | bool block_contents_pinned_; |
504 | | |
505 | | virtual void SeekToFirstImpl() = 0; |
506 | | virtual void SeekToLastImpl() = 0; |
507 | | virtual void SeekImpl(const Slice& target) = 0; |
508 | | virtual void SeekForPrevImpl(const Slice& target) = 0; |
509 | | virtual void NextImpl() = 0; |
510 | | virtual void PrevImpl() = 0; |
511 | | |
512 | | // Returns the restart interval of this block. |
513 | | // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized. |
514 | 0 | virtual uint32_t GetRestartInterval() { |
515 | 0 | if (num_restarts_ <= 1 || data_ == nullptr) { |
516 | 0 | return 0; |
517 | 0 | } |
518 | 0 | SeekToFirstImpl(); |
519 | 0 | uint32_t end_index = GetRestartPoint(1); |
520 | 0 | uint32_t count = 1; |
521 | 0 | while (NextEntryOffset() < end_index && status_.ok()) { |
522 | 0 | assert(Valid()); |
523 | 0 | NextImpl(); |
524 | 0 | ++count; |
525 | 0 | } |
526 | 0 | return count; |
527 | 0 | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::GetRestartInterval() = unexecuted; rocksdb::BlockIter<rocksdb::IndexValue>::GetRestartInterval() = unexecuted]
528 | | |
529 | | // Returns the number of keys in this block. |
530 | 0 | virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) { |
531 | 0 | if (num_restarts_ == 0 || data_ == nullptr) { |
532 | 0 | return 0; |
533 | 0 | } |
534 | 0 | uint32_t count = (num_restarts_ - 1) * block_restart_interval; |
535 | | // Add number of keys from the last restart interval |
536 | 0 | SeekToRestartPoint(num_restarts_ - 1); |
537 | | // For separated KV storage, keys end at values_section_, not at restarts_ |
538 | 0 | uint32_t keys_end = values_section_ |
539 | 0 | ? static_cast<uint32_t>(values_section_ - data_) |
540 | 0 | : restarts_; |
541 | 0 | while (NextEntryOffset() < keys_end && status_.ok()) { |
542 | 0 | NextImpl(); |
543 | 0 | ++count; |
544 | 0 | } |
545 | 0 | return count; |
546 | 0 | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::NumberOfKeys(unsigned int) = unexecuted; rocksdb::BlockIter<rocksdb::IndexValue>::NumberOfKeys(unsigned int) = unexecuted]
547 | | |
548 | | // Stores whether the current key has a shared bytes with prev key in |
549 | | // *is_shared. |
550 | | // Sets raw_key_, value_ to the current parsed key and value. |
551 | | // Sets restart_index_ to point to the restart interval that contains |
552 | | // the current key. |
553 | | template <typename DecodeEntryFunc, bool StrictCheck = false> |
554 | | inline bool ParseNextKey(bool* is_shared); |
555 | | |
556 | | // protection_bytes_per_key, kv_checksum, and block_restart_interval |
557 | | // are needed only for per kv checksum verification. |
558 | | void InitializeBase(const Comparator* raw_ucmp, const char* data, |
559 | | uint32_t restarts, uint32_t num_restarts, |
560 | | SequenceNumber global_seqno, bool block_contents_pinned, |
561 | | bool user_defined_timestamp_persisted, |
562 | | uint8_t protection_bytes_per_key, const char* kv_checksum, |
563 | | uint32_t block_restart_interval, |
564 | 263k | const char* values_section) { |
565 | 263k | assert(data_ == nullptr); // Ensure it is called only once |
566 | 263k | assert(num_restarts > 0); // Ensure the param is valid |
567 | 263k | assert(raw_ucmp != nullptr); |
568 | 263k | icmp_ = InternalKeyComparator(raw_ucmp); |
569 | 263k | data_ = data; |
570 | 263k | restarts_ = restarts; |
571 | 263k | num_restarts_ = num_restarts; |
572 | 263k | restart_index_ = num_restarts_; |
573 | 263k | entry_ = Slice(); |
574 | 263k | global_seqno_ = global_seqno; |
575 | 263k | ts_sz_ = raw_ucmp->timestamp_size(); |
576 | 263k | pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted; |
577 | 263k | block_contents_pinned_ = block_contents_pinned; |
578 | 263k | cache_handle_ = nullptr; |
579 | 263k | cur_entry_idx_ = -1; |
580 | 263k | protection_bytes_per_key_ = protection_bytes_per_key; |
581 | 263k | kv_checksum_ = kv_checksum; |
582 | 263k | block_restart_interval_ = block_restart_interval; |
583 | | // Checksum related states are either all 0/nullptr or all non-zero. |
584 | | // One exception is when num_restarts == 0, block_restart_interval can be 0 |
585 | | // since we are not able to compute it. |
586 | 263k | assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) || |
587 | 263k | (protection_bytes_per_key > 0 && kv_checksum != nullptr && |
588 | 263k | (block_restart_interval > 0 || num_restarts == 1))); |
589 | | |
590 | 263k | values_section_ = values_section; |
591 | 263k | current_ = GetKeysEndOffset(); |
592 | 263k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::InitializeBase(rocksdb::Comparator const*, char const*, unsigned int, unsigned int, unsigned long, bool, bool, unsigned char, char const*, unsigned int, char const*) = 208k; rocksdb::BlockIter<rocksdb::IndexValue>::InitializeBase(rocksdb::Comparator const*, char const*, unsigned int, unsigned int, unsigned long, bool, bool, unsigned char, char const*, unsigned int, char const*) = 54.7k]
593 | | |
594 | 0 | void CorruptionError(const std::string& error_msg = "bad entry in block") { |
595 | 0 | current_ = GetKeysEndOffset(); |
596 | 0 | restart_index_ = num_restarts_; |
597 | 0 | status_ = Status::Corruption(error_msg); |
598 | 0 | raw_key_.Clear(); |
599 | 0 | value_.clear(); |
600 | 0 | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::CorruptionError(std::string const&) = unexecuted; rocksdb::BlockIter<rocksdb::IndexValue>::CorruptionError(std::string const&) = unexecuted]
601 | | |
602 | 0 | void PerKVChecksumCorruptionError() { |
603 | 0 | std::string error_msg{ |
604 | 0 | "Corrupted block entry: per key-value checksum verification " |
605 | 0 | "failed."}; |
606 | 0 | error_msg.append(" Offset: " + std::to_string(current_) + "."); |
607 | 0 | error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + "."); |
608 | 0 | CorruptionError(error_msg); |
609 | 0 | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::PerKVChecksumCorruptionError() = unexecuted; rocksdb::BlockIter<rocksdb::IndexValue>::PerKVChecksumCorruptionError() = unexecuted]
610 | | |
611 | 1.17M | void UpdateRawKeyAndMaybePadMinTimestamp(IterKey& raw_key, const Slice& key) { |
612 | 1.17M | if (pad_min_timestamp_) { |
613 | 0 | raw_key.SetKeyWithPaddedMinTimestamp(key, ts_sz_); |
614 | 1.17M | } else { |
615 | 1.17M | raw_key.SetKey(key, false /* copy */); |
616 | 1.17M | } |
617 | 1.17M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::IterKey&, rocksdb::Slice const&) = 1.12M; rocksdb::BlockIter<rocksdb::IndexValue>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::IterKey&, rocksdb::Slice const&) = 48.1k]
618 | | |
619 | 1.17M | void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) { |
620 | 1.17M | UpdateRawKeyAndMaybePadMinTimestamp(raw_key_, key); |
621 | 1.17M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::Slice const&) = 1.12M; rocksdb::BlockIter<rocksdb::IndexValue>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::Slice const&) = 48.1k]
622 | | |
623 | | // Must be called every time a key is found that needs to be returned to the user, |
624 | | // and may be called when no key is found (as a no-op). Updates `key_`, |
625 | | // `key_buf_`, and `key_pinned_` with info about the found key. |
626 | | // Per key-value checksum verification is done if available for the key to be |
627 | | // returned. Iterator is invalidated with corruption status if checksum |
628 | | // verification fails. |
629 | 3.75M | void UpdateKey() { |
630 | 3.75M | key_buf_.Clear(); |
631 | 3.75M | if (!Valid()) { |
632 | 220k | return; |
633 | 220k | } |
634 | 3.53M | if (raw_key_.IsUserKey()) { |
635 | 3.34M | assert(global_seqno_ == kDisableGlobalSequenceNumber); |
636 | 3.34M | key_ = raw_key_.GetUserKey(); |
637 | 3.34M | key_pinned_ = raw_key_.IsKeyPinned(); |
638 | 3.34M | } else if (global_seqno_ == kDisableGlobalSequenceNumber) { |
639 | 196k | key_ = raw_key_.GetInternalKey(); |
640 | 196k | key_pinned_ = raw_key_.IsKeyPinned(); |
641 | 18.4E | } else { |
642 | 18.4E | key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_, |
643 | 18.4E | ExtractValueType(raw_key_.GetInternalKey())); |
644 | 18.4E | key_ = key_buf_.GetInternalKey(); |
645 | 18.4E | key_pinned_ = false; |
646 | 18.4E | } |
647 | 3.53M | TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value", |
648 | 3.53M | (void*)value_.data()); |
649 | 3.53M | TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len", |
650 | 3.53M | &protection_bytes_per_key_); |
651 | 3.53M | if (protection_bytes_per_key_ > 0) { |
652 | 0 | if (!ProtectionInfo64() |
653 | 0 | .ProtectKV(raw_key_.GetKey(), value_) |
654 | 0 | .Verify( |
655 | 0 | protection_bytes_per_key_, |
656 | 0 | kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) { |
657 | 0 | PerKVChecksumCorruptionError(); |
658 | 0 | } |
659 | 0 | } |
660 | 3.53M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::UpdateKey() = 3.69M; rocksdb::BlockIter<rocksdb::IndexValue>::UpdateKey() = 60.0k]
661 | | |
662 | | // Compares two keys using the appropriate comparator for the block contents. |
663 | | // Uses user comparator when the block stores user keys, otherwise uses the |
664 | | // internal key comparator. When global_seqno is not disabled, applies it to |
665 | | // the LHS key for comparison. |
666 | 536k | int CompareKey(const Slice& a, const Slice& b) const { |
667 | 536k | assert(icmp_.user_comparator() != nullptr); |
668 | 536k | if (raw_key_.IsUserKey()) { |
669 | 520k | assert(global_seqno_ == kDisableGlobalSequenceNumber); |
670 | 520k | return icmp_.user_comparator()->Compare(a, b); |
671 | 520k | } else if (global_seqno_ == kDisableGlobalSequenceNumber) { |
672 | 14.8k | return icmp_.Compare(a, b); |
673 | 14.8k | } |
674 | 486 | return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber); |
675 | 536k | } rocksdb::BlockIter<rocksdb::IndexValue>::CompareKey(rocksdb::Slice const&, rocksdb::Slice const&) const Line | Count | Source | 666 | 11.0k | int CompareKey(const Slice& a, const Slice& b) const { | 667 | 11.0k | assert(icmp_.user_comparator() != nullptr); | 668 | 11.0k | if (raw_key_.IsUserKey()) { | 669 | 11.0k | assert(global_seqno_ == kDisableGlobalSequenceNumber); | 670 | 11.0k | return icmp_.user_comparator()->Compare(a, b); | 671 | 11.0k | } else if (global_seqno_ == kDisableGlobalSequenceNumber) { | 672 | 0 | return icmp_.Compare(a, b); | 673 | 0 | } | 674 | 0 | return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber); | 675 | 11.0k | } |
rocksdb::BlockIter<rocksdb::Slice>::CompareKey(rocksdb::Slice const&, rocksdb::Slice const&) const Line | Count | Source | 666 | 525k | int CompareKey(const Slice& a, const Slice& b) const { | 667 | 525k | assert(icmp_.user_comparator() != nullptr); | 668 | 525k | if (raw_key_.IsUserKey()) { | 669 | 509k | assert(global_seqno_ == kDisableGlobalSequenceNumber); | 670 | 509k | return icmp_.user_comparator()->Compare(a, b); | 671 | 509k | } else if (global_seqno_ == kDisableGlobalSequenceNumber) { | 672 | 14.8k | return icmp_.Compare(a, b); | 673 | 14.8k | } | 674 | 486 | return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber); | 675 | 525k | } |
|
676 | | |
677 | 533k | int CompareKey(const IterKey& a, const Slice& b) const { |
678 | 533k | if (a.IsUserKey()) { |
679 | 518k | return CompareKey(a.GetUserKey(), b); |
680 | 518k | } |
681 | 14.3k | return CompareKey(a.GetInternalKey(), b); |
682 | 533k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::CompareKey(rocksdb::IterKey const&, rocksdb::Slice const&) const = 522k; rocksdb::BlockIter<rocksdb::IndexValue>::CompareKey(rocksdb::IterKey const&, rocksdb::Slice const&) const = 11.0k]
683 | | |
684 | | // Compares the current key (with global seqno applied) against `other`. |
685 | 533k | int CompareCurrentKey(const Slice& other) const { |
686 | 533k | return CompareKey(raw_key_, other); |
687 | 533k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::CompareCurrentKey(rocksdb::Slice const&) const = 522k; rocksdb::BlockIter<rocksdb::IndexValue>::CompareCurrentKey(rocksdb::Slice const&) const = 11.0k]
688 | | |
689 | | private: |
690 | | // Store the cache handle, if the block is cached. We need this since the |
691 | | // only other place the handle is stored is as an argument to the Cleanable |
692 | | // function callback, which is hard to retrieve. When multiple value |
693 | | // PinnableSlices reference the block, they need the cache handle in order |
694 | | // to bump up the ref count |
695 | | Cache::Handle* cache_handle_; |
696 | | |
697 | | public: |
698 | | // Return the offset in data_ just past the end of the current entry. |
699 | 4.06M | inline uint32_t NextEntryOffset() const { |
700 | | // NOTE: We don't support blocks bigger than 2GB |
701 | 4.06M | return static_cast<uint32_t>((entry_.data() + entry_.size()) - data_); |
702 | 4.06M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::NextEntryOffset() const = 4.00M; rocksdb::BlockIter<rocksdb::IndexValue>::NextEntryOffset() const = 54.9k]
703 | | |
704 | | // Return the offset where the keys section ends. |
705 | | // For separated KV storage, this is the start of the values section. |
706 | | // Otherwise, it's the start of the restart array. |
707 | 11.4M | inline uint32_t GetKeysEndOffset() const { |
708 | 11.4M | return values_section_ ? static_cast<uint32_t>(values_section_ - data_) |
709 | 11.4M | : restarts_; |
710 | 11.4M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::GetKeysEndOffset() const = 11.2M; rocksdb::BlockIter<rocksdb::IndexValue>::GetKeysEndOffset() const = 253k]
711 | | |
712 | 1.49M | uint32_t GetRestartPoint(uint32_t index) const { |
713 | 1.49M | assert(index < num_restarts_); |
714 | 1.49M | uint32_t offset = |
715 | 1.49M | DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); |
716 | 1.49M | assert(!values_section_ || offset <= values_section_ - data_); |
717 | 1.49M | return offset; |
718 | 1.49M | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::GetRestartPoint(unsigned int) const = 1.43M; rocksdb::BlockIter<rocksdb::IndexValue>::GetRestartPoint(unsigned int) const = 59.0k]
719 | | |
720 | 504k | void SeekToRestartPoint(uint32_t index) { |
721 | 504k | raw_key_.Clear(); |
722 | 504k | restart_index_ = index; |
723 | | // Set to one before the first entry so ParseNextKey() increments to the |
724 | | // correct position |
725 | 504k | cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1; |
726 | | |
727 | | // ParseNextKey() starts at the end of entry_, so set value_ accordingly |
728 | 504k | uint32_t offset = GetRestartPoint(index); |
729 | 504k | entry_ = Slice(data_ + offset, 0); |
730 | 504k | } |
[Per-instantiation counts: rocksdb::BlockIter<rocksdb::Slice>::SeekToRestartPoint(unsigned int) = 469k; rocksdb::BlockIter<rocksdb::IndexValue>::SeekToRestartPoint(unsigned int) = 34.7k]
731 | | |
732 | | protected: |
733 | | template <typename DecodeKeyFunc> |
734 | | inline bool GetRestartKey(uint32_t index, Slice* key); |
735 | | |
736 | | template <typename DecodeKeyFunc> |
737 | | inline bool BinarySeekRestartPointIndex(const Slice& target, uint32_t* index, |
738 | | bool* is_index_key_result); |
739 | | |
740 | | template <typename DecodeKeyFunc> |
741 | | inline bool InterpolationSeekRestartPointIndex(const Slice& target, |
742 | | uint32_t* index, |
743 | | bool* is_index_key_result); |
744 | | |
745 | | // Find the first key in restart interval `index` that is >= `target`. |
746 | | // If there is no such key, iterator is positioned at the first key in |
747 | | // restart interval `index + 1`. |
748 | | // If is_index_key_result is true, it positions the iterator at the first key |
749 | | // in this restart interval. |
750 | | // Per key-value checksum verification is done for all keys scanned |
751 | | // up to but not including the last key (the key that current_ points to |
752 | | // when this function returns). This key's checksum is verified in |
753 | | // UpdateKey(). |
754 | | void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, |
755 | | bool is_index_key_result); |
756 | | }; |
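
A toy, self-contained illustration (not RocksDB code) of the final-override pattern described in the comment above BlockIter: the base class owns the public movement functions so that the post-movement hook (UpdateKey in BlockIter) runs exactly once per movement, and subclasses only supply the "*Impl" positioning logic:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Abstract interface, standing in for InternalIteratorBase<TValue>.
struct IteratorIface {
  virtual ~IteratorIface() = default;
  virtual void SeekToFirst() = 0;
  virtual void Next() = 0;
  virtual bool Valid() const = 0;
  virtual std::string key() const = 0;
};

// Standing in for BlockIter: final overrides funnel every movement through
// one place so the post-movement hook runs exactly once; subclasses only
// implement the positioning logic.
class IterBase : public IteratorIface {
 public:
  void SeekToFirst() final { SeekToFirstImpl(); UpdateKey(); }
  void Next() final { NextImpl(); UpdateKey(); }
  std::string key() const final { return key_; }

 protected:
  virtual void SeekToFirstImpl() = 0;
  virtual void NextImpl() = 0;
  virtual std::string RawKey() const = 0;
  // In the real BlockIter this is also where the per key-value checksum of
  // the current entry is verified.
  void UpdateKey() { key_ = Valid() ? RawKey() : std::string(); }
  std::string key_;
};

// Minimal subclass positioning over an in-memory vector of keys.
class VecIter final : public IterBase {
 public:
  explicit VecIter(std::vector<std::string> keys) : keys_(std::move(keys)) {}
  bool Valid() const final { return idx_ < keys_.size(); }

 protected:
  void SeekToFirstImpl() final { idx_ = 0; }
  void NextImpl() final { ++idx_; }
  std::string RawKey() const final { return keys_[idx_]; }

 private:
  std::vector<std::string> keys_;
  std::size_t idx_ = 0;
};
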
757 | | |
758 | | class DataBlockIter final : public BlockIter<Slice> { |
759 | | public: |
760 | | DataBlockIter() |
761 | 57.3k | : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} |
762 | | void Initialize(const Comparator* raw_ucmp, const char* data, |
763 | | uint32_t restarts, uint32_t num_restarts, |
764 | | SequenceNumber global_seqno, |
765 | | BlockReadAmpBitmap* read_amp_bitmap, |
766 | | bool block_contents_pinned, |
767 | | bool user_defined_timestamps_persisted, |
768 | | DataBlockHashIndex* data_block_hash_index, |
769 | | uint8_t protection_bytes_per_key, const char* kv_checksum, |
770 | 38.6k | uint32_t block_restart_interval, const char* values_section) { |
771 | 38.6k | InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, |
772 | 38.6k | block_contents_pinned, user_defined_timestamps_persisted, |
773 | 38.6k | protection_bytes_per_key, kv_checksum, |
774 | 38.6k | block_restart_interval, values_section); |
775 | 38.6k | raw_key_.SetIsUserKey(false); |
776 | 38.6k | read_amp_bitmap_ = read_amp_bitmap; |
777 | 38.6k | last_bitmap_offset_ = current_ + 1; |
778 | 38.6k | data_block_hash_index_ = data_block_hash_index; |
779 | 38.6k | } |
780 | | |
781 | 178k | Slice value() const override { |
782 | 178k | assert(Valid()); |
783 | 178k | if (read_amp_bitmap_ && current_ < restarts_ && |
784 | 0 | current_ != last_bitmap_offset_) { |
785 | 0 | read_amp_bitmap_->Mark(current_ /* current entry offset */, |
786 | 0 | NextEntryOffset() - 1); |
787 | 0 | last_bitmap_offset_ = current_; |
788 | 0 | } |
789 | 178k | return value_; |
790 | 178k | } |
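// [Editor's note: illustrative sketch, not part of the original header.]
// Mark() above records only the byte ranges whose values were actually
// consumed, so an estimate of read amplification for data blocks can be
// derived from the statistics as, roughly,
// READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES. Treat this as a
// hedged reading; the authoritative interpretation lives with the ticker
// definitions.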
791 | | |
792 | | // Returns whether `target` may exist in this block.
793 | 1.43k | inline bool SeekForGet(const Slice& target) { |
794 | | #ifndef NDEBUG |
795 | | if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true; |
796 | | #endif |
797 | 1.43k | if (!data_block_hash_index_) { |
798 | 1.43k | SeekImpl(target); |
799 | 1.43k | UpdateKey(); |
800 | 1.43k | return true; |
801 | 1.43k | } |
802 | 0 | bool res = SeekForGetImpl(target); |
803 | 0 | UpdateKey(); |
804 | 0 | return res; |
805 | 1.43k | } |
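// [Editor's note: illustrative sketch, not part of the original header.]
// How a point-lookup caller might consume SeekForGet(); `iter` and
// `user_target` are hypothetical names:
//
//   if (!iter->SeekForGet(user_target)) {
//     // The data block hash index concluded the key is not in this block;
//     // no key comparison is needed.
//   } else if (iter->Valid()) {
//     // May exist: compare iter->key() with the lookup key before trusting
//     // iter->value().
//   } else {
//     // Positioned past the last entry (or an error); check iter->status().
//   }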
806 | | |
807 | 25.2k | void Invalidate(const Status& s) override { |
808 | 25.2k | BlockIter::Invalidate(s); |
809 | | // Clear prev entries cache. |
810 | 25.2k | prev_entries_keys_buff_.clear(); |
811 | 25.2k | prev_entries_.clear(); |
812 | 25.2k | prev_entries_idx_ = -1; |
813 | 25.2k | } |
814 | | |
815 | | protected: |
816 | | friend Block; |
817 | | inline bool ParseNextDataKey(bool* is_shared); |
818 | | void SeekToFirstImpl() override; |
819 | | void SeekToLastImpl() override; |
820 | | void SeekImpl(const Slice& target) override; |
821 | | void SeekForPrevImpl(const Slice& target) override; |
822 | | void NextImpl() override; |
823 | | void PrevImpl() override; |
824 | | |
825 | | private: |
826 | | // read-amp bitmap |
827 | | BlockReadAmpBitmap* read_amp_bitmap_; |
828 | | // last `current_` value we report to the read-amp bitmap
829 | | mutable uint32_t last_bitmap_offset_; |
830 | | struct CachedPrevEntry { |
831 | | explicit CachedPrevEntry(uint32_t _offset, uint32_t _entry_size, |
832 | | const char* _key_ptr, size_t _key_offset, |
833 | | size_t _key_size, Slice _value) |
834 | 637 | : offset(_offset), |
835 | 637 | entry_size(_entry_size), |
836 | 637 | key_ptr(_key_ptr), |
837 | 637 | key_offset(_key_offset), |
838 | 637 | key_size(_key_size), |
839 | 637 | value(_value) {} |
840 | | |
841 | | // offset of entry in block |
842 | | uint32_t offset; |
843 | | // size of entry (for NextEntryOffset calculation) |
844 | | uint32_t entry_size; |
845 | | // Pointer to key data in block (nullptr if key is delta-encoded) |
846 | | const char* key_ptr; |
847 | | // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) |
848 | | size_t key_offset; |
849 | | // size of key |
850 | | size_t key_size; |
851 | | // value slice pointing to data in block |
852 | | Slice value; |
853 | | }; |
854 | | std::string prev_entries_keys_buff_; |
855 | | std::vector<CachedPrevEntry> prev_entries_; |
856 | | int32_t prev_entries_idx_ = -1; |
857 | | |
858 | | DataBlockHashIndex* data_block_hash_index_; |
859 | | |
860 | | bool SeekForGetImpl(const Slice& target); |
861 | | }; |
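// [Editor's note: illustrative sketch, not part of the original header.]
// The prev_entries_* members above support backward iteration: entries are
// delta encoded, so PrevImpl() cannot step backwards directly. The usual
// approach (a hedged description of the implementation in block.cc) is:
//
//   // Prev() sketch (pseudocode):
//   if (prev_entries_idx_ > 0) {
//     --prev_entries_idx_;  // serve from the cache filled by an earlier scan
//   } else {
//     // Seek to the previous restart point, re-scan forward while caching
//     // each decoded entry as a CachedPrevEntry (delta-encoded keys are
//     // materialized into prev_entries_keys_buff_), then position at the
//     // last cached entry.
//   }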
862 | | |
863 | | // Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and |
864 | | // are used to store Properties associated with the table.
865 | | // Meta blocks always store user keys (no sequence number) and always |
866 | | // use the BytewiseComparator. Additionally, MetaBlock accesses are |
867 | | // not recorded in the Statistics or for Read-Amplification. |
868 | | class MetaBlockIter final : public BlockIter<Slice> { |
869 | | public: |
870 | 170k | MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } |
871 | | void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, |
872 | | bool block_contents_pinned, uint8_t protection_bytes_per_key, |
873 | | const char* kv_checksum, uint32_t block_restart_interval, |
874 | 169k | const char* values_section) { |
875 | | // Initializes the iterator with a BytewiseComparator and |
876 | | // the raw key being a user key. |
877 | 169k | InitializeBase(BytewiseComparator(), data, restarts, num_restarts, |
878 | 169k | kDisableGlobalSequenceNumber, block_contents_pinned, |
879 | 169k | /* user_defined_timestamps_persisted */ true, |
880 | 169k | protection_bytes_per_key, kv_checksum, |
881 | 169k | block_restart_interval, values_section); |
882 | 169k | raw_key_.SetIsUserKey(true); |
883 | 169k | } |
884 | | |
885 | 3.56M | Slice value() const override { |
886 | 3.56M | assert(Valid()); |
887 | 3.56M | return value_; |
888 | 3.56M | } |
889 | | |
890 | | protected: |
891 | | friend Block; |
892 | | void SeekToFirstImpl() override; |
893 | | void SeekToLastImpl() override; |
894 | | void SeekImpl(const Slice& target) override; |
895 | | void SeekForPrevImpl(const Slice& target) override; |
896 | | void NextImpl() override; |
897 | | void PrevImpl() override; |
898 | | // Meta index block's restart interval is always 1. See |
899 | | // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval. |
900 | 0 | uint32_t GetRestartInterval() override { return 1; } |
901 | 0 | uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; } |
902 | | }; |
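// [Editor's note: illustrative sketch, not part of the original header.]
// Minimal iteration over a meta block, assuming `iter` is a MetaBlockIter
// already set up by Block (the accessor name and ownership are not shown
// here):
//
//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
//     Slice meta_key = iter->key();      // user key, bytewise order
//     Slice meta_value = iter->value();  // e.g. an encoded BlockHandle
//   }
//   // Check iter->status() after the loop before trusting the results.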
903 | | |
904 | | class IndexBlockIter final : public BlockIter<IndexValue> { |
905 | | public: |
906 | 54.7k | IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} |
907 | | |
908 | | // key_includes_seq, default true, means that the keys are in internal key |
909 | | // format. |
910 | | // value_is_full, default true, means that no delta encoding is |
911 | | // applied to values. |
912 | | void Initialize( |
913 | | const Comparator* raw_ucmp, const char* data, uint32_t restarts, |
914 | | uint32_t num_restarts, SequenceNumber global_seqno, |
915 | | BlockPrefixIndex* prefix_index, bool have_first_key, |
916 | | bool key_includes_seq, bool value_is_full, bool block_contents_pinned, |
917 | | bool user_defined_timestamps_persisted, uint8_t protection_bytes_per_key, |
918 | | const char* kv_checksum, uint32_t block_restart_interval, |
919 | | const char* values_section, |
920 | 54.7k | BlockBasedTableOptions::BlockSearchType index_block_search_type) { |
921 | 54.7k | InitializeBase(raw_ucmp, data, restarts, num_restarts, |
922 | 54.7k | kDisableGlobalSequenceNumber, block_contents_pinned, |
923 | 54.7k | user_defined_timestamps_persisted, protection_bytes_per_key, |
924 | 54.7k | kv_checksum, block_restart_interval, values_section); |
925 | 54.7k | raw_key_.SetIsUserKey(!key_includes_seq); |
926 | 54.7k | prefix_index_ = prefix_index; |
927 | 54.7k | value_delta_encoded_ = !value_is_full; |
928 | 54.7k | have_first_key_ = have_first_key; |
929 | 54.7k | index_search_type_ = index_block_search_type; |
930 | 54.7k | if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { |
931 | 0 | global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); |
932 | 54.7k | } else { |
933 | 54.7k | global_seqno_state_.reset(); |
934 | 54.7k | } |
935 | 54.7k | } |
936 | | |
937 | 0 | Slice user_key() const override { |
938 | 0 | assert(Valid()); |
939 | 0 | return raw_key_.GetUserKey(); |
940 | 0 | } |
941 | | |
942 | 65.9k | IndexValue value() const override { |
943 | 65.9k | assert(Valid()); |
944 | 65.9k | if (value_delta_encoded_ || global_seqno_state_ != nullptr || |
945 | 65.9k | pad_min_timestamp_) { |
946 | 65.9k | return decoded_value_; |
947 | 65.9k | } else { |
948 | 0 | IndexValue entry; |
949 | 0 | Slice v = value_; |
950 | 0 | Status decode_s __attribute__((__unused__)) = |
951 | 0 | entry.DecodeFrom(&v, have_first_key_, nullptr); |
952 | 0 | assert(decode_s.ok()); |
953 | 0 | return entry; |
954 | 0 | } |
955 | 65.9k | } |
956 | | |
957 | 0 | Slice raw_value() const { |
958 | 0 | assert(Valid()); |
959 | 0 | return value_; |
960 | 0 | } |
961 | | |
962 | 0 | bool IsValuePinned() const override { |
963 | 0 | return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); |
964 | 0 | } |
965 | | |
966 | | protected: |
967 | | friend Block; |
968 | | // IndexBlockIter follows a different contract for prefix iterator |
969 | | // from data iterators. |
970 | | // If the prefix of the seek key `target` exists in the file, it must
971 | | // return the same result as total order seek. |
972 | | // If the prefix of `target` doesn't exist in the file, it can either |
973 | | // return the result of total order seek, or set both of Valid() = false |
974 | | // and status() = NotFound(). |
975 | | void SeekImpl(const Slice& target) override; |
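// [Editor's note: illustrative sketch, not part of the original header.]
// Caller-side view of the contract above: when the prefix of the seek key is
// absent, an invalid iterator with a NotFound status is a legitimate outcome
// rather than an error:
//
//   iter->Seek(target);
//   if (!iter->Valid() && iter->status().IsNotFound()) {
//     // The prefix of `target` does not exist; treat the range as empty.
//   }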
976 | | |
977 | 0 | void SeekForPrevImpl(const Slice&) override { |
978 | 0 | assert(false); |
979 | 0 | current_ = GetKeysEndOffset(); |
980 | 0 | restart_index_ = num_restarts_; |
981 | 0 | status_ = Status::InvalidArgument( |
982 | 0 | "RocksDB internal error: should never call SeekForPrev() on index " |
983 | 0 | "blocks"); |
984 | 0 | raw_key_.Clear(); |
985 | 0 | value_.clear(); |
986 | 0 | } |
987 | | |
988 | | void PrevImpl() override; |
989 | | void NextImpl() override; |
990 | | void SeekToFirstImpl() override; |
991 | | void SeekToLastImpl() override; |
992 | | |
993 | | private: |
994 | | bool value_delta_encoded_; |
995 | | bool have_first_key_; // value includes first_internal_key |
996 | | BlockPrefixIndex* prefix_index_; |
997 | | // Whether the value is delta encoded. In that case the value is assumed to be
998 | | // a BlockHandle. The first value in each restart interval is the fully encoded
999 | | // BlockHandle; the rest of the values encode only the size part of the
1000 | | // BlockHandle. The offset of a delta encoded BlockHandle is computed by adding
1001 | | // the sizes of the previous delta encoded values in the same restart interval
1002 | | // to the offset of the first value in that restart interval.
1003 | | IndexValue decoded_value_; |
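// [Editor's note: illustrative worked example, not part of the original
// header.] Suppose a restart interval whose first entry stores the full
// BlockHandle {offset = 0, size = 4096} and whose next two entries store only
// the sizes 8192 and 2048. Following the rule above, the reconstructed offsets
// are 0 + 4096 = 4096 and 4096 + 8192 = 12288; the exact reconstruction,
// including any per-block trailer accounting, is in DecodeCurrentValue().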
1004 | | |
1005 | | // When sequence number overwriting is enabled, this struct contains the seqno |
1006 | | // to overwrite with, and current first_internal_key with overwritten seqno. |
1007 | | // This is rarely used, so we put it behind a pointer and only allocate when |
1008 | | // needed. |
1009 | | struct GlobalSeqnoState { |
1010 | | // First internal key according to current index entry, but with sequence |
1011 | | // number overwritten to global_seqno. |
1012 | | IterKey first_internal_key; |
1013 | | SequenceNumber global_seqno; |
1014 | | |
1015 | 0 | explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} |
1016 | | }; |
1017 | | |
1018 | | std::unique_ptr<GlobalSeqnoState> global_seqno_state_; |
1019 | | |
1020 | | // Buffers the `first_internal_key` referred to by `decoded_value_` when
1021 | | // `pad_min_timestamp_` is true. |
1022 | | std::string first_internal_key_with_ts_; |
1023 | | |
1024 | | // The search algorithm to use when reading the index block. |
1025 | | BlockBasedTableOptions::BlockSearchType index_search_type_ = |
1026 | | BlockBasedTableOptions::kBinary; |
1027 | | |
1028 | | // Set *prefix_may_exist to false if no key can possibly share the same prefix
1029 | | // as `target`. If not set, the result position should be the same as total |
1030 | | // order Seek. |
1031 | | bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist); |
1032 | | // Set *prefix_may_exist to false if no key can possibly share the same |
1033 | | // prefix as `target`. If not set, the result position should be the same |
1034 | | // as total order seek. |
1035 | | bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, |
1036 | | uint32_t left, uint32_t right, uint32_t* index, |
1037 | | bool* prefix_may_exist); |
1038 | | inline int CompareBlockKey(uint32_t block_index, const Slice& target); |
1039 | | |
1040 | | template <typename DecodeKeyFunc> |
1041 | | bool FindRestartPointForSeek(const Slice& seek_key, uint32_t* index, |
1042 | | bool* skip_linear_scan); |
1043 | | |
1044 | | inline bool ParseNextIndexKey(); |
1045 | | |
1046 | | // When value_delta_encoded_ is enabled, decodes the value, which is assumed
1047 | | // to be a BlockHandle, and puts the result into decoded_value_.
1048 | | inline void DecodeCurrentValue(bool is_shared); |
1049 | | }; |
1050 | | |
1051 | | } // namespace ROCKSDB_NAMESPACE |