Coverage Report

Created: 2026-04-10 07:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/table/block_based/block.h
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#pragma once
11
#include <stddef.h>
12
#include <stdint.h>
13
14
#include <string>
15
#include <vector>
16
17
#include "db/kv_checksum.h"
18
#include "db/pinned_iterators_manager.h"
19
#include "port/malloc.h"
20
#include "rocksdb/advanced_cache.h"
21
#include "rocksdb/iterator.h"
22
#include "rocksdb/options.h"
23
#include "rocksdb/statistics.h"
24
#include "rocksdb/table.h"
25
#include "table/block_based/block_prefix_index.h"
26
#include "table/block_based/data_block_hash_index.h"
27
#include "table/format.h"
28
#include "table/internal_iterator.h"
29
#include "test_util/sync_point.h"
30
#include "util/random.h"
31
32
namespace ROCKSDB_NAMESPACE {
33
34
struct BlockContents;
35
class Comparator;
36
template <class TValue>
37
class BlockIter;
38
class DataBlockIter;
39
class IndexBlockIter;
40
class MetaBlockIter;
41
class BlockPrefixIndex;
42
43
// BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data
44
// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of
45
// bytes in the Block we update the bitmap and increment
46
// READ_AMP_ESTIMATE_USEFUL_BYTES.
47
class BlockReadAmpBitmap {
48
 public:
49
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
50
                              Statistics* statistics)
51
0
      : bitmap_(nullptr),
52
0
        bytes_per_bit_pow_(0),
53
0
        statistics_(statistics),
54
0
        rnd_(Random::GetTLSInstance()->Uniform(
55
0
            static_cast<int>(bytes_per_bit))) {
56
0
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
57
0
    assert(block_size > 0 && bytes_per_bit > 0);
58
59
    // convert bytes_per_bit to be a power of 2
60
0
    while (bytes_per_bit >>= 1) {
61
0
      bytes_per_bit_pow_++;
62
0
    }
63
64
    // num_bits_needed = ceil(block_size / bytes_per_bit)
65
0
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
66
0
    assert(num_bits_needed > 0);
67
68
    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
69
0
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;
70
71
    // Create bitmap and set all the bits to 0
72
0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();
73
74
0
    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
75
0
  }
76
77
0
  ~BlockReadAmpBitmap() { delete[] bitmap_; }
78
79
0
  void Mark(uint32_t start_offset, uint32_t end_offset) {
80
0
    assert(end_offset >= start_offset);
81
    // Index of first bit in mask
82
0
    uint32_t start_bit =
83
0
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
84
0
        bytes_per_bit_pow_;
85
    // Index of last bit in mask + 1
86
0
    uint32_t exclusive_end_bit =
87
0
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
88
0
    if (start_bit >= exclusive_end_bit) {
89
0
      return;
90
0
    }
91
0
    assert(exclusive_end_bit > 0);
92
93
0
    if (GetAndSet(start_bit) == 0) {
94
0
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
95
0
                                  << bytes_per_bit_pow_;
96
0
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
97
0
                 new_useful_bytes);
98
0
    }
99
0
  }
100
101
0
  Statistics* GetStatistics() {
102
0
    return statistics_.load(std::memory_order_relaxed);
103
0
  }
104
105
0
  void SetStatistics(Statistics* stats) { statistics_.store(stats); }
106
107
0
  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
108
109
0
  size_t ApproximateMemoryUsage() const {
110
0
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
111
0
    return malloc_usable_size((void*)this);
112
0
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
113
0
    return sizeof(*this);
114
0
  }
115
116
 private:
117
  // Get the current value of bit at `bit_idx` and set it to 1
118
0
  inline bool GetAndSet(uint32_t bit_idx) {
119
0
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
120
0
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);
121
122
0
    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
123
0
           bit_mask;
124
0
  }
125
126
  const uint32_t kBytesPersEntry = sizeof(uint32_t);   // 4 bytes
127
  const uint32_t kBitsPerEntry = kBytesPersEntry * 8;  // 32 bits
128
129
  // Bitmap used to record the bytes that we read, use atomic to protect
130
  // against multiple threads updating the same bit
131
  std::atomic<uint32_t>* bitmap_;
132
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize
133
  // muliplication and division
134
  uint8_t bytes_per_bit_pow_;
135
  // Pointer to DB Statistics object, Since this bitmap may outlive the DB
136
  // this pointer maybe invalid, but the DB will update it to a valid pointer
137
  // by using SetStatistics() before calling Mark()
138
  std::atomic<Statistics*> statistics_;
139
  uint32_t rnd_;
140
};
141
142
// class Block is the uncompressed and "parsed" form for blocks containing
143
// key-value pairs. (See BlockContents comments for more on terminology.)
144
// This includes the in-memory representation of data blocks, index blocks
145
// (including partitions), range deletion blocks, properties blocks, metaindex
146
// blocks, as well as the top level of the partitioned filter structure (which
147
// is actually an index of the filter partitions). It is NOT suitable for
148
// compressed blocks in general, filter blocks/partitions, or compression
149
// dictionaries.
150
//
151
// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
152
// for details of the format and the various block types.
153
//
154
// TODO: Rename to ParsedKvBlock?
155
class Block {
156
 public:
157
  // Initialize the block with the specified contents.
158
  // If restart_interval is provided (non-zero), it will be stored directly
159
  // instead of being calculated later.
160
  explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0,
161
                 Statistics* statistics = nullptr,
162
                 uint32_t restart_interval = 1);
163
  // No copying allowed
164
  Block(const Block&) = delete;
165
  void operator=(const Block&) = delete;
166
167
  ~Block();
168
169
263k
  size_t size() const { return contents_.data.size(); }
170
347k
  const char* data() const { return contents_.data.data(); }
171
  // The additional memory space taken by the block data.
172
24.9k
  size_t usable_size() const { return contents_.usable_size(); }
173
0
  uint32_t NumRestarts() const { return num_restarts_; }
174
24.9k
  bool own_bytes() const { return contents_.own_bytes(); }
175
0
  bool IsUniform() const { return is_uniform_; }
176
177
  BlockBasedTableOptions::DataBlockIndexType IndexType() const;
178
179
  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
180
  // comparator.
181
  //
182
  // If iter is null, return new Iterator
183
  // If iter is not null, update this one and return it as Iterator*
184
  //
185
  // Updates read_amp_bitmap_ if it is not nullptr.
186
  //
187
  // If `block_contents_pinned` is true, the caller will guarantee that when
188
  // the cleanup functions are transferred from the iterator to other
189
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
190
  // valid. Either the iterator holds cache handle or ownership of some resource
191
  // and release them in a release function, or caller is sure that the data
192
  // will not go away (for example, it's from mmapped file which will not be
193
  // closed).
194
  //
195
  // `user_defined_timestamps_persisted` controls whether a min timestamp is
196
  // padded while key is being parsed from the block.
197
  //
198
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
199
  // the iterator will simply be set as "invalid", rather than returning
200
  // the key that is just pass the target key.
201
  DataBlockIter* NewDataIterator(const Comparator* raw_ucmp,
202
                                 SequenceNumber global_seqno,
203
                                 DataBlockIter* iter = nullptr,
204
                                 Statistics* stats = nullptr,
205
                                 bool block_contents_pinned = false,
206
                                 bool user_defined_timestamps_persisted = true);
207
208
  // Returns an MetaBlockIter for iterating over blocks containing metadata
209
  // (like Properties blocks).  Unlike data blocks, the keys for these blocks
210
  // do not contain sequence numbers, do not use a user-define comparator, and
211
  // do not track read amplification/statistics.  Additionally, MetaBlocks will
212
  // not assert if the block is formatted improperly.
213
  //
214
  // If `block_contents_pinned` is true, the caller will guarantee that when
215
  // the cleanup functions are transferred from the iterator to other
216
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
217
  // valid. Either the iterator holds cache handle or ownership of some resource
218
  // and release them in a release function, or caller is sure that the data
219
  // will not go away (for example, it's from mmapped file which will not be
220
  // closed).
221
  MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false);
222
223
  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
224
  // comparator.
225
  //
226
  // key_includes_seq, default true, means that the keys are in internal key
227
  // format.
228
  // value_is_full, default true, means that no delta encoding is
229
  // applied to values.
230
  //
231
  // If `prefix_index` is not nullptr this block will do hash lookup for the key
232
  // prefix. If total_order_seek is true, prefix_index_ is ignored.
233
  //
234
  // `have_first_key` controls whether IndexValue will contain
235
  // first_internal_key. It affects data serialization format, so the same value
236
  // have_first_key must be used when writing and reading index.
237
  // It is determined by IndexType property of the table.
238
  // `user_defined_timestamps_persisted` controls whether a min timestamp is
239
  // padded while key is being parsed from the block.
240
  // `index_block_search_type` controls which search algorithm to use when
241
  // reading the index block. kBinary uses binary search, while
242
  // kInterpolation uses interpolation search which can be faster
243
  // for uniformly distributed keys.
244
  IndexBlockIter* NewIndexIterator(
245
      const Comparator* raw_ucmp, SequenceNumber global_seqno,
246
      IndexBlockIter* iter, Statistics* stats, bool total_order_seek,
247
      bool have_first_key, bool key_includes_seq, bool value_is_full,
248
      bool block_contents_pinned = false,
249
      bool user_defined_timestamps_persisted = true,
250
      BlockPrefixIndex* prefix_index = nullptr,
251
      BlockBasedTableOptions::BlockSearchType index_block_search_type =
252
          BlockBasedTableOptions::kBinary);
253
254
  // Report an approximation of how much memory has been used.
255
  size_t ApproximateMemoryUsage() const;
256
257
  // For TypedCacheInterface
258
0
  const Slice& ContentSlice() const { return contents_.data; }
259
260
  // Initializes per key-value checksum protection.
261
  // After this method is called, each DataBlockIterator returned
262
  // by NewDataIterator will verify per key-value checksum for any key it read.
263
  void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
264
                                         const Comparator* raw_ucmp);
265
266
  // Initializes per key-value checksum protection.
267
  // After this method is called, each IndexBlockIterator returned
268
  // by NewIndexIterator will verify per key-value checksum for any key it read.
269
  // value_is_full and index_has_first_key are needed to be able to parse
270
  // the index block content and construct checksums.
271
  void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
272
                                          const Comparator* raw_ucmp,
273
                                          bool value_is_full,
274
                                          bool index_has_first_key);
275
276
  // Initializes per key-value checksum protection.
277
  // After this method is called, each MetaBlockIter returned
278
  // by NewMetaIterator will verify per key-value checksum for any key it read.
279
  void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key);
280
281
  static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len,
282
0
                                 const Slice& key, const Slice& value) {
283
0
    ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr);
284
0
  }
285
286
0
  bool HasSeparatedKV() const { return values_section_ != nullptr; }
287
288
0
  const char* TEST_GetKVChecksum() const { return kv_checksum_; }
289
290
 private:
291
  // Returns a detailed error status by re-processing the footer.
292
  // Should only be called when size() == 0 (error marker).
293
  Status GetCorruptionStatus() const;
294
295
  BlockContents contents_;
296
  // Normal state: offset in data_ of restart array.
297
  // Error state (size()==0): original data size if footer decode failed,
298
  //   otherwise 0. Used by GetCorruptionStatus() to re-decode footer.
299
  uint32_t restart_offset_;
300
  uint32_t num_restarts_;
301
  bool is_uniform_{false};
302
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
303
  char* kv_checksum_{nullptr};
304
  uint32_t checksum_size_{0};
305
  // Used by block iterators to calculate current key index within a block
306
  uint32_t block_restart_interval_{0};
307
  uint8_t protection_bytes_per_key_{0};
308
  DataBlockHashIndex data_block_hash_index_;
309
310
  // Pointer to values section, nullptr if not using separated KV
311
  const char* values_section_{nullptr};
312
};
313
314
// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The
315
// format of this data buffer is an uncompressed, sorted sequence of key-value
316
// pairs (see `Block` API for more details).
317
//
318
// Notably, the keys may either be in internal key format or user key format.
319
// Subclasses are responsible for configuring the key format.
320
//
321
// `BlockIter` intends to provide final overrides for all of
322
// `InternalIteratorBase` functions that can move the iterator. It does
323
// this to guarantee `UpdateKey()` is called exactly once after each key
324
// movement potentially visible to users. In this step, the key is prepared
325
// (e.g., serialized if global seqno is in effect) so it can be returned
326
// immediately when the user asks for it via calling `key() const`.
327
//
328
// For its subclasses, it provides protected variants of the above-mentioned
329
// final-overridden methods. They are named with the "Impl" suffix, e.g.,
330
// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
331
// "Impl" functions are responsible for positioning `raw_key_` but not
332
// invoking `UpdateKey()`.
333
//
334
// Per key-value checksum is enabled if relevant states are passed in during
335
// `InitializeBase()`. The checksum verification is done in each call to
336
// UpdateKey() for the current key. Each subclass is responsible for keeping
337
// track of cur_entry_idx_, the index of the current key within the block.
338
// BlockIter uses this index to get the corresponding checksum for current key.
339
// Additional checksum verification may be done in subclasses if they read keys
340
// other than the key being processed in UpdateKey().
341
template <class TValue>
342
class BlockIter : public InternalIteratorBase<TValue> {
343
 public:
344
  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
345
  // nothing. Calls cleanup functions.
346
25.2k
  virtual void Invalidate(const Status& s) {
347
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
348
25.2k
    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
349
350
25.2k
    data_ = nullptr;
351
25.2k
    current_ = GetKeysEndOffset();
352
25.2k
    status_ = s;
353
354
    // Call cleanup callbacks.
355
25.2k
    Cleanable::Reset();
356
25.2k
  }
rocksdb::BlockIter<rocksdb::Slice>::Invalidate(rocksdb::Status const&)
Line
Count
Source
346
25.2k
  virtual void Invalidate(const Status& s) {
347
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
348
25.2k
    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
349
350
25.2k
    data_ = nullptr;
351
25.2k
    current_ = GetKeysEndOffset();
352
25.2k
    status_ = s;
353
354
    // Call cleanup callbacks.
355
25.2k
    Cleanable::Reset();
356
25.2k
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::Invalidate(rocksdb::Status const&)
357
358
7.41M
  bool Valid() const override {
359
    // When status_ is not ok, iter should be invalid.
360
7.41M
    auto key_end = GetKeysEndOffset();
361
7.41M
    assert(status_.ok() || current_ >= key_end);
362
7.41M
    auto valid = current_ < key_end;
363
7.41M
    return valid;
364
7.41M
  }
rocksdb::BlockIter<rocksdb::Slice>::Valid() const
Line
Count
Source
358
7.29M
  bool Valid() const override {
359
    // When status_ is not ok, iter should be invalid.
360
7.29M
    auto key_end = GetKeysEndOffset();
361
    assert(status_.ok() || current_ >= key_end);
362
7.29M
    auto valid = current_ < key_end;
363
7.29M
    return valid;
364
7.29M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::Valid() const
Line
Count
Source
358
120k
  bool Valid() const override {
359
    // When status_ is not ok, iter should be invalid.
360
120k
    auto key_end = GetKeysEndOffset();
361
    assert(status_.ok() || current_ >= key_end);
362
120k
    auto valid = current_ < key_end;
363
120k
    return valid;
364
120k
  }
365
366
136k
  void SeekToFirst() override final {
367
#ifndef NDEBUG
368
    if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
369
#endif
370
136k
    SeekToFirstImpl();
371
136k
    UpdateKey();
372
136k
  }
rocksdb::BlockIter<rocksdb::Slice>::SeekToFirst()
Line
Count
Source
366
114k
  void SeekToFirst() override final {
367
#ifndef NDEBUG
368
    if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
369
#endif
370
114k
    SeekToFirstImpl();
371
114k
    UpdateKey();
372
114k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::SeekToFirst()
Line
Count
Source
366
22.0k
  void SeekToFirst() override final {
367
#ifndef NDEBUG
368
    if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
369
#endif
370
22.0k
    SeekToFirstImpl();
371
22.0k
    UpdateKey();
372
22.0k
  }
373
374
3.34k
  void SeekToLast() override final {
375
3.34k
    SeekToLastImpl();
376
3.34k
    UpdateKey();
377
3.34k
  }
rocksdb::BlockIter<rocksdb::Slice>::SeekToLast()
Line
Count
Source
374
1.60k
  void SeekToLast() override final {
375
1.60k
    SeekToLastImpl();
376
1.60k
    UpdateKey();
377
1.60k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::SeekToLast()
Line
Count
Source
374
1.74k
  void SeekToLast() override final {
375
1.74k
    SeekToLastImpl();
376
1.74k
    UpdateKey();
377
1.74k
  }
378
379
353k
  void Seek(const Slice& target) override final {
380
353k
    SeekImpl(target);
381
353k
    UpdateKey();
382
353k
  }
rocksdb::BlockIter<rocksdb::Slice>::Seek(rocksdb::Slice const&)
Line
Count
Source
379
342k
  void Seek(const Slice& target) override final {
380
342k
    SeekImpl(target);
381
342k
    UpdateKey();
382
342k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::Seek(rocksdb::Slice const&)
Line
Count
Source
379
11.0k
  void Seek(const Slice& target) override final {
380
11.0k
    SeekImpl(target);
381
11.0k
    UpdateKey();
382
11.0k
  }
383
384
5.54k
  void SeekForPrev(const Slice& target) override final {
385
5.54k
    SeekForPrevImpl(target);
386
5.54k
    UpdateKey();
387
5.54k
  }
rocksdb::BlockIter<rocksdb::Slice>::SeekForPrev(rocksdb::Slice const&)
Line
Count
Source
384
5.54k
  void SeekForPrev(const Slice& target) override final {
385
5.54k
    SeekForPrevImpl(target);
386
5.54k
    UpdateKey();
387
5.54k
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::SeekForPrev(rocksdb::Slice const&)
388
389
3.54M
  void Next() override final {
390
3.54M
    NextImpl();
391
3.54M
    UpdateKey();
392
3.54M
  }
rocksdb::BlockIter<rocksdb::Slice>::Next()
Line
Count
Source
389
3.52M
  void Next() override final {
390
3.52M
    NextImpl();
391
3.52M
    UpdateKey();
392
3.52M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::Next()
Line
Count
Source
389
18.2k
  void Next() override final {
390
18.2k
    NextImpl();
391
18.2k
    UpdateKey();
392
18.2k
  }
393
394
0
  bool NextAndGetResult(IterateResult* result) override final {
395
    // This does not need to call `UpdateKey()` as the parent class only has
396
    // access to the `UpdateKey()`-invoking functions.
397
0
    return InternalIteratorBase<TValue>::NextAndGetResult(result);
398
0
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::Slice>::NextAndGetResult(rocksdb::IterateResult*)
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::NextAndGetResult(rocksdb::IterateResult*)
399
400
12.2k
  void Prev() override final {
401
12.2k
    PrevImpl();
402
12.2k
    UpdateKey();
403
12.2k
  }
rocksdb::BlockIter<rocksdb::Slice>::Prev()
Line
Count
Source
400
5.20k
  void Prev() override final {
401
5.20k
    PrevImpl();
402
5.20k
    UpdateKey();
403
5.20k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::Prev()
Line
Count
Source
400
7.05k
  void Prev() override final {
401
7.05k
    PrevImpl();
402
7.05k
    UpdateKey();
403
7.05k
  }
404
405
3.63M
  Status status() const override { return status_; }
rocksdb::BlockIter<rocksdb::Slice>::status() const
Line
Count
Source
405
3.56M
  Status status() const override { return status_; }
rocksdb::BlockIter<rocksdb::IndexValue>::status() const
Line
Count
Source
405
73.7k
  Status status() const override { return status_; }
406
407
3.52M
  Slice key() const override {
408
3.52M
    assert(Valid());
409
3.52M
    return key_;
410
3.52M
  }
rocksdb::BlockIter<rocksdb::Slice>::key() const
Line
Count
Source
407
3.52M
  Slice key() const override {
408
    assert(Valid());
409
3.52M
    return key_;
410
3.52M
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::key() const
411
412
#ifndef NDEBUG
413
  ~BlockIter() override {
414
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
415
    assert(!pinned_iters_mgr_ ||
416
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
417
    status_.PermitUncheckedError();
418
  }
419
420
  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
421
    pinned_iters_mgr_ = pinned_iters_mgr;
422
  }
423
424
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
425
426
  bool TEST_Corrupt_Callback(const std::string& sync_point) {
427
    bool corrupt = false;
428
    TEST_SYNC_POINT_CALLBACK(sync_point, static_cast<void*>(&corrupt));
429
430
    if (corrupt) {
431
      CorruptionError();
432
    }
433
    return corrupt;
434
  }
435
#endif
436
437
142k
  bool IsKeyPinned() const override {
438
142k
    return block_contents_pinned_ && key_pinned_;
439
142k
  }
rocksdb::BlockIter<rocksdb::Slice>::IsKeyPinned() const
Line
Count
Source
437
142k
  bool IsKeyPinned() const override {
438
142k
    return block_contents_pinned_ && key_pinned_;
439
142k
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::IsKeyPinned() const
440
441
72.8k
  bool IsValuePinned() const override { return block_contents_pinned_; }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::IsValuePinned() const
rocksdb::BlockIter<rocksdb::Slice>::IsValuePinned() const
Line
Count
Source
441
72.8k
  bool IsValuePinned() const override { return block_contents_pinned_; }
442
443
  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
444
445
0
  uint32_t ValueOffset() const {
446
0
    return static_cast<uint32_t>(value_.data() - data_);
447
0
  }
448
449
29.1k
  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }
rocksdb::BlockIter<rocksdb::Slice>::SetCacheHandle(rocksdb::Cache::Handle*)
Line
Count
Source
449
29.1k
  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::SetCacheHandle(rocksdb::Cache::Handle*)
450
451
  Cache::Handle* cache_handle() { return cache_handle_; }
452
453
 protected:
454
  InternalKeyComparator icmp_;
455
  const char* data_;       // underlying block contents
456
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array
457
458
  const char* values_section_;
459
  // Slice of current entry in the data section. Does not contain value if
460
  // values_section_ exists
461
  Slice entry_;
462
463
  // Index of restart block in which current_ or current_-1 falls
464
  uint32_t restart_index_;
465
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
466
  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
467
  uint32_t current_;
468
  // Raw key from block.
469
  IterKey raw_key_;
470
  // Buffer for key data when global seqno assignment is enabled.
471
  IterKey key_buf_;
472
  Slice value_;
473
  Status status_;
474
  // Key to be exposed to users.
475
  Slice key_;
476
  SequenceNumber global_seqno_;
477
  // Size of the user-defined timestamp.
478
  size_t ts_sz_ = 0;
479
  // If user-defined timestamp is enabled but not persisted. A min timestamp
480
  // will be padded to the key during key parsing where it applies. Such as when
481
  // parsing keys from data block, index block, parsing the first internal
482
  // key from IndexValue entry. Min timestamp padding is different for when
483
  // `raw_key_` is a user key vs is an internal key.
484
  //
485
  // This only applies to data block and index blocks including index block for
486
  // data blocks, index block for partitioned filter blocks, index block for
487
  // partitioned index blocks. In summary, this only applies to block whose key
488
  // are real user keys or internal keys created from user keys.
489
  bool pad_min_timestamp_;
490
491
  // Per key-value checksum related states
492
  const char* kv_checksum_;
493
  // Index of the next entry to be parsed (used for checksum verification
494
  // and to determine if we're at a restart point for separated KV storage)
495
  int32_t cur_entry_idx_;
496
  uint32_t block_restart_interval_;
497
  uint8_t protection_bytes_per_key_;
498
499
  bool key_pinned_;
500
  // Whether the block data is guaranteed to outlive this iterator, and
501
  // as long as the cleanup functions are transferred to another class,
502
  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
503
  bool block_contents_pinned_;
504
505
  virtual void SeekToFirstImpl() = 0;
506
  virtual void SeekToLastImpl() = 0;
507
  virtual void SeekImpl(const Slice& target) = 0;
508
  virtual void SeekForPrevImpl(const Slice& target) = 0;
509
  virtual void NextImpl() = 0;
510
  virtual void PrevImpl() = 0;
511
512
  // Returns the restart interval of this block.
513
  // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized.
514
0
  virtual uint32_t GetRestartInterval() {
515
0
    if (num_restarts_ <= 1 || data_ == nullptr) {
516
0
      return 0;
517
0
    }
518
0
    SeekToFirstImpl();
519
0
    uint32_t end_index = GetRestartPoint(1);
520
0
    uint32_t count = 1;
521
0
    while (NextEntryOffset() < end_index && status_.ok()) {
522
0
      assert(Valid());
523
0
      NextImpl();
524
0
      ++count;
525
0
    }
526
0
    return count;
527
0
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::Slice>::GetRestartInterval()
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::GetRestartInterval()
528
529
  // Returns the number of keys in this block.
530
0
  virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) {
531
0
    if (num_restarts_ == 0 || data_ == nullptr) {
532
0
      return 0;
533
0
    }
534
0
    uint32_t count = (num_restarts_ - 1) * block_restart_interval;
535
    // Add number of keys from the last restart interval
536
0
    SeekToRestartPoint(num_restarts_ - 1);
537
    // For separated KV storage, keys end at values_section_, not at restarts_
538
0
    uint32_t keys_end = values_section_
539
0
                            ? static_cast<uint32_t>(values_section_ - data_)
540
0
                            : restarts_;
541
0
    while (NextEntryOffset() < keys_end && status_.ok()) {
542
0
      NextImpl();
543
0
      ++count;
544
0
    }
545
0
    return count;
546
0
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::Slice>::NumberOfKeys(unsigned int)
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::NumberOfKeys(unsigned int)
547
548
  // Stores whether the current key has a shared bytes with prev key in
549
  // *is_shared.
550
  // Sets raw_key_, value_ to the current parsed key and value.
551
  // Sets restart_index_ to point to the restart interval that contains
552
  // the current key.
553
  template <typename DecodeEntryFunc, bool StrictCheck = false>
554
  inline bool ParseNextKey(bool* is_shared);
555
556
  // protection_bytes_per_key, kv_checksum, and block_restart_interval
557
  // are needed only for per kv checksum verification.
558
  void InitializeBase(const Comparator* raw_ucmp, const char* data,
559
                      uint32_t restarts, uint32_t num_restarts,
560
                      SequenceNumber global_seqno, bool block_contents_pinned,
561
                      bool user_defined_timestamp_persisted,
562
                      uint8_t protection_bytes_per_key, const char* kv_checksum,
563
                      uint32_t block_restart_interval,
564
263k
                      const char* values_section) {
565
263k
    assert(data_ == nullptr);  // Ensure it is called only once
566
263k
    assert(num_restarts > 0);  // Ensure the param is valid
567
263k
    assert(raw_ucmp != nullptr);
568
263k
    icmp_ = InternalKeyComparator(raw_ucmp);
569
263k
    data_ = data;
570
263k
    restarts_ = restarts;
571
263k
    num_restarts_ = num_restarts;
572
263k
    restart_index_ = num_restarts_;
573
263k
    entry_ = Slice();
574
263k
    global_seqno_ = global_seqno;
575
263k
    ts_sz_ = raw_ucmp->timestamp_size();
576
263k
    pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted;
577
263k
    block_contents_pinned_ = block_contents_pinned;
578
263k
    cache_handle_ = nullptr;
579
263k
    cur_entry_idx_ = -1;
580
263k
    protection_bytes_per_key_ = protection_bytes_per_key;
581
263k
    kv_checksum_ = kv_checksum;
582
263k
    block_restart_interval_ = block_restart_interval;
583
    // Checksum related states are either all 0/nullptr or all non-zero.
584
    // One exception is when num_restarts == 0, block_restart_interval can be 0
585
    // since we are not able to compute it.
586
263k
    assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
587
263k
           (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
588
263k
            (block_restart_interval > 0 || num_restarts == 1)));
589
590
263k
    values_section_ = values_section;
591
263k
    current_ = GetKeysEndOffset();
592
263k
  }
rocksdb::BlockIter<rocksdb::Slice>::InitializeBase(rocksdb::Comparator const*, char const*, unsigned int, unsigned int, unsigned long, bool, bool, unsigned char, char const*, unsigned int, char const*)
Line
Count
Source
564
208k
                      const char* values_section) {
565
208k
    assert(data_ == nullptr);  // Ensure it is called only once
566
208k
    assert(num_restarts > 0);  // Ensure the param is valid
567
208k
    assert(raw_ucmp != nullptr);
568
208k
    icmp_ = InternalKeyComparator(raw_ucmp);
569
208k
    data_ = data;
570
208k
    restarts_ = restarts;
571
208k
    num_restarts_ = num_restarts;
572
208k
    restart_index_ = num_restarts_;
573
208k
    entry_ = Slice();
574
208k
    global_seqno_ = global_seqno;
575
208k
    ts_sz_ = raw_ucmp->timestamp_size();
576
208k
    pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted;
577
208k
    block_contents_pinned_ = block_contents_pinned;
578
208k
    cache_handle_ = nullptr;
579
208k
    cur_entry_idx_ = -1;
580
208k
    protection_bytes_per_key_ = protection_bytes_per_key;
581
208k
    kv_checksum_ = kv_checksum;
582
208k
    block_restart_interval_ = block_restart_interval;
583
    // Checksum related states are either all 0/nullptr or all non-zero.
584
    // One exception is when num_restarts == 0, block_restart_interval can be 0
585
    // since we are not able to compute it.
586
208k
    assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
587
208k
           (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
588
208k
            (block_restart_interval > 0 || num_restarts == 1)));
589
590
208k
    values_section_ = values_section;
591
208k
    current_ = GetKeysEndOffset();
592
208k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::InitializeBase(rocksdb::Comparator const*, char const*, unsigned int, unsigned int, unsigned long, bool, bool, unsigned char, char const*, unsigned int, char const*)
Line
Count
Source
564
54.7k
                      const char* values_section) {
565
54.7k
    assert(data_ == nullptr);  // Ensure it is called only once
566
54.7k
    assert(num_restarts > 0);  // Ensure the param is valid
567
54.7k
    assert(raw_ucmp != nullptr);
568
54.7k
    icmp_ = InternalKeyComparator(raw_ucmp);
569
54.7k
    data_ = data;
570
54.7k
    restarts_ = restarts;
571
54.7k
    num_restarts_ = num_restarts;
572
54.7k
    restart_index_ = num_restarts_;
573
54.7k
    entry_ = Slice();
574
54.7k
    global_seqno_ = global_seqno;
575
54.7k
    ts_sz_ = raw_ucmp->timestamp_size();
576
54.7k
    pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted;
577
54.7k
    block_contents_pinned_ = block_contents_pinned;
578
54.7k
    cache_handle_ = nullptr;
579
54.7k
    cur_entry_idx_ = -1;
580
54.7k
    protection_bytes_per_key_ = protection_bytes_per_key;
581
54.7k
    kv_checksum_ = kv_checksum;
582
54.7k
    block_restart_interval_ = block_restart_interval;
583
    // Checksum related states are either all 0/nullptr or all non-zero.
584
    // One exception is when num_restarts == 0, block_restart_interval can be 0
585
    // since we are not able to compute it.
586
54.7k
    assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
587
54.7k
           (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
588
54.7k
            (block_restart_interval > 0 || num_restarts == 1)));
589
590
54.7k
    values_section_ = values_section;
591
54.7k
    current_ = GetKeysEndOffset();
592
54.7k
  }
593
594
0
  void CorruptionError(const std::string& error_msg = "bad entry in block") {
595
0
    current_ = GetKeysEndOffset();
596
0
    restart_index_ = num_restarts_;
597
0
    status_ = Status::Corruption(error_msg);
598
0
    raw_key_.Clear();
599
0
    value_.clear();
600
0
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::Slice>::CorruptionError(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::CorruptionError(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)
601
602
0
  void PerKVChecksumCorruptionError() {
603
0
    std::string error_msg{
604
0
        "Corrupted block entry: per key-value checksum verification "
605
0
        "failed."};
606
0
    error_msg.append(" Offset: " + std::to_string(current_) + ".");
607
0
    error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + ".");
608
0
    CorruptionError(error_msg);
609
0
  }
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::Slice>::PerKVChecksumCorruptionError()
Unexecuted instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::PerKVChecksumCorruptionError()
610
611
1.17M
  void UpdateRawKeyAndMaybePadMinTimestamp(IterKey& raw_key, const Slice& key) {
612
1.17M
    if (pad_min_timestamp_) {
613
0
      raw_key.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
614
1.17M
    } else {
615
1.17M
      raw_key.SetKey(key, false /* copy */);
616
1.17M
    }
617
1.17M
  }
rocksdb::BlockIter<rocksdb::Slice>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::IterKey&, rocksdb::Slice const&)
Line
Count
Source
611
1.12M
  void UpdateRawKeyAndMaybePadMinTimestamp(IterKey& raw_key, const Slice& key) {
612
1.12M
    if (pad_min_timestamp_) {
613
0
      raw_key.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
614
1.12M
    } else {
615
1.12M
      raw_key.SetKey(key, false /* copy */);
616
1.12M
    }
617
1.12M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::IterKey&, rocksdb::Slice const&)
Line
Count
Source
611
48.1k
  void UpdateRawKeyAndMaybePadMinTimestamp(IterKey& raw_key, const Slice& key) {
612
48.1k
    if (pad_min_timestamp_) {
613
0
      raw_key.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
614
48.1k
    } else {
615
48.1k
      raw_key.SetKey(key, false /* copy */);
616
48.1k
    }
617
48.1k
  }
618
619
1.17M
  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
620
1.17M
    UpdateRawKeyAndMaybePadMinTimestamp(raw_key_, key);
621
1.17M
  }
rocksdb::BlockIter<rocksdb::Slice>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::Slice const&)
Line
Count
Source
619
1.12M
  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
620
1.12M
    UpdateRawKeyAndMaybePadMinTimestamp(raw_key_, key);
621
1.12M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::UpdateRawKeyAndMaybePadMinTimestamp(rocksdb::Slice const&)
Line
Count
Source
619
48.1k
  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
620
48.1k
    UpdateRawKeyAndMaybePadMinTimestamp(raw_key_, key);
621
48.1k
  }
622
623
  // Must be called every time a key is found that needs to be returned to user,
624
  // and may be called when no key is found (as a no-op). Updates `key_`,
625
  // `key_buf_`, and `key_pinned_` with info about the found key.
626
  // Per key-value checksum verification is done if available for the key to be
627
  // returned. Iterator is invalidated with corruption status if checksum
628
  // verification fails.
629
3.75M
  void UpdateKey() {
630
3.75M
    key_buf_.Clear();
631
3.75M
    if (!Valid()) {
632
220k
      return;
633
220k
    }
634
3.53M
    if (raw_key_.IsUserKey()) {
635
3.34M
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
636
3.34M
      key_ = raw_key_.GetUserKey();
637
3.34M
      key_pinned_ = raw_key_.IsKeyPinned();
638
3.34M
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
639
196k
      key_ = raw_key_.GetInternalKey();
640
196k
      key_pinned_ = raw_key_.IsKeyPinned();
641
18.4E
    } else {
642
18.4E
      key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
643
18.4E
                              ExtractValueType(raw_key_.GetInternalKey()));
644
18.4E
      key_ = key_buf_.GetInternalKey();
645
18.4E
      key_pinned_ = false;
646
18.4E
    }
647
3.53M
    TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
648
3.53M
                             (void*)value_.data());
649
3.53M
    TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
650
3.53M
                             &protection_bytes_per_key_);
651
3.53M
    if (protection_bytes_per_key_ > 0) {
652
0
      if (!ProtectionInfo64()
653
0
               .ProtectKV(raw_key_.GetKey(), value_)
654
0
               .Verify(
655
0
                   protection_bytes_per_key_,
656
0
                   kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
657
0
        PerKVChecksumCorruptionError();
658
0
      }
659
0
    }
660
3.53M
  }
rocksdb::BlockIter<rocksdb::Slice>::UpdateKey()
Line
Count
Source
629
3.69M
  void UpdateKey() {
630
3.69M
    key_buf_.Clear();
631
3.69M
    if (!Valid()) {
632
197k
      return;
633
197k
    }
634
3.49M
    if (raw_key_.IsUserKey()) {
635
3.30M
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
636
3.30M
      key_ = raw_key_.GetUserKey();
637
3.30M
      key_pinned_ = raw_key_.IsKeyPinned();
638
3.30M
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
639
196k
      key_ = raw_key_.GetInternalKey();
640
196k
      key_pinned_ = raw_key_.IsKeyPinned();
641
18.4E
    } else {
642
18.4E
      key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
643
18.4E
                              ExtractValueType(raw_key_.GetInternalKey()));
644
18.4E
      key_ = key_buf_.GetInternalKey();
645
18.4E
      key_pinned_ = false;
646
18.4E
    }
647
3.49M
    TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
648
3.49M
                             (void*)value_.data());
649
3.49M
    TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
650
3.49M
                             &protection_bytes_per_key_);
651
3.49M
    if (protection_bytes_per_key_ > 0) {
652
0
      if (!ProtectionInfo64()
653
0
               .ProtectKV(raw_key_.GetKey(), value_)
654
0
               .Verify(
655
0
                   protection_bytes_per_key_,
656
0
                   kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
657
0
        PerKVChecksumCorruptionError();
658
0
      }
659
0
    }
660
3.49M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::UpdateKey()
Line
Count
Source
629
60.0k
  void UpdateKey() {
630
60.0k
    key_buf_.Clear();
631
60.0k
    if (!Valid()) {
632
23.1k
      return;
633
23.1k
    }
634
36.9k
    if (raw_key_.IsUserKey()) {
635
36.9k
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
636
36.9k
      key_ = raw_key_.GetUserKey();
637
36.9k
      key_pinned_ = raw_key_.IsKeyPinned();
638
18.4E
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
639
0
      key_ = raw_key_.GetInternalKey();
640
0
      key_pinned_ = raw_key_.IsKeyPinned();
641
18.4E
    } else {
642
18.4E
      key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
643
18.4E
                              ExtractValueType(raw_key_.GetInternalKey()));
644
18.4E
      key_ = key_buf_.GetInternalKey();
645
18.4E
      key_pinned_ = false;
646
18.4E
    }
647
36.9k
    TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
648
36.9k
                             (void*)value_.data());
649
36.9k
    TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
650
36.9k
                             &protection_bytes_per_key_);
651
36.9k
    if (protection_bytes_per_key_ > 0) {
652
0
      if (!ProtectionInfo64()
653
0
               .ProtectKV(raw_key_.GetKey(), value_)
654
0
               .Verify(
655
0
                   protection_bytes_per_key_,
656
0
                   kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
657
0
        PerKVChecksumCorruptionError();
658
0
      }
659
0
    }
660
36.9k
  }
661
662
  // Compares two keys using the appropriate comparator for the block contents.
663
  // Uses user comparator when the block stores user keys, otherwise uses the
664
  // internal key comparator. When global_seqno is not disabled, applies it to
665
  // the LHS key for comparison.
666
536k
  int CompareKey(const Slice& a, const Slice& b) const {
667
536k
    assert(icmp_.user_comparator() != nullptr);
668
536k
    if (raw_key_.IsUserKey()) {
669
520k
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
670
520k
      return icmp_.user_comparator()->Compare(a, b);
671
520k
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
672
14.8k
      return icmp_.Compare(a, b);
673
14.8k
    }
674
486
    return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber);
675
536k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::CompareKey(rocksdb::Slice const&, rocksdb::Slice const&) const
Line
Count
Source
666
11.0k
  int CompareKey(const Slice& a, const Slice& b) const {
667
11.0k
    assert(icmp_.user_comparator() != nullptr);
668
11.0k
    if (raw_key_.IsUserKey()) {
669
11.0k
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
670
11.0k
      return icmp_.user_comparator()->Compare(a, b);
671
11.0k
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
672
0
      return icmp_.Compare(a, b);
673
0
    }
674
0
    return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber);
675
11.0k
  }
rocksdb::BlockIter<rocksdb::Slice>::CompareKey(rocksdb::Slice const&, rocksdb::Slice const&) const
Line
Count
Source
666
525k
  int CompareKey(const Slice& a, const Slice& b) const {
667
525k
    assert(icmp_.user_comparator() != nullptr);
668
525k
    if (raw_key_.IsUserKey()) {
669
509k
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
670
509k
      return icmp_.user_comparator()->Compare(a, b);
671
509k
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
672
14.8k
      return icmp_.Compare(a, b);
673
14.8k
    }
674
486
    return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber);
675
525k
  }
676
677
533k
  int CompareKey(const IterKey& a, const Slice& b) const {
678
533k
    if (a.IsUserKey()) {
679
518k
      return CompareKey(a.GetUserKey(), b);
680
518k
    }
681
14.3k
    return CompareKey(a.GetInternalKey(), b);
682
533k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::CompareKey(rocksdb::IterKey const&, rocksdb::Slice const&) const
Line
Count
Source
677
11.0k
  int CompareKey(const IterKey& a, const Slice& b) const {
678
11.0k
    if (a.IsUserKey()) {
679
11.0k
      return CompareKey(a.GetUserKey(), b);
680
11.0k
    }
681
0
    return CompareKey(a.GetInternalKey(), b);
682
11.0k
  }
rocksdb::BlockIter<rocksdb::Slice>::CompareKey(rocksdb::IterKey const&, rocksdb::Slice const&) const
Line
Count
Source
677
522k
  int CompareKey(const IterKey& a, const Slice& b) const {
678
522k
    if (a.IsUserKey()) {
679
507k
      return CompareKey(a.GetUserKey(), b);
680
507k
    }
681
14.3k
    return CompareKey(a.GetInternalKey(), b);
682
522k
  }
683
684
  // Compares the current key (with global seqno applied) against `other`.
685
533k
  int CompareCurrentKey(const Slice& other) const {
686
533k
    return CompareKey(raw_key_, other);
687
533k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::CompareCurrentKey(rocksdb::Slice const&) const
Line
Count
Source
685
11.0k
  int CompareCurrentKey(const Slice& other) const {
686
11.0k
    return CompareKey(raw_key_, other);
687
11.0k
  }
rocksdb::BlockIter<rocksdb::Slice>::CompareCurrentKey(rocksdb::Slice const&) const
Line
Count
Source
685
522k
  int CompareCurrentKey(const Slice& other) const {
686
522k
    return CompareKey(raw_key_, other);
687
522k
  }
688
689
 private:
690
  // Store the cache handle, if the block is cached. We need this since the
691
  // only other place the handle is stored is as an argument to the Cleanable
692
  // function callback, which is hard to retrieve. When multiple value
693
  // PinnableSlices reference the block, they need the cache handle in order
694
  // to bump up the ref count
695
  Cache::Handle* cache_handle_;
696
697
 public:
698
  // Return the offset in data_ just past the end of the current entry.
699
4.06M
  inline uint32_t NextEntryOffset() const {
700
    // NOTE: We don't support blocks bigger than 2GB
701
4.06M
    return static_cast<uint32_t>((entry_.data() + entry_.size()) - data_);
702
4.06M
  }
rocksdb::BlockIter<rocksdb::Slice>::NextEntryOffset() const
Line
Count
Source
699
4.00M
  inline uint32_t NextEntryOffset() const {
700
    // NOTE: We don't support blocks bigger than 2GB
701
4.00M
    return static_cast<uint32_t>((entry_.data() + entry_.size()) - data_);
702
4.00M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::NextEntryOffset() const
Line
Count
Source
699
54.9k
  inline uint32_t NextEntryOffset() const {
700
    // NOTE: We don't support blocks bigger than 2GB
701
54.9k
    return static_cast<uint32_t>((entry_.data() + entry_.size()) - data_);
702
54.9k
  }
703
704
  // Return the offset where the keys section ends.
705
  // For separated KV storage, this is the start of the values section.
706
  // Otherwise, it's the start of the restart array.
707
11.4M
  inline uint32_t GetKeysEndOffset() const {
708
11.4M
    return values_section_ ? static_cast<uint32_t>(values_section_ - data_)
709
11.4M
                           : restarts_;
710
11.4M
  }
rocksdb::BlockIter<rocksdb::Slice>::GetKeysEndOffset() const
Line
Count
Source
707
11.2M
  inline uint32_t GetKeysEndOffset() const {
708
11.2M
    return values_section_ ? static_cast<uint32_t>(values_section_ - data_)
709
11.2M
                           : restarts_;
710
11.2M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::GetKeysEndOffset() const
Line
Count
Source
707
253k
  inline uint32_t GetKeysEndOffset() const {
708
253k
    return values_section_ ? static_cast<uint32_t>(values_section_ - data_)
709
253k
                           : restarts_;
710
253k
  }
711
712
1.49M
  uint32_t GetRestartPoint(uint32_t index) const {
713
1.49M
    assert(index < num_restarts_);
714
1.49M
    uint32_t offset =
715
1.49M
        DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
716
1.49M
    assert(!values_section_ || offset <= values_section_ - data_);
717
1.49M
    return offset;
718
1.49M
  }
rocksdb::BlockIter<rocksdb::Slice>::GetRestartPoint(unsigned int) const
Line
Count
Source
712
1.43M
  uint32_t GetRestartPoint(uint32_t index) const {
713
1.43M
    assert(index < num_restarts_);
714
1.43M
    uint32_t offset =
715
1.43M
        DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
716
    assert(!values_section_ || offset <= values_section_ - data_);
717
1.43M
    return offset;
718
1.43M
  }
rocksdb::BlockIter<rocksdb::IndexValue>::GetRestartPoint(unsigned int) const
Line
Count
Source
712
59.0k
  uint32_t GetRestartPoint(uint32_t index) const {
713
59.0k
    assert(index < num_restarts_);
714
59.0k
    uint32_t offset =
715
59.0k
        DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
716
    assert(!values_section_ || offset <= values_section_ - data_);
717
59.0k
    return offset;
718
59.0k
  }
719
720
504k
  void SeekToRestartPoint(uint32_t index) {
721
504k
    raw_key_.Clear();
722
504k
    restart_index_ = index;
723
    // Set to one before the first entry so ParseNextKey() increments to correct
724
    // position
725
504k
    cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
726
727
    // ParseNextKey() starts at the end of entry_, so set value_ accordingly
728
504k
    uint32_t offset = GetRestartPoint(index);
729
504k
    entry_ = Slice(data_ + offset, 0);
730
504k
  }
rocksdb::BlockIter<rocksdb::Slice>::SeekToRestartPoint(unsigned int)
Line
Count
Source
720
469k
  void SeekToRestartPoint(uint32_t index) {
721
469k
    raw_key_.Clear();
722
469k
    restart_index_ = index;
723
    // Set to one before the first entry so ParseNextKey() increments to correct
724
    // position
725
469k
    cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
726
727
    // ParseNextKey() starts at the end of entry_, so set value_ accordingly
728
469k
    uint32_t offset = GetRestartPoint(index);
729
469k
    entry_ = Slice(data_ + offset, 0);
730
469k
  }
rocksdb::BlockIter<rocksdb::IndexValue>::SeekToRestartPoint(unsigned int)
Line
Count
Source
720
34.7k
  void SeekToRestartPoint(uint32_t index) {
721
34.7k
    raw_key_.Clear();
722
34.7k
    restart_index_ = index;
723
    // Set to one before the first entry so ParseNextKey() increments to correct
724
    // position
725
34.7k
    cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
726
727
    // ParseNextKey() starts at the end of entry_, so set value_ accordingly
728
34.7k
    uint32_t offset = GetRestartPoint(index);
729
34.7k
    entry_ = Slice(data_ + offset, 0);
730
34.7k
  }
731
732
 protected:
733
  template <typename DecodeKeyFunc>
734
  inline bool GetRestartKey(uint32_t index, Slice* key);
735
736
  template <typename DecodeKeyFunc>
737
  inline bool BinarySeekRestartPointIndex(const Slice& target, uint32_t* index,
738
                                          bool* is_index_key_result);
739
740
  template <typename DecodeKeyFunc>
741
  inline bool InterpolationSeekRestartPointIndex(const Slice& target,
742
                                                 uint32_t* index,
743
                                                 bool* is_index_key_result);
744
745
  // Find the first key in restart interval `index` that is >= `target`.
746
  // If there is no such key, iterator is positioned at the first key in
747
  // restart interval `index + 1`.
748
  // If is_index_key_result is true, it positions the iterator at the first key
749
  // in this restart interval.
750
  // Per key-value checksum verification is done for all keys scanned
751
  // up to but not including the last key (the key that current_ points to
752
  // when this function returns). This key's checksum is verified in
753
  // UpdateKey().
754
  void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
755
                              bool is_index_key_result);
756
};
757
758
class DataBlockIter final : public BlockIter<Slice> {
759
 public:
760
  DataBlockIter()
761
57.3k
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
762
  void Initialize(const Comparator* raw_ucmp, const char* data,
763
                  uint32_t restarts, uint32_t num_restarts,
764
                  SequenceNumber global_seqno,
765
                  BlockReadAmpBitmap* read_amp_bitmap,
766
                  bool block_contents_pinned,
767
                  bool user_defined_timestamps_persisted,
768
                  DataBlockHashIndex* data_block_hash_index,
769
                  uint8_t protection_bytes_per_key, const char* kv_checksum,
770
38.6k
                  uint32_t block_restart_interval, const char* values_section) {
771
38.6k
    InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
772
38.6k
                   block_contents_pinned, user_defined_timestamps_persisted,
773
38.6k
                   protection_bytes_per_key, kv_checksum,
774
38.6k
                   block_restart_interval, values_section);
775
38.6k
    raw_key_.SetIsUserKey(false);
776
38.6k
    read_amp_bitmap_ = read_amp_bitmap;
777
38.6k
    last_bitmap_offset_ = current_ + 1;
778
38.6k
    data_block_hash_index_ = data_block_hash_index;
779
38.6k
  }
780
781
178k
  Slice value() const override {
782
178k
    assert(Valid());
783
178k
    if (read_amp_bitmap_ && current_ < restarts_ &&
784
0
        current_ != last_bitmap_offset_) {
785
0
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
786
0
                             NextEntryOffset() - 1);
787
0
      last_bitmap_offset_ = current_;
788
0
    }
789
178k
    return value_;
790
178k
  }
791
792
  // Returns if `target` may exist.
793
1.43k
  inline bool SeekForGet(const Slice& target) {
794
#ifndef NDEBUG
795
    if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true;
796
#endif
797
1.43k
    if (!data_block_hash_index_) {
798
1.43k
      SeekImpl(target);
799
1.43k
      UpdateKey();
800
1.43k
      return true;
801
1.43k
    }
802
0
    bool res = SeekForGetImpl(target);
803
0
    UpdateKey();
804
0
    return res;
805
1.43k
  }
806
807
25.2k
  void Invalidate(const Status& s) override {
808
25.2k
    BlockIter::Invalidate(s);
809
    // Clear prev entries cache.
810
25.2k
    prev_entries_keys_buff_.clear();
811
25.2k
    prev_entries_.clear();
812
25.2k
    prev_entries_idx_ = -1;
813
25.2k
  }
814
815
 protected:
816
  friend Block;
817
  inline bool ParseNextDataKey(bool* is_shared);
818
  void SeekToFirstImpl() override;
819
  void SeekToLastImpl() override;
820
  void SeekImpl(const Slice& target) override;
821
  void SeekForPrevImpl(const Slice& target) override;
822
  void NextImpl() override;
823
  void PrevImpl() override;
824
825
 private:
826
  // read-amp bitmap
827
  BlockReadAmpBitmap* read_amp_bitmap_;
828
  // last `current_` value we report to read-amp bitmp
829
  mutable uint32_t last_bitmap_offset_;
830
  struct CachedPrevEntry {
831
    explicit CachedPrevEntry(uint32_t _offset, uint32_t _entry_size,
832
                             const char* _key_ptr, size_t _key_offset,
833
                             size_t _key_size, Slice _value)
834
637
        : offset(_offset),
835
637
          entry_size(_entry_size),
836
637
          key_ptr(_key_ptr),
837
637
          key_offset(_key_offset),
838
637
          key_size(_key_size),
839
637
          value(_value) {}
840
841
    // offset of entry in block
842
    uint32_t offset;
843
    // size of entry (for NextEntryOffset calculation)
844
    uint32_t entry_size;
845
    // Pointer to key data in block (nullptr if key is delta-encoded)
846
    const char* key_ptr;
847
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
848
    size_t key_offset;
849
    // size of key
850
    size_t key_size;
851
    // value slice pointing to data in block
852
    Slice value;
853
  };
854
  std::string prev_entries_keys_buff_;
855
  std::vector<CachedPrevEntry> prev_entries_;
856
  int32_t prev_entries_idx_ = -1;
857
858
  DataBlockHashIndex* data_block_hash_index_;
859
860
  bool SeekForGetImpl(const Slice& target);
861
};
862
863
// Iterator over MetaBlocks.  MetaBlocks are similar to Data Blocks and
864
// are used to store Properties associated with table.
865
// Meta blocks always store user keys (no sequence number) and always
866
// use the BytewiseComparator.  Additionally, MetaBlock accesses are
867
// not recorded in the Statistics or for Read-Amplification.
868
class MetaBlockIter final : public BlockIter<Slice> {
869
 public:
870
170k
  MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
871
  void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
872
                  bool block_contents_pinned, uint8_t protection_bytes_per_key,
873
                  const char* kv_checksum, uint32_t block_restart_interval,
874
169k
                  const char* values_section) {
875
    // Initializes the iterator with a BytewiseComparator and
876
    // the raw key being a user key.
877
169k
    InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
878
169k
                   kDisableGlobalSequenceNumber, block_contents_pinned,
879
169k
                   /* user_defined_timestamps_persisted */ true,
880
169k
                   protection_bytes_per_key, kv_checksum,
881
169k
                   block_restart_interval, values_section);
882
169k
    raw_key_.SetIsUserKey(true);
883
169k
  }
884
885
3.56M
  Slice value() const override {
886
3.56M
    assert(Valid());
887
3.56M
    return value_;
888
3.56M
  }
889
890
 protected:
891
  friend Block;
892
  void SeekToFirstImpl() override;
893
  void SeekToLastImpl() override;
894
  void SeekImpl(const Slice& target) override;
895
  void SeekForPrevImpl(const Slice& target) override;
896
  void NextImpl() override;
897
  void PrevImpl() override;
898
  // Meta index block's restart interval is always 1. See
899
  // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval.
900
0
  uint32_t GetRestartInterval() override { return 1; }
901
0
  uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; }
902
};
903
904
// Iterator over index blocks. Values are IndexValue (block handle plus,
// optionally, the first key of the pointed-to block), possibly delta
// encoded relative to the previous entry within a restart interval.
class IndexBlockIter final : public BlockIter<IndexValue> {
 public:
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}

  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  void Initialize(
      const Comparator* raw_ucmp, const char* data, uint32_t restarts,
      uint32_t num_restarts, SequenceNumber global_seqno,
      BlockPrefixIndex* prefix_index, bool have_first_key,
      bool key_includes_seq, bool value_is_full, bool block_contents_pinned,
      bool user_defined_timestamps_persisted, uint8_t protection_bytes_per_key,
      const char* kv_checksum, uint32_t block_restart_interval,
      const char* values_section,
      BlockBasedTableOptions::BlockSearchType index_block_search_type) {
    InitializeBase(raw_ucmp, data, restarts, num_restarts,
                   kDisableGlobalSequenceNumber, block_contents_pinned,
                   user_defined_timestamps_persisted, protection_bytes_per_key,
                   kv_checksum, block_restart_interval, values_section);
    raw_key_.SetIsUserKey(!key_includes_seq);
    prefix_index_ = prefix_index;
    value_delta_encoded_ = !value_is_full;
    have_first_key_ = have_first_key;
    index_search_type_ = index_block_search_type;
    // Sequence-number overwriting is only relevant when entries carry a
    // first key; otherwise the (rarely needed) state stays unallocated.
    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
    } else {
      global_seqno_state_.reset();
    }
  }

  // Returns the user-key portion of the current entry's key.
  // Requires Valid().
  Slice user_key() const override {
    assert(Valid());
    return raw_key_.GetUserKey();
  }

  // Returns the decoded IndexValue for the current entry. If the value was
  // delta encoded, or was rewritten (global seqno overwrite or timestamp
  // padding), the pre-decoded copy is returned; otherwise the raw value_
  // slice is decoded on the fly. Requires Valid().
  IndexValue value() const override {
    assert(Valid());
    if (value_delta_encoded_ || global_seqno_state_ != nullptr ||
        pad_min_timestamp_) {
      return decoded_value_;
    } else {
      IndexValue entry;
      Slice v = value_;
      // Decode cannot fail here: the block was validated on construction,
      // so only assert in debug builds.
      Status decode_s __attribute__((__unused__)) =
          entry.DecodeFrom(&v, have_first_key_, nullptr);
      assert(decode_s.ok());
      return entry;
    }
  }

  // Returns the raw (possibly delta-encoded) value bytes of the current
  // entry. Requires Valid().
  Slice raw_value() const {
    assert(Valid());
    return value_;
  }

  // Values rewritten for seqno overwrite live in this iterator's own
  // buffer, so they are never pinned; otherwise defer to the base class.
  bool IsValuePinned() const override {
    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
  }

 protected:
  friend Block;
  // IndexBlockIter follows a different contract for prefix iterator
  // from data iterators.
  // If prefix of the seek key `target` exists in the file, it must
  // return the same result as total order seek.
  // If the prefix of `target` doesn't exist in the file, it can either
  // return the result of total order seek, or set both of Valid() = false
  // and status() = NotFound().
  void SeekImpl(const Slice& target) override;

  // SeekForPrev() is not supported on index blocks; invalidate the
  // iterator and surface an InvalidArgument status instead of asserting
  // only in debug builds.
  void SeekForPrevImpl(const Slice&) override {
    assert(false);
    current_ = GetKeysEndOffset();
    restart_index_ = num_restarts_;
    status_ = Status::InvalidArgument(
        "RocksDB internal error: should never call SeekForPrev() on index "
        "blocks");
    raw_key_.Clear();
    value_.clear();
  }

  void PrevImpl() override;
  void NextImpl() override;
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;

 private:
  bool value_delta_encoded_;
  bool have_first_key_;  // value includes first_internal_key
  BlockPrefixIndex* prefix_index_;
  // Whether the value is delta encoded. In that case the value is assumed to be
  // BlockHandle. The first value in each restart interval is the full encoded
  // BlockHandle; the restart of encoded size part of the BlockHandle. The
  // offset of delta encoded BlockHandles is computed by adding the size of
  // previous delta encoded values in the same restart interval to the offset of
  // the first value in that restart interval.
  IndexValue decoded_value_;

  // When sequence number overwriting is enabled, this struct contains the seqno
  // to overwrite with, and current first_internal_key with overwritten seqno.
  // This is rarely used, so we put it behind a pointer and only allocate when
  // needed.
  struct GlobalSeqnoState {
    // First internal key according to current index entry, but with sequence
    // number overwritten to global_seqno.
    IterKey first_internal_key;
    SequenceNumber global_seqno;

    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
  };

  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;

  // Buffers the `first_internal_key` referred by `decoded_value_` when
  // `pad_min_timestamp_` is true.
  std::string first_internal_key_with_ts_;

  // The search algorithm to use when reading the index block.
  BlockBasedTableOptions::BlockSearchType index_search_type_ =
      BlockBasedTableOptions::kBinary;

  // Set *prefix_may_exist to false if no key possibly share the same prefix
  // as `target`. If not set, the result position should be the same as total
  // order Seek.
  bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as total order seek.
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                            uint32_t left, uint32_t right, uint32_t* index,
                            bool* prefix_may_exist);
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);

  template <typename DecodeKeyFunc>
  bool FindRestartPointForSeek(const Slice& seek_key, uint32_t* index,
                               bool* skip_linear_scan);

  inline bool ParseNextIndexKey();

  // When value_delta_encoded_ is enabled it decodes the value which is assumed
  // to be BlockHandle and put it to decoded_value_
  inline void DecodeCurrentValue(bool is_shared);
};
1050
1051
}  // namespace ROCKSDB_NAMESPACE