Coverage Report

Created: 2024-09-08 07:17

/src/rocksdb/table/block_based/block.h
Source:

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <stddef.h>
#include <stdint.h>

#include <string>
#include <vector>

#include "db/kv_checksum.h"
#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_hash_index.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "test_util/sync_point.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

struct BlockContents;
class Comparator;
template <class TValue>
class BlockIter;
class DataBlockIter;
class IndexBlockIter;
class MetaBlockIter;
class BlockPrefixIndex;

// BlockReadAmpBitmap is a bitmap that maps the ROCKSDB_NAMESPACE::Block data
// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of
// bytes in the Block we update the bitmap and increment
// READ_AMP_ESTIMATE_USEFUL_BYTES.
class BlockReadAmpBitmap {
 public:
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
                              Statistics* statistics)
      : bitmap_(nullptr),
        bytes_per_bit_pow_(0),
        statistics_(statistics),
        rnd_(Random::GetTLSInstance()->Uniform(
            static_cast<int>(bytes_per_bit))) {
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
    assert(block_size > 0 && bytes_per_bit > 0);

    // convert bytes_per_bit to be a power of 2
    while (bytes_per_bit >>= 1) {
      bytes_per_bit_pow_++;
    }

    // num_bits_needed = ceil(block_size / bytes_per_bit)
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
    assert(num_bits_needed > 0);

    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;

    // Create bitmap and set all the bits to 0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();

    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
  }

  ~BlockReadAmpBitmap() { delete[] bitmap_; }

  void Mark(uint32_t start_offset, uint32_t end_offset) {
    assert(end_offset >= start_offset);
    // Index of first bit in mask
    uint32_t start_bit =
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
        bytes_per_bit_pow_;
    // Index of last bit in mask + 1
    uint32_t exclusive_end_bit =
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
    if (start_bit >= exclusive_end_bit) {
      return;
    }
    assert(exclusive_end_bit > 0);

    if (GetAndSet(start_bit) == 0) {
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
                                  << bytes_per_bit_pow_;
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
                 new_useful_bytes);
    }
  }

  Statistics* GetStatistics() {
    return statistics_.load(std::memory_order_relaxed);
  }

  void SetStatistics(Statistics* stats) { statistics_.store(stats); }

  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }

  size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    return malloc_usable_size((void*)this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return sizeof(*this);
  }

 private:
  // Get the current value of bit at `bit_idx` and set it to 1
  inline bool GetAndSet(uint32_t bit_idx) {
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);

    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
           bit_mask;
  }

  const uint32_t kBytesPersEntry = sizeof(uint32_t);   // 4 bytes
  const uint32_t kBitsPerEntry = kBytesPersEntry * 8;  // 32 bits

  // Bitmap used to record the bytes that we read, use atomic to protect
  // against multiple threads updating the same bit
  std::atomic<uint32_t>* bitmap_;
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize
  // multiplication and division
  uint8_t bytes_per_bit_pow_;
  // Pointer to DB Statistics object. Since this bitmap may outlive the DB,
  // this pointer may be invalid, but the DB will update it to a valid pointer
  // by using SetStatistics() before calling Mark()
  std::atomic<Statistics*> statistics_;
  uint32_t rnd_;
};
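
The constructor above replaces divisions with shifts: bytes_per_bit is reduced to a power-of-two exponent, and both ceiling divisions use `((x - 1) >> pow) + 1`. A minimal sketch of that arithmetic (not part of block.h), using assumed example values block_size = 4096 and bytes_per_bit = 32:

// Illustration only: the sizing math of BlockReadAmpBitmap's constructor.
inline void ReadAmpBitmapSizingExample() {
  size_t block_size = 4096;
  size_t bytes_per_bit = 32;

  // Reduce bytes_per_bit to a power-of-two exponent: 32 -> 5.
  uint8_t bytes_per_bit_pow = 0;
  while (bytes_per_bit >>= 1) {
    ++bytes_per_bit_pow;
  }
  assert(bytes_per_bit_pow == 5);

  // ceil(block_size / 2^pow) without a division: one bit covers 32 bytes.
  size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow) + 1;
  assert(num_bits_needed == 128);

  // ceil(num_bits_needed / 32): four uint32_t words back the bitmap.
  size_t bitmap_size = (num_bits_needed - 1) / 32 + 1;
  assert(bitmap_size == 4);
}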

// class Block is the uncompressed and "parsed" form for blocks containing
// key-value pairs. (See BlockContents comments for more on terminology.)
// This includes the in-memory representation of data blocks, index blocks
// (including partitions), range deletion blocks, properties blocks, metaindex
// blocks, as well as the top level of the partitioned filter structure (which
// is actually an index of the filter partitions). It is NOT suitable for
// compressed blocks in general, filter blocks/partitions, or compression
// dictionaries.
//
// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
// for details of the format and the various block types.
//
// TODO: Rename to ParsedKvBlock?
class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0,
                 Statistics* statistics = nullptr);
  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;

  ~Block();

  size_t size() const { return size_; }
  const char* data() const { return data_; }
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
  bool own_bytes() const { return contents_.own_bytes(); }

  BlockBasedTableOptions::DataBlockIndexType IndexType() const;

  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
  // comparator.
  //
  // If iter is null, return new Iterator
  // If iter is not null, update this one and return it as Iterator*
  //
  // Updates read_amp_bitmap_ if it is not nullptr.
  //
  // If `block_contents_pinned` is true, the caller will guarantee that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds cache handle or ownership of some resource
  // and release them in a release function, or caller is sure that the data
  // will not go away (for example, it's from mmapped file which will not be
  // closed).
  //
  // `user_defined_timestamps_persisted` controls whether a min timestamp is
  // padded while key is being parsed from the block.
  //
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
  // the iterator will simply be set as "invalid", rather than returning
  // the key that is just past the target key.
  DataBlockIter* NewDataIterator(const Comparator* raw_ucmp,
                                 SequenceNumber global_seqno,
                                 DataBlockIter* iter = nullptr,
                                 Statistics* stats = nullptr,
                                 bool block_contents_pinned = false,
                                 bool user_defined_timestamps_persisted = true);

  // Returns a MetaBlockIter for iterating over blocks containing metadata
  // (like Properties blocks).  Unlike data blocks, the keys for these blocks
  // do not contain sequence numbers, do not use a user-defined comparator, and
  // do not track read amplification/statistics.  Additionally, MetaBlocks will
  // not assert if the block is formatted improperly.
  //
  // If `block_contents_pinned` is true, the caller will guarantee that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds cache handle or ownership of some resource
  // and release them in a release function, or caller is sure that the data
  // will not go away (for example, it's from mmapped file which will not be
  // closed).
  MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false);

  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
  // comparator.
  //
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  //
  // If `prefix_index` is not nullptr this block will do hash lookup for the key
  // prefix. If total_order_seek is true, prefix_index_ is ignored.
  //
  // `have_first_key` controls whether IndexValue will contain
  // first_internal_key. It affects the data serialization format, so the same
  // value of have_first_key must be used when writing and reading the index.
  // It is determined by the IndexType property of the table.
  // `user_defined_timestamps_persisted` controls whether a min timestamp is
  // padded while key is being parsed from the block.
  IndexBlockIter* NewIndexIterator(
      const Comparator* raw_ucmp, SequenceNumber global_seqno,
      IndexBlockIter* iter, Statistics* stats, bool total_order_seek,
      bool have_first_key, bool key_includes_seq, bool value_is_full,
      bool block_contents_pinned = false,
      bool user_defined_timestamps_persisted = true,
      BlockPrefixIndex* prefix_index = nullptr);

  // Report an approximation of how much memory has been used.
  size_t ApproximateMemoryUsage() const;

  // For TypedCacheInterface
  const Slice& ContentSlice() const { return contents_.data; }

  // Initializes per key-value checksum protection.
  // After this method is called, each DataBlockIterator returned
  // by NewDataIterator will verify the per key-value checksum for any key it
  // reads.
  void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
                                         const Comparator* raw_ucmp);

  // Initializes per key-value checksum protection.
  // After this method is called, each IndexBlockIterator returned
  // by NewIndexIterator will verify the per key-value checksum for any key it
  // reads.
  // value_is_full and index_has_first_key are needed to be able to parse
  // the index block content and construct checksums.
  void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
                                          const Comparator* raw_ucmp,
                                          bool value_is_full,
                                          bool index_has_first_key);

  // Initializes per key-value checksum protection.
  // After this method is called, each MetaBlockIter returned
  // by NewMetaIterator will verify the per key-value checksum for any key it
  // reads.
  void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key);

  static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len,
                                 const Slice& key, const Slice& value) {
    ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr);
  }

  const char* TEST_GetKVChecksum() const { return kv_checksum_; }

 private:
  BlockContents contents_;
  const char* data_;         // contents_.data.data()
  size_t size_;              // contents_.data.size()
  uint32_t restart_offset_;  // Offset in data_ of restart array
  uint32_t num_restarts_;
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
  char* kv_checksum_{nullptr};
  uint32_t checksum_size_{0};
  // Used by block iterators to calculate current key index within a block
  uint32_t block_restart_interval_{0};
  uint8_t protection_bytes_per_key_{0};
  DataBlockHashIndex data_block_hash_index_;
};
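
As a usage sketch (not part of block.h): scanning an already-parsed data block with the iterator API declared above. `block` and `ucmp` are assumed to come from an open BlockBasedTable, and, per the NewDataIterator comment, passing a null `iter` is assumed to hand ownership of a freshly allocated iterator to the caller.

// Illustration only.
inline Status ScanDataBlock(Block* block, const Comparator* ucmp) {
  std::unique_ptr<DataBlockIter> iter{
      block->NewDataIterator(ucmp, kDisableGlobalSequenceNumber)};
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    Slice key = iter->key();  // internal key: user key + seqno + type
    Slice value = iter->value();
    (void)key;
    (void)value;
  }
  return iter->status();
}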

// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The
// format of this data buffer is an uncompressed, sorted sequence of key-value
// pairs (see `Block` API for more details).
//
// Notably, the keys may either be in internal key format or user key format.
// Subclasses are responsible for configuring the key format.
//
// `BlockIter` intends to provide final overrides for all of
// `InternalIteratorBase` functions that can move the iterator. It does
// this to guarantee `UpdateKey()` is called exactly once after each key
// movement potentially visible to users. In this step, the key is prepared
// (e.g., serialized if global seqno is in effect) so it can be returned
// immediately when the user asks for it via calling `key() const`.
//
// For its subclasses, it provides protected variants of the above-mentioned
// final-overridden methods. They are named with the "Impl" suffix, e.g.,
// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
// "Impl" functions are responsible for positioning `raw_key_` but not
// invoking `UpdateKey()`.
//
// Per key-value checksum is enabled if relevant states are passed in during
// `InitializeBase()`. The checksum verification is done in each call to
// UpdateKey() for the current key. Each subclass is responsible for keeping
// track of cur_entry_idx_, the index of the current key within the block.
// BlockIter uses this index to get the corresponding checksum for the current
// key. Additional checksum verification may be done in subclasses if they read
// keys other than the key being processed in UpdateKey().
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
 public:
  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
  // nothing. Calls cleanup functions.
  virtual void Invalidate(const Status& s) {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());

    data_ = nullptr;
    current_ = restarts_;
    status_ = s;

    // Call cleanup callbacks.
    Cleanable::Reset();
  }

  bool Valid() const override {
    // When status_ is not ok, iter should be invalid.
    assert(status_.ok() || current_ >= restarts_);
    return current_ < restarts_;
  }

  void SeekToFirst() override final {
#ifndef NDEBUG
    if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
#endif
    SeekToFirstImpl();
    UpdateKey();
  }

  void SeekToLast() override final {
    SeekToLastImpl();
    UpdateKey();
  }

  void Seek(const Slice& target) override final {
    SeekImpl(target);
    UpdateKey();
  }

  void SeekForPrev(const Slice& target) override final {
    SeekForPrevImpl(target);
    UpdateKey();
  }

  void Next() override final {
    NextImpl();
    UpdateKey();
  }

  bool NextAndGetResult(IterateResult* result) override final {
    // This does not need to call `UpdateKey()` as the parent class only has
    // access to the `UpdateKey()`-invoking functions.
    return InternalIteratorBase<TValue>::NextAndGetResult(result);
  }

  void Prev() override final {
    PrevImpl();
    UpdateKey();
  }

  Status status() const override { return status_; }

  Slice key() const override {
    assert(Valid());
    return key_;
  }

#ifndef NDEBUG
  ~BlockIter() override {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
    status_.PermitUncheckedError();
  }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }

  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;

  bool TEST_Corrupt_Callback(const std::string& sync_point) {
    bool corrupt = false;
    TEST_SYNC_POINT_CALLBACK(sync_point, static_cast<void*>(&corrupt));

    if (corrupt) {
      CorruptionError();
    }
    return corrupt;
  }
#endif

  bool IsKeyPinned() const override {
    return block_contents_pinned_ && key_pinned_;
  }

  bool IsValuePinned() const override { return block_contents_pinned_; }

  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }

  uint32_t ValueOffset() const {
    return static_cast<uint32_t>(value_.data() - data_);
  }

  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }

  Cache::Handle* cache_handle() { return cache_handle_; }

 protected:
  std::unique_ptr<InternalKeyComparator> icmp_;
  const char* data_;       // underlying block contents
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array

  // Index of restart block in which current_ or current_-1 falls
  uint32_t restart_index_;
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
  uint32_t current_;
  // Raw key from block.
  IterKey raw_key_;
  // Buffer for key data when global seqno assignment is enabled.
  IterKey key_buf_;
  Slice value_;
  Status status_;
  // Key to be exposed to users.
  Slice key_;
  SequenceNumber global_seqno_;
  // Size of the user-defined timestamp.
  size_t ts_sz_ = 0;
  // If user-defined timestamps are enabled but not persisted, a min timestamp
  // will be padded to the key during key parsing where it applies, such as
  // when parsing keys from a data block or index block, or parsing the first
  // internal key from an IndexValue entry. Min timestamp padding is different
  // for when `raw_key_` is a user key vs an internal key.
  //
  // This only applies to data blocks and index blocks, including the index
  // block for data blocks, the index block for partitioned filter blocks, and
  // the index block for partitioned index blocks. In summary, this only
  // applies to blocks whose keys are real user keys or internal keys created
  // from user keys.
  bool pad_min_timestamp_;

  // Per key-value checksum related states
  const char* kv_checksum_;
  int32_t cur_entry_idx_;
  uint32_t block_restart_interval_;
  uint8_t protection_bytes_per_key_;

  bool key_pinned_;
  // Whether the block data is guaranteed to outlive this iterator, and
  // as long as the cleanup functions are transferred to another class,
  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
  bool block_contents_pinned_;

  virtual void SeekToFirstImpl() = 0;
  virtual void SeekToLastImpl() = 0;
  virtual void SeekImpl(const Slice& target) = 0;
  virtual void SeekForPrevImpl(const Slice& target) = 0;
  virtual void NextImpl() = 0;
  virtual void PrevImpl() = 0;

  // Returns the restart interval of this block.
  // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized.
  virtual uint32_t GetRestartInterval() {
    if (num_restarts_ <= 1 || data_ == nullptr) {
      return 0;
    }
    SeekToFirstImpl();
    uint32_t end_index = GetRestartPoint(1);
    uint32_t count = 1;
    while (NextEntryOffset() < end_index && status_.ok()) {
      assert(Valid());
      NextImpl();
      ++count;
    }
    return count;
  }

  // Returns the number of keys in this block.
  virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) {
    if (num_restarts_ == 0 || data_ == nullptr) {
      return 0;
    }
    uint32_t count = (num_restarts_ - 1) * block_restart_interval;
    // Add number of keys from the last restart interval
    SeekToRestartPoint(num_restarts_ - 1);
    while (NextEntryOffset() < restarts_ && status_.ok()) {
      NextImpl();
      ++count;
    }
    return count;
  }

  // Stores whether the current key shares bytes with the prev key in
  // *is_shared.
  // Sets raw_key_, value_ to the current parsed key and value.
  // Sets restart_index_ to point to the restart interval that contains
  // the current key.
  template <typename DecodeEntryFunc>
  inline bool ParseNextKey(bool* is_shared);

  // protection_bytes_per_key, kv_checksum, and block_restart_interval
  // are needed only for per kv checksum verification.
  void InitializeBase(const Comparator* raw_ucmp, const char* data,
                      uint32_t restarts, uint32_t num_restarts,
                      SequenceNumber global_seqno, bool block_contents_pinned,
                      bool user_defined_timestamp_persisted,
                      uint8_t protection_bytes_per_key, const char* kv_checksum,
                      uint32_t block_restart_interval) {
    assert(data_ == nullptr);  // Ensure it is called only once
    assert(num_restarts > 0);  // Ensure the param is valid

    icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp);
    data_ = data;
    restarts_ = restarts;
    num_restarts_ = num_restarts;
    current_ = restarts_;
    restart_index_ = num_restarts_;
    global_seqno_ = global_seqno;
    if (raw_ucmp != nullptr) {
      ts_sz_ = raw_ucmp->timestamp_size();
    }
    pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted;
    block_contents_pinned_ = block_contents_pinned;
    cache_handle_ = nullptr;
    cur_entry_idx_ = -1;
    protection_bytes_per_key_ = protection_bytes_per_key;
    kv_checksum_ = kv_checksum;
    block_restart_interval_ = block_restart_interval;
    // Checksum related states are either all 0/nullptr or all non-zero.
    // One exception is when num_restarts == 0, block_restart_interval can be 0
    // since we are not able to compute it.
    assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
           (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
            (block_restart_interval > 0 || num_restarts == 1)));
  }

  void CorruptionError(const std::string& error_msg = "bad entry in block") {
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::Corruption(error_msg);
    raw_key_.Clear();
    value_.clear();
  }

  void PerKVChecksumCorruptionError() {
    std::string error_msg{
        "Corrupted block entry: per key-value checksum verification "
        "failed."};
    error_msg.append(" Offset: " + std::to_string(current_) + ".");
    error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + ".");
    CorruptionError(error_msg);
  }

  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
    if (pad_min_timestamp_) {
      std::string buf;
      if (raw_key_.IsUserKey()) {
        AppendKeyWithMinTimestamp(&buf, key, ts_sz_);
      } else {
        PadInternalKeyWithMinTimestamp(&buf, key, ts_sz_);
      }
      raw_key_.SetKey(buf, true /* copy */);
    } else {
      raw_key_.SetKey(key, false /* copy */);
    }
  }

  // Must be called every time a key is found that needs to be returned to the
  // user, and may be called when no key is found (as a no-op). Updates `key_`,
  // `key_buf_`, and `key_pinned_` with info about the found key.
  // Per key-value checksum verification is done if available for the key to be
  // returned. Iterator is invalidated with corruption status if checksum
  // verification fails.
  void UpdateKey() {
    key_buf_.Clear();
    if (!Valid()) {
      return;
    }
    if (raw_key_.IsUserKey()) {
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
      key_ = raw_key_.GetUserKey();
      key_pinned_ = raw_key_.IsKeyPinned();
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
      key_ = raw_key_.GetInternalKey();
      key_pinned_ = raw_key_.IsKeyPinned();
    } else {
      key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
                              ExtractValueType(raw_key_.GetInternalKey()));
      key_ = key_buf_.GetInternalKey();
      key_pinned_ = false;
    }
    TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
                             (void*)value_.data());
    TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
                             &protection_bytes_per_key_);
    if (protection_bytes_per_key_ > 0) {
      if (!ProtectionInfo64()
               .ProtectKV(raw_key_.GetKey(), value_)
               .Verify(
                   protection_bytes_per_key_,
                   kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
        PerKVChecksumCorruptionError();
      }
    }
  }

  // Returns the result of `Comparator::Compare()`, where the appropriate
  // comparator is used for the block contents, the LHS argument is the current
  // key with global seqno applied, and the RHS argument is `other`.
  int CompareCurrentKey(const Slice& other) {
    if (raw_key_.IsUserKey()) {
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
      return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other);
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
      return icmp_->Compare(raw_key_.GetInternalKey(), other);
    }
    return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other,
                          kDisableGlobalSequenceNumber);
  }

 private:
  // Store the cache handle, if the block is cached. We need this since the
  // only other place the handle is stored is as an argument to the Cleanable
  // function callback, which is hard to retrieve. When multiple value
  // PinnableSlices reference the block, they need the cache handle in order
  // to bump up the ref count
  Cache::Handle* cache_handle_;

 public:
  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    // NOTE: We don't support blocks bigger than 2GB
    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
  }

  uint32_t GetRestartPoint(uint32_t index) const {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }

  void SeekToRestartPoint(uint32_t index) {
    raw_key_.Clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

 protected:
  template <typename DecodeKeyFunc>
  inline bool BinarySeek(const Slice& target, uint32_t* index,
                         bool* is_index_key_result);

  // Find the first key in restart interval `index` that is >= `target`.
  // If there is no such key, iterator is positioned at the first key in
  // restart interval `index + 1`.
  // If is_index_key_result is true, it positions the iterator at the first key
  // in this restart interval.
  // Per key-value checksum verification is done for all keys scanned
  // up to but not including the last key (the key that current_ points to
  // when this function returns). This key's checksum is verified in
  // UpdateKey().
  void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
                              bool is_index_key_result);
};
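
To make the per key-value checksum flow concrete, the following sketch (not part of block.h) pairs Block::GenerateKVChecksum() with the same ProtectionInfo64 verification performed in UpdateKey(); the 8-byte checksum length is an assumed example value for protection_bytes_per_key.

// Illustration only: generate a checksum for one (key, value) pair and verify
// it the way BlockIter::UpdateKey() does.
inline bool KVChecksumRoundTrip(const Slice& key, const Slice& value) {
  constexpr uint8_t kChecksumLen = 8;  // plays the role of protection_bytes_per_key
  char buf[kChecksumLen];
  Block::GenerateKVChecksum(buf, kChecksumLen, key, value);
  return ProtectionInfo64().ProtectKV(key, value).Verify(kChecksumLen, buf);
}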

class DataBlockIter final : public BlockIter<Slice> {
 public:
  DataBlockIter()
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
  void Initialize(const Comparator* raw_ucmp, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
                  bool block_contents_pinned,
                  bool user_defined_timestamps_persisted,
                  DataBlockHashIndex* data_block_hash_index,
                  uint8_t protection_bytes_per_key, const char* kv_checksum,
                  uint32_t block_restart_interval) {
    InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned, user_defined_timestamps_persisted,
                   protection_bytes_per_key, kv_checksum,
                   block_restart_interval);
    raw_key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
    data_block_hash_index_ = data_block_hash_index;
  }

  Slice value() const override {
    assert(Valid());
    if (read_amp_bitmap_ && current_ < restarts_ &&
        current_ != last_bitmap_offset_) {
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
                             NextEntryOffset() - 1);
      last_bitmap_offset_ = current_;
    }
    return value_;
  }

  // Returns if `target` may exist.
  inline bool SeekForGet(const Slice& target) {
#ifndef NDEBUG
    if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true;
#endif
    if (!data_block_hash_index_) {
      SeekImpl(target);
      UpdateKey();
      return true;
    }
    bool res = SeekForGetImpl(target);
    UpdateKey();
    return res;
  }

  void Invalidate(const Status& s) override {
    BlockIter::Invalidate(s);
    // Clear prev entries cache.
    prev_entries_keys_buff_.clear();
    prev_entries_.clear();
    prev_entries_idx_ = -1;
  }

 protected:
  friend Block;
  inline bool ParseNextDataKey(bool* is_shared);
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;
  void SeekImpl(const Slice& target) override;
  void SeekForPrevImpl(const Slice& target) override;
  void NextImpl() override;
  void PrevImpl() override;

 private:
  // read-amp bitmap
  BlockReadAmpBitmap* read_amp_bitmap_;
  // last `current_` value we report to read-amp bitmap
  mutable uint32_t last_bitmap_offset_;
  struct CachedPrevEntry {
    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
                             size_t _key_offset, size_t _key_size, Slice _value)
        : offset(_offset),
          key_ptr(_key_ptr),
          key_offset(_key_offset),
          key_size(_key_size),
          value(_value) {}

    // offset of entry in block
    uint32_t offset;
    // Pointer to key data in block (nullptr if key is delta-encoded)
    const char* key_ptr;
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
    size_t key_offset;
    // size of key
    size_t key_size;
    // value slice pointing to data in block
    Slice value;
  };
  std::string prev_entries_keys_buff_;
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

  DataBlockHashIndex* data_block_hash_index_;

  bool SeekForGetImpl(const Slice& target);
};
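
Since DataBlockIter::value() is what feeds BlockReadAmpBitmap::Mark(), the two read-amp tickers can be turned into a rough read-amplification estimate. A hedged sketch, assuming read_amp_bytes_per_bit was enabled in BlockBasedTableOptions and `stats` is the DB's Statistics object:

// Illustration only: estimated read amplification = bytes loaded as data
// blocks vs. bytes the iterators actually touched.
inline double EstimateReadAmp(Statistics* stats) {
  const uint64_t useful = stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
  const uint64_t total = stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
  return useful == 0 ? 0.0
                     : static_cast<double>(total) / static_cast<double>(useful);
}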

// Iterator over MetaBlocks.  MetaBlocks are similar to Data Blocks and
// are used to store Properties associated with the table.
// Meta blocks always store user keys (no sequence number) and always
// use the BytewiseComparator.  Additionally, MetaBlock accesses are
// not recorded in the Statistics or for Read-Amplification.
class MetaBlockIter final : public BlockIter<Slice> {
 public:
  MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
  void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
                  bool block_contents_pinned, uint8_t protection_bytes_per_key,
                  const char* kv_checksum, uint32_t block_restart_interval) {
    // Initializes the iterator with a BytewiseComparator and
    // the raw key being a user key.
    InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
                   kDisableGlobalSequenceNumber, block_contents_pinned,
                   /* user_defined_timestamps_persisted */ true,
                   protection_bytes_per_key, kv_checksum,
                   block_restart_interval);
    raw_key_.SetIsUserKey(true);
  }

  Slice value() const override {
    assert(Valid());
    return value_;
  }

 protected:
  friend Block;
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;
  void SeekImpl(const Slice& target) override;
  void SeekForPrevImpl(const Slice& target) override;
  void NextImpl() override;
  void PrevImpl() override;
  // Meta index block's restart interval is always 1. See
  // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval.
  uint32_t GetRestartInterval() override { return 1; }
  uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; }
};
832
class IndexBlockIter final : public BlockIter<IndexValue> {
833
 public:
834
13.5k
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}
835
836
  // key_includes_seq, default true, means that the keys are in internal key
837
  // format.
838
  // value_is_full, default true, means that no delta encoding is
839
  // applied to values.
840
  void Initialize(const Comparator* raw_ucmp, const char* data,
841
                  uint32_t restarts, uint32_t num_restarts,
842
                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
843
                  bool have_first_key, bool key_includes_seq,
844
                  bool value_is_full, bool block_contents_pinned,
845
                  bool user_defined_timestamps_persisted,
846
                  uint8_t protection_bytes_per_key, const char* kv_checksum,
847
13.5k
                  uint32_t block_restart_interval) {
848
13.5k
    InitializeBase(raw_ucmp, data, restarts, num_restarts,
849
13.5k
                   kDisableGlobalSequenceNumber, block_contents_pinned,
850
13.5k
                   user_defined_timestamps_persisted, protection_bytes_per_key,
851
13.5k
                   kv_checksum, block_restart_interval);
852
13.5k
    raw_key_.SetIsUserKey(!key_includes_seq);
853
13.5k
    prefix_index_ = prefix_index;
854
13.5k
    value_delta_encoded_ = !value_is_full;
855
13.5k
    have_first_key_ = have_first_key;
856
13.5k
    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
857
0
      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
858
13.5k
    } else {
859
13.5k
      global_seqno_state_.reset();
860
13.5k
    }
861
13.5k
  }
862
863
0
  Slice user_key() const override {
864
0
    assert(Valid());
865
0
    return raw_key_.GetUserKey();
866
0
  }
867
868
20.6k
  IndexValue value() const override {
869
20.6k
    assert(Valid());
870
20.6k
    if (value_delta_encoded_ || global_seqno_state_ != nullptr ||
871
20.6k
        pad_min_timestamp_) {
872
20.6k
      return decoded_value_;
873
20.6k
    } else {
874
0
      IndexValue entry;
875
0
      Slice v = value_;
876
0
      Status decode_s __attribute__((__unused__)) =
877
0
          entry.DecodeFrom(&v, have_first_key_, nullptr);
878
0
      assert(decode_s.ok());
879
0
      return entry;
880
0
    }
881
20.6k
  }
882
883
0
  Slice raw_value() const {
884
0
    assert(Valid());
885
0
    return value_;
886
0
  }
887
888
0
  bool IsValuePinned() const override {
889
0
    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
890
0
  }
891
892
 protected:
893
  friend Block;
894
  // IndexBlockIter follows a different contract for prefix iterator
895
  // from data iterators.
896
  // If prefix of the seek key `target` exists in the file, it must
897
  // return the same result as total order seek.
898
  // If the prefix of `target` doesn't exist in the file, it can either
899
  // return the result of total order seek, or set both of Valid() = false
900
  // and status() = NotFound().
901
  void SeekImpl(const Slice& target) override;
902
903
0
  void SeekForPrevImpl(const Slice&) override {
904
0
    assert(false);
905
0
    current_ = restarts_;
906
0
    restart_index_ = num_restarts_;
907
0
    status_ = Status::InvalidArgument(
908
0
        "RocksDB internal error: should never call SeekForPrev() on index "
909
0
        "blocks");
910
0
    raw_key_.Clear();
911
0
    value_.clear();
912
0
  }
913
914
  void PrevImpl() override;
915
  void NextImpl() override;
916
  void SeekToFirstImpl() override;
917
  void SeekToLastImpl() override;
918
919
 private:
920
  bool value_delta_encoded_;
921
  bool have_first_key_;  // value includes first_internal_key
922
  BlockPrefixIndex* prefix_index_;
923
  // Whether the value is delta encoded. In that case the value is assumed to be
924
  // BlockHandle. The first value in each restart interval is the full encoded
925
  // BlockHandle; the restart of encoded size part of the BlockHandle. The
926
  // offset of delta encoded BlockHandles is computed by adding the size of
927
  // previous delta encoded values in the same restart interval to the offset of
928
  // the first value in that restart interval.
929
  IndexValue decoded_value_;
930
931
  // When sequence number overwriting is enabled, this struct contains the seqno
932
  // to overwrite with, and current first_internal_key with overwritten seqno.
933
  // This is rarely used, so we put it behind a pointer and only allocate when
934
  // needed.
935
  struct GlobalSeqnoState {
936
    // First internal key according to current index entry, but with sequence
937
    // number overwritten to global_seqno.
938
    IterKey first_internal_key;
939
    SequenceNumber global_seqno;
940
941
0
    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
942
  };
943
944
  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;
945
946
  // Buffers the `first_internal_key` referred by `decoded_value_` when
947
  // `pad_min_timestamp_` is true.
948
  std::string first_internal_key_with_ts_;
949
950
  // Set *prefix_may_exist to false if no key possibly share the same prefix
951
  // as `target`. If not set, the result position should be the same as total
952
  // order Seek.
953
  bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
954
  // Set *prefix_may_exist to false if no key can possibly share the same
955
  // prefix as `target`. If not set, the result position should be the same
956
  // as total order seek.
957
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
958
                            uint32_t left, uint32_t right, uint32_t* index,
959
                            bool* prefix_may_exist);
960
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);
961
962
  inline bool ParseNextIndexKey();
963
964
  // When value_delta_encoded_ is enabled it decodes the value which is assumed
965
  // to be BlockHandle and put it to decoded_value_
966
  inline void DecodeCurrentValue(bool is_shared);
967
};
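
Finally, a sketch (not part of block.h) of iterating a non-partitioned index block to enumerate data-block handles. The flag values below mirror a table written with the default index format and are assumptions here, as is caller ownership of the returned iterator.

// Illustration only.
inline void ListDataBlockHandles(Block* index_block, const Comparator* ucmp) {
  std::unique_ptr<IndexBlockIter> iter{index_block->NewIndexIterator(
      ucmp, kDisableGlobalSequenceNumber, /*iter=*/nullptr, /*stats=*/nullptr,
      /*total_order_seek=*/true, /*have_first_key=*/false,
      /*key_includes_seq=*/true, /*value_is_full=*/true)};
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    BlockHandle handle = iter->value().handle;
    (void)handle;  // offset + size of one data block within the SST file
  }
}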

}  // namespace ROCKSDB_NAMESPACE