Coverage Report

Created: 2024-09-08 07:17

/src/rocksdb/table/block_based/block.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
//
10
// Decodes the blocks generated by block_builder.cc.
11
12
#include "table/block_based/block.h"
13
14
#include <algorithm>
15
#include <string>
16
#include <unordered_map>
17
#include <vector>
18
19
#include "monitoring/perf_context_imp.h"
20
#include "port/port.h"
21
#include "port/stack_trace.h"
22
#include "rocksdb/comparator.h"
23
#include "table/block_based/block_prefix_index.h"
24
#include "table/block_based/data_block_footer.h"
25
#include "table/format.h"
26
#include "util/coding.h"
27
28
namespace ROCKSDB_NAMESPACE {
29
30
// Helper routine: decode the next block entry starting at "p",
31
// storing the number of shared key bytes, non_shared key bytes,
32
// and the length of the value in "*shared", "*non_shared", and
33
// "*value_length", respectively.  Will not dereference past "limit".
34
//
35
// If any errors are detected, returns nullptr.  Otherwise, returns a
36
// pointer to the key delta (just past the three decoded values).
37
struct DecodeEntry {
38
  inline const char* operator()(const char* p, const char* limit,
39
                                uint32_t* shared, uint32_t* non_shared,
40
284k
                                uint32_t* value_length) {
41
    // We need 2 bytes for shared and non_shared size. We also need one more
42
    // byte either for value size or the actual value in case of value delta
43
    // encoding.
44
284k
    assert(limit - p >= 3);
45
284k
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
46
284k
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
47
284k
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
48
284k
    if ((*shared | *non_shared | *value_length) < 128) {
49
      // Fast path: all three values are encoded in one byte each
50
205k
      p += 3;
51
205k
    } else {
52
78.8k
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
53
0
        return nullptr;
54
0
      }
55
78.8k
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
56
0
        return nullptr;
57
0
      }
58
78.8k
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
59
0
        return nullptr;
60
0
      }
61
78.8k
    }
62
63
    // Using an assert in place of "return null" since we should not pay the
64
    // cost of checking for corruption on every single key decoding
65
284k
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
66
284k
    return p;
67
284k
  }
68
};
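For context, a minimal sketch of the entry layout that DecodeEntry parses; the production encoder is BlockBuilder in block_builder.cc, and the PutVarint32 helper from util/coding.h is assumed here:

// Sketch only: one block entry is three varint32 headers followed by the
// non-shared key bytes and then the value bytes:
//   varint32 shared | varint32 non_shared | varint32 value_length |
//   key delta (non_shared bytes) | value (value_length bytes)
static void AppendEntrySketch(std::string* out, const Slice& key,
                              const Slice& value, uint32_t shared) {
  assert(shared <= key.size());
  const uint32_t non_shared = static_cast<uint32_t>(key.size()) - shared;
  PutVarint32(out, shared);
  PutVarint32(out, non_shared);
  PutVarint32(out, static_cast<uint32_t>(value.size()));
  out->append(key.data() + shared, non_shared);  // key delta
  out->append(value.data(), value.size());       // value
}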
69
70
// Helper routine: similar to DecodeEntry but does not have assertions.
71
// Instead, returns nullptr so that caller can detect and report failure.
72
struct CheckAndDecodeEntry {
73
  inline const char* operator()(const char* p, const char* limit,
74
                                uint32_t* shared, uint32_t* non_shared,
75
278k
                                uint32_t* value_length) {
76
    // We need 2 bytes for shared and non_shared size. We also need one more
77
    // byte either for value size or the actual value in case of value delta
78
    // encoding.
79
278k
    if (limit - p < 3) {
80
0
      return nullptr;
81
0
    }
82
278k
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
83
278k
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
84
278k
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
85
278k
    if ((*shared | *non_shared | *value_length) < 128) {
86
      // Fast path: all three values are encoded in one byte each
87
271k
      p += 3;
88
271k
    } else {
89
7.06k
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
90
0
        return nullptr;
91
0
      }
92
7.06k
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
93
0
        return nullptr;
94
0
      }
95
7.06k
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
96
0
        return nullptr;
97
0
      }
98
7.06k
    }
99
100
278k
    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
101
0
      return nullptr;
102
0
    }
103
278k
    return p;
104
278k
  }
105
};
106
107
struct DecodeKey {
108
  inline const char* operator()(const char* p, const char* limit,
109
49.7k
                                uint32_t* shared, uint32_t* non_shared) {
110
49.7k
    uint32_t value_length;
111
49.7k
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
112
49.7k
  }
113
};
114
115
// In format_version 4, which is used by index blocks, the value size is not
116
// encoded before the entry, as the value is known to be the handle with the
117
// known size.
118
struct DecodeKeyV4 {
119
  inline const char* operator()(const char* p, const char* limit,
120
12.8k
                                uint32_t* shared, uint32_t* non_shared) {
121
    // We need 2 bytes for shared and non_shared size. We also need one more
122
    // byte either for value size or the actual value in case of value delta
123
    // encoding.
124
12.8k
    if (limit - p < 3) {
125
0
      return nullptr;
126
0
    }
127
12.8k
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
128
12.8k
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
129
12.8k
    if ((*shared | *non_shared) < 128) {
130
      // Fast path: all three values are encoded in one byte each
131
9.64k
      p += 2;
132
9.64k
    } else {
133
3.16k
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
134
0
        return nullptr;
135
0
      }
136
3.16k
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
137
0
        return nullptr;
138
0
      }
139
3.16k
    }
140
12.8k
    return p;
141
12.8k
  }
142
};
143
144
struct DecodeEntryV4 {
145
  inline const char* operator()(const char* p, const char* limit,
146
                                uint32_t* shared, uint32_t* non_shared,
147
10.9k
                                uint32_t* value_length) {
148
10.9k
    assert(value_length);
149
150
10.9k
    *value_length = 0;
151
10.9k
    return DecodeKeyV4()(p, limit, shared, non_shared);
152
10.9k
  }
153
};
154
155
233k
void DataBlockIter::NextImpl() {
156
#ifndef NDEBUG
157
  if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
158
    return;
159
  }
160
#endif
161
233k
  bool is_shared = false;
162
233k
  ParseNextDataKey(&is_shared);
163
233k
  ++cur_entry_idx_;
164
233k
}
165
166
281k
void MetaBlockIter::NextImpl() {
167
281k
  bool is_shared = false;
168
281k
  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
169
281k
  ++cur_entry_idx_;
170
281k
}
171
172
10.4k
void IndexBlockIter::NextImpl() {
173
10.4k
  ParseNextIndexKey();
174
10.4k
  ++cur_entry_idx_;
175
10.4k
}
176
177
1.53k
void IndexBlockIter::PrevImpl() {
178
1.53k
  assert(Valid());
179
  // Scan backwards to a restart point before current_
180
1.53k
  const uint32_t original = current_;
181
1.53k
  while (GetRestartPoint(restart_index_) >= original) {
182
1.53k
    if (restart_index_ == 0) {
183
      // No more entries
184
1.53k
      current_ = restarts_;
185
1.53k
      restart_index_ = num_restarts_;
186
1.53k
      return;
187
1.53k
    }
188
0
    restart_index_--;
189
0
  }
190
0
  SeekToRestartPoint(restart_index_);
191
  // Loop until end of current entry hits the start of original entry
192
0
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
193
0
  }
194
0
  --cur_entry_idx_;
195
0
}
196
197
0
void MetaBlockIter::PrevImpl() {
198
0
  assert(Valid());
199
  // Scan backwards to a restart point before current_
200
0
  const uint32_t original = current_;
201
0
  while (GetRestartPoint(restart_index_) >= original) {
202
0
    if (restart_index_ == 0) {
203
      // No more entries
204
0
      current_ = restarts_;
205
0
      restart_index_ = num_restarts_;
206
0
      return;
207
0
    }
208
0
    restart_index_--;
209
0
  }
210
0
  SeekToRestartPoint(restart_index_);
211
0
  bool is_shared = false;
212
  // Loop until end of current entry hits the start of original entry
213
0
  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
214
0
         NextEntryOffset() < original) {
215
0
  }
216
0
  --cur_entry_idx_;
217
0
}
218
219
// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
220
1.71k
void DataBlockIter::PrevImpl() {
221
1.71k
  assert(Valid());
222
223
1.71k
  assert(prev_entries_idx_ == -1 ||
224
1.71k
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
225
1.71k
  --cur_entry_idx_;
226
  // Check if we can use cached prev_entries_
227
1.71k
  if (prev_entries_idx_ > 0 &&
228
1.71k
      prev_entries_[prev_entries_idx_].offset == current_) {
229
    // Read cached CachedPrevEntry
230
0
    prev_entries_idx_--;
231
0
    const CachedPrevEntry& current_prev_entry =
232
0
        prev_entries_[prev_entries_idx_];
233
234
0
    const char* key_ptr = nullptr;
235
0
    bool raw_key_cached;
236
0
    if (current_prev_entry.key_ptr != nullptr) {
237
      // The key is not delta encoded and stored in the data block
238
0
      key_ptr = current_prev_entry.key_ptr;
239
0
      raw_key_cached = false;
240
0
    } else {
241
      // The key is delta encoded and stored in prev_entries_keys_buff_
242
0
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
243
0
      raw_key_cached = true;
244
0
    }
245
0
    const Slice current_key(key_ptr, current_prev_entry.key_size);
246
247
0
    current_ = current_prev_entry.offset;
248
    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
249
    // not necessity. It is convenient since this class treats keys as pinned
250
    // when `raw_key_` points to an outside buffer. So we cannot allow
251
    // `raw_key_` point into Prev cache as it is a transient outside buffer
252
    // (i.e., keys in it are not actually pinned).
253
0
    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
254
0
    value_ = current_prev_entry.value;
255
256
0
    return;
257
0
  }
258
259
  // Clear prev entries cache
260
1.71k
  prev_entries_idx_ = -1;
261
1.71k
  prev_entries_.clear();
262
1.71k
  prev_entries_keys_buff_.clear();
263
264
  // Scan backwards to a restart point before current_
265
1.71k
  const uint32_t original = current_;
266
1.71k
  while (GetRestartPoint(restart_index_) >= original) {
267
1.53k
    if (restart_index_ == 0) {
268
      // No more entries
269
1.53k
      current_ = restarts_;
270
1.53k
      restart_index_ = num_restarts_;
271
1.53k
      return;
272
1.53k
    }
273
0
    restart_index_--;
274
0
  }
275
276
174
  SeekToRestartPoint(restart_index_);
277
278
174
  do {
279
174
    bool is_shared = false;
280
174
    if (!ParseNextDataKey(&is_shared)) {
281
0
      break;
282
0
    }
283
174
    Slice current_key = raw_key_.GetKey();
284
285
174
    if (raw_key_.IsKeyPinned()) {
286
      // The key is not delta encoded
287
174
      prev_entries_.emplace_back(current_, current_key.data(), 0,
288
174
                                 current_key.size(), value());
289
174
    } else {
290
      // The key is delta encoded, cache decoded key in buffer
291
0
      size_t new_key_offset = prev_entries_keys_buff_.size();
292
0
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());
293
294
0
      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
295
0
                                 current_key.size(), value());
296
0
    }
297
    // Loop until end of current entry hits the start of original entry
298
174
  } while (NextEntryOffset() < original);
299
0
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
300
174
}
301
302
678
void DataBlockIter::SeekImpl(const Slice& target) {
303
678
  Slice seek_key = target;
304
678
  PERF_TIMER_GUARD(block_seek_nanos);
305
678
  if (data_ == nullptr) {  // Not init yet
306
0
    return;
307
0
  }
308
678
  uint32_t index = 0;
309
678
  bool skip_linear_scan = false;
310
678
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
311
312
678
  if (!ok) {
313
0
    return;
314
0
  }
315
678
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
316
678
}
317
318
29.8k
void MetaBlockIter::SeekImpl(const Slice& target) {
319
29.8k
  Slice seek_key = target;
320
29.8k
  PERF_TIMER_GUARD(block_seek_nanos);
321
29.8k
  if (data_ == nullptr) {  // Not init yet
322
0
    return;
323
0
  }
324
29.8k
  uint32_t index = 0;
325
29.8k
  bool skip_linear_scan = false;
326
29.8k
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
327
328
29.8k
  if (!ok) {
329
0
    return;
330
0
  }
331
29.8k
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
332
29.8k
}
333
334
// Optimized Seek for point lookup for an internal key `target`
335
// target = "seek_user_key @ type | seqno".
336
//
337
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
338
// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
339
// kTypeMerge, this function behaves identically to Seek().
340
//
341
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
342
// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
343
// kTypeMerge:
344
//
345
// If the return value is FALSE, iter location is undefined, and it means:
346
// 1) there is no key in this block falling into the range:
347
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
348
//    inclusive; AND
349
// 2) the last key of this block has a greater user_key than seek_user_key
350
//
351
// If the return value is TRUE, iter location has two possibilities:
352
// 1) If iter is valid, it is set to a location as if set by SeekImpl(target).
353
//    In this case, it points to the first key with a larger user_key or a
354
//    matching user_key with a seqno no greater than the seeking seqno.
355
// 2) If the iter is invalid, it means that either all the user_keys are less
356
//    than the seek_user_key, or the block ends with a matching user_key but
357
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
358
//    but larger type).
359
0
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
360
0
  Slice target_user_key = ExtractUserKey(target);
361
0
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
362
0
  uint8_t entry =
363
0
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
364
365
0
  if (entry == kCollision) {
366
    // HashSeek not effective, falling back
367
0
    SeekImpl(target);
368
0
    return true;
369
0
  }
370
371
0
  if (entry == kNoEntry) {
372
    // Even if we cannot find the user_key in this block, the result may
373
    // exist in the next block. Consider this example:
374
    //
375
    // Block N:    [aab@100, ... , app@120]
376
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
377
    // Block N+1:  [axy@10, ...   ]
378
    //
379
    // If seek_key = axy@60, the search will start from Block N.
380
    // Even if the user_key is not found in the hash map, the caller still
381
    // has to continue searching the next block.
382
    //
383
    // In this case, we pretend the key is in the last restart interval.
384
    // The while-loop below will search the last restart interval for the
385
    // key. It will stop at the first key that is larger than the seek_key,
386
    // or to the end of the block if no one is larger.
387
0
    entry = static_cast<uint8_t>(num_restarts_ - 1);
388
0
  }
389
390
0
  uint32_t restart_index = entry;
391
392
  // check if the key is in the restart_interval
393
0
  assert(restart_index < num_restarts_);
394
0
  SeekToRestartPoint(restart_index);
395
0
  current_ = GetRestartPoint(restart_index);
396
0
  cur_entry_idx_ =
397
0
      static_cast<int32_t>(restart_index * block_restart_interval_) - 1;
398
399
0
  uint32_t limit = restarts_;
400
0
  if (restart_index + 1 < num_restarts_) {
401
0
    limit = GetRestartPoint(restart_index + 1);
402
0
  }
403
0
  while (current_ < limit) {
404
0
    ++cur_entry_idx_;
405
0
    bool shared;
406
    // Here we only linear seek the target key inside the restart interval.
407
    // If a key does not exist inside a restart interval, we avoid
408
    // further searching the block content across restart interval boundary.
409
    //
410
    // TODO(fwu): check the left and right boundary of the restart interval
411
    // to avoid linear seek a target key that is out of range.
412
0
    if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) {
413
      // we stop at the first potential matching user key.
414
0
      break;
415
0
    }
416
    // If the loop exits due to CompareCurrentKey(target) >= 0, then current key
417
    // exists, and its checksum verification will be done in UpdateKey() called
418
    // in SeekForGet().
419
    // TODO(cbi): If this loop exits with current_ == restart_, per key-value
420
    //  checksum will not be verified in UpdateKey() since Valid()
421
    //  will return false.
422
0
  }
423
424
0
  if (current_ == restarts_) {
425
    // Search reaches to the end of the block. There are three possibilities:
426
    // 1) there is only one user_key match in the block (otherwise collision).
427
    //    the matching user_key resides in the last restart interval, and it
428
    //    is the last key of the restart interval and of the block as well.
429
    //    ParseNextKey() skipped it as its [ type | seqno ] is smaller.
430
    //
431
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
432
    //    AND all existing user_keys in the restart interval are smaller than
433
    //    seek_user_key.
434
    //
435
    // 3) The seek_key is a false positive and happens to be hashed to the
436
    //    last restart interval, AND all existing user_keys in the restart
437
    //    interval are smaller than seek_user_key.
438
    //
439
    // The result may exist in the next block in each case, so we return true.
440
0
    return true;
441
0
  }
442
443
0
  if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(),
444
0
                                        target_user_key) != 0) {
445
    // the key is not in this block and cannot be at the next block either.
446
0
    return false;
447
0
  }
448
449
  // Here we are conservative and only support a limited set of cases
450
0
  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
451
0
  if (value_type != ValueType::kTypeValue &&
452
0
      value_type != ValueType::kTypeDeletion &&
453
0
      value_type != ValueType::kTypeMerge &&
454
0
      value_type != ValueType::kTypeSingleDeletion &&
455
0
      value_type != ValueType::kTypeBlobIndex &&
456
0
      value_type != ValueType::kTypeWideColumnEntity &&
457
0
      value_type != ValueType::kTypeValuePreferredSeqno) {
458
0
    SeekImpl(target);
459
0
  }
460
461
  // Result found, and the iter is correctly set.
462
0
  return true;
463
0
}
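A hedged sketch of how a point-lookup caller consumes the contract documented above (the real consumer is the Get path in block_based_table_reader.cc; the helper name and out-parameter here are illustrative):

// Illustrative only: interpreting SeekForGet()'s return value and the
// resulting iterator position for a point lookup.
static bool LookupInBlockSketch(DataBlockIter* iter, const Slice& target,
                                const Comparator* ucmp,
                                bool* may_be_in_next_block) {
  if (!iter->SeekForGet(target)) {
    // FALSE: the key is not in this block and cannot be in a later block.
    *may_be_in_next_block = false;
    return false;
  }
  // TRUE: positioned as if by Seek(target), or invalid; either way the
  // result may still live in a later block, so the caller keeps searching.
  *may_be_in_next_block = true;
  return iter->Valid() &&
         ucmp->Compare(ExtractUserKey(iter->key()),
                       ExtractUserKey(target)) == 0;
}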
464
465
1.82k
void IndexBlockIter::SeekImpl(const Slice& target) {
466
#ifndef NDEBUG
467
  if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) {
468
    return;
469
  }
470
#endif
471
1.82k
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
472
1.82k
  PERF_TIMER_GUARD(block_seek_nanos);
473
1.82k
  if (data_ == nullptr) {  // Not init yet
474
0
    return;
475
0
  }
476
1.82k
  Slice seek_key = target;
477
1.82k
  if (raw_key_.IsUserKey()) {
478
1.82k
    seek_key = ExtractUserKey(target);
479
1.82k
  }
480
1.82k
  status_ = Status::OK();
481
1.82k
  uint32_t index = 0;
482
1.82k
  bool skip_linear_scan = false;
483
1.82k
  bool ok = false;
484
1.82k
  if (prefix_index_) {
485
0
    bool prefix_may_exist = true;
486
0
    ok = PrefixSeek(target, &index, &prefix_may_exist);
487
0
    if (!prefix_may_exist) {
488
      // This is to let the caller distinguish between a non-existing prefix,
489
      // and when key is larger than the last key, which both set Valid() to
490
      // false.
491
0
      current_ = restarts_;
492
0
      status_ = Status::NotFound();
493
0
    }
494
    // restart interval must be one when hash search is enabled so the binary
495
    // search simply lands at the right place.
496
0
    skip_linear_scan = true;
497
1.82k
  } else if (value_delta_encoded_) {
498
1.82k
    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
499
1.82k
  } else {
500
0
    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
501
0
  }
502
503
1.82k
  if (!ok) {
504
0
    return;
505
0
  }
506
1.82k
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
507
1.82k
}
508
509
1.14k
void DataBlockIter::SeekForPrevImpl(const Slice& target) {
510
1.14k
  PERF_TIMER_GUARD(block_seek_nanos);
511
1.14k
  Slice seek_key = target;
512
1.14k
  if (data_ == nullptr) {  // Not init yet
513
0
    return;
514
0
  }
515
1.14k
  uint32_t index = 0;
516
1.14k
  bool skip_linear_scan = false;
517
1.14k
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
518
519
1.14k
  if (!ok) {
520
0
    return;
521
0
  }
522
1.14k
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
523
524
1.14k
  if (!Valid()) {
525
581
    if (status_.ok()) {
526
581
      SeekToLastImpl();
527
581
    }
528
581
  } else {
529
1.13k
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
530
566
      PrevImpl();
531
566
    }
532
566
  }
533
1.14k
}
534
535
0
void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
536
0
  PERF_TIMER_GUARD(block_seek_nanos);
537
0
  Slice seek_key = target;
538
0
  if (data_ == nullptr) {  // Not init yet
539
0
    return;
540
0
  }
541
0
  uint32_t index = 0;
542
0
  bool skip_linear_scan = false;
543
0
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
544
545
0
  if (!ok) {
546
0
    return;
547
0
  }
548
0
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
549
550
0
  if (!Valid()) {
551
0
    if (status_.ok()) {
552
0
      SeekToLastImpl();
553
0
    }
554
0
  } else {
555
0
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
556
0
      PrevImpl();
557
0
    }
558
0
  }
559
0
}
560
561
15.0k
void DataBlockIter::SeekToFirstImpl() {
562
15.0k
  if (data_ == nullptr) {  // Not init yet
563
0
    return;
564
0
  }
565
15.0k
  SeekToRestartPoint(0);
566
15.0k
  bool is_shared = false;
567
15.0k
  ParseNextDataKey(&is_shared);
568
15.0k
  cur_entry_idx_ = 0;
569
15.0k
}
570
571
7.48k
void MetaBlockIter::SeekToFirstImpl() {
572
7.48k
  if (data_ == nullptr) {  // Not init yet
573
0
    return;
574
0
  }
575
7.48k
  SeekToRestartPoint(0);
576
7.48k
  bool is_shared = false;
577
7.48k
  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
578
7.48k
  cur_entry_idx_ = 0;
579
7.48k
}
580
581
6.08k
void IndexBlockIter::SeekToFirstImpl() {
582
#ifndef NDEBUG
583
  if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) {
584
    return;
585
  }
586
#endif
587
6.08k
  if (data_ == nullptr) {  // Not init yet
588
0
    return;
589
0
  }
590
6.08k
  status_ = Status::OK();
591
6.08k
  SeekToRestartPoint(0);
592
6.08k
  ParseNextIndexKey();
593
6.08k
  cur_entry_idx_ = 0;
594
6.08k
}
595
596
970
void DataBlockIter::SeekToLastImpl() {
597
970
  if (data_ == nullptr) {  // Not init yet
598
0
    return;
599
0
  }
600
970
  SeekToRestartPoint(num_restarts_ - 1);
601
970
  bool is_shared = false;
602
970
  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
603
970
  while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
604
    // Keep skipping
605
0
    ++cur_entry_idx_;
606
0
  }
607
970
}
608
609
0
void MetaBlockIter::SeekToLastImpl() {
610
0
  if (data_ == nullptr) {  // Not init yet
611
0
    return;
612
0
  }
613
0
  SeekToRestartPoint(num_restarts_ - 1);
614
0
  bool is_shared = false;
615
0
  assert(num_restarts_ >= 1);
616
0
  cur_entry_idx_ =
617
0
      static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
618
0
  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
619
0
         NextEntryOffset() < restarts_) {
620
    // Will probably never reach here since restart_interval is always 1
621
0
    ++cur_entry_idx_;
622
0
  }
623
0
}
624
625
389
void IndexBlockIter::SeekToLastImpl() {
626
389
  if (data_ == nullptr) {  // Not init yet
627
0
    return;
628
0
  }
629
389
  status_ = Status::OK();
630
389
  SeekToRestartPoint(num_restarts_ - 1);
631
389
  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
632
389
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
633
0
    ++cur_entry_idx_;
634
0
  }
635
389
}
636
637
template <class TValue>
638
template <typename DecodeEntryFunc>
639
556k
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
640
556k
  current_ = NextEntryOffset();
641
556k
  const char* p = data_ + current_;
642
556k
  const char* limit = data_ + restarts_;  // Restarts come right after data
643
644
556k
  if (p >= limit) {
645
    // No more entries to return.  Mark as invalid.
646
33.1k
    current_ = restarts_;
647
33.1k
    restart_index_ = num_restarts_;
648
33.1k
    return false;
649
33.1k
  }
650
  // Decode next entry
651
523k
  uint32_t shared, non_shared, value_length;
652
523k
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
653
523k
  if (p == nullptr || raw_key_.Size() < shared) {
654
0
    CorruptionError();
655
0
    return false;
656
523k
  } else {
657
523k
    if (shared == 0) {
658
274k
      *is_shared = false;
659
      // If this key doesn't share any bytes with prev key, and no min timestamp
660
      // needs to be padded to the key, then we don't need to decode it and
661
      // can use its address in the block directly (no copy).
662
274k
      UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared));
663
274k
    } else {
664
      // This key shares `shared` bytes with the prev key; we need to decode it
665
248k
      *is_shared = true;
666
      // If user-defined timestamp is stripped from user key before keys are
667
      // delta encoded, the decoded key consisting of the shared and non shared
668
      // bytes do not have user-defined timestamp yet. We need to pad min
669
      // timestamp to it.
670
248k
      if (pad_min_timestamp_) {
671
0
        raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_);
672
248k
      } else {
673
248k
        raw_key_.TrimAppend(shared, p, non_shared);
674
248k
      }
675
248k
    }
676
523k
    value_ = Slice(p + non_shared, value_length);
677
523k
    if (shared == 0) {
678
468k
      while (restart_index_ + 1 < num_restarts_ &&
679
468k
             GetRestartPoint(restart_index_ + 1) < current_) {
680
193k
        ++restart_index_;
681
193k
      }
682
274k
    }
683
    // else we are in the middle of a restart interval and the restart_index_
684
    // thus has not changed
685
523k
    return true;
686
523k
  }
687
523k
}
bool rocksdb::BlockIter<rocksdb::Slice>::ParseNextKey<rocksdb::DecodeEntry>(bool*)
Line
Count
Source
639
249k
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
640
249k
  current_ = NextEntryOffset();
641
249k
  const char* p = data_ + current_;
642
249k
  const char* limit = data_ + restarts_;  // Restarts come right after data
643
644
249k
  if (p >= limit) {
645
    // No more entries to return.  Mark as invalid.
646
15.4k
    current_ = restarts_;
647
15.4k
    restart_index_ = num_restarts_;
648
15.4k
    return false;
649
15.4k
  }
650
  // Decode next entry
651
234k
  uint32_t shared, non_shared, value_length;
652
234k
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
653
234k
  if (p == nullptr || raw_key_.Size() < shared) {
654
0
    CorruptionError();
655
0
    return false;
656
234k
  } else {
657
234k
    if (shared == 0) {
658
226k
      *is_shared = false;
659
      // If this key doesn't share any bytes with prev key, and no min timestamp
660
      // needs to be padded to the key, then we don't need to decode it and
661
      // can use its address in the block directly (no copy).
662
226k
      UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared));
663
226k
    } else {
664
      // This key shares `shared` bytes with the prev key; we need to decode it
665
7.90k
      *is_shared = true;
666
      // If user-defined timestamp is stripped from user key before keys are
667
      // delta encoded, the decoded key consisting of the shared and non shared
668
      // bytes do not have user-defined timestamp yet. We need to pad min
669
      // timestamp to it.
670
7.90k
      if (pad_min_timestamp_) {
671
0
        raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_);
672
7.90k
      } else {
673
7.90k
        raw_key_.TrimAppend(shared, p, non_shared);
674
7.90k
      }
675
7.90k
    }
676
234k
    value_ = Slice(p + non_shared, value_length);
677
234k
    if (shared == 0) {
678
417k
      while (restart_index_ + 1 < num_restarts_ &&
679
417k
             GetRestartPoint(restart_index_ + 1) < current_) {
680
190k
        ++restart_index_;
681
190k
      }
682
226k
    }
683
    // else we are in the middle of a restart interval and the restart_index_
684
    // thus has not changed
685
234k
    return true;
686
234k
  }
687
234k
}
bool rocksdb::BlockIter<rocksdb::IndexValue>::ParseNextKey<rocksdb::DecodeEntryV4>(bool*)
Line
Count
Source
639
16.8k
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
640
16.8k
  current_ = NextEntryOffset();
641
16.8k
  const char* p = data_ + current_;
642
16.8k
  const char* limit = data_ + restarts_;  // Restarts come right after data
643
644
16.8k
  if (p >= limit) {
645
    // No more entries to return.  Mark as invalid.
646
5.91k
    current_ = restarts_;
647
5.91k
    restart_index_ = num_restarts_;
648
5.91k
    return false;
649
5.91k
  }
650
  // Decode next entry
651
10.9k
  uint32_t shared, non_shared, value_length;
652
10.9k
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
653
10.9k
  if (p == nullptr || raw_key_.Size() < shared) {
654
0
    CorruptionError();
655
0
    return false;
656
10.9k
  } else {
657
10.9k
    if (shared == 0) {
658
10.9k
      *is_shared = false;
659
      // If this key doesn't share any bytes with prev key, and no min timestamp
660
      // needs to be padded to the key, then we don't need to decode it and
661
      // can use its address in the block directly (no copy).
662
10.9k
      UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared));
663
18.4E
    } else {
664
      // This key shares `shared` bytes with the prev key; we need to decode it
665
18.4E
      *is_shared = true;
666
      // If user-defined timestamp is stripped from user key before keys are
667
      // delta encoded, the decoded key consisting of the shared and non shared
668
      // bytes do not have user-defined timestamp yet. We need to pad min
669
      // timestamp to it.
670
18.4E
      if (pad_min_timestamp_) {
671
0
        raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_);
672
18.4E
      } else {
673
18.4E
        raw_key_.TrimAppend(shared, p, non_shared);
674
18.4E
      }
675
18.4E
    }
676
10.9k
    value_ = Slice(p + non_shared, value_length);
677
10.9k
    if (shared == 0) {
678
13.9k
      while (restart_index_ + 1 < num_restarts_ &&
679
13.9k
             GetRestartPoint(restart_index_ + 1) < current_) {
680
2.96k
        ++restart_index_;
681
2.96k
      }
682
10.9k
    }
683
    // else we are in the middle of a restart interval and the restart_index_
684
    // thus has not changed
685
10.9k
    return true;
686
10.9k
  }
687
10.9k
}
Unexecuted instantiation: bool rocksdb::BlockIter<rocksdb::IndexValue>::ParseNextKey<rocksdb::DecodeEntry>(bool*)
bool rocksdb::BlockIter<rocksdb::Slice>::ParseNextKey<rocksdb::CheckAndDecodeEntry>(bool*)
Line
Count
Source
639
289k
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
640
289k
  current_ = NextEntryOffset();
641
289k
  const char* p = data_ + current_;
642
289k
  const char* limit = data_ + restarts_;  // Restarts come right after data
643
644
289k
  if (p >= limit) {
645
    // No more entries to return.  Mark as invalid.
646
11.8k
    current_ = restarts_;
647
11.8k
    restart_index_ = num_restarts_;
648
11.8k
    return false;
649
11.8k
  }
650
  // Decode next entry
651
277k
  uint32_t shared, non_shared, value_length;
652
277k
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
653
278k
  if (p == nullptr || raw_key_.Size() < shared) {
654
0
    CorruptionError();
655
0
    return false;
656
277k
  } else {
657
277k
    if (shared == 0) {
658
37.3k
      *is_shared = false;
659
      // If this key doesn't share any bytes with prev key, and no min timestamp
660
      // needs to be padded to the key, then we don't need to decode it and
661
      // can use its address in the block directly (no copy).
662
37.3k
      UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared));
663
240k
    } else {
664
      // This key shares `shared` bytes with the prev key; we need to decode it
665
240k
      *is_shared = true;
666
      // If user-defined timestamp is stripped from user key before keys are
667
      // delta encoded, the decoded key consisting of the shared and non shared
668
      // bytes do not have user-defined timestamp yet. We need to pad min
669
      // timestamp to it.
670
240k
      if (pad_min_timestamp_) {
671
0
        raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_);
672
240k
      } else {
673
240k
        raw_key_.TrimAppend(shared, p, non_shared);
674
240k
      }
675
240k
    }
676
277k
    value_ = Slice(p + non_shared, value_length);
677
277k
    if (shared == 0) {
678
37.2k
      while (restart_index_ + 1 < num_restarts_ &&
679
37.2k
             GetRestartPoint(restart_index_ + 1) < current_) {
680
0
        ++restart_index_;
681
0
      }
682
37.2k
    }
683
    // else we are in the middle of a restart interval and the restart_index_
684
    // thus has not changed
685
277k
    return true;
686
277k
  }
687
277k
}
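As a point of reference for the NextEntryOffset()/GetRestartPoint() calls used throughout ParseNextKey, a sketch of how the restart array is addressed; this mirrors the accessor in block.h under the assumption that restart offsets are stored as fixed32 values directly after the entry data:

// Sketch: restart point i is a fixed32 offset stored in the array that
// begins at data + restarts (the entries end where the restart array starts,
// hence "Restarts come right after data" above).
static uint32_t GetRestartPointSketch(const char* data, uint32_t restarts,
                                      uint32_t index) {
  return DecodeFixed32(data + restarts + index * sizeof(uint32_t));
}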
688
689
249k
bool DataBlockIter::ParseNextDataKey(bool* is_shared) {
690
249k
  if (ParseNextKey<DecodeEntry>(is_shared)) {
691
#ifndef NDEBUG
692
    if (global_seqno_ != kDisableGlobalSequenceNumber) {
693
      // If we are reading a file with a global sequence number we should
694
      // expect that all encoded sequence numbers are zeros and any value
695
      // type is kTypeValue, kTypeMerge, kTypeDeletion,
696
      // kTypeDeletionWithTimestamp, kTypeRangeDeletion, or
697
      // kTypeWideColumnEntity.
698
      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
699
      SequenceNumber seqno;
700
      ValueType value_type;
701
      UnPackSequenceAndType(packed, &seqno, &value_type);
702
      assert(value_type == ValueType::kTypeValue ||
703
             value_type == ValueType::kTypeMerge ||
704
             value_type == ValueType::kTypeDeletion ||
705
             value_type == ValueType::kTypeDeletionWithTimestamp ||
706
             value_type == ValueType::kTypeRangeDeletion ||
707
             value_type == ValueType::kTypeWideColumnEntity);
708
      assert(seqno == 0);
709
    }
710
#endif  // NDEBUG
711
234k
    return true;
712
234k
  } else {
713
15.4k
    return false;
714
15.4k
  }
715
249k
}
716
717
16.8k
bool IndexBlockIter::ParseNextIndexKey() {
718
16.8k
  bool is_shared = false;
719
16.8k
  bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared)
720
16.8k
                                   : ParseNextKey<DecodeEntry>(&is_shared);
721
16.8k
  if (ok) {
722
10.9k
    if (value_delta_encoded_ || global_seqno_state_ != nullptr ||
723
10.9k
        pad_min_timestamp_) {
724
10.9k
      DecodeCurrentValue(is_shared);
725
10.9k
    }
726
10.9k
  }
727
16.8k
  return ok;
728
16.8k
}
729
730
// The format:
731
// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
732
// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
733
// ...
734
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
735
// where k is key, v is value, and its encoding is in parentheses.
736
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
737
// The format of each value, i.e., block handle, is (offset, size) whenever
738
// is_shared is false, which includes the first entry in each restart point.
739
// Otherwise, the format is delta-size = the size of the current block - the size of
740
// the last block.
741
10.9k
void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
742
10.9k
  Slice v(value_.data(), data_ + restarts_ - value_.data());
743
  // Delta encoding is used if `shared` != 0.
744
10.9k
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
745
10.9k
      &v, have_first_key_,
746
10.9k
      (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
747
10.9k
  assert(decode_s.ok());
748
10.9k
  value_ = Slice(value_.data(), v.data() - value_.data());
749
750
10.9k
  if (global_seqno_state_ != nullptr) {
751
    // Overwrite sequence number the same way as in DataBlockIter.
752
753
0
    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
754
0
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
755
0
                                      /* copy */ true);
756
757
0
    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);
758
759
0
    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
760
0
    assert(value_type == ValueType::kTypeValue ||
761
0
           value_type == ValueType::kTypeMerge ||
762
0
           value_type == ValueType::kTypeDeletion ||
763
0
           value_type == ValueType::kTypeRangeDeletion ||
764
0
           value_type == ValueType::kTypeWideColumnEntity);
765
766
0
    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
767
0
                                         value_type);
768
0
    decoded_value_.first_internal_key = first_internal_key.GetKey();
769
0
  }
770
10.9k
  if (pad_min_timestamp_ && !decoded_value_.first_internal_key.empty()) {
771
0
    first_internal_key_with_ts_.clear();
772
0
    PadInternalKeyWithMinTimestamp(&first_internal_key_with_ts_,
773
0
                                   decoded_value_.first_internal_key, ts_sz_);
774
0
    decoded_value_.first_internal_key = first_internal_key_with_ts_;
775
0
  }
776
10.9k
}
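A short, hedged sketch of consuming the values decoded above; it assumes an IndexBlockIter already initialized over an index block, and the loop shape is illustrative rather than taken from a specific caller:

// Illustrative only: each entry's value is an IndexValue whose handle points
// at a data block; first_internal_key is filled only when the index stores
// first keys, after the global-seqno / min-timestamp fix-ups above.
static void ScanIndexBlockSketch(IndexBlockIter* iter) {
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    const IndexValue v = iter->value();
    const BlockHandle& data_block_handle = v.handle;
    (void)data_block_handle;  // e.g. hand off to a data block read
  }
}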
777
778
template <class TValue>
779
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
780
                                               uint32_t index,
781
33.5k
                                               bool skip_linear_scan) {
782
  // SeekToRestartPoint() only does the lookup in the restart block. We need
783
  // to follow it up with NextImpl() to position the iterator at the restart
784
  // key.
785
33.5k
  SeekToRestartPoint(index);
786
33.5k
  cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
787
33.5k
  NextImpl();
788
789
33.5k
  if (!skip_linear_scan) {
790
    // Linear search (within restart block) for first key >= target
791
5.10k
    uint32_t max_offset;
792
5.10k
    if (index + 1 < num_restarts_) {
793
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
794
      // the next restart key is strictly greater than `target`, we can
795
      // terminate upon reaching it without any additional key comparison.
796
0
      max_offset = GetRestartPoint(index + 1);
797
5.10k
    } else {
798
      // We are in the last restart interval. The while-loop will terminate by
799
      // `Valid()` returning false upon advancing past the block's last key.
800
5.10k
      max_offset = std::numeric_limits<uint32_t>::max();
801
5.10k
    }
802
5.10k
    while (true) {
803
5.10k
      NextImpl();
804
5.10k
      if (!Valid()) {
805
        // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
806
        //  since Valid() will return false.
807
4.93k
        break;
808
4.93k
      }
809
175
      if (current_ == max_offset) {
810
0
        assert(CompareCurrentKey(target) > 0);
811
0
        break;
812
175
      } else if (CompareCurrentKey(target) >= 0) {
813
175
        break;
814
175
      }
815
175
    }
816
5.10k
  }
817
33.5k
}
rocksdb::BlockIter<rocksdb::Slice>::FindKeyAfterBinarySeek(rocksdb::Slice const&, unsigned int, bool)
Line
Count
Source
781
31.7k
                                               bool skip_linear_scan) {
782
  // SeekToRestartPoint() only does the lookup in the restart block. We need
783
  // to follow it up with NextImpl() to position the iterator at the restart
784
  // key.
785
31.7k
  SeekToRestartPoint(index);
786
31.7k
  cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
787
31.7k
  NextImpl();
788
789
31.7k
  if (!skip_linear_scan) {
790
    // Linear search (within restart block) for first key >= target
791
5.10k
    uint32_t max_offset;
792
5.10k
    if (index + 1 < num_restarts_) {
793
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
794
      // the next restart key is strictly greater than `target`, we can
795
      // terminate upon reaching it without any additional key comparison.
796
0
      max_offset = GetRestartPoint(index + 1);
797
5.10k
    } else {
798
      // We are in the last restart interval. The while-loop will terminate by
799
      // `Valid()` returning false upon advancing past the block's last key.
800
5.10k
      max_offset = std::numeric_limits<uint32_t>::max();
801
5.10k
    }
802
5.10k
    while (true) {
803
5.10k
      NextImpl();
804
5.10k
      if (!Valid()) {
805
        // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
806
        //  since Valid() will return false.
807
4.93k
        break;
808
4.93k
      }
809
175
      if (current_ == max_offset) {
810
0
        assert(CompareCurrentKey(target) > 0);
811
0
        break;
812
175
      } else if (CompareCurrentKey(target) >= 0) {
813
175
        break;
814
175
      }
815
175
    }
816
5.10k
  }
817
31.7k
}
rocksdb::BlockIter<rocksdb::IndexValue>::FindKeyAfterBinarySeek(rocksdb::Slice const&, unsigned int, bool)
Line
Count
Source
781
1.82k
                                               bool skip_linear_scan) {
782
  // SeekToRestartPoint() only does the lookup in the restart block. We need
783
  // to follow it up with NextImpl() to position the iterator at the restart
784
  // key.
785
1.82k
  SeekToRestartPoint(index);
786
1.82k
  cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
787
1.82k
  NextImpl();
788
789
1.82k
  if (!skip_linear_scan) {
790
    // Linear search (within restart block) for first key >= target
791
0
    uint32_t max_offset;
792
0
    if (index + 1 < num_restarts_) {
793
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
794
      // the next restart key is strictly greater than `target`, we can
795
      // terminate upon reaching it without any additional key comparison.
796
0
      max_offset = GetRestartPoint(index + 1);
797
0
    } else {
798
      // We are in the last restart interval. The while-loop will terminate by
799
      // `Valid()` returning false upon advancing past the block's last key.
800
0
      max_offset = std::numeric_limits<uint32_t>::max();
801
0
    }
802
0
    while (true) {
803
0
      NextImpl();
804
0
      if (!Valid()) {
805
        // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
806
        //  since Valid() will return false.
807
0
        break;
808
0
      }
809
0
      if (current_ == max_offset) {
810
0
        assert(CompareCurrentKey(target) > 0);
811
0
        break;
812
0
      } else if (CompareCurrentKey(target) >= 0) {
813
0
        break;
814
0
      }
815
0
    }
816
0
  }
817
1.82k
}
818
819
// Binary searches in restart array to find the starting restart point for the
820
// linear scan, and stores it in `*index`. Assumes restart array does not
821
// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
822
// is strictly greater than `target` or does not exist (this can be used to
823
// elide a comparison when linear scan reaches all the way to the next restart
824
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
825
// `*index`th restart key is the final result so that key does not need to be
826
// compared again later.
827
template <class TValue>
828
template <typename DecodeKeyFunc>
829
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
830
33.5k
                                   bool* skip_linear_scan) {
831
33.5k
  if (restarts_ == 0) {
832
    // SST files dedicated to range tombstones are written with index blocks
833
    // that have no keys while also having `num_restarts_ == 1`. This would
834
    // cause a problem for `BinarySeek()` as it'd try to access the first key
835
    // which does not exist. We identify such blocks by the offset at which
836
    // their restarts are stored, and return false to prevent any attempted
837
    // key accesses.
838
0
    return false;
839
0
  }
840
841
33.5k
  *skip_linear_scan = false;
842
  // Loop invariants:
843
  // - Restart key at index `left` is less than or equal to the target key. The
844
  //   sentinel index `-1` is considered to have a key that is less than all
845
  //   keys.
846
  // - Any restart keys after index `right` are strictly greater than the target
847
  //   key.
848
33.5k
  int64_t left = -1, right = num_restarts_ - 1;
849
85.1k
  while (left != right) {
850
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
851
51.6k
    int64_t mid = left + (right - left + 1) / 2;
852
51.6k
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
853
51.6k
    uint32_t shared, non_shared;
854
51.6k
    const char* key_ptr = DecodeKeyFunc()(
855
51.6k
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
856
51.6k
    if (key_ptr == nullptr || (shared != 0)) {
857
0
      CorruptionError();
858
0
      return false;
859
0
    }
860
51.6k
    Slice mid_key(key_ptr, non_shared);
861
51.6k
    UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
862
51.6k
    int cmp = CompareCurrentKey(target);
863
51.6k
    if (cmp < 0) {
864
      // Key at "mid" is smaller than "target". Therefore all
865
      // blocks before "mid" are uninteresting.
866
16.9k
      left = mid;
867
34.6k
    } else if (cmp > 0) {
868
      // Key at "mid" is >= "target". Therefore all blocks at or
869
      // after "mid" are uninteresting.
870
15.1k
      right = mid - 1;
871
19.4k
    } else {
872
19.4k
      *skip_linear_scan = true;
873
19.4k
      left = right = mid;
874
19.4k
    }
875
51.6k
  }
876
877
33.5k
  if (left == -1) {
878
    // All keys in the block were strictly greater than `target`. So the very
879
    // first key in the block is the final seek result.
880
8.91k
    *skip_linear_scan = true;
881
8.91k
    *index = 0;
882
24.6k
  } else {
883
24.6k
    *index = static_cast<uint32_t>(left);
884
24.6k
  }
885
33.5k
  return true;
886
33.5k
}
bool rocksdb::BlockIter<rocksdb::Slice>::BinarySeek<rocksdb::DecodeKey>(rocksdb::Slice const&, unsigned int*, bool*)
Line
Count
Source
830
31.7k
                                   bool* skip_linear_scan) {
831
31.7k
  if (restarts_ == 0) {
832
    // SST files dedicated to range tombstones are written with index blocks
833
    // that have no keys while also having `num_restarts_ == 1`. This would
834
    // cause a problem for `BinarySeek()` as it'd try to access the first key
835
    // which does not exist. We identify such blocks by the offset at which
836
    // their restarts are stored, and return false to prevent any attempted
837
    // key accesses.
838
0
    return false;
839
0
  }
840
841
31.7k
  *skip_linear_scan = false;
842
  // Loop invariants:
843
  // - Restart key at index `left` is less than or equal to the target key. The
844
  //   sentinel index `-1` is considered to have a key that is less than all
845
  //   keys.
846
  // - Any restart keys after index `right` are strictly greater than the target
847
  //   key.
848
31.7k
  int64_t left = -1, right = num_restarts_ - 1;
849
81.4k
  while (left != right) {
850
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
851
49.7k
    int64_t mid = left + (right - left + 1) / 2;
852
49.7k
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
853
49.7k
    uint32_t shared, non_shared;
854
49.7k
    const char* key_ptr = DecodeKeyFunc()(
855
49.7k
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
856
49.7k
    if (key_ptr == nullptr || (shared != 0)) {
857
0
      CorruptionError();
858
0
      return false;
859
0
    }
860
49.7k
    Slice mid_key(key_ptr, non_shared);
861
49.7k
    UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
862
49.7k
    int cmp = CompareCurrentKey(target);
863
49.7k
    if (cmp < 0) {
864
      // Key at "mid" is smaller than "target". Therefore all
865
      // blocks before "mid" are uninteresting.
866
16.9k
      left = mid;
867
32.8k
    } else if (cmp > 0) {
868
      // Key at "mid" is >= "target". Therefore all blocks at or
869
      // after "mid" are uninteresting.
870
14.8k
      right = mid - 1;
871
18.0k
    } else {
872
18.0k
      *skip_linear_scan = true;
873
18.0k
      left = right = mid;
874
18.0k
    }
875
49.7k
  }
876
877
31.7k
  if (left == -1) {
878
    // All keys in the block were strictly greater than `target`. So the very
879
    // first key in the block is the final seek result.
880
8.54k
    *skip_linear_scan = true;
881
8.54k
    *index = 0;
882
23.1k
  } else {
883
23.1k
    *index = static_cast<uint32_t>(left);
884
23.1k
  }
885
31.7k
  return true;
886
31.7k
}
bool rocksdb::BlockIter<rocksdb::IndexValue>::BinarySeek<rocksdb::DecodeKeyV4>(rocksdb::Slice const&, unsigned int*, bool*)
Line
Count
Source
830
1.82k
                                   bool* skip_linear_scan) {
831
1.82k
  if (restarts_ == 0) {
832
    // SST files dedicated to range tombstones are written with index blocks
833
    // that have no keys while also having `num_restarts_ == 1`. This would
834
    // cause a problem for `BinarySeek()` as it'd try to access the first key
835
    // which does not exist. We identify such blocks by the offset at which
836
    // their restarts are stored, and return false to prevent any attempted
837
    // key accesses.
838
0
    return false;
839
0
  }
840
841
1.82k
  *skip_linear_scan = false;
842
  // Loop invariants:
843
  // - Restart key at index `left` is less than or equal to the target key. The
844
  //   sentinel index `-1` is considered to have a key that is less than all
845
  //   keys.
846
  // - Any restart keys after index `right` are strictly greater than the target
847
  //   key.
848
1.82k
  int64_t left = -1, right = num_restarts_ - 1;
849
3.65k
  while (left != right) {
850
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
851
1.82k
    int64_t mid = left + (right - left + 1) / 2;
852
1.82k
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
853
1.82k
    uint32_t shared, non_shared;
854
1.82k
    const char* key_ptr = DecodeKeyFunc()(
855
1.82k
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
856
1.82k
    if (key_ptr == nullptr || (shared != 0)) {
857
0
      CorruptionError();
858
0
      return false;
859
0
    }
860
1.82k
    Slice mid_key(key_ptr, non_shared);
861
1.82k
    UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
862
1.82k
    int cmp = CompareCurrentKey(target);
863
1.82k
    if (cmp < 0) {
864
      // Key at "mid" is smaller than "target". Therefore all
865
      // blocks before "mid" are uninteresting.
866
0
      left = mid;
867
1.82k
    } else if (cmp > 0) {
868
      // Key at "mid" is >= "target". Therefore all blocks at or
869
      // after "mid" are uninteresting.
870
371
      right = mid - 1;
871
1.45k
    } else {
872
1.45k
      *skip_linear_scan = true;
873
1.45k
      left = right = mid;
874
1.45k
    }
875
1.82k
  }
876
877
1.82k
  if (left == -1) {
878
    // All keys in the block were strictly greater than `target`. So the very
879
    // first key in the block is the final seek result.
880
371
    *skip_linear_scan = true;
881
371
    *index = 0;
882
1.45k
  } else {
883
1.45k
    *index = static_cast<uint32_t>(left);
884
1.45k
  }
885
1.82k
  return true;
886
1.82k
}
Unexecuted instantiation: bool rocksdb::BlockIter<rocksdb::IndexValue>::BinarySeek<rocksdb::DecodeKey>(rocksdb::Slice const&, unsigned int*, bool*)
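To make the invariants above concrete, here is a standalone sketch of the same rounding-up binary search over restart keys, written against an already-decoded vector of keys purely for illustration (the real code decodes each probed restart key in place):

// Sketch: returns the restart index whose key is the largest one <= target,
// or 0 with *skip_linear_scan = true when every restart key is > target.
static uint32_t BinarySeekSketch(const std::vector<std::string>& restart_keys,
                                 const Slice& target, const Comparator* cmp,
                                 bool* skip_linear_scan) {
  *skip_linear_scan = false;
  // Invariants: keys[left] <= target (left == -1 is a sentinel key smaller
  // than everything); keys after right are strictly greater than target.
  int64_t left = -1, right = static_cast<int64_t>(restart_keys.size()) - 1;
  while (left != right) {
    // Round up so mid lands in (left, right] and the range always shrinks.
    int64_t mid = left + (right - left + 1) / 2;
    int cmp_result = cmp->Compare(Slice(restart_keys[mid]), target);
    if (cmp_result < 0) {
      left = mid;
    } else if (cmp_result > 0) {
      right = mid - 1;
    } else {
      *skip_linear_scan = true;  // exact match, no linear scan needed
      left = right = mid;
    }
  }
  if (left == -1) {
    *skip_linear_scan = true;  // every restart key is greater than target
    return 0;
  }
  return static_cast<uint32_t>(left);
}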
887
888
// Compare target key and the block key of the block of `block_index`.
889
// Return -1 if error.
890
0
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
891
0
  uint32_t region_offset = GetRestartPoint(block_index);
892
0
  uint32_t shared, non_shared;
893
0
  const char* key_ptr =
894
0
      value_delta_encoded_
895
0
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
896
0
                          &non_shared)
897
0
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
898
0
                        &non_shared);
899
0
  if (key_ptr == nullptr || (shared != 0)) {
900
0
    CorruptionError();
901
0
    return 1;  // Return target is smaller
902
0
  }
903
0
  Slice block_key(key_ptr, non_shared);
904
0
  UpdateRawKeyAndMaybePadMinTimestamp(block_key);
905
0
  return CompareCurrentKey(target);
906
0
}
907
908
// Binary search in block_ids to find the first block
909
// with a key >= target
910
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
911
                                          uint32_t* block_ids, uint32_t left,
912
                                          uint32_t right, uint32_t* index,
913
0
                                          bool* prefix_may_exist) {
914
0
  assert(left <= right);
915
0
  assert(index);
916
0
  assert(prefix_may_exist);
917
0
  *prefix_may_exist = true;
918
0
  uint32_t left_bound = left;
919
920
0
  while (left <= right) {
921
0
    uint32_t mid = (right + left) / 2;
922
923
0
    int cmp = CompareBlockKey(block_ids[mid], target);
924
0
    if (!status_.ok()) {
925
0
      return false;
926
0
    }
927
0
    if (cmp < 0) {
928
      // Key at "target" is larger than "mid". Therefore all
929
      // blocks before or at "mid" are uninteresting.
930
0
      left = mid + 1;
931
0
    } else {
932
      // Key at "target" is <= "mid". Therefore all blocks
933
      // after "mid" are uninteresting.
934
      // If there is only one block left, we found it.
935
0
      if (left == right) {
936
0
        break;
937
0
      }
938
0
      right = mid;
939
0
    }
940
0
  }
941
942
0
  if (left == right) {
943
    // In one of the two following cases:
944
    // (1) left is the first one of block_ids
945
    // (2) there is a gap of blocks between block of `left` and `left-1`.
946
    // we can further distinguish the case of key in the block or key not
947
    // existing, by comparing the target key and the key of the previous
948
    // block to the left of the block found.
949
0
    if (block_ids[left] > 0 &&
950
0
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
951
0
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
952
0
      current_ = restarts_;
953
0
      *prefix_may_exist = false;
954
0
      return false;
955
0
    }
956
957
0
    *index = block_ids[left];
958
0
    return true;
959
0
  } else {
960
0
    assert(left > right);
961
962
    // If the next block key is larger than seek key, it is possible that
963
    // no key shares the prefix with `target`, or all keys with the same
964
    // prefix as `target` are smaller than `target`. In the latter case,
965
    // we are mandated to set the position the same as the total order.
966
    // In the latter case, either:
967
    // (1) `target` falls into the range of the next block. In this case,
968
    //     we can place the iterator to the next block, or
969
    // (2) `target` is larger than all block keys. In this case we can
970
    //     keep the iterator invalidate without setting `prefix_may_exist`
971
    //     to false.
972
    // We might sometimes end up with setting the total order position
973
    // while there is no key sharing the prefix as `target`, but it
974
    // still follows the contract.
975
0
    uint32_t right_index = block_ids[right];
976
0
    assert(right_index + 1 <= num_restarts_);
977
0
    if (right_index + 1 < num_restarts_) {
978
0
      if (CompareBlockKey(right_index + 1, target) >= 0) {
979
0
        *index = right_index + 1;
980
0
        return true;
981
0
      } else {
982
        // We have to set the flag here because we are not positioning
983
        // the iterator to the total order position.
984
0
        *prefix_may_exist = false;
985
0
      }
986
0
    }
987
988
    // Mark iterator invalid
989
0
    current_ = restarts_;
990
0
    return false;
991
0
  }
992
0
}
993
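BinaryBlockIndexSeek above narrows `[left, right]` to the first candidate block whose key is >= target, then inspects the block just to the left of the match to decide whether the prefix can exist at all. The shrinking loop itself is an ordinary lower-bound search. A standalone sketch over plain integers, with a hypothetical `keys` array standing in for the restart-point keys:
#include <cassert>
#include <cstdint>
#include <vector>

// Lower-bound search mirroring the loop structure above: keep shrinking
// [left, right] until a single candidate whose key is >= target remains.
// Precondition: keys is non-empty and sorted ascending.
uint32_t FirstAtLeast(const std::vector<int>& keys, int target) {
  uint32_t left = 0;
  uint32_t right = static_cast<uint32_t>(keys.size()) - 1;
  while (left <= right) {
    uint32_t mid = (left + right) / 2;
    if (keys[mid] < target) {
      left = mid + 1;  // everything at or before mid is too small
    } else {
      if (left == right) {
        break;  // only one candidate left; it is the answer
      }
      right = mid;  // keep mid as a candidate
    }
  }
  return left;  // caller must still handle left > right (target too large)
}

int main() {
  std::vector<int> keys = {10, 20, 30, 40};
  assert(FirstAtLeast(keys, 25) == 2);
  assert(FirstAtLeast(keys, 40) == 3);
  assert(FirstAtLeast(keys, 50) == 4);  // past the end: left ran off the array
}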
994
bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
995
0
                                bool* prefix_may_exist) {
996
0
  assert(index);
997
0
  assert(prefix_may_exist);
998
0
  assert(prefix_index_);
999
0
  *prefix_may_exist = true;
1000
0
  Slice seek_key = target;
1001
0
  if (raw_key_.IsUserKey()) {
1002
0
    seek_key = ExtractUserKey(target);
1003
0
  }
1004
0
  uint32_t* block_ids = nullptr;
1005
0
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
1006
1007
0
  if (num_blocks == 0) {
1008
0
    current_ = restarts_;
1009
0
    *prefix_may_exist = false;
1010
0
    return false;
1011
0
  } else {
1012
0
    assert(block_ids);
1013
0
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
1014
0
                                prefix_may_exist);
1015
0
  }
1016
0
}
1017
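PrefixSeek above first asks the prebuilt BlockPrefixIndex for every restart block whose keys might share the target's prefix, and only then runs the binary search over that candidate set. Conceptually the index maps a prefix to a sorted list of restart-block ids. The toy stand-in below illustrates that idea only; the real BlockPrefixIndex uses a hashed bucket layout, and `ExtractPrefix` here is a hypothetical helper:
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical prefix extractor: first 3 bytes of the user key.
std::string ExtractPrefix(const std::string& key) { return key.substr(0, 3); }

class ToyPrefixIndex {
 public:
  // block_id is the restart-interval index a key with this prefix may live in.
  void Add(const std::string& key, uint32_t block_id) {
    auto& ids = buckets_[ExtractPrefix(key)];
    if (ids.empty() || ids.back() != block_id) {
      ids.push_back(block_id);  // stays sorted when keys are added in order
    }
  }
  // Candidate block ids for the target's prefix; an empty result is the case
  // where the caller can report prefix_may_exist == false immediately.
  const std::vector<uint32_t>& GetBlocks(const std::string& target) const {
    static const std::vector<uint32_t> kEmpty;
    auto it = buckets_.find(ExtractPrefix(target));
    return it == buckets_.end() ? kEmpty : it->second;
  }

 private:
  std::unordered_map<std::string, std::vector<uint32_t>> buckets_;
};

int main() {
  ToyPrefixIndex index;
  index.Add("app:1", 0);
  index.Add("app:9", 1);
  index.Add("bar:2", 2);
  assert(index.GetBlocks("app:5").size() == 2);  // binary-search blocks 0 and 1
  assert(index.GetBlocks("zzz:0").empty());      // prefix cannot exist
}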
1018
34.0k
uint32_t Block::NumRestarts() const {
1019
34.0k
  assert(size_ >= 2 * sizeof(uint32_t));
1020
34.0k
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
1021
34.0k
  uint32_t num_restarts = block_footer;
1022
34.0k
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
1023
    // In BlockBuilder, we have ensured a block with HashIndex is less than
1024
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
1025
    //
1026
    // Therefore, if we encounter a block with a size > 64KiB, the block
1027
    // cannot have HashIndex. So the footer will directly interpreted as
1028
    // num_restarts.
1029
    //
1030
    // Such check is for backward compatibility. We can ensure legacy block
1031
    // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
1032
    // correctly as no HashIndex even if the MSB of num_restarts is set.
1033
1.06k
    return num_restarts;
1034
1.06k
  }
1035
32.9k
  BlockBasedTableOptions::DataBlockIndexType index_type;
1036
32.9k
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
1037
32.9k
  return num_restarts;
1038
34.0k
}
1039
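NumRestarts() above and IndexType() below read the same 4-byte footer. For blocks small enough to carry a hash index, the footer packs the index type into the high bit and the restart count into the remaining 31 bits, which is why num_restarts must stay below 2^31 for such blocks. The sketch below assumes that single-high-bit layout; the real helpers are PackIndexTypeAndNumRestarts / UnPackIndexTypeAndNumRestarts in data_block_footer.cc:
#include <cassert>
#include <cstdint>

// Assumed layout: bit 31 set => block carries a hash index,
// bits 0..30 hold num_restarts. Sketch only.
constexpr uint32_t kIndexTypeBit = 1u << 31;

uint32_t PackFooter(bool has_hash_index, uint32_t num_restarts) {
  assert(num_restarts < kIndexTypeBit);  // must fit in 31 bits
  return (has_hash_index ? kIndexTypeBit : 0u) | num_restarts;
}

void UnpackFooter(uint32_t footer, bool* has_hash_index,
                  uint32_t* num_restarts) {
  *has_hash_index = (footer & kIndexTypeBit) != 0;
  *num_restarts = footer & ~kIndexTypeBit;
}

int main() {
  bool hash = false;
  uint32_t restarts = 0;
  UnpackFooter(PackFooter(true, 17), &hash, &restarts);
  assert(hash && restarts == 17);
  // A legacy block larger than 64 KiB may legitimately have bit 31 set in its
  // plain restart count, which is why NumRestarts() skips the unpacking for
  // oversized blocks and treats the footer as num_restarts directly.
}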
1040
34.0k
BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
1041
34.0k
  assert(size_ >= 2 * sizeof(uint32_t));
1042
34.0k
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
1043
    // The check is for the same reason as that in NumRestarts()
1044
1.06k
    return BlockBasedTableOptions::kDataBlockBinarySearch;
1045
1.06k
  }
1046
32.9k
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
1047
32.9k
  uint32_t num_restarts = block_footer;
1048
32.9k
  BlockBasedTableOptions::DataBlockIndexType index_type;
1049
32.9k
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
1050
32.9k
  return index_type;
1051
34.0k
}
1052
1053
34.0k
Block::~Block() {
1054
  // This sync point can be re-enabled if RocksDB can control the
1055
  // initialization order of any/all static options created by the user.
1056
  // TEST_SYNC_POINT("Block::~Block");
1057
34.0k
  delete[] kv_checksum_;
1058
34.0k
}
1059
1060
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
1061
             Statistics* statistics)
1062
    : contents_(std::move(contents)),
1063
      data_(contents_.data.data()),
1064
      size_(contents_.data.size()),
1065
      restart_offset_(0),
1066
34.0k
      num_restarts_(0) {
1067
34.0k
  TEST_SYNC_POINT("Block::Block:0");
1068
34.0k
  if (size_ < sizeof(uint32_t)) {
1069
0
    size_ = 0;  // Error marker
1070
34.0k
  } else {
1071
    // Should only decode restart points for uncompressed blocks
1072
34.0k
    num_restarts_ = NumRestarts();
1073
34.0k
    switch (IndexType()) {
1074
33.9k
      case BlockBasedTableOptions::kDataBlockBinarySearch:
1075
33.9k
        restart_offset_ = static_cast<uint32_t>(size_) -
1076
33.9k
                          (1 + num_restarts_) * sizeof(uint32_t);
1077
33.9k
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
1078
          // The size is too small for NumRestarts() and therefore
1079
          // restart_offset_ wrapped around.
1080
0
          size_ = 0;
1081
0
        }
1082
33.9k
        break;
1083
0
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
1084
0
        if (size_ < sizeof(uint32_t) /* block footer */ +
1085
0
                        sizeof(uint16_t) /* NUM_BUCK */) {
1086
0
          size_ = 0;
1087
0
          break;
1088
0
        }
1089
1090
0
        uint16_t map_offset;
1091
0
        data_block_hash_index_.Initialize(
1092
0
            data_, static_cast<uint16_t>(size_ - sizeof(uint32_t)), /*chop off
1093
                                                                NUM_RESTARTS*/
1094
0
            &map_offset);
1095
1096
0
        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
1097
1098
0
        if (restart_offset_ > map_offset) {
1099
          // map_offset is too small for NumRestarts() and
1100
          // therefore restart_offset_ wrapped around.
1101
0
          size_ = 0;
1102
0
          break;
1103
0
        }
1104
0
        break;
1105
0
      default:
1106
0
        size_ = 0;  // Error marker
1107
34.0k
    }
1108
34.0k
  }
1109
33.9k
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
1110
0
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
1111
0
        restart_offset_, read_amp_bytes_per_bit, statistics));
1112
0
  }
1113
33.9k
}
1114
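The constructor derives restart_offset_ by walking back from the end of the block: the footer word plus num_restarts_ restart offsets occupy the tail, so the restart array starts at size_ - (1 + num_restarts_) * sizeof(uint32_t). Because the arithmetic is unsigned, a num_restarts_ too large for the block makes the subtraction wrap around, which the `restart_offset_ > size_ - sizeof(uint32_t)` test catches. A small sketch of that check, with hypothetical sizes:
#include <cassert>
#include <cstdint>

// Returns the byte offset of the restart array, or 0 with *ok == false when
// num_restarts cannot fit in a block of block_size bytes. Sketch of the
// wrap-around detection used in the constructor above.
uint32_t RestartArrayOffset(uint32_t block_size, uint32_t num_restarts,
                            bool* ok) {
  uint32_t offset =
      block_size - (1 + num_restarts) * static_cast<uint32_t>(sizeof(uint32_t));
  // If num_restarts was bogus (e.g. read from a corrupt footer), the unsigned
  // subtraction wraps and offset lands beyond the last valid position.
  *ok = offset <= block_size - static_cast<uint32_t>(sizeof(uint32_t));
  return *ok ? offset : 0;
}

int main() {
  bool ok = false;
  // 100-byte block, 4 restart points: footer (4) + restart array (16) at the end.
  assert(RestartArrayOffset(100, 4, &ok) == 80 && ok);
  // A corrupt footer claiming 1000 restart points wraps around and is rejected.
  RestartArrayOffset(100, 1000, &ok);
  assert(!ok);
}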
1115
void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
1116
11.6k
                                              const Comparator* raw_ucmp) {
1117
11.6k
  protection_bytes_per_key_ = 0;
1118
11.6k
  if (protection_bytes_per_key > 0 && num_restarts_ > 0) {
1119
    // NewDataIterator() is called with protection_bytes_per_key_ = 0.
1120
    // This is intended since the checksums are not constructed yet.
1121
    //
1122
    // We do not know global_seqno yet, so checksum computation and
1123
    // verification all assume global_seqno = 0.
1124
    // TODO(yuzhangyu): handle the implication of padding timestamp for kv
1125
    // protection.
1126
0
    std::unique_ptr<DataBlockIter> iter{NewDataIterator(
1127
0
        raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */,
1128
0
        nullptr /* stats */, true /* block_contents_pinned */,
1129
0
        true /* user_defined_timestamps_persisted */)};
1130
0
    if (iter->status().ok()) {
1131
0
      block_restart_interval_ = iter->GetRestartInterval();
1132
0
    }
1133
0
    uint32_t num_keys = 0;
1134
0
    if (iter->status().ok()) {
1135
0
      num_keys = iter->NumberOfKeys(block_restart_interval_);
1136
0
    }
1137
0
    if (iter->status().ok()) {
1138
0
      checksum_size_ = num_keys * protection_bytes_per_key;
1139
0
      kv_checksum_ = new char[(size_t)checksum_size_];
1140
0
      size_t i = 0;
1141
0
      iter->SeekToFirst();
1142
0
      while (iter->Valid()) {
1143
0
        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
1144
0
                           iter->key(), iter->value());
1145
0
        iter->Next();
1146
0
        i += protection_bytes_per_key;
1147
0
      }
1148
0
      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
1149
0
    }
1150
0
    if (!iter->status().ok()) {
1151
0
      size_ = 0;  // Error marker
1152
0
      return;
1153
0
    }
1154
0
    protection_bytes_per_key_ = protection_bytes_per_key;
1155
0
  }
1156
11.6k
}
1157
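This routine, and the index and metaindex variants below, follow the same recipe: count the keys, allocate num_keys * protection_bytes_per_key bytes, then walk the block once and drop a fixed-width checksum of each key/value pair into the next slot. A simplified sketch of that buffer layout, using std::hash as a stand-in for the real GenerateKVChecksum:
#include <cassert>
#include <cstdint>
#include <cstring>
#include <functional>
#include <string>
#include <vector>

// Sketch: one fixed-width checksum slot per key/value pair, laid out
// contiguously in the order the iterator visits the entries.
std::vector<char> BuildKvChecksums(
    const std::vector<std::pair<std::string, std::string>>& kvs,
    uint8_t protection_bytes_per_key) {
  std::vector<char> buf(kvs.size() * protection_bytes_per_key);
  size_t i = 0;
  for (const auto& kv : kvs) {
    uint64_t h = std::hash<std::string>{}(kv.first + kv.second);
    // Truncate the 64-bit hash to the configured small slot width
    // (protection_bytes_per_key is assumed to be at most 8 here).
    std::memcpy(&buf[i], &h, protection_bytes_per_key);
    i += protection_bytes_per_key;
  }
  assert(i == buf.size());
  return buf;
}

int main() {
  auto checksums = BuildKvChecksums({{"k1", "v1"}, {"k2", "v2"}}, 4);
  assert(checksums.size() == 8);  // 2 keys * 4 bytes each
}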
1158
void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
1159
                                               const Comparator* raw_ucmp,
1160
                                               bool value_is_full,
1161
7.47k
                                               bool index_has_first_key) {
1162
7.47k
  protection_bytes_per_key_ = 0;
1163
7.47k
  if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
1164
    // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They
1165
    // do not impact how the index block is parsed. During checksum
1166
    // construction/verification, we use the entire key buffer from
1167
    // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value
1168
    // checksum, and the content of this buffer do not change for different
1169
    // values of `global_seqno` or `key_includes_seq`.
1170
    // TODO(yuzhangyu): handle the implication of padding timestamp for kv
1171
    // protection.
1172
0
    std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
1173
0
        raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr,
1174
0
        nullptr /* Statistics */, true /* total_order_seek */,
1175
0
        index_has_first_key /* have_first_key */, false /* key_includes_seq */,
1176
0
        value_is_full, true /* block_contents_pinned */,
1177
0
        true /* user_defined_timestamps_persisted*/,
1178
0
        nullptr /* prefix_index */)};
1179
0
    if (iter->status().ok()) {
1180
0
      block_restart_interval_ = iter->GetRestartInterval();
1181
0
    }
1182
0
    uint32_t num_keys = 0;
1183
0
    if (iter->status().ok()) {
1184
0
      num_keys = iter->NumberOfKeys(block_restart_interval_);
1185
0
    }
1186
0
    if (iter->status().ok()) {
1187
0
      checksum_size_ = num_keys * protection_bytes_per_key;
1188
0
      kv_checksum_ = new char[(size_t)checksum_size_];
1189
0
      iter->SeekToFirst();
1190
0
      size_t i = 0;
1191
0
      while (iter->Valid()) {
1192
0
        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
1193
0
                           iter->key(), iter->raw_value());
1194
0
        iter->Next();
1195
0
        i += protection_bytes_per_key;
1196
0
      }
1197
0
      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
1198
0
    }
1199
0
    if (!iter->status().ok()) {
1200
0
      size_ = 0;  // Error marker
1201
0
      return;
1202
0
    }
1203
0
    protection_bytes_per_key_ = protection_bytes_per_key;
1204
0
  }
1205
7.47k
}
1206
1207
void Block::InitializeMetaIndexBlockProtectionInfo(
1208
7.47k
    uint8_t protection_bytes_per_key) {
1209
7.47k
  protection_bytes_per_key_ = 0;
1210
7.47k
  if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
1211
0
    std::unique_ptr<MetaBlockIter> iter{
1212
0
        NewMetaIterator(true /* block_contents_pinned */)};
1213
0
    if (iter->status().ok()) {
1214
0
      block_restart_interval_ = iter->GetRestartInterval();
1215
0
    }
1216
0
    uint32_t num_keys = 0;
1217
0
    if (iter->status().ok()) {
1218
0
      num_keys = iter->NumberOfKeys(block_restart_interval_);
1219
0
    }
1220
0
    if (iter->status().ok()) {
1221
0
      checksum_size_ = num_keys * protection_bytes_per_key;
1222
0
      kv_checksum_ = new char[(size_t)checksum_size_];
1223
0
      iter->SeekToFirst();
1224
0
      size_t i = 0;
1225
0
      while (iter->Valid()) {
1226
0
        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
1227
0
                           iter->key(), iter->value());
1228
0
        iter->Next();
1229
0
        i += protection_bytes_per_key;
1230
0
      }
1231
0
      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
1232
0
    }
1233
0
    if (!iter->status().ok()) {
1234
0
      size_ = 0;  // Error marker
1235
0
      return;
1236
0
    }
1237
0
    protection_bytes_per_key_ = protection_bytes_per_key;
1238
0
  }
1239
7.47k
}
1240
1241
14.9k
MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
1242
14.9k
  MetaBlockIter* iter = new MetaBlockIter();
1243
14.9k
  if (size_ < 2 * sizeof(uint32_t)) {
1244
0
    iter->Invalidate(Status::Corruption("bad block contents"));
1245
0
    return iter;
1246
14.9k
  } else if (num_restarts_ == 0) {
1247
    // Empty block.
1248
0
    iter->Invalidate(Status::OK());
1249
14.9k
  } else {
1250
14.9k
    iter->Initialize(data_, restart_offset_, num_restarts_,
1251
14.9k
                     block_contents_pinned, protection_bytes_per_key_,
1252
14.9k
                     kv_checksum_, block_restart_interval_);
1253
14.9k
  }
1254
14.9k
  return iter;
1255
14.9k
}
1256
1257
DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
1258
                                      SequenceNumber global_seqno,
1259
                                      DataBlockIter* iter, Statistics* stats,
1260
                                      bool block_contents_pinned,
1261
13.8k
                                      bool user_defined_timestamps_persisted) {
1262
13.8k
  DataBlockIter* ret_iter;
1263
13.8k
  if (iter != nullptr) {
1264
13.8k
    ret_iter = iter;
1265
13.8k
  } else {
1266
0
    ret_iter = new DataBlockIter;
1267
0
  }
1268
13.8k
  if (size_ < 2 * sizeof(uint32_t)) {
1269
0
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
1270
0
    return ret_iter;
1271
0
  }
1272
13.8k
  if (num_restarts_ == 0) {
1273
    // Empty block.
1274
0
    ret_iter->Invalidate(Status::OK());
1275
0
    return ret_iter;
1276
13.8k
  } else {
1277
13.8k
    ret_iter->Initialize(
1278
13.8k
        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
1279
13.8k
        read_amp_bitmap_.get(), block_contents_pinned,
1280
13.8k
        user_defined_timestamps_persisted,
1281
13.8k
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
1282
13.8k
        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
1283
13.8k
    if (read_amp_bitmap_) {
1284
0
      if (read_amp_bitmap_->GetStatistics() != stats) {
1285
        // DB changed the Statistics pointer; we need to notify read_amp_bitmap_
1286
0
        read_amp_bitmap_->SetStatistics(stats);
1287
0
      }
1288
0
    }
1289
13.8k
  }
1290
1291
13.8k
  return ret_iter;
1292
13.8k
}
1293
1294
IndexBlockIter* Block::NewIndexIterator(
1295
    const Comparator* raw_ucmp, SequenceNumber global_seqno,
1296
    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
1297
    bool have_first_key, bool key_includes_seq, bool value_is_full,
1298
    bool block_contents_pinned, bool user_defined_timestamps_persisted,
1299
13.5k
    BlockPrefixIndex* prefix_index) {
1300
13.5k
  IndexBlockIter* ret_iter;
1301
13.5k
  if (iter != nullptr) {
1302
98
    ret_iter = iter;
1303
13.4k
  } else {
1304
13.4k
    ret_iter = new IndexBlockIter;
1305
13.4k
  }
1306
13.5k
  if (size_ < 2 * sizeof(uint32_t)) {
1307
0
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
1308
0
    return ret_iter;
1309
0
  }
1310
13.5k
  if (num_restarts_ == 0) {
1311
    // Empty block.
1312
0
    ret_iter->Invalidate(Status::OK());
1313
0
    return ret_iter;
1314
13.5k
  } else {
1315
13.5k
    BlockPrefixIndex* prefix_index_ptr =
1316
13.5k
        total_order_seek ? nullptr : prefix_index;
1317
13.5k
    ret_iter->Initialize(
1318
13.5k
        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
1319
13.5k
        prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
1320
13.5k
        block_contents_pinned, user_defined_timestamps_persisted,
1321
13.5k
        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
1322
13.5k
  }
1323
1324
13.5k
  return ret_iter;
1325
13.5k
}
1326
1327
11.6k
size_t Block::ApproximateMemoryUsage() const {
1328
11.6k
  size_t usage = usable_size();
1329
11.6k
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
1330
11.6k
  usage += malloc_usable_size((void*)this);
1331
#else
1332
  usage += sizeof(*this);
1333
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
1334
11.6k
  if (read_amp_bitmap_) {
1335
0
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
1336
0
  }
1337
11.6k
  usage += checksum_size_;
1338
11.6k
  return usage;
1339
11.6k
}
1340
1341
}  // namespace ROCKSDB_NAMESPACE
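ApproximateMemoryUsage() prefers malloc_usable_size over sizeof(*this) when ROCKSDB_MALLOC_USABLE_SIZE is defined, because the allocator may have handed out a larger chunk than was requested and that slack is real memory held by the Block. A minimal illustration of the difference; malloc_usable_size is a glibc extension, and other platforms expose equivalents such as malloc_size on macOS:
#include <cstdio>
#include <cstdlib>
#include <malloc.h>  // malloc_usable_size (glibc)

int main() {
  // Ask for 100 bytes; the allocator typically rounds up to its bucket size,
  // so the usable size reported here is often larger than 100.
  void* p = std::malloc(100);
  std::printf("requested: 100, usable: %zu\n", malloc_usable_size(p));
  std::free(p);
  return 0;
}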