Coverage Report

Created: 2026-05-16 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/table/merging_iterator.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#include "table/merging_iterator.h"
11
12
#include "db/arena_wrapped_db_iter.h"
13
#include "monitoring/file_read_sample.h"
14
15
namespace ROCKSDB_NAMESPACE {
16
// MergingIterator uses a min/max heap to combine data from point iterators.
17
// Range tombstones can be added and keys covered by range tombstones will be
18
// skipped.
19
//
20
// The following are implementation details and can be ignored by user.
21
// For merging iterator to process range tombstones, it treats the start and end
22
// keys of a range tombstone as two keys and put them into minHeap_ or maxHeap_
23
// together with regular point keys. Each range tombstone is active only within
24
// its internal key range [start_key, end_key). An `active_` set is used to
25
// track levels that have an active range tombstone. Take forward scanning
26
// for example. Level j is in active_ if its current range tombstone has its
27
// start_key popped from minHeap_ and its end_key in minHeap_. If the top of
28
// minHeap_ is a point key from level L, we can determine if the point key is
29
// covered by any range tombstone by checking if there is an l <= L in active_.
30
// The case of l == L also involves checking range tombstone's sequence number.
31
//
32
// The following (non-exhaustive) list of invariants are maintained by
33
// MergingIterator during forward scanning. After each InternalIterator API,
34
// i.e., Seek*() and Next(), and FindNextVisibleKey(), if minHeap_ is not empty:
35
// (1) minHeap_.top().type == ITERATOR
36
// (2) minHeap_.top()->key() is not covered by any range tombstone.
37
//
38
// After each call to SeekImpl() in addition to the functions mentioned above:
39
// (3) For all level i and j <= i, range_tombstone_iters_[j].prev.end_key() <
40
// children_[i].iter.key(). That is, range_tombstone_iters_[j] is at or before
41
// the first range tombstone from level j with end_key() >
42
// children_[i].iter.key().
43
// (4) For all level i and j <= i, if j in active_, then
44
// range_tombstone_iters_[j]->start_key() < children_[i].iter.key().
45
// - When range_tombstone_iters_[j] is !Valid(), we consider its `prev` to be
46
// the last range tombstone from that range tombstone iterator.
47
// - When referring to range tombstone start/end keys, assume it is the value of
48
// HeapItem::tombstone_pik. This value has op_type = kMaxValid, which makes
49
// range tombstone keys have distinct values from point keys.
50
//
51
// Applicable class variables have their own (forward scanning) invariants
52
// listed in the comments above their definition.
53
class MergingIterator : public InternalIterator {
54
 public:
55
  MergingIterator(const InternalKeyComparator* comparator,
56
                  InternalIterator** children, int n, bool is_arena_mode,
57
                  bool prefix_seek_mode,
58
                  const Slice* iterate_upper_bound = nullptr)
59
20.3k
      : is_arena_mode_(is_arena_mode),
60
20.3k
        prefix_seek_mode_(prefix_seek_mode),
61
20.3k
        direction_(kForward),
62
20.3k
        comparator_(comparator),
63
20.3k
        current_(nullptr),
64
20.3k
        minHeap_(MinHeapItemComparator(comparator_)),
65
20.3k
        pinned_iters_mgr_(nullptr),
66
20.3k
        iterate_upper_bound_(iterate_upper_bound) {
67
20.3k
    children_.resize(n);
68
20.3k
    for (int i = 0; i < n; i++) {
69
0
      children_[i].level = i;
70
0
      children_[i].iter.Set(children[i]);
71
0
    }
72
20.3k
  }
73
74
28.2k
  void considerStatus(Status s) {
75
28.2k
    if (!s.ok() && status_.ok()) {
76
0
      status_ = s;
77
0
    }
78
28.2k
  }
79
80
38.1k
  virtual void AddIterator(InternalIterator* iter) {
81
38.1k
    children_.emplace_back(children_.size(), iter);
82
38.1k
    if (pinned_iters_mgr_) {
83
0
      iter->SetPinnedItersMgr(pinned_iters_mgr_);
84
0
    }
85
    // Invalidate to ensure `Seek*()` is called to construct the heaps before
86
    // use.
87
38.1k
    current_ = nullptr;
88
38.1k
  }
89
90
  // There must be either no range tombstone iterator or the same number of
91
  // range tombstone iterators as point iterators after all iters are added.
92
  // The i-th added range tombstone iterator and the i-th point iterator
93
  // must point to the same LSM level.
94
  // Merging iterator takes ownership of `iter` and is responsible for freeing
95
  // it. One exception to this is when a LevelIterator moves to a different SST
96
  // file or when Iterator::Refresh() is called, the range tombstone iterator
97
  // could be updated. In that case, this merging iterator is only responsible
98
  // for freeing the new range tombstone iterator that it has pointers to in
99
  // range_tombstone_iters_.
100
  void AddRangeTombstoneIterator(
101
24.3k
      std::unique_ptr<TruncatedRangeDelIterator>&& iter) {
102
24.3k
    range_tombstone_iters_.emplace_back(std::move(iter));
103
24.3k
  }
104
105
  // Called by MergingIteratorBuilder when all point iterators and range
106
  // tombstone iterators are added. Initializes HeapItems for range tombstone
107
  // iterators.
108
13.2k
  void Finish() {
109
13.2k
    if (!range_tombstone_iters_.empty()) {
110
8.36k
      assert(range_tombstone_iters_.size() == children_.size());
111
8.36k
      pinned_heap_item_.resize(range_tombstone_iters_.size());
112
32.7k
      for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
113
24.3k
        pinned_heap_item_[i].level = i;
114
        // Range tombstone end key is exclusive. If a point internal key has the
115
        // same user key and sequence number as the start or end key of a range
116
        // tombstone, the order will be start < end key < internal key with the
117
        // following op_type change. This is helpful to ensure keys popped from
118
        // heap are in expected order since range tombstone start/end keys will
119
        // be distinct from point internal keys. Strictly speaking, this is only
120
        // needed for tombstone end points that are truncated in
121
        // TruncatedRangeDelIterator since untruncated tombstone end points
122
        // always have kMaxSequenceNumber and kTypeRangeDeletion (see
123
        // TruncatedRangeDelIterator::start_key()/end_key()).
124
24.3k
        pinned_heap_item_[i].tombstone_pik.type = kTypeMaxValid;
125
24.3k
      }
126
8.36k
    }
127
13.2k
  }
128
129
20.3k
  ~MergingIterator() override {
130
20.3k
    range_tombstone_iters_.clear();
131
132
38.1k
    for (auto& child : children_) {
133
38.1k
      child.iter.DeleteIter(is_arena_mode_);
134
38.1k
    }
135
20.3k
    status_.PermitUncheckedError();
136
20.3k
  }
137
138
0
  void SetRangeDelReadSeqno(SequenceNumber read_seqno) override {
139
0
    for (auto& child : children_) {
140
      // This should only be needed for LevelIterator (iterators from L1+).
141
0
      child.iter.SetRangeDelReadSeqno(read_seqno);
142
0
    }
143
0
    for (auto& child : range_tombstone_iters_) {
144
0
      if (child) {
145
0
        child->SetRangeDelReadSeqno(read_seqno);
146
0
      }
147
0
    }
148
0
  }
149
150
70.4k
  bool Valid() const override { return current_ != nullptr && status_.ok(); }
151
152
7.53k
  Status status() const override { return status_; }
153
154
  // Add range_tombstone_iters_[level] into min heap.
155
  // Updates active_ if the end key of a range tombstone is inserted.
156
  // pinned_heap_items_[level].type is updated based on `start_key`.
157
  //
158
  // If range_tombstone_iters_[level] is after iterate_upper_bound_,
159
  // it is removed from the heap.
160
  // @param start_key specifies which end point of the range tombstone to add.
161
  void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
162
142k
                                     bool replace_top = false) {
163
142k
    assert(!range_tombstone_iters_.empty() &&
164
142k
           range_tombstone_iters_[level]->Valid());
165
    // Maintains Invariant(phi)
166
142k
    if (start_key) {
167
71.3k
      pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START;
168
71.3k
      ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
169
      // iterate_upper_bound does not have timestamp
170
71.3k
      if (iterate_upper_bound_ &&
171
0
          comparator_->user_comparator()->CompareWithoutTimestamp(
172
0
              pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
173
0
              false /* b_has_ts */) >= 0) {
174
0
        if (replace_top) {
175
          // replace_top implies this range tombstone iterator is still in
176
          // minHeap_ and at the top.
177
0
          minHeap_.pop();
178
0
        }
179
0
        return;
180
0
      }
181
71.3k
      pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
182
      // Checks Invariant(active_)
183
71.3k
      assert(active_.count(level) == 0);
184
71.3k
    } else {
185
      // allow end key to go over upper bound (if present) since start key is
186
      // before upper bound and the range tombstone could still cover a
187
      // range before upper bound.
188
      // Maintains Invariant(active_)
189
71.3k
      pinned_heap_item_[level].SetTombstoneKey(
190
71.3k
          range_tombstone_iters_[level]->end_key());
191
71.3k
      pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END;
192
71.3k
      active_.insert(level);
193
71.3k
    }
194
142k
    if (replace_top) {
195
140k
      minHeap_.replace_top(&pinned_heap_item_[level]);
196
140k
    } else {
197
2.55k
      minHeap_.push(&pinned_heap_item_[level]);
198
2.55k
    }
199
142k
  }
200
201
  // Add range_tombstone_iters_[level] into max heap.
202
  // Updates active_ if the start key of a range tombstone is inserted.
203
  // @param end_key specifies which end point of the range tombstone to add.
204
  void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true,
205
0
                                     bool replace_top = false) {
206
0
    assert(!range_tombstone_iters_.empty() &&
207
0
           range_tombstone_iters_[level]->Valid());
208
0
    if (end_key) {
209
0
      pinned_heap_item_[level].SetTombstoneKey(
210
0
          range_tombstone_iters_[level]->end_key());
211
0
      pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END;
212
0
      assert(active_.count(level) == 0);
213
0
    } else {
214
0
      pinned_heap_item_[level].SetTombstoneKey(
215
0
          range_tombstone_iters_[level]->start_key());
216
0
      pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START;
217
0
      active_.insert(level);
218
0
    }
219
0
    if (replace_top) {
220
0
      maxHeap_->replace_top(&pinned_heap_item_[level]);
221
0
    } else {
222
0
      maxHeap_->push(&pinned_heap_item_[level]);
223
0
    }
224
0
  }
225
226
  // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START
227
  // until minHeap_ is empty or the top of the minHeap_ is not of type
228
  // DELETE_RANGE_START. Each such item means a range tombstone becomes active,
229
  // so `active_` is updated accordingly.
230
115k
  void PopDeleteRangeStart() {
231
187k
    while (!minHeap_.empty() &&
232
180k
           minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_START) {
233
71.3k
      TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
234
      // Invariant(rti) holds since
235
      // range_tombstone_iters_[minHeap_.top()->level] is still valid, and
236
      // parameter `replace_top` is set to true here to ensure only one such
237
      // HeapItem is in minHeap_.
238
71.3k
      InsertRangeTombstoneToMinHeap(
239
71.3k
          minHeap_.top()->level, false /* start_key */, true /* replace_top */);
240
71.3k
    }
241
115k
  }
242
243
  // Remove HeapItems from top of maxHeap_ that are of type DELETE_RANGE_END
244
  // until maxHeap_ is empty or the top of the maxHeap_ is not of type
245
  // DELETE_RANGE_END. Each such item means a range tombstone becomes active,
246
  // so `active_` is updated accordingly.
247
19.9k
  void PopDeleteRangeEnd() {
248
19.9k
    while (!maxHeap_->empty() &&
249
16.5k
           maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_END) {
250
      // insert start key of this range tombstone and updates active_
251
0
      InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
252
0
                                    true /* replace_top */);
253
0
    }
254
19.9k
  }
255
256
6.36k
  void SeekToFirst() override {
257
6.36k
    ClearHeaps();
258
6.36k
    status_ = Status::OK();
259
16.1k
    for (auto& child : children_) {
260
16.1k
      child.iter.SeekToFirst();
261
16.1k
      AddToMinHeapOrCheckStatus(&child);
262
16.1k
    }
263
264
17.3k
    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
265
10.9k
      if (range_tombstone_iters_[i]) {
266
2.55k
        range_tombstone_iters_[i]->SeekToFirst();
267
2.55k
        if (range_tombstone_iters_[i]->Valid()) {
268
          // It is possible to be invalid due to snapshots.
269
2.55k
          InsertRangeTombstoneToMinHeap(i);
270
2.55k
        }
271
2.55k
      }
272
10.9k
    }
273
6.36k
    FindNextVisibleKey();
274
6.36k
    direction_ = kForward;
275
6.36k
    current_ = CurrentForward();
276
6.36k
  }
277
278
0
  void SeekToLast() override {
279
0
    ClearHeaps();
280
0
    InitMaxHeap();
281
0
    status_ = Status::OK();
282
0
    for (auto& child : children_) {
283
0
      child.iter.SeekToLast();
284
0
      AddToMaxHeapOrCheckStatus(&child);
285
0
    }
286
287
0
    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
288
0
      if (range_tombstone_iters_[i]) {
289
0
        range_tombstone_iters_[i]->SeekToLast();
290
0
        if (range_tombstone_iters_[i]->Valid()) {
291
          // It is possible to be invalid due to snapshots.
292
0
          InsertRangeTombstoneToMaxHeap(i);
293
0
        }
294
0
      }
295
0
    }
296
0
    FindPrevVisibleKey();
297
0
    direction_ = kReverse;
298
0
    current_ = CurrentReverse();
299
0
  }
300
301
  // Position this merging iterator at the first key >= target (internal key).
302
  // If range tombstones are present, keys covered by range tombstones are
303
  // skipped, and this merging iter points to the first non-range-deleted key >=
304
  // target after Seek(). If !Valid() and status().ok() then this iterator
305
  // reaches the end.
306
  //
307
  // If range tombstones are present, cascading seeks may be called (an
308
  // optimization adapted from Pebble https://github.com/cockroachdb/pebble).
309
  // Roughly, if there is a range tombstone [start, end) that covers the
310
  // target user key at level L, then this range tombstone must cover the range
311
  // [target key, end) in all levels > L. So for all levels > L, we can pretend
312
  // the target key is `end`. This optimization is applied at each level and
313
  // hence the name "cascading seek".
314
668
  void Seek(const Slice& target) override {
315
    // Define LevelNextVisible(i, k) to be the first key >= k in level i that is
316
    // not covered by any range tombstone.
317
    // After SeekImpl(target, 0), invariants (3) and (4) hold.
318
    // For all level i, target <= children_[i].iter.key() <= LevelNextVisible(i,
319
    // target). By the contract of FindNextVisibleKey(), Invariants (1)-(4)
320
    // holds after this call, and minHeap_.top().iter points to the
321
    // first key >= target among children_ that is not covered by any range
322
    // tombstone.
323
668
    status_ = Status::OK();
324
668
    SeekImpl(target);
325
668
    FindNextVisibleKey();
326
327
668
    direction_ = kForward;
328
668
    {
329
668
      PERF_TIMER_GUARD(seek_min_heap_time);
330
668
      current_ = CurrentForward();
331
668
    }
332
668
  }
333
334
3.46k
  void SeekForPrev(const Slice& target) override {
335
3.46k
    assert(range_tombstone_iters_.empty() ||
336
3.46k
           range_tombstone_iters_.size() == children_.size());
337
3.46k
    status_ = Status::OK();
338
3.46k
    SeekForPrevImpl(target);
339
3.46k
    FindPrevVisibleKey();
340
341
3.46k
    direction_ = kReverse;
342
3.46k
    {
343
3.46k
      PERF_TIMER_GUARD(seek_max_heap_time);
344
3.46k
      current_ = CurrentReverse();
345
3.46k
    }
346
3.46k
  }
347
348
34.0k
  void Next() override {
349
34.0k
    assert(Valid());
350
    // Ensure that all children are positioned after key().
351
    // If we are moving in the forward direction, it is already
352
    // true for all the non-current children since current_ is
353
    // the smallest child and key() == current_->key().
354
34.0k
    if (direction_ != kForward) {
355
      // The loop advanced all non-current children to be > key() so current_
356
      // should still be strictly the smallest key.
357
0
      SwitchToForward();
358
0
    }
359
360
    // For the heap modifications below to be correct, current_ must be the
361
    // current top of the heap.
362
34.0k
    assert(current_ == CurrentForward());
363
    // as the current points to the current record. move the iterator forward.
364
34.0k
    current_->Next();
365
34.0k
    if (current_->Valid()) {
366
      // current is still valid after the Next() call above.  Call
367
      // replace_top() to restore the heap property.  When the same child
368
      // iterator yields a sequence of keys, this is cheap.
369
25.8k
      assert(current_->status().ok());
370
25.8k
      minHeap_.replace_top(minHeap_.top());
371
25.8k
    } else {
372
      // current stopped being valid, remove it from the heap.
373
8.20k
      considerStatus(current_->status());
374
8.20k
      minHeap_.pop();
375
8.20k
    }
376
    // Invariants (3) and (4) hold when after advancing current_.
377
    // Let k be the smallest key among children_[i].iter.key().
378
    // k <= children_[i].iter.key() <= LevelNextVisible(i, k) holds for all
379
    // level i. After FindNextVisible(), Invariants (1)-(4) hold and
380
    // minHeap_.top()->key() is the first key >= k from any children_ that is
381
    // not covered by any range tombstone.
382
34.0k
    FindNextVisibleKey();
383
34.0k
    current_ = CurrentForward();
384
34.0k
  }
385
386
34.0k
  bool NextAndGetResult(IterateResult* result) override {
387
34.0k
    Next();
388
34.0k
    bool is_valid = Valid();
389
34.0k
    if (is_valid) {
390
28.5k
      result->key = key();
391
28.5k
      result->bound_check_result = UpperBoundCheckResult();
392
28.5k
      result->value_prepared = current_->IsValuePrepared();
393
28.5k
    }
394
34.0k
    return is_valid;
395
34.0k
  }
396
397
12.5k
  void Prev() override {
398
12.5k
    assert(Valid());
399
    // Ensure that all children are positioned before key().
400
    // If we are moving in the reverse direction, it is already
401
    // true for all the non-current children since current_ is
402
    // the largest child and key() == current_->key().
403
12.5k
    if (direction_ != kReverse) {
404
      // Otherwise, retreat the non-current children.  We retreat current_
405
      // just after the if-block.
406
506
      SwitchToBackward();
407
506
    }
408
409
    // For the heap modifications below to be correct, current_ must be the
410
    // current top of the heap.
411
12.5k
    assert(current_ == CurrentReverse());
412
12.5k
    current_->Prev();
413
12.5k
    if (current_->Valid()) {
414
      // current is still valid after the Prev() call above.  Call
415
      // replace_top() to restore the heap property.  When the same child
416
      // iterator yields a sequence of keys, this is cheap.
417
7.57k
      assert(current_->status().ok());
418
7.57k
      maxHeap_->replace_top(maxHeap_->top());
419
7.57k
    } else {
420
      // current stopped being valid, remove it from the heap.
421
4.97k
      considerStatus(current_->status());
422
4.97k
      maxHeap_->pop();
423
4.97k
    }
424
12.5k
    FindPrevVisibleKey();
425
12.5k
    current_ = CurrentReverse();
426
12.5k
  }
427
428
65.2k
  Slice key() const override {
429
65.2k
    assert(Valid());
430
65.2k
    return current_->key();
431
65.2k
  }
432
433
30.9k
  uint64_t write_unix_time() const override {
434
30.9k
    assert(Valid());
435
30.9k
    return current_->write_unix_time();
436
30.9k
  }
437
438
30.9k
  Slice value() const override {
439
30.9k
    assert(Valid());
440
30.9k
    return current_->value();
441
30.9k
  }
442
443
17.3k
  bool PrepareValue() override {
444
17.3k
    assert(Valid());
445
17.3k
    if (current_->PrepareValue()) {
446
17.3k
      return true;
447
17.3k
    }
448
449
0
    considerStatus(current_->status());
450
0
    assert(!status_.ok());
451
0
    return false;
452
17.3k
  }
453
454
  // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
455
  // from current child iterator. Potentially as long as one of child iterator
456
  // report out of bound is not possible, we know current key is within bound.
457
0
  bool MayBeOutOfLowerBound() override {
458
0
    assert(Valid());
459
0
    return current_->MayBeOutOfLowerBound();
460
0
  }
461
462
28.5k
  IterBoundCheck UpperBoundCheckResult() override {
463
28.5k
    assert(Valid());
464
28.5k
    return current_->UpperBoundCheckResult();
465
28.5k
  }
466
467
13.2k
  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
468
13.2k
    pinned_iters_mgr_ = pinned_iters_mgr;
469
38.1k
    for (auto& child : children_) {
470
38.1k
      child.iter.SetPinnedItersMgr(pinned_iters_mgr);
471
38.1k
    }
472
13.2k
  }
473
474
8.85k
  bool IsKeyPinned() const override {
475
8.85k
    assert(Valid());
476
8.85k
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
477
79
           current_->IsKeyPinned();
478
8.85k
  }
479
480
7.77k
  bool IsValuePinned() const override {
481
7.77k
    assert(Valid());
482
7.77k
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
483
7.77k
           current_->IsValuePinned();
484
7.77k
  }
485
486
0
  void Prepare(const MultiScanArgs* scan_opts) override {
487
0
    for (auto& child : children_) {
488
0
      child.iter.Prepare(scan_opts);
489
0
    }
490
0
  }
491
492
 private:
493
  // Represents an element in the min/max heap. Each HeapItem corresponds to a
494
  // point iterator or a range tombstone iterator, differentiated by
495
  // HeapItem::type.
496
  struct HeapItem {
497
24.3k
    HeapItem() = default;
498
499
    // corresponding point iterator
500
    IteratorWrapper iter;
501
    size_t level = 0;
502
    // corresponding range tombstone iterator's start or end key value
503
    // depending on value of `type`.
504
    ParsedInternalKey tombstone_pik;
505
    // Will be overwritten before use, initialize here so compiler does not
506
    // complain.
507
    enum class Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
508
    Type type = Type::ITERATOR;
509
510
    explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
511
38.1k
        : level(_level), type(Type::ITERATOR) {
512
38.1k
      iter.Set(_iter);
513
38.1k
    }
514
515
142k
    void SetTombstoneKey(ParsedInternalKey&& pik) {
516
      // op_type is already initialized in MergingIterator::Finish().
517
142k
      tombstone_pik.user_key = pik.user_key;
518
142k
      tombstone_pik.sequence = pik.sequence;
519
142k
    }
520
  };
521
522
  class MinHeapItemComparator {
523
   public:
524
    explicit MinHeapItemComparator(const InternalKeyComparator* comparator)
525
20.3k
        : comparator_(comparator) {}
526
527
111k
    bool operator()(HeapItem* a, HeapItem* b) const {
528
111k
      if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
529
28.8k
        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
530
14.5k
          return comparator_->Compare(a->iter.key(), b->iter.key()) > 0;
531
14.5k
        } else {
532
14.2k
          return comparator_->Compare(a->iter.key(), b->tombstone_pik) > 0;
533
14.2k
        }
534
82.5k
      } else {
535
82.5k
        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
536
82.5k
          return comparator_->Compare(a->tombstone_pik, b->iter.key()) > 0;
537
82.5k
        } else {
538
0
          return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) > 0;
539
0
        }
540
82.5k
      }
541
111k
    }
542
543
   private:
544
    const InternalKeyComparator* comparator_;
545
  };
546
547
  class MaxHeapItemComparator {
548
   public:
549
    explicit MaxHeapItemComparator(const InternalKeyComparator* comparator)
550
3.46k
        : comparator_(comparator) {}
551
552
14.7k
    bool operator()(HeapItem* a, HeapItem* b) const {
553
14.7k
      if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
554
14.7k
        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
555
14.7k
          return comparator_->Compare(a->iter.key(), b->iter.key()) < 0;
556
14.7k
        } else {
557
0
          return comparator_->Compare(a->iter.key(), b->tombstone_pik) < 0;
558
0
        }
559
14.7k
      } else {
560
0
        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
561
0
          return comparator_->Compare(a->tombstone_pik, b->iter.key()) < 0;
562
0
        } else {
563
0
          return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) < 0;
564
0
        }
565
0
      }
566
14.7k
    }
567
568
   private:
569
    const InternalKeyComparator* comparator_;
570
  };
571
572
  using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
573
  using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
574
575
  friend class MergeIteratorBuilder;
576
  // Clears heaps for both directions, used when changing direction or seeking
577
  void ClearHeaps(bool clear_active = true);
578
  // Ensures that maxHeap_ is initialized when starting to go in the reverse
579
  // direction
580
  void InitMaxHeap();
581
  // Advance this merging iterator until the current key (minHeap_.top()) is
582
  // from a point iterator and is not covered by any range tombstone,
583
  // or that there is no more keys (heap is empty). SeekImpl() may be called
584
  // to seek to the end of a range tombstone as an optimization.
585
  void FindNextVisibleKey();
586
  void FindPrevVisibleKey();
587
588
  // Advance this merging iterators to the first key >= `target` for all
589
  // components from levels >= starting_level. All iterators before
590
  // starting_level are untouched.
591
  //
592
  // @param range_tombstone_reseek Whether target is some range tombstone
593
  // end, i.e., whether this SeekImpl() call is a part of a "cascading seek".
594
  // This is used only for recoding relevant perf_context.
595
  void SeekImpl(const Slice& target, size_t starting_level = 0,
596
                bool range_tombstone_reseek = false);
597
598
  // Seek to fist key <= target key (internal key) for
599
  // children_[starting_level:].
600
  void SeekForPrevImpl(const Slice& target, size_t starting_level = 0,
601
                       bool range_tombstone_reseek = false);
602
603
  bool is_arena_mode_;
604
  bool prefix_seek_mode_;
605
  // Which direction is the iterator moving?
606
  enum Direction : uint8_t { kForward, kReverse };
607
  Direction direction_;
608
  const InternalKeyComparator* comparator_;
609
  // HeapItem for all child point iterators.
610
  // Invariant(children_): children_[i] is in minHeap_ iff
611
  // children_[i].iter.Valid(), and at most one children_[i] is in minHeap_.
612
  // TODO: We could use an autovector with a larger reserved size.
613
  std::vector<HeapItem> children_;
614
  // HeapItem for range tombstone start and end keys.
615
  // pinned_heap_item_[i] corresponds to range_tombstone_iters_[i].
616
  // Invariant(phi): If range_tombstone_iters_[i]->Valid(),
617
  // pinned_heap_item_[i].tombstone_pik is equal to
618
  // range_tombstone_iters_[i]->start_key() when
619
  // pinned_heap_item_[i].type is DELETE_RANGE_START and
620
  // range_tombstone_iters_[i]->end_key() when
621
  // pinned_heap_item_[i].type is DELETE_RANGE_END (ignoring op_type which is
622
  // kMaxValid for all pinned_heap_item_.tombstone_pik).
623
  // pinned_heap_item_[i].type is either DELETE_RANGE_START or DELETE_RANGE_END.
624
  std::vector<HeapItem> pinned_heap_item_;
625
  // range_tombstone_iters_[i] contains range tombstones in the sorted run that
626
  // corresponds to children_[i]. range_tombstone_iters_.empty() means not
627
  // handling range tombstones in merging iterator. range_tombstone_iters_[i] ==
628
  // nullptr means the sorted run of children_[i] does not have range
629
  // tombstones.
630
  // Invariant(rti): pinned_heap_item_[i] is in minHeap_ iff
631
  // range_tombstone_iters_[i]->Valid() and at most one pinned_heap_item_[i] is
632
  // in minHeap_.
633
  std::vector<std::unique_ptr<TruncatedRangeDelIterator>>
634
      range_tombstone_iters_;
635
636
  // Levels (indices into range_tombstone_iters_/children_ ) that currently have
637
  // "active" range tombstones. See comments above MergingIterator for meaning
638
  // of "active".
639
  // Invariant(active_): i is in active_ iff range_tombstone_iters_[i]->Valid()
640
  // and pinned_heap_item_[i].type == DELETE_RANGE_END.
641
  std::set<size_t> active_;
642
643
  bool SkipNextDeleted();
644
645
  bool SkipPrevDeleted();
646
647
  // Invariant: at the end of each InternalIterator API,
648
  // current_ points to minHeap_.top().iter (maxHeap_ if backward scanning)
649
  // or nullptr if no child iterator is valid.
650
  // This follows from that current_ = CurrentForward()/CurrentReverse() is
651
  // called at the end of each InternalIterator API.
652
  IteratorWrapper* current_;
653
  // If any of the children have non-ok status, this is one of them.
654
  Status status_;
655
  // Invariant: min heap property is maintained (parent is always <= child).
656
  // This holds by using only BinaryHeap APIs to modify heap. One
657
  // exception is to modify heap top item directly (by caller iter->Next()), and
658
  // it should be followed by a call to replace_top() or pop().
659
  MergerMinIterHeap minHeap_;
660
661
  // Max heap is used for reverse iteration, which is way less common than
662
  // forward. Lazily initialize it to save memory.
663
  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
664
  PinnedIteratorsManager* pinned_iters_mgr_;
665
666
  // Used to bound range tombstones. For point keys, DBIter and SSTable iterator
667
  // take care of boundary checking.
668
  const Slice* iterate_upper_bound_;
669
670
  // In forward direction, process a child that is not in the min heap.
671
  // If valid, add to the min heap. Otherwise, check status.
672
  void AddToMinHeapOrCheckStatus(HeapItem*);
673
674
  // In backward direction, process a child that is not in the max heap.
675
  // If valid, add to the min heap. Otherwise, check status.
676
  void AddToMaxHeapOrCheckStatus(HeapItem*);
677
678
  void SwitchToForward();
679
680
  // Switch the direction from forward to backward without changing the
681
  // position. Iterator should still be valid.
682
  void SwitchToBackward();
683
684
41.1k
  IteratorWrapper* CurrentForward() const {
685
41.1k
    assert(direction_ == kForward);
686
41.1k
    assert(minHeap_.empty() ||
687
41.1k
           minHeap_.top()->type == HeapItem::Type::ITERATOR);
688
41.1k
    return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr;
689
41.1k
  }
690
691
16.5k
  IteratorWrapper* CurrentReverse() const {
692
16.5k
    assert(direction_ == kReverse);
693
16.5k
    assert(maxHeap_);
694
16.5k
    assert(maxHeap_->empty() ||
695
16.5k
           maxHeap_->top()->type == HeapItem::Type::ITERATOR);
696
16.5k
    return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr;
697
16.5k
  }
698
};
699
700
// Pre-condition:
701
// - Invariants (3) and (4) hold for i < starting_level
702
// - For i < starting_level, range_tombstone_iters_[i].prev.end_key() <
703
// `target`.
704
// - For i < starting_level, if i in active_, then
705
// range_tombstone_iters_[i]->start_key() < `target`.
706
//
707
// Post-condition:
708
// - Invariants (3) and (4) hold for all level i.
709
// - (*) target <= children_[i].iter.key() <= LevelNextVisible(i, target)
710
// for i >= starting_level
711
// - (**) target < pinned_heap_item_[i].tombstone_pik if
712
// range_tombstone_iters_[i].Valid() for i >= starting_level
713
//
714
// Proof sketch:
715
// Invariant (3) holds for all level i.
716
// For j <= i < starting_level, it follows from Pre-condition that (3) holds
717
// and that SeekImpl(-, starting_level) does not update children_[i] or
718
// range_tombstone_iters_[j].
719
// For j < starting_level and i >= starting_level, it follows from
720
// - Pre-condition that range_tombstone_iters_[j].prev.end_key() < `target`
721
// - range_tombstone_iters_[j] is not updated in SeekImpl(), and
722
// - children_[i].iter.Seek(current_search_key) is called with
723
// current_search_key >= target (shown below).
724
//   When current_search_key is updated, it is updated to some
725
//   range_tombstone_iter->end_key() after
726
//   range_tombstone_iter->SeekInternalKey(current_search_key) was called. So
727
//   current_search_key increases if updated and >= target.
728
// For starting_level <= j <= i:
729
// children_[i].iter.Seek(k1) and range_tombstone_iters_[j]->SeekInternalKey(k2)
730
// are called in SeekImpl(). Seek(k1) positions children_[i] at the first key >=
731
// k1 from level i. SeekInternalKey(k2) positions range_tombstone_iters_[j] at
732
// the first range tombstone from level j with end_key() > k2. It suffices to
733
// show that k1 >= k2. Since k1 and k2 are values of current_search_key where
734
// k1 = k2 or k1 is value of a later current_search_key than k2, so k1 >= k2.
735
//
736
// Invariant (4) holds for all level >= 0.
737
// By Pre-condition Invariant (4) holds for i < starting_level.
738
// Since children_[i], range_tombstone_iters_[i] and contents of active_ for
739
// i < starting_level do not change (4) holds for j <= i < starting_level.
740
// By Pre-condition: for all j < starting_level, if j in active_, then
741
// range_tombstone_iters_[j]->start_key() < target. For i >= starting_level,
742
// children_[i].iter.Seek(k) is called for k >= target. So
743
// children_[i].iter.key() >= target > range_tombstone_iters_[j]->start_key()
744
// for j < starting_level and i >= starting_level. So invariant (4) holds for
745
// j < starting_level and i >= starting_level.
746
// For starting_level <= j <= i, j is added to active_ only if
747
// - range_tombstone_iters_[j]->SeekInternalKey(k1) was called
748
// - range_tombstone_iters_[j]->start_key() <= k1
749
// Since children_[i].iter.Seek(k2) is called for some k2 >= k1 and for all
750
// starting_level <= j <= i, (4) also holds for all starting_level <= j <= i.
751
//
752
// Post-condition (*): target <= children_[i].iter.key() <= LevelNextVisible(i,
753
// target) for i >= starting_level.
754
// target <= children_[i].iter.key() follows from that Seek() is called on some
755
// current_search_key >= target for children_[i].iter. If current_search_key
756
// is updated from k1 to k2 when level = i, we show that the range [k1, k2) is
757
// not visible for children_[j] for any j > i. When current_search_key is
758
// updated from k1 to k2,
759
//  - range_tombstone_iters_[i]->SeekInternalKey(k1) was called
760
//  - range_tombstone_iters_[i]->Valid()
761
//  - range_tombstone_iters_[i]->start_key().user_key <= k1.user_key
762
//  - k2 = range_tombstone_iters_[i]->end_key()
763
// We assume that range_tombstone_iters_[i]->start_key() has a higher sequence
764
// number compared to any key from levels > i that has the same user key. So no
765
// point key from levels > i in range [k1, k2) is visible. So
766
// children_[i].iter.key() <= LevelNextVisible(i, target).
767
//
768
// Post-condition (**) target < pinned_heap_item_[i].tombstone_pik for i >=
769
// starting_level if range_tombstone_iters_[i].Valid(). This follows from that
770
// SeekInternalKey() being called for each range_tombstone_iters_ with some key
771
// >= `target` and that we pick start/end key that is > `target` to insert to
772
// minHeap_.
773
void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
774
668
                               bool range_tombstone_reseek) {
775
  // active range tombstones before `starting_level` remain active
776
668
  ClearHeaps(false /* clear_active */);
777
668
  ParsedInternalKey pik;
778
668
  if (!range_tombstone_iters_.empty()) {
779
    // pik is only used in InsertRangeTombstoneToMinHeap().
780
489
    ParseInternalKey(target, &pik, false).PermitUncheckedError();
781
489
  }
782
783
  // TODO: perhaps we could save some upheap cost by add all child iters first
784
  //  and then do a single heapify.
785
  // Invariant(children_) for level < starting_level
786
668
  for (size_t level = 0; level < starting_level; ++level) {
787
0
    PERF_TIMER_GUARD(seek_min_heap_time);
788
0
    AddToMinHeapOrCheckStatus(&children_[level]);
789
0
  }
790
668
  if (!range_tombstone_iters_.empty()) {
791
    // Add range tombstones from levels < starting_level. We can insert from
792
    // pinned_heap_item_ for the following reasons:
793
    // - pinned_heap_item_[level] is in minHeap_ iff
794
    // range_tombstone_iters[level]->Valid().
795
    // - If `level` is in active_, then range_tombstone_iters_[level]->Valid()
796
    // and pinned_heap_item_[level] is of type RANGE_DELETION_END.
797
489
    for (size_t level = 0; level < starting_level; ++level) {
798
      // Restores Invariants(rti), (phi) and (active_) for level <
799
      // starting_level
800
0
      if (range_tombstone_iters_[level] &&
801
0
          range_tombstone_iters_[level]->Valid()) {
802
        // use an iterator on active_ if performance becomes an issue here
803
0
        if (active_.count(level) > 0) {
804
0
          assert(pinned_heap_item_[level].type ==
805
0
                 HeapItem::Type::DELETE_RANGE_END);
806
          // if it was active, then start key must be within upper_bound,
807
          // so we can add to minHeap_ directly.
808
0
          minHeap_.push(&pinned_heap_item_[level]);
809
0
        } else {
810
0
          assert(pinned_heap_item_[level].type ==
811
0
                 HeapItem::Type::DELETE_RANGE_START);
812
          // this takes care of checking iterate_upper_bound, but with an extra
813
          // key comparison if range_tombstone_iters_[level] was already out of
814
          // bound. Consider using a new HeapItem type or some flag to remember
815
          // boundary checking result.
816
0
          InsertRangeTombstoneToMinHeap(level);
817
0
        }
818
0
      } else {
819
0
        assert(!active_.count(level));
820
0
      }
821
0
    }
822
    // levels >= starting_level will be reseeked below, so clearing their active
823
    // state here.
824
489
    active_.erase(active_.lower_bound(starting_level), active_.end());
825
489
  }
826
827
668
  IterKey current_search_key;
828
668
  current_search_key.SetInternalKey(target, false /* copy */);
829
  // Seek target might change to some range tombstone end key, so
830
  // we need to remember them for async requests.
831
  // (level, target) pairs
832
668
  autovector<std::pair<size_t, std::string>> prefetched_target;
833
2.87k
  for (auto level = starting_level; level < children_.size(); ++level) {
834
2.20k
    {
835
2.20k
      PERF_TIMER_GUARD(seek_child_seek_time);
836
2.20k
      children_[level].iter.Seek(current_search_key.GetInternalKey());
837
2.20k
    }
838
839
2.20k
    PERF_COUNTER_ADD(seek_child_seek_count, 1);
840
841
2.20k
    if (!range_tombstone_iters_.empty()) {
842
1.68k
      if (range_tombstone_reseek) {
843
        // This seek is to some range tombstone end key.
844
        // Should only happen when there are range tombstones.
845
0
        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
846
0
      }
847
1.68k
      if (children_[level].iter.status().IsTryAgain()) {
848
0
        prefetched_target.emplace_back(
849
0
            level, current_search_key.GetInternalKey().ToString());
850
0
      }
851
1.68k
      UnownedPtr<TruncatedRangeDelIterator> range_tombstone_iter =
852
1.68k
          range_tombstone_iters_[level].get();
853
1.68k
      if (range_tombstone_iter) {
854
0
        range_tombstone_iter->SeekInternalKey(
855
0
            current_search_key.GetInternalKey());
856
        // Invariants (rti) and (phi)
857
0
        if (range_tombstone_iter->Valid()) {
858
          // If range tombstone starts after `current_search_key`,
859
          // we should insert start key to heap as the range tombstone is not
860
          // active yet.
861
0
          InsertRangeTombstoneToMinHeap(
862
0
              level, comparator_->Compare(range_tombstone_iter->start_key(),
863
0
                                          pik) > 0 /* start_key */);
864
          // current_search_key < end_key guaranteed by the SeekInternalKey()
865
          // and Valid() calls above. Here we only need to compare user_key
866
          // since if target.user_key ==
867
          // range_tombstone_iter->start_key().user_key and target <
868
          // range_tombstone_iter->start_key(), no older level would have any
869
          // key in range [target, range_tombstone_iter->start_key()], so no
870
          // keys in range [target, range_tombstone_iter->end_key()) from older
871
          // level would be visible. So it is safe to seek to
872
          // range_tombstone_iter->end_key().
873
          //
874
          // TODO: range_tombstone_iter->Seek() finds the max covering
875
          //  sequence number, can make it cheaper by not looking for max.
876
0
          if (comparator_->user_comparator()->Compare(
877
0
                  range_tombstone_iter->start_key().user_key,
878
0
                  current_search_key.GetUserKey()) <= 0) {
879
0
            range_tombstone_reseek = true;
880
            // Note that for prefix seek case, it is possible that the prefix
881
            // is not the same as the original target, it should not affect
882
            // correctness. Besides, in most cases, range tombstone start and
883
            // end key should have the same prefix?
884
0
            current_search_key.SetInternalKey(range_tombstone_iter->end_key());
885
0
          }
886
0
        }
887
0
      }
888
1.68k
    }
889
    // child.iter.status() is set to Status::TryAgain indicating asynchronous
890
    // request for retrieval of data blocks has been submitted. So it should
891
    // return at this point and Seek should be called again to retrieve the
892
    // requested block and add the child to min heap.
893
2.20k
    if (children_[level].iter.status().IsTryAgain()) {
894
0
      continue;
895
0
    }
896
2.20k
    {
897
      // Strictly, we timed slightly more than min heap operation,
898
      // but these operations are very cheap.
899
2.20k
      PERF_TIMER_GUARD(seek_min_heap_time);
900
2.20k
      AddToMinHeapOrCheckStatus(&children_[level]);
901
2.20k
    }
902
2.20k
  }
903
904
668
  if (range_tombstone_iters_.empty()) {
905
517
    for (auto& child : children_) {
906
517
      if (child.iter.status().IsTryAgain()) {
907
0
        child.iter.Seek(target);
908
0
        {
909
0
          PERF_TIMER_GUARD(seek_min_heap_time);
910
0
          AddToMinHeapOrCheckStatus(&child);
911
0
        }
912
0
        PERF_COUNTER_ADD(number_async_seek, 1);
913
0
      }
914
517
    }
915
489
  } else {
916
489
    for (auto& prefetch : prefetched_target) {
917
      // (level, target) pairs
918
0
      children_[prefetch.first].iter.Seek(prefetch.second);
919
0
      {
920
0
        PERF_TIMER_GUARD(seek_min_heap_time);
921
0
        AddToMinHeapOrCheckStatus(&children_[prefetch.first]);
922
0
      }
923
0
      PERF_COUNTER_ADD(number_async_seek, 1);
924
0
    }
925
489
  }
926
668
}
927
928
// Returns true iff the current key (min heap top) should not be returned
929
// to user (of the merging iterator). This can be because the current key
930
// is deleted by some range tombstone, the current key is some fake file
931
// boundary sentinel key, or the current key is an end point of a range
932
// tombstone. Advance the iterator at heap top if needed. Heap order is restored
933
// and `active_` is updated accordingly.
934
// See FindNextVisibleKey() for more detail on internal implementation
935
// of advancing child iters.
936
// When false is returned, if minHeap is not empty, then minHeap_.top().type
937
// == ITERATOR
938
//
939
// REQUIRES:
940
// - min heap is currently not empty, and iter is in kForward direction.
941
// - minHeap_ top is not DELETE_RANGE_START (so that `active_` is current).
942
83.7k
bool MergingIterator::SkipNextDeleted() {
943
  // 3 types of keys:
944
  // - point key
945
  // - file boundary sentinel keys
946
  // - range deletion end key
947
83.7k
  auto current = minHeap_.top();
948
83.7k
  if (current->type == HeapItem::Type::DELETE_RANGE_END) {
949
    // Invariant(active_): range_tombstone_iters_[current->level] is about to
950
    // become !Valid() or that its start key is going to be added to minHeap_.
951
71.3k
    active_.erase(current->level);
952
71.3k
    assert(range_tombstone_iters_[current->level] &&
953
71.3k
           range_tombstone_iters_[current->level]->Valid());
954
71.3k
    range_tombstone_iters_[current->level]->Next();
955
    // Maintain Invariants (rti) and (phi)
956
71.3k
    if (range_tombstone_iters_[current->level]->Valid()) {
957
68.8k
      InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
958
68.8k
                                    true /* replace_top */);
959
68.8k
    } else {
960
      // TruncatedRangeDelIterator does not have status
961
2.55k
      minHeap_.pop();
962
2.55k
    }
963
71.3k
    return true /* current key deleted */;
964
71.3k
  }
965
12.3k
  if (current->iter.IsDeleteRangeSentinelKey()) {
966
    // If the file boundary is defined by a range deletion, the range
967
    // tombstone's end key must come before this sentinel key (see op_type in
968
    // SetTombstoneKey()).
969
3.16k
    assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
970
3.16k
           active_.count(current->level) == 0);
971
    // When entering a new file, range tombstone iter from the old file is
972
    // freed, but the last key from that range tombstone iter may still be in
973
    // the heap. We need to ensure the data underlying its corresponding key
974
    // Slice is still alive. We do so by popping the range tombstone key from
975
    // heap before calling iter->Next(). Technically, this change is not needed:
976
    // if there is a range tombstone end key that is after file boundary
977
    // sentinel key in minHeap_, the range tombstone end key must have been
978
    // truncated at file boundary. The underlying data of the range tombstone
979
    // end key Slice is the SST file's largest internal key stored as file
980
    // metadata in Version. However, since there are too many implicit
981
    // assumptions made, it is safer to just ensure range tombstone iter is
982
    // still alive.
983
3.16k
    minHeap_.pop();
984
    // Remove last SST file's range tombstone end key if there is one.
985
    // This means file boundary is before range tombstone end key,
986
    // which could happen when a range tombstone and a user key
987
    // straddle two SST files. Note that in TruncatedRangeDelIterator
988
    // constructor, parsed_largest.sequence is decremented 1 in this case.
989
    // Maintains Invariant(rti) that at most one
990
    // pinned_heap_item_[current->level] is in minHeap_.
991
3.16k
    if (range_tombstone_iters_[current->level] &&
992
0
        range_tombstone_iters_[current->level]->Valid()) {
993
0
      if (!minHeap_.empty() && minHeap_.top()->level == current->level) {
994
0
        assert(minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_END);
995
0
        minHeap_.pop();
996
        // Invariant(active_): we are about to enter a new SST file with new
997
        // range_tombstone_iters[current->level]. Either it is !Valid() or its
998
        // start key is going to be added to minHeap_.
999
0
        active_.erase(current->level);
1000
0
      } else {
1001
        // range tombstone is still valid, but it is not on heap.
1002
        // This should only happen if the range tombstone is over iterator
1003
        // upper bound.
1004
0
        assert(iterate_upper_bound_ &&
1005
0
               comparator_->user_comparator()->CompareWithoutTimestamp(
1006
0
                   range_tombstone_iters_[current->level]->start_key().user_key,
1007
0
                   true /* a_has_ts */, *iterate_upper_bound_,
1008
0
                   false /* b_has_ts */) >= 0);
1009
0
      }
1010
0
    }
1011
    // LevelIterator enters a new SST file
1012
3.16k
    current->iter.Next();
1013
    // Invariant(children_): current is popped from heap and added back only if
1014
    // it is valid
1015
3.16k
    if (current->iter.Valid()) {
1016
1.56k
      assert(current->iter.status().ok());
1017
1.56k
      minHeap_.push(current);
1018
1.59k
    } else {
1019
      // TODO(cbi): check status and early return if non-ok.
1020
1.59k
      considerStatus(current->iter.status());
1021
1.59k
    }
1022
    // Invariants (rti) and (phi)
1023
3.16k
    if (range_tombstone_iters_[current->level] &&
1024
0
        range_tombstone_iters_[current->level]->Valid()) {
1025
0
      InsertRangeTombstoneToMinHeap(current->level);
1026
0
    }
1027
3.16k
    return true /* current key deleted */;
1028
3.16k
  }
1029
12.3k
  assert(current->type == HeapItem::Type::ITERATOR);
1030
  // Point key case: check active_ for range tombstone coverage.
1031
9.18k
  ParsedInternalKey pik;
1032
9.18k
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
1033
9.18k
  if (!active_.empty()) {
1034
9.18k
    auto i = *active_.begin();
1035
9.18k
    if (i < current->level) {
1036
      // range tombstone is from a newer level, definitely covers
1037
0
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
1038
0
                                  pik) <= 0);
1039
0
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
1040
0
             0);
1041
0
      std::string target;
1042
0
      AppendInternalKey(&target, range_tombstone_iters_[i]->end_key());
1043
0
      SeekImpl(target, current->level, true);
1044
0
      return true /* current key deleted */;
1045
9.18k
    } else if (i == current->level) {
1046
      // range tombstone is from the same level as current, check sequence
1047
      // number. By `active_` we know current key is between start key and end
1048
      // key.
1049
9.18k
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
1050
9.18k
                                  pik) <= 0);
1051
9.18k
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
1052
9.18k
             0);
1053
9.18k
      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
1054
        // covered by range tombstone
1055
0
        current->iter.Next();
1056
        // Invariant (children_)
1057
0
        if (current->iter.Valid()) {
1058
0
          minHeap_.replace_top(current);
1059
0
        } else {
1060
0
          considerStatus(current->iter.status());
1061
0
          minHeap_.pop();
1062
0
        }
1063
0
        return true /* current key deleted */;
1064
9.18k
      } else {
1065
9.18k
        return false /* current key not deleted */;
1066
9.18k
      }
1067
9.18k
    } else {
1068
0
      return false /* current key not deleted */;
1069
      // range tombstone from an older sorted run with current key < end key.
1070
      // current key is not deleted and the older sorted run will have its range
1071
      // tombstone updated when the range tombstone's end key are popped from
1072
      // minHeap_.
1073
0
    }
1074
9.18k
  }
1075
  // we can reach here only if active_ is empty
1076
9.18k
  assert(active_.empty());
1077
0
  assert(minHeap_.top()->type == HeapItem::Type::ITERATOR);
1078
0
  return false /* current key not deleted */;
1079
9.18k
}
1080
1081
void MergingIterator::SeekForPrevImpl(const Slice& target,
1082
                                      size_t starting_level,
1083
3.46k
                                      bool range_tombstone_reseek) {
1084
  // active range tombstones before `starting_level` remain active
1085
3.46k
  ClearHeaps(false /* clear_active */);
1086
3.46k
  InitMaxHeap();
1087
3.46k
  ParsedInternalKey pik;
1088
3.46k
  if (!range_tombstone_iters_.empty()) {
1089
2.27k
    ParseInternalKey(target, &pik, false).PermitUncheckedError();
1090
2.27k
  }
1091
3.46k
  for (size_t level = 0; level < starting_level; ++level) {
1092
0
    PERF_TIMER_GUARD(seek_max_heap_time);
1093
0
    AddToMaxHeapOrCheckStatus(&children_[level]);
1094
0
  }
1095
3.46k
  if (!range_tombstone_iters_.empty()) {
1096
    // Add range tombstones before starting_level.
1097
2.27k
    for (size_t level = 0; level < starting_level; ++level) {
1098
0
      if (range_tombstone_iters_[level] &&
1099
0
          range_tombstone_iters_[level]->Valid()) {
1100
0
        assert(static_cast<bool>(active_.count(level)) ==
1101
0
               (pinned_heap_item_[level].type ==
1102
0
                HeapItem::Type::DELETE_RANGE_START));
1103
0
        maxHeap_->push(&pinned_heap_item_[level]);
1104
0
      } else {
1105
0
        assert(!active_.count(level));
1106
0
      }
1107
0
    }
1108
    // levels >= starting_level will be reseeked below,
1109
2.27k
    active_.erase(active_.lower_bound(starting_level), active_.end());
1110
2.27k
  }
1111
1112
3.46k
  IterKey current_search_key;
1113
3.46k
  current_search_key.SetInternalKey(target, false /* copy */);
1114
  // Seek target might change to some range tombstone end key, so
1115
  // we need to remember them for async requests.
1116
  // (level, target) pairs
1117
3.46k
  autovector<std::pair<size_t, std::string>> prefetched_target;
1118
14.3k
  for (auto level = starting_level; level < children_.size(); ++level) {
1119
10.8k
    {
1120
10.8k
      PERF_TIMER_GUARD(seek_child_seek_time);
1121
10.8k
      children_[level].iter.SeekForPrev(current_search_key.GetInternalKey());
1122
10.8k
    }
1123
1124
10.8k
    PERF_COUNTER_ADD(seek_child_seek_count, 1);
1125
1126
10.8k
    if (!range_tombstone_iters_.empty()) {
1127
7.16k
      if (range_tombstone_reseek) {
1128
        // This seek is to some range tombstone end key.
1129
        // Should only happen when there are range tombstones.
1130
0
        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
1131
0
      }
1132
7.16k
      if (children_[level].iter.status().IsTryAgain()) {
1133
0
        prefetched_target.emplace_back(
1134
0
            level, current_search_key.GetInternalKey().ToString());
1135
0
      }
1136
7.16k
      UnownedPtr<TruncatedRangeDelIterator> range_tombstone_iter =
1137
7.16k
          range_tombstone_iters_[level].get();
1138
7.16k
      if (range_tombstone_iter) {
1139
0
        range_tombstone_iter->SeekForPrev(current_search_key.GetUserKey());
1140
0
        if (range_tombstone_iter->Valid()) {
1141
0
          InsertRangeTombstoneToMaxHeap(
1142
0
              level, comparator_->Compare(range_tombstone_iter->end_key(),
1143
0
                                          pik) <= 0 /* end_key */);
1144
          // start key <= current_search_key guaranteed by the Seek() call above
1145
          // Only interested in user key coverage since older sorted runs must
1146
          // have smaller sequence numbers than this tombstone.
1147
0
          if (comparator_->user_comparator()->Compare(
1148
0
                  current_search_key.GetUserKey(),
1149
0
                  range_tombstone_iter->end_key().user_key) < 0) {
1150
0
            range_tombstone_reseek = true;
1151
0
            current_search_key.SetInternalKey(
1152
0
                range_tombstone_iter->start_key().user_key, kMaxSequenceNumber,
1153
0
                kValueTypeForSeekForPrev);
1154
0
          }
1155
0
        }
1156
0
      }
1157
7.16k
    }
1158
    // child.iter.status() is set to Status::TryAgain indicating asynchronous
1159
    // request for retrieval of data blocks has been submitted. So it should
1160
    // return at this point and Seek should be called again to retrieve the
1161
    // requested block and add the child to min heap.
1162
10.8k
    if (children_[level].iter.status().IsTryAgain()) {
1163
0
      continue;
1164
0
    }
1165
10.8k
    {
1166
      // Strictly, we timed slightly more than min heap operation,
1167
      // but these operations are very cheap.
1168
10.8k
      PERF_TIMER_GUARD(seek_max_heap_time);
1169
10.8k
      AddToMaxHeapOrCheckStatus(&children_[level]);
1170
10.8k
    }
1171
10.8k
  }
1172
1173
3.46k
  if (range_tombstone_iters_.empty()) {
1174
3.69k
    for (auto& child : children_) {
1175
3.69k
      if (child.iter.status().IsTryAgain()) {
1176
0
        child.iter.SeekForPrev(target);
1177
0
        {
1178
0
          PERF_TIMER_GUARD(seek_min_heap_time);
1179
0
          AddToMaxHeapOrCheckStatus(&child);
1180
0
        }
1181
0
        PERF_COUNTER_ADD(number_async_seek, 1);
1182
0
      }
1183
3.69k
    }
1184
2.27k
  } else {
1185
2.27k
    for (auto& prefetch : prefetched_target) {
1186
      // (level, target) pairs
1187
0
      children_[prefetch.first].iter.SeekForPrev(prefetch.second);
1188
0
      {
1189
0
        PERF_TIMER_GUARD(seek_max_heap_time);
1190
0
        AddToMaxHeapOrCheckStatus(&children_[prefetch.first]);
1191
0
      }
1192
0
      PERF_COUNTER_ADD(number_async_seek, 1);
1193
0
    }
1194
2.27k
  }
1195
3.46k
}
1196
1197
// See more in comments above SkipNextDeleted().
1198
// REQUIRES:
1199
// - max heap is currently not empty, and iter is in kReverse direction.
1200
// - maxHeap_ top is not DELETE_RANGE_END (so that `active_` is current).
1201
3.96k
bool MergingIterator::SkipPrevDeleted() {
1202
  // 3 types of keys:
1203
  // - point key
1204
  // - file boundary sentinel keys
1205
  // - range deletion start key
1206
3.96k
  auto current = maxHeap_->top();
1207
3.96k
  if (current->type == HeapItem::Type::DELETE_RANGE_START) {
1208
0
    active_.erase(current->level);
1209
0
    assert(range_tombstone_iters_[current->level] &&
1210
0
           range_tombstone_iters_[current->level]->Valid());
1211
0
    range_tombstone_iters_[current->level]->Prev();
1212
0
    if (range_tombstone_iters_[current->level]->Valid()) {
1213
0
      InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */,
1214
0
                                    true /* replace_top */);
1215
0
    } else {
1216
0
      maxHeap_->pop();
1217
0
    }
1218
0
    return true /* current key deleted */;
1219
0
  }
1220
3.96k
  if (current->iter.IsDeleteRangeSentinelKey()) {
1221
    // LevelIterator enters a new SST file
1222
3.96k
    maxHeap_->pop();
1223
    // Remove last SST file's range tombstone key if there is one.
1224
3.96k
    if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
1225
0
        maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_START) {
1226
0
      maxHeap_->pop();
1227
0
      active_.erase(current->level);
1228
0
    }
1229
3.96k
    current->iter.Prev();
1230
3.96k
    if (current->iter.Valid()) {
1231
2.01k
      assert(current->iter.status().ok());
1232
2.01k
      maxHeap_->push(current);
1233
2.01k
    } else {
1234
1.95k
      considerStatus(current->iter.status());
1235
1.95k
    }
1236
1237
3.96k
    if (range_tombstone_iters_[current->level] &&
1238
0
        range_tombstone_iters_[current->level]->Valid()) {
1239
0
      InsertRangeTombstoneToMaxHeap(current->level);
1240
0
    }
1241
3.96k
    return true /* current key deleted */;
1242
3.96k
  }
1243
3.96k
  assert(current->type == HeapItem::Type::ITERATOR);
1244
  // Point key case: check active_ for range tombstone coverage.
1245
0
  ParsedInternalKey pik;
1246
0
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
1247
0
  if (!active_.empty()) {
1248
0
    auto i = *active_.begin();
1249
0
    if (i < current->level) {
1250
      // range tombstone is from a newer level, definitely covers
1251
0
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
1252
0
                                  pik) <= 0);
1253
0
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
1254
0
             0);
1255
0
      std::string target;
1256
0
      AppendInternalKey(&target, range_tombstone_iters_[i]->start_key());
1257
      // This is different from SkipNextDeleted() which does reseek at sorted
1258
      // runs >= level (instead of i+1 here). With min heap, if level L is at
1259
      // top of the heap, then levels <L all have internal keys > level L's
1260
      // current internal key, which means levels <L are already at a different
1261
      // user key. With max heap, if level L is at top of the heap, then levels
1262
      // <L all have internal keys smaller than level L's current internal key,
1263
      // which might still be the same user key.
1264
0
      SeekForPrevImpl(target, i + 1, true);
1265
0
      return true /* current key deleted */;
1266
0
    } else if (i == current->level) {
1267
      // By `active_` we know current key is between start key and end key.
1268
0
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
1269
0
                                  pik) <= 0);
1270
0
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
1271
0
             0);
1272
0
      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
1273
0
        current->iter.Prev();
1274
0
        if (current->iter.Valid()) {
1275
0
          maxHeap_->replace_top(current);
1276
0
        } else {
1277
0
          considerStatus(current->iter.status());
1278
0
          maxHeap_->pop();
1279
0
        }
1280
0
        return true /* current key deleted */;
1281
0
      } else {
1282
0
        return false /* current key not deleted */;
1283
0
      }
1284
0
    } else {
1285
0
      return false /* current key not deleted */;
1286
0
    }
1287
0
  }
1288
1289
0
  assert(active_.empty());
1290
0
  assert(maxHeap_->top()->type == HeapItem::Type::ITERATOR);
1291
0
  return false /* current key not deleted */;
1292
0
}
1293
1294
18.3k
void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
1295
  // Invariant(children_)
1296
18.3k
  if (child->iter.Valid()) {
1297
11.9k
    assert(child->iter.status().ok());
1298
11.9k
    minHeap_.push(child);
1299
11.9k
  } else {
1300
6.42k
    considerStatus(child->iter.status());
1301
6.42k
  }
1302
18.3k
}
1303
1304
12.6k
void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) {
1305
12.6k
  if (child->iter.Valid()) {
1306
7.50k
    assert(child->iter.status().ok());
1307
7.50k
    maxHeap_->push(child);
1308
7.50k
  } else {
1309
5.11k
    considerStatus(child->iter.status());
1310
5.11k
  }
1311
12.6k
}
1312
1313
// Advance all non current_ child to > current_.key().
1314
// We advance current_ after the this function call as it does not require
1315
// Seek().
1316
// Advance all range tombstones iters, including the one corresponding to
1317
// current_, to the first tombstone with end_key > current_.key().
1318
// TODO: potentially do cascading seek here too
1319
// TODO: show that invariants hold
1320
0
void MergingIterator::SwitchToForward() {
1321
0
  ClearHeaps();
1322
0
  Slice target = key();
1323
0
  for (auto& child : children_) {
1324
0
    if (&child.iter != current_) {
1325
0
      child.iter.Seek(target);
1326
      // child.iter.status() is set to Status::TryAgain indicating asynchronous
1327
      // request for retrieval of data blocks has been submitted. So it should
1328
      // return at this point and Seek should be called again to retrieve the
1329
      // requested block and add the child to min heap.
1330
0
      if (child.iter.status() == Status::TryAgain()) {
1331
0
        continue;
1332
0
      }
1333
0
      if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1334
0
        assert(child.iter.status().ok());
1335
0
        child.iter.Next();
1336
0
      }
1337
0
    }
1338
0
    AddToMinHeapOrCheckStatus(&child);
1339
0
  }
1340
1341
0
  for (auto& child : children_) {
1342
0
    if (child.iter.status() == Status::TryAgain()) {
1343
0
      child.iter.Seek(target);
1344
0
      if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1345
0
        assert(child.iter.status().ok());
1346
0
        child.iter.Next();
1347
0
      }
1348
0
      AddToMinHeapOrCheckStatus(&child);
1349
0
    }
1350
0
  }
1351
1352
  // Current range tombstone iter also needs to seek for the following case:
1353
  // Previous direction is backward, so range tombstone iter may point to a
1354
  // tombstone before current_. If there is no such tombstone, then the range
1355
  // tombstone iter is !Valid(). Need to reseek here to make it valid again.
1356
0
  if (!range_tombstone_iters_.empty()) {
1357
0
    ParsedInternalKey pik;
1358
0
    ParseInternalKey(target, &pik, false /* log_err_key */)
1359
0
        .PermitUncheckedError();
1360
0
    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
1361
0
      UnownedPtr<TruncatedRangeDelIterator> iter =
1362
0
          range_tombstone_iters_[i].get();
1363
0
      if (iter) {
1364
0
        iter->Seek(pik.user_key);
1365
        // The while loop is needed as the Seek() call above is only for user
1366
        // key. We could have a range tombstone with end_key covering user_key,
1367
        // but still is smaller than target. This happens when the range
1368
        // tombstone is truncated at iter.largest_.
1369
0
        while (iter->Valid() &&
1370
0
               comparator_->Compare(iter->end_key(), pik) <= 0) {
1371
0
          iter->Next();
1372
0
        }
1373
0
        if (range_tombstone_iters_[i]->Valid()) {
1374
0
          InsertRangeTombstoneToMinHeap(
1375
0
              i, comparator_->Compare(range_tombstone_iters_[i]->start_key(),
1376
0
                                      pik) > 0 /* start_key */);
1377
0
        }
1378
0
      }
1379
0
    }
1380
0
  }
1381
1382
0
  direction_ = kForward;
1383
0
  assert(current_ == CurrentForward());
1384
0
}
1385
1386
// Advance all range tombstones iters, including the one corresponding to
1387
// current_, to the first tombstone with start_key <= current_.key().
1388
506
void MergingIterator::SwitchToBackward() {
1389
506
  ClearHeaps();
1390
506
  InitMaxHeap();
1391
506
  Slice target = key();
1392
1.76k
  for (auto& child : children_) {
1393
1.76k
    if (&child.iter != current_) {
1394
1.25k
      child.iter.SeekForPrev(target);
1395
1.25k
      TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
1396
1.25k
      if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1397
0
        assert(child.iter.status().ok());
1398
0
        child.iter.Prev();
1399
0
      }
1400
1.25k
    }
1401
1.76k
    AddToMaxHeapOrCheckStatus(&child);
1402
1.76k
  }
1403
1404
506
  ParsedInternalKey pik;
1405
506
  ParseInternalKey(target, &pik, false /* log_err_key */)
1406
506
      .PermitUncheckedError();
1407
1.86k
  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
1408
1.36k
    UnownedPtr<TruncatedRangeDelIterator> iter =
1409
1.36k
        range_tombstone_iters_[i].get();
1410
1.36k
    if (iter) {
1411
0
      iter->SeekForPrev(pik.user_key);
1412
      // Since the SeekForPrev() call above is only for user key,
1413
      // we may end up with some range tombstone with start key having the
1414
      // same user key at current_, but with a smaller sequence number. This
1415
      // makes current_ not at maxHeap_ top for the CurrentReverse() call
1416
      // below. If there is a range tombstone start key with the same user
1417
      // key and the same sequence number as current_.key(), it will be fine as
1418
      // in InsertRangeTombstoneToMaxHeap() we change op_type to be the smallest
1419
      // op_type.
1420
0
      while (iter->Valid() &&
1421
0
             comparator_->Compare(iter->start_key(), pik) > 0) {
1422
0
        iter->Prev();
1423
0
      }
1424
0
      if (iter->Valid()) {
1425
0
        InsertRangeTombstoneToMaxHeap(
1426
0
            i, comparator_->Compare(range_tombstone_iters_[i]->end_key(),
1427
0
                                    pik) <= 0 /* end_key */);
1428
0
      }
1429
0
    }
1430
1.36k
  }
1431
1432
506
  direction_ = kReverse;
1433
506
  if (!prefix_seek_mode_) {
1434
    // Note that we don't do assert(current_ == CurrentReverse()) here
1435
    // because it is possible to have some keys larger than the seek-key
1436
    // inserted between Seek() and SeekToLast(), which makes current_ not
1437
    // equal to CurrentReverse().
1438
506
    current_ = CurrentReverse();
1439
506
  }
1440
506
  assert(current_ == CurrentReverse());
1441
506
}
1442
1443
11.0k
void MergingIterator::ClearHeaps(bool clear_active) {
1444
11.0k
  minHeap_.clear();
1445
11.0k
  if (maxHeap_) {
1446
1.01k
    maxHeap_->clear();
1447
1.01k
  }
1448
11.0k
  if (clear_active) {
1449
6.87k
    active_.clear();
1450
6.87k
  }
1451
11.0k
}
1452
1453
3.97k
void MergingIterator::InitMaxHeap() {
1454
3.97k
  if (!maxHeap_) {
1455
3.46k
    maxHeap_ =
1456
3.46k
        std::make_unique<MergerMaxIterHeap>(MaxHeapItemComparator(comparator_));
1457
3.46k
  }
1458
3.97k
}
1459
1460
// Assume there is a next key that is not covered by range tombstone.
1461
// Pre-condition:
1462
// - Invariants (3) and (4)
1463
// - There is some k where k <= children_[i].iter.key() <= LevelNextVisible(i,
1464
// k) for all levels i (LevelNextVisible() defined in Seek()).
1465
//
1466
// Define NextVisible(k) to be the first key >= k from among children_ that
1467
// is not covered by any range tombstone.
1468
// Post-condition:
1469
// - Invariants (1)-(4) hold
1470
// - (*): minHeap_->top()->key() == NextVisible(k)
1471
//
1472
// Loop invariants:
1473
// - Invariants (3) and (4)
1474
// - (*): k <= children_[i].iter.key() <= LevelNextVisible(i, k)
1475
//
1476
// Progress: minHeap_.top()->key() is non-decreasing and strictly increases in
1477
// a finite number of iterations.
1478
// TODO: it is possible to call SeekImpl(k2) after SeekImpl(k1) with
1479
//  k2 < k1 in the same FindNextVisibleKey(). For example, l1 has a range
1480
//  tombstone [2,3) and l2 has a range tombstone [1, 4). Point key 1 from l5
1481
//  triggers SeekImpl(4 /* target */, 5). Then point key 2 from l3 triggers
1482
//  SeekImpl(3 /* target */, 3).
1483
//  Ideally we should only move iterators forward in SeekImpl(), and the
1484
//  progress condition can be made simpler: iterator only moves forward.
1485
//
1486
// Proof sketch:
1487
// Post-condition:
1488
// Invariant (1) holds when this method returns:
1489
// Ignoring the empty minHeap_ case, there are two cases:
1490
// Case 1: active_ is empty and !minHeap_.top()->iter.IsDeleteRangeSentinelKey()
1491
// By invariants (rti) and (active_), active_ being empty means if a
1492
// pinned_heap_item_[i] is in minHeap_, it has type DELETE_RANGE_START. Note
1493
// that PopDeleteRangeStart() was called right before the while loop condition,
1494
// so minHeap_.top() is not of type DELETE_RANGE_START. So minHeap_.top() must
1495
// be of type ITERATOR.
1496
// Case 2: SkipNextDeleted() returns false. The method returns false only when
1497
// minHeap_.top().type == ITERATOR.
1498
//
1499
// Invariant (2) holds when this method returns:
1500
// From Invariant (1), minHeap_.top().type == ITERATOR. Suppose it is
1501
// children_[i] for some i. Suppose that children_[i].iter.key() is covered by
1502
// some range tombstone. This means there is a j <= i and a range tombstone from
1503
// level j with start_key() < children_[i].iter.key() < end_key().
1504
// - If range_tombstone_iters_[j]->Valid(), by Invariants (rti) and (phi),
1505
// pinned_heap_item_[j] is in minHeap_, and pinned_heap_item_[j].tombstone_pik
1506
// is either start or end key of this range tombstone. If
1507
// pinned_heap_item_[j].tombstone_pik < children_[i].iter.key(), it would be at
1508
// top of minHeap_ which would contradict Invariant (1). So
1509
// pinned_heap_item_[j].tombstone_pik > children_[i].iter.key().
1510
// By Invariant (3), range_tombstone_iters_[j].prev.end_key() <
1511
// children_[i].iter.key(). We assume that in each level, range tombstones
1512
// cover non-overlapping ranges. So range_tombstone_iters_[j] is at
1513
// the range tombstone with start_key() < children_[i].iter.key() < end_key()
1514
// and has its end_key() in minHeap_. By Invariants (phi) and (active_),
1515
// j is in active_. From while loop condition, SkipNextDeleted() must have
1516
// returned false for this method to return.
1517
//   - If j < i, then SeekImpl(range_tombstone_iters_[j']->end_key(), i)
1518
// was called for some j' < i and j' in active_. Note that since j' is in
1519
// active_, pinned_heap_item_[j'] is in minHeap_ and has tombstone_pik =
1520
// range_tombstone_iters_[j']->end_key(). So
1521
// range_tombstone_iters_[j']->end_key() must be larger than
1522
// children_[i].iter.key() to not be at top of minHeap_. This means after
1523
// SeekImpl(), children_[i] would be at a key > children_[i].iter.key()
1524
// -- contradiction.
1525
//   - If j == i, children_[i]->Next() would have been called and children_[i]
1526
// would be at a key > children_[i].iter.key() -- contradiction.
1527
// - If !range_tombstone_iters_[j]->Valid(). Then range_tombstone_iters_[j]
1528
// points to an SST file with all range tombstones from that file exhausted.
1529
// The file must come before the file containing the first
1530
// range tombstone with start_key() < children_[i].iter.key() < end_key().
1531
// Assume files from same level have non-overlapping ranges, the current file's
1532
// meta.largest is less than children_[i].iter.key(). So the file boundary key,
1533
// which has value meta.largest must have been popped from minHeap_ before
1534
// children_[i].iter.key(). So range_tombstone_iters_[j] would not point to
1535
// this SST file -- contradiction.
1536
// So it is impossible for children_[i].iter.key() to be covered by a range
1537
// tombstone.
1538
//
1539
// Post-condition (*) holds when the function returns:
1540
// From loop invariant (*) that k <= children_[i].iter.key() <=
1541
// LevelNextVisible(i, k) and Invariant (2) above, when the function returns,
1542
// minHeap_.top()->key() is the smallest LevelNextVisible(i, k) among all levels
1543
// i. This is equal to NextVisible(k).
1544
//
1545
// Invariant (3) holds after each iteration:
1546
// PopDeleteRangeStart() does not change range tombstone position.
1547
// In SkipNextDeleted():
1548
//   - If DELETE_RANGE_END is popped from minHeap_, it means the range
1549
//   tombstone's end key is < all other point keys, so it is safe to advance to
1550
//   next range tombstone.
1551
//   - If file boundary is popped (current->iter.IsDeleteRangeSentinelKey()),
1552
//   we assume that file's last range tombstone's
1553
//   end_key <= file boundary key < all other point keys. So it is safe to
1554
//   move to the first range tombstone in the next SST file.
1555
//   - If children_[i]->Next() is called, then it is fine as it is advancing a
1556
//   point iterator.
1557
//   - If SeekImpl(target, l) is called, then (3) follows from SeekImpl()'s
1558
//   post-condition if its pre-condition holds. First pre-condition follows
1559
//   from loop invariant where Invariant (3) holds for all levels i.
1560
//   Now we should second pre-condition holds. Since Invariant (3) holds for
1561
//   all i, we have for all j <= l, range_tombstone_iters_[j].prev.end_key()
1562
//   < children_[l].iter.key(). `target` is the value of
1563
//   range_tombstone_iters_[j'].end_key() for some j' < l and j' in active_.
1564
//   By Invariant (active_) and (rti), pinned_heap_item_[j'] is in minHeap_ and
1565
//   pinned_heap_item_[j'].tombstone_pik = range_tombstone_iters_[j'].end_key().
1566
//   This end_key must be larger than children_[l].key() since it was not at top
1567
//   of minHeap_. So for all levels j <= l,
1568
//   range_tombstone_iters_[j].prev.end_key() < children_[l].iter.key() < target
1569
//
1570
// Invariant (4) holds after each iteration:
1571
// A level i is inserted into active_ during calls to PopDeleteRangeStart().
1572
// In that case, range_tombstone_iters_[i].start_key() < all point keys
1573
// by heap property and the assumption that point keys and range tombstone keys
1574
// are distinct.
1575
// If SeekImpl(target, l) is called, then there is a range_tombstone_iters_[j]
1576
// where target = range_tombstone_iters_[j]->end_key() and children_[l]->key()
1577
// < target. By loop invariants, (3) and (4) holds for levels.
1578
// Since target > children_[l]->key(), it also holds that for j < l,
1579
// range_tombstone_iters_[j].prev.end_key() < target and that if j in active_,
1580
// range_tombstone_iters_[i]->start_key() < target. So all pre-conditions of
1581
// SeekImpl(target, l) holds, and (4) follow from its post-condition.
1582
// All other places either in this function either advance point iterators
1583
// or remove some level from active_, so (4) still holds.
1584
//
1585
// Look Invariant (*): for all level i, k <= children_[i] <= LevelNextVisible(i,
1586
// k).
1587
// k <= children_[i] follows from loop `progress` condition.
1588
// Consider when children_[i] is changed for any i. It is through
1589
// children_[i].iter.Next() or SeekImpl() in SkipNextDeleted().
1590
// If children_[i].iter.Next() is called, there is a range tombstone from level
1591
// i where tombstone seqno > children_[i].iter.key()'s seqno and i in active_.
1592
// By Invariant (4), tombstone's start_key < children_[i].iter.key(). By
1593
// invariants (active_), (phi), and (rti), tombstone's end_key is in minHeap_
1594
// and that children_[i].iter.key() < end_key. So children_[i].iter.key() is
1595
// not visible, and it is safe to call Next().
1596
// If SeekImpl(target, l) is called, by its contract, when SeekImpl() returns,
1597
// target <= children_[i]->key() <= LevelNextVisible(i, target) for i >= l,
1598
// and children_[<l] is not touched. We know `target` is
1599
// range_tombstone_iters_[j]->end_key() for some j < i and j is in active_.
1600
// By Invariant (4), range_tombstone_iters_[j]->start_key() <
1601
// children_[i].iter.key() for all i >= l. So for each level i >= l, the range
1602
// [children_[i].iter.key(), target) is not visible. So after SeekImpl(),
1603
// children_[i].iter.key() <= LevelNextVisible(i, target) <=
1604
// LevelNextVisible(i, k).
1605
//
1606
// `Progress` holds for each iteration:
1607
// Very sloppy intuition:
1608
// - in PopDeleteRangeStart(): the value of a pinned_heap_item_.tombstone_pik_
1609
// is updated from the start key to the end key of the same range tombstone.
1610
// We assume that start key <= end key for the same range tombstone.
1611
// - in SkipNextDeleted()
1612
//   - If the top of heap is DELETE_RANGE_END, the range tombstone is advanced
1613
//     and the relevant pinned_heap_item_.tombstone_pik is increased or popped
1614
//     from minHeap_.
1615
//   - If the top of heap is a file boundary key, then both point iter and
1616
//     range tombstone iter are advanced to the next file.
1617
//   - If the top of heap is ITERATOR and current->iter.Next() is called, it
1618
//     moves to a larger point key.
1619
//   - If the top of heap is ITERATOR and SeekImpl(k, l) is called, then all
1620
//     iterators from levels >= l are advanced to some key >= k by its contract.
1621
//     And top of minHeap_ before SeekImpl(k, l) was less than k.
1622
// There are special cases where different heap items have the same key,
1623
// e.g. when two range tombstone end keys share the same value). In
1624
// these cases, iterators are being advanced, so the minimum key should increase
1625
// in a finite number of steps.
1626
41.1k
inline void MergingIterator::FindNextVisibleKey() {
1627
41.1k
  PopDeleteRangeStart();
1628
  // PopDeleteRangeStart() implies heap top is not DELETE_RANGE_START
1629
  // active_ being empty implies no DELETE_RANGE_END in heap.
1630
  // So minHeap_->top() must be of type ITERATOR.
1631
41.1k
  while (
1632
115k
      !minHeap_.empty() &&
1633
109k
      (!active_.empty() || minHeap_.top()->iter.IsDeleteRangeSentinelKey()) &&
1634
83.7k
      SkipNextDeleted()) {
1635
74.5k
    PopDeleteRangeStart();
1636
74.5k
  }
1637
  // Checks Invariant (1)
1638
41.1k
  assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::Type::ITERATOR);
1639
41.1k
}
1640
1641
16.0k
inline void MergingIterator::FindPrevVisibleKey() {
1642
16.0k
  PopDeleteRangeEnd();
1643
  // PopDeleteRangeEnd() implies heap top is not DELETE_RANGE_END
1644
  // active_ being empty implies no DELETE_RANGE_START in heap.
1645
  // So maxHeap_->top() must be of type ITERATOR.
1646
16.0k
  while (
1647
19.9k
      !maxHeap_->empty() &&
1648
16.5k
      (!active_.empty() || maxHeap_->top()->iter.IsDeleteRangeSentinelKey()) &&
1649
3.96k
      SkipPrevDeleted()) {
1650
3.96k
    PopDeleteRangeEnd();
1651
3.96k
  }
1652
16.0k
}
1653
1654
InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
1655
                                     InternalIterator** list, int n,
1656
2.09k
                                     Arena* arena, bool prefix_seek_mode) {
1657
2.09k
  assert(n >= 0);
1658
2.09k
  if (n == 0) {
1659
0
    return NewEmptyInternalIterator<Slice>(arena);
1660
2.09k
  } else if (n == 1) {
1661
2.09k
    return list[0];
1662
2.09k
  } else {
1663
0
    if (arena == nullptr) {
1664
0
      return new MergingIterator(cmp, list, n, false, prefix_seek_mode);
1665
0
    } else {
1666
0
      auto mem = arena->AllocateAligned(sizeof(MergingIterator));
1667
0
      return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode);
1668
0
    }
1669
0
  }
1670
2.09k
}
1671
1672
MergeIteratorBuilder::MergeIteratorBuilder(
1673
    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
1674
    const Slice* iterate_upper_bound)
1675
20.3k
    : first_iter(nullptr), use_merging_iter(false), arena(a) {
1676
20.3k
  auto mem = arena->AllocateAligned(sizeof(MergingIterator));
1677
20.3k
  merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true,
1678
20.3k
                                         prefix_seek_mode, iterate_upper_bound);
1679
20.3k
}
1680
1681
20.3k
MergeIteratorBuilder::~MergeIteratorBuilder() {
1682
20.3k
  if (first_iter != nullptr) {
1683
0
    first_iter->~InternalIterator();
1684
0
  }
1685
20.3k
  if (merge_iter != nullptr) {
1686
7.00k
    merge_iter->~MergingIterator();
1687
7.00k
  }
1688
20.3k
}
1689
1690
4.40k
void MergeIteratorBuilder::AddIterator(InternalIterator* iter) {
1691
4.40k
  if (!use_merging_iter && first_iter != nullptr) {
1692
0
    merge_iter->AddIterator(first_iter);
1693
0
    use_merging_iter = true;
1694
0
    first_iter = nullptr;
1695
0
  }
1696
4.40k
  if (use_merging_iter) {
1697
0
    merge_iter->AddIterator(iter);
1698
4.40k
  } else {
1699
4.40k
    first_iter = iter;
1700
4.40k
  }
1701
4.40k
}
1702
1703
void MergeIteratorBuilder::AddPointAndTombstoneIterator(
1704
    InternalIterator* point_iter,
1705
    std::unique_ptr<TruncatedRangeDelIterator>&& tombstone_iter,
1706
40.7k
    std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr) {
1707
  // tombstone_iter_ptr != nullptr means point_iter is a LevelIterator.
1708
40.7k
  bool add_range_tombstone = tombstone_iter ||
1709
38.2k
                             !merge_iter->range_tombstone_iters_.empty() ||
1710
38.2k
                             tombstone_iter_ptr;
1711
40.7k
  if (!use_merging_iter && (add_range_tombstone || first_iter)) {
1712
13.2k
    use_merging_iter = true;
1713
13.2k
    if (first_iter) {
1714
13.2k
      merge_iter->AddIterator(first_iter);
1715
13.2k
      first_iter = nullptr;
1716
13.2k
    }
1717
13.2k
  }
1718
40.7k
  if (use_merging_iter) {
1719
24.8k
    merge_iter->AddIterator(point_iter);
1720
24.8k
    if (add_range_tombstone) {
1721
      // If there was a gap, fill in nullptr as empty range tombstone iterators.
1722
24.3k
      while (merge_iter->range_tombstone_iters_.size() <
1723
24.3k
             merge_iter->children_.size() - 1) {
1724
16.0k
        merge_iter->AddRangeTombstoneIterator(nullptr);
1725
16.0k
      }
1726
8.36k
      merge_iter->AddRangeTombstoneIterator(std::move(tombstone_iter));
1727
8.36k
    }
1728
1729
24.8k
    if (tombstone_iter_ptr) {
1730
      // This is needed instead of setting to &range_tombstone_iters_[i]
1731
      // directly here since the memory address of range_tombstone_iters_[i]
1732
      // might change during vector resizing.
1733
5.81k
      range_del_iter_ptrs_.emplace_back(
1734
5.81k
          merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr);
1735
5.81k
    }
1736
24.8k
  } else {
1737
15.8k
    first_iter = point_iter;
1738
15.8k
  }
1739
40.7k
}
1740
1741
20.3k
InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) {
1742
20.3k
  InternalIterator* ret = nullptr;
1743
20.3k
  if (!use_merging_iter) {
1744
7.00k
    ret = first_iter;
1745
7.00k
    first_iter = nullptr;
1746
13.2k
  } else {
1747
13.2k
    for (auto& p : range_del_iter_ptrs_) {
1748
5.81k
      *(p.second) = &(merge_iter->range_tombstone_iters_[p.first]);
1749
5.81k
    }
1750
13.2k
    if (db_iter && !merge_iter->range_tombstone_iters_.empty()) {
1751
      // memtable is always the first level
1752
8.36k
      db_iter->SetMemtableRangetombstoneIter(
1753
8.36k
          &merge_iter->range_tombstone_iters_.front());
1754
8.36k
    }
1755
13.2k
    merge_iter->Finish();
1756
13.2k
    ret = merge_iter;
1757
13.2k
    merge_iter = nullptr;
1758
13.2k
  }
1759
20.3k
  return ret;
1760
20.3k
}
1761
1762
}  // namespace ROCKSDB_NAMESPACE