/src/rocksdb/table/merging_iterator.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #include "table/merging_iterator.h" |
11 | | |
12 | | #include "db/arena_wrapped_db_iter.h" |
13 | | #include "monitoring/file_read_sample.h" |
14 | | |
15 | | namespace ROCKSDB_NAMESPACE { |
16 | | // MergingIterator uses a min/max heap to combine data from point iterators. |
17 | | // Range tombstones can be added and keys covered by range tombstones will be |
18 | | // skipped. |
19 | | // |
20 | | // The following are implementation details and can be ignored by user. |
21 | | // For merging iterator to process range tombstones, it treats the start and end |
22 | | // keys of a range tombstone as two keys and put them into minHeap_ or maxHeap_ |
23 | | // together with regular point keys. Each range tombstone is active only within |
24 | | // its internal key range [start_key, end_key). An `active_` set is used to |
25 | | // track levels that have an active range tombstone. Take forward scanning |
26 | | // for example. Level j is in active_ if its current range tombstone has its |
27 | | // start_key popped from minHeap_ and its end_key in minHeap_. If the top of |
28 | | // minHeap_ is a point key from level L, we can determine if the point key is |
29 | | // covered by any range tombstone by checking if there is an l <= L in active_. |
30 | | // The case of l == L also involves checking range tombstone's sequence number. |
31 | | // |
32 | | // The following (non-exhaustive) list of invariants are maintained by |
33 | | // MergingIterator during forward scanning. After each InternalIterator API, |
34 | | // i.e., Seek*() and Next(), and FindNextVisibleKey(), if minHeap_ is not empty: |
35 | | // (1) minHeap_.top().type == ITERATOR |
36 | | // (2) minHeap_.top()->key() is not covered by any range tombstone. |
37 | | // |
38 | | // After each call to SeekImpl() in addition to the functions mentioned above: |
39 | | // (3) For all level i and j <= i, range_tombstone_iters_[j].prev.end_key() < |
40 | | // children_[i].iter.key(). That is, range_tombstone_iters_[j] is at or before |
41 | | // the first range tombstone from level j with end_key() > |
42 | | // children_[i].iter.key(). |
43 | | // (4) For all level i and j <= i, if j in active_, then |
44 | | // range_tombstone_iters_[j]->start_key() < children_[i].iter.key(). |
45 | | // - When range_tombstone_iters_[j] is !Valid(), we consider its `prev` to be |
46 | | // the last range tombstone from that range tombstone iterator. |
47 | | // - When referring to range tombstone start/end keys, assume it is the value of |
48 | | // HeapItem::tombstone_pik. This value has op_type = kMaxValid, which makes |
49 | | // range tombstone keys have distinct values from point keys. |
50 | | // |
51 | | // Applicable class variables have their own (forward scanning) invariants |
52 | | // listed in the comments above their definition. |
53 | | class MergingIterator : public InternalIterator { |
54 | | public: |
55 | | MergingIterator(const InternalKeyComparator* comparator, |
56 | | InternalIterator** children, int n, bool is_arena_mode, |
57 | | bool prefix_seek_mode, |
58 | | const Slice* iterate_upper_bound = nullptr) |
59 | 20.3k | : is_arena_mode_(is_arena_mode), |
60 | 20.3k | prefix_seek_mode_(prefix_seek_mode), |
61 | 20.3k | direction_(kForward), |
62 | 20.3k | comparator_(comparator), |
63 | 20.3k | current_(nullptr), |
64 | 20.3k | minHeap_(MinHeapItemComparator(comparator_)), |
65 | 20.3k | pinned_iters_mgr_(nullptr), |
66 | 20.3k | iterate_upper_bound_(iterate_upper_bound) { |
67 | 20.3k | children_.resize(n); |
68 | 20.3k | for (int i = 0; i < n; i++) { |
69 | 0 | children_[i].level = i; |
70 | 0 | children_[i].iter.Set(children[i]); |
71 | 0 | } |
72 | 20.3k | } |
73 | | |
74 | 28.2k | void considerStatus(Status s) { |
75 | 28.2k | if (!s.ok() && status_.ok()) { |
76 | 0 | status_ = s; |
77 | 0 | } |
78 | 28.2k | } |
79 | | |
80 | 38.1k | virtual void AddIterator(InternalIterator* iter) { |
81 | 38.1k | children_.emplace_back(children_.size(), iter); |
82 | 38.1k | if (pinned_iters_mgr_) { |
83 | 0 | iter->SetPinnedItersMgr(pinned_iters_mgr_); |
84 | 0 | } |
85 | | // Invalidate to ensure `Seek*()` is called to construct the heaps before |
86 | | // use. |
87 | 38.1k | current_ = nullptr; |
88 | 38.1k | } |
89 | | |
90 | | // There must be either no range tombstone iterator or the same number of |
91 | | // range tombstone iterators as point iterators after all iters are added. |
92 | | // The i-th added range tombstone iterator and the i-th point iterator |
93 | | // must point to the same LSM level. |
94 | | // Merging iterator takes ownership of `iter` and is responsible for freeing |
95 | | // it. One exception to this is when a LevelIterator moves to a different SST |
96 | | // file or when Iterator::Refresh() is called, the range tombstone iterator |
97 | | // could be updated. In that case, this merging iterator is only responsible |
98 | | // for freeing the new range tombstone iterator that it has pointers to in |
99 | | // range_tombstone_iters_. |
100 | | void AddRangeTombstoneIterator( |
101 | 24.3k | std::unique_ptr<TruncatedRangeDelIterator>&& iter) { |
102 | 24.3k | range_tombstone_iters_.emplace_back(std::move(iter)); |
103 | 24.3k | } |
104 | | |
105 | | // Called by MergingIteratorBuilder when all point iterators and range |
106 | | // tombstone iterators are added. Initializes HeapItems for range tombstone |
107 | | // iterators. |
108 | 13.2k | void Finish() { |
109 | 13.2k | if (!range_tombstone_iters_.empty()) { |
110 | 8.36k | assert(range_tombstone_iters_.size() == children_.size()); |
111 | 8.36k | pinned_heap_item_.resize(range_tombstone_iters_.size()); |
112 | 32.7k | for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { |
113 | 24.3k | pinned_heap_item_[i].level = i; |
114 | | // Range tombstone end key is exclusive. If a point internal key has the |
115 | | // same user key and sequence number as the start or end key of a range |
116 | | // tombstone, the order will be start < end key < internal key with the |
117 | | // following op_type change. This is helpful to ensure keys popped from |
118 | | // heap are in expected order since range tombstone start/end keys will |
119 | | // be distinct from point internal keys. Strictly speaking, this is only |
120 | | // needed for tombstone end points that are truncated in |
121 | | // TruncatedRangeDelIterator since untruncated tombstone end points |
122 | | // always have kMaxSequenceNumber and kTypeRangeDeletion (see |
123 | | // TruncatedRangeDelIterator::start_key()/end_key()). |
124 | 24.3k | pinned_heap_item_[i].tombstone_pik.type = kTypeMaxValid; |
125 | 24.3k | } |
126 | 8.36k | } |
127 | 13.2k | } |
128 | | |
129 | 20.3k | ~MergingIterator() override { |
130 | 20.3k | range_tombstone_iters_.clear(); |
131 | | |
132 | 38.1k | for (auto& child : children_) { |
133 | 38.1k | child.iter.DeleteIter(is_arena_mode_); |
134 | 38.1k | } |
135 | 20.3k | status_.PermitUncheckedError(); |
136 | 20.3k | } |
137 | | |
138 | 0 | void SetRangeDelReadSeqno(SequenceNumber read_seqno) override { |
139 | 0 | for (auto& child : children_) { |
140 | | // This should only be needed for LevelIterator (iterators from L1+). |
141 | 0 | child.iter.SetRangeDelReadSeqno(read_seqno); |
142 | 0 | } |
143 | 0 | for (auto& child : range_tombstone_iters_) { |
144 | 0 | if (child) { |
145 | 0 | child->SetRangeDelReadSeqno(read_seqno); |
146 | 0 | } |
147 | 0 | } |
148 | 0 | } |
149 | | |
150 | 70.4k | bool Valid() const override { return current_ != nullptr && status_.ok(); } |
151 | | |
152 | 7.53k | Status status() const override { return status_; } |
153 | | |
154 | | // Add range_tombstone_iters_[level] into min heap. |
155 | | // Updates active_ if the end key of a range tombstone is inserted. |
156 | | // pinned_heap_items_[level].type is updated based on `start_key`. |
157 | | // |
158 | | // If range_tombstone_iters_[level] is after iterate_upper_bound_, |
159 | | // it is removed from the heap. |
160 | | // @param start_key specifies which end point of the range tombstone to add. |
161 | | void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true, |
162 | 142k | bool replace_top = false) { |
163 | 142k | assert(!range_tombstone_iters_.empty() && |
164 | 142k | range_tombstone_iters_[level]->Valid()); |
165 | | // Maintains Invariant(phi) |
166 | 142k | if (start_key) { |
167 | 71.3k | pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START; |
168 | 71.3k | ParsedInternalKey pik = range_tombstone_iters_[level]->start_key(); |
169 | | // iterate_upper_bound does not have timestamp |
170 | 71.3k | if (iterate_upper_bound_ && |
171 | 0 | comparator_->user_comparator()->CompareWithoutTimestamp( |
172 | 0 | pik.user_key, true /* a_has_ts */, *iterate_upper_bound_, |
173 | 0 | false /* b_has_ts */) >= 0) { |
174 | 0 | if (replace_top) { |
175 | | // replace_top implies this range tombstone iterator is still in |
176 | | // minHeap_ and at the top. |
177 | 0 | minHeap_.pop(); |
178 | 0 | } |
179 | 0 | return; |
180 | 0 | } |
181 | 71.3k | pinned_heap_item_[level].SetTombstoneKey(std::move(pik)); |
182 | | // Checks Invariant(active_) |
183 | 71.3k | assert(active_.count(level) == 0); |
184 | 71.3k | } else { |
185 | | // allow end key to go over upper bound (if present) since start key is |
186 | | // before upper bound and the range tombstone could still cover a |
187 | | // range before upper bound. |
188 | | // Maintains Invariant(active_) |
189 | 71.3k | pinned_heap_item_[level].SetTombstoneKey( |
190 | 71.3k | range_tombstone_iters_[level]->end_key()); |
191 | 71.3k | pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END; |
192 | 71.3k | active_.insert(level); |
193 | 71.3k | } |
194 | 142k | if (replace_top) { |
195 | 140k | minHeap_.replace_top(&pinned_heap_item_[level]); |
196 | 140k | } else { |
197 | 2.55k | minHeap_.push(&pinned_heap_item_[level]); |
198 | 2.55k | } |
199 | 142k | } |
200 | | |
201 | | // Add range_tombstone_iters_[level] into max heap. |
202 | | // Updates active_ if the start key of a range tombstone is inserted. |
203 | | // @param end_key specifies which end point of the range tombstone to add. |
204 | | void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true, |
205 | 0 | bool replace_top = false) { |
206 | 0 | assert(!range_tombstone_iters_.empty() && |
207 | 0 | range_tombstone_iters_[level]->Valid()); |
208 | 0 | if (end_key) { |
209 | 0 | pinned_heap_item_[level].SetTombstoneKey( |
210 | 0 | range_tombstone_iters_[level]->end_key()); |
211 | 0 | pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END; |
212 | 0 | assert(active_.count(level) == 0); |
213 | 0 | } else { |
214 | 0 | pinned_heap_item_[level].SetTombstoneKey( |
215 | 0 | range_tombstone_iters_[level]->start_key()); |
216 | 0 | pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START; |
217 | 0 | active_.insert(level); |
218 | 0 | } |
219 | 0 | if (replace_top) { |
220 | 0 | maxHeap_->replace_top(&pinned_heap_item_[level]); |
221 | 0 | } else { |
222 | 0 | maxHeap_->push(&pinned_heap_item_[level]); |
223 | 0 | } |
224 | 0 | } |
225 | | |
226 | | // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START |
227 | | // until minHeap_ is empty or the top of the minHeap_ is not of type |
228 | | // DELETE_RANGE_START. Each such item means a range tombstone becomes active, |
229 | | // so `active_` is updated accordingly. |
230 | 115k | void PopDeleteRangeStart() { |
231 | 187k | while (!minHeap_.empty() && |
232 | 180k | minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_START) { |
233 | 71.3k | TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr); |
234 | | // Invariant(rti) holds since |
235 | | // range_tombstone_iters_[minHeap_.top()->level] is still valid, and |
236 | | // parameter `replace_top` is set to true here to ensure only one such |
237 | | // HeapItem is in minHeap_. |
238 | 71.3k | InsertRangeTombstoneToMinHeap( |
239 | 71.3k | minHeap_.top()->level, false /* start_key */, true /* replace_top */); |
240 | 71.3k | } |
241 | 115k | } |
242 | | |
243 | | // Remove HeapItems from top of maxHeap_ that are of type DELETE_RANGE_END |
244 | | // until maxHeap_ is empty or the top of the maxHeap_ is not of type |
245 | | // DELETE_RANGE_END. Each such item means a range tombstone becomes active, |
246 | | // so `active_` is updated accordingly. |
247 | 19.9k | void PopDeleteRangeEnd() { |
248 | 19.9k | while (!maxHeap_->empty() && |
249 | 16.5k | maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_END) { |
250 | | // insert start key of this range tombstone and updates active_ |
251 | 0 | InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */, |
252 | 0 | true /* replace_top */); |
253 | 0 | } |
254 | 19.9k | } |
255 | | |
256 | 6.36k | void SeekToFirst() override { |
257 | 6.36k | ClearHeaps(); |
258 | 6.36k | status_ = Status::OK(); |
259 | 16.1k | for (auto& child : children_) { |
260 | 16.1k | child.iter.SeekToFirst(); |
261 | 16.1k | AddToMinHeapOrCheckStatus(&child); |
262 | 16.1k | } |
263 | | |
264 | 17.3k | for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { |
265 | 10.9k | if (range_tombstone_iters_[i]) { |
266 | 2.55k | range_tombstone_iters_[i]->SeekToFirst(); |
267 | 2.55k | if (range_tombstone_iters_[i]->Valid()) { |
268 | | // It is possible to be invalid due to snapshots. |
269 | 2.55k | InsertRangeTombstoneToMinHeap(i); |
270 | 2.55k | } |
271 | 2.55k | } |
272 | 10.9k | } |
273 | 6.36k | FindNextVisibleKey(); |
274 | 6.36k | direction_ = kForward; |
275 | 6.36k | current_ = CurrentForward(); |
276 | 6.36k | } |
277 | | |
278 | 0 | void SeekToLast() override { |
279 | 0 | ClearHeaps(); |
280 | 0 | InitMaxHeap(); |
281 | 0 | status_ = Status::OK(); |
282 | 0 | for (auto& child : children_) { |
283 | 0 | child.iter.SeekToLast(); |
284 | 0 | AddToMaxHeapOrCheckStatus(&child); |
285 | 0 | } |
286 | |
|
287 | 0 | for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { |
288 | 0 | if (range_tombstone_iters_[i]) { |
289 | 0 | range_tombstone_iters_[i]->SeekToLast(); |
290 | 0 | if (range_tombstone_iters_[i]->Valid()) { |
291 | | // It is possible to be invalid due to snapshots. |
292 | 0 | InsertRangeTombstoneToMaxHeap(i); |
293 | 0 | } |
294 | 0 | } |
295 | 0 | } |
296 | 0 | FindPrevVisibleKey(); |
297 | 0 | direction_ = kReverse; |
298 | 0 | current_ = CurrentReverse(); |
299 | 0 | } |
300 | | |
301 | | // Position this merging iterator at the first key >= target (internal key). |
302 | | // If range tombstones are present, keys covered by range tombstones are |
303 | | // skipped, and this merging iter points to the first non-range-deleted key >= |
304 | | // target after Seek(). If !Valid() and status().ok() then this iterator |
305 | | // reaches the end. |
306 | | // |
307 | | // If range tombstones are present, cascading seeks may be called (an |
308 | | // optimization adapted from Pebble https://github.com/cockroachdb/pebble). |
309 | | // Roughly, if there is a range tombstone [start, end) that covers the |
310 | | // target user key at level L, then this range tombstone must cover the range |
311 | | // [target key, end) in all levels > L. So for all levels > L, we can pretend |
312 | | // the target key is `end`. This optimization is applied at each level and |
313 | | // hence the name "cascading seek". |
314 | 668 | void Seek(const Slice& target) override { |
315 | | // Define LevelNextVisible(i, k) to be the first key >= k in level i that is |
316 | | // not covered by any range tombstone. |
317 | | // After SeekImpl(target, 0), invariants (3) and (4) hold. |
318 | | // For all level i, target <= children_[i].iter.key() <= LevelNextVisible(i, |
319 | | // target). By the contract of FindNextVisibleKey(), Invariants (1)-(4) |
320 | | // holds after this call, and minHeap_.top().iter points to the |
321 | | // first key >= target among children_ that is not covered by any range |
322 | | // tombstone. |
323 | 668 | status_ = Status::OK(); |
324 | 668 | SeekImpl(target); |
325 | 668 | FindNextVisibleKey(); |
326 | | |
327 | 668 | direction_ = kForward; |
328 | 668 | { |
329 | 668 | PERF_TIMER_GUARD(seek_min_heap_time); |
330 | 668 | current_ = CurrentForward(); |
331 | 668 | } |
332 | 668 | } |
333 | | |
334 | 3.46k | void SeekForPrev(const Slice& target) override { |
335 | 3.46k | assert(range_tombstone_iters_.empty() || |
336 | 3.46k | range_tombstone_iters_.size() == children_.size()); |
337 | 3.46k | status_ = Status::OK(); |
338 | 3.46k | SeekForPrevImpl(target); |
339 | 3.46k | FindPrevVisibleKey(); |
340 | | |
341 | 3.46k | direction_ = kReverse; |
342 | 3.46k | { |
343 | 3.46k | PERF_TIMER_GUARD(seek_max_heap_time); |
344 | 3.46k | current_ = CurrentReverse(); |
345 | 3.46k | } |
346 | 3.46k | } |
347 | | |
348 | 34.0k | void Next() override { |
349 | 34.0k | assert(Valid()); |
350 | | // Ensure that all children are positioned after key(). |
351 | | // If we are moving in the forward direction, it is already |
352 | | // true for all the non-current children since current_ is |
353 | | // the smallest child and key() == current_->key(). |
354 | 34.0k | if (direction_ != kForward) { |
355 | | // The loop advanced all non-current children to be > key() so current_ |
356 | | // should still be strictly the smallest key. |
357 | 0 | SwitchToForward(); |
358 | 0 | } |
359 | | |
360 | | // For the heap modifications below to be correct, current_ must be the |
361 | | // current top of the heap. |
362 | 34.0k | assert(current_ == CurrentForward()); |
363 | | // as the current points to the current record. move the iterator forward. |
364 | 34.0k | current_->Next(); |
365 | 34.0k | if (current_->Valid()) { |
366 | | // current is still valid after the Next() call above. Call |
367 | | // replace_top() to restore the heap property. When the same child |
368 | | // iterator yields a sequence of keys, this is cheap. |
369 | 25.8k | assert(current_->status().ok()); |
370 | 25.8k | minHeap_.replace_top(minHeap_.top()); |
371 | 25.8k | } else { |
372 | | // current stopped being valid, remove it from the heap. |
373 | 8.20k | considerStatus(current_->status()); |
374 | 8.20k | minHeap_.pop(); |
375 | 8.20k | } |
376 | | // Invariants (3) and (4) hold when after advancing current_. |
377 | | // Let k be the smallest key among children_[i].iter.key(). |
378 | | // k <= children_[i].iter.key() <= LevelNextVisible(i, k) holds for all |
379 | | // level i. After FindNextVisible(), Invariants (1)-(4) hold and |
380 | | // minHeap_.top()->key() is the first key >= k from any children_ that is |
381 | | // not covered by any range tombstone. |
382 | 34.0k | FindNextVisibleKey(); |
383 | 34.0k | current_ = CurrentForward(); |
384 | 34.0k | } |
385 | | |
386 | 34.0k | bool NextAndGetResult(IterateResult* result) override { |
387 | 34.0k | Next(); |
388 | 34.0k | bool is_valid = Valid(); |
389 | 34.0k | if (is_valid) { |
390 | 28.5k | result->key = key(); |
391 | 28.5k | result->bound_check_result = UpperBoundCheckResult(); |
392 | 28.5k | result->value_prepared = current_->IsValuePrepared(); |
393 | 28.5k | } |
394 | 34.0k | return is_valid; |
395 | 34.0k | } |
396 | | |
397 | 12.5k | void Prev() override { |
398 | 12.5k | assert(Valid()); |
399 | | // Ensure that all children are positioned before key(). |
400 | | // If we are moving in the reverse direction, it is already |
401 | | // true for all the non-current children since current_ is |
402 | | // the largest child and key() == current_->key(). |
403 | 12.5k | if (direction_ != kReverse) { |
404 | | // Otherwise, retreat the non-current children. We retreat current_ |
405 | | // just after the if-block. |
406 | 506 | SwitchToBackward(); |
407 | 506 | } |
408 | | |
409 | | // For the heap modifications below to be correct, current_ must be the |
410 | | // current top of the heap. |
411 | 12.5k | assert(current_ == CurrentReverse()); |
412 | 12.5k | current_->Prev(); |
413 | 12.5k | if (current_->Valid()) { |
414 | | // current is still valid after the Prev() call above. Call |
415 | | // replace_top() to restore the heap property. When the same child |
416 | | // iterator yields a sequence of keys, this is cheap. |
417 | 7.57k | assert(current_->status().ok()); |
418 | 7.57k | maxHeap_->replace_top(maxHeap_->top()); |
419 | 7.57k | } else { |
420 | | // current stopped being valid, remove it from the heap. |
421 | 4.97k | considerStatus(current_->status()); |
422 | 4.97k | maxHeap_->pop(); |
423 | 4.97k | } |
424 | 12.5k | FindPrevVisibleKey(); |
425 | 12.5k | current_ = CurrentReverse(); |
426 | 12.5k | } |
427 | | |
428 | 65.2k | Slice key() const override { |
429 | 65.2k | assert(Valid()); |
430 | 65.2k | return current_->key(); |
431 | 65.2k | } |
432 | | |
433 | 30.9k | uint64_t write_unix_time() const override { |
434 | 30.9k | assert(Valid()); |
435 | 30.9k | return current_->write_unix_time(); |
436 | 30.9k | } |
437 | | |
438 | 30.9k | Slice value() const override { |
439 | 30.9k | assert(Valid()); |
440 | 30.9k | return current_->value(); |
441 | 30.9k | } |
442 | | |
443 | 17.3k | bool PrepareValue() override { |
444 | 17.3k | assert(Valid()); |
445 | 17.3k | if (current_->PrepareValue()) { |
446 | 17.3k | return true; |
447 | 17.3k | } |
448 | | |
449 | 0 | considerStatus(current_->status()); |
450 | 0 | assert(!status_.ok()); |
451 | 0 | return false; |
452 | 17.3k | } |
453 | | |
454 | | // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result |
455 | | // from current child iterator. Potentially as long as one of child iterator |
456 | | // report out of bound is not possible, we know current key is within bound. |
457 | 0 | bool MayBeOutOfLowerBound() override { |
458 | 0 | assert(Valid()); |
459 | 0 | return current_->MayBeOutOfLowerBound(); |
460 | 0 | } |
461 | | |
462 | 28.5k | IterBoundCheck UpperBoundCheckResult() override { |
463 | 28.5k | assert(Valid()); |
464 | 28.5k | return current_->UpperBoundCheckResult(); |
465 | 28.5k | } |
466 | | |
467 | 13.2k | void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { |
468 | 13.2k | pinned_iters_mgr_ = pinned_iters_mgr; |
469 | 38.1k | for (auto& child : children_) { |
470 | 38.1k | child.iter.SetPinnedItersMgr(pinned_iters_mgr); |
471 | 38.1k | } |
472 | 13.2k | } |
473 | | |
474 | 8.85k | bool IsKeyPinned() const override { |
475 | 8.85k | assert(Valid()); |
476 | 8.85k | return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && |
477 | 79 | current_->IsKeyPinned(); |
478 | 8.85k | } |
479 | | |
480 | 7.77k | bool IsValuePinned() const override { |
481 | 7.77k | assert(Valid()); |
482 | 7.77k | return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && |
483 | 7.77k | current_->IsValuePinned(); |
484 | 7.77k | } |
485 | | |
486 | 0 | void Prepare(const MultiScanArgs* scan_opts) override { |
487 | 0 | for (auto& child : children_) { |
488 | 0 | child.iter.Prepare(scan_opts); |
489 | 0 | } |
490 | 0 | } |
491 | | |
492 | | private: |
493 | | // Represents an element in the min/max heap. Each HeapItem corresponds to a |
494 | | // point iterator or a range tombstone iterator, differentiated by |
495 | | // HeapItem::type. |
496 | | struct HeapItem { |
497 | 24.3k | HeapItem() = default; |
498 | | |
499 | | // corresponding point iterator |
500 | | IteratorWrapper iter; |
501 | | size_t level = 0; |
502 | | // corresponding range tombstone iterator's start or end key value |
503 | | // depending on value of `type`. |
504 | | ParsedInternalKey tombstone_pik; |
505 | | // Will be overwritten before use, initialize here so compiler does not |
506 | | // complain. |
507 | | enum class Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END }; |
508 | | Type type = Type::ITERATOR; |
509 | | |
510 | | explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter) |
511 | 38.1k | : level(_level), type(Type::ITERATOR) { |
512 | 38.1k | iter.Set(_iter); |
513 | 38.1k | } |
514 | | |
515 | 142k | void SetTombstoneKey(ParsedInternalKey&& pik) { |
516 | | // op_type is already initialized in MergingIterator::Finish(). |
517 | 142k | tombstone_pik.user_key = pik.user_key; |
518 | 142k | tombstone_pik.sequence = pik.sequence; |
519 | 142k | } |
520 | | }; |
521 | | |
522 | | class MinHeapItemComparator { |
523 | | public: |
524 | | explicit MinHeapItemComparator(const InternalKeyComparator* comparator) |
525 | 20.3k | : comparator_(comparator) {} |
526 | | |
527 | 111k | bool operator()(HeapItem* a, HeapItem* b) const { |
528 | 111k | if (LIKELY(a->type == HeapItem::Type::ITERATOR)) { |
529 | 28.8k | if (LIKELY(b->type == HeapItem::Type::ITERATOR)) { |
530 | 14.5k | return comparator_->Compare(a->iter.key(), b->iter.key()) > 0; |
531 | 14.5k | } else { |
532 | 14.2k | return comparator_->Compare(a->iter.key(), b->tombstone_pik) > 0; |
533 | 14.2k | } |
534 | 82.5k | } else { |
535 | 82.5k | if (LIKELY(b->type == HeapItem::Type::ITERATOR)) { |
536 | 82.5k | return comparator_->Compare(a->tombstone_pik, b->iter.key()) > 0; |
537 | 82.5k | } else { |
538 | 0 | return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) > 0; |
539 | 0 | } |
540 | 82.5k | } |
541 | 111k | } |
542 | | |
543 | | private: |
544 | | const InternalKeyComparator* comparator_; |
545 | | }; |
546 | | |
547 | | class MaxHeapItemComparator { |
548 | | public: |
549 | | explicit MaxHeapItemComparator(const InternalKeyComparator* comparator) |
550 | 3.46k | : comparator_(comparator) {} |
551 | | |
552 | 14.7k | bool operator()(HeapItem* a, HeapItem* b) const { |
553 | 14.7k | if (LIKELY(a->type == HeapItem::Type::ITERATOR)) { |
554 | 14.7k | if (LIKELY(b->type == HeapItem::Type::ITERATOR)) { |
555 | 14.7k | return comparator_->Compare(a->iter.key(), b->iter.key()) < 0; |
556 | 14.7k | } else { |
557 | 0 | return comparator_->Compare(a->iter.key(), b->tombstone_pik) < 0; |
558 | 0 | } |
559 | 14.7k | } else { |
560 | 0 | if (LIKELY(b->type == HeapItem::Type::ITERATOR)) { |
561 | 0 | return comparator_->Compare(a->tombstone_pik, b->iter.key()) < 0; |
562 | 0 | } else { |
563 | 0 | return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) < 0; |
564 | 0 | } |
565 | 0 | } |
566 | 14.7k | } |
567 | | |
568 | | private: |
569 | | const InternalKeyComparator* comparator_; |
570 | | }; |
571 | | |
572 | | using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>; |
573 | | using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>; |
574 | | |
575 | | friend class MergeIteratorBuilder; |
576 | | // Clears heaps for both directions, used when changing direction or seeking |
577 | | void ClearHeaps(bool clear_active = true); |
578 | | // Ensures that maxHeap_ is initialized when starting to go in the reverse |
579 | | // direction |
580 | | void InitMaxHeap(); |
581 | | // Advance this merging iterator until the current key (minHeap_.top()) is |
582 | | // from a point iterator and is not covered by any range tombstone, |
583 | | // or that there is no more keys (heap is empty). SeekImpl() may be called |
584 | | // to seek to the end of a range tombstone as an optimization. |
585 | | void FindNextVisibleKey(); |
586 | | void FindPrevVisibleKey(); |
587 | | |
588 | | // Advance this merging iterators to the first key >= `target` for all |
589 | | // components from levels >= starting_level. All iterators before |
590 | | // starting_level are untouched. |
591 | | // |
592 | | // @param range_tombstone_reseek Whether target is some range tombstone |
593 | | // end, i.e., whether this SeekImpl() call is a part of a "cascading seek". |
594 | | // This is used only for recoding relevant perf_context. |
595 | | void SeekImpl(const Slice& target, size_t starting_level = 0, |
596 | | bool range_tombstone_reseek = false); |
597 | | |
598 | | // Seek to fist key <= target key (internal key) for |
599 | | // children_[starting_level:]. |
600 | | void SeekForPrevImpl(const Slice& target, size_t starting_level = 0, |
601 | | bool range_tombstone_reseek = false); |
602 | | |
603 | | bool is_arena_mode_; |
604 | | bool prefix_seek_mode_; |
605 | | // Which direction is the iterator moving? |
606 | | enum Direction : uint8_t { kForward, kReverse }; |
607 | | Direction direction_; |
608 | | const InternalKeyComparator* comparator_; |
609 | | // HeapItem for all child point iterators. |
610 | | // Invariant(children_): children_[i] is in minHeap_ iff |
611 | | // children_[i].iter.Valid(), and at most one children_[i] is in minHeap_. |
612 | | // TODO: We could use an autovector with a larger reserved size. |
613 | | std::vector<HeapItem> children_; |
614 | | // HeapItem for range tombstone start and end keys. |
615 | | // pinned_heap_item_[i] corresponds to range_tombstone_iters_[i]. |
616 | | // Invariant(phi): If range_tombstone_iters_[i]->Valid(), |
617 | | // pinned_heap_item_[i].tombstone_pik is equal to |
618 | | // range_tombstone_iters_[i]->start_key() when |
619 | | // pinned_heap_item_[i].type is DELETE_RANGE_START and |
620 | | // range_tombstone_iters_[i]->end_key() when |
621 | | // pinned_heap_item_[i].type is DELETE_RANGE_END (ignoring op_type which is |
622 | | // kMaxValid for all pinned_heap_item_.tombstone_pik). |
623 | | // pinned_heap_item_[i].type is either DELETE_RANGE_START or DELETE_RANGE_END. |
624 | | std::vector<HeapItem> pinned_heap_item_; |
625 | | // range_tombstone_iters_[i] contains range tombstones in the sorted run that |
626 | | // corresponds to children_[i]. range_tombstone_iters_.empty() means not |
627 | | // handling range tombstones in merging iterator. range_tombstone_iters_[i] == |
628 | | // nullptr means the sorted run of children_[i] does not have range |
629 | | // tombstones. |
630 | | // Invariant(rti): pinned_heap_item_[i] is in minHeap_ iff |
631 | | // range_tombstone_iters_[i]->Valid() and at most one pinned_heap_item_[i] is |
632 | | // in minHeap_. |
633 | | std::vector<std::unique_ptr<TruncatedRangeDelIterator>> |
634 | | range_tombstone_iters_; |
635 | | |
636 | | // Levels (indices into range_tombstone_iters_/children_ ) that currently have |
637 | | // "active" range tombstones. See comments above MergingIterator for meaning |
638 | | // of "active". |
639 | | // Invariant(active_): i is in active_ iff range_tombstone_iters_[i]->Valid() |
640 | | // and pinned_heap_item_[i].type == DELETE_RANGE_END. |
641 | | std::set<size_t> active_; |
642 | | |
643 | | bool SkipNextDeleted(); |
644 | | |
645 | | bool SkipPrevDeleted(); |
646 | | |
647 | | // Invariant: at the end of each InternalIterator API, |
648 | | // current_ points to minHeap_.top().iter (maxHeap_ if backward scanning) |
649 | | // or nullptr if no child iterator is valid. |
650 | | // This follows from that current_ = CurrentForward()/CurrentReverse() is |
651 | | // called at the end of each InternalIterator API. |
652 | | IteratorWrapper* current_; |
653 | | // If any of the children have non-ok status, this is one of them. |
654 | | Status status_; |
655 | | // Invariant: min heap property is maintained (parent is always <= child). |
656 | | // This holds by using only BinaryHeap APIs to modify heap. One |
657 | | // exception is to modify heap top item directly (by caller iter->Next()), and |
658 | | // it should be followed by a call to replace_top() or pop(). |
659 | | MergerMinIterHeap minHeap_; |
660 | | |
661 | | // Max heap is used for reverse iteration, which is way less common than |
662 | | // forward. Lazily initialize it to save memory. |
663 | | std::unique_ptr<MergerMaxIterHeap> maxHeap_; |
664 | | PinnedIteratorsManager* pinned_iters_mgr_; |
665 | | |
666 | | // Used to bound range tombstones. For point keys, DBIter and SSTable iterator |
667 | | // take care of boundary checking. |
668 | | const Slice* iterate_upper_bound_; |
669 | | |
670 | | // In forward direction, process a child that is not in the min heap. |
671 | | // If valid, add to the min heap. Otherwise, check status. |
672 | | void AddToMinHeapOrCheckStatus(HeapItem*); |
673 | | |
674 | | // In backward direction, process a child that is not in the max heap. |
675 | | // If valid, add to the min heap. Otherwise, check status. |
676 | | void AddToMaxHeapOrCheckStatus(HeapItem*); |
677 | | |
678 | | void SwitchToForward(); |
679 | | |
680 | | // Switch the direction from forward to backward without changing the |
681 | | // position. Iterator should still be valid. |
682 | | void SwitchToBackward(); |
683 | | |
684 | 41.1k | IteratorWrapper* CurrentForward() const { |
685 | 41.1k | assert(direction_ == kForward); |
686 | 41.1k | assert(minHeap_.empty() || |
687 | 41.1k | minHeap_.top()->type == HeapItem::Type::ITERATOR); |
688 | 41.1k | return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr; |
689 | 41.1k | } |
690 | | |
691 | 16.5k | IteratorWrapper* CurrentReverse() const { |
692 | 16.5k | assert(direction_ == kReverse); |
693 | 16.5k | assert(maxHeap_); |
694 | 16.5k | assert(maxHeap_->empty() || |
695 | 16.5k | maxHeap_->top()->type == HeapItem::Type::ITERATOR); |
696 | 16.5k | return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr; |
697 | 16.5k | } |
698 | | }; |
699 | | |
700 | | // Pre-condition: |
701 | | // - Invariants (3) and (4) hold for i < starting_level |
702 | | // - For i < starting_level, range_tombstone_iters_[i].prev.end_key() < |
703 | | // `target`. |
704 | | // - For i < starting_level, if i in active_, then |
705 | | // range_tombstone_iters_[i]->start_key() < `target`. |
706 | | // |
707 | | // Post-condition: |
708 | | // - Invariants (3) and (4) hold for all level i. |
709 | | // - (*) target <= children_[i].iter.key() <= LevelNextVisible(i, target) |
710 | | // for i >= starting_level |
711 | | // - (**) target < pinned_heap_item_[i].tombstone_pik if |
712 | | // range_tombstone_iters_[i].Valid() for i >= starting_level |
713 | | // |
714 | | // Proof sketch: |
715 | | // Invariant (3) holds for all level i. |
716 | | // For j <= i < starting_level, it follows from Pre-condition that (3) holds |
717 | | // and that SeekImpl(-, starting_level) does not update children_[i] or |
718 | | // range_tombstone_iters_[j]. |
719 | | // For j < starting_level and i >= starting_level, it follows from |
720 | | // - Pre-condition that range_tombstone_iters_[j].prev.end_key() < `target` |
721 | | // - range_tombstone_iters_[j] is not updated in SeekImpl(), and |
722 | | // - children_[i].iter.Seek(current_search_key) is called with |
723 | | // current_search_key >= target (shown below). |
724 | | // When current_search_key is updated, it is updated to some |
725 | | // range_tombstone_iter->end_key() after |
726 | | // range_tombstone_iter->SeekInternalKey(current_search_key) was called. So |
727 | | // current_search_key increases if updated and >= target. |
728 | | // For starting_level <= j <= i: |
729 | | // children_[i].iter.Seek(k1) and range_tombstone_iters_[j]->SeekInternalKey(k2) |
730 | | // are called in SeekImpl(). Seek(k1) positions children_[i] at the first key >= |
731 | | // k1 from level i. SeekInternalKey(k2) positions range_tombstone_iters_[j] at |
732 | | // the first range tombstone from level j with end_key() > k2. It suffices to |
733 | | // show that k1 >= k2. Since k1 and k2 are values of current_search_key where |
734 | | // k1 = k2 or k1 is value of a later current_search_key than k2, so k1 >= k2. |
735 | | // |
736 | | // Invariant (4) holds for all level >= 0. |
737 | | // By Pre-condition Invariant (4) holds for i < starting_level. |
738 | | // Since children_[i], range_tombstone_iters_[i] and contents of active_ for |
739 | | // i < starting_level do not change (4) holds for j <= i < starting_level. |
740 | | // By Pre-condition: for all j < starting_level, if j in active_, then |
741 | | // range_tombstone_iters_[j]->start_key() < target. For i >= starting_level, |
742 | | // children_[i].iter.Seek(k) is called for k >= target. So |
743 | | // children_[i].iter.key() >= target > range_tombstone_iters_[j]->start_key() |
744 | | // for j < starting_level and i >= starting_level. So invariant (4) holds for |
745 | | // j < starting_level and i >= starting_level. |
746 | | // For starting_level <= j <= i, j is added to active_ only if |
747 | | // - range_tombstone_iters_[j]->SeekInternalKey(k1) was called |
748 | | // - range_tombstone_iters_[j]->start_key() <= k1 |
749 | | // Since children_[i].iter.Seek(k2) is called for some k2 >= k1 and for all |
750 | | // starting_level <= j <= i, (4) also holds for all starting_level <= j <= i. |
751 | | // |
752 | | // Post-condition (*): target <= children_[i].iter.key() <= LevelNextVisible(i, |
753 | | // target) for i >= starting_level. |
754 | | // target <= children_[i].iter.key() follows from that Seek() is called on some |
755 | | // current_search_key >= target for children_[i].iter. If current_search_key |
756 | | // is updated from k1 to k2 when level = i, we show that the range [k1, k2) is |
757 | | // not visible for children_[j] for any j > i. When current_search_key is |
758 | | // updated from k1 to k2, |
759 | | // - range_tombstone_iters_[i]->SeekInternalKey(k1) was called |
760 | | // - range_tombstone_iters_[i]->Valid() |
761 | | // - range_tombstone_iters_[i]->start_key().user_key <= k1.user_key |
762 | | // - k2 = range_tombstone_iters_[i]->end_key() |
763 | | // We assume that range_tombstone_iters_[i]->start_key() has a higher sequence |
764 | | // number compared to any key from levels > i that has the same user key. So no |
765 | | // point key from levels > i in range [k1, k2) is visible. So |
766 | | // children_[i].iter.key() <= LevelNextVisible(i, target). |
767 | | // |
768 | | // Post-condition (**) target < pinned_heap_item_[i].tombstone_pik for i >= |
769 | | // starting_level if range_tombstone_iters_[i].Valid(). This follows from that |
770 | | // SeekInternalKey() being called for each range_tombstone_iters_ with some key |
771 | | // >= `target` and that we pick start/end key that is > `target` to insert to |
772 | | // minHeap_. |
773 | | void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, |
774 | 668 | bool range_tombstone_reseek) { |
775 | | // active range tombstones before `starting_level` remain active |
776 | 668 | ClearHeaps(false /* clear_active */); |
777 | 668 | ParsedInternalKey pik; |
778 | 668 | if (!range_tombstone_iters_.empty()) { |
779 | | // pik is only used in InsertRangeTombstoneToMinHeap(). |
780 | 489 | ParseInternalKey(target, &pik, false).PermitUncheckedError(); |
781 | 489 | } |
782 | | |
783 | | // TODO: perhaps we could save some upheap cost by add all child iters first |
784 | | // and then do a single heapify. |
785 | | // Invariant(children_) for level < starting_level |
786 | 668 | for (size_t level = 0; level < starting_level; ++level) { |
787 | 0 | PERF_TIMER_GUARD(seek_min_heap_time); |
788 | 0 | AddToMinHeapOrCheckStatus(&children_[level]); |
789 | 0 | } |
790 | 668 | if (!range_tombstone_iters_.empty()) { |
791 | | // Add range tombstones from levels < starting_level. We can insert from |
792 | | // pinned_heap_item_ for the following reasons: |
793 | | // - pinned_heap_item_[level] is in minHeap_ iff |
794 | | // range_tombstone_iters[level]->Valid(). |
795 | | // - If `level` is in active_, then range_tombstone_iters_[level]->Valid() |
796 | | // and pinned_heap_item_[level] is of type RANGE_DELETION_END. |
797 | 489 | for (size_t level = 0; level < starting_level; ++level) { |
798 | | // Restores Invariants(rti), (phi) and (active_) for level < |
799 | | // starting_level |
800 | 0 | if (range_tombstone_iters_[level] && |
801 | 0 | range_tombstone_iters_[level]->Valid()) { |
802 | | // use an iterator on active_ if performance becomes an issue here |
803 | 0 | if (active_.count(level) > 0) { |
804 | 0 | assert(pinned_heap_item_[level].type == |
805 | 0 | HeapItem::Type::DELETE_RANGE_END); |
806 | | // if it was active, then start key must be within upper_bound, |
807 | | // so we can add to minHeap_ directly. |
808 | 0 | minHeap_.push(&pinned_heap_item_[level]); |
809 | 0 | } else { |
810 | 0 | assert(pinned_heap_item_[level].type == |
811 | 0 | HeapItem::Type::DELETE_RANGE_START); |
812 | | // this takes care of checking iterate_upper_bound, but with an extra |
813 | | // key comparison if range_tombstone_iters_[level] was already out of |
814 | | // bound. Consider using a new HeapItem type or some flag to remember |
815 | | // boundary checking result. |
816 | 0 | InsertRangeTombstoneToMinHeap(level); |
817 | 0 | } |
818 | 0 | } else { |
819 | 0 | assert(!active_.count(level)); |
820 | 0 | } |
821 | 0 | } |
822 | | // levels >= starting_level will be reseeked below, so clearing their active |
823 | | // state here. |
824 | 489 | active_.erase(active_.lower_bound(starting_level), active_.end()); |
825 | 489 | } |
826 | | |
827 | 668 | IterKey current_search_key; |
828 | 668 | current_search_key.SetInternalKey(target, false /* copy */); |
829 | | // Seek target might change to some range tombstone end key, so |
830 | | // we need to remember them for async requests. |
831 | | // (level, target) pairs |
832 | 668 | autovector<std::pair<size_t, std::string>> prefetched_target; |
833 | 2.87k | for (auto level = starting_level; level < children_.size(); ++level) { |
834 | 2.20k | { |
835 | 2.20k | PERF_TIMER_GUARD(seek_child_seek_time); |
836 | 2.20k | children_[level].iter.Seek(current_search_key.GetInternalKey()); |
837 | 2.20k | } |
838 | | |
839 | 2.20k | PERF_COUNTER_ADD(seek_child_seek_count, 1); |
840 | | |
841 | 2.20k | if (!range_tombstone_iters_.empty()) { |
842 | 1.68k | if (range_tombstone_reseek) { |
843 | | // This seek is to some range tombstone end key. |
844 | | // Should only happen when there are range tombstones. |
845 | 0 | PERF_COUNTER_ADD(internal_range_del_reseek_count, 1); |
846 | 0 | } |
847 | 1.68k | if (children_[level].iter.status().IsTryAgain()) { |
848 | 0 | prefetched_target.emplace_back( |
849 | 0 | level, current_search_key.GetInternalKey().ToString()); |
850 | 0 | } |
851 | 1.68k | UnownedPtr<TruncatedRangeDelIterator> range_tombstone_iter = |
852 | 1.68k | range_tombstone_iters_[level].get(); |
853 | 1.68k | if (range_tombstone_iter) { |
854 | 0 | range_tombstone_iter->SeekInternalKey( |
855 | 0 | current_search_key.GetInternalKey()); |
856 | | // Invariants (rti) and (phi) |
857 | 0 | if (range_tombstone_iter->Valid()) { |
858 | | // If range tombstone starts after `current_search_key`, |
859 | | // we should insert start key to heap as the range tombstone is not |
860 | | // active yet. |
861 | 0 | InsertRangeTombstoneToMinHeap( |
862 | 0 | level, comparator_->Compare(range_tombstone_iter->start_key(), |
863 | 0 | pik) > 0 /* start_key */); |
864 | | // current_search_key < end_key guaranteed by the SeekInternalKey() |
865 | | // and Valid() calls above. Here we only need to compare user_key |
866 | | // since if target.user_key == |
867 | | // range_tombstone_iter->start_key().user_key and target < |
868 | | // range_tombstone_iter->start_key(), no older level would have any |
869 | | // key in range [target, range_tombstone_iter->start_key()], so no |
870 | | // keys in range [target, range_tombstone_iter->end_key()) from older |
871 | | // level would be visible. So it is safe to seek to |
872 | | // range_tombstone_iter->end_key(). |
873 | | // |
874 | | // TODO: range_tombstone_iter->Seek() finds the max covering |
875 | | // sequence number, can make it cheaper by not looking for max. |
876 | 0 | if (comparator_->user_comparator()->Compare( |
877 | 0 | range_tombstone_iter->start_key().user_key, |
878 | 0 | current_search_key.GetUserKey()) <= 0) { |
879 | 0 | range_tombstone_reseek = true; |
880 | | // Note that for prefix seek case, it is possible that the prefix |
881 | | // is not the same as the original target, it should not affect |
882 | | // correctness. Besides, in most cases, range tombstone start and |
883 | | // end key should have the same prefix? |
884 | 0 | current_search_key.SetInternalKey(range_tombstone_iter->end_key()); |
885 | 0 | } |
886 | 0 | } |
887 | 0 | } |
888 | 1.68k | } |
889 | | // child.iter.status() is set to Status::TryAgain indicating asynchronous |
890 | | // request for retrieval of data blocks has been submitted. So it should |
891 | | // return at this point and Seek should be called again to retrieve the |
892 | | // requested block and add the child to min heap. |
893 | 2.20k | if (children_[level].iter.status().IsTryAgain()) { |
894 | 0 | continue; |
895 | 0 | } |
896 | 2.20k | { |
897 | | // Strictly, we timed slightly more than min heap operation, |
898 | | // but these operations are very cheap. |
899 | 2.20k | PERF_TIMER_GUARD(seek_min_heap_time); |
900 | 2.20k | AddToMinHeapOrCheckStatus(&children_[level]); |
901 | 2.20k | } |
902 | 2.20k | } |
903 | | |
904 | 668 | if (range_tombstone_iters_.empty()) { |
905 | 517 | for (auto& child : children_) { |
906 | 517 | if (child.iter.status().IsTryAgain()) { |
907 | 0 | child.iter.Seek(target); |
908 | 0 | { |
909 | 0 | PERF_TIMER_GUARD(seek_min_heap_time); |
910 | 0 | AddToMinHeapOrCheckStatus(&child); |
911 | 0 | } |
912 | 0 | PERF_COUNTER_ADD(number_async_seek, 1); |
913 | 0 | } |
914 | 517 | } |
915 | 489 | } else { |
916 | 489 | for (auto& prefetch : prefetched_target) { |
917 | | // (level, target) pairs |
918 | 0 | children_[prefetch.first].iter.Seek(prefetch.second); |
919 | 0 | { |
920 | 0 | PERF_TIMER_GUARD(seek_min_heap_time); |
921 | 0 | AddToMinHeapOrCheckStatus(&children_[prefetch.first]); |
922 | 0 | } |
923 | 0 | PERF_COUNTER_ADD(number_async_seek, 1); |
924 | 0 | } |
925 | 489 | } |
926 | 668 | } |
927 | | |
928 | | // Returns true iff the current key (min heap top) should not be returned |
929 | | // to user (of the merging iterator). This can be because the current key |
930 | | // is deleted by some range tombstone, the current key is some fake file |
931 | | // boundary sentinel key, or the current key is an end point of a range |
932 | | // tombstone. Advance the iterator at heap top if needed. Heap order is restored |
933 | | // and `active_` is updated accordingly. |
934 | | // See FindNextVisibleKey() for more detail on internal implementation |
935 | | // of advancing child iters. |
936 | | // When false is returned, if minHeap is not empty, then minHeap_.top().type |
937 | | // == ITERATOR |
938 | | // |
939 | | // REQUIRES: |
940 | | // - min heap is currently not empty, and iter is in kForward direction. |
941 | | // - minHeap_ top is not DELETE_RANGE_START (so that `active_` is current). |
942 | 83.7k | bool MergingIterator::SkipNextDeleted() { |
943 | | // 3 types of keys: |
944 | | // - point key |
945 | | // - file boundary sentinel keys |
946 | | // - range deletion end key |
947 | 83.7k | auto current = minHeap_.top(); |
948 | 83.7k | if (current->type == HeapItem::Type::DELETE_RANGE_END) { |
949 | | // Invariant(active_): range_tombstone_iters_[current->level] is about to |
950 | | // become !Valid() or that its start key is going to be added to minHeap_. |
951 | 71.3k | active_.erase(current->level); |
952 | 71.3k | assert(range_tombstone_iters_[current->level] && |
953 | 71.3k | range_tombstone_iters_[current->level]->Valid()); |
954 | 71.3k | range_tombstone_iters_[current->level]->Next(); |
955 | | // Maintain Invariants (rti) and (phi) |
956 | 71.3k | if (range_tombstone_iters_[current->level]->Valid()) { |
957 | 68.8k | InsertRangeTombstoneToMinHeap(current->level, true /* start_key */, |
958 | 68.8k | true /* replace_top */); |
959 | 68.8k | } else { |
960 | | // TruncatedRangeDelIterator does not have status |
961 | 2.55k | minHeap_.pop(); |
962 | 2.55k | } |
963 | 71.3k | return true /* current key deleted */; |
964 | 71.3k | } |
965 | 12.3k | if (current->iter.IsDeleteRangeSentinelKey()) { |
966 | | // If the file boundary is defined by a range deletion, the range |
967 | | // tombstone's end key must come before this sentinel key (see op_type in |
968 | | // SetTombstoneKey()). |
969 | 3.16k | assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion || |
970 | 3.16k | active_.count(current->level) == 0); |
971 | | // When entering a new file, range tombstone iter from the old file is |
972 | | // freed, but the last key from that range tombstone iter may still be in |
973 | | // the heap. We need to ensure the data underlying its corresponding key |
974 | | // Slice is still alive. We do so by popping the range tombstone key from |
975 | | // heap before calling iter->Next(). Technically, this change is not needed: |
976 | | // if there is a range tombstone end key that is after file boundary |
977 | | // sentinel key in minHeap_, the range tombstone end key must have been |
978 | | // truncated at file boundary. The underlying data of the range tombstone |
979 | | // end key Slice is the SST file's largest internal key stored as file |
980 | | // metadata in Version. However, since there are too many implicit |
981 | | // assumptions made, it is safer to just ensure range tombstone iter is |
982 | | // still alive. |
983 | 3.16k | minHeap_.pop(); |
984 | | // Remove last SST file's range tombstone end key if there is one. |
985 | | // This means file boundary is before range tombstone end key, |
986 | | // which could happen when a range tombstone and a user key |
987 | | // straddle two SST files. Note that in TruncatedRangeDelIterator |
988 | | // constructor, parsed_largest.sequence is decremented 1 in this case. |
989 | | // Maintains Invariant(rti) that at most one |
990 | | // pinned_heap_item_[current->level] is in minHeap_. |
991 | 3.16k | if (range_tombstone_iters_[current->level] && |
992 | 0 | range_tombstone_iters_[current->level]->Valid()) { |
993 | 0 | if (!minHeap_.empty() && minHeap_.top()->level == current->level) { |
994 | 0 | assert(minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_END); |
995 | 0 | minHeap_.pop(); |
996 | | // Invariant(active_): we are about to enter a new SST file with new |
997 | | // range_tombstone_iters[current->level]. Either it is !Valid() or its |
998 | | // start key is going to be added to minHeap_. |
999 | 0 | active_.erase(current->level); |
1000 | 0 | } else { |
1001 | | // range tombstone is still valid, but it is not on heap. |
1002 | | // This should only happen if the range tombstone is over iterator |
1003 | | // upper bound. |
1004 | 0 | assert(iterate_upper_bound_ && |
1005 | 0 | comparator_->user_comparator()->CompareWithoutTimestamp( |
1006 | 0 | range_tombstone_iters_[current->level]->start_key().user_key, |
1007 | 0 | true /* a_has_ts */, *iterate_upper_bound_, |
1008 | 0 | false /* b_has_ts */) >= 0); |
1009 | 0 | } |
1010 | 0 | } |
1011 | | // LevelIterator enters a new SST file |
1012 | 3.16k | current->iter.Next(); |
1013 | | // Invariant(children_): current is popped from heap and added back only if |
1014 | | // it is valid |
1015 | 3.16k | if (current->iter.Valid()) { |
1016 | 1.56k | assert(current->iter.status().ok()); |
1017 | 1.56k | minHeap_.push(current); |
1018 | 1.59k | } else { |
1019 | | // TODO(cbi): check status and early return if non-ok. |
1020 | 1.59k | considerStatus(current->iter.status()); |
1021 | 1.59k | } |
1022 | | // Invariants (rti) and (phi) |
1023 | 3.16k | if (range_tombstone_iters_[current->level] && |
1024 | 0 | range_tombstone_iters_[current->level]->Valid()) { |
1025 | 0 | InsertRangeTombstoneToMinHeap(current->level); |
1026 | 0 | } |
1027 | 3.16k | return true /* current key deleted */; |
1028 | 3.16k | } |
1029 | 12.3k | assert(current->type == HeapItem::Type::ITERATOR); |
1030 | | // Point key case: check active_ for range tombstone coverage. |
1031 | 9.18k | ParsedInternalKey pik; |
1032 | 9.18k | ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); |
1033 | 9.18k | if (!active_.empty()) { |
1034 | 9.18k | auto i = *active_.begin(); |
1035 | 9.18k | if (i < current->level) { |
1036 | | // range tombstone is from a newer level, definitely covers |
1037 | 0 | assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), |
1038 | 0 | pik) <= 0); |
1039 | 0 | assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < |
1040 | 0 | 0); |
1041 | 0 | std::string target; |
1042 | 0 | AppendInternalKey(&target, range_tombstone_iters_[i]->end_key()); |
1043 | 0 | SeekImpl(target, current->level, true); |
1044 | 0 | return true /* current key deleted */; |
1045 | 9.18k | } else if (i == current->level) { |
1046 | | // range tombstone is from the same level as current, check sequence |
1047 | | // number. By `active_` we know current key is between start key and end |
1048 | | // key. |
1049 | 9.18k | assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), |
1050 | 9.18k | pik) <= 0); |
1051 | 9.18k | assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < |
1052 | 9.18k | 0); |
1053 | 9.18k | if (pik.sequence < range_tombstone_iters_[current->level]->seq()) { |
1054 | | // covered by range tombstone |
1055 | 0 | current->iter.Next(); |
1056 | | // Invariant (children_) |
1057 | 0 | if (current->iter.Valid()) { |
1058 | 0 | minHeap_.replace_top(current); |
1059 | 0 | } else { |
1060 | 0 | considerStatus(current->iter.status()); |
1061 | 0 | minHeap_.pop(); |
1062 | 0 | } |
1063 | 0 | return true /* current key deleted */; |
1064 | 9.18k | } else { |
1065 | 9.18k | return false /* current key not deleted */; |
1066 | 9.18k | } |
1067 | 9.18k | } else { |
1068 | 0 | return false /* current key not deleted */; |
1069 | | // range tombstone from an older sorted run with current key < end key. |
1070 | | // current key is not deleted and the older sorted run will have its range |
1071 | | // tombstone updated when the range tombstone's end key are popped from |
1072 | | // minHeap_. |
1073 | 0 | } |
1074 | 9.18k | } |
1075 | | // we can reach here only if active_ is empty |
1076 | 9.18k | assert(active_.empty()); |
1077 | 0 | assert(minHeap_.top()->type == HeapItem::Type::ITERATOR); |
1078 | 0 | return false /* current key not deleted */; |
1079 | 9.18k | } |
1080 | | |
1081 | | void MergingIterator::SeekForPrevImpl(const Slice& target, |
1082 | | size_t starting_level, |
1083 | 3.46k | bool range_tombstone_reseek) { |
1084 | | // active range tombstones before `starting_level` remain active |
1085 | 3.46k | ClearHeaps(false /* clear_active */); |
1086 | 3.46k | InitMaxHeap(); |
1087 | 3.46k | ParsedInternalKey pik; |
1088 | 3.46k | if (!range_tombstone_iters_.empty()) { |
1089 | 2.27k | ParseInternalKey(target, &pik, false).PermitUncheckedError(); |
1090 | 2.27k | } |
1091 | 3.46k | for (size_t level = 0; level < starting_level; ++level) { |
1092 | 0 | PERF_TIMER_GUARD(seek_max_heap_time); |
1093 | 0 | AddToMaxHeapOrCheckStatus(&children_[level]); |
1094 | 0 | } |
1095 | 3.46k | if (!range_tombstone_iters_.empty()) { |
1096 | | // Add range tombstones before starting_level. |
1097 | 2.27k | for (size_t level = 0; level < starting_level; ++level) { |
1098 | 0 | if (range_tombstone_iters_[level] && |
1099 | 0 | range_tombstone_iters_[level]->Valid()) { |
1100 | 0 | assert(static_cast<bool>(active_.count(level)) == |
1101 | 0 | (pinned_heap_item_[level].type == |
1102 | 0 | HeapItem::Type::DELETE_RANGE_START)); |
1103 | 0 | maxHeap_->push(&pinned_heap_item_[level]); |
1104 | 0 | } else { |
1105 | 0 | assert(!active_.count(level)); |
1106 | 0 | } |
1107 | 0 | } |
1108 | | // levels >= starting_level will be reseeked below, |
1109 | 2.27k | active_.erase(active_.lower_bound(starting_level), active_.end()); |
1110 | 2.27k | } |
1111 | | |
1112 | 3.46k | IterKey current_search_key; |
1113 | 3.46k | current_search_key.SetInternalKey(target, false /* copy */); |
1114 | | // Seek target might change to some range tombstone end key, so |
1115 | | // we need to remember them for async requests. |
1116 | | // (level, target) pairs |
1117 | 3.46k | autovector<std::pair<size_t, std::string>> prefetched_target; |
1118 | 14.3k | for (auto level = starting_level; level < children_.size(); ++level) { |
1119 | 10.8k | { |
1120 | 10.8k | PERF_TIMER_GUARD(seek_child_seek_time); |
1121 | 10.8k | children_[level].iter.SeekForPrev(current_search_key.GetInternalKey()); |
1122 | 10.8k | } |
1123 | | |
1124 | 10.8k | PERF_COUNTER_ADD(seek_child_seek_count, 1); |
1125 | | |
1126 | 10.8k | if (!range_tombstone_iters_.empty()) { |
1127 | 7.16k | if (range_tombstone_reseek) { |
1128 | | // This seek is to some range tombstone end key. |
1129 | | // Should only happen when there are range tombstones. |
1130 | 0 | PERF_COUNTER_ADD(internal_range_del_reseek_count, 1); |
1131 | 0 | } |
1132 | 7.16k | if (children_[level].iter.status().IsTryAgain()) { |
1133 | 0 | prefetched_target.emplace_back( |
1134 | 0 | level, current_search_key.GetInternalKey().ToString()); |
1135 | 0 | } |
1136 | 7.16k | UnownedPtr<TruncatedRangeDelIterator> range_tombstone_iter = |
1137 | 7.16k | range_tombstone_iters_[level].get(); |
1138 | 7.16k | if (range_tombstone_iter) { |
1139 | 0 | range_tombstone_iter->SeekForPrev(current_search_key.GetUserKey()); |
1140 | 0 | if (range_tombstone_iter->Valid()) { |
1141 | 0 | InsertRangeTombstoneToMaxHeap( |
1142 | 0 | level, comparator_->Compare(range_tombstone_iter->end_key(), |
1143 | 0 | pik) <= 0 /* end_key */); |
1144 | | // start key <= current_search_key guaranteed by the Seek() call above |
1145 | | // Only interested in user key coverage since older sorted runs must |
1146 | | // have smaller sequence numbers than this tombstone. |
1147 | 0 | if (comparator_->user_comparator()->Compare( |
1148 | 0 | current_search_key.GetUserKey(), |
1149 | 0 | range_tombstone_iter->end_key().user_key) < 0) { |
1150 | 0 | range_tombstone_reseek = true; |
1151 | 0 | current_search_key.SetInternalKey( |
1152 | 0 | range_tombstone_iter->start_key().user_key, kMaxSequenceNumber, |
1153 | 0 | kValueTypeForSeekForPrev); |
1154 | 0 | } |
1155 | 0 | } |
1156 | 0 | } |
1157 | 7.16k | } |
1158 | | // child.iter.status() is set to Status::TryAgain indicating asynchronous |
1159 | | // request for retrieval of data blocks has been submitted. So it should |
1160 | | // return at this point and Seek should be called again to retrieve the |
1161 | | // requested block and add the child to min heap. |
1162 | 10.8k | if (children_[level].iter.status().IsTryAgain()) { |
1163 | 0 | continue; |
1164 | 0 | } |
1165 | 10.8k | { |
1166 | | // Strictly, we timed slightly more than min heap operation, |
1167 | | // but these operations are very cheap. |
1168 | 10.8k | PERF_TIMER_GUARD(seek_max_heap_time); |
1169 | 10.8k | AddToMaxHeapOrCheckStatus(&children_[level]); |
1170 | 10.8k | } |
1171 | 10.8k | } |
1172 | | |
1173 | 3.46k | if (range_tombstone_iters_.empty()) { |
1174 | 3.69k | for (auto& child : children_) { |
1175 | 3.69k | if (child.iter.status().IsTryAgain()) { |
1176 | 0 | child.iter.SeekForPrev(target); |
1177 | 0 | { |
1178 | 0 | PERF_TIMER_GUARD(seek_min_heap_time); |
1179 | 0 | AddToMaxHeapOrCheckStatus(&child); |
1180 | 0 | } |
1181 | 0 | PERF_COUNTER_ADD(number_async_seek, 1); |
1182 | 0 | } |
1183 | 3.69k | } |
1184 | 2.27k | } else { |
1185 | 2.27k | for (auto& prefetch : prefetched_target) { |
1186 | | // (level, target) pairs |
1187 | 0 | children_[prefetch.first].iter.SeekForPrev(prefetch.second); |
1188 | 0 | { |
1189 | 0 | PERF_TIMER_GUARD(seek_max_heap_time); |
1190 | 0 | AddToMaxHeapOrCheckStatus(&children_[prefetch.first]); |
1191 | 0 | } |
1192 | 0 | PERF_COUNTER_ADD(number_async_seek, 1); |
1193 | 0 | } |
1194 | 2.27k | } |
1195 | 3.46k | } |
1196 | | |
1197 | | // See more in comments above SkipNextDeleted(). |
1198 | | // REQUIRES: |
1199 | | // - max heap is currently not empty, and iter is in kReverse direction. |
1200 | | // - maxHeap_ top is not DELETE_RANGE_END (so that `active_` is current). |
1201 | 3.96k | bool MergingIterator::SkipPrevDeleted() { |
1202 | | // 3 types of keys: |
1203 | | // - point key |
1204 | | // - file boundary sentinel keys |
1205 | | // - range deletion start key |
1206 | 3.96k | auto current = maxHeap_->top(); |
1207 | 3.96k | if (current->type == HeapItem::Type::DELETE_RANGE_START) { |
1208 | 0 | active_.erase(current->level); |
1209 | 0 | assert(range_tombstone_iters_[current->level] && |
1210 | 0 | range_tombstone_iters_[current->level]->Valid()); |
1211 | 0 | range_tombstone_iters_[current->level]->Prev(); |
1212 | 0 | if (range_tombstone_iters_[current->level]->Valid()) { |
1213 | 0 | InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */, |
1214 | 0 | true /* replace_top */); |
1215 | 0 | } else { |
1216 | 0 | maxHeap_->pop(); |
1217 | 0 | } |
1218 | 0 | return true /* current key deleted */; |
1219 | 0 | } |
1220 | 3.96k | if (current->iter.IsDeleteRangeSentinelKey()) { |
1221 | | // LevelIterator enters a new SST file |
1222 | 3.96k | maxHeap_->pop(); |
1223 | | // Remove last SST file's range tombstone key if there is one. |
1224 | 3.96k | if (!maxHeap_->empty() && maxHeap_->top()->level == current->level && |
1225 | 0 | maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_START) { |
1226 | 0 | maxHeap_->pop(); |
1227 | 0 | active_.erase(current->level); |
1228 | 0 | } |
1229 | 3.96k | current->iter.Prev(); |
1230 | 3.96k | if (current->iter.Valid()) { |
1231 | 2.01k | assert(current->iter.status().ok()); |
1232 | 2.01k | maxHeap_->push(current); |
1233 | 2.01k | } else { |
1234 | 1.95k | considerStatus(current->iter.status()); |
1235 | 1.95k | } |
1236 | | |
1237 | 3.96k | if (range_tombstone_iters_[current->level] && |
1238 | 0 | range_tombstone_iters_[current->level]->Valid()) { |
1239 | 0 | InsertRangeTombstoneToMaxHeap(current->level); |
1240 | 0 | } |
1241 | 3.96k | return true /* current key deleted */; |
1242 | 3.96k | } |
1243 | 3.96k | assert(current->type == HeapItem::Type::ITERATOR); |
1244 | | // Point key case: check active_ for range tombstone coverage. |
1245 | 0 | ParsedInternalKey pik; |
1246 | 0 | ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); |
1247 | 0 | if (!active_.empty()) { |
1248 | 0 | auto i = *active_.begin(); |
1249 | 0 | if (i < current->level) { |
1250 | | // range tombstone is from a newer level, definitely covers |
1251 | 0 | assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), |
1252 | 0 | pik) <= 0); |
1253 | 0 | assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < |
1254 | 0 | 0); |
1255 | 0 | std::string target; |
1256 | 0 | AppendInternalKey(&target, range_tombstone_iters_[i]->start_key()); |
1257 | | // This is different from SkipNextDeleted() which does reseek at sorted |
1258 | | // runs >= level (instead of i+1 here). With min heap, if level L is at |
1259 | | // top of the heap, then levels <L all have internal keys > level L's |
1260 | | // current internal key, which means levels <L are already at a different |
1261 | | // user key. With max heap, if level L is at top of the heap, then levels |
1262 | | // <L all have internal keys smaller than level L's current internal key, |
1263 | | // which might still be the same user key. |
1264 | 0 | SeekForPrevImpl(target, i + 1, true); |
1265 | 0 | return true /* current key deleted */; |
1266 | 0 | } else if (i == current->level) { |
1267 | | // By `active_` we know current key is between start key and end key. |
1268 | 0 | assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), |
1269 | 0 | pik) <= 0); |
1270 | 0 | assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < |
1271 | 0 | 0); |
1272 | 0 | if (pik.sequence < range_tombstone_iters_[current->level]->seq()) { |
1273 | 0 | current->iter.Prev(); |
1274 | 0 | if (current->iter.Valid()) { |
1275 | 0 | maxHeap_->replace_top(current); |
1276 | 0 | } else { |
1277 | 0 | considerStatus(current->iter.status()); |
1278 | 0 | maxHeap_->pop(); |
1279 | 0 | } |
1280 | 0 | return true /* current key deleted */; |
1281 | 0 | } else { |
1282 | 0 | return false /* current key not deleted */; |
1283 | 0 | } |
1284 | 0 | } else { |
1285 | 0 | return false /* current key not deleted */; |
1286 | 0 | } |
1287 | 0 | } |
1288 | | |
1289 | 0 | assert(active_.empty()); |
1290 | 0 | assert(maxHeap_->top()->type == HeapItem::Type::ITERATOR); |
1291 | 0 | return false /* current key not deleted */; |
1292 | 0 | } |
1293 | | |
1294 | 18.3k | void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) { |
1295 | | // Invariant(children_) |
1296 | 18.3k | if (child->iter.Valid()) { |
1297 | 11.9k | assert(child->iter.status().ok()); |
1298 | 11.9k | minHeap_.push(child); |
1299 | 11.9k | } else { |
1300 | 6.42k | considerStatus(child->iter.status()); |
1301 | 6.42k | } |
1302 | 18.3k | } |
1303 | | |
1304 | 12.6k | void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) { |
1305 | 12.6k | if (child->iter.Valid()) { |
1306 | 7.50k | assert(child->iter.status().ok()); |
1307 | 7.50k | maxHeap_->push(child); |
1308 | 7.50k | } else { |
1309 | 5.11k | considerStatus(child->iter.status()); |
1310 | 5.11k | } |
1311 | 12.6k | } |
1312 | | |
1313 | | // Advance all non current_ child to > current_.key(). |
1314 | | // We advance current_ after the this function call as it does not require |
1315 | | // Seek(). |
1316 | | // Advance all range tombstones iters, including the one corresponding to |
1317 | | // current_, to the first tombstone with end_key > current_.key(). |
1318 | | // TODO: potentially do cascading seek here too |
1319 | | // TODO: show that invariants hold |
1320 | 0 | void MergingIterator::SwitchToForward() { |
1321 | 0 | ClearHeaps(); |
1322 | 0 | Slice target = key(); |
1323 | 0 | for (auto& child : children_) { |
1324 | 0 | if (&child.iter != current_) { |
1325 | 0 | child.iter.Seek(target); |
1326 | | // child.iter.status() is set to Status::TryAgain indicating asynchronous |
1327 | | // request for retrieval of data blocks has been submitted. So it should |
1328 | | // return at this point and Seek should be called again to retrieve the |
1329 | | // requested block and add the child to min heap. |
1330 | 0 | if (child.iter.status() == Status::TryAgain()) { |
1331 | 0 | continue; |
1332 | 0 | } |
1333 | 0 | if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { |
1334 | 0 | assert(child.iter.status().ok()); |
1335 | 0 | child.iter.Next(); |
1336 | 0 | } |
1337 | 0 | } |
1338 | 0 | AddToMinHeapOrCheckStatus(&child); |
1339 | 0 | } |
1340 | |
|
1341 | 0 | for (auto& child : children_) { |
1342 | 0 | if (child.iter.status() == Status::TryAgain()) { |
1343 | 0 | child.iter.Seek(target); |
1344 | 0 | if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { |
1345 | 0 | assert(child.iter.status().ok()); |
1346 | 0 | child.iter.Next(); |
1347 | 0 | } |
1348 | 0 | AddToMinHeapOrCheckStatus(&child); |
1349 | 0 | } |
1350 | 0 | } |
1351 | | |
1352 | | // Current range tombstone iter also needs to seek for the following case: |
1353 | | // Previous direction is backward, so range tombstone iter may point to a |
1354 | | // tombstone before current_. If there is no such tombstone, then the range |
1355 | | // tombstone iter is !Valid(). Need to reseek here to make it valid again. |
1356 | 0 | if (!range_tombstone_iters_.empty()) { |
1357 | 0 | ParsedInternalKey pik; |
1358 | 0 | ParseInternalKey(target, &pik, false /* log_err_key */) |
1359 | 0 | .PermitUncheckedError(); |
1360 | 0 | for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { |
1361 | 0 | UnownedPtr<TruncatedRangeDelIterator> iter = |
1362 | 0 | range_tombstone_iters_[i].get(); |
1363 | 0 | if (iter) { |
1364 | 0 | iter->Seek(pik.user_key); |
1365 | | // The while loop is needed as the Seek() call above is only for user |
1366 | | // key. We could have a range tombstone with end_key covering user_key, |
1367 | | // but still is smaller than target. This happens when the range |
1368 | | // tombstone is truncated at iter.largest_. |
1369 | 0 | while (iter->Valid() && |
1370 | 0 | comparator_->Compare(iter->end_key(), pik) <= 0) { |
1371 | 0 | iter->Next(); |
1372 | 0 | } |
1373 | 0 | if (range_tombstone_iters_[i]->Valid()) { |
1374 | 0 | InsertRangeTombstoneToMinHeap( |
1375 | 0 | i, comparator_->Compare(range_tombstone_iters_[i]->start_key(), |
1376 | 0 | pik) > 0 /* start_key */); |
1377 | 0 | } |
1378 | 0 | } |
1379 | 0 | } |
1380 | 0 | } |
1381 | |
|
1382 | 0 | direction_ = kForward; |
1383 | 0 | assert(current_ == CurrentForward()); |
1384 | 0 | } |
1385 | | |
1386 | | // Advance all range tombstones iters, including the one corresponding to |
1387 | | // current_, to the first tombstone with start_key <= current_.key(). |
1388 | 506 | void MergingIterator::SwitchToBackward() { |
1389 | 506 | ClearHeaps(); |
1390 | 506 | InitMaxHeap(); |
1391 | 506 | Slice target = key(); |
1392 | 1.76k | for (auto& child : children_) { |
1393 | 1.76k | if (&child.iter != current_) { |
1394 | 1.25k | child.iter.SeekForPrev(target); |
1395 | 1.25k | TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); |
1396 | 1.25k | if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { |
1397 | 0 | assert(child.iter.status().ok()); |
1398 | 0 | child.iter.Prev(); |
1399 | 0 | } |
1400 | 1.25k | } |
1401 | 1.76k | AddToMaxHeapOrCheckStatus(&child); |
1402 | 1.76k | } |
1403 | | |
1404 | 506 | ParsedInternalKey pik; |
1405 | 506 | ParseInternalKey(target, &pik, false /* log_err_key */) |
1406 | 506 | .PermitUncheckedError(); |
1407 | 1.86k | for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { |
1408 | 1.36k | UnownedPtr<TruncatedRangeDelIterator> iter = |
1409 | 1.36k | range_tombstone_iters_[i].get(); |
1410 | 1.36k | if (iter) { |
1411 | 0 | iter->SeekForPrev(pik.user_key); |
1412 | | // Since the SeekForPrev() call above is only for user key, |
1413 | | // we may end up with some range tombstone with start key having the |
1414 | | // same user key at current_, but with a smaller sequence number. This |
1415 | | // makes current_ not at maxHeap_ top for the CurrentReverse() call |
1416 | | // below. If there is a range tombstone start key with the same user |
1417 | | // key and the same sequence number as current_.key(), it will be fine as |
1418 | | // in InsertRangeTombstoneToMaxHeap() we change op_type to be the smallest |
1419 | | // op_type. |
1420 | 0 | while (iter->Valid() && |
1421 | 0 | comparator_->Compare(iter->start_key(), pik) > 0) { |
1422 | 0 | iter->Prev(); |
1423 | 0 | } |
1424 | 0 | if (iter->Valid()) { |
1425 | 0 | InsertRangeTombstoneToMaxHeap( |
1426 | 0 | i, comparator_->Compare(range_tombstone_iters_[i]->end_key(), |
1427 | 0 | pik) <= 0 /* end_key */); |
1428 | 0 | } |
1429 | 0 | } |
1430 | 1.36k | } |
1431 | | |
1432 | 506 | direction_ = kReverse; |
1433 | 506 | if (!prefix_seek_mode_) { |
1434 | | // Note that we don't do assert(current_ == CurrentReverse()) here |
1435 | | // because it is possible to have some keys larger than the seek-key |
1436 | | // inserted between Seek() and SeekToLast(), which makes current_ not |
1437 | | // equal to CurrentReverse(). |
1438 | 506 | current_ = CurrentReverse(); |
1439 | 506 | } |
1440 | 506 | assert(current_ == CurrentReverse()); |
1441 | 506 | } |
1442 | | |
1443 | 11.0k | void MergingIterator::ClearHeaps(bool clear_active) { |
1444 | 11.0k | minHeap_.clear(); |
1445 | 11.0k | if (maxHeap_) { |
1446 | 1.01k | maxHeap_->clear(); |
1447 | 1.01k | } |
1448 | 11.0k | if (clear_active) { |
1449 | 6.87k | active_.clear(); |
1450 | 6.87k | } |
1451 | 11.0k | } |
1452 | | |
1453 | 3.97k | void MergingIterator::InitMaxHeap() { |
1454 | 3.97k | if (!maxHeap_) { |
1455 | 3.46k | maxHeap_ = |
1456 | 3.46k | std::make_unique<MergerMaxIterHeap>(MaxHeapItemComparator(comparator_)); |
1457 | 3.46k | } |
1458 | 3.97k | } |
1459 | | |
1460 | | // Assume there is a next key that is not covered by range tombstone. |
1461 | | // Pre-condition: |
1462 | | // - Invariants (3) and (4) |
1463 | | // - There is some k where k <= children_[i].iter.key() <= LevelNextVisible(i, |
1464 | | // k) for all levels i (LevelNextVisible() defined in Seek()). |
1465 | | // |
1466 | | // Define NextVisible(k) to be the first key >= k from among children_ that |
1467 | | // is not covered by any range tombstone. |
1468 | | // Post-condition: |
1469 | | // - Invariants (1)-(4) hold |
1470 | | // - (*): minHeap_->top()->key() == NextVisible(k) |
1471 | | // |
1472 | | // Loop invariants: |
1473 | | // - Invariants (3) and (4) |
1474 | | // - (*): k <= children_[i].iter.key() <= LevelNextVisible(i, k) |
1475 | | // |
1476 | | // Progress: minHeap_.top()->key() is non-decreasing and strictly increases in |
1477 | | // a finite number of iterations. |
1478 | | // TODO: it is possible to call SeekImpl(k2) after SeekImpl(k1) with |
1479 | | // k2 < k1 in the same FindNextVisibleKey(). For example, l1 has a range |
1480 | | // tombstone [2,3) and l2 has a range tombstone [1, 4). Point key 1 from l5 |
1481 | | // triggers SeekImpl(4 /* target */, 5). Then point key 2 from l3 triggers |
1482 | | // SeekImpl(3 /* target */, 3). |
1483 | | // Ideally we should only move iterators forward in SeekImpl(), and the |
1484 | | // progress condition can be made simpler: iterator only moves forward. |
1485 | | // |
1486 | | // Proof sketch: |
1487 | | // Post-condition: |
1488 | | // Invariant (1) holds when this method returns: |
1489 | | // Ignoring the empty minHeap_ case, there are two cases: |
1490 | | // Case 1: active_ is empty and !minHeap_.top()->iter.IsDeleteRangeSentinelKey() |
1491 | | // By invariants (rti) and (active_), active_ being empty means if a |
1492 | | // pinned_heap_item_[i] is in minHeap_, it has type DELETE_RANGE_START. Note |
1493 | | // that PopDeleteRangeStart() was called right before the while loop condition, |
1494 | | // so minHeap_.top() is not of type DELETE_RANGE_START. So minHeap_.top() must |
1495 | | // be of type ITERATOR. |
1496 | | // Case 2: SkipNextDeleted() returns false. The method returns false only when |
1497 | | // minHeap_.top().type == ITERATOR. |
1498 | | // |
1499 | | // Invariant (2) holds when this method returns: |
1500 | | // From Invariant (1), minHeap_.top().type == ITERATOR. Suppose it is |
1501 | | // children_[i] for some i. Suppose that children_[i].iter.key() is covered by |
1502 | | // some range tombstone. This means there is a j <= i and a range tombstone from |
1503 | | // level j with start_key() < children_[i].iter.key() < end_key(). |
1504 | | // - If range_tombstone_iters_[j]->Valid(), by Invariants (rti) and (phi), |
1505 | | // pinned_heap_item_[j] is in minHeap_, and pinned_heap_item_[j].tombstone_pik |
1506 | | // is either start or end key of this range tombstone. If |
1507 | | // pinned_heap_item_[j].tombstone_pik < children_[i].iter.key(), it would be at |
1508 | | // top of minHeap_ which would contradict Invariant (1). So |
1509 | | // pinned_heap_item_[j].tombstone_pik > children_[i].iter.key(). |
1510 | | // By Invariant (3), range_tombstone_iters_[j].prev.end_key() < |
1511 | | // children_[i].iter.key(). We assume that in each level, range tombstones |
1512 | | // cover non-overlapping ranges. So range_tombstone_iters_[j] is at |
1513 | | // the range tombstone with start_key() < children_[i].iter.key() < end_key() |
1514 | | // and has its end_key() in minHeap_. By Invariants (phi) and (active_), |
1515 | | // j is in active_. From while loop condition, SkipNextDeleted() must have |
1516 | | // returned false for this method to return. |
1517 | | // - If j < i, then SeekImpl(range_tombstone_iters_[j']->end_key(), i) |
1518 | | // was called for some j' < i and j' in active_. Note that since j' is in |
1519 | | // active_, pinned_heap_item_[j'] is in minHeap_ and has tombstone_pik = |
1520 | | // range_tombstone_iters_[j']->end_key(). So |
1521 | | // range_tombstone_iters_[j']->end_key() must be larger than |
1522 | | // children_[i].iter.key() to not be at top of minHeap_. This means after |
1523 | | // SeekImpl(), children_[i] would be at a key > children_[i].iter.key() |
1524 | | // -- contradiction. |
1525 | | // - If j == i, children_[i]->Next() would have been called and children_[i] |
1526 | | // would be at a key > children_[i].iter.key() -- contradiction. |
1527 | | // - If !range_tombstone_iters_[j]->Valid(). Then range_tombstone_iters_[j] |
1528 | | // points to an SST file with all range tombstones from that file exhausted. |
1529 | | // The file must come before the file containing the first |
1530 | | // range tombstone with start_key() < children_[i].iter.key() < end_key(). |
1531 | | // Assume files from same level have non-overlapping ranges, the current file's |
1532 | | // meta.largest is less than children_[i].iter.key(). So the file boundary key, |
1533 | | // which has value meta.largest must have been popped from minHeap_ before |
1534 | | // children_[i].iter.key(). So range_tombstone_iters_[j] would not point to |
1535 | | // this SST file -- contradiction. |
1536 | | // So it is impossible for children_[i].iter.key() to be covered by a range |
1537 | | // tombstone. |
1538 | | // |
1539 | | // Post-condition (*) holds when the function returns: |
1540 | | // From loop invariant (*) that k <= children_[i].iter.key() <= |
1541 | | // LevelNextVisible(i, k) and Invariant (2) above, when the function returns, |
1542 | | // minHeap_.top()->key() is the smallest LevelNextVisible(i, k) among all levels |
1543 | | // i. This is equal to NextVisible(k). |
1544 | | // |
1545 | | // Invariant (3) holds after each iteration: |
1546 | | // PopDeleteRangeStart() does not change range tombstone position. |
1547 | | // In SkipNextDeleted(): |
1548 | | // - If DELETE_RANGE_END is popped from minHeap_, it means the range |
1549 | | // tombstone's end key is < all other point keys, so it is safe to advance to |
1550 | | // next range tombstone. |
1551 | | // - If file boundary is popped (current->iter.IsDeleteRangeSentinelKey()), |
1552 | | // we assume that file's last range tombstone's |
1553 | | // end_key <= file boundary key < all other point keys. So it is safe to |
1554 | | // move to the first range tombstone in the next SST file. |
1555 | | // - If children_[i]->Next() is called, then it is fine as it is advancing a |
1556 | | // point iterator. |
1557 | | // - If SeekImpl(target, l) is called, then (3) follows from SeekImpl()'s |
1558 | | // post-condition if its pre-condition holds. First pre-condition follows |
1559 | | // from loop invariant where Invariant (3) holds for all levels i. |
1560 | | // Now we should second pre-condition holds. Since Invariant (3) holds for |
1561 | | // all i, we have for all j <= l, range_tombstone_iters_[j].prev.end_key() |
1562 | | // < children_[l].iter.key(). `target` is the value of |
1563 | | // range_tombstone_iters_[j'].end_key() for some j' < l and j' in active_. |
1564 | | // By Invariant (active_) and (rti), pinned_heap_item_[j'] is in minHeap_ and |
1565 | | // pinned_heap_item_[j'].tombstone_pik = range_tombstone_iters_[j'].end_key(). |
1566 | | // This end_key must be larger than children_[l].key() since it was not at top |
1567 | | // of minHeap_. So for all levels j <= l, |
1568 | | // range_tombstone_iters_[j].prev.end_key() < children_[l].iter.key() < target |
1569 | | // |
1570 | | // Invariant (4) holds after each iteration: |
1571 | | // A level i is inserted into active_ during calls to PopDeleteRangeStart(). |
1572 | | // In that case, range_tombstone_iters_[i].start_key() < all point keys |
1573 | | // by heap property and the assumption that point keys and range tombstone keys |
1574 | | // are distinct. |
1575 | | // If SeekImpl(target, l) is called, then there is a range_tombstone_iters_[j] |
1576 | | // where target = range_tombstone_iters_[j]->end_key() and children_[l]->key() |
1577 | | // < target. By loop invariants, (3) and (4) holds for levels. |
1578 | | // Since target > children_[l]->key(), it also holds that for j < l, |
1579 | | // range_tombstone_iters_[j].prev.end_key() < target and that if j in active_, |
1580 | | // range_tombstone_iters_[i]->start_key() < target. So all pre-conditions of |
1581 | | // SeekImpl(target, l) holds, and (4) follow from its post-condition. |
1582 | | // All other places either in this function either advance point iterators |
1583 | | // or remove some level from active_, so (4) still holds. |
1584 | | // |
1585 | | // Look Invariant (*): for all level i, k <= children_[i] <= LevelNextVisible(i, |
1586 | | // k). |
1587 | | // k <= children_[i] follows from loop `progress` condition. |
1588 | | // Consider when children_[i] is changed for any i. It is through |
1589 | | // children_[i].iter.Next() or SeekImpl() in SkipNextDeleted(). |
1590 | | // If children_[i].iter.Next() is called, there is a range tombstone from level |
1591 | | // i where tombstone seqno > children_[i].iter.key()'s seqno and i in active_. |
1592 | | // By Invariant (4), tombstone's start_key < children_[i].iter.key(). By |
1593 | | // invariants (active_), (phi), and (rti), tombstone's end_key is in minHeap_ |
1594 | | // and that children_[i].iter.key() < end_key. So children_[i].iter.key() is |
1595 | | // not visible, and it is safe to call Next(). |
1596 | | // If SeekImpl(target, l) is called, by its contract, when SeekImpl() returns, |
1597 | | // target <= children_[i]->key() <= LevelNextVisible(i, target) for i >= l, |
1598 | | // and children_[<l] is not touched. We know `target` is |
1599 | | // range_tombstone_iters_[j]->end_key() for some j < i and j is in active_. |
1600 | | // By Invariant (4), range_tombstone_iters_[j]->start_key() < |
1601 | | // children_[i].iter.key() for all i >= l. So for each level i >= l, the range |
1602 | | // [children_[i].iter.key(), target) is not visible. So after SeekImpl(), |
1603 | | // children_[i].iter.key() <= LevelNextVisible(i, target) <= |
1604 | | // LevelNextVisible(i, k). |
1605 | | // |
1606 | | // `Progress` holds for each iteration: |
1607 | | // Very sloppy intuition: |
1608 | | // - in PopDeleteRangeStart(): the value of a pinned_heap_item_.tombstone_pik_ |
1609 | | // is updated from the start key to the end key of the same range tombstone. |
1610 | | // We assume that start key <= end key for the same range tombstone. |
1611 | | // - in SkipNextDeleted() |
1612 | | // - If the top of heap is DELETE_RANGE_END, the range tombstone is advanced |
1613 | | // and the relevant pinned_heap_item_.tombstone_pik is increased or popped |
1614 | | // from minHeap_. |
1615 | | // - If the top of heap is a file boundary key, then both point iter and |
1616 | | // range tombstone iter are advanced to the next file. |
1617 | | // - If the top of heap is ITERATOR and current->iter.Next() is called, it |
1618 | | // moves to a larger point key. |
1619 | | // - If the top of heap is ITERATOR and SeekImpl(k, l) is called, then all |
1620 | | // iterators from levels >= l are advanced to some key >= k by its contract. |
1621 | | // And top of minHeap_ before SeekImpl(k, l) was less than k. |
1622 | | // There are special cases where different heap items have the same key, |
1623 | | // e.g. when two range tombstone end keys share the same value). In |
1624 | | // these cases, iterators are being advanced, so the minimum key should increase |
1625 | | // in a finite number of steps. |
1626 | 41.1k | inline void MergingIterator::FindNextVisibleKey() { |
1627 | 41.1k | PopDeleteRangeStart(); |
1628 | | // PopDeleteRangeStart() implies heap top is not DELETE_RANGE_START |
1629 | | // active_ being empty implies no DELETE_RANGE_END in heap. |
1630 | | // So minHeap_->top() must be of type ITERATOR. |
1631 | 41.1k | while ( |
1632 | 115k | !minHeap_.empty() && |
1633 | 109k | (!active_.empty() || minHeap_.top()->iter.IsDeleteRangeSentinelKey()) && |
1634 | 83.7k | SkipNextDeleted()) { |
1635 | 74.5k | PopDeleteRangeStart(); |
1636 | 74.5k | } |
1637 | | // Checks Invariant (1) |
1638 | 41.1k | assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::Type::ITERATOR); |
1639 | 41.1k | } |
1640 | | |
1641 | 16.0k | inline void MergingIterator::FindPrevVisibleKey() { |
1642 | 16.0k | PopDeleteRangeEnd(); |
1643 | | // PopDeleteRangeEnd() implies heap top is not DELETE_RANGE_END |
1644 | | // active_ being empty implies no DELETE_RANGE_START in heap. |
1645 | | // So maxHeap_->top() must be of type ITERATOR. |
1646 | 16.0k | while ( |
1647 | 19.9k | !maxHeap_->empty() && |
1648 | 16.5k | (!active_.empty() || maxHeap_->top()->iter.IsDeleteRangeSentinelKey()) && |
1649 | 3.96k | SkipPrevDeleted()) { |
1650 | 3.96k | PopDeleteRangeEnd(); |
1651 | 3.96k | } |
1652 | 16.0k | } |
1653 | | |
1654 | | InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp, |
1655 | | InternalIterator** list, int n, |
1656 | 2.09k | Arena* arena, bool prefix_seek_mode) { |
1657 | 2.09k | assert(n >= 0); |
1658 | 2.09k | if (n == 0) { |
1659 | 0 | return NewEmptyInternalIterator<Slice>(arena); |
1660 | 2.09k | } else if (n == 1) { |
1661 | 2.09k | return list[0]; |
1662 | 2.09k | } else { |
1663 | 0 | if (arena == nullptr) { |
1664 | 0 | return new MergingIterator(cmp, list, n, false, prefix_seek_mode); |
1665 | 0 | } else { |
1666 | 0 | auto mem = arena->AllocateAligned(sizeof(MergingIterator)); |
1667 | 0 | return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode); |
1668 | 0 | } |
1669 | 0 | } |
1670 | 2.09k | } |
1671 | | |
1672 | | MergeIteratorBuilder::MergeIteratorBuilder( |
1673 | | const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode, |
1674 | | const Slice* iterate_upper_bound) |
1675 | 20.3k | : first_iter(nullptr), use_merging_iter(false), arena(a) { |
1676 | 20.3k | auto mem = arena->AllocateAligned(sizeof(MergingIterator)); |
1677 | 20.3k | merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true, |
1678 | 20.3k | prefix_seek_mode, iterate_upper_bound); |
1679 | 20.3k | } |
1680 | | |
1681 | 20.3k | MergeIteratorBuilder::~MergeIteratorBuilder() { |
1682 | 20.3k | if (first_iter != nullptr) { |
1683 | 0 | first_iter->~InternalIterator(); |
1684 | 0 | } |
1685 | 20.3k | if (merge_iter != nullptr) { |
1686 | 7.00k | merge_iter->~MergingIterator(); |
1687 | 7.00k | } |
1688 | 20.3k | } |
1689 | | |
1690 | 4.40k | void MergeIteratorBuilder::AddIterator(InternalIterator* iter) { |
1691 | 4.40k | if (!use_merging_iter && first_iter != nullptr) { |
1692 | 0 | merge_iter->AddIterator(first_iter); |
1693 | 0 | use_merging_iter = true; |
1694 | 0 | first_iter = nullptr; |
1695 | 0 | } |
1696 | 4.40k | if (use_merging_iter) { |
1697 | 0 | merge_iter->AddIterator(iter); |
1698 | 4.40k | } else { |
1699 | 4.40k | first_iter = iter; |
1700 | 4.40k | } |
1701 | 4.40k | } |
1702 | | |
1703 | | void MergeIteratorBuilder::AddPointAndTombstoneIterator( |
1704 | | InternalIterator* point_iter, |
1705 | | std::unique_ptr<TruncatedRangeDelIterator>&& tombstone_iter, |
1706 | 40.7k | std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr) { |
1707 | | // tombstone_iter_ptr != nullptr means point_iter is a LevelIterator. |
1708 | 40.7k | bool add_range_tombstone = tombstone_iter || |
1709 | 38.2k | !merge_iter->range_tombstone_iters_.empty() || |
1710 | 38.2k | tombstone_iter_ptr; |
1711 | 40.7k | if (!use_merging_iter && (add_range_tombstone || first_iter)) { |
1712 | 13.2k | use_merging_iter = true; |
1713 | 13.2k | if (first_iter) { |
1714 | 13.2k | merge_iter->AddIterator(first_iter); |
1715 | 13.2k | first_iter = nullptr; |
1716 | 13.2k | } |
1717 | 13.2k | } |
1718 | 40.7k | if (use_merging_iter) { |
1719 | 24.8k | merge_iter->AddIterator(point_iter); |
1720 | 24.8k | if (add_range_tombstone) { |
1721 | | // If there was a gap, fill in nullptr as empty range tombstone iterators. |
1722 | 24.3k | while (merge_iter->range_tombstone_iters_.size() < |
1723 | 24.3k | merge_iter->children_.size() - 1) { |
1724 | 16.0k | merge_iter->AddRangeTombstoneIterator(nullptr); |
1725 | 16.0k | } |
1726 | 8.36k | merge_iter->AddRangeTombstoneIterator(std::move(tombstone_iter)); |
1727 | 8.36k | } |
1728 | | |
1729 | 24.8k | if (tombstone_iter_ptr) { |
1730 | | // This is needed instead of setting to &range_tombstone_iters_[i] |
1731 | | // directly here since the memory address of range_tombstone_iters_[i] |
1732 | | // might change during vector resizing. |
1733 | 5.81k | range_del_iter_ptrs_.emplace_back( |
1734 | 5.81k | merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr); |
1735 | 5.81k | } |
1736 | 24.8k | } else { |
1737 | 15.8k | first_iter = point_iter; |
1738 | 15.8k | } |
1739 | 40.7k | } |
1740 | | |
1741 | 20.3k | InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) { |
1742 | 20.3k | InternalIterator* ret = nullptr; |
1743 | 20.3k | if (!use_merging_iter) { |
1744 | 7.00k | ret = first_iter; |
1745 | 7.00k | first_iter = nullptr; |
1746 | 13.2k | } else { |
1747 | 13.2k | for (auto& p : range_del_iter_ptrs_) { |
1748 | 5.81k | *(p.second) = &(merge_iter->range_tombstone_iters_[p.first]); |
1749 | 5.81k | } |
1750 | 13.2k | if (db_iter && !merge_iter->range_tombstone_iters_.empty()) { |
1751 | | // memtable is always the first level |
1752 | 8.36k | db_iter->SetMemtableRangetombstoneIter( |
1753 | 8.36k | &merge_iter->range_tombstone_iters_.front()); |
1754 | 8.36k | } |
1755 | 13.2k | merge_iter->Finish(); |
1756 | 13.2k | ret = merge_iter; |
1757 | 13.2k | merge_iter = nullptr; |
1758 | 13.2k | } |
1759 | 20.3k | return ret; |
1760 | 20.3k | } |
1761 | | |
1762 | | } // namespace ROCKSDB_NAMESPACE |