Coverage Report

Created: 2025-07-23 07:17

/src/rocksdb/db/db_iter.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#include "db/db_iter.h"
11
12
#include <limits>
13
#include <string>
14
15
#include "db/dbformat.h"
16
#include "db/merge_context.h"
17
#include "db/merge_helper.h"
18
#include "db/pinned_iterators_manager.h"
19
#include "db/wide/wide_column_serialization.h"
20
#include "db/wide/wide_columns_helper.h"
21
#include "file/filename.h"
22
#include "logging/logging.h"
23
#include "memory/arena.h"
24
#include "monitoring/perf_context_imp.h"
25
#include "rocksdb/env.h"
26
#include "rocksdb/iterator.h"
27
#include "rocksdb/merge_operator.h"
28
#include "rocksdb/options.h"
29
#include "rocksdb/system_clock.h"
30
#include "table/internal_iterator.h"
31
#include "table/iterator_wrapper.h"
32
#include "trace_replay/trace_replay.h"
33
#include "util/mutexlock.h"
34
#include "util/string_util.h"
35
#include "util/user_comparator_wrapper.h"
36
37
namespace ROCKSDB_NAMESPACE {
38
39
DBIter::DBIter(Env* _env, const ReadOptions& read_options,
40
               const ImmutableOptions& ioptions,
41
               const MutableCFOptions& mutable_cf_options,
42
               const Comparator* cmp, InternalIterator* iter,
43
               const Version* version, SequenceNumber s, bool arena_mode,
44
               ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
45
               bool expose_blob_index, ReadOnlyMemTable* active_mem)
46
18.2k
    : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
47
18.2k
      env_(_env),
48
18.2k
      clock_(ioptions.clock),
49
18.2k
      logger_(ioptions.logger),
50
18.2k
      user_comparator_(cmp),
51
18.2k
      merge_operator_(ioptions.merge_operator.get()),
52
18.2k
      iter_(iter),
53
18.2k
      blob_reader_(version, read_options.read_tier,
54
18.2k
                   read_options.verify_checksums, read_options.fill_cache,
55
18.2k
                   read_options.io_activity),
56
18.2k
      read_callback_(read_callback),
57
18.2k
      sequence_(s),
58
18.2k
      statistics_(ioptions.stats),
59
18.2k
      max_skip_(mutable_cf_options.max_sequential_skip_in_iterations),
60
18.2k
      max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
61
18.2k
      num_internal_keys_skipped_(0),
62
18.2k
      iterate_lower_bound_(read_options.iterate_lower_bound),
63
18.2k
      iterate_upper_bound_(read_options.iterate_upper_bound),
64
18.2k
      cfh_(cfh),
65
18.2k
      timestamp_ub_(read_options.timestamp),
66
18.2k
      timestamp_lb_(read_options.iter_start_ts),
67
18.2k
      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0),
68
18.2k
      active_mem_(active_mem),
69
18.2k
      memtable_seqno_lb_(kMaxSequenceNumber),
70
18.2k
      memtable_op_scan_flush_trigger_(0),
71
18.2k
      avg_op_scan_flush_trigger_(0),
72
18.2k
      iter_step_since_seek_(1),
73
18.2k
      mem_hidden_op_scanned_since_seek_(0),
74
18.2k
      direction_(kForward),
75
18.2k
      valid_(false),
76
18.2k
      current_entry_is_merged_(false),
77
18.2k
      is_key_seqnum_zero_(false),
78
      prefix_same_as_start_(
79
18.2k
          prefix_extractor_ ? read_options.prefix_same_as_start : false),
80
18.2k
      pin_thru_lifetime_(read_options.pin_data),
81
18.2k
      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
82
18.2k
                                     read_options.total_order_seek ||
83
18.2k
                                     read_options.auto_prefix_mode),
84
18.2k
      expose_blob_index_(expose_blob_index),
85
18.2k
      allow_unprepared_value_(read_options.allow_unprepared_value),
86
18.2k
      is_blob_(false),
87
18.2k
      arena_mode_(arena_mode) {
88
18.2k
  RecordTick(statistics_, NO_ITERATOR_CREATED);
89
18.2k
  if (pin_thru_lifetime_) {
90
0
    pinned_iters_mgr_.StartPinning();
91
0
  }
92
18.2k
  if (iter_.iter()) {
93
0
    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
94
0
  }
95
18.2k
  status_.PermitUncheckedError();
96
18.2k
  assert(timestamp_size_ ==
97
18.2k
         user_comparator_.user_comparator()->timestamp_size());
98
  // prefix_seek_opt_in_only should force total_order_seek wherever the caller
99
  // is duplicating the original ReadOptions
100
18.2k
  assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
101
18.2k
  if (active_mem_) {
102
    // FIXME: GetEarliestSequenceNumber() may return a seqno that is one smaller
103
    // than the smallest seqno in the memtable. This violates its comment and
104
    // entries with that seqno may not be in the active memtable. Before it's
105
    // fixed, we use GetFirstSequenceNumber() for a more accurate result.
106
18.2k
    memtable_seqno_lb_ = active_mem_->IsEmpty()
107
18.2k
                             ? active_mem_->GetEarliestSequenceNumber()
108
18.2k
                             : active_mem_->GetFirstSequenceNumber();
109
18.2k
    memtable_op_scan_flush_trigger_ =
110
18.2k
        mutable_cf_options.memtable_op_scan_flush_trigger;
111
18.2k
    if (memtable_op_scan_flush_trigger_) {
112
      // avg_op_scan_flush_trigger_ requires memtable_op_scan_flush_trigger_ > 0
113
0
      avg_op_scan_flush_trigger_ =
114
0
          mutable_cf_options.memtable_avg_op_scan_flush_trigger;
115
0
    }
116
18.2k
  } else {
117
    // memtable_op_scan_flush_trigger_ and avg_op_scan_flush_trigger_ are
118
    // initialized to 0 (disabled) by default.
119
0
  }
120
18.2k
}
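The constructor above copies most ReadOptions fields (iterate bounds, pin_data, prefix_same_as_start, total_order_seek) straight into DBIter members. A minimal sketch of how a caller sets those same fields through the public DB::NewIterator API, rather than through DBIter directly; the database path is a hypothetical example and the sketch is not part of db_iter.cc.

```cpp
// Sketch only: exercises the ReadOptions fields that the DBIter constructor
// above copies into its members (upper bound, pin_data, total_order_seek).
// "/tmp/dbiter_demo" is a hypothetical example path.
#include <cassert>
#include <iostream>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/dbiter_demo", &db);
  assert(s.ok());

  db->Put(rocksdb::WriteOptions(), "a1", "v1");
  db->Put(rocksdb::WriteOptions(), "b1", "v2");

  rocksdb::ReadOptions ro;
  rocksdb::Slice upper("b");   // becomes iterate_upper_bound_
  ro.iterate_upper_bound = &upper;
  ro.pin_data = true;          // becomes pin_thru_lifetime_
  ro.total_order_seek = true;  // feeds expect_total_order_inner_iter_

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Only "a1" is printed; "b1" is at or past the upper bound.
    std::cout << it->key().ToString() << " -> " << it->value().ToString()
              << "\n";
  }
  assert(it->status().ok());

  delete db;
  return 0;
}
```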
121
122
0
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
123
0
  if (prop == nullptr) {
124
0
    return Status::InvalidArgument("prop is nullptr");
125
0
  }
126
0
  if (prop_name == "rocksdb.iterator.super-version-number") {
127
    // First try to pass the value returned from inner iterator.
128
0
    return iter_.iter()->GetProperty(prop_name, prop);
129
0
  } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
130
0
    if (valid_) {
131
0
      *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
132
0
    } else {
133
0
      *prop = "Iterator is not valid.";
134
0
    }
135
0
    return Status::OK();
136
0
  } else if (prop_name == "rocksdb.iterator.is-value-pinned") {
137
0
    if (valid_) {
138
0
      *prop = (pin_thru_lifetime_ && iter_.Valid() &&
139
0
               iter_.value().data() == value_.data())
140
0
                  ? "1"
141
0
                  : "0";
142
0
    } else {
143
0
      *prop = "Iterator is not valid.";
144
0
    }
145
0
    return Status::OK();
146
0
  } else if (prop_name == "rocksdb.iterator.internal-key") {
147
0
    *prop = saved_key_.GetUserKey().ToString();
148
0
    return Status::OK();
149
0
  } else if (prop_name == "rocksdb.iterator.write-time") {
150
0
    PutFixed64(prop, saved_write_unix_time_);
151
0
    return Status::OK();
152
0
  }
153
0
  return Status::InvalidArgument("Unidentified property.");
154
0
}
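GetProperty() above serves a fixed set of string-keyed queries about the current iterator position. A minimal sketch of querying the same property names through the public Iterator::GetProperty API, assuming `it` is an already-positioned rocksdb::Iterator*:

```cpp
// Sketch only: queries the properties handled by DBIter::GetProperty above
// via the public Iterator interface.
#include <iostream>
#include <string>

#include "rocksdb/iterator.h"

void DumpIterProperties(rocksdb::Iterator* it) {
  std::string prop;
  // "1" if the key Slice stays pinned for the iterator's lifetime
  // (requires ReadOptions::pin_data), otherwise "0".
  if (it->GetProperty("rocksdb.iterator.is-key-pinned", &prop).ok()) {
    std::cout << "is-key-pinned: " << prop << "\n";
  }
  // Super version number, forwarded to the inner iterator above.
  if (it->GetProperty("rocksdb.iterator.super-version-number", &prop).ok()) {
    std::cout << "super-version-number: " << prop << "\n";
  }
  // The user key currently saved by the iterator, as handled above.
  if (it->GetProperty("rocksdb.iterator.internal-key", &prop).ok()) {
    std::cout << "internal-key: " << prop << "\n";
  }
}
```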
155
156
122k
bool DBIter::ParseKey(ParsedInternalKey* ikey) {
157
122k
  Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */);
158
122k
  if (!s.ok()) {
159
0
    status_ = Status::Corruption("In DBIter: ", s.getState());
160
0
    valid_ = false;
161
0
    ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState());
162
0
    return false;
163
122k
  } else {
164
122k
    return true;
165
122k
  }
166
122k
}
167
168
36.8k
void DBIter::Next() {
169
36.8k
  assert(valid_);
170
36.8k
  assert(status_.ok());
171
172
36.8k
  PERF_COUNTER_ADD(iter_next_count, 1);
173
36.8k
  PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
174
  // Release temporarily pinned blocks from last operation
175
36.8k
  ReleaseTempPinnedData();
176
36.8k
  ResetBlobData();
177
36.8k
  ResetValueAndColumns();
178
36.8k
  local_stats_.skip_count_ += num_internal_keys_skipped_;
179
36.8k
  local_stats_.skip_count_--;
180
36.8k
  num_internal_keys_skipped_ = 0;
181
36.8k
  iter_step_since_seek_++;
182
36.8k
  bool ok = true;
183
36.8k
  if (direction_ == kReverse) {
184
0
    is_key_seqnum_zero_ = false;
185
0
    if (!ReverseToForward()) {
186
0
      ok = false;
187
0
    }
188
36.8k
  } else if (!current_entry_is_merged_) {
189
    // If the current value is not a merge, the iter position is the
190
    // current key, which is already returned. We can safely issue a
191
    // Next() without checking the current key.
192
    // If the current key is a merge, very likely iter already points
193
    // to the next internal position.
194
36.8k
    assert(iter_.Valid());
195
36.8k
    iter_.Next();
196
36.8k
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
197
36.8k
  }
198
199
36.8k
  local_stats_.next_count_++;
200
36.8k
  if (ok && iter_.Valid()) {
201
31.2k
    ClearSavedValue();
202
203
31.2k
    if (prefix_same_as_start_) {
204
0
      assert(prefix_extractor_ != nullptr);
205
0
      const Slice prefix = prefix_.GetUserKey();
206
0
      FindNextUserEntry(true /* skipping the current user key */, &prefix);
207
31.2k
    } else {
208
31.2k
      FindNextUserEntry(true /* skipping the current user key */, nullptr);
209
31.2k
    }
210
31.2k
  } else {
211
5.54k
    is_key_seqnum_zero_ = false;
212
5.54k
    valid_ = false;
213
5.54k
  }
214
36.8k
  if (statistics_ != nullptr && valid_) {
215
0
    local_stats_.next_found_count_++;
216
0
    local_stats_.bytes_read_ += (key().size() + value().size());
217
0
  }
218
36.8k
}
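Next() above advances past the current user key and records the outcome in valid_ and status_; since Valid() turning false can mean either end-of-range or an error, callers must check status() after the loop. A minimal forward-scan sketch using the public API, assuming `db` is an open rocksdb::DB*:

```cpp
// Sketch only: the canonical forward-scan loop that drives DBIter::Next().
#include <iostream>
#include <memory>

#include "rocksdb/db.h"

rocksdb::Status ScanAll(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  size_t bytes = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    bytes += it->key().size() + it->value().size();  // consume the entry
  }
  std::cout << "scanned " << bytes << " bytes\n";
  // Valid() == false can mean "end of range" or "error"; only status()
  // distinguishes the two, so always check it after the loop.
  return it->status();
}
```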
219
220
Status DBIter::BlobReader::RetrieveAndSetBlobValue(const Slice& user_key,
221
0
                                                   const Slice& blob_index) {
222
0
  assert(blob_value_.empty());
223
224
0
  if (!version_) {
225
0
    return Status::Corruption("Encountered unexpected blob index.");
226
0
  }
227
228
  // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
229
  // avoid having to copy options back and forth.
230
  // TODO: plumb Env::IOPriority
231
0
  ReadOptions read_options;
232
0
  read_options.read_tier = read_tier_;
233
0
  read_options.verify_checksums = verify_checksums_;
234
0
  read_options.fill_cache = fill_cache_;
235
0
  read_options.io_activity = io_activity_;
236
0
  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
237
0
  constexpr uint64_t* bytes_read = nullptr;
238
239
0
  const Status s = version_->GetBlob(read_options, user_key, blob_index,
240
0
                                     prefetch_buffer, &blob_value_, bytes_read);
241
242
0
  if (!s.ok()) {
243
0
    return s;
244
0
  }
245
246
0
  return Status::OK();
247
0
}
248
249
bool DBIter::SetValueAndColumnsFromBlobImpl(const Slice& user_key,
250
0
                                            const Slice& blob_index) {
251
0
  const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index);
252
0
  if (!s.ok()) {
253
0
    status_ = s;
254
0
    valid_ = false;
255
0
    is_blob_ = false;
256
0
    return false;
257
0
  }
258
259
0
  SetValueAndColumnsFromPlain(blob_reader_.GetBlobValue());
260
261
0
  return true;
262
0
}
263
264
bool DBIter::SetValueAndColumnsFromBlob(const Slice& user_key,
265
0
                                        const Slice& blob_index) {
266
0
  assert(!is_blob_);
267
0
  is_blob_ = true;
268
269
0
  if (expose_blob_index_) {
270
0
    SetValueAndColumnsFromPlain(blob_index);
271
0
    return true;
272
0
  }
273
274
0
  if (allow_unprepared_value_) {
275
0
    assert(value_.empty());
276
0
    assert(wide_columns_.empty());
277
278
0
    assert(lazy_blob_index_.empty());
279
0
    lazy_blob_index_ = blob_index;
280
281
0
    return true;
282
0
  }
283
284
0
  return SetValueAndColumnsFromBlobImpl(user_key, blob_index);
285
0
}
286
287
0
bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
288
0
  assert(value_.empty());
289
0
  assert(wide_columns_.empty());
290
291
0
  const Status s = WideColumnSerialization::Deserialize(slice, wide_columns_);
292
293
0
  if (!s.ok()) {
294
0
    status_ = s;
295
0
    valid_ = false;
296
0
    wide_columns_.clear();
297
0
    return false;
298
0
  }
299
300
0
  if (WideColumnsHelper::HasDefaultColumn(wide_columns_)) {
301
0
    value_ = WideColumnsHelper::GetDefaultColumn(wide_columns_);
302
0
  }
303
304
0
  return true;
305
0
}
306
307
bool DBIter::SetValueAndColumnsFromMergeResult(const Status& merge_status,
308
0
                                               ValueType result_type) {
309
0
  if (!merge_status.ok()) {
310
0
    valid_ = false;
311
0
    status_ = merge_status;
312
0
    return false;
313
0
  }
314
315
0
  if (result_type == kTypeWideColumnEntity) {
316
0
    if (!SetValueAndColumnsFromEntity(saved_value_)) {
317
0
      assert(!valid_);
318
0
      return false;
319
0
    }
320
321
0
    valid_ = true;
322
0
    return true;
323
0
  }
324
325
0
  assert(result_type == kTypeValue);
326
0
  SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_
327
0
                                                   : saved_value_);
328
0
  valid_ = true;
329
0
  return true;
330
0
}
331
332
0
bool DBIter::PrepareValue() {
333
0
  assert(valid_);
334
335
0
  if (lazy_blob_index_.empty()) {
336
0
    return true;
337
0
  }
338
339
0
  assert(allow_unprepared_value_);
340
0
  assert(is_blob_);
341
342
0
  const bool result =
343
0
      SetValueAndColumnsFromBlobImpl(saved_key_.GetUserKey(), lazy_blob_index_);
344
345
0
  lazy_blob_index_.clear();
346
347
0
  return result;
348
0
}
349
350
// PRE: saved_key_ has the current user key if skipping_saved_key
351
// POST: saved_key_ should have the next user key if valid_,
352
//       if the current entry is a result of merge
353
//           current_entry_is_merged_ => true
354
//           saved_value_             => the merged value
355
//
356
// NOTE: In between, saved_key_ can point to a user key that has
357
//       a delete marker or a sequence number higher than sequence_
358
//       saved_key_ MUST have a proper user_key before calling this function
359
//
360
// The prefix parameter, if not null, indicates that we need to iterate
361
// within the prefix, and the iterator needs to be made invalid if no
362
// more entries for the prefix can be found.
363
39.0k
bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) {
364
39.0k
  PERF_TIMER_GUARD(find_next_user_entry_time);
365
39.0k
  return FindNextUserEntryInternal(skipping_saved_key, prefix);
366
39.0k
}
367
368
// Actual implementation of DBIter::FindNextUserEntry()
369
bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
370
39.0k
                                       const Slice* prefix) {
371
  // Loop until we hit an acceptable entry to yield
372
39.0k
  assert(iter_.Valid());
373
39.0k
  assert(status_.ok());
374
39.0k
  assert(direction_ == kForward);
375
39.0k
  current_entry_is_merged_ = false;
376
377
  // How many times in a row we have skipped an entry with user key less than
378
  // or equal to saved_key_. We could skip these entries either because
379
  // sequence numbers were too high or because skipping_saved_key = true.
380
  // What saved_key_ contains throughout this method:
381
  //  - if skipping_saved_key : saved_key_ contains the key that we need
382
  //                            to skip, and we haven't seen any keys greater
383
  //                            than that,
384
  //  - if num_skipped > 0    : saved_key_ contains the key that we have skipped
385
  //                            num_skipped times, and we haven't seen any keys
386
  //                            greater than that,
387
  //  - none of the above     : saved_key_ can contain anything, it doesn't
388
  //                            matter.
389
39.0k
  uint64_t num_skipped = 0;
390
  // For write unprepared, the target sequence number in reseek could be larger
391
  // than the snapshot, and thus needs to be skipped again. This could result in
392
  // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
393
  // to one.
394
39.0k
  bool reseek_done = false;
395
396
39.0k
  uint64_t mem_hidden_op_scanned = 0;
397
52.9k
  do {
398
    // Will update is_key_seqnum_zero_ as soon as we parsed the current key
399
    // but we need to save the previous value to be used in the loop.
400
52.9k
    bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
401
52.9k
    if (!ParseKey(&ikey_)) {
402
0
      is_key_seqnum_zero_ = false;
403
0
      return false;
404
0
    }
405
52.9k
    Slice user_key_without_ts =
406
52.9k
        StripTimestampFromUserKey(ikey_.user_key, timestamp_size_);
407
408
52.9k
    is_key_seqnum_zero_ = (ikey_.sequence == 0);
409
410
52.9k
    assert(iterate_upper_bound_ == nullptr ||
411
52.9k
           iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound ||
412
52.9k
           user_comparator_.CompareWithoutTimestamp(
413
52.9k
               user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
414
52.9k
               /*b_has_ts=*/false) < 0);
415
52.9k
    if (iterate_upper_bound_ != nullptr &&
416
52.9k
        iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound &&
417
52.9k
        user_comparator_.CompareWithoutTimestamp(
418
0
            user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
419
0
            /*b_has_ts=*/false) >= 0) {
420
0
      break;
421
0
    }
422
423
52.9k
    assert(prefix == nullptr || prefix_extractor_ != nullptr);
424
52.9k
    if (prefix != nullptr &&
425
52.9k
        prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) !=
426
0
            0) {
427
0
      assert(prefix_same_as_start_);
428
0
      break;
429
0
    }
430
431
52.9k
    if (TooManyInternalKeysSkipped()) {
432
0
      return false;
433
0
    }
434
435
52.9k
    assert(ikey_.user_key.size() >= timestamp_size_);
436
52.9k
    Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey(
437
0
                                         ikey_.user_key, timestamp_size_)
438
52.9k
                                   : Slice();
439
52.9k
    bool more_recent = false;
440
52.9k
    if (IsVisible(ikey_.sequence, ts, &more_recent)) {
441
      // If the previous entry is of seqnum 0, the current entry will not
442
      // possibly be skipped. This condition can potentially be relaxed to
443
      // prev_key.seq <= ikey_.sequence. We are cautious because it will be more
444
      // prone to bugs if the same user key appears with the same sequence number.
445
      // Note that with current timestamp implementation, the same user key can
446
      // have different timestamps and zero sequence number on the bottommost
447
      // level. This may change in the future.
448
52.9k
      if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) &&
449
52.9k
          skipping_saved_key &&
450
52.9k
          CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
451
7.55k
        num_skipped++;  // skip this entry
452
7.55k
        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
453
7.55k
        MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
454
45.3k
      } else {
455
45.3k
        assert(!skipping_saved_key ||
456
45.3k
               CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
457
45.3k
        num_skipped = 0;
458
45.3k
        reseek_done = false;
459
45.3k
        switch (ikey_.type) {
460
8.57k
          case kTypeDeletion:
461
8.57k
          case kTypeDeletionWithTimestamp:
462
8.57k
          case kTypeSingleDeletion:
463
            // Arrange to skip all upcoming entries for this key since
464
            // they are hidden by this deletion.
465
8.57k
            if (timestamp_lb_) {
466
0
              saved_key_.SetInternalKey(ikey_);
467
0
              valid_ = true;
468
0
              return true;
469
8.57k
            } else {
470
8.57k
              saved_key_.SetUserKey(
471
8.57k
                  ikey_.user_key, !pin_thru_lifetime_ ||
472
8.57k
                                      !iter_.iter()->IsKeyPinned() /* copy */);
473
8.57k
              skipping_saved_key = true;
474
8.57k
              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
475
8.57k
              MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
476
8.57k
            }
477
8.57k
            break;
478
36.8k
          case kTypeValue:
479
36.8k
          case kTypeValuePreferredSeqno:
480
36.8k
          case kTypeBlobIndex:
481
36.8k
          case kTypeWideColumnEntity:
482
36.8k
            if (!PrepareValueInternal()) {
483
0
              return false;
484
0
            }
485
36.8k
            if (timestamp_lb_) {
486
0
              saved_key_.SetInternalKey(ikey_);
487
36.8k
            } else {
488
36.8k
              saved_key_.SetUserKey(
489
36.8k
                  ikey_.user_key, !pin_thru_lifetime_ ||
490
36.8k
                                      !iter_.iter()->IsKeyPinned() /* copy */);
491
36.8k
            }
492
493
36.8k
            if (ikey_.type == kTypeBlobIndex) {
494
0
              if (!SetValueAndColumnsFromBlob(ikey_.user_key, iter_.value())) {
495
0
                return false;
496
0
              }
497
36.8k
            } else if (ikey_.type == kTypeWideColumnEntity) {
498
0
              if (!SetValueAndColumnsFromEntity(iter_.value())) {
499
0
                return false;
500
0
              }
501
36.8k
            } else {
502
36.8k
              assert(ikey_.type == kTypeValue ||
503
36.8k
                     ikey_.type == kTypeValuePreferredSeqno);
504
36.8k
              Slice value = iter_.value();
505
36.8k
              saved_write_unix_time_ = iter_.write_unix_time();
506
36.8k
              if (ikey_.type == kTypeValuePreferredSeqno) {
507
0
                value = ParsePackedValueForValue(value);
508
0
              }
509
36.8k
              SetValueAndColumnsFromPlain(value);
510
36.8k
            }
511
512
36.8k
            valid_ = true;
513
36.8k
            return true;
514
0
          case kTypeMerge:
515
0
            if (!PrepareValueInternal()) {
516
0
              return false;
517
0
            }
518
0
            saved_key_.SetUserKey(
519
0
                ikey_.user_key,
520
0
                !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
521
            // By now, we are sure the current ikey is going to yield a value
522
0
            current_entry_is_merged_ = true;
523
0
            valid_ = true;
524
0
            return MergeValuesNewToOld();  // Go to a different state machine
525
0
          default:
526
0
            valid_ = false;
527
0
            status_ = Status::Corruption(
528
0
                "Unknown value type: " +
529
0
                std::to_string(static_cast<unsigned int>(ikey_.type)));
530
0
            return false;
531
45.3k
        }
532
45.3k
      }
533
52.9k
    } else {
534
0
      if (more_recent) {
535
0
        PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
536
0
      }
537
538
      // This key was inserted after our snapshot was taken or skipped by
539
      // timestamp range. If this happens too many times in a row for the same
540
      // user key, we want to seek to the target sequence number.
541
0
      int cmp = user_comparator_.CompareWithoutTimestamp(
542
0
          ikey_.user_key, saved_key_.GetUserKey());
543
0
      if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
544
0
        num_skipped++;
545
0
      } else {
546
0
        saved_key_.SetUserKey(
547
0
            ikey_.user_key,
548
0
            !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
549
0
        skipping_saved_key = false;
550
0
        num_skipped = 0;
551
0
        reseek_done = false;
552
0
      }
553
0
    }
554
555
    // If we have sequentially iterated via numerous equal keys, then it's
556
    // better to seek so that we can avoid too many key comparisons.
557
    //
558
    // To avoid infinite loops, do not reseek if we have already attempted to
559
    // reseek previously.
560
    //
561
    // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
562
    // then it does not make sense to reseek as we would actually land further
563
    // away from the desired key. There is opportunity for optimization here.
564
16.1k
    if (num_skipped > max_skip_ && !reseek_done) {
565
346
      is_key_seqnum_zero_ = false;
566
346
      num_skipped = 0;
567
346
      reseek_done = true;
568
346
      std::string last_key;
569
346
      if (skipping_saved_key) {
570
        // We're looking for the next user-key but all we see are the same
571
        // user-key with decreasing sequence numbers. Fast forward to
572
        // sequence number 0 and type deletion (the smallest type).
573
346
        if (timestamp_size_ == 0) {
574
346
          AppendInternalKey(
575
346
              &last_key,
576
346
              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion));
577
346
        } else {
578
0
          const std::string kTsMin(timestamp_size_, '\0');
579
0
          AppendInternalKeyWithDifferentTimestamp(
580
0
              &last_key,
581
0
              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion),
582
0
              kTsMin);
583
0
        }
584
        // Don't set skipping_saved_key = false because we may still see more
585
        // user-keys equal to saved_key_.
586
346
      } else {
587
        // We saw multiple entries with this user key and sequence numbers
588
        // higher than sequence_. Fast forward to sequence_.
589
        // Note that this only covers a case when a higher key was overwritten
590
        // many times since our snapshot was taken, not the case when a lot of
591
        // different keys were inserted after our snapshot was taken.
592
0
        if (timestamp_size_ == 0) {
593
0
          AppendInternalKey(
594
0
              &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
595
0
                                           kValueTypeForSeek));
596
0
        } else {
597
0
          AppendInternalKeyWithDifferentTimestamp(
598
0
              &last_key,
599
0
              ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
600
0
                                kValueTypeForSeek),
601
0
              *timestamp_ub_);
602
0
        }
603
0
      }
604
346
      iter_.Seek(last_key);
605
346
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
606
15.7k
    } else {
607
15.7k
      iter_.Next();
608
15.7k
    }
609
610
    // This could be a long-running operation due to tombstones, etc.
611
16.1k
    bool aborted = ROCKSDB_THREAD_YIELD_CHECK_ABORT();
612
16.1k
    if (aborted) {
613
0
      valid_ = false;
614
0
      status_ = Status::Aborted("Query abort.");
615
0
      return false;
616
0
    }
617
16.1k
  } while (iter_.Valid());
618
619
2.18k
  valid_ = false;
620
2.18k
  return iter_.status().ok();
621
39.0k
}
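FindNextUserEntryInternal() above enforces ReadOptions::max_skippable_internal_keys through TooManyInternalKeysSkipped(): once the per-key skip budget is exhausted, the iterator is invalidated with an Incomplete status instead of continuing to scan hidden entries. A minimal sketch of using that knob from the public API, assuming `db` is an open rocksdb::DB*:

```cpp
// Sketch only: bounding the tombstone-skipping work performed by
// FindNextUserEntryInternal() via ReadOptions::max_skippable_internal_keys.
#include <memory>

#include "rocksdb/db.h"

// Returns true if the scan gave up because the skip budget was exhausted.
bool ScanWithSkipBudget(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  ro.max_skippable_internal_keys = 1000;  // give up after 1000 hidden entries
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // ... use it->key() / it->value() ...
  }
  // When the per-key skip budget is exceeded, the iterator is invalidated
  // with an Incomplete status rather than an ordinary end-of-range.
  return it->status().IsIncomplete();
}
```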
622
623
// Merge values of the same user key starting from the current iter_ position
624
// Scan from the newer entries to older entries.
625
// PRE: iter_.key() points to the first merge type entry
626
//      saved_key_ stores the user key
627
//      iter_.PrepareValue() has been called
628
// POST: saved_value_ has the merged value for the user key
629
//       iter_ points to the next entry (or invalid)
630
0
bool DBIter::MergeValuesNewToOld() {
631
0
  if (!merge_operator_) {
632
0
    ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
633
0
    status_ = Status::InvalidArgument("merge_operator_ must be set.");
634
0
    valid_ = false;
635
0
    return false;
636
0
  }
637
638
  // Temporarily pin the blocks that hold merge operands
639
0
  TempPinData();
640
0
  merge_context_.Clear();
641
  // Start the merge process by pushing the first operand
642
0
  merge_context_.PushOperand(
643
0
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
644
0
  PERF_COUNTER_ADD(internal_merge_count, 1);
645
646
0
  TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
647
648
0
  ParsedInternalKey ikey;
649
0
  for (iter_.Next(); iter_.Valid(); iter_.Next()) {
650
0
    TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
651
0
    if (!ParseKey(&ikey)) {
652
0
      return false;
653
0
    }
654
655
0
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
656
0
                                                saved_key_.GetUserKey())) {
657
      // hit the next user key, stop right here
658
0
      break;
659
0
    }
660
0
    if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
661
0
        kTypeDeletionWithTimestamp == ikey.type) {
662
      // hit a delete with the same user key, stop right here
663
      // iter_ is positioned after delete
664
0
      iter_.Next();
665
0
      break;
666
0
    }
667
0
    if (!PrepareValueInternal()) {
668
0
      return false;
669
0
    }
670
671
0
    if (kTypeValue == ikey.type || kTypeValuePreferredSeqno == ikey.type) {
672
0
      Slice value = iter_.value();
673
0
      saved_write_unix_time_ = iter_.write_unix_time();
674
0
      if (kTypeValuePreferredSeqno == ikey.type) {
675
0
        value = ParsePackedValueForValue(value);
676
0
      }
677
      // hit a put or put equivalent, merge the put value with operands and
678
      // store the final result in saved_value_. We are done!
679
0
      if (!MergeWithPlainBaseValue(value, ikey.user_key)) {
680
0
        return false;
681
0
      }
682
      // iter_ is positioned after put
683
0
      iter_.Next();
684
0
      if (!iter_.status().ok()) {
685
0
        valid_ = false;
686
0
        return false;
687
0
      }
688
0
      return true;
689
0
    } else if (kTypeMerge == ikey.type) {
690
      // hit a merge, add the value as an operand and run associative merge.
691
      // when complete, add result to operands and continue.
692
0
      merge_context_.PushOperand(
693
0
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
694
0
      PERF_COUNTER_ADD(internal_merge_count, 1);
695
0
    } else if (kTypeBlobIndex == ikey.type) {
696
0
      if (!MergeWithBlobBaseValue(iter_.value(), ikey.user_key)) {
697
0
        return false;
698
0
      }
699
700
      // iter_ is positioned after put
701
0
      iter_.Next();
702
0
      if (!iter_.status().ok()) {
703
0
        valid_ = false;
704
0
        return false;
705
0
      }
706
707
0
      return true;
708
0
    } else if (kTypeWideColumnEntity == ikey.type) {
709
0
      if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) {
710
0
        return false;
711
0
      }
712
713
      // iter_ is positioned after put
714
0
      iter_.Next();
715
0
      if (!iter_.status().ok()) {
716
0
        valid_ = false;
717
0
        return false;
718
0
      }
719
720
0
      return true;
721
0
    } else {
722
0
      valid_ = false;
723
0
      status_ = Status::Corruption(
724
0
          "Unrecognized value type: " +
725
0
          std::to_string(static_cast<unsigned int>(ikey.type)));
726
0
      return false;
727
0
    }
728
0
  }
729
730
0
  if (!iter_.status().ok()) {
731
0
    valid_ = false;
732
0
    return false;
733
0
  }
734
735
  // we either exhausted all internal keys under this user key, or hit
736
  // a deletion marker.
737
  // feed null as the existing value to the merge operator, such that
738
  // client can differentiate this scenario and do things accordingly.
739
0
  if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
740
0
    return false;
741
0
  }
742
0
  assert(status_.ok());
743
0
  return true;
744
0
}
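MergeValuesNewToOld() above collects operands from newest to oldest and hands them to the merge operator via MergeHelper. Sketched below is a minimal associative merge operator of the kind such a scan would invoke; the class name and the comma separator are made up for illustration and are not part of RocksDB.

```cpp
// Sketch only: a minimal associative merge operator. The class name and
// separator are hypothetical; only the base-class API is RocksDB's.
#include <string>

#include "rocksdb/merge_operator.h"
#include "rocksdb/slice.h"

class AppendWithCommaOperator : public rocksdb::AssociativeMergeOperator {
 public:
  bool Merge(const rocksdb::Slice& /*key*/,
             const rocksdb::Slice* existing_value, const rocksdb::Slice& value,
             std::string* new_value,
             rocksdb::Logger* /*logger*/) const override {
    new_value->clear();
    if (existing_value != nullptr) {
      // Keep the existing value and append the new operand after a comma.
      new_value->assign(existing_value->data(), existing_value->size());
      new_value->append(",");
    }
    new_value->append(value.data(), value.size());
    return true;  // this operator never fails
  }

  const char* Name() const override { return "AppendWithCommaOperator"; }
};
```

A caller would install it with options.merge_operator.reset(new AppendWithCommaOperator()); before opening the DB, after which db->Merge() writes are resolved during iteration by the code above.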
745
746
0
void DBIter::Prev() {
747
0
  assert(valid_);
748
0
  assert(status_.ok());
749
750
0
  PERF_COUNTER_ADD(iter_prev_count, 1);
751
0
  PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
752
0
  ReleaseTempPinnedData();
753
0
  ResetBlobData();
754
0
  ResetValueAndColumns();
755
0
  ResetInternalKeysSkippedCounter();
756
0
  bool ok = true;
757
0
  if (direction_ == kForward) {
758
0
    if (!ReverseToBackward()) {
759
0
      ok = false;
760
0
    }
761
0
  }
762
0
  if (ok) {
763
0
    ClearSavedValue();
764
765
0
    Slice prefix;
766
0
    if (prefix_same_as_start_) {
767
0
      assert(prefix_extractor_ != nullptr);
768
0
      prefix = prefix_.GetUserKey();
769
0
    }
770
0
    PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
771
0
  }
772
773
0
  if (statistics_ != nullptr) {
774
0
    local_stats_.prev_count_++;
775
0
    if (valid_) {
776
0
      local_stats_.prev_found_count_++;
777
0
      local_stats_.bytes_read_ += (key().size() + value().size());
778
0
    }
779
0
  }
780
0
}
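Prev() above flips the iteration direction and relies on ReverseToBackward()/PrevInternal() below to position the inner iterator. A minimal reverse-scan sketch via the public API, assuming `db` is an open rocksdb::DB*:

```cpp
// Sketch only: a reverse scan that drives DBIter::Prev() above.
#include <memory>

#include "rocksdb/db.h"

rocksdb::Status ScanBackwardFrom(rocksdb::DB* db, const rocksdb::Slice& start) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  // SeekForPrev positions on the last key <= start; Prev() walks backwards.
  for (it->SeekForPrev(start); it->Valid(); it->Prev()) {
    // ... use it->key() / it->value() ...
  }
  return it->status();
}
```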
781
782
0
bool DBIter::ReverseToForward() {
783
0
  assert(iter_.status().ok());
784
785
  // When moving backwards, iter_ is positioned on _previous_ key, which may
786
  // not exist or may have a different prefix than the current key().
787
  // If that's the case, seek iter_ to current key.
788
0
  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
789
0
    std::string last_key;
790
0
    if (timestamp_size_ == 0) {
791
0
      AppendInternalKey(
792
0
          &last_key, ParsedInternalKey(saved_key_.GetUserKey(),
793
0
                                       kMaxSequenceNumber, kValueTypeForSeek));
794
0
    } else {
795
      // TODO: pre-create kTsMax.
796
0
      const std::string kTsMax(timestamp_size_, '\xff');
797
0
      AppendInternalKeyWithDifferentTimestamp(
798
0
          &last_key,
799
0
          ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
800
0
                            kValueTypeForSeek),
801
0
          kTsMax);
802
0
    }
803
0
    iter_.Seek(last_key);
804
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
805
0
  }
806
807
0
  direction_ = kForward;
808
  // Skip keys less than the current key() (a.k.a. saved_key_).
809
0
  while (iter_.Valid()) {
810
0
    ParsedInternalKey ikey;
811
0
    if (!ParseKey(&ikey)) {
812
0
      return false;
813
0
    }
814
0
    if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
815
0
      return true;
816
0
    }
817
0
    iter_.Next();
818
0
  }
819
820
0
  if (!iter_.status().ok()) {
821
0
    valid_ = false;
822
0
    return false;
823
0
  }
824
825
0
  return true;
826
0
}
827
828
// Move iter_ to the key before saved_key_.
829
0
bool DBIter::ReverseToBackward() {
830
0
  assert(iter_.status().ok());
831
832
  // When current_entry_is_merged_ is true, iter_ may be positioned on the next
833
  // key, which may not exist or may have a prefix different from the current one.
834
  // If that's the case, seek to saved_key_.
835
0
  if (current_entry_is_merged_ &&
836
0
      (!expect_total_order_inner_iter() || !iter_.Valid())) {
837
0
    IterKey last_key;
838
    // Using kMaxSequenceNumber and kValueTypeForSeek
839
    // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
840
    // than saved_key_.
841
0
    last_key.SetInternalKey(ParsedInternalKey(
842
0
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
843
0
    if (!expect_total_order_inner_iter()) {
844
0
      iter_.SeekForPrev(last_key.GetInternalKey());
845
0
    } else {
846
      // Some iterators may not support SeekForPrev(), so we avoid using it
847
      // when prefix seek mode is disabled. This is somewhat expensive
848
      // (an extra Prev(), as well as an extra change of direction of iter_),
849
      // so we may need to reconsider it later.
850
0
      iter_.Seek(last_key.GetInternalKey());
851
0
      if (!iter_.Valid() && iter_.status().ok()) {
852
0
        iter_.SeekToLast();
853
0
      }
854
0
    }
855
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
856
0
  }
857
858
0
  direction_ = kReverse;
859
0
  return FindUserKeyBeforeSavedKey();
860
0
}
861
862
3.96k
void DBIter::PrevInternal(const Slice* prefix) {
863
5.74k
  while (iter_.Valid()) {
864
4.07k
    saved_key_.SetUserKey(
865
4.07k
        ExtractUserKey(iter_.key()),
866
4.07k
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
867
868
4.07k
    assert(prefix == nullptr || prefix_extractor_ != nullptr);
869
4.07k
    if (prefix != nullptr &&
870
4.07k
        prefix_extractor_
871
0
                ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(),
872
0
                                                      timestamp_size_))
873
0
                .compare(*prefix) != 0) {
874
0
      assert(prefix_same_as_start_);
875
      // Current key does not have the same prefix as start
876
0
      valid_ = false;
877
0
      return;
878
0
    }
879
880
4.07k
    assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
881
4.07k
           user_comparator_.CompareWithoutTimestamp(
882
4.07k
               saved_key_.GetUserKey(), /*a_has_ts=*/true,
883
4.07k
               *iterate_lower_bound_, /*b_has_ts=*/false) >= 0);
884
4.07k
    if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
885
4.07k
        user_comparator_.CompareWithoutTimestamp(
886
0
            saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
887
0
            /*b_has_ts=*/false) < 0) {
888
      // We've iterated earlier than the user-specified lower bound.
889
0
      valid_ = false;
890
0
      return;
891
0
    }
892
893
4.07k
    if (!FindValueForCurrentKey()) {  // assigns valid_
894
0
      return;
895
0
    }
896
897
    // Whether or not we found a value for current key, we need iter_ to end up
898
    // on a smaller key.
899
4.07k
    if (!FindUserKeyBeforeSavedKey()) {
900
0
      return;
901
0
    }
902
903
4.07k
    if (valid_) {
904
      // Found the value.
905
2.29k
      return;
906
2.29k
    }
907
908
1.78k
    if (TooManyInternalKeysSkipped(false)) {
909
0
      return;
910
0
    }
911
1.78k
  }
912
913
  // We haven't found any key - iterator is not valid
914
1.66k
  valid_ = false;
915
1.66k
}
916
917
// Used for backwards iteration.
918
// Looks at the entries with user key saved_key_ and finds the most up-to-date
919
// value for it, or executes a merge, or determines that the value was deleted.
920
// Sets valid_ to true if the value is found and is ready to be presented to
921
// the user through value().
922
// Sets valid_ to false if the value was deleted, and we should try another key.
923
// Returns false if an error occurred, and !status().ok() and !valid_.
924
//
925
// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
926
// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
927
//       the entry just before them, or on the entry just after them.
928
4.07k
bool DBIter::FindValueForCurrentKey() {
929
4.07k
  assert(iter_.Valid());
930
4.07k
  merge_context_.Clear();
931
4.07k
  current_entry_is_merged_ = false;
932
  // last entry before merge (could be kTypeDeletion,
933
  // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue
934
  // kTypeBlobIndex, kTypeWideColumnEntity or kTypeValuePreferredSeqno)
935
4.07k
  ValueType last_not_merge_type = kTypeDeletion;
936
4.07k
  ValueType last_key_entry_type = kTypeDeletion;
937
938
  // If false, it indicates that we have not seen any valid entry, even though
939
  // last_key_entry_type is initialized to kTypeDeletion.
940
4.07k
  bool valid_entry_seen = false;
941
942
  // Temporarily pin blocks that hold (merge operands / the value)
943
4.07k
  ReleaseTempPinnedData();
944
4.07k
  TempPinData();
945
4.07k
  size_t num_skipped = 0;
946
18.9k
  while (iter_.Valid()) {
947
15.5k
    ParsedInternalKey ikey;
948
15.5k
    if (!ParseKey(&ikey)) {
949
0
      return false;
950
0
    }
951
952
15.5k
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
953
15.5k
                                                saved_key_.GetUserKey())) {
954
      // Found a smaller user key, thus we are done with current user key.
955
143
      break;
956
143
    }
957
958
15.4k
    assert(ikey.user_key.size() >= timestamp_size_);
959
15.4k
    Slice ts;
960
15.4k
    if (timestamp_size_ > 0) {
961
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
962
0
                 timestamp_size_);
963
0
    }
964
965
15.4k
    bool visible = IsVisible(ikey.sequence, ts);
966
15.4k
    if (!visible &&
967
15.4k
        (timestamp_lb_ == nullptr ||
968
0
         user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) {
969
      // Found an invisible version of the current user key, and it must have
970
      // a higher sequence number or timestamp. Therefore, we are done with the
971
      // current user key.
972
0
      break;
973
0
    }
974
975
15.4k
    if (!ts.empty()) {
976
0
      saved_timestamp_.assign(ts.data(), ts.size());
977
0
    }
978
979
15.4k
    if (TooManyInternalKeysSkipped()) {
980
0
      return false;
981
0
    }
982
983
    // This user key has lots of entries.
984
    // We're going from old to new, and it's taking too long. Let's do a Seek()
985
    // and go from new to old. This helps when a key was overwritten many times.
986
15.4k
    if (num_skipped >= max_skip_) {
987
612
      return FindValueForCurrentKeyUsingSeek();
988
612
    }
989
990
14.8k
    if (!PrepareValueInternal()) {
991
0
      return false;
992
0
    }
993
994
14.8k
    if (timestamp_lb_ != nullptr) {
995
      // Only needed when timestamp_lb_ is not null
996
0
      [[maybe_unused]] const bool ret = ParseKey(&ikey_);
997
      // Since the preceding ParseKey(&ikey) succeeds, so must this.
998
0
      assert(ret);
999
0
      saved_key_.SetInternalKey(ikey);
1000
14.8k
    } else if (user_comparator_.Compare(ikey.user_key,
1001
14.8k
                                        saved_key_.GetUserKey()) < 0) {
1002
0
      saved_key_.SetUserKey(
1003
0
          ikey.user_key,
1004
0
          !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
1005
0
    }
1006
1007
14.8k
    valid_entry_seen = true;
1008
14.8k
    last_key_entry_type = ikey.type;
1009
14.8k
    switch (last_key_entry_type) {
1010
9.50k
      case kTypeValue:
1011
9.50k
      case kTypeValuePreferredSeqno:
1012
9.50k
      case kTypeBlobIndex:
1013
9.50k
      case kTypeWideColumnEntity:
1014
9.50k
        if (iter_.iter()->IsValuePinned()) {
1015
9.50k
          saved_write_unix_time_ = iter_.write_unix_time();
1016
9.50k
          if (last_key_entry_type == kTypeValuePreferredSeqno) {
1017
0
            pinned_value_ = ParsePackedValueForValue(iter_.value());
1018
9.50k
          } else {
1019
9.50k
            pinned_value_ = iter_.value();
1020
9.50k
          }
1021
9.50k
        } else {
1022
0
          valid_ = false;
1023
0
          status_ = Status::NotSupported(
1024
0
              "Backward iteration not supported if underlying iterator's value "
1025
0
              "cannot be pinned.");
1026
0
        }
1027
9.50k
        merge_context_.Clear();
1028
9.50k
        last_not_merge_type = last_key_entry_type;
1029
9.50k
        if (!status_.ok()) {
1030
0
          return false;
1031
0
        }
1032
9.50k
        break;
1033
9.50k
      case kTypeDeletion:
1034
5.33k
      case kTypeDeletionWithTimestamp:
1035
5.33k
      case kTypeSingleDeletion:
1036
5.33k
        merge_context_.Clear();
1037
5.33k
        last_not_merge_type = last_key_entry_type;
1038
5.33k
        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
1039
5.33k
        break;
1040
0
      case kTypeMerge: {
1041
0
        assert(merge_operator_ != nullptr);
1042
0
        merge_context_.PushOperandBack(
1043
0
            iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1044
0
        PERF_COUNTER_ADD(internal_merge_count, 1);
1045
0
      } break;
1046
0
      default:
1047
0
        valid_ = false;
1048
0
        status_ = Status::Corruption(
1049
0
            "Unknown value type: " +
1050
0
            std::to_string(static_cast<unsigned int>(last_key_entry_type)));
1051
0
        return false;
1052
14.8k
    }
1053
1054
14.8k
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
1055
14.8k
    iter_.Prev();
1056
14.8k
    ++num_skipped;
1057
1058
14.8k
    if (visible && timestamp_lb_ != nullptr) {
1059
      // If timestamp_lb_ is not nullptr, we do not have to look further for
1060
      // another internal key. We can return this current internal key. Yet we
1061
      // still keep the invariant that iter_ is positioned before the returned
1062
      // key.
1063
0
      break;
1064
0
    }
1065
14.8k
  }
1066
1067
3.46k
  if (!iter_.status().ok()) {
1068
0
    valid_ = false;
1069
0
    return false;
1070
0
  }
1071
1072
3.46k
  if (!valid_entry_seen) {
1073
    // Since we haven't seen any valid entry, last_key_entry_type remains
1074
    // unchanged and the same as its initial value.
1075
0
    assert(last_key_entry_type == kTypeDeletion);
1076
0
    assert(last_not_merge_type == kTypeDeletion);
1077
0
    valid_ = false;
1078
0
    return true;
1079
0
  }
1080
1081
3.46k
  if (timestamp_lb_ != nullptr) {
1082
0
    assert(last_key_entry_type == ikey_.type);
1083
0
  }
1084
1085
3.46k
  switch (last_key_entry_type) {
1086
1.62k
    case kTypeDeletion:
1087
1.62k
    case kTypeDeletionWithTimestamp:
1088
1.62k
    case kTypeSingleDeletion:
1089
1.62k
      if (timestamp_lb_ == nullptr) {
1090
1.62k
        valid_ = false;
1091
1.62k
      } else {
1092
0
        valid_ = true;
1093
0
      }
1094
1.62k
      return true;
1095
0
    case kTypeMerge:
1096
0
      current_entry_is_merged_ = true;
1097
0
      if (last_not_merge_type == kTypeDeletion ||
1098
0
          last_not_merge_type == kTypeSingleDeletion ||
1099
0
          last_not_merge_type == kTypeDeletionWithTimestamp) {
1100
0
        if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
1101
0
          return false;
1102
0
        }
1103
0
        return true;
1104
0
      } else if (last_not_merge_type == kTypeBlobIndex) {
1105
0
        if (!MergeWithBlobBaseValue(pinned_value_, saved_key_.GetUserKey())) {
1106
0
          return false;
1107
0
        }
1108
1109
0
        return true;
1110
0
      } else if (last_not_merge_type == kTypeWideColumnEntity) {
1111
0
        if (!MergeWithWideColumnBaseValue(pinned_value_,
1112
0
                                          saved_key_.GetUserKey())) {
1113
0
          return false;
1114
0
        }
1115
1116
0
        return true;
1117
0
      } else {
1118
0
        assert(last_not_merge_type == kTypeValue ||
1119
0
               last_not_merge_type == kTypeValuePreferredSeqno);
1120
0
        if (!MergeWithPlainBaseValue(pinned_value_, saved_key_.GetUserKey())) {
1121
0
          return false;
1122
0
        }
1123
0
        return true;
1124
0
      }
1125
1.84k
    case kTypeValue:
1126
1.84k
    case kTypeValuePreferredSeqno:
1127
1.84k
      SetValueAndColumnsFromPlain(pinned_value_);
1128
1129
1.84k
      break;
1130
0
    case kTypeBlobIndex:
1131
0
      if (!SetValueAndColumnsFromBlob(saved_key_.GetUserKey(), pinned_value_)) {
1132
0
        return false;
1133
0
      }
1134
0
      break;
1135
0
    case kTypeWideColumnEntity:
1136
0
      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
1137
0
        return false;
1138
0
      }
1139
0
      break;
1140
0
    default:
1141
0
      valid_ = false;
1142
0
      status_ = Status::Corruption(
1143
0
          "Unknown value type: " +
1144
0
          std::to_string(static_cast<unsigned int>(last_key_entry_type)));
1145
0
      return false;
1146
3.46k
  }
1147
1.84k
  valid_ = true;
1148
1.84k
  return true;
1149
3.46k
}
1150
1151
// This function is used in FindValueForCurrentKey.
1152
// We use Seek() function instead of Prev() to find necessary value
1153
// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
1154
//       Would be nice to reuse some code.
1155
612
bool DBIter::FindValueForCurrentKeyUsingSeek() {
1156
  // FindValueForCurrentKey will enable pinning before calling
1157
  // FindValueForCurrentKeyUsingSeek()
1158
612
  assert(pinned_iters_mgr_.PinningEnabled());
1159
612
  std::string last_key;
1160
612
  if (0 == timestamp_size_) {
1161
612
    AppendInternalKey(&last_key,
1162
612
                      ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
1163
612
                                        kValueTypeForSeek));
1164
612
  } else {
1165
0
    AppendInternalKeyWithDifferentTimestamp(
1166
0
        &last_key,
1167
0
        ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
1168
0
                          kValueTypeForSeek),
1169
0
        timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
1170
0
  }
1171
612
  iter_.Seek(last_key);
1172
612
  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1173
1174
  // In case a read_callback is present, the value we seek to may not be visible.
1175
  // Find the next value that's visible.
1176
612
  ParsedInternalKey ikey;
1177
1178
612
  while (true) {
1179
612
    if (!iter_.Valid()) {
1180
0
      valid_ = false;
1181
0
      return iter_.status().ok();
1182
0
    }
1183
1184
612
    if (!ParseKey(&ikey)) {
1185
0
      return false;
1186
0
    }
1187
612
    assert(ikey.user_key.size() >= timestamp_size_);
1188
612
    Slice ts;
1189
612
    if (timestamp_size_ > 0) {
1190
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
1191
0
                 timestamp_size_);
1192
0
    }
1193
1194
612
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
1195
612
                                                saved_key_.GetUserKey())) {
1196
      // No visible values for this key, even though FindValueForCurrentKey()
1197
      // has seen some. This is possible if we're using a tailing iterator, and
1198
      // the entries were discarded in a compaction.
1199
0
      valid_ = false;
1200
0
      return true;
1201
0
    }
1202
1203
612
    if (IsVisible(ikey.sequence, ts)) {
1204
612
      break;
1205
612
    }
1206
1207
0
    iter_.Next();
1208
0
  }
1209
1210
612
  if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
1211
612
      kTypeDeletionWithTimestamp == ikey.type) {
1212
156
    if (timestamp_lb_ == nullptr) {
1213
156
      valid_ = false;
1214
156
    } else {
1215
0
      valid_ = true;
1216
0
      saved_key_.SetInternalKey(ikey);
1217
0
    }
1218
156
    return true;
1219
156
  }
1220
456
  if (!PrepareValueInternal()) {
1221
0
    return false;
1222
0
  }
1223
456
  if (timestamp_size_ > 0) {
1224
0
    Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_);
1225
0
    saved_timestamp_.assign(ts.data(), ts.size());
1226
0
  }
1227
456
  if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno ||
1228
456
      ikey.type == kTypeBlobIndex || ikey.type == kTypeWideColumnEntity) {
1229
456
    assert(iter_.iter()->IsValuePinned());
1230
456
    saved_write_unix_time_ = iter_.write_unix_time();
1231
456
    if (ikey.type == kTypeValuePreferredSeqno) {
1232
0
      pinned_value_ = ParsePackedValueForValue(iter_.value());
1233
456
    } else {
1234
456
      pinned_value_ = iter_.value();
1235
456
    }
1236
456
    if (ikey.type == kTypeBlobIndex) {
1237
0
      if (!SetValueAndColumnsFromBlob(ikey.user_key, pinned_value_)) {
1238
0
        return false;
1239
0
      }
1240
456
    } else if (ikey.type == kTypeWideColumnEntity) {
1241
0
      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
1242
0
        return false;
1243
0
      }
1244
456
    } else {
1245
456
      assert(ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno);
1246
456
      SetValueAndColumnsFromPlain(pinned_value_);
1247
456
    }
1248
1249
456
    if (timestamp_lb_ != nullptr) {
1250
0
      saved_key_.SetInternalKey(ikey);
1251
456
    } else {
1252
456
      saved_key_.SetUserKey(ikey.user_key);
1253
456
    }
1254
1255
456
    valid_ = true;
1256
456
    return true;
1257
456
  }
1258
1259
  // kTypeMerge. We need to collect all kTypeMerge values and save them
1260
  // in operands
1261
0
  assert(ikey.type == kTypeMerge);
1262
0
  current_entry_is_merged_ = true;
1263
0
  merge_context_.Clear();
1264
0
  merge_context_.PushOperand(
1265
0
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1266
0
  PERF_COUNTER_ADD(internal_merge_count, 1);
1267
1268
0
  while (true) {
1269
0
    iter_.Next();
1270
1271
0
    if (!iter_.Valid()) {
1272
0
      if (!iter_.status().ok()) {
1273
0
        valid_ = false;
1274
0
        return false;
1275
0
      }
1276
0
      break;
1277
0
    }
1278
0
    if (!ParseKey(&ikey)) {
1279
0
      return false;
1280
0
    }
1281
0
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
1282
0
                                                saved_key_.GetUserKey())) {
1283
0
      break;
1284
0
    }
1285
0
    if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
1286
0
        ikey.type == kTypeDeletionWithTimestamp) {
1287
0
      break;
1288
0
    }
1289
0
    if (!PrepareValueInternal()) {
1290
0
      return false;
1291
0
    }
1292
1293
0
    if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno) {
1294
0
      Slice value = iter_.value();
1295
0
      if (ikey.type == kTypeValuePreferredSeqno) {
1296
0
        value = ParsePackedValueForValue(value);
1297
0
      }
1298
0
      if (!MergeWithPlainBaseValue(value, saved_key_.GetUserKey())) {
1299
0
        return false;
1300
0
      }
1301
0
      return true;
1302
0
    } else if (ikey.type == kTypeMerge) {
1303
0
      merge_context_.PushOperand(
1304
0
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1305
0
      PERF_COUNTER_ADD(internal_merge_count, 1);
1306
0
    } else if (ikey.type == kTypeBlobIndex) {
1307
0
      if (!MergeWithBlobBaseValue(iter_.value(), saved_key_.GetUserKey())) {
1308
0
        return false;
1309
0
      }
1310
1311
0
      return true;
1312
0
    } else if (ikey.type == kTypeWideColumnEntity) {
1313
0
      if (!MergeWithWideColumnBaseValue(iter_.value(),
1314
0
                                        saved_key_.GetUserKey())) {
1315
0
        return false;
1316
0
      }
1317
1318
0
      return true;
1319
0
    } else {
1320
0
      valid_ = false;
1321
0
      status_ = Status::Corruption(
1322
0
          "Unknown value type: " +
1323
0
          std::to_string(static_cast<unsigned int>(ikey.type)));
1324
0
      return false;
1325
0
    }
1326
0
  }
1327
1328
0
  if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
1329
0
    return false;
1330
0
  }
1331
1332
  // Make sure we leave iter_ in a good state. If it's valid and we don't care
1333
  // about prefixes, that's already good enough. Otherwise it needs to be
1334
  // seeked to the current key.
1335
0
  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
1336
0
    if (!expect_total_order_inner_iter()) {
1337
0
      iter_.SeekForPrev(last_key);
1338
0
    } else {
1339
0
      iter_.Seek(last_key);
1340
0
      if (!iter_.Valid() && iter_.status().ok()) {
1341
0
        iter_.SeekToLast();
1342
0
      }
1343
0
    }
1344
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1345
0
  }
1346
1347
0
  valid_ = true;
1348
0
  return true;
1349
0
}
1350
1351
0
bool DBIter::MergeWithNoBaseValue(const Slice& user_key) {
1352
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1353
  // since a failure must be propagated regardless of its value.
1354
0
  ValueType result_type;
1355
0
  const Status s = MergeHelper::TimedFullMerge(
1356
0
      merge_operator_, user_key, MergeHelper::kNoBaseValue,
1357
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1358
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1359
0
      &saved_value_, &pinned_value_, &result_type);
1360
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1361
0
}
1362
1363
bool DBIter::MergeWithPlainBaseValue(const Slice& value,
1364
0
                                     const Slice& user_key) {
1365
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1366
  // since a failure must be propagated regardless of its value.
1367
0
  ValueType result_type;
1368
0
  const Status s = MergeHelper::TimedFullMerge(
1369
0
      merge_operator_, user_key, MergeHelper::kPlainBaseValue, value,
1370
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1371
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1372
0
      &saved_value_, &pinned_value_, &result_type);
1373
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1374
0
}
1375
1376
bool DBIter::MergeWithBlobBaseValue(const Slice& blob_index,
1377
0
                                    const Slice& user_key) {
1378
0
  assert(!is_blob_);
1379
1380
0
  if (expose_blob_index_) {
1381
0
    status_ =
1382
0
        Status::NotSupported("Legacy BlobDB does not support merge operator.");
1383
0
    valid_ = false;
1384
0
    return false;
1385
0
  }
1386
1387
0
  const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index);
1388
0
  if (!s.ok()) {
1389
0
    status_ = s;
1390
0
    valid_ = false;
1391
0
    return false;
1392
0
  }
1393
1394
0
  valid_ = true;
1395
1396
0
  if (!MergeWithPlainBaseValue(blob_reader_.GetBlobValue(), user_key)) {
1397
0
    return false;
1398
0
  }
1399
1400
0
  blob_reader_.ResetBlobValue();
1401
1402
0
  return true;
1403
0
}
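MergeWithBlobBaseValue is only reached when the base value lives in a blob file (integrated BlobDB), so the blob has to be fetched before the operands can be applied. A hedged configuration sketch of the options that route large values into blob files; the merge operator shown is the hypothetical AppendOperator from the earlier sketch, and the numeric values are arbitrary.

#include <memory>

#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options MakeBlobOptions() {
  ROCKSDB_NAMESPACE::Options opts;
  opts.enable_blob_files = true;  // store large values in blob files
  opts.min_blob_size = 4096;      // values of at least 4 KiB go to blobs
  // Hypothetical operator; any MergeOperator is handled the same way here.
  opts.merge_operator = std::make_shared<AppendOperator>();
  return opts;
}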
1404
1405
bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity,
1406
0
                                          const Slice& user_key) {
1407
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1408
  // since a failure must be propagated regardless of its value.
1409
0
  ValueType result_type;
1410
0
  const Status s = MergeHelper::TimedFullMerge(
1411
0
      merge_operator_, user_key, MergeHelper::kWideBaseValue, entity,
1412
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1413
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1414
0
      &saved_value_, &pinned_value_, &result_type);
1415
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1416
0
}
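The wide-column branch is reached when the base value was written as an entity; after the merge, SetValueAndColumnsFromMergeResult exposes the result through value() and columns(). A hedged usage sketch, assuming an already-open DB* named db and illustrative key and column names.

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

void ReadEntity(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::WideColumns columns{{"attr1", "v1"}, {"attr2", "v2"}};
  ROCKSDB_NAMESPACE::Status s =
      db->PutEntity(ROCKSDB_NAMESPACE::WriteOptions(),
                    db->DefaultColumnFamily(), "key1", columns);
  if (!s.ok()) {
    return;
  }
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(
      db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()));
  it->Seek("key1");
  if (it->Valid()) {
    // columns() holds the (possibly merged) entity; value() would return the
    // default column if the entity has one.
    const ROCKSDB_NAMESPACE::WideColumns& result = it->columns();
    (void)result;
  }
}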
1417
1418
// Move backwards until the key is smaller than saved_key_.
1419
// Changes valid_ only if return value is false.
1420
4.07k
bool DBIter::FindUserKeyBeforeSavedKey() {
1421
4.07k
  assert(status_.ok());
1422
4.07k
  size_t num_skipped = 0;
1423
4.68k
  while (iter_.Valid()) {
1424
755
    ParsedInternalKey ikey;
1425
755
    if (!ParseKey(&ikey)) {
1426
0
      return false;
1427
0
    }
1428
1429
755
    if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) {
1430
143
      return true;
1431
143
    }
1432
1433
612
    if (TooManyInternalKeysSkipped()) {
1434
0
      return false;
1435
0
    }
1436
1437
612
    assert(ikey.sequence != kMaxSequenceNumber);
1438
612
    assert(ikey.user_key.size() >= timestamp_size_);
1439
612
    Slice ts;
1440
612
    if (timestamp_size_ > 0) {
1441
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
1442
0
                 timestamp_size_);
1443
0
    }
1444
612
    if (!IsVisible(ikey.sequence, ts)) {
1445
0
      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
1446
612
    } else {
1447
612
      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
1448
612
    }
1449
1450
612
    if (num_skipped >= max_skip_) {
1451
0
      num_skipped = 0;
1452
0
      std::string last_key;
1453
0
      if (timestamp_size_ == 0) {
1454
0
        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
1455
0
                                                       kMaxSequenceNumber,
1456
0
                                                       kValueTypeForSeek));
1457
0
      } else {
1458
        // TODO: pre-create kTsMax.
1459
0
        const std::string kTsMax(timestamp_size_, '\xff');
1460
0
        AppendInternalKeyWithDifferentTimestamp(
1461
0
            &last_key,
1462
0
            ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
1463
0
                              kValueTypeForSeek),
1464
0
            kTsMax);
1465
0
      }
1466
      // It would be more efficient to use SeekForPrev() here, but some
1467
      // iterators may not support it.
1468
0
      iter_.Seek(last_key);
1469
0
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1470
0
      if (!iter_.Valid()) {
1471
0
        break;
1472
0
      }
1473
612
    } else {
1474
612
      ++num_skipped;
1475
612
    }
1476
1477
612
    iter_.Prev();
1478
612
  }
1479
1480
3.93k
  if (!iter_.status().ok()) {
1481
0
    valid_ = false;
1482
0
    return false;
1483
0
  }
1484
1485
3.93k
  return true;
1486
3.93k
}
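The max_skip_ threshold checked above comes from the mutable column-family option max_sequential_skip_in_iterations; once that many obsolete versions of one user key have been stepped over, the iterator falls back to a targeted Seek(). A hedged configuration sketch, where the value 16 is arbitrary and db is an assumed open DB* handle.

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void TuneReseekThreshold(ROCKSDB_NAMESPACE::DB* db) {
  // At open time:
  ROCKSDB_NAMESPACE::Options opts;
  opts.max_sequential_skip_in_iterations = 8;  // the default

  // Or adjusted at runtime, since it is a mutable CF option:
  db->SetOptions({{"max_sequential_skip_in_iterations", "16"}})
      .PermitUncheckedError();
}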
1487
1488
70.7k
bool DBIter::TooManyInternalKeysSkipped(bool increment) {
1489
70.7k
  if ((max_skippable_internal_keys_ > 0) &&
1490
70.7k
      (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
1491
0
    valid_ = false;
1492
0
    status_ = Status::Incomplete("Too many internal keys skipped.");
1493
0
    return true;
1494
70.7k
  } else if (increment) {
1495
69.0k
    num_internal_keys_skipped_++;
1496
69.0k
  }
1497
70.7k
  return false;
1498
70.7k
}
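The Incomplete status produced here is surfaced to the caller through Iterator::status(). A hedged usage sketch of bounding hidden-key skipping with ReadOptions::max_skippable_internal_keys, assuming an open DB* named db and an arbitrary limit of 1000.

#include <memory>

#include "rocksdb/db.h"

void BoundedScan(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::ReadOptions ro;
  ro.max_skippable_internal_keys = 1000;  // 0 (the default) means unlimited
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  if (it->status().IsIncomplete()) {
    // Too many internal keys were skipped; retry later or narrow the range.
  }
}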
1499
1500
bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts,
1501
69.6k
                       bool* more_recent) {
1502
  // Remember that comparator orders preceding timestamp as larger.
1503
  // TODO(yanqin): support timestamp in read_callback_.
1504
69.6k
  bool visible_by_seq = (read_callback_ == nullptr)
1505
69.6k
                            ? sequence <= sequence_
1506
69.6k
                            : read_callback_->IsVisible(sequence);
1507
1508
69.6k
  bool visible_by_ts =
1509
69.6k
      (timestamp_ub_ == nullptr ||
1510
69.6k
       user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) &&
1511
69.6k
      (timestamp_lb_ == nullptr ||
1512
69.6k
       user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0);
1513
1514
69.6k
  if (more_recent) {
1515
52.9k
    *more_recent = !visible_by_seq;
1516
52.9k
  }
1517
69.6k
  return visible_by_seq && visible_by_ts;
1518
69.6k
}
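With no user-defined timestamps and no read callback, the visibility test above reduces to comparing sequence numbers against the iterator's snapshot. A hedged usage sketch of pinning that snapshot explicitly, assuming an open DB* named db.

#include <memory>

#include "rocksdb/db.h"

void SnapshotScan(ROCKSDB_NAMESPACE::DB* db) {
  const ROCKSDB_NAMESPACE::Snapshot* snap = db->GetSnapshot();
  ROCKSDB_NAMESPACE::ReadOptions ro;
  ro.snapshot = snap;  // only versions with sequence <= snapshot are visible
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // writes committed after GetSnapshot() are not observed here
  }
  db->ReleaseSnapshot(snap);
}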
1519
1520
0
void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
1521
0
  is_key_seqnum_zero_ = false;
1522
0
  SequenceNumber seq = sequence_;
1523
0
  saved_key_.Clear();
1524
0
  saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_);
1525
1526
0
  if (iterate_lower_bound_ != nullptr &&
1527
0
      user_comparator_.CompareWithoutTimestamp(
1528
0
          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
1529
0
          /*b_has_ts=*/false) < 0) {
1530
    // Seek key is smaller than the lower bound.
1531
0
    saved_key_.Clear();
1532
0
    saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek,
1533
0
                              timestamp_ub_);
1534
0
  }
1535
0
}
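Because SetSavedKeyToSeekTarget clamps targets below iterate_lower_bound up to the bound, a Seek() under the bound behaves exactly like a seek to the bound itself. A hedged usage sketch, assuming an open DB* named db and illustrative keys.

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/slice.h"

void SeekBelowLowerBound(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::Slice lower_bound("b");
  ROCKSDB_NAMESPACE::ReadOptions ro;
  ro.iterate_lower_bound = &lower_bound;  // must outlive the iterator
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
  it->Seek("a");  // clamped: positions on the first key >= "b"
}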
1536
1537
4.63k
void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
1538
4.63k
  is_key_seqnum_zero_ = false;
1539
4.63k
  saved_key_.Clear();
1540
  // Now saved_key_ is used to store the internal key.
1541
4.63k
  saved_key_.SetInternalKey(target, 0 /* sequence_number */,
1542
4.63k
                            kValueTypeForSeekForPrev, timestamp_ub_);
1543
1544
4.63k
  if (timestamp_size_ > 0) {
1545
0
    const std::string kTsMin(timestamp_size_, '\0');
1546
0
    Slice ts = kTsMin;
1547
0
    saved_key_.UpdateInternalKey(
1548
0
        /*seq=*/0, kValueTypeForSeekForPrev,
1549
0
        timestamp_lb_ == nullptr ? &ts : timestamp_lb_);
1550
0
  }
1551
1552
4.63k
  if (iterate_upper_bound_ != nullptr &&
1553
4.63k
      user_comparator_.CompareWithoutTimestamp(
1554
0
          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_,
1555
0
          /*b_has_ts=*/false) >= 0) {
1556
0
    saved_key_.Clear();
1557
0
    saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber,
1558
0
                              kValueTypeForSeekForPrev, timestamp_ub_);
1559
0
    if (timestamp_size_ > 0) {
1560
0
      const std::string kTsMax(timestamp_size_, '\xff');
1561
0
      Slice ts = kTsMax;
1562
0
      saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev,
1563
0
                                   &ts);
1564
0
    }
1565
0
  }
1566
4.63k
}
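Symmetrically, SeekForPrev() targets at or above iterate_upper_bound are clamped so the iterator lands on the last key strictly below the bound. A hedged usage sketch, assuming an open DB* named db and illustrative keys.

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/slice.h"

void SeekForPrevAboveUpperBound(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::Slice upper_bound("m");
  ROCKSDB_NAMESPACE::ReadOptions ro;
  ro.iterate_upper_bound = &upper_bound;  // exclusive; must outlive iterator
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
  it->SeekForPrev("z");  // clamped: lands on the largest key < "m"
}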
1567
1568
0
void DBIter::Seek(const Slice& target) {
1569
0
  PERF_COUNTER_ADD(iter_seek_count, 1);
1570
0
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1571
0
  StopWatch sw(clock_, statistics_, DB_SEEK);
1572
1573
0
  if (cfh_ != nullptr) {
1574
    // TODO: What do we do if this returns an error?
1575
0
    Slice lower_bound, upper_bound;
1576
0
    if (iterate_lower_bound_ != nullptr) {
1577
0
      lower_bound = *iterate_lower_bound_;
1578
0
    } else {
1579
0
      lower_bound = Slice("");
1580
0
    }
1581
0
    if (iterate_upper_bound_ != nullptr) {
1582
0
      upper_bound = *iterate_upper_bound_;
1583
0
    } else {
1584
0
      upper_bound = Slice("");
1585
0
    }
1586
0
    cfh_->db()
1587
0
        ->TraceIteratorSeek(cfh_->cfd()->GetID(), target, lower_bound,
1588
0
                            upper_bound)
1589
0
        .PermitUncheckedError();
1590
0
  }
1591
1592
0
  status_ = Status::OK();
1593
0
  ReleaseTempPinnedData();
1594
0
  ResetBlobData();
1595
0
  ResetValueAndColumns();
1596
0
  ResetInternalKeysSkippedCounter();
1597
0
  MarkMemtableForFlushForAvgTrigger();
1598
1599
  // Seek the inner iterator based on the target key.
1600
0
  {
1601
0
    PERF_TIMER_GUARD(seek_internal_seek_time);
1602
1603
0
    SetSavedKeyToSeekTarget(target);
1604
0
    iter_.Seek(saved_key_.GetInternalKey());
1605
1606
0
    RecordTick(statistics_, NUMBER_DB_SEEK);
1607
0
  }
1608
0
  if (!iter_.Valid()) {
1609
0
    valid_ = false;
1610
0
    return;
1611
0
  }
1612
0
  direction_ = kForward;
1613
1614
  // Now the inner iterator is placed at the target position. From there,
1615
  // we need to find out the next key that is visible to the user.
1616
0
  ClearSavedValue();
1617
0
  if (prefix_same_as_start_) {
1618
    // The case where the iterator needs to be invalidated if it has exhausted
1619
    // keys within the same prefix of the seek key.
1620
0
    assert(prefix_extractor_ != nullptr);
1621
0
    Slice target_prefix = prefix_extractor_->Transform(target);
1622
0
    FindNextUserEntry(false /* not skipping saved_key */,
1623
0
                      &target_prefix /* prefix */);
1624
0
    if (valid_) {
1625
      // Remember the prefix of the seek key for the future Next() call to
1626
      // check.
1627
0
      prefix_.SetUserKey(target_prefix);
1628
0
    }
1629
0
  } else {
1630
0
    FindNextUserEntry(false /* not skipping saved_key */, nullptr);
1631
0
  }
1632
0
  if (!valid_) {
1633
0
    return;
1634
0
  }
1635
1636
  // Updating stats and perf context counters.
1637
0
  if (statistics_ != nullptr) {
1638
    // The seek landed on a visible key; record it and the bytes read.
1639
0
    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1640
0
    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1641
0
  }
1642
0
  PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1643
0
}
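The prefix_same_as_start_ branch above keeps the iterator inside the prefix of the seek target instead of scanning past it. A hedged usage sketch with a fixed-length prefix extractor, assuming an open DB* named db that was built with matching options; the 4-byte prefix length and key names are illustrative.

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

void PrefixScan(ROCKSDB_NAMESPACE::DB* db) {
  // The DB must have been opened with a prefix extractor, e.g.:
  //   options.prefix_extractor.reset(
  //       ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
  ROCKSDB_NAMESPACE::ReadOptions ro;
  ro.prefix_same_as_start = true;
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
  for (it->Seek("user1"); it->Valid(); it->Next()) {
    // Valid() turns false once the 4-byte prefix "user" is exhausted.
  }
}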
1644
1645
4.63k
void DBIter::SeekForPrev(const Slice& target) {
1646
4.63k
  PERF_COUNTER_ADD(iter_seek_count, 1);
1647
4.63k
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1648
4.63k
  StopWatch sw(clock_, statistics_, DB_SEEK);
1649
1650
4.63k
  if (cfh_ != nullptr) {
1651
    // TODO: What do we do if this returns an error?
1652
4.63k
    Slice lower_bound, upper_bound;
1653
4.63k
    if (iterate_lower_bound_ != nullptr) {
1654
0
      lower_bound = *iterate_lower_bound_;
1655
4.63k
    } else {
1656
4.63k
      lower_bound = Slice("");
1657
4.63k
    }
1658
4.63k
    if (iterate_upper_bound_ != nullptr) {
1659
0
      upper_bound = *iterate_upper_bound_;
1660
4.63k
    } else {
1661
4.63k
      upper_bound = Slice("");
1662
4.63k
    }
1663
4.63k
    cfh_->db()
1664
4.63k
        ->TraceIteratorSeekForPrev(cfh_->cfd()->GetID(), target, lower_bound,
1665
4.63k
                                   upper_bound)
1666
4.63k
        .PermitUncheckedError();
1667
4.63k
  }
1668
1669
4.63k
  status_ = Status::OK();
1670
4.63k
  ReleaseTempPinnedData();
1671
4.63k
  ResetBlobData();
1672
4.63k
  ResetValueAndColumns();
1673
4.63k
  ResetInternalKeysSkippedCounter();
1674
4.63k
  MarkMemtableForFlushForAvgTrigger();
1675
1676
  // Seek the inner iterator based on the target key.
1677
4.63k
  {
1678
4.63k
    PERF_TIMER_GUARD(seek_internal_seek_time);
1679
4.63k
    SetSavedKeyToSeekForPrevTarget(target);
1680
4.63k
    iter_.SeekForPrev(saved_key_.GetInternalKey());
1681
4.63k
    RecordTick(statistics_, NUMBER_DB_SEEK);
1682
4.63k
  }
1683
4.63k
  if (!iter_.Valid()) {
1684
670
    valid_ = false;
1685
670
    return;
1686
670
  }
1687
3.96k
  direction_ = kReverse;
1688
1689
  // Now the inner iterator is placed at the target position. From there,
1690
  // we need to find out the first key that is visible to the user in the
1691
  // backward direction.
1692
3.96k
  ClearSavedValue();
1693
3.96k
  if (prefix_same_as_start_) {
1694
    // The case where the iterator needs to be invalidated if it has exhausted
1695
    // keys within the same prefix of the seek key.
1696
0
    assert(prefix_extractor_ != nullptr);
1697
0
    Slice target_prefix = prefix_extractor_->Transform(target);
1698
0
    PrevInternal(&target_prefix);
1699
0
    if (valid_) {
1700
      // Remember the prefix of the seek key for the future Prev() call to
1701
      // check.
1702
0
      prefix_.SetUserKey(target_prefix);
1703
0
    }
1704
3.96k
  } else {
1705
3.96k
    PrevInternal(nullptr);
1706
3.96k
  }
1707
1708
  // Report stats and perf context.
1709
3.96k
  if (statistics_ != nullptr && valid_) {
1710
0
    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1711
0
    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1712
0
    PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1713
0
  }
1714
3.96k
}
1715
1716
9.41k
void DBIter::SeekToFirst() {
1717
9.41k
  if (iterate_lower_bound_ != nullptr) {
1718
0
    Seek(*iterate_lower_bound_);
1719
0
    return;
1720
0
  }
1721
9.41k
  PERF_COUNTER_ADD(iter_seek_count, 1);
1722
9.41k
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1723
  // Don't use iter_.Seek() if we set a prefix extractor
1724
  // because prefix seek will be used.
1725
9.41k
  if (!expect_total_order_inner_iter()) {
1726
0
    max_skip_ = std::numeric_limits<uint64_t>::max();
1727
0
  }
1728
9.41k
  status_ = Status::OK();
1729
  // If the iterator is empty, this status_ could be unchecked.
1730
9.41k
  status_.PermitUncheckedError();
1731
9.41k
  direction_ = kForward;
1732
9.41k
  ReleaseTempPinnedData();
1733
9.41k
  ResetBlobData();
1734
9.41k
  ResetValueAndColumns();
1735
9.41k
  ResetInternalKeysSkippedCounter();
1736
9.41k
  MarkMemtableForFlushForAvgTrigger();
1737
9.41k
  ClearSavedValue();
1738
9.41k
  is_key_seqnum_zero_ = false;
1739
1740
9.41k
  {
1741
9.41k
    PERF_TIMER_GUARD(seek_internal_seek_time);
1742
9.41k
    iter_.SeekToFirst();
1743
9.41k
  }
1744
1745
9.41k
  RecordTick(statistics_, NUMBER_DB_SEEK);
1746
9.41k
  if (iter_.Valid()) {
1747
7.73k
    saved_key_.SetUserKey(
1748
7.73k
        ExtractUserKey(iter_.key()),
1749
7.73k
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
1750
7.73k
    FindNextUserEntry(false /* not skipping saved_key */,
1751
7.73k
                      nullptr /* no prefix check */);
1752
7.73k
    if (statistics_ != nullptr) {
1753
0
      if (valid_) {
1754
0
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1755
0
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1756
0
        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1757
0
      }
1758
0
    }
1759
7.73k
  } else {
1760
1.68k
    valid_ = false;
1761
1.68k
  }
1762
9.41k
  if (valid_ && prefix_same_as_start_) {
1763
0
    assert(prefix_extractor_ != nullptr);
1764
0
    prefix_.SetUserKey(prefix_extractor_->Transform(
1765
0
        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
1766
0
  }
1767
9.41k
}
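The NUMBER_DB_SEEK, NUMBER_DB_SEEK_FOUND, and ITER_BYTES_READ tickers recorded in the seek paths above are only populated when a Statistics object is attached to the options. A hedged sketch of enabling and reading them; the surrounding function is illustrative.

#include <cstdint>

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void ReadSeekTickers() {
  ROCKSDB_NAMESPACE::Options opts;
  opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  // ... open a DB with opts and perform some seeks/scans ...
  uint64_t seeks =
      opts.statistics->getTickerCount(ROCKSDB_NAMESPACE::NUMBER_DB_SEEK);
  uint64_t found =
      opts.statistics->getTickerCount(ROCKSDB_NAMESPACE::NUMBER_DB_SEEK_FOUND);
  uint64_t bytes =
      opts.statistics->getTickerCount(ROCKSDB_NAMESPACE::ITER_BYTES_READ);
  (void)seeks;
  (void)found;
  (void)bytes;
}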
1768
1769
0
void DBIter::SeekToLast() {
1770
0
  if (iterate_upper_bound_ != nullptr) {
1771
    // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
1772
0
    SeekForPrev(*iterate_upper_bound_);
1773
#ifndef NDEBUG
1774
    Slice k = Valid() ? key() : Slice();
1775
    if (Valid() && timestamp_size_ > 0 && timestamp_lb_) {
1776
      k.remove_suffix(kNumInternalBytes + timestamp_size_);
1777
    }
1778
    assert(!Valid() || user_comparator_.CompareWithoutTimestamp(
1779
                           k, /*a_has_ts=*/false, *iterate_upper_bound_,
1780
                           /*b_has_ts=*/false) < 0);
1781
#endif
1782
0
    return;
1783
0
  }
1784
1785
0
  PERF_COUNTER_ADD(iter_seek_count, 1);
1786
0
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1787
  // Don't use iter_.Seek() if we set a prefix extractor
1788
  // because prefix seek will be used.
1789
0
  if (!expect_total_order_inner_iter()) {
1790
0
    max_skip_ = std::numeric_limits<uint64_t>::max();
1791
0
  }
1792
0
  status_ = Status::OK();
1793
  // If the iterator is empty, this status_ could be unchecked.
1794
0
  status_.PermitUncheckedError();
1795
0
  direction_ = kReverse;
1796
0
  ReleaseTempPinnedData();
1797
0
  ResetBlobData();
1798
0
  ResetValueAndColumns();
1799
0
  ResetInternalKeysSkippedCounter();
1800
0
  MarkMemtableForFlushForAvgTrigger();
1801
0
  ClearSavedValue();
1802
0
  is_key_seqnum_zero_ = false;
1803
1804
0
  {
1805
0
    PERF_TIMER_GUARD(seek_internal_seek_time);
1806
0
    iter_.SeekToLast();
1807
0
  }
1808
0
  PrevInternal(nullptr);
1809
0
  if (statistics_ != nullptr) {
1810
0
    RecordTick(statistics_, NUMBER_DB_SEEK);
1811
0
    if (valid_) {
1812
0
      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1813
0
      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1814
0
      PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1815
0
    }
1816
0
  }
1817
0
  if (valid_ && prefix_same_as_start_) {
1818
0
    assert(prefix_extractor_ != nullptr);
1819
0
    prefix_.SetUserKey(prefix_extractor_->Transform(
1820
0
        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
1821
0
  }
1822
0
}
1823
}  // namespace ROCKSDB_NAMESPACE