Coverage Report

Created: 2026-05-31 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/db/db_iter.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#include "db/db_iter.h"
11
12
#include <limits>
13
#include <string>
14
#include <vector>
15
16
#include "db/blob/blob_fetcher.h"
17
#include "db/blob/blob_file_partition_manager.h"
18
#include "db/blob/blob_index.h"
19
#include "db/dbformat.h"
20
#include "db/merge_context.h"
21
#include "db/merge_helper.h"
22
#include "db/pinned_iterators_manager.h"
23
#include "db/wide/wide_column_serialization.h"
24
#include "db/wide/wide_columns_helper.h"
25
#include "file/filename.h"
26
#include "logging/logging.h"
27
#include "memory/arena.h"
28
#include "monitoring/perf_context_imp.h"
29
#include "port/likely.h"
30
#include "rocksdb/env.h"
31
#include "rocksdb/io_dispatcher.h"
32
#include "rocksdb/iterator.h"
33
#include "rocksdb/merge_operator.h"
34
#include "rocksdb/options.h"
35
#include "rocksdb/system_clock.h"
36
#include "table/internal_iterator.h"
37
#include "table/iterator_wrapper.h"
38
#include "trace_replay/trace_replay.h"
39
#include "util/mutexlock.h"
40
#include "util/string_util.h"
41
#include "util/user_comparator_wrapper.h"
42
43
namespace ROCKSDB_NAMESPACE {
44
45
namespace {
46
47
15.8k
bool HasFullTimestampVisibility(const ReadOptions& read_options) {
48
15.8k
  if (read_options.iter_start_ts != nullptr) {
49
0
    return false;
50
0
  }
51
15.8k
  if (read_options.timestamp == nullptr) {
52
15.8k
    return true;
53
15.8k
  }
54
0
  const Slice ts = *read_options.timestamp;
55
0
  for (size_t i = 0; i < ts.size(); ++i) {
56
0
    if (static_cast<unsigned char>(ts[i]) != 0xff) {
57
0
      return false;
58
0
    }
59
0
  }
60
0
  return true;
61
0
}
62
63
}  // namespace
64
65
DBIter::DBIter(Env* _env, const ReadOptions& read_options,
66
               const ImmutableOptions& ioptions,
67
               const MutableCFOptions& mutable_cf_options,
68
               const Comparator* cmp, InternalIterator* iter,
69
               const Version* version, SequenceNumber s, bool arena_mode,
70
               ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
71
               bool expose_blob_index, ReadOnlyMemTable* active_mem)
72
15.8k
    : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
73
15.8k
      env_(_env),
74
15.8k
      clock_(ioptions.clock),
75
15.8k
      logger_(ioptions.logger),
76
15.8k
      user_comparator_(cmp),
77
15.8k
      merge_operator_(ioptions.merge_operator.get()),
78
15.8k
      iter_(iter),
79
15.8k
      blob_state_(
80
15.8k
          version, read_options.read_tier, read_options.verify_checksums,
81
15.8k
          read_options.fill_cache, read_options.io_activity,
82
15.8k
          cfh ? cfh->cfd()->blob_file_cache() : nullptr,
83
15.8k
          cfh != nullptr && cfh->cfd()->blob_partition_manager() != nullptr),
84
15.8k
      read_callback_(read_callback),
85
15.8k
      sequence_(s),
86
15.8k
      value_columns_state_(version, read_options, cfh),
87
15.8k
      statistics_(ioptions.stats),
88
15.8k
      max_skip_(mutable_cf_options.max_sequential_skip_in_iterations),
89
15.8k
      max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
90
15.8k
      num_internal_keys_skipped_(0),
91
15.8k
      iterate_lower_bound_(read_options.iterate_lower_bound),
92
15.8k
      iterate_upper_bound_(read_options.iterate_upper_bound),
93
15.8k
      cfh_(cfh),
94
15.8k
      timestamp_ub_(read_options.timestamp),
95
15.8k
      timestamp_lb_(read_options.iter_start_ts),
96
15.8k
      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0),
97
15.8k
      active_mem_(active_mem),
98
15.8k
      memtable_seqno_lb_(kMaxSequenceNumber),
99
15.8k
      memtable_op_scan_flush_trigger_(0),
100
15.8k
      avg_op_scan_flush_trigger_(0),
101
15.8k
      iter_step_since_seek_(1),
102
15.8k
      mem_hidden_op_scanned_since_seek_(0),
103
15.8k
      contiguous_tombstone_count_(0),
104
15.8k
      direction_(kForward),
105
15.8k
      valid_(false),
106
15.8k
      current_entry_is_merged_(false),
107
15.8k
      is_key_seqnum_zero_(false),
108
      prefix_same_as_start_(
109
15.8k
          prefix_extractor_ ? read_options.prefix_same_as_start : false),
110
15.8k
      pin_thru_lifetime_(read_options.pin_data),
111
15.8k
      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
112
0
                                     read_options.total_order_seek ||
113
0
                                     read_options.auto_prefix_mode),
114
      // Read-path range conversion assumes the scan can observe all interior
115
      // live keys. table_filter can hide whole SSTs, and timestamp filtering
116
      // can hide newer UDT versions unless the read is at max timestamp with no
117
      // lower timestamp bound. Legacy prefix iterators without
118
      // prefix_same_as_start do not guarantee complete scans, so conversion
119
      // must stay disabled for the iterator lifetime.
120
      min_tombstones_for_range_conversion_(
121
15.8k
          active_mem != nullptr && !read_options.table_filter &&
122
15.8k
                  (expect_total_order_inner_iter_ || prefix_same_as_start_) &&
123
15.8k
                  HasFullTimestampVisibility(read_options)
124
15.8k
              ? mutable_cf_options.min_tombstones_for_range_conversion
125
15.8k
              : 0),
126
15.8k
      expose_blob_index_(expose_blob_index),
127
15.8k
      allow_unprepared_value_(read_options.allow_unprepared_value),
128
15.8k
      arena_mode_(arena_mode) {
129
15.8k
  RecordTick(statistics_, NO_ITERATOR_CREATED);
130
15.8k
  if (pin_thru_lifetime_) {
131
0
    pinned_iters_mgr_.StartPinning();
132
0
  }
133
15.8k
  if (iter_.iter()) {
134
0
    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
135
0
  }
136
15.8k
  status_.PermitUncheckedError();
137
15.8k
  assert(timestamp_size_ ==
138
15.8k
         user_comparator_.user_comparator()->timestamp_size());
139
  // prefix_seek_opt_in_only should force total_order_seek whereever the caller
140
  // is duplicating the original ReadOptions
141
15.8k
  assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
142
15.8k
  if (active_mem_) {
143
    // FIXME: GetEarliestSequenceNumber() may return a seqno that is one smaller
144
    // than the smallest seqno in the memtable. This violates its comment and
145
    // entries with that seqno may not be in the active memtable. Before it's
146
    // fixed, we use GetFirstSequenceNumber() for more accurate result.
147
15.8k
    memtable_seqno_lb_ = active_mem_->IsEmpty()
148
15.8k
                             ? active_mem_->GetEarliestSequenceNumber()
149
15.8k
                             : active_mem_->GetFirstSequenceNumber();
150
15.8k
    memtable_op_scan_flush_trigger_ =
151
15.8k
        mutable_cf_options.memtable_op_scan_flush_trigger;
152
15.8k
    if (memtable_op_scan_flush_trigger_) {
153
      // avg_op_scan_flush_trigger_ requires memtable_op_scan_flush_trigger_ > 0
154
0
      avg_op_scan_flush_trigger_ =
155
0
          mutable_cf_options.memtable_avg_op_scan_flush_trigger;
156
0
    }
157
15.8k
  } else {
158
    // memtable_op_scan_flush_trigger_ and avg_op_scan_flush_trigger_ are
159
    // initialized to 0(disabled) as default.
160
0
  }
161
15.8k
}
162
163
0
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
164
0
  if (prop == nullptr) {
165
0
    return Status::InvalidArgument("prop is nullptr");
166
0
  }
167
0
  if (prop_name == "rocksdb.iterator.super-version-number") {
168
    // First try to pass the value returned from inner iterator.
169
0
    return iter_.iter()->GetProperty(prop_name, prop);
170
0
  } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
171
0
    if (valid_) {
172
0
      *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
173
0
    } else {
174
0
      *prop = "Iterator is not valid.";
175
0
    }
176
0
    return Status::OK();
177
0
  } else if (prop_name == "rocksdb.iterator.is-value-pinned") {
178
0
    if (valid_) {
179
0
      *prop = (pin_thru_lifetime_ && iter_.Valid() &&
180
0
               iter_.value().data() == value_columns_state_->value().data())
181
0
                  ? "1"
182
0
                  : "0";
183
0
    } else {
184
0
      *prop = "Iterator is not valid.";
185
0
    }
186
0
    return Status::OK();
187
0
  } else if (prop_name == "rocksdb.iterator.internal-key") {
188
0
    *prop = saved_key_.GetUserKey().ToString();
189
0
    return Status::OK();
190
0
  } else if (prop_name == "rocksdb.iterator.write-time") {
191
0
    PutFixed64(prop, saved_write_unix_time_);
192
0
    return Status::OK();
193
0
  }
194
0
  return Status::InvalidArgument("Unidentified property.");
195
0
}
196
197
70.2k
bool DBIter::ParseKey(ParsedInternalKey* ikey) {
198
70.2k
  Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */);
199
70.2k
  if (!s.ok()) {
200
0
    status_ = Status::Corruption("In DBIter: ", s.getState());
201
0
    valid_ = false;
202
0
    ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState());
203
0
    return false;
204
70.2k
  } else {
205
70.2k
    return true;
206
70.2k
  }
207
70.2k
}
208
209
23.1k
void DBIter::Next() {
210
23.1k
  assert(valid_);
211
23.1k
  assert(status_.ok());
212
213
23.1k
  PERF_COUNTER_ADD(iter_next_count, 1);
214
23.1k
  PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
215
  // Release temporarily pinned blocks from last operation
216
23.1k
  ReleaseTempPinnedData();
217
23.1k
  ResetBlobData();
218
23.1k
  ResetValueAndColumns();
219
23.1k
  local_stats_.skip_count_ += num_internal_keys_skipped_;
220
23.1k
  local_stats_.skip_count_--;
221
23.1k
  num_internal_keys_skipped_ = 0;
222
23.1k
  iter_step_since_seek_++;
223
23.1k
  bool ok = true;
224
23.1k
  if (direction_ == kReverse) {
225
0
    is_key_seqnum_zero_ = false;
226
0
    ResetContiguousTombstoneTracking();
227
0
    if (!ReverseToForward()) {
228
0
      ok = false;
229
0
    }
230
23.1k
  } else if (!current_entry_is_merged_) {
231
    // If the current value is not a merge, the iter position is the
232
    // current key, which is already returned. We can safely issue a
233
    // Next() without checking the current key.
234
    // If the current key is a merge, very likely iter already points
235
    // to the next internal position.
236
23.1k
    assert(iter_.Valid());
237
23.1k
    iter_.Next();
238
23.1k
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
239
23.1k
  }
240
241
23.1k
  local_stats_.next_count_++;
242
23.1k
  if (ok && iter_.Valid()) {
243
19.0k
    ClearSavedValue();
244
245
19.0k
    FindNextUserEntry(true /* skipping the current user key */);
246
19.0k
  } else {
247
4.12k
    is_key_seqnum_zero_ = false;
248
4.12k
    valid_ = false;
249
4.12k
  }
250
23.1k
  if (statistics_ != nullptr && valid_) {
251
0
    local_stats_.next_found_count_++;
252
0
    local_stats_.bytes_read_ += (key().size() + value().size());
253
0
  }
254
23.1k
}
255
256
Status DBIter::BlobReader::RetrieveAndSetBlobValue(
257
    const Slice& user_key, const Slice& blob_index,
258
0
    bool allow_write_path_fallback) {
259
0
  assert(blob_value_.empty());
260
261
0
  if (!version_ && (!allow_write_path_fallback || !blob_file_cache_)) {
262
0
    return Status::Corruption("Encountered unexpected blob index.");
263
0
  }
264
265
  // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
266
  // avoid having to copy options back and forth.
267
  // TODO: plumb Env::IOPriority
268
0
  ReadOptions read_options;
269
0
  read_options.read_tier = read_tier_;
270
0
  read_options.verify_checksums = verify_checksums_;
271
0
  read_options.fill_cache = fill_cache_;
272
0
  read_options.io_activity = io_activity_;
273
0
  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
274
0
  constexpr uint64_t* bytes_read = nullptr;
275
276
0
  if (!allow_write_path_fallback) {
277
0
    assert(version_ != nullptr);
278
0
    return version_->GetBlob(read_options, user_key, blob_index,
279
0
                             prefetch_buffer, &blob_value_, bytes_read);
280
0
  }
281
282
0
  BlobIndex blob_idx;
283
0
  Status s = blob_idx.DecodeFrom(blob_index);
284
0
  if (!s.ok()) {
285
0
    return s;
286
0
  }
287
288
0
  return BlobFilePartitionManager::ResolveBlobDirectWriteIndex(
289
0
      read_options, user_key, blob_idx, version_, blob_file_cache_,
290
0
      prefetch_buffer, &blob_value_, bytes_read);
291
0
}
292
293
0
BlobFetcher DBIter::BlobReader::CreateBlobFetcher() const {
294
0
  ReadOptions read_options;
295
0
  read_options.read_tier = read_tier_;
296
0
  read_options.verify_checksums = verify_checksums_;
297
0
  read_options.fill_cache = fill_cache_;
298
0
  read_options.io_activity = io_activity_;
299
0
  return BlobFetcher(version_, read_options, blob_file_cache_,
300
0
                     allow_write_path_fallback_);
301
0
}
302
303
bool DBIter::SetValueAndColumnsFromBlobImpl(const Slice& user_key,
304
0
                                            const Slice& blob_index) {
305
  // Keep the non-BDW iterator path on the pre-existing Version::GetBlob()
306
  // fast path. Only enable the direct-write fallback when this CF actually
307
  // has a write-path partition manager.
308
0
  const bool allow_write_path_fallback =
309
0
      cfh_ != nullptr && cfh_->cfd()->blob_partition_manager() != nullptr;
310
0
  const Status s = blob_state_.mut()->reader.RetrieveAndSetBlobValue(
311
0
      user_key, blob_index, allow_write_path_fallback);
312
0
  if (!s.ok()) {
313
0
    status_ = s;
314
0
    valid_ = false;
315
0
    blob_state_.mut()->is_blob = false;
316
0
    return false;
317
0
  }
318
319
0
  SetValueAndColumnsFromPlain(blob_state_->reader.GetBlobValue());
320
321
0
  return true;
322
0
}
323
324
bool DBIter::SetValueAndColumnsFromBlob(const Slice& user_key,
325
0
                                        const Slice& blob_index) {
326
0
  assert(!blob_state_->is_blob);
327
0
  blob_state_.mut()->is_blob = true;
328
329
0
  if (expose_blob_index_) {
330
0
    SetValueAndColumnsFromPlain(blob_index);
331
0
    return true;
332
0
  }
333
334
0
  if (allow_unprepared_value_) {
335
0
    assert(value_columns_state_->value().empty());
336
0
    assert(value_columns_state_->wide_columns().empty());
337
338
0
    assert(blob_state_->lazy_blob_index.empty());
339
0
    blob_state_.mut()->lazy_blob_index = blob_index;
340
341
0
    return true;
342
0
  }
343
344
0
  return SetValueAndColumnsFromBlobImpl(user_key, blob_index);
345
0
}
346
347
0
bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
348
  // Auto-marks dirty via mut() up front since every successful path below
349
  // populates wide_columns_ (and possibly the lazy entity/blob column
350
  // vectors).
351
0
  auto& state = *value_columns_state_.mut();
352
0
  state.AssertReadyForEntity();
353
354
  // Fast path: if no blob columns, use the simpler Deserialize
355
0
  bool has_blob_columns = false;
356
0
  {
357
0
    const Status s_hbc =
358
0
        WideColumnSerialization::HasBlobColumns(slice, has_blob_columns);
359
0
    if (!s_hbc.ok()) {
360
0
      status_ = s_hbc;
361
0
      valid_ = false;
362
0
      return false;
363
0
    }
364
0
  }
365
0
  if (LIKELY(!has_blob_columns)) {
366
0
    WideColumns& wide_columns = state.wide_columns();
367
0
    const Status s = WideColumnSerialization::Deserialize(slice, wide_columns);
368
369
0
    if (!s.ok()) {
370
0
      status_ = s;
371
0
      valid_ = false;
372
0
      state.ClearWideColumns();
373
0
      return false;
374
0
    }
375
376
0
    state.MaybeSetValueFromMaterializedDefaultColumn();
377
0
    return true;
378
0
  }
379
380
  // Entity has blob columns.
381
  // First, copy the serialized data to the saved entity buffer so that column
382
  // name/value Slices remain valid after the internal iterator moves.
383
  // Guard: if slice already aliases that saved buffer (e.g., when called from
384
  // SetValueAndColumnsFromMergeResult), skip the redundant copy to
385
  // avoid self-aliased std::string::assign (undefined behavior).
386
0
  state.SaveEntitySliceIfNeeded(slice);
387
388
0
  {
389
0
    Slice input_copy = state.PrepareForLazyEntityDeserialize();
390
0
    const Status s = WideColumnSerialization::DeserializeV2(
391
0
        input_copy, state.lazy_entity_columns(), state.lazy_blob_columns());
392
393
0
    if (!s.ok()) {
394
0
      status_ = s;
395
0
      valid_ = false;
396
0
      state.ClearLazyEntity();
397
0
      return false;
398
0
    }
399
0
  }
400
401
  // Iterator positions must expose fully prepared values and columns once
402
  // Valid() becomes true, so resolve and materialize all blob columns here.
403
0
  state.BindLazyEntity(saved_key_.GetUserKey());
404
0
  if (!MaterializeLazyEntityColumns()) {
405
0
    state.ClearLazyEntity();
406
0
    return false;
407
0
  }
408
0
  state.MaybeSetValueFromMaterializedDefaultColumn();
409
410
0
  return true;
411
0
}
412
413
0
bool DBIter::MaterializeLazyEntityColumns() const {
414
0
  const auto& state = *value_columns_state_;
415
0
  if (state.lazy_entity_columns().empty() || !state.wide_columns().empty()) {
416
0
    return true;
417
0
  }
418
419
0
  std::lock_guard<std::mutex> lock(state.lazy_entity_columns_mutex());
420
0
  if (state.lazy_entity_columns().empty() || !state.wide_columns().empty()) {
421
0
    return true;
422
0
  }
423
424
0
  DBIter* const mutable_this = const_cast<DBIter*>(this);
425
0
  auto& mutable_state = *mutable_this->value_columns_state_.mut();
426
0
  WideColumns materialized_columns;
427
0
  materialized_columns.reserve(state.lazy_entity_columns().size());
428
0
  for (const auto& col : state.lazy_entity_columns()) {
429
0
    materialized_columns.emplace_back(col.name(), col.value());
430
0
  }
431
432
0
  for (const auto& blob_col : state.lazy_blob_columns()) {
433
0
    Slice resolved_value;
434
0
    const Status s = mutable_state.entity_blob_resolver().ResolveColumn(
435
0
        blob_col.first, &resolved_value);
436
0
    if (!s.ok()) {
437
0
      mutable_this->status_ = s;
438
0
      mutable_this->valid_ = false;
439
0
      mutable_state.wide_columns().clear();
440
0
      return false;
441
0
    }
442
443
0
    materialized_columns[blob_col.first].value() = resolved_value;
444
0
  }
445
446
0
  mutable_state.wide_columns() = std::move(materialized_columns);
447
0
  return true;
448
0
}
449
450
bool DBIter::SetValueAndColumnsFromMergeResult(const Status& merge_status,
451
0
                                               ValueType result_type) {
452
0
  if (!merge_status.ok()) {
453
0
    valid_ = false;
454
0
    status_ = merge_status;
455
0
    return false;
456
0
  }
457
458
0
  if (result_type == kTypeWideColumnEntity) {
459
0
    if (!SetValueAndColumnsFromEntity(value_columns_state_->saved_value())) {
460
0
      assert(!valid_);
461
0
      return false;
462
0
    }
463
464
0
    valid_ = true;
465
0
    return true;
466
0
  }
467
468
0
  assert(result_type == kTypeValue);
469
0
  SetValueAndColumnsFromPlain(pinned_value_.data()
470
0
                                  ? pinned_value_
471
0
                                  : value_columns_state_->saved_value());
472
0
  valid_ = true;
473
0
  return true;
474
0
}
475
476
0
bool DBIter::PrepareValue() {
477
0
  assert(valid_);
478
479
0
  if (blob_state_->lazy_blob_index.empty()) {
480
0
    return true;
481
0
  }
482
483
0
  assert(allow_unprepared_value_);
484
0
  assert(blob_state_->is_blob);
485
486
0
  const bool result = SetValueAndColumnsFromBlobImpl(
487
0
      saved_key_.GetUserKey(), blob_state_->lazy_blob_index);
488
489
0
  blob_state_.mut()->lazy_blob_index.clear();
490
491
0
  return result;
492
0
}
493
494
// PRE: saved_key_ has the current user key if skipping_saved_key
495
// POST: saved_key_ should have the next user key if valid_,
496
//       if the current entry is a result of merge
497
//           current_entry_is_merged_ => true
498
//           the saved merge buffer   => the merged value
499
//
500
// NOTE: In between, saved_key_ can point to a user key that has
501
//       a delete marker or a sequence number higher than sequence_
502
//       saved_key_ MUST have a proper user_key before calling this function
503
//
504
// The prefix parameter, if not null, indicates that we need to iterate
505
// within the prefix, and the iterator needs to be made invalid, if no
506
// more entry for the prefix can be found.
507
25.0k
bool DBIter::FindNextUserEntry(bool skipping_saved_key) {
508
25.0k
  PERF_TIMER_GUARD(find_next_user_entry_time);
509
25.0k
  return FindNextUserEntryInternal(skipping_saved_key);
510
25.0k
}
511
512
// Actual implementation of DBIter::FindNextUserEntry()
513
25.0k
bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key) {
514
  // Loop until we hit an acceptable entry to yield
515
25.0k
  assert(iter_.Valid());
516
25.0k
  assert(status_.ok());
517
25.0k
  assert(direction_ == kForward);
518
25.0k
  current_entry_is_merged_ = false;
519
520
  // How many times in a row we have skipped an entry with user key less than
521
  // or equal to saved_key_. We could skip these entries either because
522
  // sequence numbers were too high or because skipping_saved_key = true.
523
  // What saved_key_ contains throughout this method:
524
  //  - if skipping_saved_key : saved_key_ contains the key that we need
525
  //                            to skip, and we haven't seen any keys greater
526
  //                            than that,
527
  //  - if num_skipped > 0    : saved_key_ contains the key that we have skipped
528
  //                            num_skipped times, and we haven't seen any keys
529
  //                            greater than that,
530
  //  - none of the above     : saved_key_ can contain anything, it doesn't
531
  //                            matter.
532
25.0k
  uint64_t num_skipped = 0;
533
  // For write unprepared, the target sequence number in reseek could be larger
534
  // than the snapshot, and thus needs to be skipped again. This could result in
535
  // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
536
  // to one.
537
25.0k
  bool reseek_done = false;
538
539
25.0k
  uint64_t mem_hidden_op_scanned = 0;
540
35.7k
  do {
541
    // Will update is_key_seqnum_zero_ as soon as we parsed the current key
542
    // but we need to save the previous value to be used in the loop.
543
35.7k
    bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
544
35.7k
    if (!ParseKey(&ikey_)) {
545
0
      is_key_seqnum_zero_ = false;
546
0
      return false;
547
0
    }
548
35.7k
    Slice user_key_without_ts =
549
35.7k
        StripTimestampFromUserKey(ikey_.user_key, timestamp_size_);
550
551
35.7k
    is_key_seqnum_zero_ = (ikey_.sequence == 0);
552
553
35.7k
    assert(iterate_upper_bound_ == nullptr ||
554
35.7k
           iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound ||
555
35.7k
           user_comparator_.CompareWithoutTimestamp(
556
35.7k
               user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
557
35.7k
               /*b_has_ts=*/false) < 0);
558
35.7k
    if (iterate_upper_bound_ != nullptr &&
559
0
        iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound &&
560
0
        user_comparator_.CompareWithoutTimestamp(
561
0
            user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
562
0
            /*b_has_ts=*/false) >= 0) {
563
0
      break;
564
0
    }
565
566
35.7k
    assert(!prefix_.has_value() || prefix_extractor_ != nullptr);
567
35.7k
    if (!PrefixCheck(user_key_without_ts)) {
568
      // Insert any pending tombstone run using the last tracked delete
569
      // (saved_key_) as the end key.  We cannot use the current key's
570
      // prefix as the boundary because bloom filters may have hidden
571
      // entire prefixes between the seek prefix and the current key.
572
      // The tombstone covers n-1 of n deletes; the last remains as a
573
      // point delete.
574
0
      FlushPendingTombstoneRun(saved_key_.GetUserKey());
575
0
      if (prefix_same_as_start_) {
576
0
        break;
577
0
      }
578
0
    }
579
580
35.7k
    if (TooManyInternalKeysSkipped()) {
581
0
      return false;
582
0
    }
583
584
35.7k
    assert(ikey_.user_key.size() >= timestamp_size_);
585
35.7k
    Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey(
586
0
                                         ikey_.user_key, timestamp_size_)
587
35.7k
                                   : Slice();
588
35.7k
    bool more_recent = false;
589
35.7k
    if (IsVisible(ikey_.sequence, ts, &more_recent)) {
590
      // If the previous entry is of seqnum 0, the current entry will not
591
      // possibly be skipped. This condition can potentially be relaxed to
592
      // prev_key.seq <= ikey_.sequence. We are cautious because it will be more
593
      // prone to bugs causing the same user key with the same sequence number.
594
      // Note that with current timestamp implementation, the same user key can
595
      // have different timestamps and zero sequence number on the bottommost
596
      // level. This may change in the future.
597
35.7k
      if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) &&
598
35.5k
          skipping_saved_key &&
599
29.5k
          CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
600
6.49k
        num_skipped++;  // skip this entry
601
6.49k
        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
602
6.49k
        MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
603
29.2k
      } else {
604
29.2k
        assert(!skipping_saved_key ||
605
29.2k
               CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
606
29.2k
        num_skipped = 0;
607
29.2k
        reseek_done = false;
608
29.2k
        switch (ikey_.type) {
609
6.14k
          case kTypeDeletion:
610
6.14k
          case kTypeDeletionWithTimestamp:
611
6.14k
          case kTypeSingleDeletion:
612
            // Arrange to skip all upcoming entries for this key since
613
            // they are hidden by this deletion.
614
6.14k
            if (timestamp_lb_) {
615
0
              saved_key_.SetInternalKey(ikey_);
616
0
              valid_ = true;
617
0
              return true;
618
6.14k
            } else {
619
6.14k
              saved_key_.SetUserKey(
620
6.14k
                  ikey_.user_key, !pin_thru_lifetime_ ||
621
0
                                      !iter_.iter()->IsKeyPinned() /* copy */);
622
6.14k
              skipping_saved_key = true;
623
6.14k
              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
624
6.14k
              MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
625
              // Track contiguous tombstones for range conversion.
626
              // Skip if outside seek prefix -- the top-of-loop check
627
              // flushed any pending run, but we must also avoid starting
628
              // a new run outside the prefix.
629
6.14k
              if (min_tombstones_for_range_conversion_ > 0 &&
630
0
                  PrefixCheck(user_key_without_ts)) {
631
0
                TrackContiguousTombstone(ikey_.user_key,
632
0
                                         /*always_update_first_key=*/false);
633
0
              }
634
6.14k
            }
635
6.14k
            break;
636
23.1k
          case kTypeValue:
637
23.1k
          case kTypeValuePreferredSeqno:
638
23.1k
          case kTypeBlobIndex:
639
23.1k
          case kTypeWideColumnEntity:
640
23.1k
            if (!PrepareValueInternal()) {
641
0
              return false;
642
0
            }
643
23.1k
            FlushPendingTombstoneRun(ikey_.user_key);
644
23.1k
            if (timestamp_lb_) {
645
0
              saved_key_.SetInternalKey(ikey_);
646
23.1k
            } else {
647
23.1k
              saved_key_.SetUserKey(
648
23.1k
                  ikey_.user_key, !pin_thru_lifetime_ ||
649
0
                                      !iter_.iter()->IsKeyPinned() /* copy */);
650
23.1k
            }
651
652
23.1k
            if (ikey_.type == kTypeBlobIndex) {
653
0
              if (!SetValueAndColumnsFromBlob(ikey_.user_key, iter_.value())) {
654
0
                return false;
655
0
              }
656
23.1k
            } else if (ikey_.type == kTypeWideColumnEntity) {
657
0
              if (!SetValueAndColumnsFromEntity(iter_.value())) {
658
0
                return false;
659
0
              }
660
23.1k
            } else {
661
23.1k
              assert(ikey_.type == kTypeValue ||
662
23.1k
                     ikey_.type == kTypeValuePreferredSeqno);
663
23.1k
              Slice value = iter_.value();
664
23.1k
              saved_write_unix_time_ = iter_.write_unix_time();
665
23.1k
              if (ikey_.type == kTypeValuePreferredSeqno) {
666
0
                value = ParsePackedValueForValue(value);
667
0
              }
668
23.1k
              SetValueAndColumnsFromPlain(value);
669
23.1k
            }
670
671
23.1k
            valid_ = true;
672
23.1k
            return true;
673
0
          case kTypeMerge:
674
0
            if (!PrepareValueInternal()) {
675
0
              return false;
676
0
            }
677
0
            FlushPendingTombstoneRun(ikey_.user_key);
678
0
            saved_key_.SetUserKey(
679
0
                ikey_.user_key,
680
0
                !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
681
            // By now, we are sure the current ikey is going to yield a value
682
0
            current_entry_is_merged_ = true;
683
0
            valid_ = true;
684
0
            return MergeValuesNewToOld();  // Go to a different state machine
685
0
          default:
686
0
            valid_ = false;
687
0
            status_ = Status::Corruption(
688
0
                "Unknown value type: " +
689
0
                std::to_string(static_cast<unsigned int>(ikey_.type)));
690
0
            return false;
691
29.2k
        }
692
29.2k
      }
693
35.7k
    } else {
694
0
      if (more_recent) {
695
0
        PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
696
0
      }
697
698
      // This key was inserted after our snapshot was taken or skipped by
699
      // timestamp range. If this happens too many times in a row for the same
700
      // user key, we want to seek to the target sequence number.
701
0
      int cmp = user_comparator_.CompareWithoutTimestamp(
702
0
          ikey_.user_key, saved_key_.GetUserKey());
703
0
      if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
704
0
        num_skipped++;
705
0
      } else {
706
0
        saved_key_.SetUserKey(
707
0
            ikey_.user_key,
708
0
            !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
709
0
        skipping_saved_key = false;
710
0
        num_skipped = 0;
711
0
        reseek_done = false;
712
0
      }
713
0
    }
714
715
    // If we have sequentially iterated via numerous equal keys, then it's
716
    // better to seek so that we can avoid too many key comparisons.
717
    //
718
    // To avoid infinite loops, do not reseek if we have already attempted to
719
    // reseek previously.
720
    //
721
    // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
722
    // then it does not make sense to reseek as we would actually land further
723
    // away from the desired key. There is opportunity for optimization here.
724
12.6k
    if (num_skipped > max_skip_ && !reseek_done) {
725
230
      is_key_seqnum_zero_ = false;
726
230
      num_skipped = 0;
727
230
      reseek_done = true;
728
230
      std::string last_key;
729
230
      if (skipping_saved_key) {
730
        // We're looking for the next user-key but all we see are the same
731
        // user-key with decreasing sequence numbers. Fast forward to
732
        // sequence number 0 and type deletion (the smallest type).
733
230
        if (timestamp_size_ == 0) {
734
230
          AppendInternalKey(
735
230
              &last_key,
736
230
              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion));
737
230
        } else {
738
0
          const std::string kTsMin(timestamp_size_, '\0');
739
0
          AppendInternalKeyWithDifferentTimestamp(
740
0
              &last_key,
741
0
              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion),
742
0
              kTsMin);
743
0
        }
744
        // Don't set skipping_saved_key = false because we may still see more
745
        // user-keys equal to saved_key_.
746
230
      } else {
747
        // We saw multiple entries with this user key and sequence numbers
748
        // higher than sequence_. Fast forward to sequence_.
749
        // Note that this only covers a case when a higher key was overwritten
750
        // many times since our snapshot was taken, not the case when a lot of
751
        // different keys were inserted after our snapshot was taken.
752
0
        if (timestamp_size_ == 0) {
753
0
          AppendInternalKey(
754
0
              &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
755
0
                                           kValueTypeForSeek));
756
0
        } else {
757
0
          AppendInternalKeyWithDifferentTimestamp(
758
0
              &last_key,
759
0
              ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
760
0
                                kValueTypeForSeek),
761
0
              *timestamp_ub_);
762
0
        }
763
0
      }
764
230
      iter_.Seek(last_key);
765
230
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
766
12.4k
    } else {
767
12.4k
      iter_.Next();
768
12.4k
    }
769
770
    // This could be a long-running operation due to tombstones, etc.
771
12.6k
    bool aborted = ROCKSDB_THREAD_YIELD_CHECK_ABORT();
772
12.6k
    if (aborted) {
773
0
      valid_ = false;
774
0
      status_ = Status::Aborted("Query abort.");
775
0
      return false;
776
0
    }
777
12.6k
  } while (iter_.Valid());
778
779
  // If we accumulated tombstones, use the last tracked tombstone as the
780
  // exclusive end key. It may be more optimal to use iterator upper bound if it
781
  // exists, but the current iterator API makes that dangerous as upper bound
782
  // points to user memory which is not guaranteed immutable.
783
1.88k
  if (contiguous_tombstone_count_ > 0 && iter_.status().ok()) {
784
    // It is unsafe to use iter_.key() here even when iter_.Valid() and the key
785
    // is within the seek prefix. This is because memtable iterators are still
786
    // valid past the upper bound, but sst iterators are not. So iter_.key() can
787
    // point to a memtable entry that has skipped past real live entries in
788
    // ssts.
789
0
    assert(PrefixCheck(saved_key_.GetUserKey()));
790
0
    MaybeInsertRangeTombstone(saved_key_.GetUserKey());
791
0
  }
792
1.88k
  ResetContiguousTombstoneTracking();
793
794
1.88k
  valid_ = false;
795
1.88k
  return iter_.status().ok();
796
25.0k
}
797
798
// Merge values of the same user key starting from the current iter_ position
799
// Scan from the newer entries to older entries.
800
// PRE: iter_.key() points to the first merge type entry
801
//      saved_key_ stores the user key
802
//      iter_.PrepareValue() has been called
803
// POST: the saved merge buffer has the merged value for the user key
804
//       iter_ points to the next entry (or invalid)
805
0
bool DBIter::MergeValuesNewToOld() {
806
0
  if (!merge_operator_) {
807
0
    ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
808
0
    status_ = Status::InvalidArgument("merge_operator_ must be set.");
809
0
    valid_ = false;
810
0
    return false;
811
0
  }
812
813
  // Temporarily pin the blocks that hold merge operands
814
0
  TempPinData();
815
0
  merge_context_.Clear();
816
  // Start the merge process by pushing the first operand
817
0
  merge_context_.PushOperand(
818
0
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
819
0
  PERF_COUNTER_ADD(internal_merge_count, 1);
820
821
0
  TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
822
823
0
  ParsedInternalKey ikey;
824
0
  for (iter_.Next(); iter_.Valid(); iter_.Next()) {
825
0
    TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
826
0
    if (!ParseKey(&ikey)) {
827
0
      return false;
828
0
    }
829
830
0
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
831
0
                                                saved_key_.GetUserKey())) {
832
      // hit the next user key, stop right here
833
0
      break;
834
0
    }
835
0
    if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
836
0
        kTypeDeletionWithTimestamp == ikey.type) {
837
      // hit a delete with the same user key, stop right here
838
      // iter_ is positioned after delete
839
0
      iter_.Next();
840
0
      break;
841
0
    }
842
0
    if (!PrepareValueInternal()) {
843
0
      return false;
844
0
    }
845
846
0
    if (kTypeValue == ikey.type || kTypeValuePreferredSeqno == ikey.type) {
847
0
      Slice value = iter_.value();
848
0
      saved_write_unix_time_ = iter_.write_unix_time();
849
0
      if (kTypeValuePreferredSeqno == ikey.type) {
850
0
        value = ParsePackedValueForValue(value);
851
0
      }
852
      // hit a put or put equivalent, merge the put value with operands and
853
      // store the final result in the saved merge buffer. We are done!
854
0
      if (!MergeWithPlainBaseValue(value, ikey.user_key)) {
855
0
        return false;
856
0
      }
857
      // iter_ is positioned after put
858
0
      iter_.Next();
859
0
      if (!iter_.status().ok()) {
860
0
        valid_ = false;
861
0
        return false;
862
0
      }
863
0
      return true;
864
0
    } else if (kTypeMerge == ikey.type) {
865
      // hit a merge, add the value as an operand and run associative merge.
866
      // when complete, add result to operands and continue.
867
0
      merge_context_.PushOperand(
868
0
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
869
0
      PERF_COUNTER_ADD(internal_merge_count, 1);
870
0
    } else if (kTypeBlobIndex == ikey.type) {
871
0
      if (!MergeWithBlobBaseValue(iter_.value(), ikey.user_key)) {
872
0
        return false;
873
0
      }
874
875
      // iter_ is positioned after put
876
0
      iter_.Next();
877
0
      if (!iter_.status().ok()) {
878
0
        valid_ = false;
879
0
        return false;
880
0
      }
881
882
0
      return true;
883
0
    } else if (kTypeWideColumnEntity == ikey.type) {
884
0
      if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) {
885
0
        return false;
886
0
      }
887
888
      // iter_ is positioned after put
889
0
      iter_.Next();
890
0
      if (!iter_.status().ok()) {
891
0
        valid_ = false;
892
0
        return false;
893
0
      }
894
895
0
      return true;
896
0
    } else {
897
0
      valid_ = false;
898
0
      status_ = Status::Corruption(
899
0
          "Unrecognized value type: " +
900
0
          std::to_string(static_cast<unsigned int>(ikey.type)));
901
0
      return false;
902
0
    }
903
0
  }
904
905
0
  if (!iter_.status().ok()) {
906
0
    valid_ = false;
907
0
    return false;
908
0
  }
909
910
  // we either exhausted all internal keys under this user key, or hit
911
  // a deletion marker.
912
  // feed null as the existing value to the merge operator, such that
913
  // client can differentiate this scenario and do things accordingly.
914
0
  if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
915
0
    return false;
916
0
  }
917
0
  assert(status_.ok());
918
0
  return true;
919
0
}
920
921
0
void DBIter::Prev() {
922
0
  assert(valid_);
923
0
  assert(status_.ok());
924
925
0
  PERF_COUNTER_ADD(iter_prev_count, 1);
926
0
  PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
927
0
  ReleaseTempPinnedData();
928
0
  ResetBlobData();
929
0
  ResetValueAndColumns();
930
0
  ResetInternalKeysSkippedCounter();
931
0
  bool ok = true;
932
0
  if (direction_ == kForward) {
933
0
    ResetContiguousTombstoneTracking();
934
0
    if (!ReverseToBackward()) {
935
0
      ok = false;
936
0
    }
937
    // Transitioning to reverse: current key is the end bound
938
0
    if (ok && min_tombstones_for_range_conversion_ > 0) {
939
0
      range_tomb_end_key_.SetUserKey(saved_key_.GetUserKey(),
940
0
                                     !saved_key_.IsKeyPinned());
941
0
    }
942
0
  }
943
0
  if (ok) {
944
0
    ClearSavedValue();
945
946
0
    PrevInternal();
947
0
  }
948
949
0
  if (statistics_ != nullptr) {
950
0
    local_stats_.prev_count_++;
951
0
    if (valid_) {
952
0
      local_stats_.prev_found_count_++;
953
0
      local_stats_.bytes_read_ += (key().size() + value().size());
954
0
    }
955
0
  }
956
0
}
957
958
0
bool DBIter::ReverseToForward() {
959
0
  assert(iter_.status().ok());
960
961
  // When moving backwards, iter_ is positioned on _previous_ key, which may
962
  // not exist or may have different prefix than the current key().
963
  // If that's the case, seek iter_ to current key.
964
0
  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
965
0
    std::string last_key;
966
0
    if (timestamp_size_ == 0) {
967
0
      AppendInternalKey(
968
0
          &last_key, ParsedInternalKey(saved_key_.GetUserKey(),
969
0
                                       kMaxSequenceNumber, kValueTypeForSeek));
970
0
    } else {
971
      // TODO: pre-create kTsMax.
972
0
      const std::string kTsMax(timestamp_size_, '\xff');
973
0
      AppendInternalKeyWithDifferentTimestamp(
974
0
          &last_key,
975
0
          ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
976
0
                            kValueTypeForSeek),
977
0
          kTsMax);
978
0
    }
979
0
    iter_.Seek(last_key);
980
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
981
0
  }
982
983
0
  direction_ = kForward;
984
  // Skip keys less than the current key() (a.k.a. saved_key_).
985
0
  while (iter_.Valid()) {
986
0
    ParsedInternalKey ikey;
987
0
    if (!ParseKey(&ikey)) {
988
0
      return false;
989
0
    }
990
0
    if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
991
0
      return true;
992
0
    }
993
0
    iter_.Next();
994
0
  }
995
996
0
  if (!iter_.status().ok()) {
997
0
    valid_ = false;
998
0
    return false;
999
0
  }
1000
1001
0
  return true;
1002
0
}
1003
1004
// Move iter_ to the key before saved_key_.
1005
0
bool DBIter::ReverseToBackward() {
1006
0
  assert(iter_.status().ok());
1007
1008
  // When current_entry_is_merged_ is true, iter_ may be positioned on the next
1009
  // key, which may not exist or may have prefix different from current.
1010
  // If that's the case, seek to saved_key_.
1011
0
  if (current_entry_is_merged_ &&
1012
0
      (!expect_total_order_inner_iter() || !iter_.Valid())) {
1013
0
    IterKey last_key;
1014
    // Using kMaxSequenceNumber and kValueTypeForSeek
1015
    // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
1016
    // than saved_key_.
1017
0
    last_key.SetInternalKey(ParsedInternalKey(
1018
0
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
1019
0
    if (!expect_total_order_inner_iter()) {
1020
0
      iter_.SeekForPrev(last_key.GetInternalKey());
1021
0
    } else {
1022
      // Some iterators may not support SeekForPrev(), so we avoid using it
1023
      // when prefix seek mode is disabled. This is somewhat expensive
1024
      // (an extra Prev(), as well as an extra change of direction of iter_),
1025
      // so we may need to reconsider it later.
1026
0
      iter_.Seek(last_key.GetInternalKey());
1027
0
      if (!iter_.Valid() && iter_.status().ok()) {
1028
0
        iter_.SeekToLast();
1029
0
      }
1030
0
    }
1031
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1032
0
  }
1033
1034
0
  direction_ = kReverse;
1035
0
  return FindUserKeyBeforeSavedKey();
1036
0
}
1037
1038
3.61k
void DBIter::PrevInternal() {
1039
  // Capture saved_key_ (previous live key) into range_tomb_end_key_ before
1040
  // saved_key_ is overwritten below.
1041
3.61k
  if (min_tombstones_for_range_conversion_ > 0) {
1042
0
    range_tomb_end_key_.Swap(saved_key_);
1043
0
  }
1044
1045
4.89k
  while (iter_.Valid()) {
1046
3.69k
    saved_key_.SetUserKey(
1047
3.69k
        ExtractUserKey(iter_.key()),
1048
3.69k
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
1049
1050
3.69k
    assert(!prefix_.has_value() || prefix_extractor_ != nullptr);
1051
3.69k
    Slice saved_key_without_ts =
1052
3.69k
        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_);
1053
    // When prefix filtering is active, insert any pending tombstone run
1054
    // before we leave the seek prefix.
1055
3.69k
    if (!PrefixCheck(saved_key_without_ts)) {
1056
      // Insert any pending tombstone run before leaving the seek prefix.
1057
      // Only insert if end_key (previous live key) is within the seek prefix.
1058
0
      if (range_tomb_end_key_.Size() > 0) {
1059
0
        FlushPendingTombstoneRun(range_tomb_end_key_.GetUserKey(),
1060
0
                                 /*check_prefix_match=*/true);
1061
0
      }
1062
0
      if (prefix_same_as_start_) {
1063
0
        valid_ = false;
1064
0
        return;
1065
0
      }
1066
0
    }
1067
1068
3.69k
    assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
1069
3.69k
           user_comparator_.CompareWithoutTimestamp(
1070
3.69k
               saved_key_.GetUserKey(), /*a_has_ts=*/true,
1071
3.69k
               *iterate_lower_bound_, /*b_has_ts=*/false) >= 0);
1072
3.69k
    if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
1073
0
        user_comparator_.CompareWithoutTimestamp(
1074
0
            saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
1075
0
            /*b_has_ts=*/false) < 0) {
1076
      // We've iterated earlier than the user-specified lower bound.
1077
0
      if (range_tomb_end_key_.Size() > 0) {
1078
0
        FlushPendingTombstoneRun(range_tomb_end_key_.GetUserKey(),
1079
0
                                 /*check_prefix_match=*/true);
1080
0
      }
1081
0
      valid_ = false;
1082
0
      return;
1083
0
    }
1084
1085
3.69k
    bool found_visible = false;
1086
3.69k
    if (!FindValueForCurrentKey(found_visible)) {  // assigns valid_
1087
0
      return;
1088
0
    }
1089
1090
    // Track contiguous tombstones for reverse range tombstone conversion.
1091
    // Only track when FindValueForCurrentKey found a visible entry
1092
    // (found_visible == true).  When no visible entry exists (all seqno >
1093
    // snapshot), the key doesn't exist at this snapshot and must not be
1094
    // treated as a tombstone.  Additionally, ikey_ is only updated when a
1095
    // visible entry is found, so reading ikey_.sequence without this guard
1096
    // would use a stale value.
1097
3.69k
    if (min_tombstones_for_range_conversion_ > 0 &&
1098
0
        range_tomb_end_key_.Size() > 0 && timestamp_lb_ == nullptr) {
1099
0
      if (!valid_ && found_visible && PrefixCheck(saved_key_without_ts)) {
1100
        // Key was deleted and is within the seek prefix -- track it.
1101
0
        TrackContiguousTombstone(saved_key_.GetUserKey(),
1102
0
                                 /*always_update_first_key=*/true);
1103
0
      } else if (valid_) {
1104
        // Live key breaks the run.
1105
0
        FlushPendingTombstoneRun(range_tomb_end_key_.GetUserKey(),
1106
0
                                 /*check_prefix_match=*/true);
1107
0
      }
1108
0
    }
1109
1110
    // Whether or not we found a value for current key, we need iter_ to end up
1111
    // on a smaller key.
1112
3.69k
    if (!FindUserKeyBeforeSavedKey()) {
1113
0
      return;
1114
0
    }
1115
1116
3.69k
    if (valid_) {
1117
      // Found the value.
1118
2.40k
      return;
1119
2.40k
    }
1120
1121
1.28k
    if (TooManyInternalKeysSkipped(false)) {
1122
0
      return;
1123
0
    }
1124
1.28k
  }
1125
1126
1.20k
  if (range_tomb_end_key_.Size() > 0) {
1127
0
    FlushPendingTombstoneRun(range_tomb_end_key_.GetUserKey(),
1128
0
                             /*check_prefix_match=*/true);
1129
0
  }
1130
1131
  // We haven't found any key - iterator is not valid
1132
1.20k
  valid_ = false;
1133
1.20k
}
1134
1135
// Used for backwards iteration.
1136
// Looks at the entries with user key saved_key_ and finds the most up-to-date
1137
// value for it, or executes a merge, or determines that the value was deleted.
1138
// Sets valid_ to true if the value is found and is ready to be presented to
1139
// the user through value().
1140
// Sets valid_ to false if the value was deleted or no visible entry exists.
1141
// Sets ikey_ to the last visible entry's internal key.  When found_visible
1142
// is false, ikey_ is not updated and may contain stale data.
1143
// Sets found_visible to true if at least one entry passed the IsVisible()
1144
// check (seqno <= snapshot).  When false, no entry was visible -- the key
1145
// does not exist at this snapshot and should not be treated as a tombstone.
1146
// Returns false if an error occurred, and !status().ok() and !valid_.
1147
//
1148
// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
1149
// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
1150
//       the entry just before them, or on the entry just after them.
1151
3.69k
bool DBIter::FindValueForCurrentKey(bool& found_visible) {
1152
3.69k
  found_visible = false;
1153
3.69k
  assert(iter_.Valid());
1154
3.69k
  merge_context_.Clear();
1155
3.69k
  current_entry_is_merged_ = false;
1156
  // last entry before merge (could be kTypeDeletion,
1157
  // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue
1158
  // kTypeBlobIndex, kTypeWideColumnEntity or kTypeValuePreferredSeqno)
1159
3.69k
  ValueType last_not_merge_type = kTypeDeletion;
1160
3.69k
  ValueType last_key_entry_type = kTypeDeletion;
1161
1162
  // If false, it indicates that we have not seen any valid entry, even though
1163
  // last_key_entry_type is initialized to kTypeDeletion.
1164
3.69k
  bool valid_entry_seen = false;
1165
1166
  // Temporarily pin blocks that hold (merge operands / the value)
1167
3.69k
  ReleaseTempPinnedData();
1168
3.69k
  TempPinData();
1169
3.69k
  size_t num_skipped = 0;
1170
17.1k
  while (iter_.Valid()) {
1171
14.1k
    ParsedInternalKey ikey;
1172
14.1k
    if (!ParseKey(&ikey)) {
1173
0
      return false;
1174
0
    }
1175
1176
14.1k
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
1177
14.1k
                                                saved_key_.GetUserKey())) {
1178
      // Found a smaller user key, thus we are done with current user key.
1179
108
      break;
1180
108
    }
1181
1182
14.1k
    assert(ikey.user_key.size() >= timestamp_size_);
1183
14.0k
    Slice ts;
1184
14.0k
    if (timestamp_size_ > 0) {
1185
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
1186
0
                 timestamp_size_);
1187
0
    }
1188
1189
14.0k
    bool visible = IsVisible(ikey.sequence, ts);
1190
14.0k
    if (!visible &&
1191
0
        (timestamp_lb_ == nullptr ||
1192
0
         user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) {
1193
      // Found an invisible version of the current user key, and it must have
1194
      // a higher sequence number or timestamp. Therefore, we are done with the
1195
      // current user key.
1196
0
      break;
1197
0
    }
1198
1199
    // Entry survived the visibility check -- at least one visible version
1200
    // exists for this user key.
1201
14.0k
    found_visible = true;
1202
1203
14.0k
    if (!ts.empty()) {
1204
0
      saved_timestamp_.assign(ts.data(), ts.size());
1205
0
    }
1206
1207
14.0k
    if (TooManyInternalKeysSkipped()) {
1208
0
      return false;
1209
0
    }
1210
1211
    // This user key has lots of entries.
1212
    // We're going from old to new, and it's taking too long. Let's do a Seek()
1213
    // and go from new to old. This helps when a key was overwritten many times.
1214
14.0k
    if (num_skipped >= max_skip_) {
1215
579
      return FindValueForCurrentKeyUsingSeek();
1216
579
    }
1217
1218
13.4k
    if (!PrepareValueInternal()) {
1219
0
      return false;
1220
0
    }
1221
1222
13.4k
    if (timestamp_lb_ != nullptr) {
1223
0
      saved_key_.SetInternalKey(ikey);
1224
13.4k
    } else if (user_comparator_.Compare(ikey.user_key,
1225
13.4k
                                        saved_key_.GetUserKey()) < 0) {
1226
0
      saved_key_.SetUserKey(
1227
0
          ikey.user_key,
1228
0
          !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
1229
0
    }
1230
1231
    // Ensure ikey_ is only set to VISIBLE keys.
1232
13.4k
    ikey_ = ikey;
1233
13.4k
    valid_entry_seen = true;
1234
13.4k
    last_key_entry_type = ikey.type;
1235
13.4k
    switch (last_key_entry_type) {
1236
8.74k
      case kTypeValue:
1237
8.74k
      case kTypeValuePreferredSeqno:
1238
8.74k
      case kTypeBlobIndex:
1239
8.74k
      case kTypeWideColumnEntity:
1240
8.74k
        if (iter_.iter()->IsValuePinned()) {
1241
8.74k
          saved_write_unix_time_ = iter_.write_unix_time();
1242
8.74k
          if (last_key_entry_type == kTypeValuePreferredSeqno) {
1243
0
            pinned_value_ = ParsePackedValueForValue(iter_.value());
1244
8.74k
          } else {
1245
8.74k
            pinned_value_ = iter_.value();
1246
8.74k
          }
1247
8.74k
        } else {
1248
0
          valid_ = false;
1249
0
          status_ = Status::NotSupported(
1250
0
              "Backward iteration not supported if underlying iterator's value "
1251
0
              "cannot be pinned.");
1252
0
        }
1253
8.74k
        merge_context_.Clear();
1254
8.74k
        last_not_merge_type = last_key_entry_type;
1255
8.74k
        if (!status_.ok()) {
1256
0
          return false;
1257
0
        }
1258
8.74k
        break;
1259
8.74k
      case kTypeDeletion:
1260
4.68k
      case kTypeDeletionWithTimestamp:
1261
4.68k
      case kTypeSingleDeletion:
1262
4.68k
        merge_context_.Clear();
1263
4.68k
        last_not_merge_type = last_key_entry_type;
1264
4.68k
        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
1265
4.68k
        break;
1266
0
      case kTypeMerge: {
1267
0
        assert(merge_operator_ != nullptr);
1268
0
        merge_context_.PushOperandBack(
1269
0
            iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1270
0
        PERF_COUNTER_ADD(internal_merge_count, 1);
1271
0
      } break;
1272
0
      default:
1273
0
        valid_ = false;
1274
0
        status_ = Status::Corruption(
1275
0
            "Unknown value type: " +
1276
0
            std::to_string(static_cast<unsigned int>(last_key_entry_type)));
1277
0
        return false;
1278
13.4k
    }
1279
1280
13.4k
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
1281
13.4k
    iter_.Prev();
1282
13.4k
    ++num_skipped;
1283
1284
13.4k
    if (visible && timestamp_lb_ != nullptr) {
1285
      // If timestamp_lb_ is not nullptr, we do not have to look further for
1286
      // another internal key. We can return this current internal key. Yet we
1287
      // still keep the invariant that iter_ is positioned before the returned
1288
      // key.
1289
0
      break;
1290
0
    }
1291
13.4k
  }
1292
1293
3.11k
  if (!iter_.status().ok()) {
1294
0
    valid_ = false;
1295
0
    return false;
1296
0
  }
1297
1298
3.11k
  if (!valid_entry_seen) {
1299
    // Since we haven't seen any valid entry, last_key_entry_type remains
1300
    // unchanged and the same as its initial value.
1301
0
    assert(last_key_entry_type == kTypeDeletion);
1302
0
    assert(last_not_merge_type == kTypeDeletion);
1303
0
    valid_ = false;
1304
0
    return true;
1305
0
  }
1306
1307
3.11k
  if (timestamp_lb_ != nullptr) {
1308
0
    assert(last_key_entry_type == ikey_.type);
1309
0
  }
1310
1311
3.11k
  switch (last_key_entry_type) {
1312
1.14k
    case kTypeDeletion:
1313
1.14k
    case kTypeDeletionWithTimestamp:
1314
1.14k
    case kTypeSingleDeletion:
1315
1.14k
      if (timestamp_lb_ == nullptr) {
1316
1.14k
        valid_ = false;
1317
1.14k
      } else {
1318
0
        valid_ = true;
1319
0
      }
1320
1.14k
      return true;
1321
0
    case kTypeMerge:
1322
0
      current_entry_is_merged_ = true;
1323
0
      if (last_not_merge_type == kTypeDeletion ||
1324
0
          last_not_merge_type == kTypeSingleDeletion ||
1325
0
          last_not_merge_type == kTypeDeletionWithTimestamp) {
1326
0
        if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
1327
0
          return false;
1328
0
        }
1329
0
        return true;
1330
0
      } else if (last_not_merge_type == kTypeBlobIndex) {
1331
0
        if (!MergeWithBlobBaseValue(pinned_value_, saved_key_.GetUserKey())) {
1332
0
          return false;
1333
0
        }
1334
1335
0
        return true;
1336
0
      } else if (last_not_merge_type == kTypeWideColumnEntity) {
1337
0
        if (!MergeWithWideColumnBaseValue(pinned_value_,
1338
0
                                          saved_key_.GetUserKey())) {
1339
0
          return false;
1340
0
        }
1341
1342
0
        return true;
1343
0
      } else {
1344
0
        assert(last_not_merge_type == kTypeValue ||
1345
0
               last_not_merge_type == kTypeValuePreferredSeqno);
1346
0
        if (!MergeWithPlainBaseValue(pinned_value_, saved_key_.GetUserKey())) {
1347
0
          return false;
1348
0
        }
1349
0
        return true;
1350
0
      }
1351
1.97k
    case kTypeValue:
1352
1.97k
    case kTypeValuePreferredSeqno:
1353
1.97k
      SetValueAndColumnsFromPlain(pinned_value_);
1354
1355
1.97k
      break;
1356
0
    case kTypeBlobIndex:
1357
0
      if (!SetValueAndColumnsFromBlob(saved_key_.GetUserKey(), pinned_value_)) {
1358
0
        return false;
1359
0
      }
1360
0
      break;
1361
0
    case kTypeWideColumnEntity:
1362
0
      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
1363
0
        return false;
1364
0
      }
1365
0
      break;
1366
0
    default:
1367
0
      valid_ = false;
1368
0
      status_ = Status::Corruption(
1369
0
          "Unknown value type: " +
1370
0
          std::to_string(static_cast<unsigned int>(last_key_entry_type)));
1371
0
      return false;
1372
3.11k
  }
1373
1.97k
  valid_ = true;
1374
1.97k
  return true;
1375
3.11k
}
1376
1377
// This function is used in FindValueForCurrentKey.
1378
// We use Seek() function instead of Prev() to find necessary value
1379
// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
1380
//       Would be nice to reuse some code.
1381
579
bool DBIter::FindValueForCurrentKeyUsingSeek() {
1382
  // FindValueForCurrentKey will enable pinning before calling
1383
  // FindValueForCurrentKeyUsingSeek()
1384
579
  assert(pinned_iters_mgr_.PinningEnabled());
1385
579
  std::string last_key;
1386
579
  if (0 == timestamp_size_) {
1387
579
    AppendInternalKey(&last_key,
1388
579
                      ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
1389
579
                                        kValueTypeForSeek));
1390
579
  } else {
1391
0
    AppendInternalKeyWithDifferentTimestamp(
1392
0
        &last_key,
1393
0
        ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
1394
0
                          kValueTypeForSeek),
1395
0
        timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
1396
0
  }
1397
579
  iter_.Seek(last_key);
1398
579
  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1399
1400
  // In case read_callback presents, the value we seek to may not be visible.
1401
  // Find the next value that's visible.
1402
579
  ParsedInternalKey ikey;
1403
1404
579
  while (true) {
1405
579
    if (!iter_.Valid()) {
1406
0
      valid_ = false;
1407
0
      return iter_.status().ok();
1408
0
    }
1409
1410
579
    if (!ParseKey(&ikey)) {
1411
0
      return false;
1412
0
    }
1413
579
    assert(ikey.user_key.size() >= timestamp_size_);
1414
579
    Slice ts;
1415
579
    if (timestamp_size_ > 0) {
1416
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
1417
0
                 timestamp_size_);
1418
0
    }
1419
1420
579
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
1421
579
                                                saved_key_.GetUserKey())) {
1422
      // No visible values for this key, even though FindValueForCurrentKey()
1423
      // has seen some. This is possible if we're using a tailing iterator, and
1424
      // the entries were discarded in a compaction.
1425
0
      valid_ = false;
1426
0
      return true;
1427
0
    }
1428
1429
579
    if (IsVisible(ikey.sequence, ts)) {
1430
579
      break;
1431
579
    }
1432
1433
0
    iter_.Next();
1434
0
  }
1435
1436
  // Keep ikey_ in sync with the entry found by the seek.
1437
579
  ikey_ = ikey;
1438
579
  TEST_SYNC_POINT_CALLBACK(
1439
579
      "DBIter::FindValueForCurrentKeyUsingSeek:ikey_updated", &ikey_);
1440
1441
579
  if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
1442
436
      kTypeDeletionWithTimestamp == ikey.type) {
1443
143
    if (timestamp_lb_ == nullptr) {
1444
143
      valid_ = false;
1445
143
    } else {
1446
0
      valid_ = true;
1447
0
      saved_key_.SetInternalKey(ikey);
1448
0
    }
1449
143
    return true;
1450
143
  }
1451
436
  if (!PrepareValueInternal()) {
1452
0
    return false;
1453
0
  }
1454
436
  if (timestamp_size_ > 0) {
1455
0
    Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_);
1456
0
    saved_timestamp_.assign(ts.data(), ts.size());
1457
0
  }
1458
436
  if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno ||
1459
436
      ikey.type == kTypeBlobIndex || ikey.type == kTypeWideColumnEntity) {
1460
436
    assert(iter_.iter()->IsValuePinned());
1461
436
    saved_write_unix_time_ = iter_.write_unix_time();
1462
436
    if (ikey.type == kTypeValuePreferredSeqno) {
1463
0
      pinned_value_ = ParsePackedValueForValue(iter_.value());
1464
436
    } else {
1465
436
      pinned_value_ = iter_.value();
1466
436
    }
1467
436
    if (ikey.type == kTypeBlobIndex) {
1468
0
      if (!SetValueAndColumnsFromBlob(ikey.user_key, pinned_value_)) {
1469
0
        return false;
1470
0
      }
1471
436
    } else if (ikey.type == kTypeWideColumnEntity) {
1472
0
      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
1473
0
        return false;
1474
0
      }
1475
436
    } else {
1476
436
      assert(ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno);
1477
436
      SetValueAndColumnsFromPlain(pinned_value_);
1478
436
    }
1479
1480
436
    if (timestamp_lb_ != nullptr) {
1481
0
      saved_key_.SetInternalKey(ikey);
1482
436
    } else {
1483
436
      saved_key_.SetUserKey(
1484
436
          ikey.user_key,
1485
436
          !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
1486
436
    }
1487
1488
436
    valid_ = true;
1489
436
    return true;
1490
436
  }
1491
1492
  // kTypeMerge. We need to collect all kTypeMerge values and save them
1493
  // in operands
1494
436
  assert(ikey.type == kTypeMerge);
1495
0
  current_entry_is_merged_ = true;
1496
0
  merge_context_.Clear();
1497
0
  merge_context_.PushOperand(
1498
0
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1499
0
  PERF_COUNTER_ADD(internal_merge_count, 1);
1500
1501
0
  while (true) {
1502
0
    iter_.Next();
1503
1504
0
    if (!iter_.Valid()) {
1505
0
      if (!iter_.status().ok()) {
1506
0
        valid_ = false;
1507
0
        return false;
1508
0
      }
1509
0
      break;
1510
0
    }
1511
0
    if (!ParseKey(&ikey)) {
1512
0
      return false;
1513
0
    }
1514
0
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
1515
0
                                                saved_key_.GetUserKey())) {
1516
0
      break;
1517
0
    }
1518
0
    if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
1519
0
        ikey.type == kTypeDeletionWithTimestamp) {
1520
0
      break;
1521
0
    }
1522
0
    if (!PrepareValueInternal()) {
1523
0
      return false;
1524
0
    }
1525
1526
0
    if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno) {
1527
0
      Slice value = iter_.value();
1528
0
      if (ikey.type == kTypeValuePreferredSeqno) {
1529
0
        value = ParsePackedValueForValue(value);
1530
0
      }
1531
0
      if (!MergeWithPlainBaseValue(value, saved_key_.GetUserKey())) {
1532
0
        return false;
1533
0
      }
1534
0
      return true;
1535
0
    } else if (ikey.type == kTypeMerge) {
1536
0
      merge_context_.PushOperand(
1537
0
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1538
0
      PERF_COUNTER_ADD(internal_merge_count, 1);
1539
0
    } else if (ikey.type == kTypeBlobIndex) {
1540
0
      if (!MergeWithBlobBaseValue(iter_.value(), saved_key_.GetUserKey())) {
1541
0
        return false;
1542
0
      }
1543
1544
0
      return true;
1545
0
    } else if (ikey.type == kTypeWideColumnEntity) {
1546
0
      if (!MergeWithWideColumnBaseValue(iter_.value(),
1547
0
                                        saved_key_.GetUserKey())) {
1548
0
        return false;
1549
0
      }
1550
1551
0
      return true;
1552
0
    } else {
1553
0
      valid_ = false;
1554
0
      status_ = Status::Corruption(
1555
0
          "Unknown value type: " +
1556
0
          std::to_string(static_cast<unsigned int>(ikey.type)));
1557
0
      return false;
1558
0
    }
1559
0
  }
1560
1561
0
  if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
1562
0
    return false;
1563
0
  }
1564
1565
  // Make sure we leave iter_ in a good state. If it's valid and we don't care
1566
  // about prefixes, that's already good enough. Otherwise it needs to be
1567
  // seeked to the current key.
1568
0
  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
1569
0
    if (!expect_total_order_inner_iter()) {
1570
0
      iter_.SeekForPrev(last_key);
1571
0
    } else {
1572
0
      iter_.Seek(last_key);
1573
0
      if (!iter_.Valid() && iter_.status().ok()) {
1574
0
        iter_.SeekToLast();
1575
0
      }
1576
0
    }
1577
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1578
0
  }
1579
1580
0
  valid_ = true;
1581
0
  return true;
1582
0
}
1583
1584
0
bool DBIter::MergeWithNoBaseValue(const Slice& user_key) {
1585
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1586
  // since a failure must be propagated regardless of its value.
1587
0
  ValueType result_type;
1588
0
  const Status s = MergeHelper::TimedFullMerge(
1589
0
      merge_operator_, user_key, MergeHelper::kNoBaseValue,
1590
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1591
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1592
0
      &value_columns_state_.mut()->saved_value(), &pinned_value_, &result_type);
1593
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1594
0
}
1595
1596
bool DBIter::MergeWithPlainBaseValue(const Slice& value,
1597
0
                                     const Slice& user_key) {
1598
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1599
  // since a failure must be propagated regardless of its value.
1600
0
  ValueType result_type;
1601
0
  const Status s = MergeHelper::TimedFullMerge(
1602
0
      merge_operator_, user_key, MergeHelper::kPlainBaseValue, value,
1603
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1604
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1605
0
      &value_columns_state_.mut()->saved_value(), &pinned_value_, &result_type);
1606
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1607
0
}
1608
1609
bool DBIter::MergeWithBlobBaseValue(const Slice& blob_index,
1610
0
                                    const Slice& user_key) {
1611
0
  assert(!blob_state_->is_blob);
1612
1613
0
  if (expose_blob_index_) {
1614
0
    status_ =
1615
0
        Status::NotSupported("Legacy BlobDB does not support merge operator.");
1616
0
    valid_ = false;
1617
0
    return false;
1618
0
  }
1619
1620
0
  const bool allow_write_path_fallback =
1621
0
      cfh_ != nullptr && cfh_->cfd()->blob_partition_manager() != nullptr;
1622
0
  const Status s = blob_state_.mut()->reader.RetrieveAndSetBlobValue(
1623
0
      user_key, blob_index, allow_write_path_fallback);
1624
0
  if (!s.ok()) {
1625
0
    status_ = s;
1626
0
    valid_ = false;
1627
0
    return false;
1628
0
  }
1629
1630
0
  valid_ = true;
1631
1632
0
  if (!MergeWithPlainBaseValue(blob_state_->reader.GetBlobValue(), user_key)) {
1633
0
    return false;
1634
0
  }
1635
1636
0
  blob_state_.Reset();
1637
1638
0
  return true;
1639
0
}
1640
1641
bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity,
1642
0
                                          const Slice& user_key) {
1643
  // Resolve V2 entity blob columns if present, since TimedFullMerge only
1644
  // supports V1 format.
1645
0
  BlobFetcher blob_fetcher = blob_state_->reader.CreateBlobFetcher();
1646
0
  std::string resolved_entity;
1647
0
  Slice effective_entity;
1648
0
  Status s_resolve = WideColumnSerialization::ResolveEntityForMerge(
1649
0
      entity, user_key, &blob_fetcher, nullptr /* prefetch_buffers */,
1650
0
      resolved_entity, effective_entity);
1651
0
  if (!s_resolve.ok()) {
1652
0
    status_ = std::move(s_resolve);
1653
0
    valid_ = false;
1654
0
    return false;
1655
0
  }
1656
1657
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1658
  // since a failure must be propagated regardless of its value.
1659
0
  ValueType result_type;
1660
0
  Status s = MergeHelper::TimedFullMerge(
1661
0
      merge_operator_, user_key, MergeHelper::kWideBaseValue, effective_entity,
1662
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1663
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1664
0
      &value_columns_state_.mut()->saved_value(), &pinned_value_, &result_type);
1665
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1666
0
}
1667
1668
// Move backwards until the key smaller than saved_key_.
1669
// Changes valid_ only if return value is false.
1670
3.69k
bool DBIter::FindUserKeyBeforeSavedKey() {
1671
3.69k
  assert(status_.ok());
1672
3.69k
  size_t num_skipped = 0;
1673
4.27k
  while (iter_.Valid()) {
1674
688
    ParsedInternalKey ikey;
1675
688
    if (!ParseKey(&ikey)) {
1676
0
      return false;
1677
0
    }
1678
1679
688
    if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) {
1680
109
      return true;
1681
109
    }
1682
1683
579
    if (TooManyInternalKeysSkipped()) {
1684
0
      return false;
1685
0
    }
1686
1687
579
    assert(ikey.sequence != kMaxSequenceNumber);
1688
579
    assert(ikey.user_key.size() >= timestamp_size_);
1689
579
    Slice ts;
1690
579
    if (timestamp_size_ > 0) {
1691
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
1692
0
                 timestamp_size_);
1693
0
    }
1694
579
    if (!IsVisible(ikey.sequence, ts)) {
1695
0
      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
1696
579
    } else {
1697
579
      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
1698
579
    }
1699
1700
579
    if (num_skipped >= max_skip_) {
1701
0
      num_skipped = 0;
1702
0
      std::string last_key;
1703
0
      if (timestamp_size_ == 0) {
1704
0
        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
1705
0
                                                       kMaxSequenceNumber,
1706
0
                                                       kValueTypeForSeek));
1707
0
      } else {
1708
        // TODO: pre-create kTsMax.
1709
0
        const std::string kTsMax(timestamp_size_, '\xff');
1710
0
        AppendInternalKeyWithDifferentTimestamp(
1711
0
            &last_key,
1712
0
            ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
1713
0
                              kValueTypeForSeek),
1714
0
            kTsMax);
1715
0
      }
1716
      // It would be more efficient to use SeekForPrev() here, but some
1717
      // iterators may not support it.
1718
0
      iter_.Seek(last_key);
1719
0
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1720
0
      if (!iter_.Valid()) {
1721
0
        break;
1722
0
      }
1723
579
    } else {
1724
579
      ++num_skipped;
1725
579
    }
1726
1727
579
    iter_.Prev();
1728
579
  }
1729
1730
3.58k
  if (!iter_.status().ok()) {
1731
0
    valid_ = false;
1732
0
    return false;
1733
0
  }
1734
1735
3.58k
  return true;
1736
3.58k
}
1737
1738
void DBIter::TrackContiguousTombstone(const Slice& user_key,
1739
0
                                      bool always_update_first_key) {
1740
0
  if (always_update_first_key || contiguous_tombstone_count_ == 0) {
1741
0
    range_tomb_first_key_.SetUserKey(user_key, true /* copy */);
1742
0
  }
1743
0
  contiguous_tombstone_count_++;
1744
0
}
1745
1746
void DBIter::FlushPendingTombstoneRun(const Slice& end_key,
1747
23.1k
                                      bool check_prefix_match) {
1748
23.1k
  if (contiguous_tombstone_count_ == 0) {
1749
23.1k
    return;
1750
23.1k
  }
1751
1752
0
  if (check_prefix_match) {
1753
0
    Slice end_key_without_ts =
1754
0
        StripTimestampFromUserKey(end_key, timestamp_size_);
1755
0
    if (PrefixCheck(end_key_without_ts)) {
1756
0
      MaybeInsertRangeTombstone(end_key);
1757
0
    }
1758
0
  } else {
1759
0
    MaybeInsertRangeTombstone(end_key);
1760
0
  }
1761
1762
0
  ResetContiguousTombstoneTracking();
1763
0
}
1764
1765
0
void DBIter::MaybeInsertRangeTombstone(const Slice& end_key) {
1766
0
  if (contiguous_tombstone_count_ < min_tombstones_for_range_conversion_) {
1767
0
    return;
1768
0
  }
1769
1770
0
  if (active_mem_ == nullptr) {
1771
0
    return;
1772
0
  }
1773
1774
0
  assert(PrefixCheck(range_tomb_first_key_.GetUserKey()));
1775
0
  assert(PrefixCheck(end_key));
1776
0
  assert(user_comparator_.Compare(range_tomb_first_key_.GetUserKey(),
1777
0
                                  end_key) <= 0);
1778
1779
0
  auto earliest_seq = active_mem_->GetEarliestSequenceNumber();
1780
  // Skip if the iterator's snapshot predates the memtable. Otherwise entries
1781
  // added with seqno between sequence_ and earliest_seq will be unintentionally
1782
  // covered.
1783
0
  if (sequence_ < earliest_seq) {
1784
0
    RecordTick(statistics_, READ_PATH_RANGE_TOMBSTONES_DISCARDED);
1785
0
    return;
1786
0
  }
1787
1788
  // Insert at the read sequence so the converted tombstone is visible only
1789
  // to readers that could already observe the deletion run.
1790
0
  SequenceNumber insert_seq = sequence_;
1791
1792
  // Skip if the insertion seq could shadow prepared-but-uncommitted writes.
1793
0
  if (read_callback_ != nullptr &&
1794
0
      insert_seq >= read_callback_->min_uncommitted()) {
1795
0
    RecordTick(statistics_, READ_PATH_RANGE_TOMBSTONES_DISCARDED);
1796
0
    return;
1797
0
  }
1798
1799
  // Check if the memtable already has a range tombstone covering [start, end).
1800
0
  {
1801
0
    ReadOptions ro;
1802
    // Assumption is that this should be relatively cheap as other read requests
1803
    // should be building the cached core fragmented list.
1804
0
    std::unique_ptr<FragmentedRangeTombstoneIterator> range_iter(
1805
0
        active_mem_->NewRangeTombstoneIterator(ro, sequence_,
1806
0
                                               false /* immutable_memtable */));
1807
0
    if (range_iter) {
1808
0
      range_iter->Seek(range_tomb_first_key_.GetUserKey());
1809
0
      if (range_iter->Valid() &&
1810
0
          user_comparator_.Compare(range_iter->start_key(),
1811
0
                                   range_tomb_first_key_.GetUserKey()) <= 0 &&
1812
0
          user_comparator_.Compare(range_iter->end_key(), end_key) >= 0) {
1813
0
        RecordTick(statistics_, READ_PATH_RANGE_TOMBSTONES_DISCARDED);
1814
0
        return;
1815
0
      }
1816
0
    }
1817
0
  }
1818
1819
0
  assert(cfh_ != nullptr);
1820
0
  if (active_mem_->AddLogicallyRedundantRangeTombstone(
1821
0
          insert_seq, range_tomb_first_key_.GetUserKey(), end_key,
1822
0
          cfh_->cfd()->GetIngestSstLock())) {
1823
0
    RecordTick(statistics_, READ_PATH_RANGE_TOMBSTONES_INSERTED);
1824
0
    ROCKS_LOG_DEBUG(logger_,
1825
0
                    "Inserted range tombstone [%s, %s) @ seq %" PRIu64
1826
0
                    " (count=%" PRIu32 ", snapshot=%" PRIu64 ")",
1827
0
                    range_tomb_first_key_.GetUserKey().ToString(true).c_str(),
1828
0
                    end_key.ToString(true).c_str(), insert_seq,
1829
0
                    contiguous_tombstone_count_, sequence_);
1830
0
  } else {
1831
0
    RecordTick(statistics_, READ_PATH_RANGE_TOMBSTONES_DISCARDED);
1832
0
  }
1833
0
}
1834
1835
51.6k
bool DBIter::TooManyInternalKeysSkipped(bool increment) {
1836
51.6k
  if ((max_skippable_internal_keys_ > 0) &&
1837
0
      (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
1838
0
    valid_ = false;
1839
0
    status_ = Status::Incomplete("Too many internal keys skipped.");
1840
0
    return true;
1841
51.6k
  } else if (increment) {
1842
50.3k
    num_internal_keys_skipped_++;
1843
50.3k
  }
1844
51.6k
  return false;
1845
51.6k
}
1846
1847
bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts,
1848
50.9k
                       bool* more_recent) {
1849
  // Remember that comparator orders preceding timestamp as larger.
1850
  // TODO(yanqin): support timestamp in read_callback_.
1851
50.9k
  bool visible_by_seq = (read_callback_ == nullptr)
1852
50.9k
                            ? sequence <= sequence_
1853
50.9k
                            : read_callback_->IsVisible(sequence);
1854
1855
50.9k
  bool visible_by_ts =
1856
50.9k
      (timestamp_ub_ == nullptr ||
1857
0
       user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) &&
1858
50.9k
      (timestamp_lb_ == nullptr ||
1859
0
       user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0);
1860
1861
50.9k
  if (more_recent) {
1862
35.7k
    *more_recent = !visible_by_seq;
1863
35.7k
  }
1864
50.9k
  return visible_by_seq && visible_by_ts;
1865
50.9k
}
1866
1867
0
void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
1868
0
  is_key_seqnum_zero_ = false;
1869
0
  SequenceNumber seq = sequence_;
1870
0
  saved_key_.Clear();
1871
0
  saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_);
1872
1873
0
  if (iterate_lower_bound_ != nullptr &&
1874
0
      user_comparator_.CompareWithoutTimestamp(
1875
0
          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
1876
0
          /*b_has_ts=*/false) < 0) {
1877
    // Seek key is smaller than the lower bound.
1878
0
    saved_key_.Clear();
1879
0
    saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek,
1880
0
                              timestamp_ub_);
1881
0
  }
1882
0
}
1883
1884
4.30k
void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
1885
4.30k
  is_key_seqnum_zero_ = false;
1886
4.30k
  saved_key_.Clear();
1887
  // now saved_key is used to store internal key.
1888
4.30k
  saved_key_.SetInternalKey(target, 0 /* sequence_number */,
1889
4.30k
                            kValueTypeForSeekForPrev, timestamp_ub_);
1890
1891
4.30k
  if (timestamp_size_ > 0) {
1892
0
    const std::string kTsMin(timestamp_size_, '\0');
1893
0
    Slice ts = kTsMin;
1894
0
    saved_key_.UpdateInternalKey(
1895
0
        /*seq=*/0, kValueTypeForSeekForPrev,
1896
0
        timestamp_lb_ == nullptr ? &ts : timestamp_lb_);
1897
0
  }
1898
1899
4.30k
  if (iterate_upper_bound_ != nullptr &&
1900
0
      user_comparator_.CompareWithoutTimestamp(
1901
0
          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_,
1902
0
          /*b_has_ts=*/false) >= 0) {
1903
0
    saved_key_.Clear();
1904
0
    saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber,
1905
0
                              kValueTypeForSeekForPrev, timestamp_ub_);
1906
0
    if (timestamp_size_ > 0) {
1907
0
      const std::string kTsMax(timestamp_size_, '\xff');
1908
0
      Slice ts = kTsMax;
1909
0
      saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev,
1910
0
                                   &ts);
1911
0
    }
1912
0
  }
1913
4.30k
}
1914
1915
0
Status DBIter::ValidateScanOptions(const MultiScanArgs& multiscan_opts) const {
1916
0
  if (multiscan_opts.empty()) {
1917
0
    return Status::InvalidArgument("Empty MultiScanArgs");
1918
0
  }
1919
1920
0
  const std::vector<ScanOptions>& scan_opts = multiscan_opts.GetScanRanges();
1921
0
  const bool has_limit = scan_opts.front().range.limit.has_value();
1922
0
  if (!has_limit && scan_opts.size() > 1) {
1923
0
    return Status::InvalidArgument("Scan has no upper bound");
1924
0
  }
1925
1926
0
  for (size_t i = 0; i < scan_opts.size(); ++i) {
1927
0
    const auto& scan_range = scan_opts[i].range;
1928
0
    if (!scan_range.start.has_value()) {
1929
0
      return Status::InvalidArgument("Scan has no start key at index " +
1930
0
                                     std::to_string(i));
1931
0
    }
1932
1933
0
    if (scan_range.limit.has_value()) {
1934
0
      if (user_comparator_.CompareWithoutTimestamp(
1935
0
              scan_range.start.value(), /*a_has_ts=*/false,
1936
0
              scan_range.limit.value(), /*b_has_ts=*/false) >= 0) {
1937
0
        return Status::InvalidArgument(
1938
0
            "Scan start key is large or equal than limit at index " +
1939
0
            std::to_string(i));
1940
0
      }
1941
0
    }
1942
1943
0
    if (i > 0) {
1944
0
      if (!scan_range.limit.has_value()) {
1945
        // multiple scan without limit scan ranges
1946
0
        return Status::InvalidArgument("Scan has no upper bound at index " +
1947
0
                                       std::to_string(i));
1948
0
      }
1949
1950
0
      const auto& last_end_key = scan_opts[i - 1].range.limit.value();
1951
0
      if (user_comparator_.CompareWithoutTimestamp(
1952
0
              scan_range.start.value(), /*a_has_ts=*/false, last_end_key,
1953
0
              /*b_has_ts=*/false) < 0) {
1954
0
        return Status::InvalidArgument("Overlapping ranges at index " +
1955
0
                                       std::to_string(i));
1956
0
      }
1957
0
    }
1958
0
  }
1959
0
  return Status::OK();
1960
0
}
1961
1962
0
void DBIter::Prepare(const MultiScanArgs& scan_opts) {
1963
0
  status_ = ValidateScanOptions(scan_opts);
1964
0
  if (!status_.ok()) {
1965
0
    return;
1966
0
  }
1967
0
  std::optional<MultiScanArgs> new_scan_opts;
1968
0
  new_scan_opts.emplace(scan_opts);
1969
0
  scan_opts_.swap(new_scan_opts);
1970
0
  scan_index_ = 0;
1971
1972
  // Create a shared IODispatcher if not provided. This allows all
1973
  // BlockBasedTableIterators in this scan to share a single dispatcher,
1974
  // enabling better IO coordination and future rate limiting.
1975
0
  if (!scan_opts_.value().io_dispatcher) {
1976
0
    scan_opts_->io_dispatcher.reset(NewIODispatcher());
1977
0
  }
1978
1979
0
  if (!scan_opts.empty()) {
1980
0
    iter_.Prepare(&scan_opts_.value());
1981
0
  } else {
1982
0
    iter_.Prepare(nullptr);
1983
0
  }
1984
0
}
1985
1986
0
void DBIter::Seek(const Slice& target) {
1987
0
  PERF_COUNTER_ADD(iter_seek_count, 1);
1988
0
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1989
0
  StopWatch sw(clock_, statistics_, DB_SEEK);
1990
1991
0
  if (scan_opts_.has_value()) {
1992
    // Validate the seek target is as expected in the previously prepared range
1993
0
    auto const& scan_ranges = scan_opts_.value().GetScanRanges();
1994
0
    if (scan_index_ >= scan_ranges.size()) {
1995
0
      status_ = Status::InvalidArgument(
1996
0
          "Seek called after exhausting all of the scan ranges");
1997
0
      valid_ = false;
1998
0
      return;
1999
0
    }
2000
2001
    // Validate start key of next prepare range matches the seek target
2002
0
    auto const& range = scan_ranges[scan_index_];
2003
0
    auto const& start = range.range.start;
2004
0
    assert(start.has_value());
2005
0
    if (user_comparator_.CompareWithoutTimestamp(target, *start) != 0) {
2006
0
      status_ = Status::InvalidArgument(
2007
0
          "Seek target does not match the start of the next prepared range at "
2008
0
          "index " +
2009
0
          std::to_string(scan_index_));
2010
0
      valid_ = false;
2011
0
      return;
2012
0
    }
2013
2014
    // validate the upper bound is set to the same value of limit, if limit
2015
    // exists
2016
0
    auto const& limit = range.range.limit;
2017
0
    if (limit.has_value()) {
2018
0
      if (iterate_upper_bound_ == nullptr ||
2019
0
          user_comparator_.CompareWithoutTimestamp(
2020
0
              limit.value(), *iterate_upper_bound_) != 0) {
2021
0
        status_ = Status::InvalidArgument(
2022
0
            "Upper bound is not set to the same limit value of the next "
2023
0
            "prepared range at index " +
2024
0
            std::to_string(scan_index_));
2025
0
        valid_ = false;
2026
0
        return;
2027
0
      }
2028
0
    }
2029
0
    scan_index_++;
2030
0
  }
2031
2032
0
  if (cfh_ != nullptr) {
2033
    // TODO: What do we do if this returns an error?
2034
0
    Slice lower_bound, upper_bound;
2035
0
    if (iterate_lower_bound_ != nullptr) {
2036
0
      lower_bound = *iterate_lower_bound_;
2037
0
    } else {
2038
0
      lower_bound = Slice("");
2039
0
    }
2040
0
    if (iterate_upper_bound_ != nullptr) {
2041
0
      upper_bound = *iterate_upper_bound_;
2042
0
    } else {
2043
0
      upper_bound = Slice("");
2044
0
    }
2045
0
    cfh_->db()
2046
0
        ->TraceIteratorSeek(cfh_->cfd()->GetID(), target, lower_bound,
2047
0
                            upper_bound)
2048
0
        .PermitUncheckedError();
2049
0
  }
2050
2051
0
  status_ = Status::OK();
2052
0
  ResetSeekState();
2053
2054
0
  MarkMemtableForFlushForAvgTrigger();
2055
2056
  // Seek the inner iterator based on the target key.
2057
0
  {
2058
0
    PERF_TIMER_GUARD(seek_internal_seek_time);
2059
2060
0
    SetSavedKeyToSeekTarget(target);
2061
0
    iter_.Seek(saved_key_.GetInternalKey());
2062
2063
0
    RecordTick(statistics_, NUMBER_DB_SEEK);
2064
0
  }
2065
0
  if (!iter_.Valid()) {
2066
0
    valid_ = false;
2067
0
    return;
2068
0
  }
2069
0
  direction_ = kForward;
2070
2071
  // Now the inner iterator is placed to the target position. From there,
2072
  // we need to find out the next key that is visible to the user.
2073
0
  ClearSavedValue();
2074
0
  if (ShouldSetPrefix(target)) {
2075
0
    prefix_.emplace();
2076
0
    prefix_->SetUserKey(prefix_extractor_->Transform(target));
2077
0
  }
2078
0
  FindNextUserEntry(false /* not skipping saved_key */);
2079
0
  if (!valid_) {
2080
0
    prefix_.reset();
2081
0
  }
2082
0
  if (!valid_) {
2083
0
    return;
2084
0
  }
2085
2086
  // Updating stats and perf context counters.
2087
0
  if (statistics_ != nullptr) {
2088
    // Decrement since we don't want to count this key as skipped
2089
0
    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
2090
0
    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
2091
0
  }
2092
0
  PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
2093
0
}
2094
2095
4.30k
void DBIter::SeekForPrev(const Slice& target) {
2096
4.30k
  PERF_COUNTER_ADD(iter_seek_count, 1);
2097
4.30k
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
2098
4.30k
  StopWatch sw(clock_, statistics_, DB_SEEK);
2099
2100
4.30k
  if (cfh_ != nullptr) {
2101
    // TODO: What do we do if this returns an error?
2102
4.30k
    Slice lower_bound, upper_bound;
2103
4.30k
    if (iterate_lower_bound_ != nullptr) {
2104
0
      lower_bound = *iterate_lower_bound_;
2105
4.30k
    } else {
2106
4.30k
      lower_bound = Slice("");
2107
4.30k
    }
2108
4.30k
    if (iterate_upper_bound_ != nullptr) {
2109
0
      upper_bound = *iterate_upper_bound_;
2110
4.30k
    } else {
2111
4.30k
      upper_bound = Slice("");
2112
4.30k
    }
2113
4.30k
    cfh_->db()
2114
4.30k
        ->TraceIteratorSeekForPrev(cfh_->cfd()->GetID(), target, lower_bound,
2115
4.30k
                                   upper_bound)
2116
4.30k
        .PermitUncheckedError();
2117
4.30k
  }
2118
2119
4.30k
  status_ = Status::OK();
2120
4.30k
  ResetSeekState();
2121
2122
4.30k
  MarkMemtableForFlushForAvgTrigger();
2123
2124
  // Seek the inner iterator based on the target key.
2125
4.30k
  {
2126
4.30k
    PERF_TIMER_GUARD(seek_internal_seek_time);
2127
4.30k
    SetSavedKeyToSeekForPrevTarget(target);
2128
4.30k
    iter_.SeekForPrev(saved_key_.GetInternalKey());
2129
4.30k
    RecordTick(statistics_, NUMBER_DB_SEEK);
2130
4.30k
  }
2131
4.30k
  if (!iter_.Valid()) {
2132
691
    valid_ = false;
2133
691
    return;
2134
691
  }
2135
3.61k
  direction_ = kReverse;
2136
2137
  // Now the inner iterator is placed to the target position. From there,
2138
  // we need to find out the first key that is visible to the user in the
2139
  // backward direction.
2140
3.61k
  ClearSavedValue();
2141
3.61k
  if (ShouldSetPrefix(target)) {
2142
0
    prefix_.emplace();
2143
0
    prefix_->SetUserKey(prefix_extractor_->Transform(target));
2144
0
  }
2145
3.61k
  PrevInternal();
2146
3.61k
  if (!valid_) {
2147
1.20k
    prefix_.reset();
2148
1.20k
  }
2149
  // Set end key for first Prev() call's tombstone tracking
2150
3.61k
  if (valid_ && min_tombstones_for_range_conversion_ > 0) {
2151
0
    range_tomb_end_key_.SetUserKey(saved_key_.GetUserKey(),
2152
0
                                   !saved_key_.IsKeyPinned());
2153
0
  }
2154
2155
  // Report stats and perf context.
2156
3.61k
  if (statistics_ != nullptr && valid_) {
2157
0
    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
2158
0
    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
2159
0
    PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
2160
0
  }
2161
3.61k
}
2162
2163
7.16k
void DBIter::SeekToFirst() {
2164
7.16k
  if (iterate_lower_bound_ != nullptr) {
2165
0
    Seek(*iterate_lower_bound_);
2166
0
    return;
2167
0
  }
2168
7.16k
  PERF_COUNTER_ADD(iter_seek_count, 1);
2169
7.16k
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
2170
  // Don't use iter_::Seek() if we set a prefix extractor
2171
  // because prefix seek will be used.
2172
7.16k
  if (!expect_total_order_inner_iter()) {
2173
0
    max_skip_ = std::numeric_limits<uint64_t>::max();
2174
0
  }
2175
7.16k
  status_ = Status::OK();
2176
  // if iterator is empty, this status_ could be unchecked.
2177
7.16k
  status_.PermitUncheckedError();
2178
7.16k
  direction_ = kForward;
2179
7.16k
  ResetSeekState();
2180
2181
7.16k
  MarkMemtableForFlushForAvgTrigger();
2182
7.16k
  ClearSavedValue();
2183
7.16k
  is_key_seqnum_zero_ = false;
2184
2185
7.16k
  {
2186
7.16k
    PERF_TIMER_GUARD(seek_internal_seek_time);
2187
7.16k
    iter_.SeekToFirst();
2188
7.16k
  }
2189
2190
7.16k
  RecordTick(statistics_, NUMBER_DB_SEEK);
2191
7.16k
  if (iter_.Valid()) {
2192
6.00k
    saved_key_.SetUserKey(
2193
6.00k
        ExtractUserKey(iter_.key()),
2194
6.00k
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
2195
6.00k
    assert(!prefix_.has_value());
2196
6.00k
    FindNextUserEntry(false /* not skipping saved_key */);
2197
6.00k
    if (statistics_ != nullptr) {
2198
0
      if (valid_) {
2199
0
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
2200
0
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
2201
0
        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
2202
0
      }
2203
0
    }
2204
6.00k
  } else {
2205
1.15k
    valid_ = false;
2206
1.15k
  }
2207
7.16k
  if (valid_) {
2208
5.05k
    Slice user_key_without_ts =
2209
5.05k
        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_);
2210
5.05k
    if (ShouldSetPrefix(user_key_without_ts)) {
2211
0
      prefix_.emplace();
2212
0
      prefix_->SetUserKey(prefix_extractor_->Transform(user_key_without_ts));
2213
0
    }
2214
5.05k
  }
2215
7.16k
}
2216
2217
0
void DBIter::SeekToLast() {
2218
0
  if (iterate_upper_bound_ != nullptr) {
2219
    // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
2220
0
    SeekForPrev(*iterate_upper_bound_);
2221
#ifndef NDEBUG
2222
    Slice k = Valid() ? key() : Slice();
2223
    if (Valid() && timestamp_size_ > 0 && timestamp_lb_) {
2224
      k.remove_suffix(kNumInternalBytes + timestamp_size_);
2225
    }
2226
    assert(!Valid() || user_comparator_.CompareWithoutTimestamp(
2227
                           k, /*a_has_ts=*/false, *iterate_upper_bound_,
2228
                           /*b_has_ts=*/false) < 0);
2229
#endif
2230
0
    return;
2231
0
  }
2232
2233
0
  PERF_COUNTER_ADD(iter_seek_count, 1);
2234
0
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
2235
  // Don't use iter_::Seek() if we set a prefix extractor
2236
  // because prefix seek will be used.
2237
0
  if (!expect_total_order_inner_iter()) {
2238
0
    max_skip_ = std::numeric_limits<uint64_t>::max();
2239
0
  }
2240
0
  status_ = Status::OK();
2241
  // if iterator is empty, this status_ could be unchecked.
2242
0
  status_.PermitUncheckedError();
2243
0
  direction_ = kReverse;
2244
0
  ResetSeekState();
2245
2246
0
  MarkMemtableForFlushForAvgTrigger();
2247
0
  ClearSavedValue();
2248
0
  is_key_seqnum_zero_ = false;
2249
2250
  // Clear stale saved_key_ so PrevInternal()'s Swap does not pollute
2251
  // range_tomb_end_key_ with a key from a previous seek operation.
2252
0
  saved_key_.Clear();
2253
2254
0
  {
2255
0
    PERF_TIMER_GUARD(seek_internal_seek_time);
2256
0
    iter_.SeekToLast();
2257
0
  }
2258
0
  assert(!prefix_.has_value());
2259
0
  PrevInternal();
2260
  // Set end key for first Prev() call's tombstone tracking
2261
0
  if (valid_ && min_tombstones_for_range_conversion_ > 0) {
2262
0
    range_tomb_end_key_.SetUserKey(saved_key_.GetUserKey(),
2263
0
                                   !saved_key_.IsKeyPinned());
2264
0
  }
2265
0
  if (statistics_ != nullptr) {
2266
0
    RecordTick(statistics_, NUMBER_DB_SEEK);
2267
0
    if (valid_) {
2268
0
      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
2269
0
      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
2270
0
      PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
2271
0
    }
2272
0
  }
2273
0
  if (valid_) {
2274
0
    Slice user_key_without_ts =
2275
0
        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_);
2276
0
    if (ShouldSetPrefix(user_key_without_ts)) {
2277
0
      prefix_.emplace();
2278
0
      prefix_->SetUserKey(prefix_extractor_->Transform(user_key_without_ts));
2279
0
    }
2280
0
  }
2281
0
}
2282
}  // namespace ROCKSDB_NAMESPACE