Coverage Report

Created: 2024-09-08 07:17

/src/rocksdb/db/db_iter.cc
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#include "db/db_iter.h"
11
12
#include <iostream>
13
#include <limits>
14
#include <string>
15
16
#include "db/dbformat.h"
17
#include "db/merge_context.h"
18
#include "db/merge_helper.h"
19
#include "db/pinned_iterators_manager.h"
20
#include "db/wide/wide_column_serialization.h"
21
#include "db/wide/wide_columns_helper.h"
22
#include "file/filename.h"
23
#include "logging/logging.h"
24
#include "memory/arena.h"
25
#include "monitoring/perf_context_imp.h"
26
#include "rocksdb/env.h"
27
#include "rocksdb/iterator.h"
28
#include "rocksdb/merge_operator.h"
29
#include "rocksdb/options.h"
30
#include "rocksdb/system_clock.h"
31
#include "table/internal_iterator.h"
32
#include "table/iterator_wrapper.h"
33
#include "trace_replay/trace_replay.h"
34
#include "util/mutexlock.h"
35
#include "util/string_util.h"
36
#include "util/user_comparator_wrapper.h"
37
38
namespace ROCKSDB_NAMESPACE {
39
40
DBIter::DBIter(Env* _env, const ReadOptions& read_options,
41
               const ImmutableOptions& ioptions,
42
               const MutableCFOptions& mutable_cf_options,
43
               const Comparator* cmp, InternalIterator* iter,
44
               const Version* version, SequenceNumber s, bool arena_mode,
45
               uint64_t max_sequential_skip_in_iterations,
46
               ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
47
               bool expose_blob_index)
48
    : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
49
      env_(_env),
50
      clock_(ioptions.clock),
51
      logger_(ioptions.logger),
52
      user_comparator_(cmp),
53
      merge_operator_(ioptions.merge_operator.get()),
54
      iter_(iter),
55
      version_(version),
56
      read_callback_(read_callback),
57
      sequence_(s),
58
      statistics_(ioptions.stats),
59
      max_skip_(max_sequential_skip_in_iterations),
60
      max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
61
      num_internal_keys_skipped_(0),
62
      iterate_lower_bound_(read_options.iterate_lower_bound),
63
      iterate_upper_bound_(read_options.iterate_upper_bound),
64
      direction_(kForward),
65
      valid_(false),
66
      current_entry_is_merged_(false),
67
      is_key_seqnum_zero_(false),
68
      prefix_same_as_start_(mutable_cf_options.prefix_extractor
69
                                ? read_options.prefix_same_as_start
70
                                : false),
71
      pin_thru_lifetime_(read_options.pin_data),
72
      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
73
                                     read_options.total_order_seek ||
74
                                     read_options.auto_prefix_mode),
75
      read_tier_(read_options.read_tier),
76
      fill_cache_(read_options.fill_cache),
77
      verify_checksums_(read_options.verify_checksums),
78
      expose_blob_index_(expose_blob_index),
79
      is_blob_(false),
80
      arena_mode_(arena_mode),
81
      io_activity_(read_options.io_activity),
82
      cfh_(cfh),
83
      timestamp_ub_(read_options.timestamp),
84
      timestamp_lb_(read_options.iter_start_ts),
85
6.11k
      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
86
6.11k
  RecordTick(statistics_, NO_ITERATOR_CREATED);
87
6.11k
  if (pin_thru_lifetime_) {
88
0
    pinned_iters_mgr_.StartPinning();
89
0
  }
90
6.11k
  if (iter_.iter()) {
91
0
    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
92
0
  }
93
6.11k
  status_.PermitUncheckedError();
94
6.11k
  assert(timestamp_size_ ==
95
6.11k
         user_comparator_.user_comparator()->timestamp_size());
96
6.11k
}
97
98
0
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
99
0
  if (prop == nullptr) {
100
0
    return Status::InvalidArgument("prop is nullptr");
101
0
  }
102
0
  if (prop_name == "rocksdb.iterator.super-version-number") {
103
    // First try to pass the value returned from inner iterator.
104
0
    return iter_.iter()->GetProperty(prop_name, prop);
105
0
  } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
106
0
    if (valid_) {
107
0
      *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
108
0
    } else {
109
0
      *prop = "Iterator is not valid.";
110
0
    }
111
0
    return Status::OK();
112
0
  } else if (prop_name == "rocksdb.iterator.is-value-pinned") {
113
0
    if (valid_) {
114
0
      *prop = (pin_thru_lifetime_ && iter_.Valid() &&
115
0
               iter_.value().data() == value_.data())
116
0
                  ? "1"
117
0
                  : "0";
118
0
    } else {
119
0
      *prop = "Iterator is not valid.";
120
0
    }
121
0
    return Status::OK();
122
0
  } else if (prop_name == "rocksdb.iterator.internal-key") {
123
0
    *prop = saved_key_.GetUserKey().ToString();
124
0
    return Status::OK();
125
0
  } else if (prop_name == "rocksdb.iterator.write-time") {
126
0
    PutFixed64(prop, saved_write_unix_time_);
127
0
    return Status::OK();
128
0
  }
129
0
  return Status::InvalidArgument("Unidentified property.");
130
0
}
131
132
57.6k
bool DBIter::ParseKey(ParsedInternalKey* ikey) {
133
57.6k
  Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */);
134
57.6k
  if (!s.ok()) {
135
0
    status_ = Status::Corruption("In DBIter: ", s.getState());
136
0
    valid_ = false;
137
0
    ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState());
138
0
    return false;
139
57.6k
  } else {
140
57.6k
    return true;
141
57.6k
  }
142
57.6k
}
143
144
24.7k
void DBIter::Next() {
145
24.7k
  assert(valid_);
146
24.7k
  assert(status_.ok());
147
148
24.7k
  PERF_COUNTER_ADD(iter_next_count, 1);
149
24.7k
  PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
150
  // Release temporarily pinned blocks from last operation
151
24.7k
  ReleaseTempPinnedData();
152
24.7k
  ResetBlobValue();
153
24.7k
  ResetValueAndColumns();
154
24.7k
  local_stats_.skip_count_ += num_internal_keys_skipped_;
155
24.7k
  local_stats_.skip_count_--;
156
24.7k
  num_internal_keys_skipped_ = 0;
157
24.7k
  bool ok = true;
158
24.7k
  if (direction_ == kReverse) {
159
0
    is_key_seqnum_zero_ = false;
160
0
    if (!ReverseToForward()) {
161
0
      ok = false;
162
0
    }
163
24.7k
  } else if (!current_entry_is_merged_) {
164
    // If the current value is not a merge, the iter position is the
165
    // current key, which is already returned. We can safely issue a
166
    // Next() without checking the current key.
167
    // If the current key is a merge, very likely iter already points
168
    // to the next internal position.
169
24.7k
    assert(iter_.Valid());
170
24.7k
    iter_.Next();
171
24.7k
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
172
24.7k
  }
173
174
24.7k
  local_stats_.next_count_++;
175
24.7k
  if (ok && iter_.Valid()) {
176
21.2k
    ClearSavedValue();
177
178
21.2k
    if (prefix_same_as_start_) {
179
0
      assert(prefix_extractor_ != nullptr);
180
0
      const Slice prefix = prefix_.GetUserKey();
181
0
      FindNextUserEntry(true /* skipping the current user key */, &prefix);
182
21.2k
    } else {
183
21.2k
      FindNextUserEntry(true /* skipping the current user key */, nullptr);
184
21.2k
    }
185
21.2k
  } else {
186
3.48k
    is_key_seqnum_zero_ = false;
187
3.48k
    valid_ = false;
188
3.48k
  }
189
24.7k
  if (statistics_ != nullptr && valid_) {
190
0
    local_stats_.next_found_count_++;
191
0
    local_stats_.bytes_read_ += (key().size() + value().size());
192
0
  }
193
24.7k
}
194
195
bool DBIter::SetBlobValueIfNeeded(const Slice& user_key,
196
0
                                  const Slice& blob_index) {
197
0
  assert(!is_blob_);
198
0
  assert(blob_value_.empty());
199
200
0
  if (expose_blob_index_) {  // Stacked BlobDB implementation
201
0
    is_blob_ = true;
202
0
    return true;
203
0
  }
204
205
0
  if (!version_) {
206
0
    status_ = Status::Corruption("Encountered unexpected blob index.");
207
0
    valid_ = false;
208
0
    return false;
209
0
  }
210
211
  // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
212
  // avoid having to copy options back and forth.
213
  // TODO: plumb Env::IOActivity, Env::IOPriority
214
0
  ReadOptions read_options;
215
0
  read_options.read_tier = read_tier_;
216
0
  read_options.fill_cache = fill_cache_;
217
0
  read_options.verify_checksums = verify_checksums_;
218
0
  read_options.io_activity = io_activity_;
219
0
  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
220
0
  constexpr uint64_t* bytes_read = nullptr;
221
222
0
  const Status s = version_->GetBlob(read_options, user_key, blob_index,
223
0
                                     prefetch_buffer, &blob_value_, bytes_read);
224
225
0
  if (!s.ok()) {
226
0
    status_ = s;
227
0
    valid_ = false;
228
0
    return false;
229
0
  }
230
231
0
  is_blob_ = true;
232
0
  return true;
233
0
}
234
235
0
bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
236
0
  assert(value_.empty());
237
0
  assert(wide_columns_.empty());
238
239
0
  const Status s = WideColumnSerialization::Deserialize(slice, wide_columns_);
240
241
0
  if (!s.ok()) {
242
0
    status_ = s;
243
0
    valid_ = false;
244
0
    wide_columns_.clear();
245
0
    return false;
246
0
  }
247
248
0
  if (WideColumnsHelper::HasDefaultColumn(wide_columns_)) {
249
0
    value_ = WideColumnsHelper::GetDefaultColumn(wide_columns_);
250
0
  }
251
252
0
  return true;
253
0
}
254
255
bool DBIter::SetValueAndColumnsFromMergeResult(const Status& merge_status,
256
0
                                               ValueType result_type) {
257
0
  if (!merge_status.ok()) {
258
0
    valid_ = false;
259
0
    status_ = merge_status;
260
0
    return false;
261
0
  }
262
263
0
  if (result_type == kTypeWideColumnEntity) {
264
0
    if (!SetValueAndColumnsFromEntity(saved_value_)) {
265
0
      assert(!valid_);
266
0
      return false;
267
0
    }
268
269
0
    valid_ = true;
270
0
    return true;
271
0
  }
272
273
0
  assert(result_type == kTypeValue);
274
0
  SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_
275
0
                                                   : saved_value_);
276
0
  valid_ = true;
277
0
  return true;
278
0
}
279
280
// PRE: saved_key_ has the current user key if skipping_saved_key
281
// POST: saved_key_ should have the next user key if valid_,
282
//       if the current entry is a result of merge
283
//           current_entry_is_merged_ => true
284
//           saved_value_             => the merged value
285
//
286
// NOTE: In between, saved_key_ can point to a user key that has
287
//       a delete marker or a sequence number higher than sequence_
288
//       saved_key_ MUST have a proper user_key before calling this function
289
//
290
// The prefix parameter, if not null, indicates that we need to iterate
291
// within the prefix, and the iterator needs to be made invalid, if no
292
// more entry for the prefix can be found.
293
25.4k
bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) {
294
25.4k
  PERF_TIMER_GUARD(find_next_user_entry_time);
295
25.4k
  return FindNextUserEntryInternal(skipping_saved_key, prefix);
296
25.4k
}
297
298
// Actual implementation of DBIter::FindNextUserEntry()
299
bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
300
25.4k
                                       const Slice* prefix) {
301
  // Loop until we hit an acceptable entry to yield
302
25.4k
  assert(iter_.Valid());
303
25.4k
  assert(status_.ok());
304
25.4k
  assert(direction_ == kForward);
305
25.4k
  current_entry_is_merged_ = false;
306
307
  // How many times in a row we have skipped an entry with user key less than
308
  // or equal to saved_key_. We could skip these entries either because
309
  // sequence numbers were too high or because skipping_saved_key = true.
310
  // What saved_key_ contains throughout this method:
311
  //  - if skipping_saved_key : saved_key_ contains the key that we need
312
  //                            to skip, and we haven't seen any keys greater
313
  //                            than that,
314
  //  - if num_skipped > 0    : saved_key_ contains the key that we have skipped
315
  //                            num_skipped times, and we haven't seen any keys
316
  //                            greater than that,
317
  //  - none of the above     : saved_key_ can contain anything, it doesn't
318
  //                            matter.
319
25.4k
  uint64_t num_skipped = 0;
320
  // For write unprepared, the target sequence number in reseek could be larger
321
  // than the snapshot, and thus needs to be skipped again. This could result in
322
  // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
323
  // to one.
324
25.4k
  bool reseek_done = false;
325
326
29.4k
  do {
327
    // Will update is_key_seqnum_zero_ as soon as we parsed the current key
328
    // but we need to save the previous value to be used in the loop.
329
29.4k
    bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
330
29.4k
    if (!ParseKey(&ikey_)) {
331
0
      is_key_seqnum_zero_ = false;
332
0
      return false;
333
0
    }
334
29.4k
    Slice user_key_without_ts =
335
29.4k
        StripTimestampFromUserKey(ikey_.user_key, timestamp_size_);
336
337
29.4k
    is_key_seqnum_zero_ = (ikey_.sequence == 0);
338
339
29.4k
    assert(iterate_upper_bound_ == nullptr ||
340
29.4k
           iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound ||
341
29.4k
           user_comparator_.CompareWithoutTimestamp(
342
29.4k
               user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
343
29.4k
               /*b_has_ts=*/false) < 0);
344
29.4k
    if (iterate_upper_bound_ != nullptr &&
345
29.4k
        iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound &&
346
29.4k
        user_comparator_.CompareWithoutTimestamp(
347
0
            user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
348
0
            /*b_has_ts=*/false) >= 0) {
349
0
      break;
350
0
    }
351
352
29.4k
    assert(prefix == nullptr || prefix_extractor_ != nullptr);
353
29.4k
    if (prefix != nullptr &&
354
29.4k
        prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) !=
355
0
            0) {
356
0
      assert(prefix_same_as_start_);
357
0
      break;
358
0
    }
359
360
29.4k
    if (TooManyInternalKeysSkipped()) {
361
0
      return false;
362
0
    }
363
364
29.4k
    assert(ikey_.user_key.size() >= timestamp_size_);
365
29.4k
    Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey(
366
0
                                         ikey_.user_key, timestamp_size_)
367
29.4k
                                   : Slice();
368
29.4k
    bool more_recent = false;
369
29.4k
    if (IsVisible(ikey_.sequence, ts, &more_recent)) {
370
      // If the previous entry is of seqnum 0, the current entry will not
371
      // possibly be skipped. This condition can potentially be relaxed to
372
      // prev_key.seq <= ikey_.sequence. We are cautious because it will be more
373
      // prone to bugs causing the same user key with the same sequence number.
374
      // Note that with current timestamp implementation, the same user key can
375
      // have different timestamps and zero sequence number on the bottommost
376
      // level. This may change in the future.
377
29.4k
      if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) &&
378
29.4k
          skipping_saved_key &&
379
29.4k
          CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
380
465
        num_skipped++;  // skip this entry
381
465
        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
382
28.9k
      } else {
383
28.9k
        assert(!skipping_saved_key ||
384
28.9k
               CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
385
28.9k
        num_skipped = 0;
386
28.9k
        reseek_done = false;
387
28.9k
        switch (ikey_.type) {
388
4.22k
          case kTypeDeletion:
389
4.22k
          case kTypeDeletionWithTimestamp:
390
4.22k
          case kTypeSingleDeletion:
391
            // Arrange to skip all upcoming entries for this key since
392
            // they are hidden by this deletion.
393
4.22k
            if (timestamp_lb_) {
394
0
              saved_key_.SetInternalKey(ikey_);
395
0
              valid_ = true;
396
0
              return true;
397
4.22k
            } else {
398
4.22k
              saved_key_.SetUserKey(
399
4.22k
                  ikey_.user_key, !pin_thru_lifetime_ ||
400
4.22k
                                      !iter_.iter()->IsKeyPinned() /* copy */);
401
4.22k
              skipping_saved_key = true;
402
4.22k
              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
403
4.22k
            }
404
4.22k
            break;
405
24.7k
          case kTypeValue:
406
24.7k
          case kTypeValuePreferredSeqno:
407
24.7k
          case kTypeBlobIndex:
408
24.7k
          case kTypeWideColumnEntity:
409
24.7k
            if (!PrepareValue()) {
410
0
              return false;
411
0
            }
412
24.7k
            if (timestamp_lb_) {
413
0
              saved_key_.SetInternalKey(ikey_);
414
24.7k
            } else {
415
24.7k
              saved_key_.SetUserKey(
416
24.7k
                  ikey_.user_key, !pin_thru_lifetime_ ||
417
24.7k
                                      !iter_.iter()->IsKeyPinned() /* copy */);
418
24.7k
            }
419
420
24.7k
            if (ikey_.type == kTypeBlobIndex) {
421
0
              if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) {
422
0
                return false;
423
0
              }
424
425
0
              SetValueAndColumnsFromPlain(expose_blob_index_ ? iter_.value()
426
0
                                                             : blob_value_);
427
24.7k
            } else if (ikey_.type == kTypeWideColumnEntity) {
428
0
              if (!SetValueAndColumnsFromEntity(iter_.value())) {
429
0
                return false;
430
0
              }
431
24.7k
            } else {
432
24.7k
              assert(ikey_.type == kTypeValue ||
433
24.7k
                     ikey_.type == kTypeValuePreferredSeqno);
434
24.7k
              Slice value = iter_.value();
435
24.7k
              saved_write_unix_time_ = iter_.write_unix_time();
436
24.7k
              if (ikey_.type == kTypeValuePreferredSeqno) {
437
0
                value = ParsePackedValueForValue(value);
438
0
              }
439
24.7k
              SetValueAndColumnsFromPlain(value);
440
24.7k
            }
441
442
24.7k
            valid_ = true;
443
24.7k
            return true;
444
0
            break;
445
0
          case kTypeMerge:
446
0
            if (!PrepareValue()) {
447
0
              return false;
448
0
            }
449
0
            saved_key_.SetUserKey(
450
0
                ikey_.user_key,
451
0
                !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
452
            // By now, we are sure the current ikey is going to yield a value
453
0
            current_entry_is_merged_ = true;
454
0
            valid_ = true;
455
0
            return MergeValuesNewToOld();  // Go to a different state machine
456
0
            break;
457
0
          default:
458
0
            valid_ = false;
459
0
            status_ = Status::Corruption(
460
0
                "Unknown value type: " +
461
0
                std::to_string(static_cast<unsigned int>(ikey_.type)));
462
0
            return false;
463
28.9k
        }
464
28.9k
      }
465
29.4k
    } else {
466
0
      if (more_recent) {
467
0
        PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
468
0
      }
469
470
      // This key was inserted after our snapshot was taken or skipped by
471
      // timestamp range. If this happens too many times in a row for the same
472
      // user key, we want to seek to the target sequence number.
473
0
      int cmp = user_comparator_.CompareWithoutTimestamp(
474
0
          ikey_.user_key, saved_key_.GetUserKey());
475
0
      if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
476
0
        num_skipped++;
477
0
      } else {
478
0
        saved_key_.SetUserKey(
479
0
            ikey_.user_key,
480
0
            !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
481
0
        skipping_saved_key = false;
482
0
        num_skipped = 0;
483
0
        reseek_done = false;
484
0
      }
485
0
    }
486
487
    // If we have sequentially iterated via numerous equal keys, then it's
488
    // better to seek so that we can avoid too many key comparisons.
489
    //
490
    // To avoid infinite loops, do not reseek if we have already attempted to
491
    // reseek previously.
492
    //
493
    // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
494
    // then it does not make sense to reseek as we would actually land further
495
    // away from the desired key. There is opportunity for optimization here.
496
4.68k
    if (num_skipped > max_skip_ && !reseek_done) {
497
2
      is_key_seqnum_zero_ = false;
498
2
      num_skipped = 0;
499
2
      reseek_done = true;
500
2
      std::string last_key;
501
2
      if (skipping_saved_key) {
502
        // We're looking for the next user-key but all we see are the same
503
        // user-key with decreasing sequence numbers. Fast forward to
504
        // sequence number 0 and type deletion (the smallest type).
505
2
        if (timestamp_size_ == 0) {
506
2
          AppendInternalKey(
507
2
              &last_key,
508
2
              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion));
509
2
        } else {
510
0
          const std::string kTsMin(timestamp_size_, '\0');
511
0
          AppendInternalKeyWithDifferentTimestamp(
512
0
              &last_key,
513
0
              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion),
514
0
              kTsMin);
515
0
        }
516
        // Don't set skipping_saved_key = false because we may still see more
517
        // user-keys equal to saved_key_.
518
2
      } else {
519
        // We saw multiple entries with this user key and sequence numbers
520
        // higher than sequence_. Fast forward to sequence_.
521
        // Note that this only covers a case when a higher key was overwritten
522
        // many times since our snapshot was taken, not the case when a lot of
523
        // different keys were inserted after our snapshot was taken.
524
0
        if (timestamp_size_ == 0) {
525
0
          AppendInternalKey(
526
0
              &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
527
0
                                           kValueTypeForSeek));
528
0
        } else {
529
0
          AppendInternalKeyWithDifferentTimestamp(
530
0
              &last_key,
531
0
              ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
532
0
                                kValueTypeForSeek),
533
0
              *timestamp_ub_);
534
0
        }
535
0
      }
536
2
      iter_.Seek(last_key);
537
2
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
538
4.68k
    } else {
539
4.68k
      iter_.Next();
540
4.68k
    }
541
4.68k
  } while (iter_.Valid());
542
543
720
  valid_ = false;
544
720
  return iter_.status().ok();
545
25.4k
}
546
547
// Merge values of the same user key starting from the current iter_ position
548
// Scan from the newer entries to older entries.
549
// PRE: iter_.key() points to the first merge type entry
550
//      saved_key_ stores the user key
551
//      iter_.PrepareValue() has been called
552
// POST: saved_value_ has the merged value for the user key
553
//       iter_ points to the next entry (or invalid)
554
0
bool DBIter::MergeValuesNewToOld() {
555
0
  if (!merge_operator_) {
556
0
    ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
557
0
    status_ = Status::InvalidArgument("merge_operator_ must be set.");
558
0
    valid_ = false;
559
0
    return false;
560
0
  }
561
562
  // Temporarily pin the blocks that hold merge operands
563
0
  TempPinData();
564
0
  merge_context_.Clear();
565
  // Start the merge process by pushing the first operand
566
0
  merge_context_.PushOperand(
567
0
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
568
0
  PERF_COUNTER_ADD(internal_merge_count, 1);
569
570
0
  TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
571
572
0
  ParsedInternalKey ikey;
573
0
  for (iter_.Next(); iter_.Valid(); iter_.Next()) {
574
0
    TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
575
0
    if (!ParseKey(&ikey)) {
576
0
      return false;
577
0
    }
578
579
0
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
580
0
                                                saved_key_.GetUserKey())) {
581
      // hit the next user key, stop right here
582
0
      break;
583
0
    }
584
0
    if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
585
0
        kTypeDeletionWithTimestamp == ikey.type) {
586
      // hit a delete with the same user key, stop right here
587
      // iter_ is positioned after delete
588
0
      iter_.Next();
589
0
      break;
590
0
    }
591
0
    if (!PrepareValue()) {
592
0
      return false;
593
0
    }
594
595
0
    if (kTypeValue == ikey.type || kTypeValuePreferredSeqno == ikey.type) {
596
0
      Slice value = iter_.value();
597
0
      saved_write_unix_time_ = iter_.write_unix_time();
598
0
      if (kTypeValuePreferredSeqno == ikey.type) {
599
0
        value = ParsePackedValueForValue(value);
600
0
      }
601
      // hit a put or put equivalent, merge the put value with operands and
602
      // store the final result in saved_value_. We are done!
603
0
      if (!MergeWithPlainBaseValue(value, ikey.user_key)) {
604
0
        return false;
605
0
      }
606
      // iter_ is positioned after put
607
0
      iter_.Next();
608
0
      if (!iter_.status().ok()) {
609
0
        valid_ = false;
610
0
        return false;
611
0
      }
612
0
      return true;
613
0
    } else if (kTypeMerge == ikey.type) {
614
      // hit a merge, add the value as an operand and run associative merge.
615
      // when complete, add result to operands and continue.
616
0
      merge_context_.PushOperand(
617
0
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
618
0
      PERF_COUNTER_ADD(internal_merge_count, 1);
619
0
    } else if (kTypeBlobIndex == ikey.type) {
620
0
      if (expose_blob_index_) {
621
0
        status_ =
622
0
            Status::NotSupported("BlobDB does not support merge operator.");
623
0
        valid_ = false;
624
0
        return false;
625
0
      }
626
      // hit a put, merge the put value with operands and store the
627
      // final result in saved_value_. We are done!
628
0
      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
629
0
        return false;
630
0
      }
631
0
      valid_ = true;
632
0
      if (!MergeWithPlainBaseValue(blob_value_, ikey.user_key)) {
633
0
        return false;
634
0
      }
635
636
0
      ResetBlobValue();
637
638
      // iter_ is positioned after put
639
0
      iter_.Next();
640
0
      if (!iter_.status().ok()) {
641
0
        valid_ = false;
642
0
        return false;
643
0
      }
644
0
      return true;
645
0
    } else if (kTypeWideColumnEntity == ikey.type) {
646
0
      if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) {
647
0
        return false;
648
0
      }
649
650
      // iter_ is positioned after put
651
0
      iter_.Next();
652
0
      if (!iter_.status().ok()) {
653
0
        valid_ = false;
654
0
        return false;
655
0
      }
656
657
0
      return true;
658
0
    } else {
659
0
      valid_ = false;
660
0
      status_ = Status::Corruption(
661
0
          "Unrecognized value type: " +
662
0
          std::to_string(static_cast<unsigned int>(ikey.type)));
663
0
      return false;
664
0
    }
665
0
  }
666
667
0
  if (!iter_.status().ok()) {
668
0
    valid_ = false;
669
0
    return false;
670
0
  }
671
672
  // we either exhausted all internal keys under this user key, or hit
673
  // a deletion marker.
674
  // feed null as the existing value to the merge operator, such that
675
  // client can differentiate this scenario and do things accordingly.
676
0
  if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
677
0
    return false;
678
0
  }
679
0
  assert(status_.ok());
680
0
  return true;
681
0
}
682
683
0
void DBIter::Prev() {
684
0
  assert(valid_);
685
0
  assert(status_.ok());
686
687
0
  PERF_COUNTER_ADD(iter_prev_count, 1);
688
0
  PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
689
0
  ReleaseTempPinnedData();
690
0
  ResetBlobValue();
691
0
  ResetValueAndColumns();
692
0
  ResetInternalKeysSkippedCounter();
693
0
  bool ok = true;
694
0
  if (direction_ == kForward) {
695
0
    if (!ReverseToBackward()) {
696
0
      ok = false;
697
0
    }
698
0
  }
699
0
  if (ok) {
700
0
    ClearSavedValue();
701
702
0
    Slice prefix;
703
0
    if (prefix_same_as_start_) {
704
0
      assert(prefix_extractor_ != nullptr);
705
0
      prefix = prefix_.GetUserKey();
706
0
    }
707
0
    PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
708
0
  }
709
710
0
  if (statistics_ != nullptr) {
711
0
    local_stats_.prev_count_++;
712
0
    if (valid_) {
713
0
      local_stats_.prev_found_count_++;
714
0
      local_stats_.bytes_read_ += (key().size() + value().size());
715
0
    }
716
0
  }
717
0
}
718
719
0
bool DBIter::ReverseToForward() {
720
0
  assert(iter_.status().ok());
721
722
  // When moving backwards, iter_ is positioned on _previous_ key, which may
723
  // not exist or may have different prefix than the current key().
724
  // If that's the case, seek iter_ to current key.
725
0
  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
726
0
    std::string last_key;
727
0
    if (timestamp_size_ == 0) {
728
0
      AppendInternalKey(
729
0
          &last_key, ParsedInternalKey(saved_key_.GetUserKey(),
730
0
                                       kMaxSequenceNumber, kValueTypeForSeek));
731
0
    } else {
732
      // TODO: pre-create kTsMax.
733
0
      const std::string kTsMax(timestamp_size_, '\xff');
734
0
      AppendInternalKeyWithDifferentTimestamp(
735
0
          &last_key,
736
0
          ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
737
0
                            kValueTypeForSeek),
738
0
          kTsMax);
739
0
    }
740
0
    iter_.Seek(last_key);
741
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
742
0
  }
743
744
0
  direction_ = kForward;
745
  // Skip keys less than the current key() (a.k.a. saved_key_).
746
0
  while (iter_.Valid()) {
747
0
    ParsedInternalKey ikey;
748
0
    if (!ParseKey(&ikey)) {
749
0
      return false;
750
0
    }
751
0
    if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
752
0
      return true;
753
0
    }
754
0
    iter_.Next();
755
0
  }
756
757
0
  if (!iter_.status().ok()) {
758
0
    valid_ = false;
759
0
    return false;
760
0
  }
761
762
0
  return true;
763
0
}
764
765
// Move iter_ to the key before saved_key_.
766
0
bool DBIter::ReverseToBackward() {
767
0
  assert(iter_.status().ok());
768
769
  // When current_entry_is_merged_ is true, iter_ may be positioned on the next
770
  // key, which may not exist or may have prefix different from current.
771
  // If that's the case, seek to saved_key_.
772
0
  if (current_entry_is_merged_ &&
773
0
      (!expect_total_order_inner_iter() || !iter_.Valid())) {
774
0
    IterKey last_key;
775
    // Using kMaxSequenceNumber and kValueTypeForSeek
776
    // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
777
    // than saved_key_.
778
0
    last_key.SetInternalKey(ParsedInternalKey(
779
0
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
780
0
    if (!expect_total_order_inner_iter()) {
781
0
      iter_.SeekForPrev(last_key.GetInternalKey());
782
0
    } else {
783
      // Some iterators may not support SeekForPrev(), so we avoid using it
784
      // when prefix seek mode is disabled. This is somewhat expensive
785
      // (an extra Prev(), as well as an extra change of direction of iter_),
786
      // so we may need to reconsider it later.
787
0
      iter_.Seek(last_key.GetInternalKey());
788
0
      if (!iter_.Valid() && iter_.status().ok()) {
789
0
        iter_.SeekToLast();
790
0
      }
791
0
    }
792
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
793
0
  }
794
795
0
  direction_ = kReverse;
796
0
  return FindUserKeyBeforeSavedKey();
797
0
}
798
799
395
void DBIter::PrevInternal(const Slice* prefix) {
800
567
  while (iter_.Valid()) {
801
395
    saved_key_.SetUserKey(
802
395
        ExtractUserKey(iter_.key()),
803
395
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
804
805
395
    assert(prefix == nullptr || prefix_extractor_ != nullptr);
806
395
    if (prefix != nullptr &&
807
395
        prefix_extractor_
808
0
                ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(),
809
0
                                                      timestamp_size_))
810
0
                .compare(*prefix) != 0) {
811
0
      assert(prefix_same_as_start_);
812
      // Current key does not have the same prefix as start
813
0
      valid_ = false;
814
0
      return;
815
0
    }
816
817
395
    assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
818
395
           user_comparator_.CompareWithoutTimestamp(
819
395
               saved_key_.GetUserKey(), /*a_has_ts=*/true,
820
395
               *iterate_lower_bound_, /*b_has_ts=*/false) >= 0);
821
395
    if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
822
395
        user_comparator_.CompareWithoutTimestamp(
823
0
            saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
824
0
            /*b_has_ts=*/false) < 0) {
825
      // We've iterated earlier than the user-specified lower bound.
826
0
      valid_ = false;
827
0
      return;
828
0
    }
829
830
395
    if (!FindValueForCurrentKey()) {  // assigns valid_
831
0
      return;
832
0
    }
833
834
    // Whether or not we found a value for current key, we need iter_ to end up
835
    // on a smaller key.
836
395
    if (!FindUserKeyBeforeSavedKey()) {
837
0
      return;
838
0
    }
839
840
395
    if (valid_) {
841
      // Found the value.
842
223
      return;
843
223
    }
844
845
172
    if (TooManyInternalKeysSkipped(false)) {
846
0
      return;
847
0
    }
848
172
  }
849
850
  // We haven't found any key - iterator is not valid
851
172
  valid_ = false;
852
172
}
853
854
// Used for backwards iteration.
855
// Looks at the entries with user key saved_key_ and finds the most up-to-date
856
// value for it, or executes a merge, or determines that the value was deleted.
857
// Sets valid_ to true if the value is found and is ready to be presented to
858
// the user through value().
859
// Sets valid_ to false if the value was deleted, and we should try another key.
860
// Returns false if an error occurred, and !status().ok() and !valid_.
861
//
862
// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
863
// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
864
//       the entry just before them, or on the entry just after them.
865
395
bool DBIter::FindValueForCurrentKey() {
866
395
  assert(iter_.Valid());
867
395
  merge_context_.Clear();
868
395
  current_entry_is_merged_ = false;
869
  // last entry before merge (could be kTypeDeletion,
870
  // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue
871
  // kTypeBlobIndex, kTypeWideColumnEntity or kTypeValuePreferredSeqno)
872
395
  ValueType last_not_merge_type = kTypeDeletion;
873
395
  ValueType last_key_entry_type = kTypeDeletion;
874
875
  // If false, it indicates that we have not seen any valid entry, even though
876
  // last_key_entry_type is initialized to kTypeDeletion.
877
395
  bool valid_entry_seen = false;
878
879
  // Temporarily pin blocks that hold (merge operands / the value)
880
395
  ReleaseTempPinnedData();
881
395
  TempPinData();
882
395
  size_t num_skipped = 0;
883
1.99k
  while (iter_.Valid()) {
884
1.70k
    ParsedInternalKey ikey;
885
1.70k
    if (!ParseKey(&ikey)) {
886
0
      return false;
887
0
    }
888
889
1.70k
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
890
1.70k
                                                saved_key_.GetUserKey())) {
891
      // Found a smaller user key, thus we are done with current user key.
892
0
      break;
893
0
    }
894
895
1.70k
    assert(ikey.user_key.size() >= timestamp_size_);
896
1.70k
    Slice ts;
897
1.70k
    if (timestamp_size_ > 0) {
898
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
899
0
                 timestamp_size_);
900
0
    }
901
902
1.70k
    bool visible = IsVisible(ikey.sequence, ts);
903
1.70k
    if (!visible &&
904
1.70k
        (timestamp_lb_ == nullptr ||
905
0
         user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) {
906
      // Found an invisible version of the current user key, and it must have
907
      // a higher sequence number or timestamp. Therefore, we are done with the
908
      // current user key.
909
0
      break;
910
0
    }
911
912
1.70k
    if (!ts.empty()) {
913
0
      saved_timestamp_.assign(ts.data(), ts.size());
914
0
    }
915
916
1.70k
    if (TooManyInternalKeysSkipped()) {
917
0
      return false;
918
0
    }
919
920
    // This user key has lots of entries.
921
    // We're going from old to new, and it's taking too long. Let's do a Seek()
922
    // and go from new to old. This helps when a key was overwritten many times.
923
1.70k
    if (num_skipped >= max_skip_) {
924
109
      return FindValueForCurrentKeyUsingSeek();
925
109
    }
926
927
1.59k
    if (!PrepareValue()) {
928
0
      return false;
929
0
    }
930
931
1.59k
    if (timestamp_lb_ != nullptr) {
932
      // Only needed when timestamp_lb_ is not null
933
0
      [[maybe_unused]] const bool ret = ParseKey(&ikey_);
934
      // Since the preceding ParseKey(&ikey) succeeds, so must this.
935
0
      assert(ret);
936
0
      saved_key_.SetInternalKey(ikey);
937
1.59k
    } else if (user_comparator_.Compare(ikey.user_key,
938
1.59k
                                        saved_key_.GetUserKey()) < 0) {
939
0
      saved_key_.SetUserKey(
940
0
          ikey.user_key,
941
0
          !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
942
0
    }
943
944
1.59k
    valid_entry_seen = true;
945
1.59k
    last_key_entry_type = ikey.type;
946
1.59k
    switch (last_key_entry_type) {
947
1.01k
      case kTypeValue:
948
1.01k
      case kTypeValuePreferredSeqno:
949
1.01k
      case kTypeBlobIndex:
950
1.01k
      case kTypeWideColumnEntity:
951
1.01k
        if (iter_.iter()->IsValuePinned()) {
952
1.01k
          saved_write_unix_time_ = iter_.write_unix_time();
953
1.01k
          if (last_key_entry_type == kTypeValuePreferredSeqno) {
954
0
            pinned_value_ = ParsePackedValueForValue(iter_.value());
955
1.01k
          } else {
956
1.01k
            pinned_value_ = iter_.value();
957
1.01k
          }
958
1.01k
        } else {
959
0
          valid_ = false;
960
0
          status_ = Status::NotSupported(
961
0
              "Backward iteration not supported if underlying iterator's value "
962
0
              "cannot be pinned.");
963
0
        }
964
1.01k
        merge_context_.Clear();
965
1.01k
        last_not_merge_type = last_key_entry_type;
966
1.01k
        if (!status_.ok()) {
967
0
          return false;
968
0
        }
969
1.01k
        break;
970
1.01k
      case kTypeDeletion:
971
578
      case kTypeDeletionWithTimestamp:
972
578
      case kTypeSingleDeletion:
973
578
        merge_context_.Clear();
974
578
        last_not_merge_type = last_key_entry_type;
975
578
        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
976
578
        break;
977
0
      case kTypeMerge: {
978
0
        assert(merge_operator_ != nullptr);
979
0
        merge_context_.PushOperandBack(
980
0
            iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
981
0
        PERF_COUNTER_ADD(internal_merge_count, 1);
982
0
      } break;
983
0
      default:
984
0
        valid_ = false;
985
0
        status_ = Status::Corruption(
986
0
            "Unknown value type: " +
987
0
            std::to_string(static_cast<unsigned int>(last_key_entry_type)));
988
0
        return false;
989
1.59k
    }
990
991
1.59k
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
992
1.59k
    iter_.Prev();
993
1.59k
    ++num_skipped;
994
995
1.59k
    if (visible && timestamp_lb_ != nullptr) {
996
      // If timestamp_lb_ is not nullptr, we do not have to look further for
997
      // another internal key. We can return this current internal key. Yet we
998
      // still keep the invariant that iter_ is positioned before the returned
999
      // key.
1000
0
      break;
1001
0
    }
1002
1.59k
  }
1003
1004
286
  if (!iter_.status().ok()) {
1005
0
    valid_ = false;
1006
0
    return false;
1007
0
  }
1008
1009
286
  if (!valid_entry_seen) {
1010
    // Since we haven't seen any valid entry, last_key_entry_type remains
1011
    // unchanged and the same as its initial value.
1012
0
    assert(last_key_entry_type == kTypeDeletion);
1013
0
    assert(last_not_merge_type == kTypeDeletion);
1014
0
    valid_ = false;
1015
0
    return true;
1016
0
  }
1017
1018
286
  if (timestamp_lb_ != nullptr) {
1019
0
    assert(last_key_entry_type == ikey_.type);
1020
0
  }
1021
1022
286
  switch (last_key_entry_type) {
1023
88
    case kTypeDeletion:
1024
88
    case kTypeDeletionWithTimestamp:
1025
88
    case kTypeSingleDeletion:
1026
88
      if (timestamp_lb_ == nullptr) {
1027
88
        valid_ = false;
1028
88
      } else {
1029
0
        valid_ = true;
1030
0
      }
1031
88
      return true;
1032
0
    case kTypeMerge:
1033
0
      current_entry_is_merged_ = true;
1034
0
      if (last_not_merge_type == kTypeDeletion ||
1035
0
          last_not_merge_type == kTypeSingleDeletion ||
1036
0
          last_not_merge_type == kTypeDeletionWithTimestamp) {
1037
0
        if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
1038
0
          return false;
1039
0
        }
1040
0
        return true;
1041
0
      } else if (last_not_merge_type == kTypeBlobIndex) {
1042
0
        if (expose_blob_index_) {
1043
0
          status_ =
1044
0
              Status::NotSupported("BlobDB does not support merge operator.");
1045
0
          valid_ = false;
1046
0
          return false;
1047
0
        }
1048
0
        if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
1049
0
          return false;
1050
0
        }
1051
0
        valid_ = true;
1052
0
        if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) {
1053
0
          return false;
1054
0
        }
1055
1056
0
        ResetBlobValue();
1057
1058
0
        return true;
1059
0
      } else if (last_not_merge_type == kTypeWideColumnEntity) {
1060
0
        if (!MergeWithWideColumnBaseValue(pinned_value_,
1061
0
                                          saved_key_.GetUserKey())) {
1062
0
          return false;
1063
0
        }
1064
1065
0
        return true;
1066
0
      } else {
1067
0
        assert(last_not_merge_type == kTypeValue ||
1068
0
               last_not_merge_type == kTypeValuePreferredSeqno);
1069
0
        if (!MergeWithPlainBaseValue(pinned_value_, saved_key_.GetUserKey())) {
1070
0
          return false;
1071
0
        }
1072
0
        return true;
1073
0
      }
1074
0
      break;
1075
198
    case kTypeValue:
1076
198
    case kTypeValuePreferredSeqno:
1077
198
      SetValueAndColumnsFromPlain(pinned_value_);
1078
1079
198
      break;
1080
0
    case kTypeBlobIndex:
1081
0
      if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
1082
0
        return false;
1083
0
      }
1084
1085
0
      SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
1086
0
                                                     : blob_value_);
1087
1088
0
      break;
1089
0
    case kTypeWideColumnEntity:
1090
0
      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
1091
0
        return false;
1092
0
      }
1093
0
      break;
1094
0
    default:
1095
0
      valid_ = false;
1096
0
      status_ = Status::Corruption(
1097
0
          "Unknown value type: " +
1098
0
          std::to_string(static_cast<unsigned int>(last_key_entry_type)));
1099
0
      return false;
1100
286
  }
1101
198
  valid_ = true;
1102
198
  return true;
1103
286
}
1104
1105
// This function is used in FindValueForCurrentKey.
1106
// We use Seek() function instead of Prev() to find necessary value
1107
// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
1108
//       Would be nice to reuse some code.
1109
109
bool DBIter::FindValueForCurrentKeyUsingSeek() {
1110
  // FindValueForCurrentKey will enable pinning before calling
1111
  // FindValueForCurrentKeyUsingSeek()
1112
109
  assert(pinned_iters_mgr_.PinningEnabled());
1113
109
  std::string last_key;
1114
109
  if (0 == timestamp_size_) {
1115
109
    AppendInternalKey(&last_key,
1116
109
                      ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
1117
109
                                        kValueTypeForSeek));
1118
109
  } else {
1119
0
    AppendInternalKeyWithDifferentTimestamp(
1120
0
        &last_key,
1121
0
        ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
1122
0
                          kValueTypeForSeek),
1123
0
        timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
1124
0
  }
1125
109
  iter_.Seek(last_key);
1126
109
  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1127
1128
  // In case read_callback presents, the value we seek to may not be visible.
1129
  // Find the next value that's visible.
1130
109
  ParsedInternalKey ikey;
1131
1132
109
  while (true) {
1133
109
    if (!iter_.Valid()) {
1134
0
      valid_ = false;
1135
0
      return iter_.status().ok();
1136
0
    }
1137
1138
109
    if (!ParseKey(&ikey)) {
1139
0
      return false;
1140
0
    }
1141
109
    assert(ikey.user_key.size() >= timestamp_size_);
1142
109
    Slice ts;
1143
109
    if (timestamp_size_ > 0) {
1144
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
1145
0
                 timestamp_size_);
1146
0
    }
1147
1148
109
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
1149
109
                                                saved_key_.GetUserKey())) {
1150
      // No visible values for this key, even though FindValueForCurrentKey()
1151
      // has seen some. This is possible if we're using a tailing iterator, and
1152
      // the entries were discarded in a compaction.
1153
0
      valid_ = false;
1154
0
      return true;
1155
0
    }
1156
1157
109
    if (IsVisible(ikey.sequence, ts)) {
1158
109
      break;
1159
109
    }
1160
1161
0
    iter_.Next();
1162
0
  }
1163
1164
109
  if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
1165
109
      kTypeDeletionWithTimestamp == ikey.type) {
1166
84
    if (timestamp_lb_ == nullptr) {
1167
84
      valid_ = false;
1168
84
    } else {
1169
0
      valid_ = true;
1170
0
      saved_key_.SetInternalKey(ikey);
1171
0
    }
1172
84
    return true;
1173
84
  }
1174
25
  if (!PrepareValue()) {
1175
0
    return false;
1176
0
  }
1177
25
  if (timestamp_size_ > 0) {
1178
0
    Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_);
1179
0
    saved_timestamp_.assign(ts.data(), ts.size());
1180
0
  }
1181
25
  if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno ||
1182
25
      ikey.type == kTypeBlobIndex || ikey.type == kTypeWideColumnEntity) {
1183
25
    assert(iter_.iter()->IsValuePinned());
1184
25
    saved_write_unix_time_ = iter_.write_unix_time();
1185
25
    if (ikey.type == kTypeValuePreferredSeqno) {
1186
0
      pinned_value_ = ParsePackedValueForValue(iter_.value());
1187
25
    } else {
1188
25
      pinned_value_ = iter_.value();
1189
25
    }
1190
25
    if (ikey.type == kTypeBlobIndex) {
1191
0
      if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) {
1192
0
        return false;
1193
0
      }
1194
1195
0
      SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
1196
0
                                                     : blob_value_);
1197
25
    } else if (ikey.type == kTypeWideColumnEntity) {
1198
0
      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
1199
0
        return false;
1200
0
      }
1201
25
    } else {
1202
25
      assert(ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno);
1203
25
      SetValueAndColumnsFromPlain(pinned_value_);
1204
25
    }
1205
1206
25
    if (timestamp_lb_ != nullptr) {
1207
0
      saved_key_.SetInternalKey(ikey);
1208
0
    }
1209
1210
25
    valid_ = true;
1211
25
    return true;
1212
25
  }
1213
1214
  // kTypeMerge. We need to collect all kTypeMerge values and save them
1215
  // in operands
1216
0
  assert(ikey.type == kTypeMerge);
1217
0
  current_entry_is_merged_ = true;
1218
0
  merge_context_.Clear();
1219
0
  merge_context_.PushOperand(
1220
0
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1221
0
  PERF_COUNTER_ADD(internal_merge_count, 1);
1222
1223
0
  while (true) {
1224
0
    iter_.Next();
1225
1226
0
    if (!iter_.Valid()) {
1227
0
      if (!iter_.status().ok()) {
1228
0
        valid_ = false;
1229
0
        return false;
1230
0
      }
1231
0
      break;
1232
0
    }
1233
0
    if (!ParseKey(&ikey)) {
1234
0
      return false;
1235
0
    }
1236
0
    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
1237
0
                                                saved_key_.GetUserKey())) {
1238
0
      break;
1239
0
    }
1240
0
    if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
1241
0
        ikey.type == kTypeDeletionWithTimestamp) {
1242
0
      break;
1243
0
    }
1244
0
    if (!PrepareValue()) {
1245
0
      return false;
1246
0
    }
1247
1248
0
    if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno) {
1249
0
      Slice value = iter_.value();
1250
0
      if (ikey.type == kTypeValuePreferredSeqno) {
1251
0
        value = ParsePackedValueForValue(value);
1252
0
      }
1253
0
      if (!MergeWithPlainBaseValue(value, saved_key_.GetUserKey())) {
1254
0
        return false;
1255
0
      }
1256
0
      return true;
1257
0
    } else if (ikey.type == kTypeMerge) {
1258
0
      merge_context_.PushOperand(
1259
0
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
1260
0
      PERF_COUNTER_ADD(internal_merge_count, 1);
1261
0
    } else if (ikey.type == kTypeBlobIndex) {
1262
0
      if (expose_blob_index_) {
1263
0
        status_ =
1264
0
            Status::NotSupported("BlobDB does not support merge operator.");
1265
0
        valid_ = false;
1266
0
        return false;
1267
0
      }
1268
0
      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
1269
0
        return false;
1270
0
      }
1271
0
      valid_ = true;
1272
0
      if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) {
1273
0
        return false;
1274
0
      }
1275
1276
0
      ResetBlobValue();
1277
1278
0
      return true;
1279
0
    } else if (ikey.type == kTypeWideColumnEntity) {
1280
0
      if (!MergeWithWideColumnBaseValue(iter_.value(),
1281
0
                                        saved_key_.GetUserKey())) {
1282
0
        return false;
1283
0
      }
1284
1285
0
      return true;
1286
0
    } else {
1287
0
      valid_ = false;
1288
0
      status_ = Status::Corruption(
1289
0
          "Unknown value type: " +
1290
0
          std::to_string(static_cast<unsigned int>(ikey.type)));
1291
0
      return false;
1292
0
    }
1293
0
  }
1294
1295
0
  if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) {
1296
0
    return false;
1297
0
  }
1298
1299
  // Make sure we leave iter_ in a good state. If it's valid and we don't care
1300
  // about prefixes, that's already good enough. Otherwise it needs to be
1301
  // seeked to the current key.
1302
0
  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
1303
0
    if (!expect_total_order_inner_iter()) {
1304
0
      iter_.SeekForPrev(last_key);
1305
0
    } else {
1306
0
      iter_.Seek(last_key);
1307
0
      if (!iter_.Valid() && iter_.status().ok()) {
1308
0
        iter_.SeekToLast();
1309
0
      }
1310
0
    }
1311
0
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1312
0
  }
1313
1314
0
  valid_ = true;
1315
0
  return true;
1316
0
}
1317
1318
0
bool DBIter::MergeWithNoBaseValue(const Slice& user_key) {
1319
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1320
  // since a failure must be propagated regardless of its value.
1321
0
  ValueType result_type;
1322
0
  const Status s = MergeHelper::TimedFullMerge(
1323
0
      merge_operator_, user_key, MergeHelper::kNoBaseValue,
1324
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1325
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1326
0
      &saved_value_, &pinned_value_, &result_type);
1327
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1328
0
}
1329
1330
bool DBIter::MergeWithPlainBaseValue(const Slice& value,
1331
0
                                     const Slice& user_key) {
1332
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1333
  // since a failure must be propagated regardless of its value.
1334
0
  ValueType result_type;
1335
0
  const Status s = MergeHelper::TimedFullMerge(
1336
0
      merge_operator_, user_key, MergeHelper::kPlainBaseValue, value,
1337
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1338
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1339
0
      &saved_value_, &pinned_value_, &result_type);
1340
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1341
0
}
1342
1343
bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity,
1344
0
                                          const Slice& user_key) {
1345
  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
1346
  // since a failure must be propagated regardless of its value.
1347
0
  ValueType result_type;
1348
0
  const Status s = MergeHelper::TimedFullMerge(
1349
0
      merge_operator_, user_key, MergeHelper::kWideBaseValue, entity,
1350
0
      merge_context_.GetOperands(), logger_, statistics_, clock_,
1351
0
      /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
1352
0
      &saved_value_, &pinned_value_, &result_type);
1353
0
  return SetValueAndColumnsFromMergeResult(s, result_type);
1354
0
}
1355
1356
// Move backwards until the key smaller than saved_key_.
1357
// Changes valid_ only if return value is false.
1358
395
bool DBIter::FindUserKeyBeforeSavedKey() {
1359
395
  assert(status_.ok());
1360
395
  size_t num_skipped = 0;
1361
504
  while (iter_.Valid()) {
1362
109
    ParsedInternalKey ikey;
1363
109
    if (!ParseKey(&ikey)) {
1364
0
      return false;
1365
0
    }
1366
1367
109
    if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) {
1368
0
      return true;
1369
0
    }
1370
1371
109
    if (TooManyInternalKeysSkipped()) {
1372
0
      return false;
1373
0
    }
1374
1375
109
    assert(ikey.sequence != kMaxSequenceNumber);
1376
109
    assert(ikey.user_key.size() >= timestamp_size_);
1377
109
    Slice ts;
1378
109
    if (timestamp_size_ > 0) {
1379
0
      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
1380
0
                 timestamp_size_);
1381
0
    }
1382
109
    if (!IsVisible(ikey.sequence, ts)) {
1383
0
      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
1384
109
    } else {
1385
109
      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
1386
109
    }
1387
1388
109
    if (num_skipped >= max_skip_) {
1389
0
      num_skipped = 0;
1390
0
      std::string last_key;
1391
0
      if (timestamp_size_ == 0) {
1392
0
        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
1393
0
                                                       kMaxSequenceNumber,
1394
0
                                                       kValueTypeForSeek));
1395
0
      } else {
1396
        // TODO: pre-create kTsMax.
1397
0
        const std::string kTsMax(timestamp_size_, '\xff');
1398
0
        AppendInternalKeyWithDifferentTimestamp(
1399
0
            &last_key,
1400
0
            ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
1401
0
                              kValueTypeForSeek),
1402
0
            kTsMax);
1403
0
      }
1404
      // It would be more efficient to use SeekForPrev() here, but some
1405
      // iterators may not support it.
1406
0
      iter_.Seek(last_key);
1407
0
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1408
0
      if (!iter_.Valid()) {
1409
0
        break;
1410
0
      }
1411
109
    } else {
1412
109
      ++num_skipped;
1413
109
    }
1414
1415
109
    iter_.Prev();
1416
109
  }
1417
1418
395
  if (!iter_.status().ok()) {
1419
0
    valid_ = false;
1420
0
    return false;
1421
0
  }
1422
1423
395
  return true;
1424
395
}
1425
1426
31.3k
bool DBIter::TooManyInternalKeysSkipped(bool increment) {
1427
31.3k
  if ((max_skippable_internal_keys_ > 0) &&
1428
31.3k
      (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
1429
0
    valid_ = false;
1430
0
    status_ = Status::Incomplete("Too many internal keys skipped.");
1431
0
    return true;
1432
31.3k
  } else if (increment) {
1433
31.2k
    num_internal_keys_skipped_++;
1434
31.2k
  }
1435
31.3k
  return false;
1436
31.3k
}
1437
1438
bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts,
1439
31.3k
                       bool* more_recent) {
1440
  // Remember that comparator orders preceding timestamp as larger.
1441
  // TODO(yanqin): support timestamp in read_callback_.
1442
31.3k
  bool visible_by_seq = (read_callback_ == nullptr)
1443
31.3k
                            ? sequence <= sequence_
1444
31.3k
                            : read_callback_->IsVisible(sequence);
1445
1446
31.3k
  bool visible_by_ts =
1447
31.3k
      (timestamp_ub_ == nullptr ||
1448
31.3k
       user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) &&
1449
31.3k
      (timestamp_lb_ == nullptr ||
1450
31.3k
       user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0);
1451
1452
31.3k
  if (more_recent) {
1453
29.4k
    *more_recent = !visible_by_seq;
1454
29.4k
  }
1455
31.3k
  return visible_by_seq && visible_by_ts;
1456
31.3k
}
1457
1458
0
void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
1459
0
  is_key_seqnum_zero_ = false;
1460
0
  SequenceNumber seq = sequence_;
1461
0
  saved_key_.Clear();
1462
0
  saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_);
1463
1464
0
  if (iterate_lower_bound_ != nullptr &&
1465
0
      user_comparator_.CompareWithoutTimestamp(
1466
0
          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
1467
0
          /*b_has_ts=*/false) < 0) {
1468
    // Seek key is smaller than the lower bound.
1469
0
    saved_key_.Clear();
1470
0
    saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek,
1471
0
                              timestamp_ub_);
1472
0
  }
1473
0
}
1474
1475
489
void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
1476
489
  is_key_seqnum_zero_ = false;
1477
489
  saved_key_.Clear();
1478
  // now saved_key is used to store internal key.
1479
489
  saved_key_.SetInternalKey(target, 0 /* sequence_number */,
1480
489
                            kValueTypeForSeekForPrev, timestamp_ub_);
1481
1482
489
  if (timestamp_size_ > 0) {
1483
0
    const std::string kTsMin(timestamp_size_, '\0');
1484
0
    Slice ts = kTsMin;
1485
0
    saved_key_.UpdateInternalKey(
1486
0
        /*seq=*/0, kValueTypeForSeekForPrev,
1487
0
        timestamp_lb_ == nullptr ? &ts : timestamp_lb_);
1488
0
  }
1489
1490
489
  if (iterate_upper_bound_ != nullptr &&
1491
489
      user_comparator_.CompareWithoutTimestamp(
1492
0
          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_,
1493
0
          /*b_has_ts=*/false) >= 0) {
1494
0
    saved_key_.Clear();
1495
0
    saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber,
1496
0
                              kValueTypeForSeekForPrev, timestamp_ub_);
1497
0
    if (timestamp_size_ > 0) {
1498
0
      const std::string kTsMax(timestamp_size_, '\xff');
1499
0
      Slice ts = kTsMax;
1500
0
      saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev,
1501
0
                                   &ts);
1502
0
    }
1503
0
  }
1504
489
}
1505
1506
0
void DBIter::Seek(const Slice& target) {
1507
0
  PERF_COUNTER_ADD(iter_seek_count, 1);
1508
0
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1509
0
  StopWatch sw(clock_, statistics_, DB_SEEK);
1510
1511
0
  if (cfh_ != nullptr) {
1512
    // TODO: What do we do if this returns an error?
1513
0
    Slice lower_bound, upper_bound;
1514
0
    if (iterate_lower_bound_ != nullptr) {
1515
0
      lower_bound = *iterate_lower_bound_;
1516
0
    } else {
1517
0
      lower_bound = Slice("");
1518
0
    }
1519
0
    if (iterate_upper_bound_ != nullptr) {
1520
0
      upper_bound = *iterate_upper_bound_;
1521
0
    } else {
1522
0
      upper_bound = Slice("");
1523
0
    }
1524
0
    cfh_->db()
1525
0
        ->TraceIteratorSeek(cfh_->cfd()->GetID(), target, lower_bound,
1526
0
                            upper_bound)
1527
0
        .PermitUncheckedError();
1528
0
  }
1529
1530
0
  status_ = Status::OK();
1531
0
  ReleaseTempPinnedData();
1532
0
  ResetBlobValue();
1533
0
  ResetValueAndColumns();
1534
0
  ResetInternalKeysSkippedCounter();
1535
1536
  // Seek the inner iterator based on the target key.
1537
0
  {
1538
0
    PERF_TIMER_GUARD(seek_internal_seek_time);
1539
1540
0
    SetSavedKeyToSeekTarget(target);
1541
0
    iter_.Seek(saved_key_.GetInternalKey());
1542
1543
0
    RecordTick(statistics_, NUMBER_DB_SEEK);
1544
0
  }
1545
0
  if (!iter_.Valid()) {
1546
0
    valid_ = false;
1547
0
    return;
1548
0
  }
1549
0
  direction_ = kForward;
1550
1551
  // Now the inner iterator is placed to the target position. From there,
1552
  // we need to find out the next key that is visible to the user.
1553
0
  ClearSavedValue();
1554
0
  if (prefix_same_as_start_) {
1555
    // The case where the iterator needs to be invalidated if it has exhausted
1556
    // keys within the same prefix of the seek key.
1557
0
    assert(prefix_extractor_ != nullptr);
1558
0
    Slice target_prefix = prefix_extractor_->Transform(target);
1559
0
    FindNextUserEntry(false /* not skipping saved_key */,
1560
0
                      &target_prefix /* prefix */);
1561
0
    if (valid_) {
1562
      // Remember the prefix of the seek key for the future Next() call to
1563
      // check.
1564
0
      prefix_.SetUserKey(target_prefix);
1565
0
    }
1566
0
  } else {
1567
0
    FindNextUserEntry(false /* not skipping saved_key */, nullptr);
1568
0
  }
1569
0
  if (!valid_) {
1570
0
    return;
1571
0
  }
1572
1573
  // Updating stats and perf context counters.
1574
0
  if (statistics_ != nullptr) {
1575
    // Decrement since we don't want to count this key as skipped
1576
0
    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1577
0
    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1578
0
  }
1579
0
  PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1580
0
}
1581
1582
489
void DBIter::SeekForPrev(const Slice& target) {
1583
489
  PERF_COUNTER_ADD(iter_seek_count, 1);
1584
489
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1585
489
  StopWatch sw(clock_, statistics_, DB_SEEK);
1586
1587
489
  if (cfh_ != nullptr) {
1588
    // TODO: What do we do if this returns an error?
1589
489
    Slice lower_bound, upper_bound;
1590
489
    if (iterate_lower_bound_ != nullptr) {
1591
0
      lower_bound = *iterate_lower_bound_;
1592
489
    } else {
1593
489
      lower_bound = Slice("");
1594
489
    }
1595
489
    if (iterate_upper_bound_ != nullptr) {
1596
0
      upper_bound = *iterate_upper_bound_;
1597
489
    } else {
1598
489
      upper_bound = Slice("");
1599
489
    }
1600
489
    cfh_->db()
1601
489
        ->TraceIteratorSeekForPrev(cfh_->cfd()->GetID(), target, lower_bound,
1602
489
                                   upper_bound)
1603
489
        .PermitUncheckedError();
1604
489
  }
1605
1606
489
  status_ = Status::OK();
1607
489
  ReleaseTempPinnedData();
1608
489
  ResetBlobValue();
1609
489
  ResetValueAndColumns();
1610
489
  ResetInternalKeysSkippedCounter();
1611
1612
  // Seek the inner iterator based on the target key.
1613
489
  {
1614
489
    PERF_TIMER_GUARD(seek_internal_seek_time);
1615
489
    SetSavedKeyToSeekForPrevTarget(target);
1616
489
    iter_.SeekForPrev(saved_key_.GetInternalKey());
1617
489
    RecordTick(statistics_, NUMBER_DB_SEEK);
1618
489
  }
1619
489
  if (!iter_.Valid()) {
1620
94
    valid_ = false;
1621
94
    return;
1622
94
  }
1623
395
  direction_ = kReverse;
1624
1625
  // Now the inner iterator is placed to the target position. From there,
1626
  // we need to find out the first key that is visible to the user in the
1627
  // backward direction.
1628
395
  ClearSavedValue();
1629
395
  if (prefix_same_as_start_) {
1630
    // The case where the iterator needs to be invalidated if it has exhausted
1631
    // keys within the same prefix of the seek key.
1632
0
    assert(prefix_extractor_ != nullptr);
1633
0
    Slice target_prefix = prefix_extractor_->Transform(target);
1634
0
    PrevInternal(&target_prefix);
1635
0
    if (valid_) {
1636
      // Remember the prefix of the seek key for the future Prev() call to
1637
      // check.
1638
0
      prefix_.SetUserKey(target_prefix);
1639
0
    }
1640
395
  } else {
1641
395
    PrevInternal(nullptr);
1642
395
  }
1643
1644
  // Report stats and perf context.
1645
395
  if (statistics_ != nullptr && valid_) {
1646
0
    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1647
0
    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1648
0
    PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1649
0
  }
1650
395
}
1651
1652
5.18k
void DBIter::SeekToFirst() {
1653
5.18k
  if (iterate_lower_bound_ != nullptr) {
1654
0
    Seek(*iterate_lower_bound_);
1655
0
    return;
1656
0
  }
1657
5.18k
  PERF_COUNTER_ADD(iter_seek_count, 1);
1658
5.18k
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1659
  // Don't use iter_::Seek() if we set a prefix extractor
1660
  // because prefix seek will be used.
1661
5.18k
  if (!expect_total_order_inner_iter()) {
1662
0
    max_skip_ = std::numeric_limits<uint64_t>::max();
1663
0
  }
1664
5.18k
  status_ = Status::OK();
1665
  // if iterator is empty, this status_ could be unchecked.
1666
5.18k
  status_.PermitUncheckedError();
1667
5.18k
  direction_ = kForward;
1668
5.18k
  ReleaseTempPinnedData();
1669
5.18k
  ResetBlobValue();
1670
5.18k
  ResetValueAndColumns();
1671
5.18k
  ResetInternalKeysSkippedCounter();
1672
5.18k
  ClearSavedValue();
1673
5.18k
  is_key_seqnum_zero_ = false;
1674
1675
5.18k
  {
1676
5.18k
    PERF_TIMER_GUARD(seek_internal_seek_time);
1677
5.18k
    iter_.SeekToFirst();
1678
5.18k
  }
1679
1680
5.18k
  RecordTick(statistics_, NUMBER_DB_SEEK);
1681
5.18k
  if (iter_.Valid()) {
1682
4.20k
    saved_key_.SetUserKey(
1683
4.20k
        ExtractUserKey(iter_.key()),
1684
4.20k
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
1685
4.20k
    FindNextUserEntry(false /* not skipping saved_key */,
1686
4.20k
                      nullptr /* no prefix check */);
1687
4.20k
    if (statistics_ != nullptr) {
1688
0
      if (valid_) {
1689
0
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1690
0
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1691
0
        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1692
0
      }
1693
0
    }
1694
4.20k
  } else {
1695
975
    valid_ = false;
1696
975
  }
1697
5.18k
  if (valid_ && prefix_same_as_start_) {
1698
0
    assert(prefix_extractor_ != nullptr);
1699
0
    prefix_.SetUserKey(prefix_extractor_->Transform(
1700
0
        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
1701
0
  }
1702
5.18k
}
1703
1704
0
void DBIter::SeekToLast() {
1705
0
  if (iterate_upper_bound_ != nullptr) {
1706
    // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
1707
0
    SeekForPrev(*iterate_upper_bound_);
1708
#ifndef NDEBUG
1709
    Slice k = Valid() ? key() : Slice();
1710
    if (Valid() && timestamp_size_ > 0 && timestamp_lb_) {
1711
      k.remove_suffix(kNumInternalBytes + timestamp_size_);
1712
    }
1713
    assert(!Valid() || user_comparator_.CompareWithoutTimestamp(
1714
                           k, /*a_has_ts=*/false, *iterate_upper_bound_,
1715
                           /*b_has_ts=*/false) < 0);
1716
#endif
1717
0
    return;
1718
0
  }
1719
1720
0
  PERF_COUNTER_ADD(iter_seek_count, 1);
1721
0
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
1722
  // Don't use iter_::Seek() if we set a prefix extractor
1723
  // because prefix seek will be used.
1724
0
  if (!expect_total_order_inner_iter()) {
1725
0
    max_skip_ = std::numeric_limits<uint64_t>::max();
1726
0
  }
1727
0
  status_ = Status::OK();
1728
  // if iterator is empty, this status_ could be unchecked.
1729
0
  status_.PermitUncheckedError();
1730
0
  direction_ = kReverse;
1731
0
  ReleaseTempPinnedData();
1732
0
  ResetBlobValue();
1733
0
  ResetValueAndColumns();
1734
0
  ResetInternalKeysSkippedCounter();
1735
0
  ClearSavedValue();
1736
0
  is_key_seqnum_zero_ = false;
1737
1738
0
  {
1739
0
    PERF_TIMER_GUARD(seek_internal_seek_time);
1740
0
    iter_.SeekToLast();
1741
0
  }
1742
0
  PrevInternal(nullptr);
1743
0
  if (statistics_ != nullptr) {
1744
0
    RecordTick(statistics_, NUMBER_DB_SEEK);
1745
0
    if (valid_) {
1746
0
      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
1747
0
      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1748
0
      PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
1749
0
    }
1750
0
  }
1751
0
  if (valid_ && prefix_same_as_start_) {
1752
0
    assert(prefix_extractor_ != nullptr);
1753
0
    prefix_.SetUserKey(prefix_extractor_->Transform(
1754
0
        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
1755
0
  }
1756
0
}
1757
1758
Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
1759
                        const ImmutableOptions& ioptions,
1760
                        const MutableCFOptions& mutable_cf_options,
1761
                        const Comparator* user_key_comparator,
1762
                        InternalIterator* internal_iter, const Version* version,
1763
                        const SequenceNumber& sequence,
1764
                        uint64_t max_sequential_skip_in_iterations,
1765
                        ReadCallback* read_callback,
1766
0
                        ColumnFamilyHandleImpl* cfh, bool expose_blob_index) {
1767
0
  DBIter* db_iter = new DBIter(
1768
0
      env, read_options, ioptions, mutable_cf_options, user_key_comparator,
1769
0
      internal_iter, version, sequence, false,
1770
0
      max_sequential_skip_in_iterations, read_callback, cfh, expose_blob_index);
1771
0
  return db_iter;
1772
0
}
1773
1774
}  // namespace ROCKSDB_NAMESPACE