/src/rocksdb/db/db_iter.h
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #pragma once |
11 | | #include <cstdint> |
12 | | #include <string> |
13 | | |
14 | | #include "db/db_impl/db_impl.h" |
15 | | #include "memory/arena.h" |
16 | | #include "options/cf_options.h" |
17 | | #include "rocksdb/db.h" |
18 | | #include "rocksdb/iterator.h" |
19 | | #include "rocksdb/wide_columns.h" |
20 | | #include "table/iterator_wrapper.h" |
21 | | #include "util/autovector.h" |
22 | | |
23 | | namespace ROCKSDB_NAMESPACE { |
24 | | class Version; |
25 | | |
26 | | // This file declares class DBIter and its factory function NewIter(); a |
27 | | // DBIter may also be used wrapped in class ArenaWrappedDBIter (defined in |
28 | | // arena_wrapped_db_iter.h). Class DBIter, which is implemented in db_iter.cc, |
29 | | // is an iterator that converts internal keys (yielded by an InternalIterator) |
30 | | // that were live at the specified sequence number into appropriate user |
31 | | // keys. |
32 | | // Each internal key consists of a user key, a sequence number, and a value |
33 | | // type. DBIter deals with multiple key versions, tombstones, merge operands, |
34 | | // etc., and exposes an Iterator. |
35 | | // For example, DBIter may wrap the following InternalIterator: |
36 | | // user key: AAA value: v3 seqno: 100 type: Put |
37 | | // user key: AAA value: v2 seqno: 97 type: Put |
38 | | // user key: AAA value: v1 seqno: 95 type: Put |
39 | | // user key: BBB value: v1 seqno: 90 type: Put |
40 | | // user key: BBC value: N/A seqno: 98 type: Delete |
41 | | // user key: BBC value: v1 seqno: 95 type: Put |
42 | | // If the snapshot passed in is 102, then the DBIter is expected to |
43 | | // expose the following iterator: |
44 | | // key: AAA value: v3 |
45 | | // key: BBB value: v1 |
46 | | // If the snapshot passed in is 96, then it should expose: |
47 | | // key: AAA value: v1 |
48 | | // key: BBB value: v1 |
49 | | // key: BBC value: v1 |
50 | | // |
51 | | |
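As a minimal sketch of how the visibility rules described above surface through the public API (the function name and the loop body below are illustrative; DBIter itself is created internally by DB::NewIterator()):

    #include <memory>
    #include "rocksdb/db.h"

    void SnapshotScan(ROCKSDB_NAMESPACE::DB* db) {
      using namespace ROCKSDB_NAMESPACE;
      const Snapshot* snap = db->GetSnapshot();  // pins a sequence number
      ReadOptions ro;
      ro.snapshot = snap;  // iterate as of that sequence number
      std::unique_ptr<Iterator> it(db->NewIterator(ro));
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        // Only the newest version of each user key visible at the snapshot's
        // sequence number is exposed; a newer Delete hides older versions.
        Slice key = it->key();
        Slice value = it->value();
        (void)key;
        (void)value;
      }
      db->ReleaseSnapshot(snap);
    }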
52 | | // Memtables and sstables that make up the DB representation contain |
53 | | // (userkey,seq,type) => uservalue entries. DBIter |
54 | | // combines multiple entries for the same userkey found in the DB |
55 | | // representation into a single entry while accounting for sequence |
56 | | // numbers, deletion markers, overwrites, etc. |
57 | | class DBIter final : public Iterator { |
58 | | public: |
59 | | // Return a new DBIter that reads from `internal_iter` at the specified |
60 | | // `sequence` number. |
61 | | // |
62 | | // @param active_mem Pointer to the active memtable that `internal_iter` |
63 | | // is reading from. If not null, the memtable can be marked for flush |
64 | | // according to options mutable_cf_options.memtable_op_scan_flush_trigger |
65 | | // and mutable_cf_options.memtable_avg_op_scan_flush_trigger. |
66 | | // @param arena If non-null, the DBIter will be allocated from the arena. |
67 | | static DBIter* NewIter(Env* env, const ReadOptions& read_options, |
68 | | const ImmutableOptions& ioptions, |
69 | | const MutableCFOptions& mutable_cf_options, |
70 | | const Comparator* user_key_comparator, |
71 | | InternalIterator* internal_iter, |
72 | | const Version* version, const SequenceNumber& sequence, |
73 | | ReadCallback* read_callback, |
74 | | ReadOnlyMemTable* active_mem, |
75 | | ColumnFamilyHandleImpl* cfh = nullptr, |
76 | | bool expose_blob_index = false, |
77 | 16.2k | Arena* arena = nullptr) { |
78 | 16.2k | void* mem = arena ? arena->AllocateAligned(sizeof(DBIter)) |
79 | 16.2k | : operator new(sizeof(DBIter)); |
80 | 16.2k | DBIter* db_iter = new (mem) |
81 | 16.2k | DBIter(env, read_options, ioptions, mutable_cf_options, |
82 | 16.2k | user_key_comparator, internal_iter, version, sequence, arena, |
83 | 16.2k | read_callback, cfh, expose_blob_index, active_mem); |
84 | 16.2k | return db_iter; |
85 | 16.2k | } |
86 | | |
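A minimal sketch of how a caller might use NewIter() with arena allocation; the variables env, ropts, ioptions, mutable_cf_opts, ucmp, internal_iter, version, snapshot_seq and mem are assumed to exist in the calling context, and in RocksDB proper this wiring is handled by ArenaWrappedDBIter:

    Arena arena;
    DBIter* it = DBIter::NewIter(env, ropts, ioptions, mutable_cf_opts, ucmp,
                                 internal_iter, version, snapshot_seq,
                                 /*read_callback=*/nullptr,
                                 /*active_mem=*/mem,
                                 /*cfh=*/nullptr,
                                 /*expose_blob_index=*/false,
                                 /*arena=*/&arena);
    // ... use the iterator ...
    // The object lives in the arena, so it is destroyed explicitly instead of
    // being deleted; ~DBIter() releases the wrapped InternalIterator.
    it->~DBIter();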
87 | | // The following is grossly complicated. TODO: clean it up |
88 | | // Which direction is the iterator currently moving? |
89 | | // (1) When moving forward: |
90 | | // (1a) if current_entry_is_merged_ = false, the internal iterator is |
91 | | // positioned at the exact entry that yields this->key(), this->value() |
92 | | // (1b) if current_entry_is_merged_ = true, the internal iterator is |
93 | | // positioned immediately after the last entry that contributed to the |
94 | | // current this->value(). That entry may or may not have key equal to |
95 | | // this->key(). |
96 | | // (2) When moving backwards, the internal iterator is positioned |
97 | | // just before all entries whose user key == this->key(). |
98 | | enum Direction : uint8_t { kForward, kReverse }; |
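For instance, given the example entries at the top of this file and snapshot 102: while moving forward with current_entry_is_merged_ == false and this->key() == "AAA", the internal iterator sits exactly on (AAA, seq 100, Put); when moving backwards with this->key() == "BBB", the internal iterator is positioned before every BBB entry, i.e. on the last AAA entry (AAA, seq 95, Put).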
99 | | |
100 | | // LocalStatistics contains Statistics counters that are aggregated per |
101 | | // iterator instance and then sent to the global statistics when |
102 | | // the iterator is destroyed. |
103 | | // |
104 | | // The purpose of this approach is to avoid the perf regression that occurs |
105 | | // when multiple threads bump the atomic counters from DBIter::Next(). |
106 | | struct LocalStatistics { |
107 | 16.2k | explicit LocalStatistics() { ResetCounters(); } |
108 | | |
109 | 32.5k | void ResetCounters() { |
110 | 32.5k | next_count_ = 0; |
111 | 32.5k | next_found_count_ = 0; |
112 | 32.5k | prev_count_ = 0; |
113 | 32.5k | prev_found_count_ = 0; |
114 | 32.5k | bytes_read_ = 0; |
115 | 32.5k | skip_count_ = 0; |
116 | 32.5k | } |
117 | | |
118 | 16.2k | void BumpGlobalStatistics(Statistics* global_statistics) { |
119 | 16.2k | RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); |
120 | 16.2k | RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); |
121 | 16.2k | RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); |
122 | 16.2k | RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); |
123 | 16.2k | RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); |
124 | 16.2k | RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); |
125 | 16.2k | PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); |
126 | 16.2k | ResetCounters(); |
127 | 16.2k | } |
128 | | |
129 | | // Map to Tickers::NUMBER_DB_NEXT |
130 | | uint64_t next_count_; |
131 | | // Map to Tickers::NUMBER_DB_NEXT_FOUND |
132 | | uint64_t next_found_count_; |
133 | | // Map to Tickers::NUMBER_DB_PREV |
134 | | uint64_t prev_count_; |
135 | | // Map to Tickers::NUMBER_DB_PREV_FOUND |
136 | | uint64_t prev_found_count_; |
137 | | // Map to Tickers::ITER_BYTES_READ |
138 | | uint64_t bytes_read_; |
139 | | // Map to Tickers::NUMBER_ITER_SKIP |
140 | | uint64_t skip_count_; |
141 | | }; |
142 | | |
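The pattern LocalStatistics implements can be reduced to the following sketch (illustrative types only, not RocksDB API): each iterator bumps plain, unshared counters on the hot path and publishes them to the shared atomic counters exactly once, at destruction, instead of performing an atomic read-modify-write per Next():

    #include <atomic>
    #include <cstdint>

    struct GlobalStats {              // shared by all threads
      std::atomic<uint64_t> next_count{0};
    };

    struct LocalStats {               // owned by a single iterator
      uint64_t next_count = 0;        // plain increment: no contention
      void Flush(GlobalStats& g) {
        g.next_count.fetch_add(next_count, std::memory_order_relaxed);
        next_count = 0;
      }
    };

    // Hot path: ++local.next_count in Next(); once, in the destructor:
    // local.Flush(global);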
143 | | // No copying allowed |
144 | | DBIter(const DBIter&) = delete; |
145 | | void operator=(const DBIter&) = delete; |
146 | | |
147 | 16.2k | ~DBIter() override { |
148 | 16.2k | MarkMemtableForFlushForAvgTrigger(); |
149 | 16.2k | ThreadStatus::OperationType cur_op_type = |
150 | 16.2k | ThreadStatusUtil::GetThreadOperation(); |
151 | 16.2k | ThreadStatusUtil::SetThreadOperation( |
152 | 16.2k | ThreadStatus::OperationType::OP_UNKNOWN); |
153 | | // Release pinned data if any |
154 | 16.2k | if (pinned_iters_mgr_.PinningEnabled()) { |
155 | 3.18k | pinned_iters_mgr_.ReleasePinnedData(); |
156 | 3.18k | } |
157 | 16.2k | RecordTick(statistics_, NO_ITERATOR_DELETED); |
158 | 16.2k | ResetInternalKeysSkippedCounter(); |
159 | 16.2k | local_stats_.BumpGlobalStatistics(statistics_); |
160 | 16.2k | iter_.DeleteIter(arena_mode_); |
161 | 16.2k | ThreadStatusUtil::SetThreadOperation(cur_op_type); |
162 | 16.2k | } |
163 | 16.2k | void SetIter(InternalIterator* iter) { |
164 | 16.2k | assert(iter_.iter() == nullptr); |
165 | 16.2k | iter_.Set(iter); |
166 | 16.2k | iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); |
167 | 16.2k | } |
168 | | |
169 | 44.8k | bool Valid() const override { |
170 | | #ifdef ROCKSDB_ASSERT_STATUS_CHECKED |
171 | | if (valid_) { |
172 | | status_.PermitUncheckedError(); |
173 | | } |
174 | | #endif // ROCKSDB_ASSERT_STATUS_CHECKED |
175 | 44.8k | return valid_; |
176 | 44.8k | } |
177 | 33.1k | Slice key() const override { |
178 | 33.1k | assert(valid_); |
179 | 33.1k | if (timestamp_lb_) { |
180 | 0 | return saved_key_.GetInternalKey(); |
181 | 33.1k | } else { |
182 | 33.1k | const Slice ukey_and_ts = saved_key_.GetUserKey(); |
183 | 33.1k | return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_); |
184 | 33.1k | } |
185 | 33.1k | } |
186 | 33.1k | Slice value() const override { |
187 | 33.1k | assert(valid_); |
188 | | |
189 | 33.1k | return value_; |
190 | 33.1k | } |
191 | | |
192 | 0 | const WideColumns& columns() const override { |
193 | 0 | assert(valid_); |
194 | |
|
195 | 0 | return wide_columns_; |
196 | 0 | } |
197 | | |
198 | 0 | Status status() const override { |
199 | 0 | if (status_.ok()) { |
200 | 0 | return iter_.status(); |
201 | 0 | } else { |
202 | 0 | assert(!valid_); |
203 | 0 | return status_; |
204 | 0 | } |
205 | 0 | } |
206 | 0 | Slice timestamp() const override { |
207 | 0 | assert(valid_); |
208 | 0 | assert(timestamp_size_ > 0); |
209 | 0 | if (direction_ == kReverse) { |
210 | 0 | return saved_timestamp_; |
211 | 0 | } |
212 | 0 | const Slice ukey_and_ts = saved_key_.GetUserKey(); |
213 | 0 | assert(timestamp_size_ < ukey_and_ts.size()); |
214 | 0 | return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_); |
215 | 0 | } |
216 | 0 | bool IsBlob() const { |
217 | 0 | assert(valid_); |
218 | 0 | return is_blob_; |
219 | 0 | } |
220 | | |
221 | | Status GetProperty(std::string prop_name, std::string* prop) override; |
222 | | |
223 | | void Next() final override; |
224 | | void Prev() final override; |
225 | | // 'target' does not contain timestamp, even if user timestamp feature is |
226 | | // enabled. |
227 | | void Seek(const Slice& target) final override; |
228 | | void SeekForPrev(const Slice& target) final override; |
229 | | void SeekToFirst() final override; |
230 | | void SeekToLast() final override; |
231 | 0 | Env* env() const { return env_; } |
232 | 0 | void set_sequence(uint64_t s) { |
233 | 0 | sequence_ = s; |
234 | 0 | if (read_callback_) { |
235 | 0 | read_callback_->Refresh(s); |
236 | 0 | } |
237 | 0 | iter_.SetRangeDelReadSeqno(s); |
238 | 0 | } |
239 | 0 | void set_valid(bool v) { valid_ = v; } |
240 | | |
241 | | bool PrepareValue() override; |
242 | | |
243 | | void Prepare(const MultiScanArgs& scan_opts) override; |
244 | | Status ValidateScanOptions(const MultiScanArgs& multiscan_opts) const; |
245 | | |
246 | | private: |
247 | | DBIter(Env* _env, const ReadOptions& read_options, |
248 | | const ImmutableOptions& ioptions, |
249 | | const MutableCFOptions& mutable_cf_options, const Comparator* cmp, |
250 | | InternalIterator* iter, const Version* version, SequenceNumber s, |
251 | | bool arena_mode, ReadCallback* read_callback, |
252 | | ColumnFamilyHandleImpl* cfh, bool expose_blob_index, |
253 | | ReadOnlyMemTable* active_mem); |
254 | | |
255 | | class BlobReader { |
256 | | public: |
257 | | BlobReader(const Version* version, ReadTier read_tier, |
258 | | bool verify_checksums, bool fill_cache, |
259 | | Env::IOActivity io_activity) |
260 | 16.2k | : version_(version), |
261 | 16.2k | read_tier_(read_tier), |
262 | 16.2k | verify_checksums_(verify_checksums), |
263 | 16.2k | fill_cache_(fill_cache), |
264 | 16.2k | io_activity_(io_activity) {} |
265 | | |
266 | 0 | const Slice& GetBlobValue() const { return blob_value_; } |
267 | | Status RetrieveAndSetBlobValue(const Slice& user_key, |
268 | | const Slice& blob_index); |
269 | 48.6k | void ResetBlobValue() { blob_value_.Reset(); } |
270 | | |
271 | | private: |
272 | | PinnableSlice blob_value_; |
273 | | const Version* version_; |
274 | | ReadTier read_tier_; |
275 | | bool verify_checksums_; |
276 | | bool fill_cache_; |
277 | | Env::IOActivity io_activity_; |
278 | | }; |
279 | | |
280 | | // For all methods in this block: |
281 | | // PRE: iter_->Valid() && status_.ok() |
282 | | // Return false if there was an error; in that case status() is non-ok and |
283 | | // valid_ is false, and callers would usually stop what they were doing and return. |
284 | | bool ReverseToForward(); |
285 | | bool ReverseToBackward(); |
286 | | // Set saved_key_ to the seek key to target, with proper sequence number set. |
287 | | // It might get adjusted if the seek key is smaller than iterator lower bound. |
288 | | // target does not have timestamp. |
289 | | void SetSavedKeyToSeekTarget(const Slice& target); |
290 | | // Set saved_key_ to the seek key to target, with proper sequence number set. |
291 | | // It might get adjusted if the seek key is larger than iterator upper bound. |
292 | | // target does not have timestamp. |
293 | | void SetSavedKeyToSeekForPrevTarget(const Slice& target); |
294 | | bool FindValueForCurrentKey(); |
295 | | bool FindValueForCurrentKeyUsingSeek(); |
296 | | bool FindUserKeyBeforeSavedKey(); |
297 | | // If `skipping_saved_key` is true, the function will keep iterating until it |
298 | | // finds a user key that is larger than `saved_key_`. |
299 | | // If `prefix` is not null, the iterator needs to stop when all keys for the |
300 | | // prefix are exhausted and the iterator is set to invalid. |
301 | | bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); |
302 | | // Internal implementation of FindNextUserEntry(). |
303 | | bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); |
304 | | bool ParseKey(ParsedInternalKey* key); |
305 | | bool MergeValuesNewToOld(); |
306 | | |
307 | | // If prefix is not null, we need to set the iterator to invalid if no more |
308 | | // entry can be found within the prefix. |
309 | | void PrevInternal(const Slice* prefix); |
310 | | bool TooManyInternalKeysSkipped(bool increment = true); |
311 | | bool IsVisible(SequenceNumber sequence, const Slice& ts, |
312 | | bool* more_recent = nullptr); |
313 | | |
314 | | // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() |
315 | | // is called |
316 | 3.27k | void TempPinData() { |
317 | 3.27k | if (!pin_thru_lifetime_) { |
318 | 3.27k | pinned_iters_mgr_.StartPinning(); |
319 | 3.27k | } |
320 | 3.27k | } |
321 | | |
322 | | // Release blocks pinned by TempPinData() |
323 | 51.9k | void ReleaseTempPinnedData() { |
324 | 51.9k | if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) { |
325 | 88 | pinned_iters_mgr_.ReleasePinnedData(); |
326 | 88 | } |
327 | 51.9k | } |
328 | | |
329 | 42.7k | inline void ClearSavedValue() { |
330 | 42.7k | if (saved_value_.capacity() > 1048576) { |
331 | 0 | std::string empty; |
332 | 0 | swap(empty, saved_value_); |
333 | 42.7k | } else { |
334 | 42.7k | saved_value_.clear(); |
335 | 42.7k | } |
336 | 42.7k | } |
337 | | |
338 | 28.9k | inline void ResetInternalKeysSkippedCounter() { |
339 | 28.9k | local_stats_.skip_count_ += num_internal_keys_skipped_; |
340 | 28.9k | if (valid_) { |
341 | 2.14k | local_stats_.skip_count_--; |
342 | 2.14k | } |
343 | 28.9k | num_internal_keys_skipped_ = 0; |
344 | 28.9k | } |
345 | | |
346 | 8.76k | bool expect_total_order_inner_iter() { |
347 | 8.76k | assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr); |
348 | 8.76k | return expect_total_order_inner_iter_; |
349 | 8.76k | } |
350 | | |
351 | | // If a lower bound of timestamp is given by ReadOptions.iter_start_ts, we need |
352 | | // to return multiple versions of the same user key. We cannot skip an entry |
353 | | // just because its user key matches but its timestamp differs within the range. |
354 | 43.0k | inline int CompareKeyForSkip(const Slice& a, const Slice& b) { |
355 | 43.0k | return timestamp_lb_ != nullptr |
356 | 43.0k | ? user_comparator_.Compare(a, b) |
357 | 43.0k | : user_comparator_.CompareWithoutTimestamp(a, b); |
358 | 43.0k | } |
359 | | |
360 | 38.1k | void SetValueAndColumnsFromPlain(const Slice& slice) { |
361 | 38.1k | assert(value_.empty()); |
362 | 38.1k | assert(wide_columns_.empty()); |
363 | | |
364 | 38.1k | value_ = slice; |
365 | 38.1k | wide_columns_.emplace_back(kDefaultWideColumnName, slice); |
366 | 38.1k | } |
367 | | |
368 | | bool SetValueAndColumnsFromBlobImpl(const Slice& user_key, |
369 | | const Slice& blob_index); |
370 | | bool SetValueAndColumnsFromBlob(const Slice& user_key, |
371 | | const Slice& blob_index); |
372 | | |
373 | | bool SetValueAndColumnsFromEntity(Slice slice); |
374 | | |
375 | | bool SetValueAndColumnsFromMergeResult(const Status& merge_status, |
376 | | ValueType result_type); |
377 | | |
378 | 48.6k | void ResetValueAndColumns() { |
379 | 48.6k | value_.clear(); |
380 | 48.6k | wide_columns_.clear(); |
381 | 48.6k | } |
382 | | |
383 | 48.6k | void ResetBlobData() { |
384 | 48.6k | blob_reader_.ResetBlobValue(); |
385 | 48.6k | lazy_blob_index_.clear(); |
386 | 48.6k | is_blob_ = false; |
387 | 48.6k | } |
388 | | |
389 | | // The following methods perform the actual merge operation for the |
390 | | // no/plain/blob/wide-column base value cases. |
391 | | // If user-defined timestamp is enabled, `user_key` includes timestamp. |
392 | | bool MergeWithNoBaseValue(const Slice& user_key); |
393 | | bool MergeWithPlainBaseValue(const Slice& value, const Slice& user_key); |
394 | | bool MergeWithBlobBaseValue(const Slice& blob_index, const Slice& user_key); |
395 | | bool MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key); |
396 | | |
397 | 48.2k | bool PrepareValueInternal() { |
398 | 48.2k | if (!iter_.PrepareValue()) { |
399 | 0 | assert(!iter_.status().ok()); |
400 | 0 | valid_ = false; |
401 | 0 | return false; |
402 | 0 | } |
403 | | // ikey_ could change as BlockBasedTableIterator does Block cache |
404 | | // lookup and index_iter_ could point to different block resulting |
405 | | // in ikey_ pointing to wrong key. So ikey_ needs to be updated in |
406 | | // case of Seek/Next calls to point to right key again. |
407 | 48.2k | if (!ParseKey(&ikey_)) { |
408 | 0 | return false; |
409 | 0 | } |
410 | 48.2k | return true; |
411 | 48.2k | } |
412 | | |
413 | 28.9k | void MarkMemtableForFlushForAvgTrigger() { |
414 | 28.9k | if (avg_op_scan_flush_trigger_ && |
415 | 0 | mem_hidden_op_scanned_since_seek_ >= memtable_op_scan_flush_trigger_ && |
416 | 0 | mem_hidden_op_scanned_since_seek_ >= |
417 | 0 | static_cast<uint64_t>(iter_step_since_seek_) * |
418 | 0 | avg_op_scan_flush_trigger_) { |
419 | 0 | assert(memtable_op_scan_flush_trigger_ > 0); |
420 | 0 | active_mem_->MarkForFlush(); |
421 | 0 | avg_op_scan_flush_trigger_ = 0; |
422 | 0 | memtable_op_scan_flush_trigger_ = 0; |
423 | 0 | } |
424 | 28.9k | iter_step_since_seek_ = 1; |
425 | 28.9k | mem_hidden_op_scanned_since_seek_ = 0; |
426 | 28.9k | } |
427 | | |
428 | 13.7k | void MarkMemtableForFlushForPerOpTrigger(uint64_t& mem_hidden_op_scanned) { |
429 | 13.7k | if (memtable_op_scan_flush_trigger_ && |
430 | 0 | ikey_.sequence >= memtable_seqno_lb_) { |
431 | 0 | if (++mem_hidden_op_scanned >= memtable_op_scan_flush_trigger_) { |
432 | 0 | active_mem_->MarkForFlush(); |
433 | | // Turn off the flush trigger checks. |
434 | 0 | memtable_op_scan_flush_trigger_ = 0; |
435 | 0 | avg_op_scan_flush_trigger_ = 0; |
436 | 0 | } |
437 | 0 | if (avg_op_scan_flush_trigger_) { |
438 | 0 | ++mem_hidden_op_scanned_since_seek_; |
439 | 0 | } |
440 | 0 | } |
441 | 13.7k | } |
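To make the two triggers above concrete with hypothetical numbers: suppose memtable_op_scan_flush_trigger_ = 100 and avg_op_scan_flush_trigger_ = 10. MarkMemtableForFlushForPerOpTrigger() marks the active memtable for flush once the caller-maintained mem_hidden_op_scanned counter reaches 100 entries whose sequence number is at least memtable_seqno_lb_. MarkMemtableForFlushForAvgTrigger(), called for example from the destructor, fires only when mem_hidden_op_scanned_since_seek_ is at least both 100 and 10 * iter_step_since_seek_; after 20 iterator steps since the last seek that means at least 200 hidden memtable operations scanned.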
442 | | |
443 | | const SliceTransform* prefix_extractor_; |
444 | | Env* const env_; |
445 | | SystemClock* clock_; |
446 | | Logger* logger_; |
447 | | UserComparatorWrapper user_comparator_; |
448 | | const MergeOperator* const merge_operator_; |
449 | | IteratorWrapper iter_; |
450 | | BlobReader blob_reader_; |
451 | | ReadCallback* read_callback_; |
452 | | // Max visible sequence number. It is normally the snapshot seq unless we have |
453 | | // uncommitted data in db as in WriteUnCommitted. |
454 | | SequenceNumber sequence_; |
455 | | |
456 | | IterKey saved_key_; |
457 | | // Reusable internal key data structure. This is only used inside one function |
458 | | // and should not be used across functions. Reusing this object avoids the |
459 | | // overhead of constructing a new one on each call. |
460 | | ParsedInternalKey ikey_; |
461 | | |
462 | | // The approximate write time for the entry. It is deduced from the entry's |
463 | | // sequence number if the seqno to time mapping is available. For a |
464 | | // kTypeValuePreferredSeqno entry, this is the write time specified by the |
465 | | // user. |
466 | | uint64_t saved_write_unix_time_; |
467 | | std::string saved_value_; |
468 | | Slice pinned_value_; |
469 | | // for prefix seek mode to support prev() |
470 | | // Value of the default column |
471 | | Slice value_; |
472 | | // All columns (i.e. name-value pairs) |
473 | | WideColumns wide_columns_; |
474 | | Statistics* statistics_; |
475 | | uint64_t max_skip_; |
476 | | uint64_t max_skippable_internal_keys_; |
477 | | uint64_t num_internal_keys_skipped_; |
478 | | const Slice* iterate_lower_bound_; |
479 | | const Slice* iterate_upper_bound_; |
480 | | |
481 | | // The prefix of the seek key. It is only used when prefix_same_as_start_ |
482 | | // is true and prefix extractor is not null. In Next() or Prev(), current keys |
483 | | // will be checked against this prefix, so that the iterator can be |
484 | | // invalidated if the keys in this prefix have been exhausted. Set it using |
485 | | // SetUserKey() and use it using GetUserKey(). |
486 | | IterKey prefix_; |
487 | | |
488 | | Status status_; |
489 | | Slice lazy_blob_index_; |
490 | | |
491 | | // List of operands for merge operator. |
492 | | MergeContext merge_context_; |
493 | | LocalStatistics local_stats_; |
494 | | PinnedIteratorsManager pinned_iters_mgr_; |
495 | | ColumnFamilyHandleImpl* cfh_; |
496 | | const Slice* const timestamp_ub_; |
497 | | const Slice* const timestamp_lb_; |
498 | | const size_t timestamp_size_; |
499 | | std::string saved_timestamp_; |
500 | | std::optional<MultiScanArgs> scan_opts_; |
501 | | size_t scan_index_{0}; |
502 | | ReadOnlyMemTable* const active_mem_; |
503 | | SequenceNumber memtable_seqno_lb_; |
504 | | uint32_t memtable_op_scan_flush_trigger_; |
505 | | uint32_t avg_op_scan_flush_trigger_; |
506 | | uint32_t iter_step_since_seek_; |
507 | | uint32_t mem_hidden_op_scanned_since_seek_; |
508 | | Direction direction_; |
509 | | bool valid_; |
510 | | bool current_entry_is_merged_; |
511 | | // True if we know that the current entry's seqnum is 0. |
512 | | // This information implies that the next entry will be for a different |
513 | | // user key. |
514 | | bool is_key_seqnum_zero_; |
515 | | const bool prefix_same_as_start_; |
516 | | // Means that we will pin all data blocks we read as long as the Iterator |
517 | | // is not deleted; will be true if ReadOptions::pin_data is true |
518 | | const bool pin_thru_lifetime_; |
519 | | // Expect the inner iterator to maintain a total order. |
520 | | // prefix_extractor_ must be non-NULL if the value is false. |
521 | | const bool expect_total_order_inner_iter_; |
522 | | // Whether the iterator is allowed to expose blob references. Set to true when |
523 | | // the stacked BlobDB implementation is used, false otherwise. |
524 | | bool expose_blob_index_; |
525 | | bool allow_unprepared_value_; |
526 | | bool is_blob_; |
527 | | bool arena_mode_; |
528 | | }; |
529 | | } // namespace ROCKSDB_NAMESPACE |