/src/rocksdb/table/table_reader.h
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #pragma once |
11 | | #include <memory> |
12 | | |
13 | | #include "db/range_tombstone_fragmenter.h" |
14 | | #if USE_COROUTINES |
15 | | #include "folly/coro/Coroutine.h" |
16 | | #include "folly/coro/Task.h" |
17 | | #endif |
18 | | #include "rocksdb/slice_transform.h" |
19 | | #include "rocksdb/table_reader_caller.h" |
20 | | #include "table/get_context.h" |
21 | | #include "table/internal_iterator.h" |
22 | | #include "table/multiget_context.h" |
23 | | |
24 | | namespace ROCKSDB_NAMESPACE { |
25 | | |
26 | | class Iterator; |
27 | | struct ParsedInternalKey; |
28 | | class Slice; |
29 | | class Arena; |
30 | | struct ReadOptions; |
31 | | struct TableProperties; |
32 | | class GetContext; |
33 | | class MultiGetContext; |
34 | | |
35 | | // A Table (also referred to as SST) is a sorted map from strings to strings. |
36 | | // Tables are immutable and persistent. A Table may be safely accessed from |
37 | | // multiple threads without external synchronization. Table readers are used |
38 | | // for reading various types of table formats supported by RocksDB, including |
39 | | // the BlockBasedTable, PlainTable and CuckooTable formats. |
40 | | class TableReader { |
41 | | public: |
42 | 109k | virtual ~TableReader() {} |
43 | | |
44 | | // Returns a new iterator over the table contents. |
45 | | // The result of NewIterator() is initially invalid (caller must |
46 | | // call one of the Seek methods on the iterator before using it). |
47 | | // |
48 | | // read_options: Must outlive the returned iterator. |
49 | | // arena: If not null, the arena needs to be used to allocate the Iterator. |
50 | | // When destroying the iterator, the caller will not call "delete" |
51 | | // but Iterator::~Iterator() directly. The destructor needs to destroy |
52 | | // all state except what was allocated in the arena. |
53 | | // skip_filters: disables checking the bloom filters even if they exist. This |
54 | | // option is effective only for the block-based table format. |
55 | | // compaction_readahead_size: its value will only be used if caller == |
56 | | // kCompaction. |
57 | | virtual InternalIterator* NewIterator( |
58 | | const ReadOptions& read_options, const SliceTransform* prefix_extractor, |
59 | | Arena* arena, bool skip_filters, TableReaderCaller caller, |
60 | | size_t compaction_readahead_size = 0, |
61 | | bool allow_unprepared_value = false) = 0; |
62 | | |
63 | | // read_options.snapshot needs to outlive this call. Default: returns nullptr. |
64 | | virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( |
65 | 0 | const ReadOptions& /*read_options*/) { |
66 | 0 | return nullptr; |
67 | 0 | } |
68 | | |
69 | | virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( |
70 | 0 | SequenceNumber /* read_seqno */, const Slice* /* timestamp */) { |
71 | 0 | return nullptr; |
72 | 0 | } |
73 | | |
74 | | // Given a key, return an approximate byte offset in the file where |
75 | | // the data for that key begins (or would begin if the key were |
76 | | // present in the file). The returned value is in terms of file |
77 | | // bytes, and so includes effects like compression of the underlying data. |
78 | | // E.g., the approximate offset of the last key in the table will |
79 | | // be close to the file length. |
80 | | // TODO(peterd): Since this function is only used for approximate size |
81 | | // from beginning of file, reduce code duplication by removing this |
82 | | // function and letting ApproximateSize take optional start and end, so |
83 | | // that absolute start and end can be specified and optimized without |
84 | | // key / index work. |
85 | | virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options, |
86 | | const Slice& key, |
87 | | TableReaderCaller caller) = 0; |
88 | | |
89 | | // Given start and end keys, return the approximate data size in the file |
90 | | // between the keys. The returned value is in terms of file bytes, and so |
91 | | // includes effects like compression of the underlying data and applicable |
92 | | // portions of metadata including filters and indexes. Both keys are passed by |
93 | | // reference, so a null start or end cannot be expressed with this signature. |
94 | | virtual uint64_t ApproximateSize(const ReadOptions& read_options, |
95 | | const Slice& start, const Slice& end, |
96 | | TableReaderCaller caller) = 0; |
97 | | |
98 | | struct Anchor { |
99 | | Anchor(const Slice& _user_key, size_t _range_size) |
100 | 0 | : user_key(_user_key.ToStringView()), range_size(_range_size) {} |
101 | | std::string user_key; |
102 | | size_t range_size; |
103 | | }; |
104 | | |
105 | | // Tries to return approximately 128 anchor keys. The default implementation |
106 | | // returns NotSupported. The last anchor tends to be the largest key. |
107 | | virtual Status ApproximateKeyAnchors(const ReadOptions& /*read_options*/, |
108 | 0 | std::vector<Anchor>& /*anchors*/) { |
109 | 0 | return Status::NotSupported("ApproximateKeyAnchors() not supported."); |
110 | 0 | } |
111 | | |
112 | | // Set up the table for Compaction. Might change some parameters with |
113 | | // posix_fadvise |
114 | | virtual void SetupForCompaction() = 0; |
115 | | |
116 | | virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0; |
117 | | |
118 | | // Prepare work that can be done before the real Get(). No-op by default. |
119 | 2.21k | virtual void Prepare(const Slice& /*target*/) {} |
120 | | |
121 | | // Report an approximation of how much memory has been used. |
122 | | virtual size_t ApproximateMemoryUsage() const = 0; |
123 | | |
124 | | // Calls get_context->SaveValue() repeatedly, starting with |
125 | | // the entry found after a call to Seek(key), until it returns false. |
126 | | // May not make such a call if filter policy says that key is not present. |
127 | | // |
128 | | // get_context->MarkKeyMayExist needs to be called when it is configured to be |
129 | | // memory only and the key is not found in the block cache. |
130 | | // |
131 | | // readOptions: the options for the read. |
132 | | // key: the key to search for. |
133 | | // skip_filters: disables checking the bloom filters even if they exist. This |
134 | | // option is effective only for the block-based table format. |
135 | | virtual Status Get(const ReadOptions& readOptions, const Slice& key, |
136 | | GetContext* get_context, |
137 | | const SliceTransform* prefix_extractor, |
138 | | bool skip_filters = false) = 0; |
139 | | |
140 | | // Use bloom filters in the table file, if present, to filter out keys. The |
141 | | // mget_range will be updated to skip keys that get a negative result from |
142 | | // the filter lookup. The default implementation returns NotSupported. |
143 | | virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/, |
144 | | const SliceTransform* /*prefix_extractor*/, |
145 | 0 | MultiGetContext::Range* /*mget_range*/) { |
146 | 0 | return Status::NotSupported(); |
147 | 0 | } |
148 | | |
149 | | virtual void MultiGet(const ReadOptions& readOptions, |
150 | | const MultiGetContext::Range* mget_range, |
151 | | const SliceTransform* prefix_extractor, |
152 | 0 | bool skip_filters = false) { |
153 | 0 | for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) { |
154 | 0 | *iter->s = Get(readOptions, iter->ikey, iter->get_context, |
155 | 0 | prefix_extractor, skip_filters); |
156 | 0 | } |
157 | 0 | } |
158 | | |
159 | | #if USE_COROUTINES |
160 | | virtual folly::coro::Task<void> MultiGetCoroutine( |
161 | | const ReadOptions& readOptions, const MultiGetContext::Range* mget_range, |
162 | | const SliceTransform* prefix_extractor, bool skip_filters = false) { |
163 | | MultiGet(readOptions, mget_range, prefix_extractor, skip_filters); |
164 | | co_return; |
165 | | } |
166 | | #endif // USE_COROUTINES |
167 | | |
168 | | // Prefetch data corresponding to a given range of keys. |
169 | | // Typically this functionality is required for table implementations that |
170 | | // persist the data on a non-volatile storage medium like disk/SSD. |
171 | | virtual Status Prefetch(const ReadOptions& /* read_options */, |
172 | | const Slice* begin = nullptr, |
173 | 0 | const Slice* end = nullptr) { |
174 | 0 | (void)begin; |
175 | 0 | (void)end; |
176 | | // Default implementation is a no-op. |
177 | | // Subclasses should implement this functionality when applicable. |
178 | 0 | return Status::OK(); |
179 | 0 | } |
180 | | |
181 | | // Convert the db file to a human-readable form. Not supported by default. |
182 | | virtual Status DumpTable(WritableFile* /*out_file*/, |
183 | 0 | bool /*show_sequence_number_type*/ = false) { |
184 | 0 | return Status::NotSupported("DumpTable() not supported"); |
185 | 0 | } |
186 | | |
187 | | // Check whether there is corruption in this db file. Not supported by default. |
188 | | virtual Status VerifyChecksum(const ReadOptions& /*read_options*/, |
189 | | TableReaderCaller /*caller*/, |
190 | 0 | bool /*meta_blocks_only*/ = false) { |
191 | 0 | return Status::NotSupported("VerifyChecksum() not supported"); |
192 | 0 | } |
193 | | |
194 | | // Tell the reader that the file should now be obsolete, e.g. as a hint |
195 | | // to delete relevant cache entries on destruction. (It might not be safe |
196 | | // to "unpin" cache entries until destruction time.) NOTE: must be thread |
197 | | // safe because multiple table cache references might all mark this file as |
198 | | // obsolete when they are released (the last of which destroys this reader). |
199 | 0 | virtual void MarkObsolete(uint32_t /*uncache_aggressiveness*/) { |
200 | | // No-op by default. |
201 | 0 | } |
202 | | }; |
203 | | |
204 | | } // namespace ROCKSDB_NAMESPACE |