/src/rocksdb/db/table_cache.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #include "db/table_cache.h" |
11 | | |
12 | | #include "db/dbformat.h" |
13 | | #include "db/range_tombstone_fragmenter.h" |
14 | | #include "db/snapshot_impl.h" |
15 | | #include "db/version_edit.h" |
16 | | #include "file/file_util.h" |
17 | | #include "file/filename.h" |
18 | | #include "file/random_access_file_reader.h" |
19 | | #include "monitoring/perf_context_imp.h" |
20 | | #include "rocksdb/advanced_options.h" |
21 | | #include "rocksdb/statistics.h" |
22 | | #include "table/block_based/block_based_table_reader.h" |
23 | | #include "table/get_context.h" |
24 | | #include "table/internal_iterator.h" |
25 | | #include "table/iterator_wrapper.h" |
26 | | #include "table/multiget_context.h" |
27 | | #include "table/table_builder.h" |
28 | | #include "table/table_reader.h" |
29 | | #include "test_util/sync_point.h" |
30 | | #include "util/cast_util.h" |
31 | | #include "util/coding.h" |
32 | | #include "util/stop_watch.h" |
33 | | |
34 | | // Generate the regular and coroutine versions of some methods by |
35 | | // including table_cache_sync_and_async.h twice |
36 | | // Macros in the header will expand differently based on whether |
37 | | // WITH_COROUTINES or WITHOUT_COROUTINES is defined |
38 | | // clang-format off |
39 | | #define WITHOUT_COROUTINES |
40 | | #include "db/table_cache_sync_and_async.h" |
41 | | #undef WITHOUT_COROUTINES |
42 | | #define WITH_COROUTINES |
43 | | #include "db/table_cache_sync_and_async.h" |
44 | | #undef WITH_COROUTINES |
45 | | // clang-format on |
46 | | |
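For readers unfamiliar with this dual-expansion trick, below is a minimal, self-contained illustration of the idea. The EX_NAME macro, the Lookup function, and the inlined "shared body" section are hypothetical stand-ins, not RocksDB's actual macros or headers; in the real code the shared bodies live in table_cache_sync_and_async.h and are included twice as above.

// Illustration only: one function body expanded twice under different guard
// macros, producing a plain ("sync") name and a second, differently named flavor.
#include <iostream>

// ---- stand-in for a shared body header that carries no include guard ----
#define WITHOUT_COROUTINES
#ifdef WITHOUT_COROUTINES
#define EX_NAME(n) n  // first expansion keeps the plain name
#endif
int EX_NAME(Lookup)(int key) { return key * 2; }  // shared body, pass 1
#undef EX_NAME
#undef WITHOUT_COROUTINES

#define WITH_COROUTINES
#ifdef WITH_COROUTINES
#define EX_NAME(n) n##Async  // second expansion renames the function
#endif
int EX_NAME(Lookup)(int key) { return key * 2; }  // shared body, pass 2
#undef EX_NAME
#undef WITH_COROUTINES
// -------------------------------------------------------------------------

int main() {
  std::cout << Lookup(21) << " " << LookupAsync(21) << "\n";  // prints "42 42"
  return 0;
}
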
47 | | namespace ROCKSDB_NAMESPACE { |
48 | | |
49 | | namespace { |
50 | | |
51 | 8.96k | static Slice GetSliceForFileNumber(const uint64_t* file_number) { |
52 | 8.96k | return Slice(reinterpret_cast<const char*>(file_number), |
53 | 8.96k | sizeof(*file_number)); |
54 | 8.96k | } |
55 | | |
56 | | |
57 | 0 | void AppendVarint64(IterKey* key, uint64_t v) { |
58 | 0 | char buf[10]; |
59 | 0 | auto ptr = EncodeVarint64(buf, v); |
60 | 0 | key->TrimAppend(key->Size(), buf, ptr - buf); |
61 | 0 | } |
62 | | |
63 | | |
64 | | } // anonymous namespace |
65 | | |
66 | | const int kLoadConcurency = 128; |
67 | | |
68 | | TableCache::TableCache(const ImmutableOptions& ioptions, |
69 | | const FileOptions* file_options, Cache* const cache, |
70 | | BlockCacheTracer* const block_cache_tracer, |
71 | | const std::shared_ptr<IOTracer>& io_tracer, |
72 | | const std::string& db_session_id) |
73 | | : ioptions_(ioptions), |
74 | | file_options_(*file_options), |
75 | | cache_(cache), |
76 | | immortal_tables_(false), |
77 | | block_cache_tracer_(block_cache_tracer), |
78 | | loader_mutex_(kLoadConcurency), |
79 | | io_tracer_(io_tracer), |
80 | 9.02k | db_session_id_(db_session_id) { |
81 | 9.02k | if (ioptions_.row_cache) { |
82 | | // If the same cache is shared by multiple instances, we need to |
83 | | // disambiguate its entries. |
84 | 0 | PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId()); |
85 | 0 | } |
86 | 9.02k | } |
87 | | |
88 | 9.02k | TableCache::~TableCache() = default; |
89 | | |
90 | | Status TableCache::GetTableReader( |
91 | | const ReadOptions& ro, const FileOptions& file_options, |
92 | | const InternalKeyComparator& internal_comparator, |
93 | | const FileMetaData& file_meta, bool sequential_mode, |
94 | | uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, |
95 | | std::unique_ptr<TableReader>* table_reader, |
96 | | const std::shared_ptr<const SliceTransform>& prefix_extractor, |
97 | | bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, |
98 | 4.48k | size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { |
99 | 4.48k | std::string fname = TableFileName( |
100 | 4.48k | ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId()); |
101 | 4.48k | std::unique_ptr<FSRandomAccessFile> file; |
102 | 4.48k | FileOptions fopts = file_options; |
103 | 4.48k | fopts.temperature = file_temperature; |
104 | 4.48k | Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); |
105 | 4.48k | TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile", |
106 | 4.48k | const_cast<Status*>(&s)); |
107 | 4.48k | if (s.ok()) { |
108 | 4.48k | s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); |
109 | 4.48k | } |
110 | 4.48k | if (s.ok()) { |
111 | 4.48k | RecordTick(ioptions_.stats, NO_FILE_OPENS); |
112 | 4.48k | } else if (s.IsPathNotFound()) { |
113 | 0 | fname = Rocks2LevelTableFileName(fname); |
114 | | // If this file is also not found, we want to use the error message
115 | | // that contains the table file name, which is less confusing.
116 | 0 | Status temp_s = |
117 | 0 | PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); |
118 | 0 | if (temp_s.ok()) { |
119 | 0 | temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, |
120 | 0 | nullptr); |
121 | 0 | } |
122 | 0 | if (temp_s.ok()) { |
123 | 0 | RecordTick(ioptions_.stats, NO_FILE_OPENS); |
124 | 0 | s = temp_s; |
125 | 0 | } |
126 | 0 | } |
127 | | |
128 | 4.48k | if (s.ok()) { |
129 | 4.48k | if (!sequential_mode && ioptions_.advise_random_on_open) { |
130 | 4.48k | file->Hint(FSRandomAccessFile::kRandom); |
131 | 4.48k | } |
132 | 4.48k | if (ioptions_.default_temperature != Temperature::kUnknown && |
133 | 4.48k | file_temperature == Temperature::kUnknown) { |
134 | 0 | file_temperature = ioptions_.default_temperature; |
135 | 0 | } |
136 | 4.48k | StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); |
137 | 4.48k | std::unique_ptr<RandomAccessFileReader> file_reader( |
138 | 4.48k | new RandomAccessFileReader(std::move(file), fname, ioptions_.clock, |
139 | 4.48k | io_tracer_, ioptions_.stats, SST_READ_MICROS, |
140 | 4.48k | file_read_hist, ioptions_.rate_limiter.get(), |
141 | 4.48k | ioptions_.listeners, file_temperature, |
142 | 4.48k | level == ioptions_.num_levels - 1)); |
143 | 4.48k | UniqueId64x2 expected_unique_id; |
144 | 4.48k | if (ioptions_.verify_sst_unique_id_in_manifest) { |
145 | 4.48k | expected_unique_id = file_meta.unique_id; |
146 | 4.48k | } else { |
147 | 0 | expected_unique_id = kNullUniqueId64x2; // null ID == no verification |
148 | 0 | } |
149 | 4.48k | s = ioptions_.table_factory->NewTableReader( |
150 | 4.48k | ro, |
151 | 4.48k | TableReaderOptions( |
152 | 4.48k | ioptions_, prefix_extractor, file_options, internal_comparator, |
153 | 4.48k | block_protection_bytes_per_key, skip_filters, immortal_tables_, |
154 | 4.48k | false /* force_direct_prefetch */, level, block_cache_tracer_, |
155 | 4.48k | max_file_size_for_l0_meta_pin, db_session_id_, |
156 | 4.48k | file_meta.fd.GetNumber(), expected_unique_id, |
157 | 4.48k | file_meta.fd.largest_seqno, file_meta.tail_size, |
158 | 4.48k | file_meta.user_defined_timestamps_persisted), |
159 | 4.48k | std::move(file_reader), file_meta.fd.GetFileSize(), table_reader, |
160 | 4.48k | prefetch_index_and_filter_in_cache); |
161 | 4.48k | TEST_SYNC_POINT("TableCache::GetTableReader:0"); |
162 | 4.48k | } |
163 | 4.48k | return s; |
164 | 4.48k | } |
165 | | |
166 | 0 | Cache::Handle* TableCache::Lookup(Cache* cache, uint64_t file_number) { |
167 | 0 | Slice key = GetSliceForFileNumber(&file_number); |
168 | 0 | return cache->Lookup(key); |
169 | 0 | } |
170 | | |
171 | | Status TableCache::FindTable( |
172 | | const ReadOptions& ro, const FileOptions& file_options, |
173 | | const InternalKeyComparator& internal_comparator, |
174 | | const FileMetaData& file_meta, TypedHandle** handle, |
175 | | uint8_t block_protection_bytes_per_key, |
176 | | const std::shared_ptr<const SliceTransform>& prefix_extractor, |
177 | | const bool no_io, HistogramImpl* file_read_hist, bool skip_filters, |
178 | | int level, bool prefetch_index_and_filter_in_cache, |
179 | 8.96k | size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { |
180 | 8.96k | PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); |
181 | 8.96k | uint64_t number = file_meta.fd.GetNumber(); |
182 | 8.96k | Slice key = GetSliceForFileNumber(&number); |
183 | 8.96k | *handle = cache_.Lookup(key); |
184 | 8.96k | TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0", |
185 | 8.96k | const_cast<bool*>(&no_io)); |
186 | | |
187 | 8.96k | if (*handle == nullptr) { |
188 | 4.48k | if (no_io) { |
189 | 0 | return Status::Incomplete("Table not found in table_cache, no_io is set"); |
190 | 0 | } |
191 | 4.48k | MutexLock load_lock(&loader_mutex_.Get(key)); |
192 | | // We check the cache again under the loading mutex
193 | 4.48k | *handle = cache_.Lookup(key); |
194 | 4.48k | if (*handle != nullptr) { |
195 | 0 | return Status::OK(); |
196 | 0 | } |
197 | | |
198 | 4.48k | std::unique_ptr<TableReader> table_reader; |
199 | 4.48k | Status s = GetTableReader(ro, file_options, internal_comparator, file_meta, |
200 | 4.48k | false /* sequential mode */, |
201 | 4.48k | block_protection_bytes_per_key, file_read_hist, |
202 | 4.48k | &table_reader, prefix_extractor, skip_filters, |
203 | 4.48k | level, prefetch_index_and_filter_in_cache, |
204 | 4.48k | max_file_size_for_l0_meta_pin, file_temperature); |
205 | 4.48k | if (!s.ok()) { |
206 | 0 | assert(table_reader == nullptr); |
207 | 0 | RecordTick(ioptions_.stats, NO_FILE_ERRORS); |
208 | | // We do not cache error results so that if the error is transient, |
209 | | // or somebody repairs the file, we recover automatically. |
210 | 4.48k | } else { |
211 | 4.48k | s = cache_.Insert(key, table_reader.get(), 1, handle); |
212 | 4.48k | if (s.ok()) { |
213 | | // Release ownership of table reader. |
214 | 4.48k | table_reader.release(); |
215 | 4.48k | } |
216 | 4.48k | } |
217 | 4.48k | return s; |
218 | 4.48k | } |
219 | 4.48k | return Status::OK(); |
220 | 8.96k | } |
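
FindTable above is a double-checked lookup: a plain cache probe first, then a re-check under one of kLoadConcurency (128) striped loader mutexes, so that concurrent misses on the same file open it exactly once while misses on different files can load in parallel. The following is a minimal sketch of that pattern under assumed stand-in types (SimpleCache and FindOrLoad are illustrative, not RocksDB's typed cache interface):

#include <array>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for the table cache; string values stand in for TableReaders.
struct SimpleCache {
  std::mutex mu;
  std::unordered_map<uint64_t, std::shared_ptr<std::string>> map;
  std::shared_ptr<std::string> Lookup(uint64_t key) {
    std::lock_guard<std::mutex> l(mu);
    auto it = map.find(key);
    return it == map.end() ? nullptr : it->second;
  }
  void Insert(uint64_t key, std::shared_ptr<std::string> v) {
    std::lock_guard<std::mutex> l(mu);
    map.emplace(key, std::move(v));
  }
};

constexpr int kStripes = 128;  // mirrors kLoadConcurency
std::array<std::mutex, kStripes> loader_mutex;

std::shared_ptr<std::string> FindOrLoad(SimpleCache& cache,
                                        uint64_t file_number) {
  if (auto h = cache.Lookup(file_number)) {
    return h;  // fast path: already cached
  }
  // Serialize loads of the same file; unrelated files usually hit other stripes.
  std::lock_guard<std::mutex> load_lock(loader_mutex[file_number % kStripes]);
  if (auto h = cache.Lookup(file_number)) {
    return h;  // another thread finished the load while we waited
  }
  auto reader = std::make_shared<std::string>(
      "table reader for file #" + std::to_string(file_number));
  cache.Insert(file_number, reader);  // only one thread reaches this per file
  return reader;
}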
221 | | |
222 | | InternalIterator* TableCache::NewIterator( |
223 | | const ReadOptions& options, const FileOptions& file_options, |
224 | | const InternalKeyComparator& icomparator, const FileMetaData& file_meta, |
225 | | RangeDelAggregator* range_del_agg, |
226 | | const std::shared_ptr<const SliceTransform>& prefix_extractor, |
227 | | TableReader** table_reader_ptr, HistogramImpl* file_read_hist, |
228 | | TableReaderCaller caller, Arena* arena, bool skip_filters, int level, |
229 | | size_t max_file_size_for_l0_meta_pin, |
230 | | const InternalKey* smallest_compaction_key, |
231 | | const InternalKey* largest_compaction_key, bool allow_unprepared_value, |
232 | | uint8_t block_protection_bytes_per_key, const SequenceNumber* read_seqno, |
233 | 8.96k | std::unique_ptr<TruncatedRangeDelIterator>* range_del_iter) { |
234 | 8.96k | PERF_TIMER_GUARD(new_table_iterator_nanos); |
235 | | |
236 | 8.96k | Status s; |
237 | 8.96k | TableReader* table_reader = nullptr; |
238 | 8.96k | TypedHandle* handle = nullptr; |
239 | 8.96k | if (table_reader_ptr != nullptr) { |
240 | 0 | *table_reader_ptr = nullptr; |
241 | 0 | } |
242 | 8.96k | bool for_compaction = caller == TableReaderCaller::kCompaction; |
243 | 8.96k | auto& fd = file_meta.fd; |
244 | 8.96k | table_reader = fd.table_reader; |
245 | 8.96k | if (table_reader == nullptr) { |
246 | 4.48k | s = FindTable(options, file_options, icomparator, file_meta, &handle, |
247 | 4.48k | block_protection_bytes_per_key, prefix_extractor, |
248 | 4.48k | options.read_tier == kBlockCacheTier /* no_io */, |
249 | 4.48k | file_read_hist, skip_filters, level, |
250 | 4.48k | true /* prefetch_index_and_filter_in_cache */, |
251 | 4.48k | max_file_size_for_l0_meta_pin, file_meta.temperature); |
252 | 4.48k | if (s.ok()) { |
253 | 4.48k | table_reader = cache_.Value(handle); |
254 | 4.48k | } |
255 | 4.48k | } |
256 | 8.96k | InternalIterator* result = nullptr; |
257 | 8.96k | if (s.ok()) { |
258 | 8.96k | if (options.table_filter && |
259 | 8.96k | !options.table_filter(*table_reader->GetTableProperties())) { |
260 | 0 | result = NewEmptyInternalIterator<Slice>(arena); |
261 | 8.96k | } else { |
262 | 8.96k | result = table_reader->NewIterator( |
263 | 8.96k | options, prefix_extractor.get(), arena, skip_filters, caller, |
264 | 8.96k | file_options.compaction_readahead_size, allow_unprepared_value); |
265 | 8.96k | } |
266 | 8.96k | if (handle != nullptr) { |
267 | 4.48k | cache_.RegisterReleaseAsCleanup(handle, *result); |
268 | 4.48k | handle = nullptr; // prevent from releasing below |
269 | 4.48k | } |
270 | | |
271 | 8.96k | if (for_compaction) { |
272 | 0 | table_reader->SetupForCompaction(); |
273 | 0 | } |
274 | 8.96k | if (table_reader_ptr != nullptr) { |
275 | 0 | *table_reader_ptr = table_reader; |
276 | 0 | } |
277 | 8.96k | } |
278 | 8.96k | if (s.ok() && !options.ignore_range_deletions) { |
279 | 8.96k | if (range_del_iter != nullptr) { |
280 | 4.48k | auto new_range_del_iter = |
281 | 4.48k | read_seqno ? table_reader->NewRangeTombstoneIterator( |
282 | 0 | *read_seqno, options.timestamp) |
283 | 4.48k | : table_reader->NewRangeTombstoneIterator(options); |
284 | 4.48k | if (new_range_del_iter == nullptr || new_range_del_iter->empty()) { |
285 | 1.57k | delete new_range_del_iter; |
286 | 1.57k | *range_del_iter = nullptr; |
287 | 2.90k | } else { |
288 | 2.90k | *range_del_iter = std::make_unique<TruncatedRangeDelIterator>( |
289 | 2.90k | std::unique_ptr<FragmentedRangeTombstoneIterator>( |
290 | 2.90k | new_range_del_iter), |
291 | 2.90k | &icomparator, &file_meta.smallest, &file_meta.largest); |
292 | 2.90k | } |
293 | 4.48k | } |
294 | 8.96k | if (range_del_agg != nullptr) { |
295 | 0 | if (range_del_agg->AddFile(fd.GetNumber())) { |
296 | 0 | std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter( |
297 | 0 | static_cast<FragmentedRangeTombstoneIterator*>( |
298 | 0 | table_reader->NewRangeTombstoneIterator(options))); |
299 | 0 | if (new_range_del_iter != nullptr) { |
300 | 0 | s = new_range_del_iter->status(); |
301 | 0 | } |
302 | 0 | if (s.ok()) { |
303 | 0 | const InternalKey* smallest = &file_meta.smallest; |
304 | 0 | const InternalKey* largest = &file_meta.largest; |
305 | 0 | if (smallest_compaction_key != nullptr) { |
306 | 0 | smallest = smallest_compaction_key; |
307 | 0 | } |
308 | 0 | if (largest_compaction_key != nullptr) { |
309 | 0 | largest = largest_compaction_key; |
310 | 0 | } |
311 | 0 | range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest, |
312 | 0 | largest); |
313 | 0 | } |
314 | 0 | } |
315 | 0 | } |
316 | 8.96k | } |
317 | | |
318 | 8.96k | if (handle != nullptr) { |
319 | 0 | cache_.Release(handle); |
320 | 0 | } |
321 | 8.96k | if (!s.ok()) { |
322 | 0 | assert(result == nullptr); |
323 | 0 | result = NewErrorInternalIterator<Slice>(s, arena); |
324 | 0 | } |
325 | 8.96k | return result; |
326 | 8.96k | } |
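
Note how NewIterator does not release the cache handle itself on success: RegisterReleaseAsCleanup ties the handle's release to the returned iterator's destruction, so the TableReader stays pinned exactly as long as the iterator lives. A self-contained sketch of that idea follows, with illustrative stand-in types (FakeCache, FakeHandle, SketchIterator), not RocksDB's Cleanable or Cache API:

#include <functional>
#include <vector>

// Stand-in for a cache handle holding one reference on a cached object.
struct FakeHandle {
  int refs = 1;
};

struct FakeCache {
  void Release(FakeHandle* h) {
    if (--h->refs == 0) {
      delete h;
    }
  }
};

// Minimal "Cleanable": runs registered callbacks when it is destroyed.
class SketchIterator {
 public:
  ~SketchIterator() {
    for (auto& fn : cleanups_) {
      fn();
    }
  }
  void RegisterCleanup(std::function<void()> fn) {
    cleanups_.push_back(std::move(fn));
  }

 private:
  std::vector<std::function<void()>> cleanups_;
};

int main() {
  FakeCache cache;
  auto* handle = new FakeHandle();
  {
    SketchIterator it;
    // Hand the pinned handle to the iterator: it is released (and the cached
    // object becomes evictable) only when the iterator goes out of scope.
    it.RegisterCleanup([&cache, handle] { cache.Release(handle); });
  }  // handle released here
  return 0;
}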
327 | | |
328 | | Status TableCache::GetRangeTombstoneIterator( |
329 | | const ReadOptions& options, |
330 | | const InternalKeyComparator& internal_comparator, |
331 | | const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, |
332 | 0 | std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) { |
333 | 0 | assert(out_iter); |
334 | 0 | const FileDescriptor& fd = file_meta.fd; |
335 | 0 | Status s; |
336 | 0 | TableReader* t = fd.table_reader; |
337 | 0 | TypedHandle* handle = nullptr; |
338 | 0 | if (t == nullptr) { |
339 | 0 | s = FindTable(options, file_options_, internal_comparator, file_meta, |
340 | 0 | &handle, block_protection_bytes_per_key); |
341 | 0 | if (s.ok()) { |
342 | 0 | t = cache_.Value(handle); |
343 | 0 | } |
344 | 0 | } |
345 | 0 | if (s.ok()) { |
346 | | // Note: NewRangeTombstoneIterator could return nullptr |
347 | 0 | out_iter->reset(t->NewRangeTombstoneIterator(options)); |
348 | 0 | } |
349 | 0 | if (handle) { |
350 | 0 | if (*out_iter) { |
351 | 0 | cache_.RegisterReleaseAsCleanup(handle, **out_iter); |
352 | 0 | } else { |
353 | 0 | cache_.Release(handle); |
354 | 0 | } |
355 | 0 | } |
356 | 0 | return s; |
357 | 0 | } |
358 | | |
359 | | uint64_t TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, |
360 | | const FileDescriptor& fd, |
361 | | const Slice& internal_key, |
362 | | GetContext* get_context, |
363 | 0 | IterKey& row_cache_key) { |
364 | 0 | uint64_t fd_number = fd.GetNumber(); |
365 | | // We use the user key as the cache key instead of the internal key;
366 | | // otherwise the whole cache would be invalidated every time the
367 | | // sequence number increases. However, to support caching snapshot
368 | | // reads, we append a sequence number (incremented by 1 to
369 | | // distinguish it from 0), rather than the internal key's seqno,
370 | | // to determine row cache entry visibility.
371 | | // If the snapshot is larger than the largest seqno in the file, |
372 | | // all data should be exposed to the snapshot, so we treat it |
373 | | // the same as there is no snapshot. The exception is that if |
374 | | // a seq-checking callback is registered, some internal keys |
375 | | // may still be filtered out. |
376 | 0 | uint64_t cache_entry_seq_no = 0; |
377 | | |
378 | | // Maybe we can include the whole file if snapshot == fd.largest_seqno.
379 | 0 | if (options.snapshot != nullptr && |
380 | 0 | (get_context->has_callback() || |
381 | 0 | static_cast_with_check<const SnapshotImpl>(options.snapshot) |
382 | 0 | ->GetSequenceNumber() <= fd.largest_seqno)) { |
383 | | // We should consider using options.snapshot->GetSequenceNumber()
384 | | // instead of GetInternalKeySeqno(k), which would make the code
385 | | // easier to understand.
386 | 0 | cache_entry_seq_no = 1 + GetInternalKeySeqno(internal_key); |
387 | 0 | } |
388 | | |
389 | | // Compute row cache key. |
390 | 0 | row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), |
391 | 0 | row_cache_id_.size()); |
392 | 0 | AppendVarint64(&row_cache_key, fd_number); |
393 | 0 | AppendVarint64(&row_cache_key, cache_entry_seq_no); |
394 | | |
395 | | // Provide a sequence number for callback checking on cache hit. |
396 | | // As cache_entry_seq_no starts at 1, decrease its value by 1 to get
397 | | // a sequence number aligned with the get context's logic.
398 | 0 | return cache_entry_seq_no == 0 ? 0 : cache_entry_seq_no - 1; |
399 | 0 | } |
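
Putting the pieces above together, the row cache key is the per-TableCache row_cache_id_, followed by the varint-encoded file number and the visibility seqno computed here (0, or a snapshot-related seqno plus 1), with the user key appended later in GetFromRowCache. A sketch of that layout, using a hand-rolled varint helper; BuildRowCacheKey and AppendVarint64Sketch are hypothetical names introduced only for illustration:

#include <cstdint>
#include <string>

// Illustrative varint append (the same base-128 encoding PutVarint64 uses).
void AppendVarint64Sketch(std::string* out, uint64_t v) {
  while (v >= 0x80) {
    out->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  out->push_back(static_cast<char>(v));
}

std::string BuildRowCacheKey(const std::string& row_cache_id,
                             uint64_t file_number, uint64_t cache_entry_seq_no,
                             const std::string& user_key) {
  std::string key = row_cache_id;                  // per-TableCache prefix
  AppendVarint64Sketch(&key, file_number);         // which SST file
  AppendVarint64Sketch(&key, cache_entry_seq_no);  // 0, or snapshot seqno + 1
  key.append(user_key);                            // appended in GetFromRowCache
  return key;
}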
400 | | |
401 | | bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, |
402 | | size_t prefix_size, GetContext* get_context, |
403 | 0 | Status* read_status, SequenceNumber seq_no) { |
404 | 0 | bool found = false; |
405 | |
406 | 0 | row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); |
407 | 0 | RowCacheInterface row_cache{ioptions_.row_cache.get()}; |
408 | 0 | if (auto row_handle = row_cache.Lookup(row_cache_key.GetUserKey())) { |
409 | | // Cleanable routine to release the cache entry |
410 | 0 | Cleanable value_pinner; |
411 | | // If we get here, the value is located in the cache.
412 | | // found_row_cache_entry points to the value in the cache,
413 | | // and value_pinner has the cleanup procedure for the cached entry.
414 | | // After replayGetContextLog() returns, get_context.pinnable_slice_ |
415 | | // will point to cache entry buffer (or a copy based on that) and |
416 | | // cleanup routine under value_pinner will be delegated to |
417 | | // get_context.pinnable_slice_. Cache entry is released when |
418 | | // get_context.pinnable_slice_ is reset. |
419 | 0 | row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner); |
420 | | // On a row cache hit, since the cache key is the same as row_cache_key,
421 | | // we can use row_cache_key's seqno to construct the InternalKey.
422 | 0 | *read_status = replayGetContextLog(*row_cache.Value(row_handle), user_key, |
423 | 0 | get_context, &value_pinner, seq_no); |
424 | 0 | RecordTick(ioptions_.stats, ROW_CACHE_HIT); |
425 | 0 | found = true; |
426 | 0 | } else { |
427 | 0 | RecordTick(ioptions_.stats, ROW_CACHE_MISS); |
428 | 0 | } |
429 | 0 | return found; |
430 | 0 | } |
431 | | |
432 | | Status TableCache::Get( |
433 | | const ReadOptions& options, |
434 | | const InternalKeyComparator& internal_comparator, |
435 | | const FileMetaData& file_meta, const Slice& k, GetContext* get_context, |
436 | | uint8_t block_protection_bytes_per_key, |
437 | | const std::shared_ptr<const SliceTransform>& prefix_extractor, |
438 | | HistogramImpl* file_read_hist, bool skip_filters, int level, |
439 | 0 | size_t max_file_size_for_l0_meta_pin) { |
440 | 0 | auto& fd = file_meta.fd; |
441 | 0 | std::string* row_cache_entry = nullptr; |
442 | 0 | bool done = false; |
443 | 0 | IterKey row_cache_key; |
444 | 0 | std::string row_cache_entry_buffer; |
445 | | |
446 | | // Check row cache if enabled. |
447 | | // Reuse row_cache_key sequence number when row cache hits. |
448 | 0 | Status s; |
449 | 0 | if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { |
450 | 0 | auto user_key = ExtractUserKey(k); |
451 | 0 | uint64_t cache_entry_seq_no = |
452 | 0 | CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); |
453 | 0 | done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), |
454 | 0 | get_context, &s, cache_entry_seq_no); |
455 | 0 | if (!done) { |
456 | 0 | row_cache_entry = &row_cache_entry_buffer; |
457 | 0 | } |
458 | 0 | } |
459 | 0 | TableReader* t = fd.table_reader; |
460 | 0 | TypedHandle* handle = nullptr; |
461 | 0 | if (s.ok() && !done) { |
462 | 0 | if (t == nullptr) { |
463 | 0 | s = FindTable(options, file_options_, internal_comparator, file_meta, |
464 | 0 | &handle, block_protection_bytes_per_key, prefix_extractor, |
465 | 0 | options.read_tier == kBlockCacheTier /* no_io */, |
466 | 0 | file_read_hist, skip_filters, level, |
467 | 0 | true /* prefetch_index_and_filter_in_cache */, |
468 | 0 | max_file_size_for_l0_meta_pin, file_meta.temperature); |
469 | 0 | if (s.ok()) { |
470 | 0 | t = cache_.Value(handle); |
471 | 0 | } |
472 | 0 | } |
473 | 0 | SequenceNumber* max_covering_tombstone_seq = |
474 | 0 | get_context->max_covering_tombstone_seq(); |
475 | 0 | if (s.ok() && max_covering_tombstone_seq != nullptr && |
476 | 0 | !options.ignore_range_deletions) { |
477 | 0 | std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( |
478 | 0 | t->NewRangeTombstoneIterator(options)); |
479 | 0 | if (range_del_iter != nullptr) { |
480 | 0 | SequenceNumber seq = |
481 | 0 | range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)); |
482 | 0 | if (seq > *max_covering_tombstone_seq) { |
483 | 0 | *max_covering_tombstone_seq = seq; |
484 | 0 | if (get_context->NeedTimestamp()) { |
485 | 0 | get_context->SetTimestampFromRangeTombstone( |
486 | 0 | range_del_iter->timestamp()); |
487 | 0 | } |
488 | 0 | } |
489 | 0 | } |
490 | 0 | } |
491 | 0 | if (s.ok()) { |
492 | 0 | get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. |
493 | 0 | s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); |
494 | 0 | get_context->SetReplayLog(nullptr); |
495 | 0 | } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { |
496 | | // Couldn't find table in cache and couldn't open it because of no_io. |
497 | 0 | get_context->MarkKeyMayExist(); |
498 | 0 | done = true; |
499 | 0 | } |
500 | 0 | } |
501 | | |
502 | | // Put the replay log in row cache only if something was found. |
503 | 0 | if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) { |
504 | 0 | RowCacheInterface row_cache{ioptions_.row_cache.get()}; |
505 | 0 | size_t charge = row_cache_entry->capacity() + sizeof(std::string); |
506 | 0 | auto row_ptr = new std::string(std::move(*row_cache_entry)); |
507 | 0 | Status rcs = row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge); |
508 | 0 | if (!rcs.ok()) { |
509 | | // If row cache is full, it's OK to continue, but we keep ownership of |
510 | | // row_ptr. |
511 | 0 | delete row_ptr; |
512 | 0 | } |
513 | 0 | } |
514 | |
515 | 0 | if (handle != nullptr) { |
516 | 0 | cache_.Release(handle); |
517 | 0 | } |
518 | 0 | return s; |
519 | 0 | } |
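
The row cache insert at the end of Get charges the entry by the replay-log string's capacity plus the string header, and the caller keeps ownership of the heap allocation if the insert is rejected (for example when the cache is full). A minimal sketch of that ownership rule, with TinyRowCache as a hypothetical stand-in for the typed row cache interface:

#include <cstddef>
#include <memory>
#include <string>

struct InsertStatus {
  bool ok;
};

// Hypothetical cache that rejects entries whose charge exceeds a fixed limit.
struct TinyRowCache {
  std::size_t max_charge = 1024;
  std::unique_ptr<std::string> owned;
  InsertStatus Insert(const std::string& /*key*/, std::string* value,
                      std::size_t charge) {
    if (charge > max_charge) {
      return {false};  // "cache full": caller keeps ownership of value
    }
    owned.reset(value);  // cache takes ownership on success
    return {true};
  }
};

void PutReplayLogInRowCache(TinyRowCache& cache, const std::string& key,
                            std::string&& replay_log) {
  auto* row_ptr = new std::string(std::move(replay_log));
  std::size_t charge = row_ptr->capacity() + sizeof(std::string);
  if (!cache.Insert(key, row_ptr, charge).ok) {
    // Insert rejected: we still own row_ptr, so free it ourselves.
    delete row_ptr;
  }
}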
520 | | |
521 | | void TableCache::UpdateRangeTombstoneSeqnums( |
522 | | const ReadOptions& options, TableReader* t, |
523 | 0 | MultiGetContext::Range& table_range) { |
524 | 0 | std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( |
525 | 0 | t->NewRangeTombstoneIterator(options)); |
526 | 0 | if (range_del_iter != nullptr) { |
527 | 0 | for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { |
528 | 0 | SequenceNumber* max_covering_tombstone_seq = |
529 | 0 | iter->get_context->max_covering_tombstone_seq(); |
530 | 0 | SequenceNumber seq = |
531 | 0 | range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts); |
532 | 0 | if (seq > *max_covering_tombstone_seq) { |
533 | 0 | *max_covering_tombstone_seq = seq; |
534 | 0 | if (iter->get_context->NeedTimestamp()) { |
535 | 0 | iter->get_context->SetTimestampFromRangeTombstone( |
536 | 0 | range_del_iter->timestamp()); |
537 | 0 | } |
538 | 0 | } |
539 | 0 | } |
540 | 0 | } |
541 | 0 | } |
542 | | |
543 | | Status TableCache::MultiGetFilter( |
544 | | const ReadOptions& options, |
545 | | const InternalKeyComparator& internal_comparator, |
546 | | const FileMetaData& file_meta, |
547 | | const std::shared_ptr<const SliceTransform>& prefix_extractor, |
548 | | HistogramImpl* file_read_hist, int level, |
549 | | MultiGetContext::Range* mget_range, TypedHandle** table_handle, |
550 | 0 | uint8_t block_protection_bytes_per_key) { |
551 | 0 | auto& fd = file_meta.fd; |
552 | 0 | IterKey row_cache_key; |
553 | 0 | std::string row_cache_entry_buffer; |
554 | | |
555 | | // Check if we need to use the row cache. If yes, then we cannot do the |
556 | | // filtering here, since the filtering needs to happen after the row cache |
557 | | // lookup. |
558 | 0 | KeyContext& first_key = *mget_range->begin(); |
559 | 0 | if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) { |
560 | 0 | return Status::NotSupported(); |
561 | 0 | } |
562 | 0 | Status s; |
563 | 0 | TableReader* t = fd.table_reader; |
564 | 0 | TypedHandle* handle = nullptr; |
565 | 0 | MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(), |
566 | 0 | mget_range->end()); |
567 | 0 | if (t == nullptr) { |
568 | 0 | s = FindTable(options, file_options_, internal_comparator, file_meta, |
569 | 0 | &handle, block_protection_bytes_per_key, prefix_extractor, |
570 | 0 | options.read_tier == kBlockCacheTier /* no_io */, |
571 | 0 | file_read_hist, |
572 | 0 | /*skip_filters=*/false, level, |
573 | 0 | true /* prefetch_index_and_filter_in_cache */, |
574 | 0 | /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); |
575 | 0 | if (s.ok()) { |
576 | 0 | t = cache_.Value(handle); |
577 | 0 | } |
578 | 0 | *table_handle = handle; |
579 | 0 | } |
580 | 0 | if (s.ok()) { |
581 | 0 | s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range); |
582 | 0 | } |
583 | 0 | if (s.ok() && !options.ignore_range_deletions) { |
584 | | // Update the range tombstone sequence numbers for the keys here |
585 | | // as TableCache::MultiGet may or may not be called, and even if it |
586 | | // is, it may be called with fewer keys in the range due to filtering.
587 | 0 | UpdateRangeTombstoneSeqnums(options, t, tombstone_range); |
588 | 0 | } |
589 | 0 | if (mget_range->empty() && handle) { |
590 | 0 | cache_.Release(handle); |
591 | 0 | *table_handle = nullptr; |
592 | 0 | } |
593 | |
594 | 0 | return s; |
595 | 0 | } |
596 | | |
597 | | Status TableCache::GetTableProperties( |
598 | | const FileOptions& file_options, const ReadOptions& read_options, |
599 | | const InternalKeyComparator& internal_comparator, |
600 | | const FileMetaData& file_meta, |
601 | | std::shared_ptr<const TableProperties>* properties, |
602 | | uint8_t block_protection_bytes_per_key, |
603 | 4.48k | const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) { |
604 | 4.48k | auto table_reader = file_meta.fd.table_reader; |
605 | | // Has the table already been pre-loaded?
606 | 4.48k | if (table_reader) { |
607 | 4.48k | *properties = table_reader->GetTableProperties(); |
608 | | |
609 | 4.48k | return Status::OK(); |
610 | 4.48k | } |
611 | | |
612 | 0 | TypedHandle* table_handle = nullptr; |
613 | 0 | Status s = FindTable(read_options, file_options, internal_comparator, |
614 | 0 | file_meta, &table_handle, block_protection_bytes_per_key, |
615 | 0 | prefix_extractor, no_io); |
616 | 0 | if (!s.ok()) { |
617 | 0 | return s; |
618 | 0 | } |
619 | 0 | assert(table_handle); |
620 | 0 | auto table = cache_.Value(table_handle); |
621 | 0 | *properties = table->GetTableProperties(); |
622 | 0 | cache_.Release(table_handle); |
623 | 0 | return s; |
624 | 0 | } |
625 | | |
626 | | Status TableCache::ApproximateKeyAnchors( |
627 | | const ReadOptions& ro, const InternalKeyComparator& internal_comparator, |
628 | | const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, |
629 | 0 | std::vector<TableReader::Anchor>& anchors) { |
630 | 0 | Status s; |
631 | 0 | TableReader* t = file_meta.fd.table_reader; |
632 | 0 | TypedHandle* handle = nullptr; |
633 | 0 | if (t == nullptr) { |
634 | 0 | s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle, |
635 | 0 | block_protection_bytes_per_key); |
636 | 0 | if (s.ok()) { |
637 | 0 | t = cache_.Value(handle); |
638 | 0 | } |
639 | 0 | } |
640 | 0 | if (s.ok() && t != nullptr) { |
641 | 0 | s = t->ApproximateKeyAnchors(ro, anchors); |
642 | 0 | } |
643 | 0 | if (handle != nullptr) { |
644 | 0 | cache_.Release(handle); |
645 | 0 | } |
646 | 0 | return s; |
647 | 0 | } |
648 | | |
649 | | size_t TableCache::GetMemoryUsageByTableReader( |
650 | | const FileOptions& file_options, const ReadOptions& read_options, |
651 | | const InternalKeyComparator& internal_comparator, |
652 | | const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, |
653 | 0 | const std::shared_ptr<const SliceTransform>& prefix_extractor) { |
654 | 0 | auto table_reader = file_meta.fd.table_reader; |
655 | | // table already been pre-loaded? |
656 | 0 | if (table_reader) { |
657 | 0 | return table_reader->ApproximateMemoryUsage(); |
658 | 0 | } |
659 | | |
660 | 0 | TypedHandle* table_handle = nullptr; |
661 | 0 | Status s = FindTable(read_options, file_options, internal_comparator, |
662 | 0 | file_meta, &table_handle, block_protection_bytes_per_key, |
663 | 0 | prefix_extractor, true /* no_io */); |
664 | 0 | if (!s.ok()) { |
665 | 0 | return 0; |
666 | 0 | } |
667 | 0 | assert(table_handle); |
668 | 0 | auto table = cache_.Value(table_handle); |
669 | 0 | auto ret = table->ApproximateMemoryUsage(); |
670 | 0 | cache_.Release(table_handle); |
671 | 0 | return ret; |
672 | 0 | } |
673 | | |
674 | 0 | void TableCache::Evict(Cache* cache, uint64_t file_number) { |
675 | 0 | cache->Erase(GetSliceForFileNumber(&file_number)); |
676 | 0 | } |
677 | | |
678 | | uint64_t TableCache::ApproximateOffsetOf( |
679 | | const ReadOptions& read_options, const Slice& key, |
680 | | const FileMetaData& file_meta, TableReaderCaller caller, |
681 | | const InternalKeyComparator& internal_comparator, |
682 | | uint8_t block_protection_bytes_per_key, |
683 | 0 | const std::shared_ptr<const SliceTransform>& prefix_extractor) { |
684 | 0 | uint64_t result = 0; |
685 | 0 | TableReader* table_reader = file_meta.fd.table_reader; |
686 | 0 | TypedHandle* table_handle = nullptr; |
687 | 0 | if (table_reader == nullptr) { |
688 | 0 | Status s = |
689 | 0 | FindTable(read_options, file_options_, internal_comparator, file_meta, |
690 | 0 | &table_handle, block_protection_bytes_per_key, |
691 | 0 | prefix_extractor, false /* no_io */); |
692 | 0 | if (s.ok()) { |
693 | 0 | table_reader = cache_.Value(table_handle); |
694 | 0 | } |
695 | 0 | } |
696 | |
697 | 0 | if (table_reader != nullptr) { |
698 | 0 | result = table_reader->ApproximateOffsetOf(read_options, key, caller); |
699 | 0 | } |
700 | 0 | if (table_handle != nullptr) { |
701 | 0 | cache_.Release(table_handle); |
702 | 0 | } |
703 | |
704 | 0 | return result; |
705 | 0 | } |
706 | | |
707 | | uint64_t TableCache::ApproximateSize( |
708 | | const ReadOptions& read_options, const Slice& start, const Slice& end, |
709 | | const FileMetaData& file_meta, TableReaderCaller caller, |
710 | | const InternalKeyComparator& internal_comparator, |
711 | | uint8_t block_protection_bytes_per_key, |
712 | 0 | const std::shared_ptr<const SliceTransform>& prefix_extractor) { |
713 | 0 | uint64_t result = 0; |
714 | 0 | TableReader* table_reader = file_meta.fd.table_reader; |
715 | 0 | TypedHandle* table_handle = nullptr; |
716 | 0 | if (table_reader == nullptr) { |
717 | 0 | Status s = |
718 | 0 | FindTable(read_options, file_options_, internal_comparator, file_meta, |
719 | 0 | &table_handle, block_protection_bytes_per_key, |
720 | 0 | prefix_extractor, false /* no_io */); |
721 | 0 | if (s.ok()) { |
722 | 0 | table_reader = cache_.Value(table_handle); |
723 | 0 | } |
724 | 0 | } |
725 | |
726 | 0 | if (table_reader != nullptr) { |
727 | 0 | result = table_reader->ApproximateSize(read_options, start, end, caller); |
728 | 0 | } |
729 | 0 | if (table_handle != nullptr) { |
730 | 0 | cache_.Release(table_handle); |
731 | 0 | } |
732 | |
|
733 | 0 | return result; |
734 | 0 | } |
735 | | |
736 | | void TableCache::ReleaseObsolete(Cache* cache, Cache::Handle* h, |
737 | 4.48k | uint32_t uncache_aggressiveness) { |
738 | 4.48k | CacheInterface typed_cache(cache); |
739 | 4.48k | TypedHandle* table_handle = reinterpret_cast<TypedHandle*>(h); |
740 | 4.48k | TableReader* table_reader = typed_cache.Value(table_handle); |
741 | 4.48k | table_reader->MarkObsolete(uncache_aggressiveness); |
742 | 4.48k | typed_cache.ReleaseAndEraseIfLastRef(table_handle); |
743 | 4.48k | } |
744 | | |
745 | | } // namespace ROCKSDB_NAMESPACE |