/src/rocksdb/db/table_cache.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #include "db/table_cache.h" |
11 | | |
12 | | #include "db/dbformat.h" |
13 | | #include "db/range_tombstone_fragmenter.h" |
14 | | #include "db/snapshot_impl.h" |
15 | | #include "db/version_edit.h" |
16 | | #include "file/file_util.h" |
17 | | #include "file/filename.h" |
18 | | #include "file/random_access_file_reader.h" |
19 | | #include "monitoring/perf_context_imp.h" |
20 | | #include "rocksdb/advanced_options.h" |
21 | | #include "rocksdb/statistics.h" |
22 | | #include "table/block_based/block_based_table_reader.h" |
23 | | #include "table/get_context.h" |
24 | | #include "table/internal_iterator.h" |
25 | | #include "table/iterator_wrapper.h" |
26 | | #include "table/multiget_context.h" |
27 | | #include "table/table_builder.h" |
28 | | #include "table/table_reader.h" |
29 | | #include "test_util/sync_point.h" |
30 | | #include "util/cast_util.h" |
31 | | #include "util/coding.h" |
32 | | #include "util/stop_watch.h" |
33 | | |
34 | | // Generate the regular and coroutine versions of some methods by |
35 | | // including table_cache_sync_and_async.h twice |
36 | | // Macros in the header will expand differently based on whether |
37 | | // WITH_COROUTINES or WITHOUT_COROUTINES is defined |
38 | | // clang-format off |
39 | | #define WITHOUT_COROUTINES |
40 | | #include "db/table_cache_sync_and_async.h" |
41 | | #undef WITHOUT_COROUTINES |
42 | | #define WITH_COROUTINES |
43 | | #include "db/table_cache_sync_and_async.h" |
44 | | #undef WITH_COROUTINES |
45 | | // clang-format on |
46 | | |
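The double include above is the classic preprocessor stamping pattern: one header holds the shared method bodies, and a macro defined before each inclusion controls how they expand (here into a regular version and a coroutine version). A minimal, self-contained sketch of the pattern follows; DEMO_BODY, GetSync, and GetCoroutine are invented for illustration, and unlike the real header, which keys off WITH_COROUTINES/WITHOUT_COROUTINES, this toy macro takes the name directly.

#include <iostream>
#include <string>

// One shared body, stamped out under two names (cf. the two inclusions of
// table_cache_sync_and_async.h above).
#define DEMO_BODY(NAME)                      \
  std::string NAME(const std::string& key) { \
    return "lookup(" + key + ")";            \
  }

DEMO_BODY(GetSync)       // expansion #1: the "regular" version
DEMO_BODY(GetCoroutine)  // expansion #2: the "coroutine" version

int main() {
  std::cout << GetSync("a") << "\n" << GetCoroutine("b") << "\n";
  return 0;
}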
47 | | namespace ROCKSDB_NAMESPACE { |
48 | | |
49 | | namespace { |
50 | | |
51 | 165k | static Slice GetSliceForFileNumber(const uint64_t* file_number) { |
52 | 165k | return Slice(reinterpret_cast<const char*>(file_number), |
53 | 165k | sizeof(*file_number)); |
54 | 165k | } |
55 | | |
56 | 0 | void AppendVarint64(IterKey* key, uint64_t v) { |
57 | 0 | char buf[10]; |
58 | 0 | auto ptr = EncodeVarint64(buf, v); |
59 | 0 | key->TrimAppend(key->Size(), buf, ptr - buf); |
60 | 0 | } |
61 | | |
62 | | } // anonymous namespace |
63 | | |
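GetSliceForFileNumber keys the table cache on the 8 raw bytes of the file number, which is also what makes the "sharing same Cache with BlobFileCache" notes below work: both caches use the same keying. A hedged sketch of the idea, with std::string standing in for rocksdb::Slice:

#include <cstdint>
#include <iostream>
#include <string>

// Build a cache key from the raw bytes of the file number; keys are only
// compared for equality, so byte order does not matter within one process.
static std::string FileNumberKey(uint64_t file_number) {
  return std::string(reinterpret_cast<const char*>(&file_number),
                     sizeof(file_number));
}

int main() {
  std::string a = FileNumberKey(7), b = FileNumberKey(7), c = FileNumberKey(8);
  std::cout << (a == b) << (a == c) << "\n";  // prints 10
  return 0;
}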
64 | | const int kLoadConcurency = 128; |
65 | | |
66 | | TableCache::TableCache(const ImmutableOptions& ioptions, |
67 | | const FileOptions* file_options, Cache* const cache, |
68 | | BlockCacheTracer* const block_cache_tracer, |
69 | | const std::shared_ptr<IOTracer>& io_tracer, |
70 | | const std::string& db_session_id) |
71 | 118k | : ioptions_(ioptions), |
72 | 118k | file_options_(*file_options), |
73 | 118k | cache_(cache), |
74 | 118k | immortal_tables_(false), |
75 | 118k | block_cache_tracer_(block_cache_tracer), |
76 | 118k | loader_mutex_(kLoadConcurency), |
77 | 118k | io_tracer_(io_tracer), |
78 | 118k | db_session_id_(db_session_id) { |
79 | 118k | if (ioptions_.row_cache) { |
80 | | // If the same cache is shared by multiple instances, we need to |
81 | | // disambiguate its entries. |
82 | 0 | PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId()); |
83 | 0 | } |
84 | 118k | } |
85 | | |
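The row-cache branch of the constructor uses Cache::NewId() to give each TableCache instance a unique key prefix, so instances sharing one row cache cannot collide. A sketch of the idea, assuming a stand-in NextCacheId() for Cache::NewId() and raw-byte prefixes instead of PutVarint64:

#include <cstdint>
#include <iostream>
#include <string>

// Stand-in for Cache::NewId(): hands out process-unique IDs.
static uint64_t NextCacheId() {
  static uint64_t id = 0;
  return ++id;
}

int main() {
  // Two DB instances sharing one row cache each reserve an ID...
  uint64_t id_a = NextCacheId();
  uint64_t id_b = NextCacheId();
  std::string prefix_a(reinterpret_cast<const char*>(&id_a), sizeof(id_a));
  std::string prefix_b(reinterpret_cast<const char*>(&id_b), sizeof(id_b));
  // ...so the same user key yields distinct cache keys per instance.
  std::cout << (prefix_a + "user_key" != prefix_b + "user_key") << "\n";  // 1
  return 0;
}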
86 | 118k | TableCache::~TableCache() = default; |
87 | | |
88 | | Status TableCache::GetTableReader( |
89 | | const ReadOptions& ro, const FileOptions& file_options, |
90 | | const InternalKeyComparator& internal_comparator, |
91 | | const FileMetaData& file_meta, bool sequential_mode, |
92 | | HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader, |
93 | | const MutableCFOptions& mutable_cf_options, bool skip_filters, int level, |
94 | | bool prefetch_index_and_filter_in_cache, |
95 | 126k | size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { |
96 | 126k | std::string fname = TableFileName( |
97 | 126k | ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId()); |
98 | 126k | std::unique_ptr<FSRandomAccessFile> file; |
99 | 126k | FileOptions fopts = file_options; |
100 | 126k | fopts.temperature = file_temperature; |
101 | 126k | Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); |
102 | 126k | TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile", |
103 | 126k | const_cast<Status*>(&s)); |
104 | 126k | if (s.ok()) { |
105 | 125k | s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); |
106 | 125k | } |
107 | 126k | if (s.ok()) { |
108 | 125k | RecordTick(ioptions_.stats, NO_FILE_OPENS); |
109 | 125k | } else if (s.IsPathNotFound()) { |
110 | 0 | fname = Rocks2LevelTableFileName(fname); |
111 | | // If this file is also not found, we want to use the error message |
112 | | // that contains the table file name, which is less confusing. |
113 | 0 | Status temp_s = |
114 | 0 | PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); |
115 | 0 | if (temp_s.ok()) { |
116 | 0 | temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, |
117 | 0 | nullptr); |
118 | 0 | } |
119 | 0 | if (temp_s.ok()) { |
120 | 0 | RecordTick(ioptions_.stats, NO_FILE_OPENS); |
121 | 0 | s = temp_s; |
122 | 0 | } |
123 | 0 | } |
124 | | |
125 | 126k | if (s.ok()) { |
126 | 125k | if (!sequential_mode && ioptions_.advise_random_on_open) { |
127 | 125k | file->Hint(FSRandomAccessFile::kRandom); |
128 | 125k | } |
129 | 125k | if (ioptions_.default_temperature != Temperature::kUnknown && |
130 | 125k | file_temperature == Temperature::kUnknown) { |
131 | 0 | file_temperature = ioptions_.default_temperature; |
132 | 0 | } |
133 | 125k | StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); |
134 | 125k | std::unique_ptr<RandomAccessFileReader> file_reader( |
135 | 125k | new RandomAccessFileReader(std::move(file), fname, ioptions_.clock, |
136 | 125k | io_tracer_, ioptions_.stats, SST_READ_MICROS, |
137 | 125k | file_read_hist, ioptions_.rate_limiter.get(), |
138 | 125k | ioptions_.listeners, file_temperature, |
139 | 125k | level == ioptions_.num_levels - 1)); |
140 | 125k | UniqueId64x2 expected_unique_id; |
141 | 125k | if (ioptions_.verify_sst_unique_id_in_manifest) { |
142 | 125k | expected_unique_id = file_meta.unique_id; |
143 | 18.4E | } else { |
144 | 18.4E | expected_unique_id = kNullUniqueId64x2; // null ID == no verification |
145 | 18.4E | } |
146 | 125k | s = mutable_cf_options.table_factory->NewTableReader( |
147 | 125k | ro, |
148 | 125k | TableReaderOptions( |
149 | 125k | ioptions_, mutable_cf_options.prefix_extractor, |
150 | 125k | mutable_cf_options.compression_manager.get(), file_options, |
151 | 125k | internal_comparator, |
152 | 125k | mutable_cf_options.block_protection_bytes_per_key, skip_filters, |
153 | 125k | immortal_tables_, false /* force_direct_prefetch */, level, |
154 | 125k | block_cache_tracer_, max_file_size_for_l0_meta_pin, db_session_id_, |
155 | 125k | file_meta.fd.GetNumber(), expected_unique_id, |
156 | 125k | file_meta.fd.largest_seqno, file_meta.tail_size, |
157 | 125k | file_meta.user_defined_timestamps_persisted), |
158 | 125k | std::move(file_reader), file_meta.fd.GetFileSize(), table_reader, |
159 | 125k | prefetch_index_and_filter_in_cache); |
160 | 125k | TEST_SYNC_POINT("TableCache::GetTableReader:0"); |
161 | 125k | } |
162 | 126k | return s; |
163 | 126k | } |
164 | | |
165 | 0 | Cache::Handle* TableCache::Lookup(Cache* cache, uint64_t file_number) { |
166 | | // NOTE: sharing same Cache with BlobFileCache |
167 | 0 | Slice key = GetSliceForFileNumber(&file_number); |
168 | 0 | return cache->Lookup(key); |
169 | 0 | } |
170 | | |
171 | | Status TableCache::FindTable( |
172 | | const ReadOptions& ro, const FileOptions& file_options, |
173 | | const InternalKeyComparator& internal_comparator, |
174 | | const FileMetaData& file_meta, TypedHandle** handle, |
175 | | const MutableCFOptions& mutable_cf_options, const bool no_io, |
176 | | HistogramImpl* file_read_hist, bool skip_filters, int level, |
177 | | bool prefetch_index_and_filter_in_cache, |
178 | 156k | size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { |
179 | 156k | PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); |
180 | 156k | uint64_t number = file_meta.fd.GetNumber(); |
181 | | // NOTE: sharing same Cache with BlobFileCache |
182 | 156k | Slice key = GetSliceForFileNumber(&number); |
183 | 156k | *handle = cache_.Lookup(key); |
184 | 156k | TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0", |
185 | 156k | const_cast<bool*>(&no_io)); |
186 | | |
187 | 156k | if (*handle == nullptr) { |
188 | 126k | if (no_io) { |
189 | 0 | return Status::Incomplete("Table not found in table_cache, no_io is set"); |
190 | 0 | } |
191 | 126k | MutexLock load_lock(&loader_mutex_.Get(key)); |
192 | | // We check the cache again under the loading mutex |
193 | 126k | *handle = cache_.Lookup(key); |
194 | 126k | if (*handle != nullptr) { |
195 | 0 | return Status::OK(); |
196 | 0 | } |
197 | | |
198 | 126k | std::unique_ptr<TableReader> table_reader; |
199 | 126k | Status s = GetTableReader(ro, file_options, internal_comparator, file_meta, |
200 | 126k | false /* sequential mode */, file_read_hist, |
201 | 126k | &table_reader, mutable_cf_options, skip_filters, |
202 | 126k | level, prefetch_index_and_filter_in_cache, |
203 | 126k | max_file_size_for_l0_meta_pin, file_temperature); |
204 | 126k | if (!s.ok()) { |
205 | 0 | assert(table_reader == nullptr); |
206 | 0 | RecordTick(ioptions_.stats, NO_FILE_ERRORS); |
207 | | // We do not cache error results so that if the error is transient, |
208 | | // or somebody repairs the file, we recover automatically. |
209 | 0 | IGNORE_STATUS_IF_ERROR(s); |
210 | 126k | } else { |
211 | 126k | s = cache_.Insert(key, table_reader.get(), 1, handle); |
212 | 126k | if (s.ok()) { |
213 | | // Release ownership of table reader. |
214 | 125k | table_reader.release(); |
215 | 125k | } |
216 | 126k | } |
217 | 126k | return s; |
218 | 126k | } |
219 | 30.8k | return Status::OK(); |
220 | 156k | } |
221 | | |
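FindTable's shape is worth calling out: an optimistic lookup with no loader lock, then a per-key loader mutex, then a second lookup before the expensive file open, so concurrent readers of a missing file trigger exactly one load; note also that error results are deliberately not cached (see the "We do not cache error results" comment above), so transient failures heal on retry. A simplified sketch of that double-checked load, with a single mutex standing in for the kLoadConcurency-striped loader_mutex_:

#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>

class DemoTableCache {
 public:
  std::shared_ptr<std::string> Find(const std::string& key) {
    if (auto v = Lookup(key)) return v;          // fast path, no loader lock
    std::lock_guard<std::mutex> load_lock(loader_mutex_);
    if (auto v = Lookup(key)) return v;          // re-check under the lock
    auto v = std::make_shared<std::string>("reader:" + key);  // "open" file
    std::lock_guard<std::mutex> g(map_mutex_);
    cache_[key] = v;                             // insert only on success
    return v;
  }

 private:
  std::shared_ptr<std::string> Lookup(const std::string& key) {
    std::lock_guard<std::mutex> g(map_mutex_);
    auto it = cache_.find(key);
    return it == cache_.end() ? nullptr : it->second;
  }
  std::mutex loader_mutex_;  // striped per key in the real code
  std::mutex map_mutex_;
  std::map<std::string, std::shared_ptr<std::string>> cache_;
};

int main() {
  DemoTableCache c;
  std::cout << *c.Find("000123.sst") << "\n";  // loads once, then cached
  return 0;
}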
222 | | InternalIterator* TableCache::NewIterator( |
223 | | const ReadOptions& options, const FileOptions& file_options, |
224 | | const InternalKeyComparator& icomparator, const FileMetaData& file_meta, |
225 | | RangeDelAggregator* range_del_agg, |
226 | | const MutableCFOptions& mutable_cf_options, TableReader** table_reader_ptr, |
227 | | HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, |
228 | | bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin, |
229 | | const InternalKey* smallest_compaction_key, |
230 | | const InternalKey* largest_compaction_key, bool allow_unprepared_value, |
231 | | const SequenceNumber* read_seqno, |
232 | 82.3k | std::unique_ptr<TruncatedRangeDelIterator>* range_del_iter) { |
233 | 82.3k | PERF_TIMER_GUARD(new_table_iterator_nanos); |
234 | | |
235 | 82.3k | Status s; |
236 | 82.3k | TableReader* table_reader = nullptr; |
237 | 82.3k | TypedHandle* handle = nullptr; |
238 | 82.3k | if (table_reader_ptr != nullptr) { |
239 | 0 | *table_reader_ptr = nullptr; |
240 | 0 | } |
241 | 82.3k | bool for_compaction = caller == TableReaderCaller::kCompaction; |
242 | 82.3k | auto& fd = file_meta.fd; |
243 | 82.3k | table_reader = fd.table_reader; |
244 | 82.3k | if (table_reader == nullptr) { |
245 | 23.9k | s = FindTable(options, file_options, icomparator, file_meta, &handle, |
246 | 23.9k | mutable_cf_options, |
247 | 23.9k | options.read_tier == kBlockCacheTier /* no_io */, |
248 | 23.9k | file_read_hist, skip_filters, level, |
249 | 23.9k | true /* prefetch_index_and_filter_in_cache */, |
250 | 23.9k | max_file_size_for_l0_meta_pin, file_meta.temperature); |
251 | 23.9k | if (s.ok()) { |
252 | 23.9k | table_reader = cache_.Value(handle); |
253 | 23.9k | } |
254 | 23.9k | } |
255 | 82.3k | InternalIterator* result = nullptr; |
256 | 82.3k | if (s.ok()) { |
257 | 82.3k | if (options.table_filter && |
258 | 82.3k | !options.table_filter(*table_reader->GetTableProperties())) { |
259 | 0 | result = NewEmptyInternalIterator<Slice>(arena); |
260 | 82.3k | } else { |
261 | 82.3k | result = table_reader->NewIterator( |
262 | 82.3k | options, mutable_cf_options.prefix_extractor.get(), arena, |
263 | 82.3k | skip_filters, caller, file_options.compaction_readahead_size, |
264 | 82.3k | allow_unprepared_value); |
265 | 82.3k | } |
266 | 82.3k | if (handle != nullptr) { |
267 | 23.9k | cache_.RegisterReleaseAsCleanup(handle, *result); |
268 | 23.9k | handle = nullptr; // prevent from releasing below |
269 | 23.9k | } |
270 | | |
271 | 82.3k | if (for_compaction) { |
272 | 25.0k | table_reader->SetupForCompaction(); |
273 | 25.0k | } |
274 | 82.3k | if (table_reader_ptr != nullptr) { |
275 | 0 | *table_reader_ptr = table_reader; |
276 | 0 | } |
277 | 82.3k | } |
278 | 82.3k | if (s.ok() && !options.ignore_range_deletions) { |
279 | 82.3k | if (range_del_iter != nullptr) { |
280 | 54.6k | auto new_range_del_iter = |
281 | 54.6k | read_seqno ? table_reader->NewRangeTombstoneIterator( |
282 | 13.8k | *read_seqno, options.timestamp) |
283 | 54.6k | : table_reader->NewRangeTombstoneIterator(options); |
284 | 54.6k | if (new_range_del_iter == nullptr || new_range_del_iter->empty()) { |
285 | 50.9k | delete new_range_del_iter; |
286 | 50.9k | *range_del_iter = nullptr; |
287 | 50.9k | } else { |
288 | 3.72k | *range_del_iter = std::make_unique<TruncatedRangeDelIterator>( |
289 | 3.72k | std::unique_ptr<FragmentedRangeTombstoneIterator>( |
290 | 3.72k | new_range_del_iter), |
291 | 3.72k | &icomparator, &file_meta.smallest, &file_meta.largest); |
292 | 3.72k | } |
293 | 54.6k | } |
294 | 82.3k | if (range_del_agg != nullptr) { |
295 | 28.7k | if (range_del_agg->AddFile(fd.GetNumber())) { |
296 | 28.7k | std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter( |
297 | 28.7k | static_cast<FragmentedRangeTombstoneIterator*>( |
298 | 28.7k | table_reader->NewRangeTombstoneIterator(options))); |
299 | 28.7k | if (new_range_del_iter != nullptr) { |
300 | 0 | s = new_range_del_iter->status(); |
301 | 0 | } |
302 | 28.7k | if (s.ok()) { |
303 | 28.7k | const InternalKey* smallest = &file_meta.smallest; |
304 | 28.7k | const InternalKey* largest = &file_meta.largest; |
305 | 28.7k | if (smallest_compaction_key != nullptr) { |
306 | 5.36k | smallest = smallest_compaction_key; |
307 | 5.36k | } |
308 | 28.7k | if (largest_compaction_key != nullptr) { |
309 | 5.36k | largest = largest_compaction_key; |
310 | 5.36k | } |
311 | 28.7k | range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest, |
312 | 28.7k | largest); |
313 | 28.7k | } |
314 | 28.7k | } |
315 | 28.7k | } |
316 | 82.3k | } |
317 | | |
318 | 82.3k | if (handle != nullptr) { |
319 | 0 | cache_.Release(handle); |
320 | 0 | } |
321 | 82.3k | if (!s.ok()) { |
322 | 0 | assert(result == nullptr); |
323 | 0 | result = NewErrorInternalIterator<Slice>(s, arena); |
324 | 0 | } |
325 | 82.3k | return result; |
326 | 82.3k | } |
327 | | |
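NewIterator's range-deletion handling truncates each file's tombstones to that file's key range (or to the compaction boundary keys when provided), so a tombstone can never delete keys outside the file it came from. A much-simplified sketch of the clamping idea, using plain string user keys where the real TruncatedRangeDelIterator works on internal keys:

#include <algorithm>
#include <iostream>
#include <string>

struct Tombstone {
  std::string start, end;  // half-open [start, end)
};

// Clamp a tombstone to the bounds it is allowed to act within.
static Tombstone Clamp(const Tombstone& t, const std::string& lower,
                       const std::string& upper) {
  return {std::max(t.start, lower), std::min(t.end, upper)};
}

int main() {
  Tombstone t{"a", "z"};
  Tombstone clamped = Clamp(t, /*lower=*/"c", /*upper=*/"m");
  std::cout << clamped.start << " " << clamped.end << "\n";  // prints: c m
  return 0;
}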
328 | | Status TableCache::GetRangeTombstoneIterator( |
329 | | const ReadOptions& options, |
330 | | const InternalKeyComparator& internal_comparator, |
331 | | const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options, |
332 | 0 | std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) { |
333 | 0 | assert(out_iter); |
334 | 0 | const FileDescriptor& fd = file_meta.fd; |
335 | 0 | Status s; |
336 | 0 | TableReader* t = fd.table_reader; |
337 | 0 | TypedHandle* handle = nullptr; |
338 | 0 | if (t == nullptr) { |
339 | 0 | s = FindTable(options, file_options_, internal_comparator, file_meta, |
340 | 0 | &handle, mutable_cf_options); |
341 | 0 | if (s.ok()) { |
342 | 0 | t = cache_.Value(handle); |
343 | 0 | } |
344 | 0 | } |
345 | 0 | if (s.ok()) { |
346 | | // Note: NewRangeTombstoneIterator could return nullptr |
347 | 0 | out_iter->reset(t->NewRangeTombstoneIterator(options)); |
348 | 0 | } |
349 | 0 | if (handle) { |
350 | 0 | if (*out_iter) { |
351 | 0 | cache_.RegisterReleaseAsCleanup(handle, **out_iter); |
352 | 0 | } else { |
353 | 0 | cache_.Release(handle); |
354 | 0 | } |
355 | 0 | } |
356 | 0 | return s; |
357 | 0 | } |
358 | | |
359 | | uint64_t TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, |
360 | | const FileDescriptor& fd, |
361 | | const Slice& internal_key, |
362 | | GetContext* get_context, |
363 | 0 | IterKey& row_cache_key) { |
364 | 0 | uint64_t fd_number = fd.GetNumber(); |
365 | | // We use the user key as cache key instead of the internal key, |
366 | | // otherwise the whole cache would be invalidated every time the |
367 | | // sequence key increases. However, to support caching snapshot |
368 | | // reads, we append a sequence number (incremented by 1 to |
369 | | // distinguish it from 0), rather than the internal_key's seqno, |
370 | | // to determine row cache entry visibility. |
371 | | // If the snapshot is larger than the largest seqno in the file, |
372 | | // all data should be exposed to the snapshot, so we treat it |
373 | | // the same as if there were no snapshot. The exception is that if |
374 | | // a seq-checking callback is registered, some internal keys |
375 | | // may still be filtered out. |
376 | 0 | uint64_t cache_entry_seq_no = 0; |
377 | | |
378 | | // Maybe we can include the whole file if snapshot == fd.largest_seqno. |
379 | 0 | if (options.snapshot != nullptr && |
380 | 0 | (get_context->has_callback() || |
381 | 0 | static_cast_with_check<const SnapshotImpl>(options.snapshot) |
382 | 0 | ->GetSequenceNumber() <= fd.largest_seqno)) { |
383 | | // We should consider using options.snapshot->GetSequenceNumber() |
384 | | // instead of GetInternalKeySeqno(k), which will make the code |
385 | | // easier to understand. |
386 | 0 | cache_entry_seq_no = 1 + GetInternalKeySeqno(internal_key); |
387 | 0 | } |
388 | | |
389 | | // Compute row cache key. |
390 | 0 | row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), |
391 | 0 | row_cache_id_.size()); |
392 | 0 | AppendVarint64(&row_cache_key, fd_number); |
393 | 0 | AppendVarint64(&row_cache_key, cache_entry_seq_no); |
394 | | |
395 | | // Provide a sequence number for callback checking on cache hit. |
396 | | // As cache_entry_seq_no starts at 1, decrease its value by 1 to get |
397 | | // a sequence number aligned with the get context's logic. |
398 | 0 | return cache_entry_seq_no == 0 ? 0 : cache_entry_seq_no - 1; |
399 | 0 | } |
400 | | |
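CreateRowCacheKeyPrefix lays the key out as [row_cache_id][varint file number][varint seqno+1], with the user key appended later per lookup; storing seqno+1 lets 0 mean "no snapshot constraint". A sketch of that layout, with a LEB128-style varint writer matching the scheme of EncodeVarint64 (names here are invented for illustration):

#include <cstdint>
#include <iostream>
#include <string>

// Append v in base-128 with a continuation bit, as PutVarint64 does.
static void DemoAppendVarint64(std::string* out, uint64_t v) {
  while (v >= 0x80) {
    out->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  out->push_back(static_cast<char>(v));
}

int main() {
  std::string key = "cacheid1";               // stand-in for row_cache_id_
  DemoAppendVarint64(&key, 123);              // file number
  uint64_t entry_seq = 41;                    // hypothetical entry seqno
  DemoAppendVarint64(&key, entry_seq + 1);    // +1 so 0 can mean "no snapshot"
  size_t prefix_size = key.size();
  key += "user_key";                          // appended later, per lookup
  std::cout << prefix_size << "-byte prefix, " << key.size()
            << "-byte row cache key\n";
  return 0;
}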
401 | | bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, |
402 | | size_t prefix_size, GetContext* get_context, |
403 | 0 | Status* read_status, SequenceNumber seq_no) { |
404 | 0 | bool found = false; |
405 | |
406 | 0 | row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); |
407 | 0 | RowCacheInterface row_cache{ioptions_.row_cache.get()}; |
408 | 0 | if (auto row_handle = row_cache.Lookup(row_cache_key.GetUserKey())) { |
409 | | // Cleanable routine to release the cache entry |
410 | 0 | Cleanable value_pinner; |
411 | | // If we get here, the value is in the cache. |
412 | | // found_row_cache_entry points to the value on cache, |
413 | | // and value_pinner has cleanup procedure for the cached entry. |
414 | | // After replayGetContextLog() returns, get_context.pinnable_slice_ |
415 | | // will point to cache entry buffer (or a copy based on that) and |
416 | | // cleanup routine under value_pinner will be delegated to |
417 | | // get_context.pinnable_slice_. Cache entry is released when |
418 | | // get_context.pinnable_slice_ is reset. |
419 | 0 | row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner); |
420 | | // On a row cache hit, since the cache key matches row_cache_key, we |
421 | | // can use row_cache_key's seqno to construct the InternalKey. |
422 | 0 | *read_status = replayGetContextLog(*row_cache.Value(row_handle), user_key, |
423 | 0 | get_context, &value_pinner, seq_no); |
424 | 0 | RecordTick(ioptions_.stats, ROW_CACHE_HIT); |
425 | 0 | found = true; |
426 | 0 | } else { |
427 | 0 | RecordTick(ioptions_.stats, ROW_CACHE_MISS); |
428 | 0 | } |
429 | 0 | return found; |
430 | 0 | } |
431 | | |
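GetFromRowCache's comment block describes a cleanup hand-off: the cache release is registered on a local pinner, and on a hit those cleanups are delegated to the context's pinnable slice, so the entry stays pinned exactly as long as the caller uses the value. A simplified stand-in for rocksdb::Cleanable showing that delegation:

#include <functional>
#include <iostream>
#include <utility>
#include <vector>

class MiniCleanable {
 public:
  void RegisterCleanup(std::function<void()> fn) {
    cleanups_.push_back(std::move(fn));
  }
  // Move all pending cleanups onto another object, in the spirit of
  // Cleanable::DelegateCleanupsTo.
  void DelegateCleanupsTo(MiniCleanable* other) {
    for (auto& fn : cleanups_) other->RegisterCleanup(std::move(fn));
    cleanups_.clear();
  }
  ~MiniCleanable() {
    for (auto& fn : cleanups_) fn();
  }

 private:
  std::vector<std::function<void()>> cleanups_;
};

int main() {
  MiniCleanable pinned_result;   // plays get_context's pinnable slice
  {
    MiniCleanable value_pinner;  // local pinner, as in GetFromRowCache
    value_pinner.RegisterCleanup([] { std::cout << "release cache entry\n"; });
    value_pinner.DelegateCleanupsTo(&pinned_result);
  }  // value_pinner dies here, but the entry is still pinned
  std::cout << "still using the cached value...\n";
  return 0;
}  // cleanup runs when pinned_result is destroyed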
432 | | Status TableCache::Get(const ReadOptions& options, |
433 | | const InternalKeyComparator& internal_comparator, |
434 | | const FileMetaData& file_meta, const Slice& k, |
435 | | GetContext* get_context, |
436 | | const MutableCFOptions& mutable_cf_options, |
437 | | HistogramImpl* file_read_hist, bool skip_filters, |
438 | 1.69k | int level, size_t max_file_size_for_l0_meta_pin) { |
439 | 1.69k | auto& fd = file_meta.fd; |
440 | 1.69k | std::string* row_cache_entry = nullptr; |
441 | 1.69k | bool done = false; |
442 | 1.69k | IterKey row_cache_key; |
443 | 1.69k | std::string row_cache_entry_buffer; |
444 | | |
445 | | // Check row cache if enabled. |
446 | | // Reuse row_cache_key sequence number when row cache hits. |
447 | 1.69k | Status s; |
448 | 1.69k | if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { |
449 | 0 | auto user_key = ExtractUserKey(k); |
450 | 0 | uint64_t cache_entry_seq_no = |
451 | 0 | CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); |
452 | 0 | done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), |
453 | 0 | get_context, &s, cache_entry_seq_no); |
454 | 0 | if (!done) { |
455 | 0 | row_cache_entry = &row_cache_entry_buffer; |
456 | 0 | } |
457 | 0 | } |
458 | 1.69k | TableReader* t = fd.table_reader; |
459 | 1.69k | TypedHandle* handle = nullptr; |
460 | 1.69k | if (s.ok() && !done) { |
461 | 1.69k | if (t == nullptr) { |
462 | 0 | s = FindTable(options, file_options_, internal_comparator, file_meta, |
463 | 0 | &handle, mutable_cf_options, |
464 | 0 | options.read_tier == kBlockCacheTier /* no_io */, |
465 | 0 | file_read_hist, skip_filters, level, |
466 | 0 | true /* prefetch_index_and_filter_in_cache */, |
467 | 0 | max_file_size_for_l0_meta_pin, file_meta.temperature); |
468 | 0 | if (s.ok()) { |
469 | 0 | t = cache_.Value(handle); |
470 | 0 | } |
471 | 0 | } |
472 | 1.69k | SequenceNumber* max_covering_tombstone_seq = |
473 | 1.69k | get_context->max_covering_tombstone_seq(); |
474 | 1.69k | if (s.ok() && max_covering_tombstone_seq != nullptr && |
475 | 1.69k | !options.ignore_range_deletions) { |
476 | 1.69k | std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( |
477 | 1.69k | t->NewRangeTombstoneIterator(options)); |
478 | 1.69k | if (range_del_iter != nullptr) { |
479 | 0 | SequenceNumber seq = |
480 | 0 | range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)); |
481 | 0 | if (seq > *max_covering_tombstone_seq) { |
482 | 0 | *max_covering_tombstone_seq = seq; |
483 | 0 | if (get_context->NeedTimestamp()) { |
484 | 0 | get_context->SetTimestampFromRangeTombstone( |
485 | 0 | range_del_iter->timestamp()); |
486 | 0 | } |
487 | 0 | } |
488 | 0 | } |
489 | 1.69k | } |
490 | 1.69k | if (s.ok()) { |
491 | 1.69k | get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. |
492 | 1.69k | s = t->Get(options, k, get_context, |
493 | 1.69k | mutable_cf_options.prefix_extractor.get(), skip_filters); |
494 | 1.69k | get_context->SetReplayLog(nullptr); |
495 | 1.69k | } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { |
496 | | // Couldn't find table in cache and couldn't open it because of no_io. |
497 | 0 | get_context->MarkKeyMayExist(); |
498 | 0 | done = true; |
499 | 0 | } |
500 | 1.69k | } |
501 | | |
502 | | // Put the replay log in row cache only if something was found. |
503 | 1.69k | if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) { |
504 | 0 | RowCacheInterface row_cache{ioptions_.row_cache.get()}; |
505 | 0 | size_t charge = row_cache_entry->capacity() + sizeof(std::string); |
506 | 0 | auto row_ptr = new std::string(std::move(*row_cache_entry)); |
507 | 0 | Status rcs = row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge); |
508 | 0 | if (!rcs.ok()) { |
509 | | // If row cache is full, it's OK to continue, but we keep ownership of |
510 | | // row_ptr. |
511 | 0 | delete row_ptr; |
512 | 0 | } |
513 | 0 | } |
514 | | |
515 | 1.69k | if (handle != nullptr) { |
516 | 0 | cache_.Release(handle); |
517 | 0 | } |
518 | 1.69k | return s; |
519 | 1.69k | } |
520 | | |
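The row-cache insert at the end of Get moves the replay log into a heap string, charges the cache for the string's capacity plus its header, and keeps ownership (and deletes the entry) on a failed insert, e.g. when the cache is full. A sketch of that ownership rule, with TryInsert as a hypothetical stand-in for RowCacheInterface::Insert:

#include <cstddef>
#include <iostream>
#include <string>
#include <utility>

// Pretend cache insert: reject anything over the capacity limit.
static bool TryInsert(const std::string& /*key*/, std::string* /*value*/,
                      size_t charge, size_t capacity_limit) {
  return charge <= capacity_limit;
}

int main() {
  std::string row_cache_entry = "replayed get-context log";
  size_t charge = row_cache_entry.capacity() + sizeof(std::string);
  auto* row_ptr = new std::string(std::move(row_cache_entry));
  if (TryInsert("row_key", row_ptr, charge, /*capacity_limit=*/16)) {
    // On success the cache owns row_ptr and frees it on eviction.
  } else {
    delete row_ptr;  // insert failed, so ownership stayed with us
    std::cout << "cache full, entry dropped\n";
  }
  return 0;
}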
521 | | void TableCache::UpdateRangeTombstoneSeqnums( |
522 | | const ReadOptions& options, TableReader* t, |
523 | 0 | MultiGetContext::Range& table_range) { |
524 | 0 | std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( |
525 | 0 | t->NewRangeTombstoneIterator(options)); |
526 | 0 | if (range_del_iter != nullptr) { |
527 | 0 | for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { |
528 | 0 | SequenceNumber* max_covering_tombstone_seq = |
529 | 0 | iter->get_context->max_covering_tombstone_seq(); |
530 | 0 | SequenceNumber seq = |
531 | 0 | range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts); |
532 | 0 | if (seq > *max_covering_tombstone_seq) { |
533 | 0 | *max_covering_tombstone_seq = seq; |
534 | 0 | if (iter->get_context->NeedTimestamp()) { |
535 | 0 | iter->get_context->SetTimestampFromRangeTombstone( |
536 | 0 | range_del_iter->timestamp()); |
537 | 0 | } |
538 | 0 | } |
539 | 0 | } |
540 | 0 | } |
541 | 0 | } |
542 | | |
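UpdateRangeTombstoneSeqnums applies the "max covering tombstone seqnum" rule also used in Get: a point entry for a key is shadowed if any range tombstone covering the key has a newer sequence number, so each GetContext tracks the maximum such seqno. A simplified sketch with plain string user keys:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct RangeTombstone {
  std::string start, end;  // half-open [start, end)
  uint64_t seq;
};

// Highest seqno among tombstones covering `key` (0 if none).
static uint64_t MaxCoveringSeq(const std::vector<RangeTombstone>& ts,
                               const std::string& key) {
  uint64_t max_seq = 0;
  for (const auto& t : ts) {
    if (t.start <= key && key < t.end) max_seq = std::max(max_seq, t.seq);
  }
  return max_seq;
}

int main() {
  std::vector<RangeTombstone> ts = {{"a", "m", 42}, {"k", "z", 7}};
  // A point entry for "kiwi" with seqno <= 42 is deleted by the first range.
  std::cout << MaxCoveringSeq(ts, "kiwi") << "\n";  // prints 42
  return 0;
}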
543 | | Status TableCache::MultiGetFilter( |
544 | | const ReadOptions& options, |
545 | | const InternalKeyComparator& internal_comparator, |
546 | | const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options, |
547 | | HistogramImpl* file_read_hist, int level, |
548 | 0 | MultiGetContext::Range* mget_range, TypedHandle** table_handle) { |
549 | 0 | auto& fd = file_meta.fd; |
550 | 0 | IterKey row_cache_key; |
551 | 0 | std::string row_cache_entry_buffer; |
552 | | |
553 | | // Check if we need to use the row cache. If yes, then we cannot do the |
554 | | // filtering here, since the filtering needs to happen after the row cache |
555 | | // lookup. |
556 | 0 | KeyContext& first_key = *mget_range->begin(); |
557 | 0 | if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) { |
558 | 0 | return Status::NotSupported(); |
559 | 0 | } |
560 | 0 | Status s; |
561 | 0 | TableReader* t = fd.table_reader; |
562 | 0 | TypedHandle* handle = nullptr; |
563 | 0 | MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(), |
564 | 0 | mget_range->end()); |
565 | 0 | if (t == nullptr) { |
566 | 0 | s = FindTable(options, file_options_, internal_comparator, file_meta, |
567 | 0 | &handle, mutable_cf_options, |
568 | 0 | options.read_tier == kBlockCacheTier /* no_io */, |
569 | 0 | file_read_hist, |
570 | 0 | /*skip_filters=*/false, level, |
571 | 0 | true /* prefetch_index_and_filter_in_cache */, |
572 | 0 | /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); |
573 | 0 | if (s.ok()) { |
574 | 0 | t = cache_.Value(handle); |
575 | 0 | } |
576 | 0 | *table_handle = handle; |
577 | 0 | } |
578 | 0 | if (s.ok()) { |
579 | 0 | s = t->MultiGetFilter(options, mutable_cf_options.prefix_extractor.get(), |
580 | 0 | mget_range); |
581 | 0 | } |
582 | 0 | if (s.ok() && !options.ignore_range_deletions) { |
583 | | // Update the range tombstone sequence numbers for the keys here |
584 | | // as TableCache::MultiGet may or may not be called, and even if it |
585 | | // is, it may be called with fewer keys in the range due to filtering. |
586 | 0 | UpdateRangeTombstoneSeqnums(options, t, tombstone_range); |
587 | 0 | } |
588 | 0 | if (mget_range->empty() && handle) { |
589 | 0 | cache_.Release(handle); |
590 | 0 | *table_handle = nullptr; |
591 | 0 | } |
592 | |
593 | 0 | return s; |
594 | 0 | } |
595 | | |
596 | | Status TableCache::GetTableProperties( |
597 | | const FileOptions& file_options, const ReadOptions& read_options, |
598 | | const InternalKeyComparator& internal_comparator, |
599 | | const FileMetaData& file_meta, |
600 | | std::shared_ptr<const TableProperties>* properties, |
601 | 160k | const MutableCFOptions& mutable_cf_options, bool no_io) { |
602 | 160k | auto table_reader = file_meta.fd.table_reader; |
603 | | // Has the table already been pre-loaded? |
604 | 160k | if (table_reader) { |
605 | 160k | *properties = table_reader->GetTableProperties(); |
606 | | |
607 | 160k | return Status::OK(); |
608 | 160k | } |
609 | | |
610 | 0 | TypedHandle* table_handle = nullptr; |
611 | 0 | Status s = FindTable(read_options, file_options, internal_comparator, |
612 | 0 | file_meta, &table_handle, mutable_cf_options, no_io); |
613 | 0 | if (!s.ok()) { |
614 | 0 | return s; |
615 | 0 | } |
616 | 0 | assert(table_handle); |
617 | 0 | auto table = cache_.Value(table_handle); |
618 | 0 | *properties = table->GetTableProperties(); |
619 | 0 | cache_.Release(table_handle); |
620 | 0 | return s; |
621 | 0 | } |
622 | | |
623 | | Status TableCache::ApproximateKeyAnchors( |
624 | | const ReadOptions& ro, const InternalKeyComparator& internal_comparator, |
625 | | const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options, |
626 | | |
627 | 0 | std::vector<TableReader::Anchor>& anchors) { |
628 | 0 | Status s; |
629 | 0 | TableReader* t = file_meta.fd.table_reader; |
630 | 0 | TypedHandle* handle = nullptr; |
631 | 0 | if (t == nullptr) { |
632 | 0 | s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle, |
633 | 0 | mutable_cf_options); |
634 | 0 | if (s.ok()) { |
635 | 0 | t = cache_.Value(handle); |
636 | 0 | } |
637 | 0 | } |
638 | 0 | if (s.ok() && t != nullptr) { |
639 | 0 | s = t->ApproximateKeyAnchors(ro, anchors); |
640 | 0 | } |
641 | 0 | if (handle != nullptr) { |
642 | 0 | cache_.Release(handle); |
643 | 0 | } |
644 | 0 | return s; |
645 | 0 | } |
646 | | |
647 | | size_t TableCache::GetMemoryUsageByTableReader( |
648 | | const FileOptions& file_options, const ReadOptions& read_options, |
649 | | const InternalKeyComparator& internal_comparator, |
650 | 0 | const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options) { |
651 | 0 | auto table_reader = file_meta.fd.table_reader; |
652 | | // Has the table already been pre-loaded? |
653 | 0 | if (table_reader) { |
654 | 0 | return table_reader->ApproximateMemoryUsage(); |
655 | 0 | } |
656 | | |
657 | 0 | TypedHandle* table_handle = nullptr; |
658 | 0 | Status s = |
659 | 0 | FindTable(read_options, file_options, internal_comparator, file_meta, |
660 | 0 | &table_handle, mutable_cf_options, true /* no_io */); |
661 | 0 | if (!s.ok()) { |
662 | 0 | return 0; |
663 | 0 | } |
664 | 0 | assert(table_handle); |
665 | 0 | auto table = cache_.Value(table_handle); |
666 | 0 | auto ret = table->ApproximateMemoryUsage(); |
667 | 0 | cache_.Release(table_handle); |
668 | 0 | return ret; |
669 | 0 | } |
670 | | |
671 | 8.27k | void TableCache::Evict(Cache* cache, uint64_t file_number) { |
672 | 8.27k | cache->Erase(GetSliceForFileNumber(&file_number)); |
673 | 8.27k | } |
674 | | |
675 | | uint64_t TableCache::ApproximateOffsetOf( |
676 | | const ReadOptions& read_options, const Slice& key, |
677 | | const FileMetaData& file_meta, TableReaderCaller caller, |
678 | | const InternalKeyComparator& internal_comparator, |
679 | 0 | const MutableCFOptions& mutable_cf_options) { |
680 | 0 | uint64_t result = 0; |
681 | 0 | TableReader* table_reader = file_meta.fd.table_reader; |
682 | 0 | TypedHandle* table_handle = nullptr; |
683 | 0 | if (table_reader == nullptr) { |
684 | 0 | Status s = |
685 | 0 | FindTable(read_options, file_options_, internal_comparator, file_meta, |
686 | 0 | &table_handle, mutable_cf_options, false /* no_io */); |
687 | 0 | if (s.ok()) { |
688 | 0 | table_reader = cache_.Value(table_handle); |
689 | 0 | } |
690 | 0 | } |
691 | |
692 | 0 | if (table_reader != nullptr) { |
693 | 0 | result = table_reader->ApproximateOffsetOf(read_options, key, caller); |
694 | 0 | } |
695 | 0 | if (table_handle != nullptr) { |
696 | 0 | cache_.Release(table_handle); |
697 | 0 | } |
698 | |
699 | 0 | return result; |
700 | 0 | } |
701 | | |
702 | | uint64_t TableCache::ApproximateSize( |
703 | | const ReadOptions& read_options, const Slice& start, const Slice& end, |
704 | | const FileMetaData& file_meta, TableReaderCaller caller, |
705 | | const InternalKeyComparator& internal_comparator, |
706 | 0 | const MutableCFOptions& mutable_cf_options) { |
707 | 0 | uint64_t result = 0; |
708 | 0 | TableReader* table_reader = file_meta.fd.table_reader; |
709 | 0 | TypedHandle* table_handle = nullptr; |
710 | 0 | if (table_reader == nullptr) { |
711 | 0 | Status s = |
712 | 0 | FindTable(read_options, file_options_, internal_comparator, file_meta, |
713 | 0 | &table_handle, mutable_cf_options, false /* no_io */); |
714 | 0 | if (s.ok()) { |
715 | 0 | table_reader = cache_.Value(table_handle); |
716 | 0 | } |
717 | 0 | } |
718 | |
719 | 0 | if (table_reader != nullptr) { |
720 | 0 | result = table_reader->ApproximateSize(read_options, start, end, caller); |
721 | 0 | } |
722 | 0 | if (table_handle != nullptr) { |
723 | 0 | cache_.Release(table_handle); |
724 | 0 | } |
725 | |
726 | 0 | return result; |
727 | 0 | } |
728 | | |
729 | | void TableCache::ReleaseObsolete(Cache* cache, uint64_t file_number, |
730 | | Cache::Handle* h, |
731 | 126k | uint32_t uncache_aggressiveness) { |
732 | 126k | CacheInterface typed_cache(cache); |
733 | 126k | TypedHandle* table_handle = reinterpret_cast<TypedHandle*>(h); |
734 | 126k | if (table_handle == nullptr) { |
735 | 0 | table_handle = typed_cache.Lookup(GetSliceForFileNumber(&file_number)); |
736 | 0 | } |
737 | 126k | if (table_handle != nullptr) { |
738 | 126k | TableReader* table_reader = typed_cache.Value(table_handle); |
739 | 126k | table_reader->MarkObsolete(uncache_aggressiveness); |
740 | 126k | typed_cache.ReleaseAndEraseIfLastRef(table_handle); |
741 | 126k | } |
742 | 126k | } |
743 | | |
744 | | } // namespace ROCKSDB_NAMESPACE |