/src/rocksdb/db/version_set.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #include "db/version_set.h" |
11 | | |
12 | | #include <algorithm> |
13 | | #include <array> |
14 | | #include <cinttypes> |
15 | | #include <cstdio> |
16 | | #include <list> |
17 | | #include <map> |
18 | | #include <set> |
19 | | #include <string> |
20 | | #include <unordered_map> |
21 | | #include <vector> |
22 | | |
23 | | #include "db/blob/blob_fetcher.h" |
24 | | #include "db/blob/blob_file_cache.h" |
25 | | #include "db/blob/blob_file_reader.h" |
26 | | #include "db/blob/blob_log_format.h" |
27 | | #include "db/blob/blob_source.h" |
28 | | #include "db/compaction/compaction.h" |
29 | | #include "db/compaction/file_pri.h" |
30 | | #include "db/dbformat.h" |
31 | | #include "db/internal_stats.h" |
32 | | #include "db/log_reader.h" |
33 | | #include "db/log_writer.h" |
34 | | #include "db/manifest_ops.h" |
35 | | #include "db/memtable.h" |
36 | | #include "db/merge_context.h" |
37 | | #include "db/merge_helper.h" |
38 | | #include "db/pinned_iterators_manager.h" |
39 | | #include "db/table_cache.h" |
40 | | #include "db/version_builder.h" |
41 | | #include "db/version_edit.h" |
42 | | #include "db/version_edit_handler.h" |
43 | | #include "db/wide/wide_columns_helper.h" |
44 | | #include "file/file_util.h" |
45 | | #include "table/compaction_merging_iterator.h" |
46 | | |
47 | | #if USE_COROUTINES |
48 | | #include "folly/coro/BlockingWait.h" |
49 | | #include "folly/coro/Collect.h" |
50 | | #endif |
51 | | #include "file/filename.h" |
52 | | #include "file/random_access_file_reader.h" |
53 | | #include "file/read_write_util.h" |
54 | | #include "file/writable_file_writer.h" |
55 | | #include "logging/logging.h" |
56 | | #include "monitoring/file_read_sample.h" |
57 | | #include "monitoring/perf_context_imp.h" |
58 | | #include "monitoring/persistent_stats_history.h" |
59 | | #include "options/options_helper.h" |
60 | | #include "rocksdb/env.h" |
61 | | #include "rocksdb/merge_operator.h" |
62 | | #include "rocksdb/write_buffer_manager.h" |
63 | | #include "table/format.h" |
64 | | #include "table/get_context.h" |
65 | | #include "table/internal_iterator.h" |
66 | | #include "table/merging_iterator.h" |
67 | | #include "table/meta_blocks.h" |
68 | | #include "table/multiget_context.h" |
69 | | #include "table/plain/plain_table_factory.h" |
70 | | #include "table/table_reader.h" |
71 | | #include "table/two_level_iterator.h" |
72 | | #include "table/unique_id_impl.h" |
73 | | #include "test_util/sync_point.h" |
74 | | #include "util/cast_util.h" |
75 | | #include "util/coding.h" |
76 | | #include "util/coro_utils.h" |
77 | | #include "util/stop_watch.h" |
78 | | #include "util/string_util.h" |
79 | | #include "util/user_comparator_wrapper.h" |
80 | | |
81 | | // Generate the regular and coroutine versions of some methods by |
82 | | // including version_set_sync_and_async.h twice |
83 | | // Macros in the header will expand differently based on whether |
84 | | // WITH_COROUTINES or WITHOUT_COROUTINES is defined |
85 | | // clang-format off |
86 | | #define WITHOUT_COROUTINES |
87 | | #include "db/version_set_sync_and_async.h" |
88 | | #undef WITHOUT_COROUTINES |
89 | | #define WITH_COROUTINES |
90 | | #include "db/version_set_sync_and_async.h" |
91 | | #undef WITH_COROUTINES |
92 | | // clang-format on |
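
The double include above stamps out two variants of each method body: one synchronous, one coroutine-based. A minimal sketch of the technique follows, with a hypothetical header and function name (not the actual macros used by util/coro_utils.h and version_set_sync_and_async.h):

// sync_and_async_example.h -- intentionally has no include guard, so it can
// be included once per configuration macro.
#if defined(WITHOUT_COROUTINES)
#define EXAMPLE_FN_NAME(name) name
#elif defined(WITH_COROUTINES)
#define EXAMPLE_FN_NAME(name) name##Coroutine
#endif

// Expands to Lookup() in the first include and LookupCoroutine() in the
// second, from a single maintained body.
int EXAMPLE_FN_NAME(Lookup)(int key);

#undef EXAMPLE_FN_NAME
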
93 | | |
94 | | namespace ROCKSDB_NAMESPACE { |
95 | | |
96 | | namespace { |
97 | | |
98 | | using ScanOptionsMap = std::unordered_map<size_t, MultiScanArgs>; |
99 | | |
100 | | // Find a file in the LevelFilesBrief data structure
101 | | // within an index range defined by left and right.
102 | | int FindFileInRange(const InternalKeyComparator& icmp, |
103 | | const LevelFilesBrief& file_level, const Slice& key, |
104 | 3.70k | uint32_t left, uint32_t right) { |
105 | 5.58k | auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { |
106 | 5.58k | return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; |
107 | 5.58k | }; |
108 | 3.70k | const auto& b = file_level.files; |
109 | 3.70k | return static_cast<int>(std::lower_bound(b + left, b + right, key, cmp) - b); |
110 | 3.70k | } |
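
FindFileInRange is a thin wrapper around std::lower_bound: it returns the index of the earliest file in [left, right) whose largest key is >= the search key. A self-contained sketch of the same contract, using plain string keys and a hypothetical FileRange struct in place of FdWithKeyRange and the internal key comparator:

#include <algorithm>
#include <string>
#include <vector>

namespace findfile_sketch {
struct FileRange {
  std::string smallest_key;
  std::string largest_key;
};

int FindFileInRange(const std::vector<FileRange>& files,
                    const std::string& key, size_t left, size_t right) {
  // Partition point: first file whose largest key is not less than key.
  auto cmp = [](const FileRange& f, const std::string& k) {
    return f.largest_key < k;
  };
  auto begin = files.begin();
  return static_cast<int>(
      std::lower_bound(begin + left, begin + right, key, cmp) - begin);
}
}  // namespace findfile_sketch
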
111 | | |
112 | | Status OverlapWithIterator(const Comparator* ucmp, |
113 | | const Slice& smallest_user_key, |
114 | | const Slice& largest_user_key, |
115 | 2.93k | InternalIterator* iter, bool* overlap) { |
116 | 2.93k | InternalKey range_start(smallest_user_key, kMaxSequenceNumber, |
117 | 2.93k | kValueTypeForSeek); |
118 | 2.93k | iter->Seek(range_start.Encode()); |
119 | 2.93k | if (!iter->status().ok()) { |
120 | 0 | return iter->status(); |
121 | 0 | } |
122 | | |
123 | 2.93k | *overlap = false; |
124 | 2.93k | if (iter->Valid()) { |
125 | 2.92k | ParsedInternalKey seek_result; |
126 | 2.92k | Status s = ParseInternalKey(iter->key(), &seek_result, |
127 | 2.92k | false /* log_err_key */); // TODO |
128 | 2.92k | if (!s.ok()) { |
129 | 0 | return s; |
130 | 0 | } |
131 | | |
132 | 2.92k | if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= |
133 | 2.92k | 0) { |
134 | 2.83k | *overlap = true; |
135 | 2.83k | } |
136 | 2.92k | } |
137 | | |
138 | 2.93k | return iter->status(); |
139 | 2.93k | } |
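
Stripped of internal-key parsing and error handling, OverlapWithIterator reduces to: seek to the smallest key of the range, and report overlap iff the iterator lands on a key no greater than the range's largest key. A sketch of that test over a std::map standing in for the InternalIterator (an assumed simplification; sequence numbers and timestamps are ignored):

#include <map>
#include <string>

namespace overlap_iter_sketch {
bool OverlapsRange(const std::map<std::string, std::string>& data,
                   const std::string& smallest_user_key,
                   const std::string& largest_user_key) {
  // Equivalent of iter->Seek(range_start): first entry >= range start.
  auto it = data.lower_bound(smallest_user_key);
  // Overlap iff the landed key does not pass the end of the range.
  return it != data.end() && it->first <= largest_user_key;
}
}  // namespace overlap_iter_sketch
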
140 | | |
141 | | // Class to help choose the next file to search for a particular key.
142 | | // Searches and returns files level by level. |
143 | | // We can search level-by-level since entries never hop across |
144 | | // levels. Therefore we are guaranteed that if we find data |
145 | | // in a smaller level, later levels are irrelevant (unless we |
146 | | // are MergeInProgress). |
147 | | class FilePicker { |
148 | | public: |
149 | | FilePicker(const Slice& user_key, const Slice& ikey, |
150 | | autovector<LevelFilesBrief>* file_levels, unsigned int num_levels, |
151 | | FileIndexer* file_indexer, const Comparator* user_comparator, |
152 | | const InternalKeyComparator* internal_comparator) |
153 | 2.43k | : num_levels_(num_levels), |
154 | 2.43k | curr_level_(static_cast<unsigned int>(-1)), |
155 | 2.43k | returned_file_level_(static_cast<unsigned int>(-1)), |
156 | 2.43k | hit_file_level_(static_cast<unsigned int>(-1)), |
157 | 2.43k | search_left_bound_(0), |
158 | 2.43k | search_right_bound_(FileIndexer::kLevelMaxIndex), |
159 | 2.43k | level_files_brief_(file_levels), |
160 | 2.43k | is_hit_file_last_in_level_(false), |
161 | 2.43k | curr_file_level_(nullptr), |
162 | 2.43k | user_key_(user_key), |
163 | 2.43k | ikey_(ikey), |
164 | 2.43k | file_indexer_(file_indexer), |
165 | 2.43k | user_comparator_(user_comparator), |
166 | 2.43k | internal_comparator_(internal_comparator) { |
167 | | // Setup member variables to search first level. |
168 | 2.43k | search_ended_ = !PrepareNextLevel(); |
169 | 2.43k | if (!search_ended_) { |
170 | | // Prefetch Level 0 table data to avoid cache miss if possible. |
171 | 3.42k | for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { |
172 | 2.06k | auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; |
173 | 2.06k | if (r) { |
174 | 2.06k | r->Prepare(ikey); |
175 | 2.06k | } |
176 | 2.06k | } |
177 | 1.35k | } |
178 | 2.43k | } |
179 | | |
180 | 0 | int GetCurrentLevel() const { return curr_level_; } |
181 | | |
182 | 2.86k | FdWithKeyRange* GetNextFile() { |
183 | 3.28k | while (!search_ended_) { // Loops over different levels. |
184 | 2.08k | while (curr_index_in_curr_level_ < curr_file_level_->num_files) { |
185 | | // Loops over all files in current level. |
186 | 1.70k | FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; |
187 | 1.70k | hit_file_level_ = curr_level_; |
188 | 1.70k | is_hit_file_last_in_level_ = |
189 | 1.70k | curr_index_in_curr_level_ == curr_file_level_->num_files - 1; |
190 | 1.70k | int cmp_largest = -1; |
191 | | |
192 | | // Do key range filtering of files and/or fractional cascading if:
193 | | // (1) not all the files are in level 0, or
194 | | // (2) there are more than 3 current level files
195 | | // If there are only 3 or fewer current level files in the system, we
196 | | // skip the key range filtering. In that case the system is more
197 | | // likely tuned to minimize the number of tables touched by each
198 | | // query, so it is unlikely that key range filtering is more
199 | | // efficient than querying the files.
200 | 1.70k | if (num_levels_ > 1 || curr_file_level_->num_files > 3) { |
201 | | // Check if key is within a file's range. If search left bound and |
202 | | // right bound point to the same file, we are sure key falls in
203 | | // range. |
204 | 978 | assert(curr_level_ == 0 || |
205 | 978 | curr_index_in_curr_level_ == start_index_in_curr_level_ || |
206 | 978 | user_comparator_->CompareWithoutTimestamp( |
207 | 978 | user_key_, ExtractUserKey(f->smallest_key)) <= 0); |
208 | | |
209 | 978 | int cmp_smallest = user_comparator_->CompareWithoutTimestamp( |
210 | 978 | user_key_, ExtractUserKey(f->smallest_key)); |
211 | 978 | if (cmp_smallest >= 0) { |
212 | 774 | cmp_largest = user_comparator_->CompareWithoutTimestamp( |
213 | 774 | user_key_, ExtractUserKey(f->largest_key)); |
214 | 774 | } |
215 | | |
216 | | // Setup file search bound for the next level based on the |
217 | | // comparison results |
218 | 978 | if (curr_level_ > 0) { |
219 | 464 | file_indexer_->GetNextLevelIndex( |
220 | 464 | curr_level_, curr_index_in_curr_level_, cmp_smallest, |
221 | 464 | cmp_largest, &search_left_bound_, &search_right_bound_); |
222 | 464 | } |
223 | | // Key falls out of current file's range |
224 | 978 | if (cmp_smallest < 0 || cmp_largest > 0) { |
225 | 223 | if (curr_level_ == 0) { |
226 | 189 | ++curr_index_in_curr_level_; |
227 | 189 | continue; |
228 | 189 | } else { |
229 | | // Search next level. |
230 | 34 | break; |
231 | 34 | } |
232 | 223 | } |
233 | 978 | } |
234 | | |
235 | 1.47k | returned_file_level_ = curr_level_; |
236 | 1.47k | if (curr_level_ > 0 && cmp_largest < 0) { |
237 | | // No more files to search in this level. |
238 | 161 | search_ended_ = !PrepareNextLevel(); |
239 | 1.31k | } else { |
240 | 1.31k | ++curr_index_in_curr_level_; |
241 | 1.31k | } |
242 | 1.47k | return f; |
243 | 1.70k | } |
244 | | // Start searching next level. |
245 | 422 | search_ended_ = !PrepareNextLevel(); |
246 | 422 | } |
247 | | // Search ended. |
248 | 1.38k | return nullptr; |
249 | 2.86k | } |
250 | | |
251 | | // getter for current file level |
252 | | // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts |
253 | 5.46k | unsigned int GetHitFileLevel() { return hit_file_level_; } |
254 | | |
255 | | // Returns true if the most recent "hit file" (i.e., one returned by |
256 | | // GetNextFile()) is at the last index in its level. |
257 | 1.47k | bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } |
258 | | |
259 | | private: |
260 | | unsigned int num_levels_; |
261 | | unsigned int curr_level_; |
262 | | unsigned int returned_file_level_; |
263 | | unsigned int hit_file_level_; |
264 | | int32_t search_left_bound_; |
265 | | int32_t search_right_bound_; |
266 | | autovector<LevelFilesBrief>* level_files_brief_; |
267 | | bool search_ended_; |
268 | | bool is_hit_file_last_in_level_; |
269 | | LevelFilesBrief* curr_file_level_; |
270 | | unsigned int curr_index_in_curr_level_; |
271 | | unsigned int start_index_in_curr_level_; |
272 | | Slice user_key_; |
273 | | Slice ikey_; |
274 | | FileIndexer* file_indexer_; |
275 | | const Comparator* user_comparator_; |
276 | | const InternalKeyComparator* internal_comparator_; |
277 | | |
278 | | // Setup local variables to search next level. |
279 | | // Returns false if there are no more levels to search. |
280 | 3.01k | bool PrepareNextLevel() { |
281 | 3.01k | curr_level_++; |
282 | 5.72k | while (curr_level_ < num_levels_) { |
283 | 4.18k | curr_file_level_ = &(*level_files_brief_)[curr_level_]; |
284 | 4.18k | if (curr_file_level_->num_files == 0) { |
285 | | // When current level is empty, the search bound generated from upper |
286 | | // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is |
287 | | // also empty. |
288 | 2.70k | assert(search_left_bound_ == 0); |
289 | 2.70k | assert(search_right_bound_ == -1 || |
290 | 2.70k | search_right_bound_ == FileIndexer::kLevelMaxIndex); |
291 | | // Since current level is empty, it will need to search all files in |
292 | | // the next level |
293 | 2.70k | search_left_bound_ = 0; |
294 | 2.70k | search_right_bound_ = FileIndexer::kLevelMaxIndex; |
295 | 2.70k | curr_level_++; |
296 | 2.70k | continue; |
297 | 2.70k | } |
298 | | |
299 | | // Some files may overlap each other. We find |
300 | | // all files that overlap user_key and process them in order from |
301 | | // newest to oldest. In the context of merge-operator, this can occur at |
302 | | // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes |
303 | | // are always compacted into a single entry). |
304 | 1.48k | int32_t start_index; |
305 | 1.48k | if (curr_level_ == 0) { |
306 | | // On Level-0, we read through all files to check for overlap. |
307 | 1.01k | start_index = 0; |
308 | 1.01k | } else { |
309 | | // On Level-n (n>=1), files are sorted. Binary search to find the |
310 | | // earliest file whose largest key >= ikey. Search left bound and |
311 | | // right bound are used to narrow the range. |
312 | 471 | if (search_left_bound_ <= search_right_bound_) { |
313 | 471 | if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { |
314 | 471 | search_right_bound_ = |
315 | 471 | static_cast<int32_t>(curr_file_level_->num_files) - 1; |
316 | 471 | } |
317 | | // `search_right_bound_` is an inclusive upper-bound, but since it was |
318 | | // determined based on user key, it is still possible the lookup key |
319 | | // falls to the right of `search_right_bound_`'s corresponding file. |
320 | | // So, pass a limit one higher, which allows us to detect this case. |
321 | 471 | start_index = |
322 | 471 | FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, |
323 | 471 | static_cast<uint32_t>(search_left_bound_), |
324 | 471 | static_cast<uint32_t>(search_right_bound_) + 1); |
325 | 471 | if (start_index == search_right_bound_ + 1) { |
326 | | // `ikey_` comes after `search_right_bound_`. The lookup key does |
327 | | // not exist on this level, so let's skip this level and do a full |
328 | | // binary search on the next level. |
329 | 7 | search_left_bound_ = 0; |
330 | 7 | search_right_bound_ = FileIndexer::kLevelMaxIndex; |
331 | 7 | curr_level_++; |
332 | 7 | continue; |
333 | 7 | } |
334 | 471 | } else { |
335 | | // search_left_bound > search_right_bound, key does not exist in |
336 | | // this level. Since no comparison is done in this level, it will |
337 | | // need to search all files in the next level. |
338 | 0 | search_left_bound_ = 0; |
339 | 0 | search_right_bound_ = FileIndexer::kLevelMaxIndex; |
340 | 0 | curr_level_++; |
341 | 0 | continue; |
342 | 0 | } |
343 | 471 | } |
344 | 1.47k | start_index_in_curr_level_ = start_index; |
345 | 1.47k | curr_index_in_curr_level_ = start_index; |
346 | | |
347 | 1.47k | return true; |
348 | 1.48k | } |
349 | | // curr_level_ = num_levels_. So, no more levels to search. |
350 | 1.54k | return false; |
351 | 3.01k | } |
352 | | }; |
353 | | } // anonymous namespace |
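
A compact way to see what FilePicker does: visit candidate files level by level, scanning all overlapping L0 files newest-to-oldest and binary-searching at most one file per deeper level, stopping as soon as a visit finds the final value. The sketch below uses hypothetical types and omits the FileIndexer-based bound narrowing (fractional cascading) that the real class threads between levels:

#include <algorithm>
#include <string>
#include <vector>

namespace picker_sketch {
struct FileRange {
  std::string smallest_key, largest_key;
};
using Level = std::vector<FileRange>;  // levels > 0: sorted, disjoint

// Visit returns true when it found the final value for the key.
template <typename Visit>
bool ForEachCandidateFile(const std::vector<Level>& levels,
                          const std::string& key, Visit visit) {
  for (size_t level = 0; level < levels.size(); ++level) {
    const Level& files = levels[level];
    if (level == 0) {
      // L0 files may overlap, so every file covering the key is a
      // candidate, visited newest to oldest.
      for (const FileRange& f : files) {
        if (f.smallest_key <= key && key <= f.largest_key && visit(f)) {
          return true;
        }
      }
    } else {
      // Deeper levels are sorted and non-overlapping: one binary search on
      // largest_key yields the only file that can contain the key.
      auto it = std::lower_bound(files.begin(), files.end(), key,
                                 [](const FileRange& f, const std::string& k) {
                                   return f.largest_key < k;
                                 });
      if (it != files.end() && it->smallest_key <= key && visit(*it)) {
        return true;
      }
    }
  }
  return false;  // key not found in any level
}
}  // namespace picker_sketch

In the real class the visit step is a table-cache probe, and FileIndexer-derived bounds shrink each deeper level's binary-search window instead of always scanning from index 0.
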
354 | | |
355 | | class FilePickerMultiGet { |
356 | | private: |
357 | | struct FilePickerContext; |
358 | | |
359 | | public: |
360 | | FilePickerMultiGet(MultiGetRange* range, |
361 | | autovector<LevelFilesBrief>* file_levels, |
362 | | unsigned int num_levels, FileIndexer* file_indexer, |
363 | | const Comparator* user_comparator, |
364 | | const InternalKeyComparator* internal_comparator) |
365 | 0 | : num_levels_(num_levels), |
366 | 0 | curr_level_(static_cast<unsigned int>(-1)), |
367 | 0 | returned_file_level_(static_cast<unsigned int>(-1)), |
368 | 0 | hit_file_level_(static_cast<unsigned int>(-1)), |
369 | 0 | range_(*range, range->begin(), range->end()), |
370 | 0 | maybe_repeat_key_(false), |
371 | 0 | current_level_range_(*range, range->begin(), range->end()), |
372 | 0 | current_file_range_(*range, range->begin(), range->end()), |
373 | 0 | batch_iter_(range->begin()), |
374 | 0 | batch_iter_prev_(range->begin()), |
375 | 0 | upper_key_(range->begin()), |
376 | 0 | level_files_brief_(file_levels), |
377 | 0 | is_hit_file_last_in_level_(false), |
378 | 0 | curr_file_level_(nullptr), |
379 | 0 | file_indexer_(file_indexer), |
380 | 0 | user_comparator_(user_comparator), |
381 | 0 | internal_comparator_(internal_comparator), |
382 | 0 | hit_file_(nullptr) { |
383 | 0 | for (auto iter = range_.begin(); iter != range_.end(); ++iter) { |
384 | 0 | fp_ctx_array_[iter.index()] = |
385 | 0 | FilePickerContext(0, FileIndexer::kLevelMaxIndex); |
386 | 0 | } |
387 | | |
388 | | // Setup member variables to search first level. |
389 | 0 | search_ended_ = !PrepareNextLevel(); |
390 | 0 | if (!search_ended_) { |
391 | | // REVISIT |
392 | | // Prefetch Level 0 table data to avoid cache miss if possible. |
393 | | // As of now, only PlainTableReader and CuckooTableReader do any |
394 | | // prefetching. This may not be necessary anymore once we implement |
395 | | // batching in those table readers |
396 | 0 | for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { |
397 | 0 | auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; |
398 | 0 | if (r) { |
399 | 0 | for (auto iter = range_.begin(); iter != range_.end(); ++iter) { |
400 | 0 | r->Prepare(iter->ikey); |
401 | 0 | } |
402 | 0 | } |
403 | 0 | } |
404 | 0 | } |
405 | 0 | } |
406 | | |
407 | | FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other) |
408 | | : num_levels_(other.num_levels_), |
409 | | curr_level_(other.curr_level_), |
410 | | returned_file_level_(other.returned_file_level_), |
411 | | hit_file_level_(other.hit_file_level_), |
412 | | fp_ctx_array_(other.fp_ctx_array_), |
413 | | range_(*range, range->begin(), range->end()), |
414 | | maybe_repeat_key_(false), |
415 | | current_level_range_(*range, range->begin(), range->end()), |
416 | | current_file_range_(*range, range->begin(), range->end()), |
417 | | batch_iter_(range->begin()), |
418 | | batch_iter_prev_(range->begin()), |
419 | | upper_key_(range->begin()), |
420 | | level_files_brief_(other.level_files_brief_), |
421 | | is_hit_file_last_in_level_(false), |
422 | | curr_file_level_(other.curr_file_level_), |
423 | | file_indexer_(other.file_indexer_), |
424 | | user_comparator_(other.user_comparator_), |
425 | | internal_comparator_(other.internal_comparator_), |
426 | 0 | hit_file_(nullptr) { |
427 | 0 | PrepareNextLevelForSearch(); |
428 | 0 | } |
429 | | |
430 | 0 | int GetCurrentLevel() const { return curr_level_; } |
431 | | |
432 | 0 | void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); } |
433 | | |
434 | 0 | FdWithKeyRange* GetNextFileInLevel() { |
435 | 0 | if (batch_iter_ == current_level_range_.end() || search_ended_) { |
436 | 0 | hit_file_ = nullptr; |
437 | 0 | return nullptr; |
438 | 0 | } else { |
439 | 0 | if (maybe_repeat_key_) { |
440 | 0 | maybe_repeat_key_ = false; |
441 | | // Check if we found the final value for the last key in the |
442 | | // previous lookup range. If we did, then there's no need to look |
443 | | // any further for that key, so advance batch_iter_. Else, keep |
444 | | // batch_iter_ positioned on that key so we look it up again in |
445 | | // the next file |
446 | | // For L0, always advance the key because we will look in the next |
447 | | // file regardless, for all keys not yet found
448 | 0 | if (current_level_range_.CheckKeyDone(batch_iter_) || |
449 | 0 | curr_level_ == 0) { |
450 | 0 | batch_iter_ = upper_key_; |
451 | 0 | } |
452 | 0 | } |
453 | | // batch_iter_prev_ will become the start key for the next file |
454 | | // lookup |
455 | 0 | batch_iter_prev_ = batch_iter_; |
456 | 0 | } |
457 | | |
458 | 0 | MultiGetRange next_file_range(current_level_range_, batch_iter_prev_, |
459 | 0 | current_level_range_.end()); |
460 | 0 | size_t curr_file_index = |
461 | 0 | (batch_iter_ != current_level_range_.end()) |
462 | 0 | ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level |
463 | 0 | : curr_file_level_->num_files; |
464 | 0 | FdWithKeyRange* f; |
465 | 0 | bool is_last_key_in_file; |
466 | 0 | if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f, |
467 | 0 | &is_last_key_in_file)) { |
468 | 0 | hit_file_ = nullptr; |
469 | 0 | return nullptr; |
470 | 0 | } else { |
471 | 0 | if (is_last_key_in_file) { |
472 | | // Since cmp_largest is 0, batch_iter_ still points to the last key |
473 | | // that falls in this file, instead of the next one. Increment |
474 | | // the file index for all keys between batch_iter_ and upper_key_ |
475 | 0 | auto tmp_iter = batch_iter_; |
476 | 0 | while (tmp_iter != upper_key_) { |
477 | 0 | ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level); |
478 | 0 | ++tmp_iter; |
479 | 0 | } |
480 | 0 | maybe_repeat_key_ = true; |
481 | 0 | } |
482 | | // Set the range for this file |
483 | 0 | current_file_range_ = |
484 | 0 | MultiGetRange(next_file_range, batch_iter_prev_, upper_key_); |
485 | 0 | returned_file_level_ = curr_level_; |
486 | 0 | hit_file_level_ = curr_level_; |
487 | 0 | is_hit_file_last_in_level_ = |
488 | 0 | curr_file_index == curr_file_level_->num_files - 1; |
489 | 0 | hit_file_ = f; |
490 | 0 | return f; |
491 | 0 | } |
492 | 0 | } |
493 | | |
494 | | // getter for current file level |
495 | | // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts |
496 | 0 | unsigned int GetHitFileLevel() { return hit_file_level_; } |
497 | | |
498 | 0 | FdWithKeyRange* GetHitFile() { return hit_file_; } |
499 | | |
500 | | // Returns true if the most recent "hit file" (i.e., one returned by |
501 | | // GetNextFile()) is at the last index in its level. |
502 | 0 | bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } |
503 | | |
504 | 0 | bool KeyMaySpanNextFile() { return maybe_repeat_key_; } |
505 | | |
506 | 0 | bool IsSearchEnded() { return search_ended_; } |
507 | | |
508 | 0 | const MultiGetRange& CurrentFileRange() { return current_file_range_; } |
509 | | |
510 | 0 | bool RemainingOverlapInLevel() { |
511 | 0 | return !current_level_range_.Suffix(current_file_range_).empty(); |
512 | 0 | } |
513 | | |
514 | 0 | MultiGetRange& GetRange() { return range_; } |
515 | | |
516 | 0 | void ReplaceRange(const MultiGetRange& other) { |
517 | 0 | assert(hit_file_ == nullptr); |
518 | 0 | range_ = other; |
519 | 0 | current_level_range_ = other; |
520 | 0 | } |
521 | | |
522 | | FilePickerMultiGet(FilePickerMultiGet&& other) |
523 | | : num_levels_(other.num_levels_), |
524 | | curr_level_(other.curr_level_), |
525 | | returned_file_level_(other.returned_file_level_), |
526 | | hit_file_level_(other.hit_file_level_), |
527 | | fp_ctx_array_(std::move(other.fp_ctx_array_)), |
528 | | range_(std::move(other.range_)), |
529 | | maybe_repeat_key_(other.maybe_repeat_key_), |
530 | | current_level_range_(std::move(other.current_level_range_)), |
531 | | current_file_range_(std::move(other.current_file_range_)), |
532 | | batch_iter_(other.batch_iter_, ¤t_level_range_), |
533 | | batch_iter_prev_(other.batch_iter_prev_, ¤t_level_range_), |
534 | | upper_key_(other.upper_key_, ¤t_level_range_), |
535 | | level_files_brief_(other.level_files_brief_), |
536 | | search_ended_(other.search_ended_), |
537 | | is_hit_file_last_in_level_(other.is_hit_file_last_in_level_), |
538 | | curr_file_level_(other.curr_file_level_), |
539 | | file_indexer_(other.file_indexer_), |
540 | | user_comparator_(other.user_comparator_), |
541 | | internal_comparator_(other.internal_comparator_), |
542 | 0 | hit_file_(other.hit_file_) {} |
543 | | |
544 | | private: |
545 | | unsigned int num_levels_; |
546 | | unsigned int curr_level_; |
547 | | unsigned int returned_file_level_; |
548 | | unsigned int hit_file_level_; |
549 | | |
550 | | struct FilePickerContext { |
551 | | int32_t search_left_bound; |
552 | | int32_t search_right_bound; |
553 | | unsigned int curr_index_in_curr_level; |
554 | | unsigned int start_index_in_curr_level; |
555 | | |
556 | | FilePickerContext(int32_t left, int32_t right) |
557 | 0 | : search_left_bound(left), |
558 | 0 | search_right_bound(right), |
559 | 0 | curr_index_in_curr_level(0), |
560 | 0 | start_index_in_curr_level(0) {} |
561 | | |
562 | | FilePickerContext() = default; |
563 | | }; |
564 | | std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_; |
565 | | MultiGetRange range_; |
566 | | bool maybe_repeat_key_; |
567 | | MultiGetRange current_level_range_; |
568 | | MultiGetRange current_file_range_; |
569 | | // Iterator to iterate through the keys in a MultiGet batch. It gets reset
570 | | // at the beginning of each level. Each call to GetNextFile() will position |
571 | | // batch_iter_ at or right after the last key that was found in the returned |
572 | | // SST file |
573 | | MultiGetRange::Iterator batch_iter_; |
574 | | // An iterator that records the previous position of batch_iter_, i.e. the last
575 | | // key found in the previous SST file, in order to serve as the start of |
576 | | // the batch key range for the next SST file |
577 | | MultiGetRange::Iterator batch_iter_prev_; |
578 | | MultiGetRange::Iterator upper_key_; |
579 | | autovector<LevelFilesBrief>* level_files_brief_; |
580 | | bool search_ended_; |
581 | | bool is_hit_file_last_in_level_; |
582 | | LevelFilesBrief* curr_file_level_; |
583 | | FileIndexer* file_indexer_; |
584 | | const Comparator* user_comparator_; |
585 | | const InternalKeyComparator* internal_comparator_; |
586 | | FdWithKeyRange* hit_file_; |
587 | | |
588 | | // Iterates through files in the current level until it finds a file that |
589 | | // contains at least one key from the MultiGet batch |
590 | | bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range, |
591 | | size_t* file_index, FdWithKeyRange** fd, |
592 | 0 | bool* is_last_key_in_file) { |
593 | 0 | size_t curr_file_index = *file_index; |
594 | 0 | FdWithKeyRange* f = nullptr; |
595 | 0 | bool file_hit = false; |
596 | 0 | int cmp_largest = -1; |
597 | 0 | int cmp_smallest = -1; |
598 | 0 | if (curr_file_index >= curr_file_level_->num_files) { |
599 | | // In the unlikely case the next key is a duplicate of the current key, |
600 | | // and the current key is the last in the level and the internal key |
601 | | // was not found, we need to skip lookup for the remaining keys and |
602 | | // reset the search bounds |
603 | 0 | if (batch_iter_ != current_level_range_.end()) { |
604 | | #ifndef NDEBUG |
605 | | if (curr_level_ < num_levels_ + 1) { |
606 | | if ((*level_files_brief_)[curr_level_].num_files == 0) { |
607 | | struct FilePickerContext& fp_ctx = |
608 | | fp_ctx_array_[batch_iter_.index()]; |
609 | | |
610 | | assert(fp_ctx.search_left_bound == 0); |
611 | | assert(fp_ctx.search_right_bound == -1 || |
612 | | fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex); |
613 | | } |
614 | | } |
615 | | #endif // NDEBUG
616 | |
617 | 0 | ++batch_iter_; |
618 | 0 | for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { |
619 | 0 | struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; |
620 | 0 | fp_ctx.search_left_bound = 0; |
621 | 0 | fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; |
622 | 0 | } |
623 | 0 | } |
624 | 0 | return false; |
625 | 0 | } |
626 | | // Loops over keys in the MultiGet batch until it finds a file with |
627 | | // at least one of the keys. Then it keeps moving forward until the
628 | | // last key in the batch that falls in that file |
629 | 0 | while (batch_iter_ != current_level_range_.end() && |
630 | 0 | (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level == |
631 | 0 | curr_file_index || |
632 | 0 | !file_hit)) { |
633 | 0 | struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; |
634 | 0 | f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level]; |
635 | 0 | Slice& user_key = batch_iter_->ukey_without_ts; |
636 | | |
637 | | // Do key range filtering of files and/or fractional cascading if:
638 | | // (1) not all the files are in level 0, or
639 | | // (2) there are more than 3 current level files
640 | | // If there are only 3 or fewer current level files in the system, we
641 | | // skip the key range filtering. In that case the system is more
642 | | // likely tuned to minimize the number of tables touched by each
643 | | // query, so it is unlikely that key range filtering is more
644 | | // efficient than querying the files.
645 | 0 | if (num_levels_ > 1 || curr_file_level_->num_files > 3) { |
646 | | // Check if key is within a file's range. If search left bound and |
647 | | // right bound point to the same file, we are sure key falls in
648 | | // range. |
649 | 0 | cmp_smallest = user_comparator_->CompareWithoutTimestamp( |
650 | 0 | user_key, false, ExtractUserKey(f->smallest_key), true); |
651 | |
652 | 0 | assert(curr_level_ == 0 || |
653 | 0 | fp_ctx.curr_index_in_curr_level == |
654 | 0 | fp_ctx.start_index_in_curr_level || |
655 | 0 | cmp_smallest <= 0); |
656 | |
657 | 0 | if (cmp_smallest >= 0) { |
658 | 0 | cmp_largest = user_comparator_->CompareWithoutTimestamp( |
659 | 0 | user_key, false, ExtractUserKey(f->largest_key), true); |
660 | 0 | } else { |
661 | 0 | cmp_largest = -1; |
662 | 0 | } |
663 | | |
664 | | // Setup file search bound for the next level based on the |
665 | | // comparison results |
666 | 0 | if (curr_level_ > 0) { |
667 | 0 | file_indexer_->GetNextLevelIndex( |
668 | 0 | curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest, |
669 | 0 | cmp_largest, &fp_ctx.search_left_bound, |
670 | 0 | &fp_ctx.search_right_bound); |
671 | 0 | } |
672 | | // Key falls out of current file's range |
673 | 0 | if (cmp_smallest < 0 || cmp_largest > 0) { |
674 | 0 | next_file_range->SkipKey(batch_iter_); |
675 | 0 | } else { |
676 | 0 | file_hit = true; |
677 | 0 | } |
678 | 0 | } else { |
679 | 0 | file_hit = true; |
680 | 0 | } |
681 | 0 | if (cmp_largest == 0) { |
682 | | // cmp_largest is 0, which means the next key will not be in this |
683 | | // file, so stop looking further. However, it's possible there are
684 | | // duplicates in the batch, so find the upper bound for the batch |
685 | | // in this file (upper_key_) by skipping past the duplicates. We |
686 | | // leave batch_iter_ as is since we may have to pick up from there |
687 | | // for the next file, if this file has a merge value rather than |
688 | | // final value |
689 | 0 | upper_key_ = batch_iter_; |
690 | 0 | ++upper_key_; |
691 | 0 | while (upper_key_ != current_level_range_.end() && |
692 | 0 | user_comparator_->CompareWithoutTimestamp( |
693 | 0 | batch_iter_->ukey_without_ts, false, |
694 | 0 | upper_key_->ukey_without_ts, false) == 0) { |
695 | 0 | if (curr_level_ > 0) { |
696 | 0 | struct FilePickerContext& ctx = fp_ctx_array_[upper_key_.index()]; |
697 | 0 | file_indexer_->GetNextLevelIndex( |
698 | 0 | curr_level_, ctx.curr_index_in_curr_level, cmp_smallest, |
699 | 0 | cmp_largest, &ctx.search_left_bound, &ctx.search_right_bound); |
700 | 0 | } |
701 | 0 | ++upper_key_; |
702 | 0 | } |
703 | 0 | break; |
704 | 0 | } else { |
705 | 0 | if (curr_level_ == 0) { |
706 | | // We need to look through all files in level 0 |
707 | 0 | ++fp_ctx.curr_index_in_curr_level; |
708 | 0 | } |
709 | 0 | ++batch_iter_; |
710 | 0 | } |
711 | 0 | if (!file_hit) { |
712 | 0 | curr_file_index = |
713 | 0 | (batch_iter_ != current_level_range_.end()) |
714 | 0 | ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level |
715 | 0 | : curr_file_level_->num_files; |
716 | 0 | } |
717 | 0 | } |
718 | |
719 | 0 | *fd = f; |
720 | 0 | *file_index = curr_file_index; |
721 | 0 | *is_last_key_in_file = cmp_largest == 0; |
722 | 0 | if (!*is_last_key_in_file) { |
723 | | // If the largest key in the batch overlapping the file is not the |
724 | | // largest key in the file, upper_key_ would not have been updated so
725 | | // update it here |
726 | 0 | upper_key_ = batch_iter_; |
727 | 0 | } |
728 | 0 | return file_hit; |
729 | 0 | } |
730 | | |
731 | | // Setup local variables to search next level. |
732 | | // Returns false if there are no more levels to search. |
733 | 0 | bool PrepareNextLevel() { |
734 | 0 | if (curr_level_ == 0) { |
735 | 0 | MultiGetRange::Iterator mget_iter = current_level_range_.begin(); |
736 | 0 | if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < |
737 | 0 | curr_file_level_->num_files) { |
738 | 0 | batch_iter_prev_ = current_level_range_.begin(); |
739 | 0 | upper_key_ = batch_iter_ = current_level_range_.begin(); |
740 | 0 | return true; |
741 | 0 | } |
742 | 0 | } |
743 | | |
744 | 0 | curr_level_++; |
745 | | // Reset key range to saved value |
746 | 0 | while (curr_level_ < num_levels_) { |
747 | 0 | bool level_contains_keys = false; |
748 | 0 | curr_file_level_ = &(*level_files_brief_)[curr_level_]; |
749 | 0 | if (curr_file_level_->num_files == 0) { |
750 | | // When current level is empty, the search bound generated from upper |
751 | | // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is |
752 | | // also empty. |
753 | |
754 | 0 | for (auto mget_iter = current_level_range_.begin(); |
755 | 0 | mget_iter != current_level_range_.end(); ++mget_iter) { |
756 | 0 | struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()]; |
757 | |
758 | 0 | assert(fp_ctx.search_left_bound == 0); |
759 | 0 | assert(fp_ctx.search_right_bound == -1 || |
760 | 0 | fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex); |
761 | | // Since current level is empty, it will need to search all files in |
762 | | // the next level |
763 | 0 | fp_ctx.search_left_bound = 0; |
764 | 0 | fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; |
765 | 0 | } |
766 | | // Skip all subsequent empty levels |
767 | 0 | do { |
768 | 0 | ++curr_level_; |
769 | 0 | } while ((curr_level_ < num_levels_) && |
770 | 0 | (*level_files_brief_)[curr_level_].num_files == 0); |
771 | 0 | continue; |
772 | 0 | } |
773 | | |
774 | | // Some files may overlap each other. We find |
775 | | // all files that overlap user_key and process them in order from |
776 | | // newest to oldest. In the context of merge-operator, this can occur at |
777 | | // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes |
778 | | // are always compacted into a single entry). |
779 | 0 | int32_t start_index = -1; |
780 | 0 | current_level_range_ = |
781 | 0 | MultiGetRange(range_, range_.begin(), range_.end()); |
782 | 0 | for (auto mget_iter = current_level_range_.begin(); |
783 | 0 | mget_iter != current_level_range_.end(); ++mget_iter) { |
784 | 0 | struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()]; |
785 | 0 | if (curr_level_ == 0) { |
786 | | // On Level-0, we read through all files to check for overlap. |
787 | 0 | start_index = 0; |
788 | 0 | level_contains_keys = true; |
789 | 0 | } else { |
790 | | // On Level-n (n>=1), files are sorted. Binary search to find the |
791 | | // earliest file whose largest key >= ikey. Search left bound and |
792 | | // right bound are used to narrow the range. |
793 | 0 | if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) { |
794 | 0 | if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) { |
795 | 0 | fp_ctx.search_right_bound = |
796 | 0 | static_cast<int32_t>(curr_file_level_->num_files) - 1; |
797 | 0 | } |
798 | | // `search_right_bound_` is an inclusive upper-bound, but since it |
799 | | // was determined based on user key, it is still possible the lookup |
800 | | // key falls to the right of `search_right_bound_`'s corresponding |
801 | | // file. So, pass a limit one higher, which allows us to detect this |
802 | | // case. |
803 | 0 | Slice& ikey = mget_iter->ikey; |
804 | 0 | start_index = FindFileInRange( |
805 | 0 | *internal_comparator_, *curr_file_level_, ikey, |
806 | 0 | static_cast<uint32_t>(fp_ctx.search_left_bound), |
807 | 0 | static_cast<uint32_t>(fp_ctx.search_right_bound) + 1); |
808 | 0 | if (start_index == fp_ctx.search_right_bound + 1) { |
809 | | // `ikey_` comes after `search_right_bound_`. The lookup key does |
810 | | // not exist on this level, so let's skip this level and do a full |
811 | | // binary search on the next level. |
812 | 0 | fp_ctx.search_left_bound = 0; |
813 | 0 | fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; |
814 | 0 | current_level_range_.SkipKey(mget_iter); |
815 | 0 | continue; |
816 | 0 | } else { |
817 | 0 | level_contains_keys = true; |
818 | 0 | } |
819 | 0 | } else { |
820 | | // search_left_bound > search_right_bound, key does not exist in |
821 | | // this level. Since no comparison is done in this level, it will |
822 | | // need to search all files in the next level. |
823 | 0 | fp_ctx.search_left_bound = 0; |
824 | 0 | fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; |
825 | 0 | current_level_range_.SkipKey(mget_iter); |
826 | 0 | continue; |
827 | 0 | } |
828 | 0 | } |
829 | 0 | assert(start_index >= 0); |
830 | 0 | assert(start_index < static_cast<int32_t>(curr_file_level_->num_files)); |
831 | 0 | fp_ctx.start_index_in_curr_level = start_index; |
832 | 0 | fp_ctx.curr_index_in_curr_level = start_index; |
833 | 0 | } |
834 | 0 | if (level_contains_keys) { |
835 | 0 | batch_iter_prev_ = current_level_range_.begin(); |
836 | 0 | upper_key_ = batch_iter_ = current_level_range_.begin(); |
837 | 0 | return true; |
838 | 0 | } |
839 | 0 | curr_level_++; |
840 | 0 | } |
841 | | // curr_level_ = num_levels_. So, no more levels to search. |
842 | 0 | return false; |
843 | 0 | } |
844 | | }; |
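
The heart of FilePickerMultiGet is walking a sorted batch of keys and a level's sorted, disjoint files in lockstep, so each file is opened once for the entire run of keys it covers. A reduced sketch with hypothetical types (the real class also carries per-key search bounds across levels and handles overlapping L0 files):

#include <cstddef>
#include <string>
#include <tuple>
#include <vector>

namespace multiget_sketch {
struct FileRange {
  std::string smallest_key, largest_key;
};

// Groups a sorted key batch by file: each tuple is
// (file index, first key index, one-past-last key index).
std::vector<std::tuple<size_t, size_t, size_t>> GroupKeysByFile(
    const std::vector<FileRange>& files,     // sorted, non-overlapping
    const std::vector<std::string>& keys) {  // sorted
  std::vector<std::tuple<size_t, size_t, size_t>> groups;
  size_t f = 0;
  for (size_t k = 0; k < keys.size();) {
    // Advance to the first file whose largest key can cover keys[k].
    while (f < files.size() && files[f].largest_key < keys[k]) {
      ++f;
    }
    if (f == files.size()) {
      break;  // remaining keys lie past the last file in this level
    }
    if (keys[k] < files[f].smallest_key) {
      ++k;  // key falls in the gap between two files; skip it on this level
      continue;
    }
    size_t start = k;
    while (k < keys.size() && keys[k] <= files[f].largest_key) {
      ++k;  // extend the run of keys served by this file
    }
    groups.emplace_back(f, start, k);  // one table lookup for the whole run
  }
  return groups;
}
}  // namespace multiget_sketch

Each group then drives one batched table read; keys whose final value is not found yet (e.g. merge operands) repeat in the next file or level, which is what maybe_repeat_key_ and the per-key FilePickerContext entries track.
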
845 | | |
846 | 328k | VersionStorageInfo::~VersionStorageInfo() { delete[] files_; } |
847 | | |
848 | 328k | Version::~Version() { |
849 | 328k | assert(refs_ == 0); |
850 | | |
851 | | // Remove from linked list |
852 | 328k | prev_->next_ = next_; |
853 | 328k | next_->prev_ = prev_; |
854 | | |
855 | | // Drop references to files |
856 | 1.96M | for (int level = 0; level < storage_info_.num_levels_; level++) { |
857 | 1.82M | for (size_t i = 0; i < storage_info_.files_[level].size(); i++) { |
858 | 190k | FileMetaData* f = storage_info_.files_[level][i]; |
859 | 190k | assert(f->refs > 0); |
860 | 190k | f->refs--; |
861 | 190k | if (f->refs <= 0) { |
862 | 102k | assert(cfd_ != nullptr); |
863 | | // When not in the process of closing the DB, we'll have a superversion |
864 | | // to get current mutable options from |
865 | 102k | auto* sv = cfd_->GetSuperVersion(); |
866 | 102k | uint32_t path_id = f->fd.GetPathId(); |
867 | 102k | assert(path_id < cfd_->ioptions().cf_paths.size()); |
868 | 102k | vset_->obsolete_files_.emplace_back( |
869 | 102k | f, cfd_->ioptions().cf_paths[path_id].path, |
870 | 102k | sv ? sv->mutable_cf_options.uncache_aggressiveness : 0, |
871 | 102k | cfd_->GetFileMetadataCacheReservationManager()); |
872 | 102k | } |
873 | 190k | } |
874 | 1.63M | } |
875 | 328k | } |
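
The destructor above is the release half of a manual reference count on FileMetaData: each Version that lists a file holds one reference, and the last release defers deletion by queueing the file on the version set's obsolete list rather than freeing it inline. In miniature, with hypothetical types:

#include <vector>

namespace refcount_sketch {
struct FileMeta {
  int refs = 0;
  // ... file descriptor, boundary keys, etc.
};

struct VersionLike {
  std::vector<FileMeta*> files;      // files this version references
  std::vector<FileMeta*>* obsolete;  // owned by the version set
  ~VersionLike() {
    for (FileMeta* f : files) {
      if (--f->refs <= 0) {
        obsolete->push_back(f);  // deferred cleanup, like obsolete_files_
      }
    }
  }
};
}  // namespace refcount_sketch

Deferring deletion keeps the destructor cheap and lets actual file removal happen in one later purge pass.
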
876 | | |
877 | | int FindFile(const InternalKeyComparator& icmp, |
878 | 3.23k | const LevelFilesBrief& file_level, const Slice& key) { |
879 | 3.23k | return FindFileInRange(icmp, file_level, key, 0, |
880 | 3.23k | static_cast<uint32_t>(file_level.num_files)); |
881 | 3.23k | } |
882 | | |
883 | | void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, |
884 | | const std::vector<FileMetaData*>& files, |
885 | 249k | Arena* arena) { |
886 | 249k | assert(file_level); |
887 | 249k | assert(arena); |
888 | | |
889 | 249k | size_t num = files.size(); |
890 | 249k | file_level->num_files = num; |
891 | 249k | char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange)); |
892 | 249k | file_level->files = new (mem) FdWithKeyRange[num]; |
893 | | |
894 | 459k | for (size_t i = 0; i < num; i++) { |
895 | 210k | Slice smallest_key = files[i]->smallest.Encode(); |
896 | 210k | Slice largest_key = files[i]->largest.Encode(); |
897 | | |
898 | | // Copy key slice to sequential memory |
899 | 210k | size_t smallest_size = smallest_key.size(); |
900 | 210k | size_t largest_size = largest_key.size(); |
901 | 210k | mem = arena->AllocateAligned(smallest_size + largest_size); |
902 | 210k | memcpy(mem, smallest_key.data(), smallest_size); |
903 | 210k | memcpy(mem + smallest_size, largest_key.data(), largest_size); |
904 | | |
905 | 210k | FdWithKeyRange& f = file_level->files[i]; |
906 | 210k | f.fd = files[i]->fd; |
907 | 210k | f.file_metadata = files[i]; |
908 | 210k | f.smallest_key = Slice(mem, smallest_size); |
909 | 210k | f.largest_key = Slice(mem + smallest_size, largest_size); |
910 | 210k | } |
911 | 249k | } |
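
The copying above is a locality optimization: each file's boundary keys are packed back-to-back in arena memory, so the binary searches over a level touch contiguous memory instead of chasing FileMetaData pointers. A sketch of the same packing, with std::string storage standing in for the Arena:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

namespace brief_sketch {
struct KeyPair {
  const char* smallest;
  std::size_t smallest_len;
  const char* largest;
  std::size_t largest_len;
};

// Packs every file's two boundary keys back-to-back into one buffer and
// returns views into it, mirroring the arena packing above.
std::vector<KeyPair> Flatten(
    const std::vector<std::pair<std::string, std::string>>& files,
    std::string* storage) {
  std::size_t total = 0;
  for (const auto& f : files) {
    total += f.first.size() + f.second.size();
  }
  storage->clear();
  storage->reserve(total);  // no reallocation below, so pointers stay valid
  std::vector<KeyPair> out;
  out.reserve(files.size());
  for (const auto& f : files) {
    std::size_t off = storage->size();
    storage->append(f.first);
    storage->append(f.second);
    const char* base = storage->data() + off;
    out.push_back(
        {base, f.first.size(), base + f.first.size(), f.second.size()});
  }
  return out;
}
}  // namespace brief_sketch

The real function additionally placement-constructs the FdWithKeyRange array itself in the arena, keeping the whole per-level brief in one allocation region.
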
912 | | |
913 | | static bool AfterFile(const Comparator* ucmp, const Slice* user_key, |
914 | 2.82k | const FdWithKeyRange* f) { |
915 | | // nullptr user_key occurs before all keys and is therefore never after *f |
916 | 2.82k | return (user_key != nullptr && |
917 | 2.82k | ucmp->CompareWithoutTimestamp(*user_key, |
918 | 2.82k | ExtractUserKey(f->largest_key)) > 0); |
919 | 2.82k | } |
920 | | |
921 | | static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, |
922 | 2.78k | const FdWithKeyRange* f) { |
923 | | // nullptr user_key occurs after all keys and is therefore never before *f |
924 | 2.78k | return (user_key != nullptr && |
925 | 2.78k | ucmp->CompareWithoutTimestamp(*user_key, |
926 | 2.78k | ExtractUserKey(f->smallest_key)) < 0); |
927 | 2.78k | } |
928 | | |
929 | | bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, |
930 | | bool disjoint_sorted_files, |
931 | | const LevelFilesBrief& file_level, |
932 | | const Slice* smallest_user_key, |
933 | 0 | const Slice* largest_user_key) { |
934 | 0 | const Comparator* ucmp = icmp.user_comparator(); |
935 | 0 | if (!disjoint_sorted_files) { |
936 | | // Need to check against all files |
937 | 0 | for (size_t i = 0; i < file_level.num_files; i++) { |
938 | 0 | const FdWithKeyRange* f = &(file_level.files[i]); |
939 | 0 | if (AfterFile(ucmp, smallest_user_key, f) || |
940 | 0 | BeforeFile(ucmp, largest_user_key, f)) { |
941 | | // No overlap |
942 | 0 | } else { |
943 | 0 | return true; // Overlap |
944 | 0 | } |
945 | 0 | } |
946 | 0 | return false; |
947 | 0 | } |
948 | | |
949 | | // Binary search over file list |
950 | 0 | uint32_t index = 0; |
951 | 0 | if (smallest_user_key != nullptr) { |
952 | | // Find the leftmost possible internal key for smallest_user_key |
953 | 0 | InternalKey small; |
954 | 0 | small.SetMinPossibleForUserKey(*smallest_user_key); |
955 | 0 | index = FindFile(icmp, file_level, small.Encode()); |
956 | 0 | } |
957 | |
958 | 0 | if (index >= file_level.num_files) { |
959 | | // beginning of range is after all files, so no overlap. |
960 | 0 | return false; |
961 | 0 | } |
962 | | |
963 | 0 | return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]); |
964 | 0 | } |
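
For the disjoint-files path above, a single lower_bound on the range start suffices: if it lands past the last file there is no overlap, and otherwise the range overlaps exactly when its end reaches the found file's smallest key. As a standalone sketch over plain string keys:

#include <algorithm>
#include <string>
#include <vector>

namespace overlap_level_sketch {
struct FileRange {
  std::string smallest_key, largest_key;
};

// files must be sorted by key and non-overlapping (a level > 0).
bool SomeFileOverlapsRange(const std::vector<FileRange>& files,
                           const std::string& smallest_user_key,
                           const std::string& largest_user_key) {
  // First file whose largest key is >= the start of the range.
  auto it = std::lower_bound(files.begin(), files.end(), smallest_user_key,
                             [](const FileRange& f, const std::string& k) {
                               return f.largest_key < k;
                             });
  if (it == files.end()) {
    return false;  // range begins after all files
  }
  // Overlap iff the range end reaches the candidate file's smallest key.
  return largest_user_key >= it->smallest_key;
}
}  // namespace overlap_level_sketch
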
965 | | |
966 | | namespace { |
967 | | |
968 | | class LevelIterator final : public InternalIterator { |
969 | | public: |
970 | | // NOTE: many of the const& parameters are saved in this object (so |
971 | | // must outlive this object) |
972 | | LevelIterator( |
973 | | TableCache* table_cache, const ReadOptions& read_options, |
974 | | const FileOptions& file_options, const InternalKeyComparator& icomparator, |
975 | | const LevelFilesBrief* flevel, const MutableCFOptions& mutable_cf_options, |
976 | | bool should_sample, HistogramImpl* file_read_hist, |
977 | | TableReaderCaller caller, bool skip_filters, int level, |
978 | | RangeDelAggregator* range_del_agg, |
979 | | const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries = |
980 | | nullptr, |
981 | | bool allow_unprepared_value = false, |
982 | | std::unique_ptr<TruncatedRangeDelIterator>*** range_tombstone_iter_ptr_ = |
983 | | nullptr) |
984 | 8.24k | : table_cache_(table_cache), |
985 | 8.24k | read_options_(read_options), |
986 | 8.24k | file_options_(file_options), |
987 | 8.24k | icomparator_(icomparator), |
988 | 8.24k | user_comparator_(icomparator.user_comparator()), |
989 | 8.24k | flevel_(flevel), |
990 | 8.24k | mutable_cf_options_(mutable_cf_options), |
991 | 8.24k | prefix_extractor_(mutable_cf_options.prefix_extractor.get()), |
992 | 8.24k | file_read_hist_(file_read_hist), |
993 | 8.24k | caller_(caller), |
994 | 8.24k | file_index_(flevel_->num_files), |
995 | 8.24k | range_del_agg_(range_del_agg), |
996 | 8.24k | pinned_iters_mgr_(nullptr), |
997 | 8.24k | compaction_boundaries_(compaction_boundaries), |
998 | 8.24k | range_tombstone_iter_(nullptr), |
999 | 8.24k | read_seq_(read_options.snapshot |
1000 | 8.24k | ? read_options.snapshot->GetSequenceNumber() |
1001 | 8.24k | : kMaxSequenceNumber), |
1002 | 8.24k | level_(level), |
1003 | 8.24k | should_sample_(should_sample), |
1004 | 8.24k | skip_filters_(skip_filters), |
1005 | 8.24k | allow_unprepared_value_(allow_unprepared_value), |
1006 | 8.24k | is_next_read_sequential_(false), |
1007 | 8.24k | to_return_sentinel_(false), |
1008 | 8.24k | scan_opts_(nullptr) { |
1009 | | // Empty level is not supported. |
1010 | 8.24k | assert(flevel_ != nullptr && flevel_->num_files > 0); |
1011 | 8.24k | if (range_tombstone_iter_ptr_) { |
1012 | 7.61k | *range_tombstone_iter_ptr_ = &range_tombstone_iter_; |
1013 | 7.61k | } |
1014 | 8.24k | } |
1015 | | |
1016 | 8.24k | ~LevelIterator() override { delete file_iter_.Set(nullptr); } |
1017 | | |
1018 | | // Seek to the first file with a key >= target. |
1019 | | // If range_tombstone_iter_ is not nullptr, then we pretend that file |
1020 | | // boundaries are fake keys (sentinel keys). These keys are used to keep range |
1021 | | // tombstones alive even when all point keys in an SST file are exhausted. |
1022 | | // These sentinel keys will be skipped in merging iterator. |
1023 | | void Seek(const Slice& target) override; |
1024 | | void SeekForPrev(const Slice& target) override; |
1025 | | void SeekToFirst() override; |
1026 | | void SeekToLast() override; |
1027 | | void Next() final override; |
1028 | | bool NextAndGetResult(IterateResult* result) override; |
1029 | | void Prev() override; |
1030 | | |
1031 | | // In addition to the valid and invalid states (!file_iter.Valid() and
1032 | | // status.ok()), a third state of the iterator is when !file_iter_.Valid() and
1033 | | // to_return_sentinel_. This means we are at the end of a file, and a sentinel
1034 | | // key (the file boundary that we treat as a key) is to be returned next.
1035 | | // file_iter_.Valid() and to_return_sentinel_ should not both be true. |
1036 | 33.9k | bool Valid() const override { |
1037 | 33.9k | assert(!(file_iter_.Valid() && to_return_sentinel_)); |
1038 | 33.9k | return file_iter_.Valid() || to_return_sentinel_; |
1039 | 33.9k | } |
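  // Illustration: suppose the current file spans [a, m] and carries a range
  // tombstone [c, k). Once file_iter_ exhausts the file's point keys,
  // to_return_sentinel_ is set and key() hands back the file boundary, so
  // the merging iterator keeps this level's tombstone in scope until every
  // other level has advanced past the boundary.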
1040 | 19.3k | Slice key() const override { |
1041 | 19.3k | assert(Valid()); |
1042 | 19.3k | if (to_return_sentinel_) { |
1043 | | // Sentinel should be returned after file_iter_ reaches the end of the |
1044 | | // file |
1045 | 3.68k | assert(!file_iter_.Valid()); |
1046 | 3.68k | return sentinel_; |
1047 | 3.68k | } |
1048 | 15.7k | return file_iter_.key(); |
1049 | 19.3k | } |
1050 | | |
1051 | 7.15k | Slice value() const override { |
1052 | 7.15k | assert(Valid()); |
1053 | 7.15k | assert(!to_return_sentinel_); |
1054 | 7.15k | return file_iter_.value(); |
1055 | 7.15k | } |
1056 | | |
1057 | 3.63k | uint64_t write_unix_time() const override { |
1058 | 3.63k | assert(Valid()); |
1059 | 3.63k | return file_iter_.write_unix_time(); |
1060 | 3.63k | } |
1061 | | |
1062 | 11.5k | Status status() const override { |
1063 | 11.5k | return file_iter_.iter() ? file_iter_.status() : Status::OK(); |
1064 | 11.5k | } |
1065 | | |
1066 | 4.35k | bool PrepareValue() override { return file_iter_.PrepareValue(); } |
1067 | | |
1068 | 0 | inline bool MayBeOutOfLowerBound() override { |
1069 | 0 | assert(Valid()); |
1070 | 0 | return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); |
1071 | 0 | } |
1072 | | |
1073 | 0 | inline IterBoundCheck UpperBoundCheckResult() override { |
1074 | 0 | if (Valid()) { |
1075 | 0 | return file_iter_.UpperBoundCheckResult(); |
1076 | 0 | } else { |
1077 | 0 | return IterBoundCheck::kUnknown; |
1078 | 0 | } |
1079 | 0 | } |
1080 | | |
1081 | 4.85k | void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { |
1082 | 4.85k | pinned_iters_mgr_ = pinned_iters_mgr; |
1083 | 4.85k | if (file_iter_.iter()) { |
1084 | 0 | file_iter_.SetPinnedItersMgr(pinned_iters_mgr); |
1085 | 0 | } |
1086 | 4.85k | } |
1087 | | |
1088 | 30 | bool IsKeyPinned() const override { |
1089 | 30 | return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && |
1090 | 30 | file_iter_.iter() && file_iter_.IsKeyPinned(); |
1091 | 30 | } |
1092 | | |
1093 | 2.69k | bool IsValuePinned() const override { |
1094 | 2.69k | return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && |
1095 | 2.69k | file_iter_.iter() && file_iter_.IsValuePinned(); |
1096 | 2.69k | } |
1097 | | |
1098 | 26.3k | bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; } |
1099 | | |
1100 | 0 | void SetRangeDelReadSeqno(SequenceNumber read_seq) override { |
1101 | 0 | read_seq_ = read_seq; |
1102 | 0 | } |
1103 | | |
1104 | 0 | inline bool FileHasMultiScanArg(size_t file_index) { |
1105 | 0 | if (file_to_scan_opts_.get()) { |
1106 | 0 | auto it = file_to_scan_opts_->find(file_index); |
1107 | 0 | if (it != file_to_scan_opts_->end()) { |
1108 | 0 | return !it->second.empty(); |
1109 | 0 | } |
1110 | 0 | } |
1111 | 0 | return false; |
1112 | 0 | } |
1113 | | |
1114 | 0 | MultiScanArgs& GetMultiScanArgForFile(size_t file_index) { |
1115 | 0 | auto multi_scan_args_it = file_to_scan_opts_->find(file_index); |
1116 | 0 | if (multi_scan_args_it == file_to_scan_opts_->end()) { |
1117 | 0 | auto ret = file_to_scan_opts_->emplace( |
1118 | 0 | file_index, MultiScanArgs(user_comparator_.user_comparator())); |
1119 | 0 | multi_scan_args_it = ret.first; |
1120 | 0 | assert(ret.second); |
1121 | 0 | } |
1122 | 0 | return multi_scan_args_it->second; |
1123 | 0 | } |
1124 | | |
1125 | 0 | void Prepare(const MultiScanArgs* so) override { |
1126 | | // We assume here that scan_opts is sorted such that |
1127 | | // scan_opts[0].range.start < scan_opts[1].range.start, with non-overlapping ranges
1128 | 0 | if (so == nullptr) { |
1129 | 0 | return; |
1130 | 0 | } |
1131 | 0 | scan_opts_ = so; |
1132 | | |
1133 | | // Verify comparator is consistent |
1134 | 0 | assert(so->GetComparator() == user_comparator_.user_comparator()); |
1135 | |
1136 | 0 | file_to_scan_opts_ = std::make_unique<ScanOptionsMap>(); |
1137 | 0 | for (size_t k = 0; k < scan_opts_->size(); k++) { |
1138 | 0 | const ScanOptions& opt = scan_opts_->GetScanRanges().at(k); |
1139 | 0 | auto start = opt.range.start; |
1140 | 0 | auto end = opt.range.limit; |
1141 | |
1142 | 0 | if (!start.has_value()) { |
1143 | 0 | continue; |
1144 | 0 | } |
1145 | | |
1146 | | // We can handle this case in the future, but for now let's skip it.
1147 | 0 | if (!end.has_value()) { |
1148 | 0 | continue; |
1149 | 0 | } |
1150 | | |
1151 | 0 | const size_t timestamp_size = |
1152 | 0 | user_comparator_.user_comparator()->timestamp_size(); |
1153 | 0 | InternalKey istart, iend; |
1154 | 0 | if (timestamp_size == 0) { |
1155 | 0 | istart = |
1156 | 0 | InternalKey(start.value(), kMaxSequenceNumber, kValueTypeForSeek); |
1157 | | // end key is exclusive for multiscan |
1158 | 0 | iend = InternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek); |
1159 | 0 | } else { |
1160 | 0 | std::string start_key_with_ts, end_key_with_ts; |
1161 | 0 | AppendKeyWithMaxTimestamp(&start_key_with_ts, start.value(), |
1162 | 0 | timestamp_size); |
1163 | 0 | AppendKeyWithMaxTimestamp(&end_key_with_ts, end.value(), |
1164 | 0 | timestamp_size); |
1165 | 0 | istart = InternalKey(start_key_with_ts, kMaxSequenceNumber, |
1166 | 0 | kValueTypeForSeek); |
1167 | | // end key is exclusive for multiscan |
1168 | 0 | iend = |
1169 | 0 | InternalKey(end_key_with_ts, kMaxSequenceNumber, kValueTypeForSeek); |
1170 | 0 | } |
1171 | | |
1172 | | // TODO: This needs to be optimized; right now we iterate twice, which
1173 | | // we don't need to. We can do this in N rather than 2N.
1174 | 0 | size_t fstart = FindFile(icomparator_, *flevel_, istart.Encode()); |
1175 | 0 | size_t fend = FindFile(icomparator_, *flevel_, iend.Encode()); |
1176 | | |
1177 | | // We need to check the relevant cases |
1178 | | // Cases: |
1179 | | // 1. [ S E ] |
1180 | | // 2. [ S ] [ E ] |
1181 | | // 3. [ S ] ...... [ E ] |
1182 | 0 | for (auto i = fstart; i <= fend; i++) { |
1183 | 0 | if (i < flevel_->num_files) { |
1184 | | // FindFile only compares against the largest_key, so we need this |
1185 | | // additional check to ensure the scan range overlaps the file |
1186 | 0 | if (icomparator_.InternalKeyComparator::Compare( |
1187 | 0 | iend.Encode(), flevel_->files[i].smallest_key) < 0) { |
1188 | 0 | continue; |
1189 | 0 | } |
1190 | 0 | auto const metadata = flevel_->files[i].file_metadata; |
1191 | 0 | if (metadata->FileIsStandAloneRangeTombstone()) { |
1192 | | // Skip stand alone range deletion files. |
1193 | 0 | continue; |
1194 | 0 | } |
1195 | 0 | auto& args = GetMultiScanArgForFile(i); |
1196 | 0 | args.insert(start.value(), end.value(), opt.property_bag); |
1197 | 0 | } |
1198 | 0 | } |
1199 | 0 | } |
1200 | | // Propagate multiscan configs |
1201 | 0 | for (auto& file_to_arg : *file_to_scan_opts_) { |
1202 | 0 | file_to_arg.second.CopyConfigFrom(*so); |
1203 | 0 | } |
1204 | 0 | } |
1205 | | |
1206 | | private: |
1207 | | // Return true if at least one invalid file is seen and skipped. |
1208 | | bool SkipEmptyFileForward(); |
1209 | | void SkipEmptyFileBackward(); |
1210 | | void SetFileIterator(InternalIterator* iter); |
1211 | | void InitFileIterator(size_t new_file_index); |
1212 | | |
1213 | 12.6k | const Slice& file_smallest_key(size_t file_index) { |
1214 | 12.6k | assert(file_index < flevel_->num_files); |
1215 | 12.6k | return flevel_->files[file_index].smallest_key; |
1216 | 12.6k | } |
1217 | | |
1218 | 12.7k | const Slice& file_largest_key(size_t file_index) { |
1219 | 12.7k | assert(file_index < flevel_->num_files); |
1220 | 12.7k | return flevel_->files[file_index].largest_key; |
1221 | 12.7k | } |
1222 | | |
1223 | 2.58k | bool KeyReachedUpperBound(const Slice& internal_key) { |
1224 | 2.58k | return read_options_.iterate_upper_bound != nullptr && |
1225 | 0 | user_comparator_.CompareWithoutTimestamp( |
1226 | 0 | ExtractUserKey(internal_key), /*a_has_ts=*/true, |
1227 | 0 | *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0; |
1228 | 2.58k | } |
1229 | | |
1230 | 17.0k | void ClearRangeTombstoneIter() { |
1231 | 17.0k | if (range_tombstone_iter_) { |
1232 | 16.4k | range_tombstone_iter_->reset(); |
1233 | 16.4k | } |
1234 | 17.0k | } |
1235 | | |
1236 | | // Move file_iter_ to the file at file_index_. |
1237 | | // range_tombstone_iter_ is updated with a range tombstone iterator |
1238 | | // into the new file. Old range tombstone iterator is cleared. |
1239 | 11.4k | InternalIterator* NewFileIterator() { |
1240 | 11.4k | assert(file_index_ < flevel_->num_files); |
1241 | 11.4k | auto file_meta = flevel_->files[file_index_]; |
1242 | 11.4k | if (should_sample_) { |
1243 | 4 | sample_file_read_inc(file_meta.file_metadata); |
1244 | 4 | } |
1245 | | |
1246 | 11.4k | const InternalKey* smallest_compaction_key = nullptr; |
1247 | 11.4k | const InternalKey* largest_compaction_key = nullptr; |
1248 | 11.4k | if (compaction_boundaries_ != nullptr) { |
1249 | 3.98k | smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; |
1250 | 3.98k | largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; |
1251 | 3.98k | } |
1252 | 11.4k | CheckMayBeOutOfLowerBound(); |
1253 | 11.4k | ClearRangeTombstoneIter(); |
1254 | 11.4k | return table_cache_->NewIterator( |
1255 | 11.4k | read_options_, file_options_, icomparator_, *file_meta.file_metadata, |
1256 | 11.4k | range_del_agg_, mutable_cf_options_, |
1257 | 11.4k | nullptr /* don't need reference to table */, file_read_hist_, caller_, |
1258 | 11.4k | /*arena=*/nullptr, skip_filters_, level_, |
1259 | 11.4k | /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, |
1260 | 11.4k | largest_compaction_key, allow_unprepared_value_, &read_seq_, |
1261 | 11.4k | range_tombstone_iter_); |
1262 | 11.4k | } |
1263 | | |
1264 | | // Check whether the current file is fully within iterate_lower_bound.
1265 | | //
1266 | | // Note MyRocks may update iterate bounds between seeks (see the sketch
1267 | | // after this function); re-check and update may_be_out_of_lower_bound_.
1268 | 18.9k | void CheckMayBeOutOfLowerBound() { |
1269 | 18.9k | if (read_options_.iterate_lower_bound != nullptr && |
1270 | 0 | file_index_ < flevel_->num_files) { |
1271 | 0 | may_be_out_of_lower_bound_ = |
1272 | 0 | user_comparator_.CompareWithoutTimestamp( |
1273 | 0 | ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true, |
1274 | 0 | *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0; |
1275 | 0 | } |
1276 | 18.9k | } |
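// Editorial sketch (not part of the source) of the bound-update pattern the
// comment above refers to; `db` is an assumed open DB handle. The point is
// that the memory behind iterate_lower_bound may be rewritten between Seek()
// calls on the same iterator, so may_be_out_of_lower_bound_ must be
// recomputed on every seek rather than cached once:
//
//   ReadOptions ro;
//   Slice lower_bound("b");
//   ro.iterate_lower_bound = &lower_bound;
//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
//   it->Seek("c");
//   lower_bound = Slice("e");  // MyRocks-style bound update between seeks
//   it->Seek("f");             // the previous lower-bound check is stale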
1277 | | |
1278 | | #ifndef NDEBUG |
1279 | | bool OverlapRange(const ScanOptions& opts); |
1280 | | #endif |
1281 | | |
1282 | | TableCache* table_cache_; |
1283 | | const ReadOptions& read_options_; |
1284 | | const FileOptions& file_options_; |
1285 | | const InternalKeyComparator& icomparator_; |
1286 | | const UserComparatorWrapper user_comparator_; |
1287 | | const LevelFilesBrief* flevel_; |
1288 | | mutable FileDescriptor current_value_; |
1289 | | const MutableCFOptions& mutable_cf_options_; |
1290 | | const SliceTransform* prefix_extractor_; |
1291 | | |
1292 | | HistogramImpl* file_read_hist_; |
1293 | | TableReaderCaller caller_; |
1294 | | size_t file_index_; |
1295 | | RangeDelAggregator* range_del_agg_; |
1296 | | IteratorWrapper file_iter_; // May be nullptr |
1297 | | PinnedIteratorsManager* pinned_iters_mgr_; |
1298 | | |
1299 | | // To be propagated to RangeDelAggregator in order to safely truncate range |
1300 | | // tombstones. |
1301 | | const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_; |
1302 | | |
1303 | | // This is set when this level iterator is used under a merging iterator |
1304 | | // that processes range tombstones. range_tombstone_iter_ points to where the |
1305 | | // merging iterator stores the range tombstones iterator for this level. When |
1306 | | // this level iterator moves to a new SST file, it updates the range |
1307 | | // tombstones accordingly through this pointer. So the merging iterator always |
1308 | | // has access to the current SST file's range tombstones. |
1309 | | // |
1310 | | // The level iterator treats file boundaries as fake keys (sentinel keys)
1311 | | // to keep range tombstones alive if needed and to make the upper level,
1312 | | // i.e. the merging iterator, aware of file changes (when the level
1313 | | // iterator moves to a new SST file, there is bookkeeping to be done on
1314 | | // the merging iterator's end). See the sketch after this class.
1315 | | // |
1316 | | // *range_tombstone_iter_ points to range tombstones of the current SST file |
1317 | | std::unique_ptr<TruncatedRangeDelIterator>* range_tombstone_iter_; |
1318 | | |
1319 | | // The sentinel key to be returned |
1320 | | Slice sentinel_; |
1321 | | SequenceNumber read_seq_; |
1322 | | |
1323 | | int level_; |
1324 | | bool should_sample_; |
1325 | | bool skip_filters_; |
1326 | | bool allow_unprepared_value_; |
1327 | | bool may_be_out_of_lower_bound_ = true; |
1328 | | bool is_next_read_sequential_; |
1329 | | // Set in Seek() when a prefix seek reaches the end of the current file
1330 | | // and the next file has a different prefix. SkipEmptyFileForward()
1331 | | // will not move to the next file when this flag is set.
1332 | | bool prefix_exhausted_ = false; |
1333 | | // Whether next/prev key is a sentinel key. |
1334 | | bool to_return_sentinel_ = false; |
1335 | | const MultiScanArgs* scan_opts_ = nullptr; |
1336 | | |
1337 | | // Our stored scan_opts for each file
1338 | | std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr; |
1339 | | |
1340 | | // Sets the flag for whether we should return the sentinel key next.
1341 | | // The condition for returning the sentinel is reaching the end of the
1342 | | // current file_iter_: !Valid() && status().ok().
1343 | | void TrySetDeleteRangeSentinel(const Slice& boundary_key); |
1344 | 16.8k | void ClearSentinel() { to_return_sentinel_ = false; } |
1345 | | }; |
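// Editorial sketch (not part of the source): how the merging iterator above
// this level iterator is expected to consume file-boundary sentinel keys.
// `heap_` and `current_` are simplified stand-ins for MergingIterator
// internals, and IsDeleteRangeSentinelKey() is assumed to report the child's
// to_return_sentinel_ flag:
//
//   // The child at the top of the min-heap surfaced a sentinel key.
//   if (current_->IsDeleteRangeSentinelKey()) {
//     // Not a user-visible key: stepping past it moves the LevelIterator
//     // to the next SST file, which also swaps that file's fragmented
//     // range tombstones into *range_tombstone_iter_ for this level.
//     current_->Next();
//     heap_.replace_top(current_);
//   }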
1346 | | |
1347 | 20.1k | void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) { |
1348 | 20.1k | assert(range_tombstone_iter_); |
1349 | 20.1k | if (file_iter_.iter() != nullptr && !file_iter_.Valid() && |
1350 | 9.42k | file_iter_.status().ok()) { |
1351 | 9.42k | to_return_sentinel_ = true; |
1352 | 9.42k | sentinel_ = boundary_key; |
1353 | 9.42k | } |
1354 | 20.1k | } |
1355 | | |
1356 | 1.00k | void LevelIterator::Seek(const Slice& target) { |
1357 | 1.00k | prefix_exhausted_ = false; |
1358 | 1.00k | ClearSentinel(); |
1359 | | // Check whether the seek key falls within the current file
1360 | 1.00k | bool need_to_reseek = true; |
1361 | 1.00k | if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) { |
1362 | 136 | const FdWithKeyRange& cur_file = flevel_->files[file_index_]; |
1363 | 136 | if (icomparator_.InternalKeyComparator::Compare( |
1364 | 136 | target, cur_file.largest_key) <= 0 && |
1365 | 61 | icomparator_.InternalKeyComparator::Compare( |
1366 | 61 | target, cur_file.smallest_key) >= 0) { |
1367 | 57 | need_to_reseek = false; |
1368 | 57 | assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) == |
1369 | 57 | file_index_); |
1370 | 57 | } |
1371 | 136 | } |
1372 | 1.00k | if (need_to_reseek) { |
1373 | 949 | TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile"); |
1374 | 949 | size_t new_file_index = FindFile(icomparator_, *flevel_, target); |
1375 | 949 | InitFileIterator(new_file_index); |
1376 | 949 | } |
1377 | | |
1378 | 1.00k | if (file_iter_.iter() != nullptr) { |
1379 | 928 | if (scan_opts_) { |
1380 | | // At this point, we only know that the seek target is < largest_key |
1381 | | // in the file. We need to check whether there is actual overlap. |
1382 | 0 | const FdWithKeyRange& cur_file = flevel_->files[file_index_]; |
1383 | 0 | if (KeyReachedUpperBound(cur_file.smallest_key)) { |
1384 | 0 | return; |
1385 | 0 | } |
1386 | 0 | } |
1387 | 928 | file_iter_.Seek(target); |
1388 | | // Status::TryAgain indicates an asynchronous request for retrieval of
1389 | | // data blocks has been submitted, so we should return at this point.
1390 | | // Seek should be called again to retrieve the requested block and
1391 | | // execute the remaining code.
1392 | 928 | if (file_iter_.status() == Status::TryAgain()) { |
1393 | 0 | return; |
1394 | 0 | } |
1395 | 928 | if (!file_iter_.Valid() && file_iter_.status().ok() && |
1396 | 0 | prefix_extractor_ != nullptr && !read_options_.total_order_seek && |
1397 | 0 | !read_options_.auto_prefix_mode && |
1398 | 0 | file_index_ < flevel_->num_files - 1) { |
1399 | 0 | size_t ts_sz = user_comparator_.user_comparator()->timestamp_size(); |
1400 | 0 | Slice target_user_key_without_ts = |
1401 | 0 | ExtractUserKeyAndStripTimestamp(target, ts_sz); |
1402 | 0 | Slice next_file_first_user_key_without_ts = |
1403 | 0 | ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1), |
1404 | 0 | ts_sz); |
1405 | 0 | if (prefix_extractor_->InDomain(target_user_key_without_ts) && |
1406 | 0 | (!prefix_extractor_->InDomain(next_file_first_user_key_without_ts) || |
1407 | 0 | prefix_extractor_->Transform(target_user_key_without_ts) |
1408 | 0 | .compare(prefix_extractor_->Transform( |
1409 | 0 | next_file_first_user_key_without_ts)) != 0)) { |
1410 | | // SkipEmptyFileForward() will not advance to next file when this flag |
1411 | | // is set for reason detailed below. |
1412 | | // |
1413 | | // The file we initially positioned to has no keys under the target
1414 | | // prefix, and the next file's smallest key has a different prefix than
1415 | | // the target. In a prefix iterator seek, once the keys for one prefix
1416 | | // have been exhausted, the iterator may jump to any larger key. Here we
1417 | | // are enforcing a stricter contract than that, in order to make it
1418 | | // easier for higher layers (merging and DB iterator) to reason about
1419 | | // correctness (see the sketch after this function):
1420 | | // 1. Within the prefix, the result should be accurate.
1421 | | // 2. If keys for the prefix are exhausted, the iterator is either
1422 | | // positioned at the next key after the prefix or made invalid.
1423 | | // A side benefit will be that it invalidates the iterator earlier so |
1424 | | // that the upper level merging iterator can merge fewer child |
1425 | | // iterators. |
1426 | | // |
1427 | | // The flag is cleared in Seek*() calls. There is no need to clear the |
1428 | | // flag in Prev() since Prev() will not be called when the flag is set |
1429 | | // for reasons explained below. If range_tombstone_iter_ is nullptr, |
1430 | | // then there is no file boundary sentinel key. Since |
1431 | | // !file_iter_.Valid() from the if condition above, this level iterator |
1432 | | // is !Valid(), so Prev() will not be called. If range_tombstone_iter_ |
1433 | | // is not nullptr, there are two cases depending on if this level |
1434 | | // iterator reaches top of the heap in merging iterator (the upper |
1435 | | // layer). |
1436 | | // If so, the merging iterator will see the sentinel key and call
1437 | | // NextAndGetResult(), and that call will skip the sentinel key and
1438 | | // make this level iterator invalid. If not, then it
1439 | | // could be because the upper layer is done before any method of this |
1440 | | // level iterator is called or another Seek*() call is invoked. Either |
1441 | | // way, Prev() is never called before Seek*(). |
1442 | | // The flag should not be cleared at the beginning of |
1443 | | // Next/NextAndGetResult() since it is used in SkipEmptyFileForward() |
1444 | | // called in Next/NextAndGetResult(). |
1445 | 0 | prefix_exhausted_ = true; |
1446 | 0 | } |
1447 | 0 | } |
1448 | | |
1449 | 928 | if (range_tombstone_iter_) { |
1450 | 312 | TrySetDeleteRangeSentinel(file_largest_key(file_index_)); |
1451 | 312 | } |
1452 | 928 | } |
1453 | 1.00k | SkipEmptyFileForward(); |
1454 | 1.00k | CheckMayBeOutOfLowerBound(); |
1455 | 1.00k | } |
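// Editorial sketch (not part of the source) of the stricter prefix-seek
// contract enforced via prefix_exhausted_ above, assuming a configured
// prefix_extractor and read_options.total_order_seek == false; `iter` is a
// stand-in for an iterator stacked on top of this level iterator:
//
//   iter->Seek(ikey_with_target_prefix);
//   while (iter->Valid() &&
//          ExtractUserKey(iter->key()).starts_with(target_prefix)) {
//     iter->Next();  // guarantee 1: every key under the prefix shows up
//   }
//   // Guarantee 2: the loop exits either at the first key after the
//   // prefix or with an invalid iterator, never at some arbitrary larger
//   // key in a later file.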
1456 | | |
1457 | 2.28k | void LevelIterator::SeekForPrev(const Slice& target) { |
1458 | 2.28k | prefix_exhausted_ = false; |
1459 | 2.28k | ClearSentinel(); |
1460 | 2.28k | size_t new_file_index = FindFile(icomparator_, *flevel_, target); |
1461 | | // Seek beyond this level's smallest key |
1462 | 2.28k | if (new_file_index == 0 && |
1463 | 885 | icomparator_.Compare(target, file_smallest_key(0)) < 0) { |
1464 | 534 | SetFileIterator(nullptr); |
1465 | 534 | ClearRangeTombstoneIter(); |
1466 | 534 | CheckMayBeOutOfLowerBound(); |
1467 | 534 | return; |
1468 | 534 | } |
1469 | 1.75k | if (new_file_index >= flevel_->num_files) { |
1470 | 1.08k | new_file_index = flevel_->num_files - 1; |
1471 | 1.08k | } |
1472 | | |
1473 | 1.75k | InitFileIterator(new_file_index); |
1474 | 1.75k | if (file_iter_.iter() != nullptr) { |
1475 | 1.75k | file_iter_.SeekForPrev(target); |
1476 | 1.75k | if (range_tombstone_iter_ && |
1477 | 1.75k | icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) { |
1478 | | // In the SeekForPrev() case, the target may be less than the file's
1479 | | // lower boundary, since the largest key is used to determine the file
1480 | | // index (FindFile()). When the target is below the file's lower
1481 | | // boundary, the sentinel key should not be set, so that SeekForPrev()
1482 | | // does not result in a key larger than the target. This is correct:
1483 | | // there is no need to keep this file's range tombstones alive, as they
1484 | | // only cover keys from the file's lower boundary, which is after target.
1485 | 1.53k | TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); |
1486 | 1.53k | } |
1487 | 1.75k | SkipEmptyFileBackward(); |
1488 | 1.75k | } |
1489 | 1.75k | CheckMayBeOutOfLowerBound(); |
1490 | 1.75k | } |
1491 | | |
1492 | 4.12k | void LevelIterator::SeekToFirst() { |
1493 | 4.12k | prefix_exhausted_ = false; |
1494 | 4.12k | ClearSentinel(); |
1495 | 4.12k | InitFileIterator(0); |
1496 | 4.12k | if (file_iter_.iter() != nullptr) { |
1497 | 4.12k | file_iter_.SeekToFirst(); |
1498 | 4.12k | if (range_tombstone_iter_) { |
1499 | | // We do this in SeekToFirst() and SeekToLast() since we could have an
1500 | | // empty file with only range tombstones; see the example below.
1501 | 4.12k | TrySetDeleteRangeSentinel(file_largest_key(file_index_)); |
1502 | 4.12k | } |
1503 | 4.12k | } |
1504 | 4.12k | SkipEmptyFileForward(); |
1505 | 4.12k | CheckMayBeOutOfLowerBound(); |
1506 | 4.12k | } |
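// Editorial example (not part of the source) of the "empty file with only
// range tombstones" case mentioned in SeekToFirst() above:
//
//   db->Put(WriteOptions(), "a", "v1");
//   db->Flush(FlushOptions());                       // file 1: a point key
//   db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "z");
//   db->Flush(FlushOptions());                       // file 2: tombstone only
//
// Seeking into file 2 leaves file_iter_ !Valid() even though the file's
// range tombstone must still reach the merging iterator; the sentinel key
// set above keeps this level iterator alive long enough for that.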
1507 | | |
1508 | 0 | void LevelIterator::SeekToLast() { |
1509 | 0 | prefix_exhausted_ = false; |
1510 | 0 | ClearSentinel(); |
1511 | 0 | InitFileIterator(flevel_->num_files - 1); |
1512 | 0 | if (file_iter_.iter() != nullptr) { |
1513 | 0 | file_iter_.SeekToLast(); |
1514 | 0 | if (range_tombstone_iter_) { |
1515 | 0 | TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); |
1516 | 0 | } |
1517 | 0 | } |
1518 | 0 | SkipEmptyFileBackward(); |
1519 | 0 | CheckMayBeOutOfLowerBound(); |
1520 | 0 | } |
1521 | | |
1522 | 0 | void LevelIterator::Next() { |
1523 | 0 | assert(Valid()); |
1524 | 0 | if (to_return_sentinel_) { |
1525 | | // file_iter_ is already at EOF when to_return_sentinel_ is set
1526 | 0 | ClearSentinel(); |
1527 | 0 | } else { |
1528 | 0 | file_iter_.Next(); |
1529 | 0 | if (range_tombstone_iter_) { |
1530 | 0 | TrySetDeleteRangeSentinel(file_largest_key(file_index_)); |
1531 | 0 | } |
1532 | 0 | } |
1533 | 0 | SkipEmptyFileForward(); |
1534 | 0 | } |
1535 | | |
1536 | 12.2k | bool LevelIterator::NextAndGetResult(IterateResult* result) { |
1537 | 12.2k | assert(Valid()); |
1538 | | // file_iter_ is already at EOF when to_return_sentinel_ is set
1539 | 12.2k | bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result); |
1540 | 12.2k | if (!is_valid) { |
1541 | 11.4k | if (to_return_sentinel_) { |
1542 | 5.73k | ClearSentinel(); |
1543 | 5.73k | } else if (range_tombstone_iter_) { |
1544 | 5.73k | TrySetDeleteRangeSentinel(file_largest_key(file_index_)); |
1545 | 5.73k | } |
1546 | 11.4k | is_next_read_sequential_ = true; |
1547 | 11.4k | SkipEmptyFileForward(); |
1548 | 11.4k | is_next_read_sequential_ = false; |
1549 | 11.4k | is_valid = Valid(); |
1550 | 11.4k | if (is_valid) { |
1551 | | // This could be set in TrySetDeleteRangeSentinel() or |
1552 | | // SkipEmptyFileForward() above. |
1553 | 8.32k | if (to_return_sentinel_) { |
1554 | 5.73k | result->key = sentinel_; |
1555 | 5.73k | result->bound_check_result = IterBoundCheck::kUnknown; |
1556 | 5.73k | result->value_prepared = true; |
1557 | 5.73k | } else { |
1558 | 2.58k | result->key = key(); |
1559 | 2.58k | result->bound_check_result = file_iter_.UpperBoundCheckResult(); |
1560 | | // Ideally, we should return the real file_iter_.value_prepared, but
1561 | | // that information is not available here. This may cause an extra
1562 | | // PrepareValue() for the first key of a file.
1563 | 2.58k | result->value_prepared = !allow_unprepared_value_; |
1564 | 2.58k | } |
1565 | 8.32k | } |
1566 | 11.4k | } |
1567 | 12.2k | return is_valid; |
1568 | 12.2k | } |
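// Editorial sketch (not part of the source): how a caller might drive
// NextAndGetResult() and honor the value_prepared flag set above. Consume()
// is a made-up stand-in, `level_iter` points at this iterator, and
// PrepareValue() is the InternalIterator hook for lazily-materialized
// values:
//
//   IterateResult result;
//   while (level_iter->NextAndGetResult(&result)) {
//     if (!result.value_prepared && !level_iter->PrepareValue()) {
//       break;  // I/O error while loading the deferred value
//     }
//     Consume(result.key, level_iter->value());
//   }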
1569 | | |
1570 | 7.38k | void LevelIterator::Prev() { |
1571 | 7.38k | assert(Valid()); |
1572 | 7.38k | if (to_return_sentinel_) { |
1573 | 3.68k | ClearSentinel(); |
1574 | 3.69k | } else { |
1575 | 3.69k | file_iter_.Prev(); |
1576 | 3.69k | if (range_tombstone_iter_) { |
1577 | 3.69k | TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); |
1578 | 3.69k | } |
1579 | 3.69k | } |
1580 | 7.38k | SkipEmptyFileBackward(); |
1581 | 7.38k | } |
1582 | | |
1583 | 16.6k | bool LevelIterator::SkipEmptyFileForward() { |
1584 | 16.6k | bool seen_empty_file = false; |
1585 | | // Pause at sentinel key |
1586 | 19.1k | while (!to_return_sentinel_ && |
1587 | 13.4k | (file_iter_.iter() == nullptr || |
1588 | 13.3k | (!file_iter_.Valid() && file_iter_.status().ok() && |
1589 | 5.73k | file_iter_.iter()->UpperBoundCheckResult() != |
1590 | 5.81k | IterBoundCheck::kOutOfBound))) { |
1591 | 5.81k | seen_empty_file = true; |
1592 | | // Move to next file |
1593 | 5.81k | if (file_index_ >= flevel_->num_files - 1 || |
1594 | 2.58k | KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) || |
1595 | 3.23k | prefix_exhausted_) { |
1596 | 3.23k | SetFileIterator(nullptr); |
1597 | 3.23k | ClearRangeTombstoneIter(); |
1598 | 3.23k | break; |
1599 | 3.23k | } |
1600 | | // may init a new *range_tombstone_iter_
1601 | 2.58k | InitFileIterator(file_index_ + 1); |
1602 | | // We moved to a new SST file |
1603 | | // Seek range_tombstone_iter_ to reset its !Valid() default state. |
1604 | | // We do not need to call range_tombstone_iter_.Seek* in |
1605 | | // LevelIterator::Seek* since when the merging iterator calls |
1606 | | // LevelIterator::Seek*, it should also call Seek* into the corresponding |
1607 | | // range tombstone iterator. |
1608 | 2.58k | if (file_iter_.iter() != nullptr) { |
1609 | | // If we are doing prepared scan opts, seek to the start key of the
1610 | | // first range specified by the scan opts.
1611 | | |
1612 | 2.58k | if (scan_opts_ && FileHasMultiScanArg(file_index_)) { |
1613 | 0 | const ScanOptions& opts = |
1614 | 0 | GetMultiScanArgForFile(file_index_).GetScanRanges().front(); |
1615 | 0 | if (opts.range.start.has_value()) { |
1616 | 0 | InternalKey target(*opts.range.start.AsPtr(), kMaxSequenceNumber, |
1617 | 0 | kValueTypeForSeek); |
1618 | 0 | file_iter_.Seek(target.Encode()); |
1619 | 0 | } |
1620 | 2.58k | } else { |
1621 | 2.58k | file_iter_.SeekToFirst(); |
1622 | 2.58k | } |
1623 | 2.58k | if (range_tombstone_iter_) { |
1624 | 2.58k | if (*range_tombstone_iter_) { |
1625 | 0 | (*range_tombstone_iter_)->SeekToFirst(); |
1626 | 0 | } |
1627 | 2.58k | TrySetDeleteRangeSentinel(file_largest_key(file_index_)); |
1628 | 2.58k | } |
1629 | 2.58k | } |
1630 | 2.58k | } |
1631 | 16.6k | return seen_empty_file; |
1632 | 16.6k | } |
1633 | | |
1634 | 9.13k | void LevelIterator::SkipEmptyFileBackward() { |
1635 | | // Pause at sentinel key |
1636 | 11.3k | while (!to_return_sentinel_ && |
1637 | 7.61k | (file_iter_.iter() == nullptr || |
1638 | 7.61k | (!file_iter_.Valid() && file_iter_.status().ok()))) { |
1639 | | // Move to previous file |
1640 | 3.90k | if (file_index_ == 0) { |
1641 | | // Already the first file |
1642 | 1.74k | SetFileIterator(nullptr); |
1643 | 1.74k | ClearRangeTombstoneIter(); |
1644 | 1.74k | return; |
1645 | 1.74k | } |
1646 | 2.16k | InitFileIterator(file_index_ - 1); |
1647 | | // We moved to a new SST file |
1648 | | // Seek range_tombstone_iter_ to reset its !Valid() default state. |
1649 | 2.16k | if (file_iter_.iter() != nullptr) { |
1650 | 2.16k | file_iter_.SeekToLast(); |
1651 | 2.16k | if (range_tombstone_iter_) { |
1652 | 2.16k | if (*range_tombstone_iter_) { |
1653 | 0 | (*range_tombstone_iter_)->SeekToLast(); |
1654 | 0 | } |
1655 | 2.16k | TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); |
1656 | 2.16k | if (to_return_sentinel_) { |
1657 | 0 | break; |
1658 | 0 | } |
1659 | 2.16k | } |
1660 | 2.16k | } |
1661 | 2.16k | } |
1662 | 9.13k | } |
1663 | | |
1664 | | #ifndef NDEBUG |
1665 | | bool LevelIterator::OverlapRange(const ScanOptions& opts) { |
1666 | | return (user_comparator_.CompareWithoutTimestamp( |
1667 | | opts.range.start.value(), /*a_has_ts=*/false, |
1668 | | ExtractUserKey(flevel_->files[file_index_].largest_key), |
1669 | | /*b_has_ts=*/true) <= 0 && |
1670 | | user_comparator_.CompareWithoutTimestamp( |
1671 | | opts.range.limit.value(), /*a_has_ts=*/false, |
1672 | | ExtractUserKey(flevel_->files[file_index_].smallest_key), |
1673 | | /*b_has_ts=*/true) > 0); |
1674 | | } |
1675 | | #endif |
1676 | | |
1677 | 17.0k | void LevelIterator::SetFileIterator(InternalIterator* iter) { |
1678 | 17.0k | if (pinned_iters_mgr_ && iter) { |
1679 | 6.90k | iter->SetPinnedItersMgr(pinned_iters_mgr_); |
1680 | 6.90k | } |
1681 | | |
1682 | 17.0k | InternalIterator* old_iter = file_iter_.Set(iter); |
1683 | 17.0k | if (iter && scan_opts_) { |
1684 | 0 | if (FileHasMultiScanArg(file_index_)) { |
1685 | 0 | const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_); |
1686 | 0 | assert(OverlapRange(*new_opts.GetScanRanges().begin()) && |
1687 | 0 | OverlapRange(*new_opts.GetScanRanges().rbegin())); |
1688 | 0 | file_iter_.Prepare(&new_opts); |
1689 | 0 | } |
1690 | 0 | } |
1691 | | |
1692 | | // Update the read pattern for PrefetchBuffer. |
1693 | 17.0k | if (is_next_read_sequential_) { |
1694 | 5.73k | file_iter_.UpdateReadaheadState(old_iter); |
1695 | 5.73k | } |
1696 | | |
1697 | 17.0k | if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { |
1698 | 4.17k | pinned_iters_mgr_->PinIterator(old_iter); |
1699 | 12.9k | } else { |
1700 | 12.9k | delete old_iter; |
1701 | 12.9k | } |
1702 | 17.0k | } |
1703 | | |
1704 | 11.5k | void LevelIterator::InitFileIterator(size_t new_file_index) { |
1705 | 11.5k | if (new_file_index >= flevel_->num_files) { |
1706 | 78 | file_index_ = new_file_index; |
1707 | 78 | SetFileIterator(nullptr); |
1708 | 78 | ClearRangeTombstoneIter(); |
1709 | 78 | return; |
1710 | 11.5k | } else { |
1711 | | // If the file iterator reports Incomplete, we retry when the user
1712 | | // seeks to the same file again, as this time we may go to a different
1713 | | // data block that is cached in the block cache.
1714 | | // |
1715 | 11.5k | if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() && |
1716 | 4.75k | new_file_index == file_index_) { |
1717 | | // file_iter_ is already constructed with this iterator, so |
1718 | | // no need to change anything |
1719 | 11.4k | } else { |
1720 | 11.4k | file_index_ = new_file_index; |
1721 | 11.4k | InternalIterator* iter = NewFileIterator(); |
1722 | 11.4k | SetFileIterator(iter); |
1723 | 11.4k | } |
1724 | 11.5k | } |
1725 | 11.5k | } |
1726 | | |
1727 | | } // anonymous namespace |
1728 | | |
1729 | | Status Version::GetTableProperties(const ReadOptions& read_options, |
1730 | | std::shared_ptr<const TableProperties>* tp, |
1731 | | const FileMetaData* file_meta, |
1732 | 117k | const std::string* fname) const { |
1733 | 117k | auto* table_cache = cfd_->table_cache(); |
1734 | 117k | const auto& ioptions = cfd_->ioptions(); |
1735 | 117k | Status s = table_cache->GetTableProperties( |
1736 | 117k | file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, |
1737 | 117k | mutable_cf_options_, true /* no io */); |
1738 | 117k | if (s.ok()) { |
1739 | 117k | return s; |
1740 | 117k | } |
1741 | | |
1742 | | // We only ignore error type `Incomplete` since it's by design that we
1743 | | // disallow reading the table when it's not in the table cache.
1744 | 0 | if (!s.IsIncomplete()) { |
1745 | 0 | return s; |
1746 | 0 | } |
1747 | | |
1748 | | // 2. The table is not present in the table cache; we'll read the table
1749 | | // properties directly from the properties block in the file.
1750 | 0 | std::unique_ptr<FSRandomAccessFile> file; |
1751 | 0 | std::string file_name; |
1752 | 0 | if (fname != nullptr) { |
1753 | 0 | file_name = *fname; |
1754 | 0 | } else { |
1755 | 0 | file_name = TableFileName(ioptions.cf_paths, file_meta->fd.GetNumber(), |
1756 | 0 | file_meta->fd.GetPathId()); |
1757 | 0 | } |
1758 | 0 | s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file, |
1759 | 0 | nullptr); |
1760 | 0 | if (!s.ok()) { |
1761 | 0 | return s; |
1762 | 0 | } |
1763 | | |
1764 | | // By setting the magic number to kNullTableMagicNumber, we can bypass |
1765 | | // the magic number check in the footer. |
1766 | 0 | std::unique_ptr<RandomAccessFileReader> file_reader( |
1767 | 0 | new RandomAccessFileReader( |
1768 | 0 | std::move(file), file_name, ioptions.clock /* clock */, io_tracer_, |
1769 | 0 | ioptions.stats /* stats */, |
1770 | 0 | Histograms::SST_READ_MICROS /* hist_type */, |
1771 | 0 | nullptr /* file_read_hist */, nullptr /* rate_limiter */, |
1772 | 0 | ioptions.listeners)); |
1773 | 0 | std::unique_ptr<TableProperties> props; |
1774 | 0 | s = ReadTableProperties( |
1775 | 0 | file_reader.get(), file_meta->fd.GetFileSize(), |
1776 | 0 | Footer::kNullTableMagicNumber /* table's magic number */, ioptions, |
1777 | 0 | read_options, &props); |
1778 | 0 | if (!s.ok()) { |
1779 | 0 | return s; |
1780 | 0 | } |
1781 | 0 | *tp = std::move(props); |
1782 | 0 | RecordTick(ioptions.stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); |
1783 | 0 | return s; |
1784 | 0 | } |
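// Editorial usage sketch (not part of the source); `version`, `file_meta`
// and `logger` are assumed valid. Passing fname == nullptr takes the
// name-reconstruction branch above:
//
//   std::shared_ptr<const TableProperties> tp;
//   Status s =
//       version->GetTableProperties(ReadOptions(), &tp, file_meta, nullptr);
//   if (s.ok()) {
//     ROCKS_LOG_INFO(logger, "entries=%" PRIu64 " data_size=%" PRIu64,
//                    tp->num_entries, tp->data_size);
//   }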
1785 | | |
1786 | | Status Version::GetPropertiesOfAllTables( |
1787 | 0 | const ReadOptions& read_options, TablePropertiesCollection* props) const { |
1788 | 0 | Status s; |
1789 | 0 | for (int level = 0; level < storage_info_.num_levels_; level++) { |
1790 | 0 | s = GetPropertiesOfAllTables(read_options, props, level); |
1791 | 0 | if (!s.ok()) { |
1792 | 0 | return s; |
1793 | 0 | } |
1794 | 0 | } |
1795 | | |
1796 | 0 | return Status::OK(); |
1797 | 0 | } |
1798 | | |
1799 | | Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, |
1800 | 0 | std::string* out_str) { |
1801 | 0 | if (max_entries_to_print <= 0) { |
1802 | 0 | return Status::OK(); |
1803 | 0 | } |
1804 | 0 | int num_entries_left = max_entries_to_print; |
1805 | |
1806 | 0 | std::stringstream ss; |
1807 | | |
1808 | | // TODO: plumb Env::IOActivity, Env::IOPriority |
1809 | 0 | const ReadOptions read_options; |
1810 | 0 | for (int level = 0; level < storage_info_.num_levels_; level++) { |
1811 | 0 | for (const auto& file_meta : storage_info_.files_[level]) { |
1812 | 0 | auto fname = |
1813 | 0 | TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(), |
1814 | 0 | file_meta->fd.GetPathId()); |
1815 | |
1816 | 0 | ss << "=== file : " << fname << " ===\n"; |
1817 | |
1818 | 0 | TableCache* table_cache = cfd_->table_cache(); |
1819 | 0 | std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter; |
1820 | |
1821 | 0 | Status s = table_cache->GetRangeTombstoneIterator( |
1822 | 0 | read_options, cfd_->internal_comparator(), *file_meta, |
1823 | 0 | mutable_cf_options_, &tombstone_iter); |
1824 | 0 | if (!s.ok()) { |
1825 | 0 | return s; |
1826 | 0 | } |
1827 | 0 | if (tombstone_iter) { |
1828 | 0 | tombstone_iter->SeekToFirst(); |
1829 | | |
1830 | | // TODO: print timestamp |
1831 | 0 | while (tombstone_iter->Valid() && num_entries_left > 0) { |
1832 | 0 | ss << "start: " << tombstone_iter->start_key().ToString(true) |
1833 | 0 | << " end: " << tombstone_iter->end_key().ToString(true) |
1834 | 0 | << " seq: " << tombstone_iter->seq() << '\n'; |
1835 | 0 | tombstone_iter->Next(); |
1836 | 0 | num_entries_left--; |
1837 | 0 | } |
1838 | 0 | if (num_entries_left <= 0) { |
1839 | 0 | break; |
1840 | 0 | } |
1841 | 0 | } |
1842 | 0 | } |
1843 | 0 | if (num_entries_left <= 0) { |
1844 | 0 | break; |
1845 | 0 | } |
1846 | 0 | } |
1847 | 0 | assert(num_entries_left >= 0); |
1848 | 0 | if (num_entries_left <= 0) { |
1849 | 0 | ss << "(results may not be complete)\n"; |
1850 | 0 | } |
1851 | |
1852 | 0 | *out_str = ss.str(); |
1853 | 0 | return Status::OK(); |
1854 | 0 | } |
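// Editorial note: the summary assembled above looks like the following for
// each file (user keys hex-encoded by Slice::ToString(true); the path is
// illustrative):
//
//   === file : /db/000123.sst ===
//   start: 6B657931 end: 6B657939 seq: 42
//   (results may not be complete)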
1855 | | |
1856 | | Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, |
1857 | | TablePropertiesCollection* props, |
1858 | 0 | int level) const { |
1859 | 0 | for (const auto& file_meta : storage_info_.files_[level]) { |
1860 | 0 | auto fname = |
1861 | 0 | TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(), |
1862 | 0 | file_meta->fd.GetPathId()); |
1863 | | // 1. If the table is already present in table cache, load table |
1864 | | // properties from there. |
1865 | 0 | std::shared_ptr<const TableProperties> table_properties; |
1866 | 0 | Status s = |
1867 | 0 | GetTableProperties(read_options, &table_properties, file_meta, &fname); |
1868 | 0 | if (s.ok()) { |
1869 | 0 | props->insert({fname, table_properties}); |
1870 | 0 | } else { |
1871 | 0 | return s; |
1872 | 0 | } |
1873 | 0 | } |
1874 | | |
1875 | 0 | return Status::OK(); |
1876 | 0 | } |
1877 | | |
1878 | | Status Version::GetPropertiesOfTablesInRange( |
1879 | | const ReadOptions& read_options, const autovector<UserKeyRange>& ranges, |
1880 | 0 | TablePropertiesCollection* props) const { |
1881 | 0 | for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { |
1882 | 0 | for (const auto& range : ranges) { |
1883 | | // Convert user_key into a corresponding internal key. |
1884 | 0 | InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek); |
1885 | 0 | InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek); |
1886 | 0 | std::vector<FileMetaData*> files; |
1887 | 0 | storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr, |
1888 | 0 | false); |
1889 | 0 | for (const auto& file_meta : files) { |
1890 | 0 | auto fname = |
1891 | 0 | TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(), |
1892 | 0 | file_meta->fd.GetPathId()); |
1893 | 0 | if (props->count(fname) == 0) { |
1894 | | // 1. If the table is already present in table cache, load table |
1895 | | // properties from there. |
1896 | 0 | std::shared_ptr<const TableProperties> table_properties; |
1897 | 0 | Status s = GetTableProperties(read_options, &table_properties, |
1898 | 0 | file_meta, &fname); |
1899 | 0 | if (s.ok()) { |
1900 | 0 | props->insert({fname, table_properties}); |
1901 | 0 | } else { |
1902 | 0 | return s; |
1903 | 0 | } |
1904 | 0 | } |
1905 | 0 | } |
1906 | 0 | } |
1907 | 0 | } |
1908 | | |
1909 | 0 | return Status::OK(); |
1910 | 0 | } |
1911 | | |
1912 | | Status Version::GetPropertiesOfTablesByLevel( |
1913 | | const ReadOptions& read_options, |
1914 | | std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level) |
1915 | 0 | const { |
1916 | 0 | Status s; |
1917 | |
1918 | 0 | props_by_level->reserve(storage_info_.num_levels_); |
1919 | 0 | for (int level = 0; level < storage_info_.num_levels_; level++) { |
1920 | 0 | props_by_level->push_back(std::make_unique<TablePropertiesCollection>()); |
1921 | 0 | s = GetPropertiesOfAllTables(read_options, props_by_level->back().get(), |
1922 | 0 | level); |
1923 | 0 | if (!s.ok()) { |
1924 | 0 | return s; |
1925 | 0 | } |
1926 | 0 | } |
1927 | 0 | return Status::OK(); |
1928 | 0 | } |
1929 | | |
1930 | | Status Version::GetAggregatedTableProperties( |
1931 | | const ReadOptions& read_options, std::shared_ptr<const TableProperties>* tp, |
1932 | 0 | int level) { |
1933 | 0 | TablePropertiesCollection props; |
1934 | 0 | Status s; |
1935 | 0 | if (level < 0) { |
1936 | 0 | s = GetPropertiesOfAllTables(read_options, &props); |
1937 | 0 | } else { |
1938 | 0 | s = GetPropertiesOfAllTables(read_options, &props, level); |
1939 | 0 | } |
1940 | 0 | if (!s.ok()) { |
1941 | 0 | return s; |
1942 | 0 | } |
1943 | | |
1944 | 0 | auto* new_tp = new TableProperties(); |
1945 | 0 | for (const auto& item : props) { |
1946 | 0 | new_tp->Add(*item.second); |
1947 | 0 | } |
1948 | 0 | tp->reset(new_tp); |
1949 | 0 | return Status::OK(); |
1950 | 0 | } |
1951 | | |
1952 | 0 | size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) { |
1953 | 0 | size_t total_usage = 0; |
1954 | 0 | for (auto& file_level : storage_info_.level_files_brief_) { |
1955 | 0 | for (size_t i = 0; i < file_level.num_files; i++) { |
1956 | 0 | total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( |
1957 | 0 | file_options_, read_options, cfd_->internal_comparator(), |
1958 | 0 | *file_level.files[i].file_metadata, mutable_cf_options_); |
1959 | 0 | } |
1960 | 0 | } |
1961 | 0 | return total_usage; |
1962 | 0 | } |
1963 | | |
1964 | 65.4k | void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { |
1965 | 65.4k | assert(cf_meta); |
1966 | 65.4k | assert(cfd_); |
1967 | | |
1968 | 65.4k | cf_meta->name = cfd_->GetName(); |
1969 | 65.4k | cf_meta->size = 0; |
1970 | 65.4k | cf_meta->file_count = 0; |
1971 | 65.4k | cf_meta->levels.clear(); |
1972 | | |
1973 | 65.4k | cf_meta->blob_file_size = 0; |
1974 | 65.4k | cf_meta->blob_file_count = 0; |
1975 | 65.4k | cf_meta->blob_files.clear(); |
1976 | | |
1977 | 65.4k | const auto& ioptions = cfd_->ioptions(); |
1978 | 65.4k | auto* vstorage = storage_info(); |
1979 | | |
1980 | 523k | for (int level = 0; level < cfd_->NumberLevels(); level++) { |
1981 | 458k | uint64_t level_size = 0; |
1982 | 458k | cf_meta->file_count += vstorage->LevelFiles(level).size(); |
1983 | 458k | std::vector<SstFileMetaData> files; |
1984 | 458k | for (const auto& file : vstorage->LevelFiles(level)) { |
1985 | 93.8k | uint32_t path_id = file->fd.GetPathId(); |
1986 | 93.8k | std::string file_path; |
1987 | 93.8k | if (path_id < ioptions.cf_paths.size()) { |
1988 | 93.8k | file_path = ioptions.cf_paths[path_id].path; |
1989 | 93.8k | } else { |
1990 | 0 | assert(!ioptions.cf_paths.empty()); |
1991 | 0 | file_path = ioptions.cf_paths.back().path; |
1992 | 0 | } |
1993 | 93.8k | const uint64_t file_number = file->fd.GetNumber(); |
1994 | 93.8k | files.emplace_back( |
1995 | 93.8k | MakeTableFileName("", file_number), file_number, file_path, |
1996 | 93.8k | file->fd.GetFileSize(), file->fd.smallest_seqno, |
1997 | 93.8k | file->fd.largest_seqno, file->smallest.user_key().ToString(), |
1998 | 93.8k | file->largest.user_key().ToString(), |
1999 | 93.8k | file->stats.num_reads_sampled.load(std::memory_order_relaxed), |
2000 | 93.8k | file->being_compacted, file->temperature, |
2001 | 93.8k | file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), |
2002 | 93.8k | file->TryGetFileCreationTime(), file->epoch_number, |
2003 | 93.8k | file->file_checksum, file->file_checksum_func_name); |
2004 | 93.8k | files.back().num_entries = file->num_entries; |
2005 | 93.8k | files.back().num_deletions = file->num_deletions; |
2006 | 93.8k | files.back().smallest = file->smallest.Encode().ToString(); |
2007 | 93.8k | files.back().largest = file->largest.Encode().ToString(); |
2008 | 93.8k | level_size += file->fd.GetFileSize(); |
2009 | 93.8k | } |
2010 | 458k | cf_meta->levels.emplace_back(level, level_size, std::move(files)); |
2011 | 458k | cf_meta->size += level_size; |
2012 | 458k | } |
2013 | 65.4k | for (const auto& meta : vstorage->GetBlobFiles()) { |
2014 | 0 | assert(meta); |
2015 | |
2016 | 0 | cf_meta->blob_files.emplace_back( |
2017 | 0 | meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()), |
2018 | 0 | ioptions.cf_paths.front().path, meta->GetBlobFileSize(), |
2019 | 0 | meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(), |
2020 | 0 | meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(), |
2021 | 0 | meta->GetChecksumMethod(), meta->GetChecksumValue()); |
2022 | 0 | ++cf_meta->blob_file_count; |
2023 | 0 | cf_meta->blob_file_size += meta->GetBlobFileSize(); |
2024 | 0 | } |
2025 | 65.4k | } |
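// Editorial usage sketch (not part of the source): the structure filled in
// above is what the public DB::GetColumnFamilyMetaData() API returns;
// `logger` is an assumed valid Logger.
//
//   ColumnFamilyMetaData meta;
//   db->GetColumnFamilyMetaData(db->DefaultColumnFamily(), &meta);
//   for (const auto& level : meta.levels) {
//     ROCKS_LOG_INFO(logger, "L%d: %zu files, %" PRIu64 " bytes",
//                    level.level, level.files.size(), level.size);
//   }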
2026 | | |
2027 | 0 | uint64_t Version::GetSstFilesSize() { |
2028 | 0 | uint64_t sst_files_size = 0; |
2029 | 0 | for (int level = 0; level < storage_info_.num_levels_; level++) { |
2030 | 0 | for (const auto& file_meta : storage_info_.LevelFiles(level)) { |
2031 | 0 | sst_files_size += file_meta->fd.GetFileSize(); |
2032 | 0 | } |
2033 | 0 | } |
2034 | 0 | return sst_files_size; |
2035 | 0 | } |
2036 | | |
2037 | | void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key, |
2038 | 0 | Slice* largest_user_key) { |
2039 | 0 | smallest_user_key->clear(); |
2040 | 0 | largest_user_key->clear(); |
2041 | 0 | bool initialized = false; |
2042 | 0 | const Comparator* ucmp = storage_info_.user_comparator_; |
2043 | 0 | for (int level = 0; level < cfd_->NumberLevels(); level++) { |
2044 | 0 | if (storage_info_.LevelFiles(level).size() == 0) { |
2045 | 0 | continue; |
2046 | 0 | } |
2047 | 0 | if (level == 0) { |
2048 | | // we need to consider all files on level 0 |
2049 | 0 | for (const auto& file : storage_info_.LevelFiles(level)) { |
2050 | 0 | const Slice& start_user_key = file->smallest.user_key(); |
2051 | 0 | if (!initialized || |
2052 | 0 | ucmp->Compare(start_user_key, *smallest_user_key) < 0) { |
2053 | 0 | *smallest_user_key = start_user_key; |
2054 | 0 | } |
2055 | 0 | const Slice& end_user_key = file->largest.user_key(); |
2056 | 0 | if (!initialized || |
2057 | 0 | ucmp->Compare(end_user_key, *largest_user_key) > 0) { |
2058 | 0 | *largest_user_key = end_user_key; |
2059 | 0 | } |
2060 | 0 | initialized = true; |
2061 | 0 | } |
2062 | 0 | } else { |
2063 | | // we only need to consider the first and last file |
2064 | 0 | const Slice& start_user_key = |
2065 | 0 | storage_info_.LevelFiles(level)[0]->smallest.user_key(); |
2066 | 0 | if (!initialized || |
2067 | 0 | ucmp->Compare(start_user_key, *smallest_user_key) < 0) { |
2068 | 0 | *smallest_user_key = start_user_key; |
2069 | 0 | } |
2070 | 0 | const Slice& end_user_key = |
2071 | 0 | storage_info_.LevelFiles(level).back()->largest.user_key(); |
2072 | 0 | if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) { |
2073 | 0 | *largest_user_key = end_user_key; |
2074 | 0 | } |
2075 | 0 | initialized = true; |
2076 | 0 | } |
2077 | 0 | } |
2078 | 0 | } |
2079 | | |
2080 | 0 | void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { |
2081 | 0 | uint64_t oldest_time = std::numeric_limits<uint64_t>::max(); |
2082 | 0 | for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { |
2083 | 0 | for (FileMetaData* meta : storage_info_.LevelFiles(level)) { |
2084 | 0 | assert(meta->fd.table_reader != nullptr); |
2085 | 0 | uint64_t file_creation_time = meta->TryGetFileCreationTime(); |
2086 | 0 | if (file_creation_time == kUnknownFileCreationTime) { |
2087 | 0 | *creation_time = 0; |
2088 | 0 | return; |
2089 | 0 | } |
2090 | 0 | if (file_creation_time < oldest_time) { |
2091 | 0 | oldest_time = file_creation_time; |
2092 | 0 | } |
2093 | 0 | } |
2094 | 0 | } |
2095 | 0 | *creation_time = oldest_time; |
2096 | 0 | } |
2097 | | |
2098 | | InternalIterator* Version::TEST_GetLevelIterator( |
2099 | | const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder, |
2100 | 0 | int level, bool allow_unprepared_value) { |
2101 | 0 | auto* arena = merge_iter_builder->GetArena(); |
2102 | 0 | auto* mem = arena->AllocateAligned(sizeof(LevelIterator)); |
2103 | 0 | std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr = nullptr; |
2104 | 0 | auto level_iter = new (mem) LevelIterator( |
2105 | 0 | cfd_->table_cache(), read_options, file_options_, |
2106 | 0 | cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), |
2107 | 0 | mutable_cf_options_, should_sample_file_read(), |
2108 | 0 | cfd_->internal_stats()->GetFileReadHist(level), |
2109 | 0 | TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, |
2110 | 0 | nullptr /* range_del_agg */, nullptr /* compaction_boundaries */, |
2111 | 0 | allow_unprepared_value, &tombstone_iter_ptr); |
2112 | 0 | if (read_options.ignore_range_deletions) { |
2113 | 0 | merge_iter_builder->AddIterator(level_iter); |
2114 | 0 | } else { |
2115 | 0 | merge_iter_builder->AddPointAndTombstoneIterator( |
2116 | 0 | level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr); |
2117 | 0 | } |
2118 | 0 | return level_iter; |
2119 | 0 | } |
2120 | | |
2121 | 0 | uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { |
2122 | | // Estimation will be inaccurate when: |
2123 | | // (1) there exist merge keys |
2124 | | // (2) keys are directly overwritten |
2125 | | // (3) there are deletions of non-existing keys
2126 | | // (4) the number of samples is low
2127 | 0 | if (current_num_samples_ == 0) { |
2128 | 0 | return 0; |
2129 | 0 | } |
2130 | | |
2131 | 0 | if (current_num_non_deletions_ <= current_num_deletions_) { |
2132 | 0 | return 0; |
2133 | 0 | } |
2134 | | |
2135 | 0 | uint64_t est = current_num_non_deletions_ - current_num_deletions_; |
2136 | |
2137 | 0 | uint64_t file_count = 0; |
2138 | 0 | for (int level = 0; level < num_levels_; ++level) { |
2139 | 0 | file_count += files_[level].size(); |
2140 | 0 | } |
2141 | |
2142 | 0 | if (current_num_samples_ < file_count) { |
2143 | 0 | assert(current_num_samples_ != 0); |
2144 | 0 | assert(est != 0); |
2145 | 0 | double multiplier = static_cast<double>(file_count) / current_num_samples_; |
2146 | 0 | double maximum_multiplier = |
2147 | 0 | static_cast<double>(std::numeric_limits<uint64_t>::max()) / est; |
2148 | | // If it can overflow, we return the maximum unsigned long. |
2149 | 0 | if (multiplier >= maximum_multiplier) { |
2150 | 0 | return std::numeric_limits<uint64_t>::max(); |
2151 | 0 | } |
2152 | 0 | return static_cast<uint64_t>(est * multiplier); |
2153 | 0 | } else { |
2154 | 0 | return est; |
2155 | 0 | } |
2156 | 0 | } |
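// Editorial worked example of the extrapolation above: with
// current_num_non_deletions_ = 1,000,000, current_num_deletions_ = 200,000,
// file_count = 50 and current_num_samples_ = 10, the estimate is
// (1,000,000 - 200,000) * (50 / 10) = 4,000,000 active keys. Once every
// file has been sampled, the raw difference (800,000) is returned as-is.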
2157 | | |
2158 | | double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel( |
2159 | 0 | int level) const { |
2160 | 0 | assert(level < num_levels_); |
2161 | 0 | uint64_t sum_file_size_bytes = 0; |
2162 | 0 | uint64_t sum_data_size_bytes = 0; |
2163 | 0 | for (auto* file_meta : files_[level]) { |
2164 | 0 | auto raw_size = file_meta->raw_key_size + file_meta->raw_value_size; |
2165 | | // Check if the table property is properly initialized. It might not be,
2166 | | // because in `UpdateAccumulatedStats` we limit the maximum number of
2167 | | // properties read at once.
2168 | 0 | if (raw_size > 0) { |
2169 | 0 | sum_file_size_bytes += file_meta->fd.GetFileSize(); |
2170 | 0 | sum_data_size_bytes += raw_size; |
2171 | 0 | } |
2172 | 0 | } |
2173 | 0 | if (sum_file_size_bytes == 0) { |
2174 | 0 | return -1.0; |
2175 | 0 | } |
2176 | 0 | return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes; |
2177 | 0 | } |
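// Editorial worked example: if the files on a level with initialized stats
// hold sum_data_size_bytes = 512 MB of raw key+value data in
// sum_file_size_bytes = 128 MB of SST files, this returns 512.0 / 128 = 4.0,
// i.e. the level compresses roughly 4x. A return of -1.0 means no file on
// the level had its table properties loaded yet.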
2178 | | |
2179 | | void Version::AddIterators(const ReadOptions& read_options, |
2180 | | const FileOptions& soptions, |
2181 | | MergeIteratorBuilder* merge_iter_builder, |
2182 | 16.2k | bool allow_unprepared_value) { |
2183 | 16.2k | assert(storage_info_.finalized_); |
2184 | | |
2185 | 59.0k | for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { |
2186 | 42.8k | AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level, |
2187 | 42.8k | allow_unprepared_value); |
2188 | 42.8k | } |
2189 | 16.2k | } |
2190 | | |
2191 | | void Version::AddIteratorsForLevel(const ReadOptions& read_options, |
2192 | | const FileOptions& soptions, |
2193 | | MergeIteratorBuilder* merge_iter_builder, |
2194 | 42.8k | int level, bool allow_unprepared_value) { |
2195 | 42.8k | assert(storage_info_.finalized_); |
2196 | 42.8k | if (level >= storage_info_.num_non_empty_levels()) { |
2197 | | // This is an empty level |
2198 | 0 | return; |
2199 | 42.8k | } else if (storage_info_.LevelFilesBrief(level).num_files == 0) { |
2200 | | // No files in this level |
2201 | 26.4k | return; |
2202 | 26.4k | } |
2203 | | |
2204 | 16.3k | bool should_sample = should_sample_file_read(); |
2205 | | |
2206 | 16.3k | auto* arena = merge_iter_builder->GetArena(); |
2207 | 16.3k | if (level == 0) { |
2208 | | // Merge all level zero files together since they may overlap |
2209 | 11.5k | std::unique_ptr<TruncatedRangeDelIterator> tombstone_iter = nullptr; |
2210 | 29.2k | for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { |
2211 | 17.6k | const auto& file = storage_info_.LevelFilesBrief(0).files[i]; |
2212 | 17.6k | auto table_iter = cfd_->table_cache()->NewIterator( |
2213 | 17.6k | read_options, soptions, cfd_->internal_comparator(), |
2214 | 17.6k | *file.file_metadata, /*range_del_agg=*/nullptr, mutable_cf_options_, |
2215 | 17.6k | nullptr, cfd_->internal_stats()->GetFileReadHist(0), |
2216 | 17.6k | TableReaderCaller::kUserIterator, arena, |
2217 | 17.6k | /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, |
2218 | 17.6k | /*smallest_compaction_key=*/nullptr, |
2219 | 17.6k | /*largest_compaction_key=*/nullptr, allow_unprepared_value, |
2220 | 17.6k | /*range_del_read_seqno=*/nullptr, &tombstone_iter); |
2221 | 17.6k | if (read_options.ignore_range_deletions) { |
2222 | 0 | merge_iter_builder->AddIterator(table_iter); |
2223 | 17.6k | } else { |
2224 | 17.6k | merge_iter_builder->AddPointAndTombstoneIterator( |
2225 | 17.6k | table_iter, std::move(tombstone_iter)); |
2226 | 17.6k | } |
2227 | 17.6k | } |
2228 | 11.5k | if (should_sample) { |
2229 | | // Count one sample for every L0 file. This is done per iterator
2230 | | // creation rather than per Seek(), while files in other levels are
2231 | | // recorded per seek. If users execute one range query per iterator,
2232 | | // there may be some discrepancy here.
2233 | 10 | for (FileMetaData* meta : storage_info_.LevelFiles(0)) { |
2234 | 10 | sample_file_read_inc(meta); |
2235 | 10 | } |
2236 | 9 | } |
2237 | 11.5k | } else if (storage_info_.LevelFilesBrief(level).num_files > 0) { |
2238 | | // For levels > 0, we can use a concatenating iterator that sequentially |
2239 | | // walks through the non-overlapping files in the level, opening them |
2240 | | // lazily. |
2241 | 4.85k | auto* mem = arena->AllocateAligned(sizeof(LevelIterator)); |
2242 | 4.85k | std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr = nullptr; |
2243 | 4.85k | auto level_iter = new (mem) LevelIterator( |
2244 | 4.85k | cfd_->table_cache(), read_options, soptions, |
2245 | 4.85k | cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), |
2246 | 4.85k | mutable_cf_options_, should_sample_file_read(), |
2247 | 4.85k | cfd_->internal_stats()->GetFileReadHist(level), |
2248 | 4.85k | TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, |
2249 | 4.85k | /*range_del_agg=*/nullptr, |
2250 | 4.85k | /*compaction_boundaries=*/nullptr, allow_unprepared_value, |
2251 | 4.85k | &tombstone_iter_ptr); |
2252 | 4.85k | if (read_options.ignore_range_deletions) { |
2253 | 0 | merge_iter_builder->AddIterator(level_iter); |
2254 | 4.85k | } else { |
2255 | 4.85k | merge_iter_builder->AddPointAndTombstoneIterator( |
2256 | 4.85k | level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr); |
2257 | 4.85k | } |
2258 | 4.85k | } |
2259 | 16.3k | } |
2260 | | |
2261 | | Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, |
2262 | | const FileOptions& file_options, |
2263 | | const Slice& smallest_user_key, |
2264 | | const Slice& largest_user_key, |
2265 | 6.89k | int level, bool* overlap) { |
2266 | 6.89k | assert(storage_info_.finalized_); |
2267 | | |
2268 | 6.89k | auto icmp = cfd_->internal_comparator(); |
2269 | 6.89k | auto ucmp = icmp.user_comparator(); |
2270 | | |
2271 | 6.89k | Arena arena; |
2272 | 6.89k | Status status; |
2273 | 6.89k | ReadRangeDelAggregator range_del_agg(&icmp, |
2274 | 6.89k | kMaxSequenceNumber /* upper_bound */); |
2275 | | |
2276 | 6.89k | *overlap = false; |
2277 | | |
2278 | 6.89k | if (level == 0) { |
2279 | 3.67k | for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { |
2280 | 2.82k | const auto file = &storage_info_.LevelFilesBrief(0).files[i]; |
2281 | 2.82k | if (AfterFile(ucmp, &smallest_user_key, file) || |
2282 | 2.78k | BeforeFile(ucmp, &largest_user_key, file)) { |
2283 | 521 | continue; |
2284 | 521 | } |
2285 | 2.30k | ScopedArenaPtr<InternalIterator> iter(cfd_->table_cache()->NewIterator( |
2286 | 2.30k | read_options, file_options, cfd_->internal_comparator(), |
2287 | 2.30k | *file->file_metadata, &range_del_agg, mutable_cf_options_, nullptr, |
2288 | 2.30k | cfd_->internal_stats()->GetFileReadHist(0), |
2289 | 2.30k | TableReaderCaller::kUserIterator, &arena, |
2290 | 2.30k | /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, |
2291 | 2.30k | /*smallest_compaction_key=*/nullptr, |
2292 | 2.30k | /*largest_compaction_key=*/nullptr, |
2293 | 2.30k | /*allow_unprepared_value=*/false)); |
2294 | 2.30k | status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, |
2295 | 2.30k | iter.get(), overlap); |
2296 | 2.30k | if (!status.ok() || *overlap) { |
2297 | 2.29k | break; |
2298 | 2.29k | } |
2299 | 2.30k | } |
2300 | 3.75k | } else if (storage_info_.LevelFilesBrief(level).num_files > 0) { |
2301 | 625 | auto mem = arena.AllocateAligned(sizeof(LevelIterator)); |
2302 | 625 | ScopedArenaPtr<InternalIterator> iter(new (mem) LevelIterator( |
2303 | 625 | cfd_->table_cache(), read_options, file_options, |
2304 | 625 | cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), |
2305 | 625 | mutable_cf_options_, should_sample_file_read(), |
2306 | 625 | cfd_->internal_stats()->GetFileReadHist(level), |
2307 | 625 | TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, |
2308 | 625 | &range_del_agg, nullptr, false)); |
2309 | 625 | status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, |
2310 | 625 | iter.get(), overlap); |
2311 | 625 | } |
2312 | | |
2313 | 6.89k | if (status.ok() && *overlap == false && |
2314 | 4.05k | range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) { |
2315 | 0 | *overlap = true; |
2316 | 0 | } |
2317 | 6.89k | return status; |
2318 | 6.89k | } |
2319 | | |
2320 | | VersionStorageInfo::VersionStorageInfo( |
2321 | | const InternalKeyComparator* internal_comparator, |
2322 | | const Comparator* user_comparator, int levels, |
2323 | | CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage, |
2324 | | bool _force_consistency_checks, |
2325 | | EpochNumberRequirement epoch_number_requirement, SystemClock* clock, |
2326 | | uint32_t bottommost_file_compaction_delay, |
2327 | | OffpeakTimeOption offpeak_time_option) |
2328 | 328k | : internal_comparator_(internal_comparator), |
2329 | 328k | user_comparator_(user_comparator), |
2330 | | // cfd is nullptr if Version is dummy |
2331 | 328k | num_levels_(levels), |
2332 | 328k | num_non_empty_levels_(0), |
2333 | 328k | file_indexer_(user_comparator), |
2334 | 328k | compaction_style_(compaction_style), |
2335 | 328k | files_(new std::vector<FileMetaData*>[num_levels_]), |
2336 | 328k | base_level_(num_levels_ == 1 ? -1 : 1), |
2337 | 328k | lowest_unnecessary_level_(-1), |
2338 | 328k | level_multiplier_(0.0), |
2339 | 328k | files_by_compaction_pri_(num_levels_), |
2340 | 328k | level0_non_overlapping_(false), |
2341 | 328k | next_file_to_compact_by_size_(num_levels_), |
2342 | 328k | compaction_score_(num_levels_), |
2343 | 328k | compaction_level_(num_levels_), |
2344 | 328k | l0_delay_trigger_count_(0), |
2345 | 328k | compact_cursor_(num_levels_), |
2346 | 328k | accumulated_file_size_(0), |
2347 | 328k | accumulated_raw_key_size_(0), |
2348 | 328k | accumulated_raw_value_size_(0), |
2349 | 328k | accumulated_num_non_deletions_(0), |
2350 | 328k | accumulated_num_deletions_(0), |
2351 | 328k | current_num_non_deletions_(0), |
2352 | 328k | current_num_deletions_(0), |
2353 | 328k | current_num_samples_(0), |
2354 | 328k | estimated_compaction_needed_bytes_(0), |
2355 | 328k | clock_(clock), |
2356 | 328k | bottommost_file_compaction_delay_(bottommost_file_compaction_delay), |
2357 | 328k | finalized_(false), |
2358 | 328k | force_consistency_checks_(_force_consistency_checks), |
2359 | 328k | epoch_number_requirement_(epoch_number_requirement), |
2360 | 328k | offpeak_time_option_(std::move(offpeak_time_option)) { |
2361 | 328k | if (ref_vstorage != nullptr) { |
2362 | 138k | accumulated_file_size_ = ref_vstorage->accumulated_file_size_; |
2363 | 138k | accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_; |
2364 | 138k | accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_; |
2365 | 138k | accumulated_num_non_deletions_ = |
2366 | 138k | ref_vstorage->accumulated_num_non_deletions_; |
2367 | 138k | accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_; |
2368 | 138k | current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_; |
2369 | 138k | current_num_deletions_ = ref_vstorage->current_num_deletions_; |
2370 | 138k | current_num_samples_ = ref_vstorage->current_num_samples_; |
2371 | 138k | oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_; |
2372 | 138k | compact_cursor_ = ref_vstorage->compact_cursor_; |
2373 | 138k | compact_cursor_.resize(num_levels_); |
2374 | 138k | } |
2375 | 328k | } |
2376 | | |
2377 | | Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, |
2378 | | const FileOptions& file_opt, |
2379 | | const MutableCFOptions& mutable_cf_options, |
2380 | | const std::shared_ptr<IOTracer>& io_tracer, |
2381 | | uint64_t version_number, |
2382 | | EpochNumberRequirement epoch_number_requirement) |
2383 | 328k | : env_(vset->env_), |
2384 | 328k | clock_(vset->clock_), |
2385 | 328k | cfd_(column_family_data), |
2386 | 328k | info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions().logger), |
2387 | 328k | db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions().stats), |
2388 | 328k | table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), |
2389 | 328k | blob_source_(cfd_ ? cfd_->blob_source() : nullptr), |
2390 | | merge_operator_( |
2391 | 328k | (cfd_ == nullptr) ? nullptr : cfd_->ioptions().merge_operator.get()), |
2392 | 328k | storage_info_( |
2393 | 328k | (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), |
2394 | 328k | (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(), |
2395 | 328k | cfd_ == nullptr ? 0 : cfd_->NumberLevels(), |
2396 | 328k | cfd_ == nullptr ? kCompactionStyleLevel |
2397 | 328k | : cfd_->ioptions().compaction_style, |
2398 | 328k | (cfd_ == nullptr || cfd_->current() == nullptr) |
2399 | 328k | ? nullptr |
2400 | 328k | : cfd_->current()->storage_info(), |
2401 | 328k | cfd_ == nullptr ? false : cfd_->ioptions().force_consistency_checks, |
2402 | 328k | epoch_number_requirement, |
2403 | 328k | cfd_ == nullptr ? nullptr : cfd_->ioptions().clock, |
2404 | 328k | cfd_ == nullptr ? 0 |
2405 | 328k | : mutable_cf_options.bottommost_file_compaction_delay, |
2406 | 328k | vset->offpeak_time_option()), |
2407 | 328k | vset_(vset), |
2408 | 328k | next_(this), |
2409 | 328k | prev_(this), |
2410 | 328k | refs_(0), |
2411 | 328k | file_options_(file_opt), |
2412 | 328k | mutable_cf_options_(mutable_cf_options), |
2413 | | max_file_size_for_l0_meta_pin_( |
2414 | 328k | MaxFileSizeForL0MetaPin(mutable_cf_options_)), |
2415 | 328k | version_number_(version_number), |
2416 | 328k | io_tracer_(io_tracer), |
2417 | 328k | use_async_io_(false) { |
2418 | 328k | if (CheckFSFeatureSupport(env_->GetFileSystem().get(), |
2419 | 328k | FSSupportedOps::kAsyncIO)) { |
2420 | 0 | use_async_io_ = true; |
2421 | 0 | } |
2422 | 328k | } |
2423 | | |
2424 | | Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, |
2425 | | const Slice& blob_index_slice, |
2426 | | FilePrefetchBuffer* prefetch_buffer, |
2427 | 0 | PinnableSlice* value, uint64_t* bytes_read) const { |
2428 | 0 | BlobIndex blob_index; |
2429 | |
2430 | 0 | { |
2431 | 0 | Status s = blob_index.DecodeFrom(blob_index_slice); |
2432 | 0 | if (!s.ok()) { |
2433 | 0 | return s; |
2434 | 0 | } |
2435 | 0 | } |
2436 | | |
2437 | 0 | return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value, |
2438 | 0 | bytes_read); |
2439 | 0 | } |
2440 | | |
2441 | | Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, |
2442 | | const BlobIndex& blob_index, |
2443 | | FilePrefetchBuffer* prefetch_buffer, |
2444 | 0 | PinnableSlice* value, uint64_t* bytes_read) const { |
2445 | 0 | assert(value); |
2446 | |
|
2447 | 0 | if (blob_index.HasTTL() || blob_index.IsInlined()) { |
2448 | 0 | return Status::Corruption("Unexpected TTL/inlined blob index"); |
2449 | 0 | } |
2450 | | |
2451 | 0 | const uint64_t blob_file_number = blob_index.file_number(); |
2452 | |
2453 | 0 | auto blob_file_meta = storage_info_.GetBlobFileMetaData(blob_file_number); |
2454 | 0 | if (!blob_file_meta) { |
2455 | 0 | return Status::Corruption("Invalid blob file number"); |
2456 | 0 | } |
2457 | | |
2458 | 0 | assert(blob_source_); |
2459 | 0 | value->Reset(); |
2460 | 0 | const Status s = blob_source_->GetBlob( |
2461 | 0 | read_options, user_key, blob_file_number, blob_index.offset(), |
2462 | 0 | blob_file_meta->GetBlobFileSize(), blob_index.size(), |
2463 | 0 | blob_index.compression(), prefetch_buffer, value, bytes_read); |
2464 | |
2465 | 0 | return s; |
2466 | 0 | } |
2467 | | |
2468 | | void Version::MultiGetBlob( |
2469 | | const ReadOptions& read_options, MultiGetRange& range, |
2470 | 0 | std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) { |
2471 | 0 | assert(!blob_ctxs.empty()); |
2472 | |
2473 | 0 | autovector<BlobFileReadRequests> blob_reqs; |
2474 | |
2475 | 0 | for (auto& ctx : blob_ctxs) { |
2476 | 0 | const auto file_number = ctx.first; |
2477 | 0 | const auto blob_file_meta = storage_info_.GetBlobFileMetaData(file_number); |
2478 | |
2479 | 0 | autovector<BlobReadRequest> blob_reqs_in_file; |
2480 | 0 | BlobReadContexts& blobs_in_file = ctx.second; |
2481 | 0 | for (auto& blob : blobs_in_file) { |
2482 | 0 | const BlobIndex& blob_index = blob.blob_index; |
2483 | 0 | const KeyContext* const key_context = blob.key_context; |
2484 | 0 | assert(key_context); |
2485 | 0 | assert(key_context->get_context); |
2486 | 0 | assert(key_context->s); |
2487 | |
2488 | 0 | if (key_context->value) { |
2489 | 0 | key_context->value->Reset(); |
2490 | 0 | } else { |
2491 | 0 | assert(key_context->columns); |
2492 | 0 | key_context->columns->Reset(); |
2493 | 0 | } |
2494 | |
2495 | 0 | if (!blob_file_meta) { |
2496 | 0 | *key_context->s = Status::Corruption("Invalid blob file number"); |
2497 | 0 | continue; |
2498 | 0 | } |
2499 | | |
2500 | 0 | if (blob_index.HasTTL() || blob_index.IsInlined()) { |
2501 | 0 | *key_context->s = |
2502 | 0 | Status::Corruption("Unexpected TTL/inlined blob index"); |
2503 | 0 | continue; |
2504 | 0 | } |
2505 | | |
2506 | 0 | blob_reqs_in_file.emplace_back( |
2507 | 0 | key_context->get_context->ukey_to_get_blob_value(), |
2508 | 0 | blob_index.offset(), blob_index.size(), blob_index.compression(), |
2509 | 0 | &blob.result, key_context->s); |
2510 | 0 | } |
2511 | 0 | if (blob_reqs_in_file.size() > 0) { |
2512 | 0 | const auto file_size = blob_file_meta->GetBlobFileSize(); |
2513 | 0 | blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file); |
2514 | 0 | } |
2515 | 0 | } |
2516 | |
2517 | 0 | if (blob_reqs.size() > 0) { |
2518 | 0 | blob_source_->MultiGetBlob(read_options, blob_reqs, |
2519 | 0 | /*bytes_read=*/nullptr); |
2520 | 0 | } |
2521 | |
2522 | 0 | for (auto& ctx : blob_ctxs) { |
2523 | 0 | BlobReadContexts& blobs_in_file = ctx.second; |
2524 | 0 | for (auto& blob : blobs_in_file) { |
2525 | 0 | const KeyContext* const key_context = blob.key_context; |
2526 | 0 | assert(key_context); |
2527 | 0 | assert(key_context->get_context); |
2528 | 0 | assert(key_context->s); |
2529 | |
2530 | 0 | if (key_context->s->ok()) { |
2531 | 0 | if (key_context->value) { |
2532 | 0 | *key_context->value = std::move(blob.result); |
2533 | 0 | range.AddValueSize(key_context->value->size()); |
2534 | 0 | } else { |
2535 | 0 | assert(key_context->columns); |
2536 | 0 | key_context->columns->SetPlainValue(std::move(blob.result)); |
2537 | 0 | range.AddValueSize(key_context->columns->serialized_size()); |
2538 | 0 | } |
2539 | |
2540 | 0 | if (range.GetValueSize() > read_options.value_size_soft_limit) { |
2541 | 0 | *key_context->s = Status::Aborted(); |
2542 | 0 | } |
2543 | 0 | } else if (key_context->s->IsIncomplete()) { |
2544 | | // read_options.read_tier == kBlockCacheTier |
2545 | | // Cannot read blob(s): no disk I/O allowed |
2546 | 0 | auto& get_context = *(key_context->get_context); |
2547 | 0 | get_context.MarkKeyMayExist(); |
2548 | 0 | } |
2549 | 0 | } |
2550 | 0 | } |
2551 | 0 | } |
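 | | // Illustrative shape of the input above (hypothetical file numbers):
 | | // with blob_ctxs == { 7 -> [k1, k3], 9 -> [k2] }, the first loop
 | | // builds one request batch per blob file (7 and 9), a single
 | | // blob_source_->MultiGetBlob() call issues both batches, and the
 | | // second loop moves each blob.result into its key's value or columns.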
2552 | | |
2553 | | void Version::Get(const ReadOptions& read_options, const LookupKey& k, |
2554 | | PinnableSlice* value, PinnableWideColumns* columns, |
2555 | | std::string* timestamp, Status* status, |
2556 | | MergeContext* merge_context, |
2557 | | SequenceNumber* max_covering_tombstone_seq, |
2558 | | PinnedIteratorsManager* pinned_iters_mgr, bool* value_found, |
2559 | | bool* key_exists, SequenceNumber* seq, ReadCallback* callback, |
2560 | 2.43k | bool* is_blob, bool do_merge) { |
2561 | 2.43k | Slice ikey = k.internal_key(); |
2562 | 2.43k | Slice user_key = k.user_key(); |
2563 | | |
2564 | 2.43k | assert(status->ok() || status->IsMergeInProgress()); |
2565 | | |
2566 | 2.43k | if (key_exists != nullptr) { |
2567 | | // will be set to false below if not found
2568 | 0 | *key_exists = true; |
2569 | 0 | } |
2570 | | |
2571 | 2.43k | uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; |
2572 | 2.43k | if (vset_ && vset_->block_cache_tracer_ && |
2573 | 2.43k | vset_->block_cache_tracer_->is_tracing_enabled()) { |
2574 | 0 | tracing_get_id = vset_->block_cache_tracer_->NextGetId(); |
2575 | 0 | } |
2576 | | |
2577 | | // Note: the old StackableDB-based BlobDB passes in |
2578 | | // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we |
2579 | | // need to provide it here. |
2580 | 2.43k | bool is_blob_index = false; |
2581 | 2.43k | bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; |
2582 | 2.43k | BlobFetcher blob_fetcher(this, read_options); |
2583 | | |
2584 | 2.43k | assert(pinned_iters_mgr); |
2585 | 2.43k | GetContext get_context( |
2586 | 2.43k | user_comparator(), merge_operator_, info_log_, db_statistics_, |
2587 | 2.43k | status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, |
2588 | 2.43k | do_merge ? value : nullptr, do_merge ? columns : nullptr, |
2589 | 2.43k | do_merge ? timestamp : nullptr, value_found, merge_context, do_merge, |
2590 | 2.43k | max_covering_tombstone_seq, clock_, seq, |
2591 | 2.43k | merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use, |
2592 | 2.43k | tracing_get_id, &blob_fetcher); |
2593 | | |
2594 | | // Pin blocks that we read to hold merge operands |
2595 | 2.43k | if (merge_operator_) { |
2596 | 0 | pinned_iters_mgr->StartPinning(); |
2597 | 0 | } |
2598 | | |
2599 | 2.43k | FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_, |
2600 | 2.43k | storage_info_.num_non_empty_levels_, |
2601 | 2.43k | &storage_info_.file_indexer_, user_comparator(), |
2602 | 2.43k | internal_comparator()); |
2603 | 2.43k | FdWithKeyRange* f = fp.GetNextFile(); |
2604 | | |
2605 | 2.86k | while (f != nullptr) { |
2606 | 1.47k | if (*max_covering_tombstone_seq > 0) { |
2607 | | // The remaining files we look at will only contain covered keys, so we |
2608 | | // stop here. |
2609 | 0 | break; |
2610 | 0 | } |
2611 | 1.47k | if (get_context.sample()) { |
2612 | 3 | sample_file_read_inc(f->file_metadata); |
2613 | 3 | } |
2614 | | |
2615 | 1.47k | bool timer_enabled = |
2616 | 1.47k | GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && |
2617 | 0 | get_perf_context()->per_level_perf_context_enabled; |
2618 | 1.47k | StopWatchNano timer(clock_, timer_enabled /* auto_start */); |
2619 | 1.47k | *status = table_cache_->Get( |
2620 | 1.47k | read_options, *internal_comparator(), *f->file_metadata, ikey, |
2621 | 1.47k | &get_context, mutable_cf_options_, |
2622 | 1.47k | cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), |
2623 | 1.47k | IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()), |
2624 | 1.47k | fp.IsHitFileLastInLevel()), |
2625 | 1.47k | fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_); |
2626 | | // TODO: examine the behavior for corrupted key |
2627 | 1.47k | if (timer_enabled) { |
2628 | 0 | PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), |
2629 | 0 | fp.GetHitFileLevel()); |
2630 | 0 | } |
2631 | 1.47k | if (!status->ok()) { |
2632 | 0 | if (db_statistics_ != nullptr) { |
2633 | 0 | get_context.ReportCounters(); |
2634 | 0 | } |
2635 | 0 | return; |
2636 | 0 | } |
2637 | | |
2638 | | // report the counters before returning |
2639 | 1.47k | if (get_context.State() != GetContext::kNotFound && |
2640 | 1.05k | get_context.State() != GetContext::kMerge && |
2641 | 1.05k | db_statistics_ != nullptr) { |
2642 | 0 | get_context.ReportCounters(); |
2643 | 0 | } |
2644 | 1.47k | switch (get_context.State()) { |
2645 | 424 | case GetContext::kNotFound: |
2646 | | // Keep searching in other files |
2647 | 424 | break; |
2648 | 0 | case GetContext::kMerge: |
2649 | | // TODO: update per-level perfcontext user_key_return_count for kMerge |
2650 | 0 | break; |
2651 | 587 | case GetContext::kFound: |
2652 | 587 | if (fp.GetHitFileLevel() == 0) { |
2653 | 364 | RecordTick(db_statistics_, GET_HIT_L0); |
2654 | 364 | } else if (fp.GetHitFileLevel() == 1) { |
2655 | 0 | RecordTick(db_statistics_, GET_HIT_L1); |
2656 | 223 | } else if (fp.GetHitFileLevel() >= 2) { |
2657 | 223 | RecordTick(db_statistics_, GET_HIT_L2_AND_UP); |
2658 | 223 | } |
2659 | | |
2660 | 587 | PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, |
2661 | 587 | fp.GetHitFileLevel()); |
2662 | | |
2663 | 587 | if (is_blob_index && do_merge && (value || columns)) { |
2664 | 0 | Slice blob_index = |
2665 | 0 | value ? *value |
2666 | 0 | : WideColumnsHelper::GetDefaultColumn(columns->columns()); |
2667 | |
2668 | 0 | TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex", |
2669 | 0 | &blob_index); |
2670 | |
2671 | 0 | constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; |
2672 | |
2673 | 0 | PinnableSlice result; |
2674 | |
2675 | 0 | constexpr uint64_t* bytes_read = nullptr; |
2676 | |
2677 | 0 | *status = GetBlob(read_options, get_context.ukey_to_get_blob_value(), |
2678 | 0 | blob_index, prefetch_buffer, &result, bytes_read); |
2679 | 0 | if (!status->ok()) { |
2680 | 0 | if (status->IsIncomplete()) { |
2681 | 0 | get_context.MarkKeyMayExist(); |
2682 | 0 | } |
2683 | 0 | return; |
2684 | 0 | } |
2685 | | |
2686 | 0 | if (value) { |
2687 | 0 | *value = std::move(result); |
2688 | 0 | } else { |
2689 | 0 | assert(columns); |
2690 | 0 | columns->SetPlainValue(std::move(result)); |
2691 | 0 | } |
2692 | 0 | } |
2693 | | |
2694 | 587 | return; |
2695 | 587 | case GetContext::kDeleted: |
2696 | | // Use empty error message for speed |
2697 | 466 | *status = Status::NotFound(); |
2698 | 466 | return; |
2699 | 0 | case GetContext::kCorrupt: |
2700 | 0 | *status = Status::Corruption("corrupted key for ", user_key); |
2701 | 0 | return; |
2702 | 0 | case GetContext::kUnexpectedBlobIndex: |
2703 | 0 | ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); |
2704 | 0 | *status = Status::NotSupported( |
2705 | 0 | "Encounter unexpected blob index. Please open DB with " |
2706 | 0 | "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); |
2707 | 0 | return; |
2708 | 0 | case GetContext::kMergeOperatorFailed: |
2709 | 0 | *status = Status::Corruption(Status::SubCode::kMergeOperatorFailed); |
2710 | 0 | return; |
2711 | 1.47k | } |
2712 | 424 | f = fp.GetNextFile(); |
2713 | 424 | } |
2714 | 1.38k | if (db_statistics_ != nullptr) { |
2715 | 0 | get_context.ReportCounters(); |
2716 | 0 | } |
2717 | 1.38k | if (GetContext::kMerge == get_context.State()) { |
2718 | 0 | if (!do_merge) { |
2719 | 0 | *status = Status::OK(); |
2720 | 0 | return; |
2721 | 0 | } |
2722 | 0 | if (!merge_operator_) { |
2723 | 0 | *status = Status::InvalidArgument( |
2724 | 0 | "merge_operator is not properly initialized."); |
2725 | 0 | return; |
2726 | 0 | } |
2727 | | // Merge operands are in the saver and we hit the beginning of the key
2728 | | // history; do a final merge of nullptr and the operands.
2729 | 0 | if (value || columns) { |
2730 | | // `op_failure_scope` (an output parameter) is not provided (set to |
2731 | | // nullptr) since a failure must be propagated regardless of its value. |
2732 | 0 | *status = MergeHelper::TimedFullMerge( |
2733 | 0 | merge_operator_, user_key, MergeHelper::kNoBaseValue, |
2734 | 0 | merge_context->GetOperands(), info_log_, db_statistics_, clock_, |
2735 | 0 | /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, |
2736 | 0 | value ? value->GetSelf() : nullptr, columns); |
2737 | 0 | if (status->ok()) { |
2738 | 0 | if (LIKELY(value != nullptr)) { |
2739 | 0 | value->PinSelf(); |
2740 | 0 | } |
2741 | 0 | } |
2742 | 0 | } |
2743 | 1.38k | } else { |
2744 | 1.38k | if (key_exists != nullptr) { |
2745 | 0 | *key_exists = false; |
2746 | 0 | } |
2747 | 1.38k | *status = Status::NotFound(); // Use an empty error message for speed |
2748 | 1.38k | } |
2749 | 1.38k | } |
2750 | | |
2751 | | void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, |
2752 | 0 | ReadCallback* callback) { |
2753 | 0 | PinnedIteratorsManager pinned_iters_mgr; |
2754 | | |
2755 | | // Pin blocks that we read to hold merge operands |
2756 | 0 | if (merge_operator_) { |
2757 | 0 | pinned_iters_mgr.StartPinning(); |
2758 | 0 | } |
2759 | 0 | uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; |
2760 | |
2761 | 0 | if (vset_ && vset_->block_cache_tracer_ && |
2762 | 0 | vset_->block_cache_tracer_->is_tracing_enabled()) { |
2763 | 0 | tracing_mget_id = vset_->block_cache_tracer_->NextGetId(); |
2764 | 0 | } |
2765 | | // Even though we know the batch size won't be > MAX_BATCH_SIZE, |
2766 | | // use autovector in order to avoid unnecessary construction of GetContext |
2767 | | // objects, which is expensive |
2768 | 0 | autovector<GetContext, 16> get_ctx; |
2769 | 0 | BlobFetcher blob_fetcher(this, read_options); |
2770 | 0 | for (auto iter = range->begin(); iter != range->end(); ++iter) { |
2771 | 0 | assert(iter->s->ok() || iter->s->IsMergeInProgress()); |
2772 | 0 | get_ctx.emplace_back( |
2773 | 0 | user_comparator(), merge_operator_, info_log_, db_statistics_, |
2774 | 0 | iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, |
2775 | 0 | iter->ukey_with_ts, iter->value, iter->columns, iter->timestamp, |
2776 | 0 | nullptr, &(iter->merge_context), true, |
2777 | 0 | &iter->max_covering_tombstone_seq, clock_, nullptr, |
2778 | 0 | merge_operator_ ? &pinned_iters_mgr : nullptr, callback, |
2779 | 0 | &iter->is_blob_index, tracing_mget_id, &blob_fetcher); |
2780 | | // MergeInProgress status, if set, has been transferred to the get_context |
2781 | | // state, so we set status to ok here. From now on, the iter status will |
2782 | | // be used for IO errors, and get_context state will be used for any |
2783 | | // key level errors |
2784 | 0 | *(iter->s) = Status::OK(); |
2785 | 0 | } |
2786 | 0 | int get_ctx_index = 0; |
2787 | 0 | for (auto iter = range->begin(); iter != range->end(); |
2788 | 0 | ++iter, get_ctx_index++) { |
2789 | 0 | iter->get_context = &(get_ctx[get_ctx_index]); |
2790 | 0 | } |
2791 | |
2792 | 0 | Status s; |
2793 | | // blob_file => [[blob_idx, it], ...] |
2794 | 0 | std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs; |
2795 | 0 | MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end()); |
2796 | | #if USE_COROUTINES |
2797 | | if (read_options.async_io && read_options.optimize_multiget_for_io && |
2798 | | using_coroutines() && use_async_io_) { |
2799 | | s = MultiGetAsync(read_options, range, &blob_ctxs); |
2800 | | } else |
2801 | | #endif // USE_COROUTINES |
2802 | 0 | { |
2803 | 0 | MultiGetRange file_picker_range(*range, range->begin(), range->end()); |
2804 | 0 | FilePickerMultiGet fp(&file_picker_range, &storage_info_.level_files_brief_, |
2805 | 0 | storage_info_.num_non_empty_levels_, |
2806 | 0 | &storage_info_.file_indexer_, user_comparator(), |
2807 | 0 | internal_comparator()); |
2808 | 0 | FdWithKeyRange* f = fp.GetNextFileInLevel(); |
2809 | 0 | uint64_t num_index_read = 0; |
2810 | 0 | uint64_t num_filter_read = 0; |
2811 | 0 | uint64_t num_sst_read = 0; |
2812 | 0 | uint64_t num_level_read = 0; |
2813 | |
2814 | 0 | int prev_level = -1; |
2815 | |
2816 | 0 | while (!fp.IsSearchEnded()) { |
2817 | | // This will be set to true later if we actually look up in a file in L0. |
2818 | | // For per level stats purposes, an L0 file is treated as a level |
2819 | | // For per-level stats purposes, an L0 file is treated as its own level.
2820 | | |
2821 | | // Avoid using the coroutine version if we're looking in an L0 file, since
2822 | | // L0 files won't be parallelized anyway. The regular synchronous version |
2823 | | // is faster. |
2824 | 0 | if (!read_options.async_io || !using_coroutines() || !use_async_io_ || |
2825 | 0 | fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) { |
2826 | 0 | if (f) { |
2827 | 0 | bool skip_filters = |
2828 | 0 | IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()), |
2829 | 0 | fp.IsHitFileLastInLevel()); |
2830 | | // Call MultiGetFromSST for looking up a single file |
2831 | 0 | s = MultiGetFromSST(read_options, fp.CurrentFileRange(), |
2832 | 0 | fp.GetHitFileLevel(), skip_filters, |
2833 | 0 | /*skip_range_deletions=*/false, f, blob_ctxs, |
2834 | 0 | /*table_handle=*/nullptr, num_filter_read, |
2835 | 0 | num_index_read, num_sst_read); |
2836 | 0 | if (fp.GetHitFileLevel() == 0) { |
2837 | 0 | dump_stats_for_l0_file = true; |
2838 | 0 | } |
2839 | 0 | } |
2840 | 0 | if (s.ok()) { |
2841 | 0 | f = fp.GetNextFileInLevel(); |
2842 | 0 | } |
2843 | | #if USE_COROUTINES |
2844 | | } else { |
2845 | | std::vector<folly::coro::Task<Status>> mget_tasks; |
2846 | | while (f != nullptr) { |
2847 | | MultiGetRange file_range = fp.CurrentFileRange(); |
2848 | | TableCache::TypedHandle* table_handle = nullptr; |
2849 | | bool skip_filters = |
2850 | | IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()), |
2851 | | fp.IsHitFileLastInLevel()); |
2852 | | bool skip_range_deletions = false; |
2853 | | if (!skip_filters) { |
2854 | | Status status = table_cache_->MultiGetFilter( |
2855 | | read_options, *internal_comparator(), *f->file_metadata, |
2856 | | mutable_cf_options_, |
2857 | | cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), |
2858 | | fp.GetHitFileLevel(), &file_range, &table_handle); |
2859 | | skip_range_deletions = true; |
2860 | | if (status.ok()) { |
2861 | | skip_filters = true; |
2862 | | } else if (!status.IsNotSupported()) { |
2863 | | s = status; |
2864 | | } |
2865 | | } |
2866 | | |
2867 | | if (!s.ok()) { |
2868 | | break; |
2869 | | } |
2870 | | |
2871 | | if (!file_range.empty()) { |
2872 | | mget_tasks.emplace_back(MultiGetFromSSTCoroutine( |
2873 | | read_options, file_range, fp.GetHitFileLevel(), skip_filters, |
2874 | | skip_range_deletions, f, blob_ctxs, table_handle, |
2875 | | num_filter_read, num_index_read, num_sst_read)); |
2876 | | } |
2877 | | if (fp.KeyMaySpanNextFile()) { |
2878 | | break; |
2879 | | } |
2880 | | f = fp.GetNextFileInLevel(); |
2881 | | } |
2882 | | if (mget_tasks.size() > 0) { |
2883 | | RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, |
2884 | | mget_tasks.size()); |
2885 | | // Collect all results so far |
2886 | | std::vector<Status> statuses = |
2887 | | folly::coro::blockingWait(co_withExecutor( |
2888 | | &range->context()->executor(), |
2889 | | folly::coro::collectAllRange(std::move(mget_tasks)))); |
2890 | | if (s.ok()) { |
2891 | | for (Status stat : statuses) { |
2892 | | if (!stat.ok()) { |
2893 | | s = std::move(stat); |
2894 | | break; |
2895 | | } |
2896 | | } |
2897 | | } |
2898 | | |
2899 | | if (s.ok() && fp.KeyMaySpanNextFile()) { |
2900 | | f = fp.GetNextFileInLevel(); |
2901 | | } |
2902 | | } |
2903 | | #endif // USE_COROUTINES |
2904 | 0 | } |
2905 | | // If we have a bad status or found the final result for all the keys
2906 | 0 | if (!s.ok() || file_picker_range.empty()) { |
2907 | 0 | break; |
2908 | 0 | } |
2909 | 0 | if (!f) { |
2910 | | // Reached the end of this level. Prepare the next level |
2911 | 0 | fp.PrepareNextLevelForSearch(); |
2912 | 0 | if (!fp.IsSearchEnded()) { |
2913 | | // It's possible there is no overlap on this level and f is nullptr
2914 | 0 | f = fp.GetNextFileInLevel(); |
2915 | 0 | } |
2916 | 0 | if (dump_stats_for_l0_file || |
2917 | 0 | (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) { |
2918 | | // Dump the stats if the search has moved to the next level and |
2919 | | // reset for next level. |
2920 | 0 | if (num_filter_read + num_index_read) { |
2921 | 0 | RecordInHistogram(db_statistics_, |
2922 | 0 | NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, |
2923 | 0 | num_index_read + num_filter_read); |
2924 | 0 | } |
2925 | 0 | if (num_sst_read) { |
2926 | 0 | RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, |
2927 | 0 | num_sst_read); |
2928 | 0 | num_level_read++; |
2929 | 0 | } |
2930 | 0 | num_filter_read = 0; |
2931 | 0 | num_index_read = 0; |
2932 | 0 | num_sst_read = 0; |
2933 | 0 | } |
2934 | 0 | prev_level = fp.GetHitFileLevel(); |
2935 | 0 | } |
2936 | 0 | } |
2937 | | |
2938 | | // Dump stats for most recent level |
2939 | 0 | if (num_filter_read + num_index_read) { |
2940 | 0 | RecordInHistogram(db_statistics_, |
2941 | 0 | NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, |
2942 | 0 | num_index_read + num_filter_read); |
2943 | 0 | } |
2944 | 0 | if (num_sst_read) { |
2945 | 0 | RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read); |
2946 | 0 | num_level_read++; |
2947 | 0 | } |
2948 | 0 | if (num_level_read) { |
2949 | 0 | RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, |
2950 | 0 | num_level_read); |
2951 | 0 | } |
2952 | 0 | } |
2953 | |
2954 | 0 | if (!blob_ctxs.empty()) { |
2955 | 0 | MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs); |
2956 | 0 | } |
2957 | | |
2958 | | // Process any left over keys |
2959 | 0 | for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) { |
2960 | 0 | GetContext& get_context = *iter->get_context; |
2961 | 0 | Status* status = iter->s; |
2962 | 0 | Slice user_key = iter->lkey->user_key(); |
2963 | |
2964 | 0 | if (db_statistics_ != nullptr) { |
2965 | 0 | get_context.ReportCounters(); |
2966 | 0 | } |
2967 | 0 | if (GetContext::kMerge == get_context.State()) { |
2968 | 0 | if (!merge_operator_) { |
2969 | 0 | *status = Status::InvalidArgument( |
2970 | 0 | "merge_operator is not properly initialized."); |
2971 | 0 | range->MarkKeyDone(iter); |
2972 | 0 | continue; |
2973 | 0 | } |
2974 | | // Merge operands are in the saver and we hit the beginning of the key
2975 | | // history; do a final merge of nullptr and the operands.
2976 | | // `op_failure_scope` (an output parameter) is not provided (set to |
2977 | | // nullptr) since a failure must be propagated regardless of its value. |
2978 | 0 | *status = MergeHelper::TimedFullMerge( |
2979 | 0 | merge_operator_, user_key, MergeHelper::kNoBaseValue, |
2980 | 0 | iter->merge_context.GetOperands(), info_log_, db_statistics_, clock_, |
2981 | 0 | /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, |
2982 | 0 | iter->value ? iter->value->GetSelf() : nullptr, iter->columns); |
2983 | 0 | if (LIKELY(iter->value != nullptr)) { |
2984 | 0 | iter->value->PinSelf(); |
2985 | 0 | range->AddValueSize(iter->value->size()); |
2986 | 0 | } else { |
2987 | 0 | assert(iter->columns); |
2988 | 0 | range->AddValueSize(iter->columns->serialized_size()); |
2989 | 0 | } |
2990 | |
2991 | 0 | range->MarkKeyDone(iter); |
2992 | 0 | if (range->GetValueSize() > read_options.value_size_soft_limit) { |
2993 | 0 | s = Status::Aborted(); |
2994 | 0 | break; |
2995 | 0 | } |
2996 | 0 | } else { |
2997 | 0 | range->MarkKeyDone(iter); |
2998 | 0 | *status = Status::NotFound(); // Use an empty error message for speed |
2999 | 0 | } |
3000 | 0 | } |
3001 | |
3002 | 0 | for (auto iter = range->begin(); iter != range->end(); ++iter) { |
3003 | 0 | range->MarkKeyDone(iter); |
3004 | 0 | *(iter->s) = s; |
3005 | 0 | } |
3006 | 0 | } |
3007 | | |
3008 | | #ifdef USE_COROUTINES |
3009 | | Status Version::ProcessBatch( |
3010 | | const ReadOptions& read_options, FilePickerMultiGet* batch, |
3011 | | std::vector<folly::coro::Task<Status>>& mget_tasks, |
3012 | | std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs, |
3013 | | autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting, |
3014 | | std::deque<size_t>& to_process, unsigned int& num_tasks_queued, |
3015 | | std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>& |
3016 | | mget_stats) { |
3017 | | FilePickerMultiGet& fp = *batch; |
3018 | | MultiGetRange range = fp.GetRange(); |
3019 | | // Initialize a new empty range. Any keys that are not in this level will |
3020 | | // eventually become part of the new range. |
3021 | | MultiGetRange leftover(range, range.begin(), range.begin()); |
3022 | | FdWithKeyRange* f = nullptr; |
3023 | | Status s; |
3024 | | |
3025 | | f = fp.GetNextFileInLevel(); |
3026 | | while (!f) { |
3027 | | fp.PrepareNextLevelForSearch(); |
3028 | | if (!fp.IsSearchEnded()) { |
3029 | | f = fp.GetNextFileInLevel(); |
3030 | | } else { |
3031 | | break; |
3032 | | } |
3033 | | } |
3034 | | while (f) { |
3035 | | MultiGetRange file_range = fp.CurrentFileRange(); |
3036 | | TableCache::TypedHandle* table_handle = nullptr; |
3037 | | bool skip_filters = IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()), |
3038 | | fp.IsHitFileLastInLevel()); |
3039 | | bool skip_range_deletions = false; |
3040 | | if (!skip_filters) { |
3041 | | Status status = table_cache_->MultiGetFilter( |
3042 | | read_options, *internal_comparator(), *f->file_metadata, |
3043 | | mutable_cf_options_, |
3044 | | cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), |
3045 | | fp.GetHitFileLevel(), &file_range, &table_handle); |
3046 | | if (status.ok()) { |
3047 | | skip_filters = true; |
3048 | | skip_range_deletions = true; |
3049 | | } else if (!status.IsNotSupported()) { |
3050 | | s = status; |
3051 | | } |
3052 | | } |
3053 | | if (!s.ok()) { |
3054 | | break; |
3055 | | } |
3056 | | // At this point, file_range contains any keys that are likely in this |
3057 | | // file. It may have false positives, but that's ok since higher level |
3058 | | // lookups for the key are dependent on this lookup anyway. |
3059 | | // Add the complement of file_range to leftover. That's the set of keys |
3060 | | // definitely not in this level. |
3061 | | // Subtract the complement of file_range from range, since they will be |
3062 | | // processed in a separate batch in parallel. |
3063 | | leftover += ~file_range; |
3064 | | range -= ~file_range; |
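 | | // Illustrative example of the set algebra above (hypothetical keys):
 | | // if range == {k1, k2, k3, k4} and the filter keeps file_range ==
 | | // {k2, k3}, then leftover gains {k1, k4} (definitely not in this
 | | // level) and range shrinks to {k2, k3} for further processing here.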
3065 | | if (!file_range.empty()) { |
3066 | | int level = fp.GetHitFileLevel(); |
3067 | | auto stat = mget_stats.find(level); |
3068 | | if (stat == mget_stats.end()) { |
3069 | | auto entry = mget_stats.insert({level, {0, 0, 0}}); |
3070 | | assert(entry.second); |
3071 | | stat = entry.first; |
3072 | | } |
3073 | | |
3074 | | if (waiting.empty() && to_process.empty() && |
3075 | | !fp.RemainingOverlapInLevel() && leftover.empty() && |
3076 | | mget_tasks.empty()) { |
3077 | | // All keys are in one SST file, so take the fast path |
3078 | | s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(), |
3079 | | skip_filters, skip_range_deletions, f, *blob_ctxs, |
3080 | | table_handle, std::get<0>(stat->second), |
3081 | | std::get<1>(stat->second), |
3082 | | std::get<2>(stat->second)); |
3083 | | } else { |
3084 | | mget_tasks.emplace_back(MultiGetFromSSTCoroutine( |
3085 | | read_options, file_range, fp.GetHitFileLevel(), skip_filters, |
3086 | | skip_range_deletions, f, *blob_ctxs, table_handle, |
3087 | | std::get<0>(stat->second), std::get<1>(stat->second), |
3088 | | std::get<2>(stat->second))); |
3089 | | ++num_tasks_queued; |
3090 | | } |
3091 | | } |
3092 | | if (fp.KeyMaySpanNextFile() && !file_range.empty()) { |
3093 | | break; |
3094 | | } |
3095 | | f = fp.GetNextFileInLevel(); |
3096 | | } |
3097 | | // Split the current batch only if some keys are likely in this level and |
3098 | | // some are not. Only split if we're done with this level, i.e., f is null.
3099 | | // Otherwise, it means there are more files in this level to look at. |
3100 | | if (s.ok() && !f && !leftover.empty() && !range.empty()) { |
3101 | | fp.ReplaceRange(range); |
3102 | | batches.emplace_back(&leftover, fp); |
3103 | | to_process.emplace_back(batches.size() - 1); |
3104 | | } |
3105 | | // 1. If f is non-null, that means we might not be done with this level. |
3106 | | // This can happen if one of the keys is the last key in the file, i.e.,
3107 | | // fp.KeyMaySpanNextFile() is true. |
3108 | | // 2. If range is empty, then we're done with this range and no need to |
3109 | | // prepare the next level |
3110 | | // 3. If some tasks were queued for this range, then the next level will be |
3111 | | // prepared after executing those tasks |
3112 | | if (!f && !range.empty() && !num_tasks_queued) { |
3113 | | fp.PrepareNextLevelForSearch(); |
3114 | | } |
3115 | | return s; |
3116 | | } |
3117 | | |
3118 | | Status Version::MultiGetAsync( |
3119 | | const ReadOptions& options, MultiGetRange* range, |
3120 | | std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) { |
3121 | | autovector<FilePickerMultiGet, 4> batches; |
3122 | | std::deque<size_t> waiting; |
3123 | | std::deque<size_t> to_process; |
3124 | | Status s; |
3125 | | std::vector<folly::coro::Task<Status>> mget_tasks; |
3126 | | std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats; |
3127 | | |
3128 | | // Create the initial batch with the input range |
3129 | | batches.emplace_back(range, &storage_info_.level_files_brief_, |
3130 | | storage_info_.num_non_empty_levels_, |
3131 | | &storage_info_.file_indexer_, user_comparator(), |
3132 | | internal_comparator()); |
3133 | | to_process.emplace_back(0); |
3134 | | |
3135 | | while (!to_process.empty()) { |
3136 | | // As we process a batch, it may get split into two. So reserve space for |
3137 | | // an additional batch in the autovector in order to prevent later moves |
3138 | | // of elements in ProcessBatch(). |
3139 | | batches.reserve(batches.size() + 1); |
3140 | | |
3141 | | size_t idx = to_process.front(); |
3142 | | FilePickerMultiGet* batch = &batches.at(idx); |
3143 | | unsigned int num_tasks_queued = 0; |
3144 | | to_process.pop_front(); |
3145 | | if (batch->IsSearchEnded() || batch->GetRange().empty()) { |
3146 | | // If to_process is empty, i.e., no more batches to look at, then we
3147 | | // need to schedule the enqueued coroutines and wait for them. Otherwise, we
3148 | | // skip this batch and move to the next one in to_process. |
3149 | | if (!to_process.empty()) { |
3150 | | continue; |
3151 | | } |
3152 | | } else { |
3153 | | // Look through one level. This may split the batch and enqueue it to |
3154 | | // to_process |
3155 | | s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting, |
3156 | | to_process, num_tasks_queued, mget_stats); |
3157 | | // If ProcessBatch didn't enqueue any coroutine tasks, it means all |
3158 | | // keys were filtered out. So put the batch back in to_process to |
3159 | | // look up in the next level
3160 | | if (!num_tasks_queued && !batch->IsSearchEnded()) { |
3161 | | // Put this back in the processing queue |
3162 | | to_process.emplace_back(idx); |
3163 | | } else if (num_tasks_queued) { |
3164 | | waiting.emplace_back(idx); |
3165 | | } |
3166 | | } |
3167 | | // If ProcessBatch() returned an error, then schedule the enqueued |
3168 | | // coroutines and wait for them, then abort the MultiGet. |
3169 | | if (to_process.empty() || !s.ok()) { |
3170 | | if (mget_tasks.size() > 0) { |
3171 | | assert(waiting.size()); |
3172 | | RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size()); |
3173 | | // Collect all results so far |
3174 | | std::vector<Status> statuses = |
3175 | | folly::coro::blockingWait(co_withExecutor( |
3176 | | &range->context()->executor(), |
3177 | | folly::coro::collectAllRange(std::move(mget_tasks)))); |
3178 | | mget_tasks.clear(); |
3179 | | if (s.ok()) { |
3180 | | for (Status stat : statuses) { |
3181 | | if (!stat.ok()) { |
3182 | | s = std::move(stat); |
3183 | | break; |
3184 | | } |
3185 | | } |
3186 | | } |
3187 | | |
3188 | | if (!s.ok()) { |
3189 | | break; |
3190 | | } |
3191 | | |
3192 | | for (size_t wait_idx : waiting) { |
3193 | | FilePickerMultiGet& fp = batches.at(wait_idx); |
3194 | | // 1. If fp.GetHitFile() is non-null, then there could be more |
3195 | | // overlap in this level. So skip preparing next level. |
3196 | | // 2. If fp.GetRange() is empty, then this batch is completed |
3197 | | // and no need to prepare the next level. |
3198 | | if (!fp.GetHitFile() && !fp.GetRange().empty()) { |
3199 | | fp.PrepareNextLevelForSearch(); |
3200 | | } |
3201 | | } |
3202 | | to_process.swap(waiting); |
3203 | | } else { |
3204 | | assert(!s.ok() || waiting.size() == 0); |
3205 | | } |
3206 | | } |
3207 | | if (!s.ok()) { |
3208 | | break; |
3209 | | } |
3210 | | } |
3211 | | |
3212 | | uint64_t num_levels = 0; |
3213 | | for (auto& stat : mget_stats) { |
3214 | | if (stat.first == 0) { |
3215 | | num_levels += std::get<2>(stat.second); |
3216 | | } else { |
3217 | | num_levels++; |
3218 | | } |
3219 | | |
3220 | | uint64_t num_meta_reads = |
3221 | | std::get<0>(stat.second) + std::get<1>(stat.second); |
3222 | | uint64_t num_sst_reads = std::get<2>(stat.second); |
3223 | | if (num_meta_reads > 0) { |
3224 | | RecordInHistogram(db_statistics_, |
3225 | | NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, |
3226 | | num_meta_reads); |
3227 | | } |
3228 | | if (num_sst_reads > 0) { |
3229 | | RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_reads); |
3230 | | } |
3231 | | } |
3232 | | if (num_levels > 0) { |
3233 | | RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, num_levels); |
3234 | | } |
3235 | | |
3236 | | return s; |
3237 | | } |
3238 | | #endif  // USE_COROUTINES
3239 | | |
3240 | 6.95k | bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) { |
3241 | | // Reaching the bottom level implies misses at all upper levels, so we'll |
3242 | | // skip checking the filters when we predict a hit. |
3243 | 6.95k | return cfd_->ioptions().optimize_filters_for_hits && |
3244 | 0 | (level > 0 || is_file_last_in_level) && |
3245 | 0 | level == storage_info_.num_non_empty_levels() - 1; |
3246 | 6.95k | } |
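 | | // Example of the predicate above (illustrative): with
 | | // optimize_filters_for_hits == true and num_non_empty_levels() == 5,
 | | // a lookup reaching level 4 (the last populated level) skips the
 | | // filter, since a miss there must be a true miss anyway; lookups at
 | | // levels 0..3 still consult their filters.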
3247 | | |
3248 | 233k | void VersionStorageInfo::GenerateLevelFilesBrief() { |
3249 | 233k | level_files_brief_.resize(num_non_empty_levels_); |
3250 | 474k | for (int level = 0; level < num_non_empty_levels_; level++) { |
3251 | 240k | DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level], |
3252 | 240k | &arena_); |
3253 | 240k | } |
3254 | 233k | } |
3255 | | |
3256 | | void VersionStorageInfo::PrepareForVersionAppend( |
3257 | | const ImmutableOptions& immutable_options, |
3258 | 233k | const MutableCFOptions& mutable_cf_options) { |
3259 | 233k | ComputeCompensatedSizes(); |
3260 | 233k | UpdateNumNonEmptyLevels(); |
3261 | 233k | CalculateBaseBytes(immutable_options, mutable_cf_options); |
3262 | 233k | UpdateFilesByCompactionPri(immutable_options, mutable_cf_options); |
3263 | 233k | GenerateFileIndexer(); |
3264 | 233k | GenerateLevelFilesBrief(); |
3265 | 233k | GenerateLevel0NonOverlapping(); |
3266 | 233k | GenerateBottommostFiles(); |
3267 | 233k | GenerateFileLocationIndex(); |
3268 | 233k | } |
3269 | | |
3270 | | void Version::PrepareAppend(const ReadOptions& read_options, |
3271 | 233k | bool update_stats) { |
3272 | 233k | TEST_SYNC_POINT_CALLBACK( |
3273 | 233k | "Version::PrepareAppend:forced_check", |
3274 | 233k | static_cast<void*>(&storage_info_.force_consistency_checks_)); |
3275 | | |
3276 | 233k | if (update_stats) { |
3277 | 138k | UpdateAccumulatedStats(read_options); |
3278 | 138k | } |
3279 | | |
3280 | 233k | storage_info_.PrepareForVersionAppend(cfd_->ioptions(), mutable_cf_options_); |
3281 | 233k | } |
3282 | | |
3283 | | bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options, |
3284 | 341k | FileMetaData* file_meta) { |
3285 | 341k | if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) { |
3286 | 238k | return false; |
3287 | 238k | } |
3288 | 102k | std::shared_ptr<const TableProperties> tp; |
3289 | 102k | Status s = GetTableProperties(read_options, &tp, file_meta); |
3290 | 102k | file_meta->init_stats_from_file = true; |
3291 | 102k | if (!s.ok()) { |
3292 | 0 | ROCKS_LOG_ERROR(vset_->db_options_->info_log, |
3293 | 0 | "Unable to load table properties for file %" PRIu64 |
3294 | 0 | " --- %s\n", |
3295 | 0 | file_meta->fd.GetNumber(), s.ToString().c_str()); |
3296 | 0 | return false; |
3297 | 0 | } |
3298 | 102k | if (tp.get() == nullptr) { |
3299 | 0 | return false; |
3300 | 0 | } |
3301 | 102k | file_meta->num_entries = tp->num_entries; |
3302 | 102k | file_meta->num_deletions = tp->num_deletions; |
3303 | 102k | file_meta->raw_value_size = tp->raw_value_size; |
3304 | 102k | file_meta->raw_key_size = tp->raw_key_size; |
3305 | 102k | file_meta->num_range_deletions = tp->num_range_deletions; |
3306 | | // Ensure new invariants on old files |
3307 | 102k | file_meta->num_deletions = |
3308 | 102k | std::max(tp->num_deletions, tp->num_range_deletions); |
3309 | 102k | file_meta->num_entries = std::max(tp->num_entries, tp->num_deletions); |
3310 | 102k | return true; |
3311 | 102k | } |
3312 | | |
3313 | 102k | void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) { |
3314 | 102k | TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats", |
3315 | 102k | nullptr); |
3316 | | |
3317 | 102k | assert(file_meta->init_stats_from_file); |
3318 | 102k | accumulated_file_size_ += file_meta->fd.GetFileSize(); |
3319 | 102k | accumulated_raw_key_size_ += file_meta->raw_key_size; |
3320 | 102k | accumulated_raw_value_size_ += file_meta->raw_value_size; |
3321 | 102k | assert(file_meta->num_entries >= file_meta->num_deletions); |
3322 | 102k | accumulated_num_non_deletions_ += |
3323 | 102k | file_meta->num_entries - file_meta->num_deletions; |
3324 | 102k | accumulated_num_deletions_ += file_meta->num_deletions; |
3325 | | |
3326 | 102k | current_num_non_deletions_ += |
3327 | 102k | file_meta->num_entries - file_meta->num_deletions; |
3328 | 102k | current_num_deletions_ += file_meta->num_deletions; |
3329 | 102k | current_num_samples_++; |
3330 | 102k | } |
3331 | | |
3332 | 12.3k | void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) { |
3333 | 12.3k | if (file_meta->init_stats_from_file) { |
3334 | 12.3k | current_num_non_deletions_ -= |
3335 | 12.3k | file_meta->num_entries - file_meta->num_deletions; |
3336 | 12.3k | current_num_deletions_ -= file_meta->num_deletions; |
3337 | 12.3k | current_num_samples_--; |
3338 | 12.3k | } |
3339 | 12.3k | } |
3340 | | |
3341 | 138k | void Version::UpdateAccumulatedStats(const ReadOptions& read_options) { |
3342 | | // maximum number of table properties loaded from files. |
3343 | 138k | const int kMaxInitCount = 20; |
3344 | 138k | int init_count = 0; |
3345 | | // Here only the first kMaxInitCount files which haven't been
3346 | | // initialized from file will be updated with num_deletions.
3347 | | // The motivation here is to cap the maximum I/O per Version creation.
3348 | | // The reason for choosing files from lower levels instead of higher
3349 | | // levels is that this design propagates the initialization from
3350 | | // lower levels to higher levels: when the num_deletions of lower-level
3351 | | // files are updated, the lower-level files get accurate
3352 | | // compensated_file_size, so lower-level to higher-level compactions
3353 | | // will be triggered, which creates higher-level files whose
3354 | | // num_deletions will be updated here.
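 | | // For instance (illustrative), a freshly opened DB with 200
 | | // uninitialized files loads table properties for at most
 | | // kMaxInitCount == 20 of them per Version when the table cache is
 | | // bounded; the remainder are picked up by later Version creations.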
3355 | 138k | for (int level = 0; |
3356 | 1.10M | level < storage_info_.num_levels_ && init_count < kMaxInitCount; |
3357 | 966k | ++level) { |
3358 | 966k | for (auto* file_meta : storage_info_.files_[level]) { |
3359 | 190k | if (MaybeInitializeFileMetaData(read_options, file_meta)) { |
3360 | | // each FileMeta will be initialized only once. |
3361 | 102k | storage_info_.UpdateAccumulatedStats(file_meta); |
3362 | | // when option "max_open_files" is -1, all the file metadata has |
3363 | | // already been read, so MaybeInitializeFileMetaData() won't incur |
3364 | | // any I/O cost. "max_open_files=-1" means that the table cache passed |
3365 | | // to the VersionSet and then to the ColumnFamilySet has a size of |
3366 | | // TableCache::kInfiniteCapacity |
3367 | 102k | if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() == |
3368 | 102k | TableCache::kInfiniteCapacity) { |
3369 | 102k | continue; |
3370 | 102k | } |
3371 | 0 | if (++init_count >= kMaxInitCount) { |
3372 | 0 | break; |
3373 | 0 | } |
3374 | 0 | } |
3375 | 190k | } |
3376 | 966k | } |
3377 | | // In case all sampled-files contain only deletion entries, then we |
3378 | | // load the table-property of a file in higher-level to initialize |
3379 | | // that value. |
3380 | 138k | for (int level = storage_info_.num_levels_ - 1; |
3381 | 1.01M | storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { |
3382 | 875k | for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1; |
3383 | 1.02M | storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { |
3384 | 150k | if (MaybeInitializeFileMetaData(read_options, |
3385 | 150k | storage_info_.files_[level][i])) { |
3386 | 0 | storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]); |
3387 | 0 | } |
3388 | 150k | } |
3389 | 875k | } |
3390 | 138k | } |
3391 | | |
3392 | 233k | void VersionStorageInfo::ComputeCompensatedSizes() { |
3393 | 233k | static const int kDeletionWeightOnCompaction = 2; |
3394 | 233k | uint64_t average_value_size = GetAverageValueSize(); |
3395 | | |
3396 | | // compute the compensated size |
3397 | 1.86M | for (int level = 0; level < num_levels_; level++) { |
3398 | 1.63M | for (auto* file_meta : files_[level]) { |
3399 | | // Here we only compute compensated_file_size for those file_meta |
3400 | | // whose compensated_file_size is uninitialized (== 0). This is true only
3401 | | // for files that have been created right now and no other thread has |
3402 | | // access to them. That's why we can safely mutate compensated_file_size. |
3403 | 190k | if (file_meta->compensated_file_size == 0) { |
3404 | 102k | file_meta->compensated_file_size = file_meta->fd.GetFileSize(); |
3405 | | // Here we boost the size of deletion entries of a file only
3406 | | // when the number of deletion entries is greater than the number of |
3407 | | // non-deletion entries in the file. The motivation here is that in |
3408 | | // a stable workload, the number of deletion entries should be roughly |
3409 | | // equal to the number of non-deletion entries. If we compensate the |
3410 | | // size of deletion entries in a stable workload, the deletion |
3411 | | // compensation logic might introduce unwanted effects that change the
3412 | | // shape of the LSM tree.
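 | | // Worked example (illustrative numbers): a file with num_entries ==
 | | // 100, num_deletions == 80, no range deletions, and an average value
 | | // size of 32 bytes gets (80 * 2 - 100) * 32 * 2 == 3840 extra bytes
 | | // of compensated_file_size, nudging it toward compaction.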
3413 | 102k | if ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 >= |
3414 | 102k | file_meta->num_entries) { |
3415 | 47.8k | file_meta->compensated_file_size += |
3416 | 47.8k | ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 - |
3417 | 47.8k | file_meta->num_entries) * |
3418 | 47.8k | average_value_size * kDeletionWeightOnCompaction; |
3419 | 47.8k | } |
3420 | 102k | file_meta->compensated_file_size += |
3421 | 102k | file_meta->compensated_range_deletion_size; |
3422 | 102k | } |
3423 | 190k | } |
3424 | 1.63M | } |
3425 | 233k | } |
3426 | | |
3427 | 2.56M | int VersionStorageInfo::MaxInputLevel() const { |
3428 | 2.56M | if (compaction_style_ == kCompactionStyleLevel) { |
3429 | 2.56M | return num_levels() - 2; |
3430 | 2.56M | } |
3431 | 0 | return 0; |
3432 | 2.56M | } |
3433 | | |
3434 | 242k | int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const { |
3435 | 242k | if (allow_ingest_behind) { |
3436 | 0 | assert(num_levels() > 1); |
3437 | 0 | return num_levels() - 2; |
3438 | 0 | } |
3439 | 242k | return num_levels() - 1; |
3440 | 242k | } |
3441 | | |
3442 | | void VersionStorageInfo::EstimateCompactionBytesNeeded( |
3443 | 240k | const MutableCFOptions& mutable_cf_options) { |
3444 | | // Only implemented for level-based compaction |
3445 | 240k | if (compaction_style_ != kCompactionStyleLevel) { |
3446 | 0 | estimated_compaction_needed_bytes_ = 0; |
3447 | 0 | return; |
3448 | 0 | } |
3449 | | |
3450 | | // Start from level 0: if level 0 qualifies for compaction to level 1,
3451 | | // we estimate the size of that compaction.
3452 | | // Then we move on to the next level and see whether it qualifies for
3453 | | // compaction to the level after it. The size of a level is estimated
3454 | | // as the actual size on the level plus the input bytes from the
3455 | | // previous level, if there are any. If it exceeds the target, take the
3456 | | // excess bytes as compaction input and add the compaction size to the
3457 | | // total size. We keep doing this for levels 2, 3, etc., until the last
3458 | | // level, and return the accumulated bytes.
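 | | // Worked example (illustrative numbers, in MB): L0 holds 64 and
 | | // triggers, so the estimate starts at 64 and 64 flows down. If the
 | | // base level holds 512 against a 256 target, the estimate gains 512;
 | | // the level then carries 512 + 64 = 576 and spills 576 - 256 = 320.
 | | // With 2560 in the next level, the estimate gains another
 | | // 320 * (2560 / 576 + 1), about 1742, for roughly 2318 in total.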
3459 | | |
3460 | 240k | uint64_t bytes_compact_to_next_level = 0; |
3461 | 240k | uint64_t level_size = 0; |
3462 | 240k | for (auto* f : files_[0]) { |
3463 | 137k | level_size += f->fd.GetFileSize(); |
3464 | 137k | } |
3465 | | // Level 0 |
3466 | 240k | bool level0_compact_triggered = false; |
3467 | 240k | if (static_cast<int>(files_[0].size()) >= |
3468 | 240k | mutable_cf_options.level0_file_num_compaction_trigger || |
3469 | 233k | level_size >= mutable_cf_options.max_bytes_for_level_base) { |
3470 | 6.90k | level0_compact_triggered = true; |
3471 | 6.90k | estimated_compaction_needed_bytes_ = level_size; |
3472 | 6.90k | bytes_compact_to_next_level = level_size; |
3473 | 233k | } else { |
3474 | 233k | estimated_compaction_needed_bytes_ = 0; |
3475 | 233k | } |
3476 | | |
3477 | | // Level 1 and up. |
3478 | 240k | uint64_t bytes_next_level = 0; |
3479 | 240k | for (int level = base_level(); level <= MaxInputLevel(); level++) { |
3480 | 0 | level_size = 0; |
3481 | 0 | if (bytes_next_level > 0) { |
3482 | | #ifndef NDEBUG |
3483 | | uint64_t level_size2 = 0; |
3484 | | for (auto* f : files_[level]) { |
3485 | | level_size2 += f->fd.GetFileSize(); |
3486 | | } |
3487 | | assert(level_size2 == bytes_next_level); |
3488 | | #endif |
3489 | 0 | level_size = bytes_next_level; |
3490 | 0 | bytes_next_level = 0; |
3491 | 0 | } else { |
3492 | 0 | for (auto* f : files_[level]) { |
3493 | 0 | level_size += f->fd.GetFileSize(); |
3494 | 0 | } |
3495 | 0 | } |
3496 | 0 | if (level == base_level() && level0_compact_triggered) { |
3497 | | // Add base level size to compaction if level0 compaction triggered. |
3498 | 0 | estimated_compaction_needed_bytes_ += level_size; |
3499 | 0 | } |
3500 | | // Add size added by previous compaction |
3501 | 0 | level_size += bytes_compact_to_next_level; |
3502 | 0 | bytes_compact_to_next_level = 0; |
3503 | 0 | uint64_t level_target = MaxBytesForLevel(level); |
3504 | 0 | if (level_size > level_target) { |
3505 | 0 | bytes_compact_to_next_level = level_size - level_target; |
3506 | | // Estimate the actual compaction fan-out ratio as size ratio between |
3507 | | // the two levels. |
3508 | |
3509 | 0 | assert(bytes_next_level == 0); |
3510 | 0 | if (level + 1 < num_levels_) { |
3511 | 0 | for (auto* f : files_[level + 1]) { |
3512 | 0 | bytes_next_level += f->fd.GetFileSize(); |
3513 | 0 | } |
3514 | 0 | } |
3515 | 0 | if (bytes_next_level > 0) { |
3516 | 0 | assert(level_size > 0); |
3517 | 0 | estimated_compaction_needed_bytes_ += static_cast<uint64_t>( |
3518 | 0 | static_cast<double>(bytes_compact_to_next_level) * |
3519 | 0 | (static_cast<double>(bytes_next_level) / |
3520 | 0 | static_cast<double>(level_size) + |
3521 | 0 | 1)); |
3522 | 0 | } |
3523 | 0 | } |
3524 | 0 | } |
3525 | 240k | } |
3526 | | |
3527 | | namespace { |
3528 | | uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions, |
3529 | | const MutableCFOptions& mutable_cf_options, |
3530 | 0 | const std::vector<FileMetaData*>& files) { |
3531 | 0 | uint32_t ttl_expired_files_count = 0; |
3532 | |
3533 | 0 | int64_t _current_time; |
3534 | 0 | auto status = ioptions.clock->GetCurrentTime(&_current_time); |
3535 | 0 | if (status.ok()) { |
3536 | 0 | const uint64_t current_time = static_cast<uint64_t>(_current_time); |
3537 | 0 | for (FileMetaData* f : files) { |
3538 | 0 | if (!f->being_compacted) { |
3539 | 0 | uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); |
3540 | 0 | if (oldest_ancester_time != 0 && |
3541 | 0 | oldest_ancester_time < (current_time - mutable_cf_options.ttl)) { |
3542 | 0 | ttl_expired_files_count++; |
3543 | 0 | } |
3544 | 0 | } |
3545 | 0 | } |
3546 | 0 | } |
3547 | 0 | return ttl_expired_files_count; |
3548 | 0 | } |
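 | | // Illustrative check: with mutable_cf_options.ttl == 86400 (one day)
 | | // and a file whose TryGetOldestAncesterTime() is more than a day
 | | // behind the clock, the file counts as expired unless it is already
 | | // being compacted.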
3549 | | |
3550 | | bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions, |
3551 | | const MutableCFOptions& mutable_cf_options, |
3552 | 0 | const std::vector<FileMetaData*>& files) { |
3553 | 0 | const std::vector<FileTemperatureAge>& ages = |
3554 | 0 | mutable_cf_options.compaction_options_fifo |
3555 | 0 | .file_temperature_age_thresholds; |
3556 | 0 | if (ages.empty()) { |
3557 | 0 | return false; |
3558 | 0 | } |
3559 | 0 | if (files.empty()) { |
3560 | 0 | return false; |
3561 | 0 | } |
3562 | 0 | int64_t _current_time; |
3563 | 0 | auto status = ioptions.clock->GetCurrentTime(&_current_time); |
3564 | 0 | const uint64_t current_time = static_cast<uint64_t>(_current_time); |
3565 | | // This is the same logic used in |
3566 | | // FIFOCompactionPicker::PickTemperatureChangeCompaction(). |
3567 | 0 | if (status.ok() && current_time >= ages[0].age) { |
3568 | 0 | uint64_t create_time_threshold = current_time - ages[0].age; |
3569 | 0 | Temperature target_temp; |
3570 | 0 | assert(files.size() >= 1); |
3571 | 0 | for (size_t index = files.size(); index >= 1; --index) { |
3572 | 0 | FileMetaData* cur_file = files[index - 1]; |
3573 | 0 | FileMetaData* prev_file = index < 2 ? nullptr : files[index - 2]; |
3574 | 0 | if (!cur_file->being_compacted) { |
3575 | 0 | uint64_t est_newest_key_time = cur_file->TryGetNewestKeyTime(prev_file); |
3576 | | // A newer file could have newest_key_time populated
3577 | 0 | if (est_newest_key_time == kUnknownNewestKeyTime) { |
3578 | 0 | continue; |
3579 | 0 | } |
3580 | 0 | if (est_newest_key_time > create_time_threshold) { |
3581 | 0 | return false; |
3582 | 0 | } |
3583 | 0 | target_temp = ages[0].temperature; |
3584 | 0 | for (size_t i = 1; i < ages.size(); ++i) { |
3585 | 0 | if (current_time >= ages[i].age && |
3586 | 0 | est_newest_key_time <= current_time - ages[i].age) { |
3587 | 0 | target_temp = ages[i].temperature; |
3588 | 0 | } |
3589 | 0 | } |
3590 | 0 | if (cur_file->temperature != target_temp) { |
3591 | 0 | return true; |
3592 | 0 | } |
3593 | 0 | } |
3594 | 0 | } |
3595 | 0 | } |
3596 | 0 | return false; |
3597 | 0 | } |
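 | | // Illustrative thresholds (hypothetical config): with
 | | // file_temperature_age_thresholds == [{kWarm, 1 hour}, {kCold, 1 day}],
 | | // a file whose estimated newest key time is two hours old targets
 | | // kWarm and a week-old file targets kCold; any mismatch with the
 | | // file's current temperature returns true.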
3598 | | } // anonymous namespace |
3599 | | |
3600 | | void VersionStorageInfo::ComputeCompactionScore( |
3601 | | const ImmutableOptions& immutable_options, |
3602 | 240k | const MutableCFOptions& mutable_cf_options) { |
3603 | 240k | double total_downcompact_bytes = 0.0; |
3604 | | // Historically, score is defined as actual bytes in a level divided by |
3605 | | // the level's target size, and 1.0 is the threshold for triggering |
3606 | | // compaction. Higher score means higher prioritization. |
3607 | | // Now we keep the compaction triggering condition, but consider more |
3608 | | // factors for prioritization, while still keeping the 1.0 threshold. |
3609 | | // In order to provide flexibility for reducing score while still |
3610 | | // maintaining it to be over 1.0, we scale the original score by 10x |
3611 | | // if it is larger than 1.0. |
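 | | // Worked example (illustrative): with
 | | // level0_file_num_compaction_trigger == 4 and 6 sorted runs not under
 | | // compaction, the raw L0 score is 6 / 4 == 1.5; being above 1.0, it
 | | // is scaled by kScoreScale to 15.0 under dynamic level bytes.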
3612 | 240k | const double kScoreScale = 10.0; |
3613 | 240k | int max_output_level = |
3614 | 240k | MaxOutputLevel(immutable_options.cf_allow_ingest_behind || |
3615 | 240k | immutable_options.allow_ingest_behind); |
3616 | 1.68M | for (int level = 0; level <= MaxInputLevel(); level++) { |
3617 | 1.44M | double score; |
3618 | 1.44M | if (level == 0) { |
3619 | | // We treat level-0 specially by bounding the number of files |
3620 | | // instead of number of bytes for two reasons: |
3621 | | // |
3622 | | // (1) With larger write-buffer sizes, it is nice not to do too |
3623 | | // many level-0 compactions. |
3624 | | // |
3625 | | // (2) The files in level-0 are merged on every read and |
3626 | | // therefore we wish to avoid too many files when the individual |
3627 | | // file size is small (perhaps because of a small write-buffer |
3628 | | // setting, or very high compression ratios, or lots of |
3629 | | // overwrites/deletions). |
3630 | 240k | int num_sorted_runs = 0; |
3631 | 240k | uint64_t total_size = 0; |
3632 | 240k | for (auto* f : files_[level]) { |
3633 | 137k | total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize()); |
3634 | 137k | if (!f->being_compacted) { |
3635 | 122k | total_size += f->compensated_file_size; |
3636 | 122k | num_sorted_runs++; |
3637 | 122k | } |
3638 | 137k | } |
3639 | 240k | if (compaction_style_ == kCompactionStyleUniversal) { |
3640 | | // For universal compaction, we use level0 score to indicate |
3641 | | // compaction score for the whole DB. Adding other levels as if |
3642 | | // they are L0 files. |
3643 | 0 | for (int i = 1; i <= max_output_level; i++) { |
3644 | | // It's possible that a subset of the files in a level may be in a |
3645 | | // compaction, due to delete triggered compaction or trivial move. |
3646 | | // In that case, the below check may not catch a level being |
3647 | | // compacted as it only checks the first file. The worst that can |
3648 | | // happen is a scheduled compaction thread will find nothing to do. |
3649 | 0 | if (!files_[i].empty() && !files_[i][0]->being_compacted) { |
3650 | 0 | num_sorted_runs++; |
3651 | 0 | } |
3652 | 0 | } |
3653 | 0 | } |
3654 | | |
3655 | 240k | if (compaction_style_ == kCompactionStyleFIFO) { |
3656 | 0 | auto max_table_files_size = |
3657 | 0 | mutable_cf_options.compaction_options_fifo.max_table_files_size; |
3658 | 0 | if (max_table_files_size == 0) { |
3659 | | // avoid dividing by 0
3660 | 0 | max_table_files_size = 1; |
3661 | 0 | } |
3662 | 0 | score = static_cast<double>(total_size) / max_table_files_size; |
3663 | 0 | if (score < 1 && |
3664 | 0 | mutable_cf_options.compaction_options_fifo.allow_compaction) { |
3665 | 0 | score = std::max( |
3666 | 0 | static_cast<double>(num_sorted_runs) / |
3667 | 0 | mutable_cf_options.level0_file_num_compaction_trigger, |
3668 | 0 | score); |
3669 | 0 | } |
3670 | 0 | if (score < 1 && mutable_cf_options.ttl > 0) { |
3671 | 0 | score = |
3672 | 0 | std::max(static_cast<double>(GetExpiredTtlFilesCount( |
3673 | 0 | immutable_options, mutable_cf_options, files_[0])), |
3674 | 0 | score); |
3675 | 0 | } |
3676 | 0 | if (score < 1 && |
3677 | 0 | ShouldChangeFileTemperature(immutable_options, mutable_cf_options, |
3678 | 0 | files_[0])) { |
3679 | | // For FIFO, just need a large enough score to trigger compaction. |
3680 | 0 | const double kScoreForNeedCompaction = 1.1; |
3681 | 0 | score = kScoreForNeedCompaction; |
3682 | 0 | } |
3683 | 240k | } else { |
3684 | | // For universal compaction, if a user configures `max_read_amp`, then |
3685 | | // the score may be a false positive signal. |
3686 | | // `level0_file_num_compaction_trigger` is used as a trigger to check |
3687 | | // if there is any compaction work to do. |
3688 | 240k | score = static_cast<double>(num_sorted_runs) / |
3689 | 240k | mutable_cf_options.level0_file_num_compaction_trigger; |
3690 | 240k | if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { |
3691 | | // Level-based involves L0->L0 compactions that can lead to oversized |
3692 | | // L0 files. Take into account size as well to avoid later giant |
3693 | | // compactions to the base level. |
3694 | | // If the L0 score is always too high, L0->LBase will always be
3695 | | // prioritized over LBase->LBase+1 compaction and LBase will grow
3696 | | // too large. But if the L0 score isn't high enough, L0 will
3697 | | // accumulate and data is not moved to LBase fast enough. The score |
3698 | | // calculation below takes into account L0 size vs LBase size. |
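 | | // Illustrative case: if L0 holds 600 MB while LBase holds 400 MB
 | | // against a 256 MB target, the size-based term below floors the score
 | | // at 600 / max(400, 256) == 1.5, and after the 10x scaling L0->LBase
 | | // outranks LBase->LBase+1.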
3699 | 240k | if (immutable_options.level_compaction_dynamic_level_bytes) { |
3700 | 240k | if (total_size >= mutable_cf_options.max_bytes_for_level_base) { |
3701 | | // When calculating estimated_compaction_needed_bytes, we assume |
3702 | | // L0 is qualified as pending compactions. We will need to make |
3703 | | // sure that it qualifies for compaction. |
3704 | | // It might be guaranteed by logic below anyway, but we are |
3705 | | // explicit here to make sure we don't stop writes with no |
3706 | | // compaction scheduled. |
3707 | 0 | score = std::max(score, 1.01); |
3708 | 0 | } |
3709 | 240k | if (total_size > level_max_bytes_[base_level_]) { |
3710 | | // In this case, we compare L0 size with actual LBase size and |
3711 | | // make sure score is more than 1.0 (10.0 after scaled) if L0 is |
3712 | | // larger than LBase. Since LBase score = LBase size / |
3713 | | // (target size + total_downcompact_bytes) where |
3714 | | // total_downcompact_bytes = total_size > LBase size, |
3715 | | // LBase score is lower than 10.0. So L0->LBase is prioritized |
3716 | | // over LBase -> LBase+1. |
3717 | 0 | uint64_t base_level_size = 0; |
3718 | 0 | for (auto f : files_[base_level_]) { |
3719 | 0 | base_level_size += f->compensated_file_size; |
3720 | 0 | } |
3721 | 0 | score = std::max(score, static_cast<double>(total_size) / |
3722 | 0 | static_cast<double>(std::max( |
3723 | 0 | base_level_size, |
3724 | 0 | level_max_bytes_[base_level_]))); |
3725 | 0 | } |
3726 | 240k | if (score > 1.0) { |
3727 | 1.02k | score *= kScoreScale; |
3728 | 1.02k | } |
3729 | 240k | } else { |
3730 | 0 | score = std::max(score, |
3731 | 0 | static_cast<double>(total_size) / |
3732 | 0 | mutable_cf_options.max_bytes_for_level_base); |
3733 | 0 | } |
3734 | 240k | } |
3735 | 240k | } |
3736 | 1.20M | } else { // level > 0 |
3737 | | // Compute the ratio of current size to size limit. |
3738 | 1.20M | uint64_t level_bytes_no_compacting = 0; |
3739 | 1.20M | uint64_t level_total_bytes = 0; |
3740 | 1.20M | for (auto f : files_[level]) { |
3741 | 0 | level_total_bytes += f->fd.GetFileSize(); |
3742 | 0 | if (!f->being_compacted) { |
3743 | 0 | level_bytes_no_compacting += f->compensated_file_size; |
3744 | 0 | } |
3745 | 0 | } |
3746 | 1.20M | if (!immutable_options.level_compaction_dynamic_level_bytes) { |
3747 | 0 | score = static_cast<double>(level_bytes_no_compacting) / |
3748 | 0 | MaxBytesForLevel(level); |
3749 | 1.20M | } else { |
3750 | 1.20M | if (level_bytes_no_compacting < MaxBytesForLevel(level)) { |
3751 | 1.20M | score = static_cast<double>(level_bytes_no_compacting) / |
3752 | 1.20M | MaxBytesForLevel(level); |
3753 | 1.20M | } else { |
3754 | | // If a large amount of data is being compacted down to the
3755 | | // current level soon, we de-prioritize compaction from a level
3756 | | // where the incoming data would make up a large ratio. We do
3757 | | // this by dividing the level size not by the target level size,
3758 | | // but by the target size plus the incoming compaction bytes.
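     | | // E.g. (hypothetical sizes): 100MB at this level against a 50MB
     | | // target alone would give 2.0, but with 150MB about to arrive it
     | | // scores 100 / (50 + 150) = 0.5 (times kScoreScale), deferring this
     | | // level until the incoming data has landed.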
3759 | 0 | score = static_cast<double>(level_bytes_no_compacting) / |
3760 | 0 | (MaxBytesForLevel(level) + total_downcompact_bytes) * |
3761 | 0 | kScoreScale; |
3762 | 0 | } |
3763 | | // Drain unnecessary levels, but with lower priority compared to |
3764 | | // when L0 is eligible. Only non-empty levels can be unnecessary. |
3765 | | // If there are no unnecessary levels, lowest_unnecessary_level_ = -1.
3766 | 1.20M | if (level_bytes_no_compacting > 0 && |
3767 | 0 | level <= lowest_unnecessary_level_) { |
3768 | 0 | score = std::max( |
3769 | 0 | score, kScoreScale * |
3770 | 0 | (1.001 + 0.001 * (lowest_unnecessary_level_ - level))); |
3771 | 0 | } |
3772 | 1.20M | } |
3773 | 1.20M | if (level <= lowest_unnecessary_level_) { |
3774 | 0 | total_downcompact_bytes += level_total_bytes; |
3775 | 1.20M | } else if (level_total_bytes > MaxBytesForLevel(level)) { |
3776 | 0 | total_downcompact_bytes += |
3777 | 0 | static_cast<double>(level_total_bytes - MaxBytesForLevel(level)); |
3778 | 0 | } |
3779 | 1.20M | } |
3780 | 1.44M | compaction_level_[level] = level; |
3781 | 1.44M | compaction_score_[level] = score; |
3782 | 1.44M | } |
3783 | | |
3784 | | // Sort all the levels based on their score. Higher scores get listed
3785 | | // first. Use bubble sort because the number of entries is small.
3786 | 1.44M | for (int i = 0; i < num_levels() - 2; i++) { |
3787 | 4.80M | for (int j = i + 1; j < num_levels() - 1; j++) { |
3788 | 3.60M | if (compaction_score_[i] < compaction_score_[j]) { |
3789 | 0 | double score = compaction_score_[i]; |
3790 | 0 | int level = compaction_level_[i]; |
3791 | 0 | compaction_score_[i] = compaction_score_[j]; |
3792 | 0 | compaction_level_[i] = compaction_level_[j]; |
3793 | 0 | compaction_score_[j] = score; |
3794 | 0 | compaction_level_[j] = level; |
3795 | 0 | } |
3796 | 3.60M | } |
3797 | 1.20M | } |
3798 | 240k | ComputeFilesMarkedForCompaction(max_output_level); |
3799 | 240k | ComputeBottommostFilesMarkedForCompaction( |
3800 | 240k | immutable_options.cf_allow_ingest_behind || |
3801 | 240k | immutable_options.allow_ingest_behind); |
3802 | 240k | ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); |
3803 | 240k | ComputeFilesMarkedForPeriodicCompaction( |
3804 | 240k | immutable_options, mutable_cf_options.periodic_compaction_seconds, |
3805 | 240k | max_output_level); |
3806 | 240k | ComputeFilesMarkedForForcedBlobGC( |
3807 | 240k | mutable_cf_options.blob_garbage_collection_age_cutoff, |
3808 | 240k | mutable_cf_options.blob_garbage_collection_force_threshold, |
3809 | 240k | mutable_cf_options.enable_blob_garbage_collection); |
3810 | | |
3811 | 240k | EstimateCompactionBytesNeeded(mutable_cf_options); |
3812 | 240k | } |
3813 | | |
3814 | 240k | void VersionStorageInfo::ComputeFilesMarkedForCompaction(int last_level) { |
3815 | 240k | files_marked_for_compaction_.clear(); |
3816 | 240k | int last_qualify_level = 0; |
3817 | 240k | standalone_range_tombstone_files_mark_threshold_ = kMaxSequenceNumber; |
3818 | | |
3819 | | // Do not include files from the last level that holds data.
3820 | | // If the table properties collector suggests a file on the last level,
3821 | | // we should not move it to a new level.
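     | | // For example, with 7 levels where L5 is the last level holding
     | | // data, last_qualify_level is 4, so only files on L0..L4 can be
     | | // marked here.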
3822 | 1.48M | for (int level = last_level; level >= 1; level--) { |
3823 | 1.27M | if (!files_[level].empty()) { |
3824 | 32.7k | last_qualify_level = level - 1; |
3825 | 32.7k | break; |
3826 | 32.7k | } |
3827 | 1.27M | } |
3828 | | |
3829 | 643k | for (int level = 0; level <= last_qualify_level; level++) { |
3830 | 403k | for (auto* f : files_[level]) { |
3831 | 137k | if (!f->being_compacted && f->marked_for_compaction) { |
3832 | 0 | files_marked_for_compaction_.emplace_back(level, f); |
3833 | 0 | if (f->FileIsStandAloneRangeTombstone()) { |
3834 | 0 | standalone_range_tombstone_files_mark_threshold_ = |
3835 | 0 | std::min(standalone_range_tombstone_files_mark_threshold_, |
3836 | 0 | f->fd.smallest_seqno); |
3837 | 0 | } |
3838 | 0 | } |
3839 | 137k | } |
3840 | 403k | } |
3841 | 240k | } |
3842 | | |
3843 | | void VersionStorageInfo::ComputeExpiredTtlFiles( |
3844 | 240k | const ImmutableOptions& ioptions, const uint64_t ttl) { |
3845 | 240k | expired_ttl_files_.clear(); |
3846 | 240k | if (ttl == 0 || compaction_style_ != CompactionStyle::kCompactionStyleLevel) { |
3847 | 0 | return; |
3848 | 0 | } |
3849 | | |
3850 | 240k | int64_t _current_time; |
3851 | 240k | auto status = ioptions.clock->GetCurrentTime(&_current_time); |
3852 | 240k | if (!status.ok()) { |
3853 | 0 | return; |
3854 | 0 | } |
3855 | 240k | const uint64_t current_time = static_cast<uint64_t>(_current_time); |
3856 | | |
3857 | 1.68M | for (int level = 0; level < num_levels() - 1; level++) { |
3858 | 1.44M | for (FileMetaData* f : files_[level]) { |
3859 | 137k | if (!f->being_compacted) { |
3860 | 122k | uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); |
3861 | 122k | if (oldest_ancester_time > 0 && |
3862 | 122k | oldest_ancester_time < (current_time - ttl)) { |
3863 | 0 | expired_ttl_files_.emplace_back(level, f); |
3864 | 0 | } |
3865 | 122k | } |
3866 | 137k | } |
3867 | 1.44M | } |
3868 | 240k | } |
3869 | | |
3870 | | void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( |
3871 | | const ImmutableOptions& ioptions, |
3872 | 240k | const uint64_t periodic_compaction_seconds, int last_level) { |
3873 | 240k | files_marked_for_periodic_compaction_.clear(); |
3874 | 240k | if (periodic_compaction_seconds == 0) { |
3875 | 240k | return; |
3876 | 240k | } |
3877 | | |
3878 | 0 | int64_t temp_current_time; |
3879 | 0 | auto status = ioptions.clock->GetCurrentTime(&temp_current_time); |
3880 | 0 | if (!status.ok()) { |
3881 | 0 | return; |
3882 | 0 | } |
3883 | 0 | const uint64_t current_time = static_cast<uint64_t>(temp_current_time); |
3884 | | |
3885 | | // If periodic_compaction_seconds is larger than current time, periodic |
3886 | | // compaction can't possibly be triggered. |
3887 | 0 | if (periodic_compaction_seconds > current_time) { |
3888 | 0 | return; |
3889 | 0 | } |
3890 | | |
3891 | 0 | const uint64_t allowed_time_limit = |
3892 | 0 | current_time - periodic_compaction_seconds; |
3893 | | |
3894 | | // Find the adjusted_allowed_time_limit such that it includes files that
3895 | | // are going to expire by the time the next daily offpeak window starts.
3896 | 0 | const OffpeakTimeInfo offpeak_time_info = |
3897 | 0 | offpeak_time_option_.GetOffpeakTimeInfo(current_time); |
3898 | 0 | const uint64_t adjusted_allowed_time_limit = |
3899 | 0 | allowed_time_limit + |
3900 | 0 | (offpeak_time_info.is_now_offpeak |
3901 | 0 | ? offpeak_time_info.seconds_till_next_offpeak_start |
3902 | 0 | : 0); |
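     | | // Worked example (hypothetical clock values): with current_time =
     | | // 100000, periodic_compaction_seconds = 86400, and offpeak in effect
     | | // with the next offpeak start 3600s away, the limit is
     | | // (100000 - 86400) + 3600 = 17200, so files that would expire before
     | | // the next offpeak start are picked up early.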
3903 | |
3904 | 0 | for (int level = 0; level <= last_level; level++) { |
3905 | 0 | for (auto f : files_[level]) { |
3906 | 0 | if (!f->being_compacted) { |
3907 | | // Compute a file's modification time in the following order: |
3908 | | // 1. Use file_creation_time table property if it is > 0. |
3909 | | // 2. Use creation_time table property if it is > 0. |
3910 | | // 3. Use file's mtime metadata if the above two table properties are 0. |
3911 | | // Don't consider the file at all if the modification time cannot be |
3912 | | // correctly determined based on the above conditions. |
3913 | 0 | uint64_t file_modification_time = f->TryGetFileCreationTime(); |
3914 | 0 | if (file_modification_time == kUnknownFileCreationTime) { |
3915 | 0 | file_modification_time = f->TryGetOldestAncesterTime(); |
3916 | 0 | } |
3917 | 0 | if (file_modification_time == kUnknownOldestAncesterTime) { |
3918 | 0 | auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(), |
3919 | 0 | f->fd.GetPathId()); |
3920 | 0 | status = ioptions.env->GetFileModificationTime( |
3921 | 0 | file_path, &file_modification_time); |
3922 | 0 | if (!status.ok()) { |
3923 | 0 | ROCKS_LOG_WARN(ioptions.logger, |
3924 | 0 | "Can't get file modification time: %s: %s", |
3925 | 0 | file_path.c_str(), status.ToString().c_str()); |
3926 | 0 | continue; |
3927 | 0 | } |
3928 | 0 | } |
3929 | 0 | if (file_modification_time > 0 && |
3930 | 0 | file_modification_time < adjusted_allowed_time_limit) { |
3931 | 0 | files_marked_for_periodic_compaction_.emplace_back(level, f); |
3932 | 0 | } |
3933 | 0 | } |
3934 | 0 | } |
3935 | 0 | } |
3936 | 0 | } |
3937 | | |
3938 | | void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( |
3939 | | double blob_garbage_collection_age_cutoff, |
3940 | | double blob_garbage_collection_force_threshold, |
3941 | 240k | bool enable_blob_garbage_collection) { |
3942 | 240k | files_marked_for_forced_blob_gc_.clear(); |
3943 | 240k | if (!(enable_blob_garbage_collection && |
3944 | 0 | blob_garbage_collection_age_cutoff > 0.0 && |
3945 | 240k | blob_garbage_collection_force_threshold < 1.0)) { |
3946 | 240k | return; |
3947 | 240k | } |
3948 | | |
3949 | 0 | if (blob_files_.empty()) { |
3950 | 0 | return; |
3951 | 0 | } |
3952 | | |
3953 | | // Number of blob files eligible for GC based on age |
3954 | 0 | const size_t cutoff_count = static_cast<size_t>( |
3955 | 0 | blob_garbage_collection_age_cutoff * blob_files_.size()); |
3956 | 0 | if (!cutoff_count) { |
3957 | 0 | return; |
3958 | 0 | } |
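     | | // E.g., with blob_garbage_collection_age_cutoff = 0.25 and 8 blob
     | | // files, cutoff_count = 2: only the two oldest blob files enter the
     | | // garbage-ratio computation below.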
3959 | | |
3960 | | // Compute the sum of total and garbage bytes over the batch of blob files |
3961 | | // currently eligible for garbage collection based on |
3962 | | // blob_garbage_collection_age_cutoff, and if the garbage ratio exceeds |
3963 | | // blob_garbage_collection_force_threshold, schedule compaction for the |
3964 | | // SST files that reference the oldest batch of blob files. Here is a toy |
3965 | | // example. Let's assume we have three SSTs 1, 2, and 3, and four blob files |
3966 | | // 10, 11, 12, and 13, which correspond to the range that is eligible for GC |
3967 | | // and satisfy the garbage ratio threshold. Also, let's say SSTs 1 and 2 both |
3968 | | // rely on blob file 10 and potentially some higher-numbered ones, while SST 3 |
3969 | | // relies on blob file 12 and potentially some higher-numbered ones. Then, the |
3970 | | // SST to oldest blob file mapping is as follows: |
3971 | | // |
3972 | | // SST file number Oldest blob file number |
3973 | | // 1 10 |
3974 | | // 2 10 |
3975 | | // 3 12 |
3976 | | // |
3977 | | // This is what the same thing looks like from the blob files' POV. (Note that |
3978 | | // the linked SSTs simply denote the inverse mapping of the above.) |
3979 | | // |
3980 | | // Blob file number Linked SST set |
3981 | | // 10 {1, 2} |
3982 | | // 11 {} |
3983 | | // 12 {3} |
3984 | | // 13 {} |
3985 | | // |
3986 | | // Then, the oldest batch of blob files consists of blob files 10 and 11, |
3987 | | // and we can get rid of them by forcing the compaction of SSTs 1 and 2. |
3988 | 0 | const auto& oldest_meta = blob_files_.front(); |
3989 | 0 | assert(oldest_meta); |
3990 | |
3991 | 0 | const auto& linked_ssts = oldest_meta->GetLinkedSsts(); |
3992 | 0 | assert(!linked_ssts.empty()); |
3993 | |
3994 | 0 | size_t count = 1; |
3995 | 0 | uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes(); |
3996 | 0 | uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes(); |
3997 | |
3998 | 0 | assert(cutoff_count <= blob_files_.size()); |
3999 | |
4000 | 0 | for (; count < cutoff_count; ++count) { |
4001 | 0 | const auto& meta = blob_files_[count]; |
4002 | 0 | assert(meta); |
4003 | |
4004 | 0 | sum_total_blob_bytes += meta->GetTotalBlobBytes(); |
4005 | 0 | sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); |
4006 | 0 | } |
4007 | |
4008 | 0 | if (sum_garbage_blob_bytes < |
4009 | 0 | blob_garbage_collection_force_threshold * sum_total_blob_bytes) { |
4010 | 0 | return; |
4011 | 0 | } |
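     | | // E.g. (made-up sizes): 600MB of garbage out of 1GB total in the
     | | // eligible batch with a force threshold of 0.5 passes the check
     | | // (600MB >= 0.5 * 1GB), so the SSTs linked to the oldest blob file
     | | // are marked below to rewrite that batch.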
4012 | | |
4013 | 0 | for (uint64_t sst_file_number : linked_ssts) { |
4014 | 0 | const FileLocation location = GetFileLocation(sst_file_number); |
4015 | 0 | assert(location.IsValid()); |
4016 | |
4017 | 0 | const int level = location.GetLevel(); |
4018 | 0 | assert(level >= 0); |
4019 | |
4020 | 0 | const size_t pos = location.GetPosition(); |
4021 | |
4022 | 0 | FileMetaData* const sst_meta = files_[level][pos]; |
4023 | 0 | assert(sst_meta); |
4024 | |
4025 | 0 | if (sst_meta->being_compacted) { |
4026 | 0 | continue; |
4027 | 0 | } |
4028 | | |
4029 | 0 | files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta); |
4030 | 0 | } |
4031 | 0 | } |
4032 | | |
4033 | | namespace { |
4034 | | |
4035 | | // used to sort files by size |
4036 | | struct Fsize { |
4037 | | size_t index; |
4038 | | FileMetaData* file; |
4039 | | }; |
4040 | | |
4041 | | // Comparator that is used to sort files based on their size:
4042 | | // descending compensated size
4043 | 0 | bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { |
4044 | 0 | return (first.file->compensated_file_size > |
4045 | 0 | second.file->compensated_file_size); |
4046 | 0 | } |
4047 | | } // anonymous namespace |
4048 | | |
4049 | 190k | void VersionStorageInfo::AddFile(int level, FileMetaData* f) { |
4050 | 190k | auto& level_files = files_[level]; |
4051 | 190k | level_files.push_back(f); |
4052 | | |
4053 | 190k | f->refs++; |
4054 | 190k | } |
4055 | | |
4056 | | void VersionStorageInfo::AddBlobFile( |
4057 | 0 | std::shared_ptr<BlobFileMetaData> blob_file_meta) { |
4058 | 0 | assert(blob_file_meta); |
4059 | |
4060 | 0 | assert(blob_files_.empty() || |
4061 | 0 | (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() < |
4062 | 0 | blob_file_meta->GetBlobFileNumber())); |
4063 | |
4064 | 0 | blob_files_.emplace_back(std::move(blob_file_meta)); |
4065 | 0 | } |
4066 | | |
4067 | | VersionStorageInfo::BlobFiles::const_iterator |
4068 | 138k | VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const { |
4069 | 138k | return std::lower_bound( |
4070 | 138k | blob_files_.begin(), blob_files_.end(), blob_file_number, |
4071 | 138k | [](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) { |
4072 | 0 | assert(lhs); |
4073 | 0 | return lhs->GetBlobFileNumber() < rhs; |
4074 | 0 | }); |
4075 | 138k | } |
4076 | | |
4077 | 233k | void VersionStorageInfo::SetFinalized() { |
4078 | 233k | finalized_ = true; |
4079 | | |
4080 | | #ifndef NDEBUG |
4081 | | if (compaction_style_ != kCompactionStyleLevel) { |
4082 | | // Not level based compaction. |
4083 | | return; |
4084 | | } |
4085 | | assert(base_level_ < 0 || num_levels() == 1 || |
4086 | | (base_level_ >= 1 && base_level_ < num_levels())); |
4087 | | // Verify all levels newer than base_level are empty except L0 |
4088 | | for (int level = 1; level < base_level(); level++) { |
4089 | | assert(NumLevelBytes(level) == 0); |
4090 | | } |
4091 | | uint64_t max_bytes_prev_level = 0; |
4092 | | for (int level = base_level(); level < num_levels() - 1; level++) { |
4093 | | if (LevelFiles(level).size() == 0) { |
4094 | | continue; |
4095 | | } |
4096 | | assert(MaxBytesForLevel(level) >= max_bytes_prev_level); |
4097 | | max_bytes_prev_level = MaxBytesForLevel(level); |
4098 | | } |
4099 | | for (int level = 0; level < num_levels(); level++) { |
4100 | | assert(LevelFiles(level).size() == 0 || |
4101 | | LevelFiles(level).size() == LevelFilesBrief(level).num_files); |
4102 | | if (LevelFiles(level).size() > 0) { |
4103 | | assert(level < num_non_empty_levels()); |
4104 | | } |
4105 | | } |
4106 | | assert(compaction_level_.size() > 0); |
4107 | | assert(compaction_level_.size() == compaction_score_.size()); |
4108 | | #endif |
4109 | 233k | } |
4110 | | |
4111 | 233k | void VersionStorageInfo::UpdateNumNonEmptyLevels() { |
4112 | 233k | num_non_empty_levels_ = num_levels_; |
4113 | 1.62M | for (int i = num_levels_ - 1; i >= 0; i--) { |
4114 | 1.45M | if (files_[i].size() != 0) { |
4115 | 66.7k | return; |
4116 | 1.39M | } else { |
4117 | 1.39M | num_non_empty_levels_ = i; |
4118 | 1.39M | } |
4119 | 1.45M | } |
4120 | 233k | } |
4121 | | |
4122 | | namespace { |
4123 | | // Sort `temp` based on ratio of overlapping size over file size |
4124 | | void SortFileByOverlappingRatio( |
4125 | | const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files, |
4126 | | const std::vector<FileMetaData*>& next_level_files, SystemClock* clock, |
4127 | | int level, int num_non_empty_levels, uint64_t ttl, |
4128 | 1.40M | std::vector<Fsize>* temp) { |
4129 | 1.40M | std::unordered_map<uint64_t, uint64_t> file_to_order; |
4130 | 1.40M | auto next_level_it = next_level_files.begin(); |
4131 | | |
4132 | 1.40M | int64_t curr_time; |
4133 | 1.40M | Status status = clock->GetCurrentTime(&curr_time); |
4134 | 1.40M | if (!status.ok()) { |
4135 | | // If we can't get time, disable TTL. |
4136 | 0 | ttl = 0; |
4137 | 0 | } |
4138 | | |
4139 | 1.40M | FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl, |
4140 | 1.40M | num_non_empty_levels, level); |
4141 | | |
4142 | 1.40M | for (auto& file : files) { |
4143 | 117k | uint64_t overlapping_bytes = 0; |
4144 | | // Skip next-level files whose ranges fall entirely before the current file
4145 | 117k | while (next_level_it != next_level_files.end() && |
4146 | 0 | icmp.Compare((*next_level_it)->largest, file->smallest) < 0) { |
4147 | 0 | next_level_it++; |
4148 | 0 | } |
4149 | | |
4150 | 117k | while (next_level_it != next_level_files.end() && |
4151 | 0 | icmp.Compare((*next_level_it)->smallest, file->largest) < 0) { |
4152 | 0 | overlapping_bytes += (*next_level_it)->fd.file_size; |
4153 | |
4154 | 0 | if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) { |
4155 | | // The next-level file crosses the largest boundary of the current file.
4156 | 0 | break; |
4157 | 0 | } |
4158 | 0 | next_level_it++; |
4159 | 0 | } |
4160 | | |
4161 | 117k | uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1; |
4162 | 117k | assert(ttl_boost_score > 0); |
4163 | 117k | assert(file->compensated_file_size != 0); |
4164 | 117k | file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U / |
4165 | 117k | file->compensated_file_size / |
4166 | 117k | ttl_boost_score; |
4167 | 117k | } |
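     | | // Illustration (hypothetical sizes): an 8MB file overlapping 4MB of
     | | // the next level gets order 4MB * 1024 / 8MB = 512. A smaller order
     | | // means less overlap per byte, so that file sorts first, which tends
     | | // to minimize write amplification.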
4168 | | |
4169 | 1.40M | size_t num_to_sort = temp->size() > VersionStorageInfo::kNumberFilesToSort |
4170 | 1.40M | ? VersionStorageInfo::kNumberFilesToSort |
4171 | 1.40M | : temp->size(); |
4172 | | |
4173 | 1.40M | std::partial_sort( |
4174 | 1.40M | temp->begin(), temp->begin() + num_to_sort, temp->end(), |
4175 | 1.40M | [&](const Fsize& f1, const Fsize& f2) -> bool { |
4176 | | // If the score is the same, pick the file with smaller keys.
4177 | | // This makes the algorithm more deterministic, and also
4178 | | // helps the trivial move case to have more files to
4179 | | // extend.
4180 | 115k | if (f1.file->marked_for_compaction == f2.file->marked_for_compaction) { |
4181 | 115k | if (file_to_order[f1.file->fd.GetNumber()] == |
4182 | 115k | file_to_order[f2.file->fd.GetNumber()]) { |
4183 | 115k | return icmp.Compare(f1.file->smallest, f2.file->smallest) < 0; |
4184 | 115k | } |
4185 | 0 | return file_to_order[f1.file->fd.GetNumber()] < |
4186 | 0 | file_to_order[f2.file->fd.GetNumber()]; |
4187 | 115k | } else { |
4188 | 0 | return f1.file->marked_for_compaction > |
4189 | 0 | f2.file->marked_for_compaction; |
4190 | 0 | } |
4191 | 115k | }); |
4192 | 1.40M | } |
4193 | | |
4194 | | void SortFileByRoundRobin(const InternalKeyComparator& icmp, |
4195 | | std::vector<InternalKey>* compact_cursor, |
4196 | | bool level0_non_overlapping, int level, |
4197 | 0 | std::vector<Fsize>* temp) { |
4198 | 0 | if (level == 0 && !level0_non_overlapping) { |
4199 | | // Use kOldestSmallestSeqFirst when level == 0, since the
4200 | | // files may overlap (they are not fully sorted)
4201 | 0 | std::sort(temp->begin(), temp->end(), |
4202 | 0 | [](const Fsize& f1, const Fsize& f2) -> bool { |
4203 | 0 | return f1.file->fd.smallest_seqno < f2.file->fd.smallest_seqno; |
4204 | 0 | }); |
4205 | 0 | return; |
4206 | 0 | } |
4207 | | |
4208 | 0 | bool should_move_files = |
4209 | 0 | compact_cursor->at(level).size() > 0 && temp->size() > 1; |
4210 | | |
4211 | | // The iterator points to the Fsize with smallest key larger than or equal to |
4212 | | // the given cursor |
4213 | 0 | std::vector<Fsize>::iterator current_file_iter; |
4214 | 0 | if (should_move_files) { |
4215 | | // Find the file whose smallest key is larger than or equal to
4216 | | // the cursor (the smallest key in the successor file of the last
4217 | | // chosen file); skip this if the cursor is invalid or there is only
4218 | | // one file in this level
4219 | 0 | current_file_iter = std::lower_bound( |
4220 | 0 | temp->begin(), temp->end(), compact_cursor->at(level), |
4221 | 0 | [&](const Fsize& f, const InternalKey& cursor) -> bool { |
4222 | 0 | return icmp.Compare(cursor, f.file->smallest) > 0; |
4223 | 0 | }); |
4224 | |
4225 | 0 | should_move_files = |
4226 | 0 | current_file_iter != temp->end() && current_file_iter != temp->begin(); |
4227 | 0 | } |
4228 | 0 | if (should_move_files) { |
4229 | | // Construct a local temporary vector |
4230 | 0 | std::vector<Fsize> local_temp; |
4231 | 0 | local_temp.reserve(temp->size()); |
4232 | | // Move the selected file into the first position and its successors
4233 | | // into the second, third, ..., positions |
4234 | 0 | for (auto iter = current_file_iter; iter != temp->end(); iter++) { |
4235 | 0 | local_temp.push_back(*iter); |
4236 | 0 | } |
4237 | | // Move the original predecessors of the selected file in a round-robin
4238 | | // manner
4239 | 0 | for (auto iter = temp->begin(); iter != current_file_iter; iter++) { |
4240 | 0 | local_temp.push_back(*iter); |
4241 | 0 | } |
4242 | | // Replace all the items in temp |
4243 | 0 | for (size_t i = 0; i < local_temp.size(); i++) { |
4244 | 0 | temp->at(i) = local_temp[i]; |
4245 | 0 | } |
4246 | 0 | } |
4247 | 0 | } |
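     | | // Rotation example: given files [A, B, C, D] ordered by smallest key
     | | // and a cursor landing at C's smallest key, the resulting order is
     | | // [C, D, A, B], so round-robin compaction resumes where it left off.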
4248 | | } // anonymous namespace |
4249 | | |
4250 | | void VersionStorageInfo::UpdateFilesByCompactionPri( |
4251 | 233k | const ImmutableOptions& ioptions, const MutableCFOptions& options) { |
4252 | 233k | if (compaction_style_ == kCompactionStyleNone || |
4253 | 233k | compaction_style_ == kCompactionStyleFIFO || |
4254 | 233k | compaction_style_ == kCompactionStyleUniversal) { |
4255 | | // don't need this |
4256 | 0 | return; |
4257 | 0 | } |
4258 | | // No need to sort the highest level because it is never compacted. |
4259 | 1.63M | for (int level = 0; level < num_levels() - 1; level++) { |
4260 | 1.40M | const std::vector<FileMetaData*>& files = files_[level]; |
4261 | 1.40M | auto& files_by_compaction_pri = files_by_compaction_pri_[level]; |
4262 | 1.40M | assert(files_by_compaction_pri.size() == 0); |
4263 | | |
4264 | | // populate a temp vector for sorting based on size |
4265 | 1.40M | std::vector<Fsize> temp(files.size()); |
4266 | 1.51M | for (size_t i = 0; i < files.size(); i++) { |
4267 | 117k | temp[i].index = i; |
4268 | 117k | temp[i].file = files[i]; |
4269 | 117k | } |
4270 | | |
4271 | | // sort the top kNumberFilesToSort based on file size |
4272 | 1.40M | size_t num = VersionStorageInfo::kNumberFilesToSort; |
4273 | 1.40M | if (num > temp.size()) { |
4274 | 1.40M | num = temp.size(); |
4275 | 1.40M | } |
4276 | 1.40M | switch (ioptions.compaction_pri) { |
4277 | 0 | case kByCompensatedSize: |
4278 | 0 | std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), |
4279 | 0 | CompareCompensatedSizeDescending); |
4280 | 0 | break; |
4281 | 0 | case kOldestLargestSeqFirst: |
4282 | 0 | std::sort(temp.begin(), temp.end(), |
4283 | 0 | [](const Fsize& f1, const Fsize& f2) -> bool { |
4284 | 0 | return f1.file->fd.largest_seqno < |
4285 | 0 | f2.file->fd.largest_seqno; |
4286 | 0 | }); |
4287 | 0 | break; |
4288 | 0 | case kOldestSmallestSeqFirst: |
4289 | 0 | std::sort(temp.begin(), temp.end(), |
4290 | 0 | [](const Fsize& f1, const Fsize& f2) -> bool { |
4291 | 0 | return f1.file->fd.smallest_seqno < |
4292 | 0 | f2.file->fd.smallest_seqno; |
4293 | 0 | }); |
4294 | 0 | break; |
4295 | 1.40M | case kMinOverlappingRatio: |
4296 | 1.40M | SortFileByOverlappingRatio(*internal_comparator_, files_[level], |
4297 | 1.40M | files_[level + 1], ioptions.clock, level, |
4298 | 1.40M | num_non_empty_levels_, options.ttl, &temp); |
4299 | 1.40M | break; |
4300 | 0 | case kRoundRobin: |
4301 | 0 | SortFileByRoundRobin(*internal_comparator_, &compact_cursor_, |
4302 | 0 | level0_non_overlapping_, level, &temp); |
4303 | 0 | break; |
4304 | 0 | default: |
4305 | 0 | assert(false); |
4306 | 1.40M | } |
4307 | 1.40M | assert(temp.size() == files.size()); |
4308 | | |
4309 | | // initialize files_by_compaction_pri_ |
4310 | 1.51M | for (size_t i = 0; i < temp.size(); i++) { |
4311 | 117k | files_by_compaction_pri.push_back(static_cast<int>(temp[i].index)); |
4312 | 117k | } |
4313 | 1.40M | next_file_to_compact_by_size_[level] = 0; |
4314 | 1.40M | assert(files_[level].size() == files_by_compaction_pri_[level].size()); |
4315 | 1.40M | } |
4316 | 233k | } |
4317 | | |
4318 | 233k | void VersionStorageInfo::GenerateLevel0NonOverlapping() { |
4319 | 233k | assert(!finalized_); |
4320 | 233k | level0_non_overlapping_ = true; |
4321 | 233k | if (level_files_brief_.size() == 0) { |
4322 | 166k | return; |
4323 | 166k | } |
4324 | | |
4325 | | // A copy of L0 files sorted by smallest key |
4326 | 66.7k | std::vector<FdWithKeyRange> level0_sorted_file( |
4327 | 66.7k | level_files_brief_[0].files, |
4328 | 66.7k | level_files_brief_[0].files + level_files_brief_[0].num_files); |
4329 | 66.7k | std::sort(level0_sorted_file.begin(), level0_sorted_file.end(), |
4330 | 66.7k | [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool { |
4331 | 62.3k | return (internal_comparator_->Compare(f1.smallest_key, |
4332 | 62.3k | f2.smallest_key) < 0); |
4333 | 62.3k | }); |
4334 | | |
4335 | 121k | for (size_t i = 1; i < level0_sorted_file.size(); ++i) { |
4336 | 56.0k | FdWithKeyRange& f = level0_sorted_file[i]; |
4337 | 56.0k | FdWithKeyRange& prev = level0_sorted_file[i - 1]; |
4338 | 56.0k | if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) { |
4339 | 1.41k | level0_non_overlapping_ = false; |
4340 | 1.41k | break; |
4341 | 1.41k | } |
4342 | 56.0k | } |
4343 | 66.7k | } |
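     | | // Example: L0 files with user-key ranges [a,c], [d,f], [e,g], sorted
     | | // by smallest key, overlap because f >= e (prev.largest >=
     | | // next.smallest), so level0_non_overlapping_ ends up false.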
4344 | | |
4345 | 233k | void VersionStorageInfo::GenerateBottommostFiles() { |
4346 | 233k | assert(!finalized_); |
4347 | 233k | assert(bottommost_files_.empty()); |
4348 | 474k | for (size_t level = 0; level < level_files_brief_.size(); ++level) { |
4349 | 431k | for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files; |
4350 | 240k | ++file_idx) { |
4351 | 190k | const FdWithKeyRange& f = level_files_brief_[level].files[file_idx]; |
4352 | 190k | int l0_file_idx; |
4353 | 190k | if (level == 0) { |
4354 | 117k | l0_file_idx = static_cast<int>(file_idx); |
4355 | 117k | } else { |
4356 | 73.0k | l0_file_idx = -1; |
4357 | 73.0k | } |
4358 | 190k | Slice smallest_user_key = ExtractUserKey(f.smallest_key); |
4359 | 190k | Slice largest_user_key = ExtractUserKey(f.largest_key); |
4360 | 190k | if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key, |
4361 | 190k | static_cast<int>(level), |
4362 | 190k | l0_file_idx)) { |
4363 | 110k | bottommost_files_.emplace_back(static_cast<int>(level), |
4364 | 110k | f.file_metadata); |
4365 | 110k | } |
4366 | 190k | } |
4367 | 240k | } |
4368 | 233k | } |
4369 | | |
4370 | 233k | void VersionStorageInfo::GenerateFileLocationIndex() { |
4371 | 233k | size_t num_files = 0; |
4372 | | |
4373 | 1.86M | for (int level = 0; level < num_levels_; ++level) { |
4374 | 1.63M | num_files += files_[level].size(); |
4375 | 1.63M | } |
4376 | | |
4377 | 233k | file_locations_.reserve(num_files); |
4378 | | |
4379 | 1.86M | for (int level = 0; level < num_levels_; ++level) { |
4380 | 1.82M | for (size_t pos = 0; pos < files_[level].size(); ++pos) { |
4381 | 190k | const FileMetaData* const meta = files_[level][pos]; |
4382 | 190k | assert(meta); |
4383 | | |
4384 | 190k | const uint64_t file_number = meta->fd.GetNumber(); |
4385 | | |
4386 | 190k | assert(file_locations_.find(file_number) == file_locations_.end()); |
4387 | 190k | file_locations_.emplace(file_number, FileLocation(level, pos)); |
4388 | 190k | } |
4389 | 1.63M | } |
4390 | 233k | } |
4391 | | |
4392 | | void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum, |
4393 | 1.34k | bool allow_ingest_behind) { |
4394 | 1.34k | assert(seqnum >= oldest_snapshot_seqnum_); |
4395 | 1.34k | oldest_snapshot_seqnum_ = seqnum; |
4396 | 1.34k | if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) { |
4397 | 1.29k | ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind); |
4398 | 1.29k | } |
4399 | 1.34k | } |
4400 | | |
4401 | | void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction( |
4402 | 241k | bool allow_ingest_behind) { |
4403 | 241k | bottommost_files_marked_for_compaction_.clear(); |
4404 | 241k | bottommost_files_mark_threshold_ = kMaxSequenceNumber; |
4405 | 241k | if (allow_ingest_behind) { |
4406 | 0 | return; |
4407 | 0 | } |
4408 | | // If a file's creation time is larger than creation_time_ub, |
4409 | | // it is too new to be marked for compaction. |
4410 | 241k | int64_t creation_time_ub = 0; |
4411 | 241k | bool needs_delay = bottommost_file_compaction_delay_ > 0; |
4412 | 241k | if (needs_delay) { |
4413 | 0 | int64_t current_time = 0; |
4414 | 0 | clock_->GetCurrentTime(¤t_time).PermitUncheckedError(); |
4415 | | // Note that if GetCurrentTime() fails, current_time will be 0. |
4416 | | // We will treat it as is and treat all files as too new. |
4417 | | // The subtraction will not underflow since |
4418 | | // bottommost_file_compaction_delay_ is of type uint32_t. |
4419 | 0 | creation_time_ub = |
4420 | 0 | current_time - static_cast<int64_t>(bottommost_file_compaction_delay_); |
4421 | 0 | } |
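     | | // E.g. (hypothetical times): current_time = 10000 with a 3600s delay
     | | // gives creation_time_ub = 6400; an otherwise-eligible bottommost
     | | // file created at 7000 is skipped for now and retried the next time
     | | // this method runs.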
4422 | | |
4423 | 241k | for (auto& level_and_file : bottommost_files_) { |
4424 | 125k | if (!level_and_file.second->being_compacted && |
4425 | 118k | level_and_file.second->fd.largest_seqno != 0) { |
4426 | | // largest_seqno might be nonzero due to containing the final key in an |
4427 | | // earlier compaction, whose seqnum we didn't zero out. |
4428 | 106k | if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) { |
4429 | 3.33k | if (!needs_delay) { |
4430 | 3.33k | bottommost_files_marked_for_compaction_.push_back(level_and_file); |
4431 | 3.33k | } else if (creation_time_ub > 0) { |
4432 | 0 | int64_t creation_time = static_cast<int64_t>( |
4433 | 0 | level_and_file.second->TryGetFileCreationTime()); |
4434 | 0 | if (creation_time == kUnknownFileCreationTime || |
4435 | 0 | creation_time <= creation_time_ub) { |
4436 | 0 | bottommost_files_marked_for_compaction_.push_back(level_and_file); |
4437 | 0 | } else { |
4438 | | // Just ignore this file for both |
4439 | | // bottommost_files_marked_for_compaction_ and |
4440 | | // bottommost_files_mark_threshold_. The next time |
4441 | | // this method is called, it will try this file again. The method |
4442 | | // is called after a new Version creation (compaction, flush, etc.), |
4443 | | // after a compaction is picked, and after a snapshot newer than |
4444 | | // bottommost_files_mark_threshold_ is released. |
4445 | 0 | } |
4446 | 0 | } else { |
4447 | | // creation_time_ub <= 0, all files are too new to be marked for |
4448 | | // compaction. |
4449 | 0 | } |
4450 | 102k | } else { |
4451 | 102k | bottommost_files_mark_threshold_ = |
4452 | 102k | std::min(bottommost_files_mark_threshold_, |
4453 | 102k | level_and_file.second->fd.largest_seqno); |
4454 | 102k | } |
4455 | 106k | } |
4456 | 125k | } |
4457 | 241k | } |
4458 | | |
4459 | 596k | void Version::Ref() { ++refs_; } |
4460 | | |
4461 | 596k | bool Version::Unref() { |
4462 | 596k | assert(refs_ >= 1); |
4463 | 596k | --refs_; |
4464 | 596k | if (refs_ == 0) { |
4465 | 328k | delete this; |
4466 | 328k | return true; |
4467 | 328k | } |
4468 | 267k | return false; |
4469 | 596k | } |
4470 | | |
4471 | | bool VersionStorageInfo::OverlapInLevel(int level, |
4472 | | const Slice* smallest_user_key, |
4473 | 0 | const Slice* largest_user_key) { |
4474 | 0 | if (level >= num_non_empty_levels_) { |
4475 | | // empty level, no overlap |
4476 | 0 | return false; |
4477 | 0 | } |
4478 | 0 | return SomeFileOverlapsRange(*internal_comparator_, (level > 0), |
4479 | 0 | level_files_brief_[level], smallest_user_key, |
4480 | 0 | largest_user_key); |
4481 | 0 | } |
4482 | | |
4483 | | // Store in "*inputs" all files in "level" that overlap [begin,end] |
4484 | | // If hint_index is specified, then it points to a file in the |
4485 | | // overlapping range. |
4486 | | // The file_index returns a pointer to any file in an overlapping range. |
4487 | | void VersionStorageInfo::GetOverlappingInputs( |
4488 | | int level, const InternalKey* begin, const InternalKey* end, |
4489 | | std::vector<FileMetaData*>* inputs, int hint_index, int* file_index, |
4490 | | bool expand_range, const FileMetaData* starting_l0_file, |
4491 | 23.7k | InternalKey** next_smallest) const { |
4492 | 23.7k | if (level >= num_non_empty_levels_) { |
4493 | | // this level is empty, no overlapping inputs |
4494 | 4.35k | return; |
4495 | 4.35k | } |
4496 | | |
4497 | 19.4k | inputs->clear(); |
4498 | 19.4k | if (file_index) { |
4499 | 7.81k | *file_index = -1; |
4500 | 7.81k | } |
4501 | 19.4k | const Comparator* user_cmp = user_comparator_; |
4502 | 19.4k | if (level > 0) { |
4503 | 12.4k | GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, |
4504 | 12.4k | file_index, false, next_smallest); |
4505 | 12.4k | return; |
4506 | 12.4k | } |
4507 | | |
4508 | 6.93k | if (next_smallest) { |
4509 | | // next_smallest key only makes sense for non-level 0, where files are |
4510 | | // non-overlapping |
4511 | 0 | *next_smallest = nullptr; |
4512 | 0 | } |
4513 | | |
4514 | 6.93k | Slice user_begin, user_end; |
4515 | 6.93k | if (begin != nullptr) { |
4516 | 6.93k | user_begin = begin->user_key(); |
4517 | 6.93k | } |
4518 | 6.93k | if (end != nullptr) { |
4519 | 6.93k | user_end = end->user_key(); |
4520 | 6.93k | } |
4521 | | |
4522 | | // `index` stores the indices of the files that still need to be checked.
4523 | 6.93k | std::list<size_t> index; |
4524 | 6.93k | size_t start_index = 0; |
4525 | 6.93k | if (starting_l0_file != nullptr) { |
4526 | 0 | uint64_t starting_file_number = starting_l0_file->fd.GetNumber(); |
4527 | 0 | for (size_t i = 0; i < level_files_brief_[level].num_files; i++) { |
4528 | 0 | if (level_files_brief_[level].files[i].fd.GetNumber() == |
4529 | 0 | starting_file_number) { |
4530 | 0 | start_index = i; |
4531 | 0 | break; |
4532 | 0 | } |
4533 | 0 | } |
4534 | 0 | assert(start_index < level_files_brief_[level].num_files); |
4535 | 0 | } |
4536 | 31.8k | for (size_t i = start_index; i < level_files_brief_[level].num_files; i++) { |
4537 | 24.8k | index.emplace_back(i); |
4538 | 24.8k | } |
4539 | | |
4540 | 13.8k | while (!index.empty()) { |
4541 | 8.97k | bool found_overlapping_file = false; |
4542 | 8.97k | auto iter = index.begin(); |
4543 | 37.4k | while (iter != index.end()) { |
4544 | 28.5k | FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); |
4545 | 28.5k | const Slice file_start = ExtractUserKey(f->smallest_key); |
4546 | 28.5k | const Slice file_limit = ExtractUserKey(f->largest_key); |
4547 | 28.5k | if (begin != nullptr && |
4548 | 28.5k | user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) { |
4549 | | // "f" is completely before specified range; skip it |
4550 | 16 | iter++; |
4551 | 28.5k | } else if (end != nullptr && |
4552 | 28.5k | user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) { |
4553 | | // "f" is completely after specified range; skip it |
4554 | 7.25k | iter++; |
4555 | 21.2k | } else { |
4556 | | // if overlap |
4557 | 21.2k | inputs->emplace_back(files_[level][*iter]); |
4558 | 21.2k | found_overlapping_file = true; |
4559 | | // record the first file index. |
4560 | 21.2k | if (file_index && *file_index == -1) { |
4561 | 0 | *file_index = static_cast<int>(*iter); |
4562 | 0 | } |
4563 | | // This file overlaps; erase it to avoid checking it again.
4564 | 21.2k | iter = index.erase(iter); |
4565 | 21.2k | if (expand_range) { |
4566 | 21.2k | if (begin != nullptr && |
4567 | 21.2k | user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) { |
4568 | 6 | user_begin = file_start; |
4569 | 6 | } |
4570 | 21.2k | if (end != nullptr && |
4571 | 21.2k | user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) { |
4572 | 677 | user_end = file_limit; |
4573 | 677 | } |
4574 | 21.2k | } |
4575 | 21.2k | } |
4576 | 28.5k | } |
4577 | | // if none of the remaining files overlap, stop
4578 | 8.97k | if (!found_overlapping_file) { |
4579 | 2.11k | break; |
4580 | 2.11k | } |
4581 | 8.97k | } |
4582 | 6.93k | } |
4583 | | |
4584 | | // Store in "*inputs" the maximum number of files in "level" that are
4585 | | // within range [begin,end], guaranteeing a "clean cut" boundary
4586 | | // between the files in inputs and the surrounding files.
4587 | | // This will ensure that no parts of a key are lost during compaction. |
4588 | | // If hint_index is specified, then it points to a file in the range. |
4589 | | // If file_index is non-null, it receives the index of one file in the range.
4590 | | void VersionStorageInfo::GetCleanInputsWithinInterval( |
4591 | | int level, const InternalKey* begin, const InternalKey* end, |
4592 | 2.02k | std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const { |
4593 | 2.02k | inputs->clear(); |
4594 | 2.02k | if (file_index) { |
4595 | 0 | *file_index = -1; |
4596 | 0 | } |
4597 | 2.02k | if (level >= num_non_empty_levels_ || level == 0 || |
4598 | 2.02k | level_files_brief_[level].num_files == 0) { |
4599 | | // this level is empty, no inputs within range |
4600 | | // also don't support clean input interval within L0 |
4601 | 2.02k | return; |
4602 | 2.02k | } |
4603 | | |
4604 | 0 | GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, |
4605 | 0 | file_index, true /* within_interval */); |
4606 | 0 | } |
4607 | | |
4608 | | // Store in "*inputs" all files in "level" that overlap [begin,end] |
4609 | | // Employ binary search to find at least one file that overlaps the |
4610 | | // specified range. From that file, iterate backwards and |
4611 | | // forwards to find all overlapping files. |
4612 | | // If within_interval is set, then only store the maximum set of clean
4613 | | // inputs within range [begin, end]. "Clean" means there is a boundary
4614 | | // between the files in "*inputs" and the surrounding files.
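     | | // E.g., files [a,b] [c,d] [d,e] and query [c,d] with within_interval:
     | | // [c,d] fits inside the interval, but its largest key d is shared
     | | // with the next file's smallest key, so no clean cut exists there and
     | | // nothing is returned; without within_interval, both [c,d] and [d,e]
     | | // would be.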
4615 | | void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( |
4616 | | int level, const InternalKey* begin, const InternalKey* end, |
4617 | | std::vector<FileMetaData*>* inputs, int hint_index, int* file_index, |
4618 | 12.4k | bool within_interval, InternalKey** next_smallest) const { |
4619 | 12.4k | assert(level > 0); |
4620 | | |
4621 | 12.4k | auto user_cmp = user_comparator_; |
4622 | 12.4k | const FdWithKeyRange* files = level_files_brief_[level].files; |
4623 | 12.4k | const int num_files = static_cast<int>(level_files_brief_[level].num_files); |
4624 | | |
4625 | | // Use binary search to find the lower bound
4626 | | // and the upper bound.
4627 | 12.4k | int start_index = 0; |
4628 | 12.4k | int end_index = num_files; |
4629 | | |
4630 | 12.4k | if (begin != nullptr) { |
4631 | | // If within_interval is true, compare the file's smallest key so that
4632 | | // std::lower_bound excludes files straddling "begin".
4633 | 12.4k | auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f, |
4634 | 22.1k | const InternalKey* k) { |
4635 | 22.1k | auto& file_key = within_interval ? f.file_metadata->smallest |
4636 | 22.1k | : f.file_metadata->largest; |
4637 | 22.1k | return sstableKeyCompare(user_cmp, file_key, *k) < 0; |
4638 | 22.1k | }; |
4639 | | |
4640 | 12.4k | start_index = static_cast<int>( |
4641 | 12.4k | std::lower_bound(files, |
4642 | 12.4k | files + (hint_index == -1 ? num_files : hint_index), |
4643 | 12.4k | begin, cmp) - |
4644 | 12.4k | files); |
4645 | | |
4646 | 12.4k | if (start_index > 0 && within_interval) { |
4647 | 0 | bool is_overlapping = true; |
4648 | 0 | while (is_overlapping && start_index < num_files) { |
4649 | 0 | auto& pre_limit = files[start_index - 1].file_metadata->largest; |
4650 | 0 | auto& cur_start = files[start_index].file_metadata->smallest; |
4651 | 0 | is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0; |
4652 | 0 | start_index += is_overlapping; |
4653 | 0 | } |
4654 | 0 | } |
4655 | 12.4k | } |
4656 | | |
4657 | 12.4k | if (end != nullptr) { |
4658 | | // If within_interval is true, compare the file's largest key so that
4659 | | // std::upper_bound excludes files straddling "end".
4660 | 12.4k | auto cmp = [&user_cmp, &within_interval](const InternalKey* k, |
4661 | 20.1k | const FdWithKeyRange& f) { |
4662 | 20.1k | auto& file_key = within_interval ? f.file_metadata->largest |
4663 | 20.1k | : f.file_metadata->smallest; |
4664 | 20.1k | return sstableKeyCompare(user_cmp, *k, file_key) < 0; |
4665 | 20.1k | }; |
4666 | | |
4667 | 12.4k | end_index = static_cast<int>( |
4668 | 12.4k | std::upper_bound(files + start_index, files + num_files, end, cmp) - |
4669 | 12.4k | files); |
4670 | | |
4671 | 12.4k | if (end_index < num_files && within_interval) { |
4672 | 0 | bool is_overlapping = true; |
4673 | 0 | while (is_overlapping && end_index > start_index) { |
4674 | 0 | auto& next_start = files[end_index].file_metadata->smallest; |
4675 | 0 | auto& cur_limit = files[end_index - 1].file_metadata->largest; |
4676 | 0 | is_overlapping = |
4677 | 0 | sstableKeyCompare(user_cmp, cur_limit, next_start) == 0; |
4678 | 0 | end_index -= is_overlapping; |
4679 | 0 | } |
4680 | 0 | } |
4681 | 12.4k | } |
4682 | | |
4683 | 12.4k | assert(start_index <= end_index); |
4684 | | |
4685 | | // If there were no overlapping files, return immediately. |
4686 | 12.4k | if (start_index == end_index) { |
4687 | 2.22k | if (next_smallest) { |
4688 | 0 | *next_smallest = nullptr; |
4689 | 0 | } |
4690 | 2.22k | return; |
4691 | 2.22k | } |
4692 | | |
4693 | 12.4k | assert(start_index < end_index); |
4694 | | |
4695 | | // returns the index where an overlap is found |
4696 | 10.2k | if (file_index) { |
4697 | 7.43k | *file_index = start_index; |
4698 | 7.43k | } |
4699 | | |
4700 | | // insert overlapping files into vector |
4701 | 30.6k | for (int i = start_index; i < end_index; i++) { |
4702 | 20.3k | inputs->push_back(files_[level][i]); |
4703 | 20.3k | } |
4704 | | |
4705 | 10.2k | if (next_smallest != nullptr) { |
4706 | | // Provide the next key outside the range covered by inputs |
4707 | 0 | if (end_index < static_cast<int>(files_[level].size())) { |
4708 | 0 | **next_smallest = files_[level][end_index]->smallest; |
4709 | 0 | } else { |
4710 | 0 | *next_smallest = nullptr; |
4711 | 0 | } |
4712 | 0 | } |
4713 | 10.2k | } |
4714 | | |
4715 | 398 | uint64_t VersionStorageInfo::NumLevelBytes(int level) const { |
4716 | 398 | assert(level >= 0); |
4717 | 398 | assert(level < num_levels()); |
4718 | 398 | return TotalFileSize(files_[level]); |
4719 | 398 | } |
4720 | | |
4721 | | const char* VersionStorageInfo::LevelSummary( |
4722 | 8.39k | LevelSummaryStorage* scratch) const { |
4723 | 8.39k | int len = 0; |
4724 | 8.39k | if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { |
4725 | 8.39k | assert(base_level_ < static_cast<int>(level_max_bytes_.size())); |
4726 | 8.39k | if (level_multiplier_ != 0.0) { |
4727 | 6.32k | len = snprintf( |
4728 | 6.32k | scratch->buffer, sizeof(scratch->buffer), |
4729 | 6.32k | "base level %d level multiplier %.2f max bytes base %" PRIu64 " ", |
4730 | 6.32k | base_level_, level_multiplier_, level_max_bytes_[base_level_]); |
4731 | 6.32k | } |
4732 | 8.39k | } |
4733 | 8.39k | len += |
4734 | 8.39k | snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files["); |
4735 | 67.1k | for (int i = 0; i < num_levels(); i++) { |
4736 | 58.7k | int sz = sizeof(scratch->buffer) - len; |
4737 | 58.7k | int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); |
4738 | 58.7k | if (ret < 0 || ret >= sz) { |
4739 | 0 | break; |
4740 | 0 | } |
4741 | 58.7k | len += ret; |
4742 | 58.7k | } |
4743 | 8.39k | if (len > 0) { |
4744 | | // overwrite the last space |
4745 | 8.39k | --len; |
4746 | 8.39k | } |
4747 | 8.39k | len += |
4748 | 8.39k | snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, |
4749 | 8.39k | "] max score %.2f, estimated pending compaction bytes %" PRIu64, |
4750 | 8.39k | compaction_score_[0], estimated_compaction_needed_bytes_); |
4751 | | |
4752 | 8.39k | if (!files_marked_for_compaction_.empty()) { |
4753 | 0 | snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, |
4754 | 0 | " (%" ROCKSDB_PRIszt " files need compaction)", |
4755 | 0 | files_marked_for_compaction_.size()); |
4756 | 0 | } |
4757 | | |
4758 | 8.39k | return scratch->buffer; |
4759 | 8.39k | } |
4760 | | |
4761 | | const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, |
4762 | 0 | int level) const { |
4763 | 0 | int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); |
4764 | 0 | for (const auto& f : files_[level]) { |
4765 | 0 | int sz = sizeof(scratch->buffer) - len; |
4766 | 0 | char sztxt[16]; |
4767 | 0 | AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt)); |
4768 | 0 | int ret = snprintf(scratch->buffer + len, sz, |
4769 | 0 | "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", |
4770 | 0 | f->fd.GetNumber(), f->fd.smallest_seqno, sztxt, |
4771 | 0 | static_cast<int>(f->being_compacted)); |
4772 | 0 | if (ret < 0 || ret >= sz) { |
4773 | 0 | break; |
4774 | 0 | } |
4775 | 0 | len += ret; |
4776 | 0 | } |
4777 | | // overwrite the last space (only if files_[level].size() is non-zero) |
4778 | 0 | if (files_[level].size() && len > 0) { |
4779 | 0 | --len; |
4780 | 0 | } |
4781 | 0 | snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); |
4782 | 0 | return scratch->buffer; |
4783 | 0 | } |
4784 | | |
4785 | 130k | bool VersionStorageInfo::HasMissingEpochNumber() const { |
4786 | 1.04M | for (int level = 0; level < num_levels_; ++level) { |
4787 | 916k | for (const FileMetaData* f : files_[level]) { |
4788 | 76.8k | if (f->epoch_number == kUnknownEpochNumber) { |
4789 | 0 | return true; |
4790 | 0 | } |
4791 | 76.8k | } |
4792 | 916k | } |
4793 | 130k | return false; |
4794 | 130k | } |
4795 | | |
4796 | 65.4k | uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const { |
4797 | 65.4k | uint64_t max_epoch_number = kUnknownEpochNumber; |
4798 | 523k | for (int level = 0; level < num_levels_; ++level) { |
4799 | 458k | for (const FileMetaData* f : files_[level]) { |
4800 | 76.8k | max_epoch_number = std::max(max_epoch_number, f->epoch_number); |
4801 | 76.8k | } |
4802 | 458k | } |
4803 | 65.4k | return max_epoch_number; |
4804 | 65.4k | } |
4805 | | |
4806 | | void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd, |
4807 | 65.4k | bool restart_epoch, bool force) { |
4808 | 65.4k | if (restart_epoch) { |
4809 | 65.4k | cfd->ResetNextEpochNumber(); |
4810 | | |
4811 | 65.4k | bool reserve_epoch_num_for_file_ingested_behind = cfd->AllowIngestBehind(); |
4812 | 65.4k | if (reserve_epoch_num_for_file_ingested_behind) { |
4813 | 0 | uint64_t reserved_epoch_number = cfd->NewEpochNumber(); |
4814 | 0 | assert(reserved_epoch_number == |
4815 | 0 | kReservedEpochNumberForFileIngestedBehind); |
4816 | 0 | ROCKS_LOG_INFO(cfd->ioptions().info_log.get(), |
4817 | 0 | "[%s]CF has reserved epoch number %" PRIu64 |
4818 | 0 | " for files ingested " |
4819 | 0 | "behind since `Options::allow_ingest_behind` or " |
4820 | 0 | "`Options::cf_allow_ingest_behind` is true", |
4821 | 0 | cfd->GetName().c_str(), reserved_epoch_number); |
4822 | 0 | } |
4823 | 65.4k | } |
4824 | | |
4825 | 65.4k | bool missing_epoch_number = HasMissingEpochNumber(); |
4826 | 65.4k | if (missing_epoch_number || force) { |
4827 | 0 | for (int level = num_levels_ - 1; level >= 1; --level) { |
4828 | 0 | auto& files_at_level = files_[level]; |
4829 | 0 | if (files_at_level.empty()) { |
4830 | 0 | continue; |
4831 | 0 | } |
4832 | 0 | uint64_t next_epoch_number = cfd->NewEpochNumber(); |
4833 | 0 | for (FileMetaData* f : files_at_level) { |
4834 | 0 | f->epoch_number = next_epoch_number; |
4835 | 0 | } |
4836 | 0 | } |
4837 | 0 | for (auto file_meta_iter = files_[0].rbegin(); |
4838 | 0 | file_meta_iter != files_[0].rend(); file_meta_iter++) { |
4839 | 0 | FileMetaData* f = *file_meta_iter; |
4840 | 0 | f->epoch_number = cfd->NewEpochNumber(); |
4841 | 0 | } |
4842 | 0 | if (missing_epoch_number) { |
4843 | 0 | assert(epoch_number_requirement_ == |
4844 | 0 | EpochNumberRequirement::kMightMissing); |
4845 | 0 | ROCKS_LOG_WARN(cfd->ioptions().info_log.get(), |
4846 | 0 | "[%s]CF's epoch numbers are inferred based on seqno", |
4847 | 0 | cfd->GetName().c_str()); |
4848 | 0 | epoch_number_requirement_ = EpochNumberRequirement::kMustPresent; |
4849 | 0 | } |
4850 | 65.4k | } else { |
4851 | 65.4k | assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent); |
4852 | 65.4k | cfd->SetNextEpochNumber( |
4853 | 65.4k | std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber())); |
4854 | 65.4k | } |
4855 | 65.4k | } |
4856 | | |
4857 | 0 | uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { |
4858 | 0 | uint64_t result = 0; |
4859 | 0 | std::vector<FileMetaData*> overlaps; |
4860 | 0 | for (int level = 1; level < num_levels() - 1; level++) { |
4861 | 0 | for (const auto& f : files_[level]) { |
4862 | 0 | GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); |
4863 | 0 | const uint64_t sum = TotalFileSize(overlaps); |
4864 | 0 | if (sum > result) { |
4865 | 0 | result = sum; |
4866 | 0 | } |
4867 | 0 | } |
4868 | 0 | } |
4869 | 0 | return result; |
4870 | 0 | } |
4871 | | |
4872 | 3.60M | uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const { |
4873 | | // Note: the result for level zero is not really used since we set |
4874 | | // the level-0 compaction threshold based on number of files. |
4875 | 3.60M | assert(level >= 0); |
4876 | 3.60M | assert(level < static_cast<int>(level_max_bytes_.size())); |
4877 | 3.60M | return level_max_bytes_[level]; |
4878 | 3.60M | } |
4879 | | |
4880 | | void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, |
4881 | 233k | const MutableCFOptions& options) { |
4882 | | // Special logic to set number of sorted runs. |
4883 | | // It is to match the previous behavior when all files are in L0. |
4884 | 233k | int num_l0_count = static_cast<int>(files_[0].size()); |
4885 | 233k | if (compaction_style_ == kCompactionStyleUniversal) { |
4886 | | // For universal compaction, we use level0 score to indicate |
4887 | | // compaction score for the whole DB. Adding other levels as if |
4888 | | // they are L0 files. |
4889 | 0 | for (int i = 1; i < num_levels(); i++) { |
4890 | 0 | if (!files_[i].empty()) { |
4891 | 0 | num_l0_count++; |
4892 | 0 | } |
4893 | 0 | } |
4894 | 0 | } |
4895 | 233k | set_l0_delay_trigger_count(num_l0_count); |
4896 | | |
4897 | 233k | level_max_bytes_.resize(ioptions.num_levels); |
4898 | 233k | if (!ioptions.level_compaction_dynamic_level_bytes) { |
4899 | 0 | base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1; |
4900 | | |
4901 | | // Calculate level sizes for the static base-bytes case
4902 | 0 | for (int i = 0; i < ioptions.num_levels; ++i) { |
4903 | 0 | if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) { |
4904 | 0 | level_max_bytes_[i] = options.max_bytes_for_level_base; |
4905 | 0 | } else if (i > 1) { |
4906 | 0 | level_max_bytes_[i] = MultiplyCheckOverflow( |
4907 | 0 | MultiplyCheckOverflow(level_max_bytes_[i - 1], |
4908 | 0 | options.max_bytes_for_level_multiplier), |
4909 | 0 | options.MaxBytesMultiplerAdditional(i - 1)); |
4910 | 0 | } else { |
4911 | 0 | level_max_bytes_[i] = options.max_bytes_for_level_base; |
4912 | 0 | } |
4913 | 0 | } |
4914 | 233k | } else { |
4915 | 233k | assert(ioptions.compaction_style == kCompactionStyleLevel); |
4916 | 233k | uint64_t max_level_size = 0; |
4917 | | |
4918 | 233k | int first_non_empty_level = -1; |
4919 | | // Find the size of the non-L0 level with the most data.
4920 | | // We cannot use the size of the last level because it can be empty or
4921 | | // smaller than previous levels after compaction.
4922 | 1.63M | for (int i = 1; i < num_levels_; i++) { |
4923 | 1.40M | uint64_t total_size = 0; |
4924 | 1.40M | for (const auto& f : files_[i]) { |
4925 | 73.0k | total_size += f->fd.GetFileSize(); |
4926 | 73.0k | } |
4927 | 1.40M | if (total_size > 0 && first_non_empty_level == -1) { |
4928 | 29.0k | first_non_empty_level = i; |
4929 | 29.0k | } |
4930 | 1.40M | if (total_size > max_level_size) { |
4931 | 29.0k | max_level_size = total_size; |
4932 | 29.0k | } |
4933 | 1.40M | } |
4934 | | |
4935 | | // Prefill every level's max bytes to disallow compaction from there. |
4936 | 1.86M | for (int i = 0; i < num_levels_; i++) { |
4937 | 1.63M | level_max_bytes_[i] = std::numeric_limits<uint64_t>::max(); |
4938 | 1.63M | } |
4939 | | |
4940 | 233k | lowest_unnecessary_level_ = -1; |
4941 | 233k | if (max_level_size == 0) { |
4942 | | // No data for L1 and up. L0 compacts to last level directly. |
4943 | | // No compaction from L1+ needs to be scheduled. |
4944 | 204k | base_level_ = num_levels_ - 1; |
4945 | 204k | } else { |
4946 | 29.0k | assert(first_non_empty_level >= 1); |
4947 | 29.0k | uint64_t base_bytes_max = options.max_bytes_for_level_base; |
4948 | 29.0k | uint64_t base_bytes_min = static_cast<uint64_t>( |
4949 | 29.0k | base_bytes_max / options.max_bytes_for_level_multiplier); |
4950 | | |
4951 | | // Check whether we can make the last level's target size max_level_size
4952 | 29.0k | uint64_t cur_level_size = max_level_size; |
4953 | 29.0k | for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) { |
4954 | | // Divide by the multiplier; the cast truncates the result
4955 | 0 | cur_level_size = static_cast<uint64_t>( |
4956 | 0 | cur_level_size / options.max_bytes_for_level_multiplier); |
4957 | 0 | if (lowest_unnecessary_level_ == -1 && |
4958 | 0 | cur_level_size <= base_bytes_min && |
4959 | 0 | (options.preclude_last_level_data_seconds == 0 || |
4960 | 0 | i < num_levels_ - 2)) { |
4961 | | // When per_key_placement is enabled, the proximal level is |
4962 | | // necessary. |
4963 | 0 | lowest_unnecessary_level_ = i; |
4964 | 0 | } |
4965 | 0 | } |
4966 | | |
4967 | | // Calculate base level and its size. |
4968 | 29.0k | uint64_t base_level_size; |
4969 | 29.0k | if (cur_level_size <= base_bytes_min) { |
4970 | | // If per_key_placement is not enabled, |
4971 | | // either there is only one non-empty level after level 0, |
4972 | | // which can be less than base_bytes_min AND necessary,
4973 | | // or there is some unnecessary level. |
4974 | 29.0k | assert(first_non_empty_level == num_levels_ - 1 || |
4975 | 29.0k | options.preclude_last_level_data_seconds > 0 || |
4976 | 29.0k | lowest_unnecessary_level_ != -1); |
4977 | | // Case 1. If we made the target size of the last level max_level_size,
4978 | | // the target size of the first non-empty level would be smaller than
4979 | | // base_bytes_min. We set it to be base_bytes_min + 1.
4980 | 29.0k | base_level_size = base_bytes_min + 1U; |
4981 | 29.0k | base_level_ = first_non_empty_level; |
4982 | 29.0k | if (base_level_ < num_levels_ - 1) { |
4983 | 0 | ROCKS_LOG_INFO( |
4984 | 0 | ioptions.logger, |
4985 | 0 | "More existing levels in DB than needed: all non-zero " |
4986 | 0 | "levels <= level %d are unnecessary. " |
4987 | 0 | "max_bytes_for_level_multiplier may not be guaranteed.", |
4988 | 0 | lowest_unnecessary_level_); |
4989 | 0 | } |
4990 | 29.0k | } else { |
4991 | 0 | assert(lowest_unnecessary_level_ == -1); |
4992 | | // Find base level (where L0 data is compacted to). |
4993 | 0 | base_level_ = first_non_empty_level; |
4994 | 0 | while (base_level_ > 1 && cur_level_size > base_bytes_max) { |
4995 | 0 | --base_level_; |
4996 | 0 | cur_level_size = static_cast<uint64_t>( |
4997 | 0 | cur_level_size / options.max_bytes_for_level_multiplier); |
4998 | 0 | } |
4999 | 0 | if (cur_level_size > base_bytes_max) { |
5000 | | // Even L1 will be too large |
5001 | 0 | assert(base_level_ == 1); |
5002 | 0 | base_level_size = base_bytes_max; |
5003 | 0 | } else { |
5004 | 0 | base_level_size = std::max(static_cast<uint64_t>(1), cur_level_size); |
5005 | 0 | } |
5006 | 0 | } |
5007 | | |
5008 | 29.0k | level_multiplier_ = options.max_bytes_for_level_multiplier; |
5009 | 29.0k | assert(base_level_size > 0); |
5010 | | |
5011 | 29.0k | uint64_t level_size = base_level_size; |
5012 | 58.0k | for (int i = base_level_; i < num_levels_; i++) { |
5013 | 29.0k | if (i > base_level_) { |
5014 | 0 | level_size = MultiplyCheckOverflow(level_size, level_multiplier_); |
5015 | 0 | } |
5016 | | // Don't set any level below base_bytes_max. Otherwise, the LSM can |
5017 | | // assume an hourglass shape where L1+ sizes are smaller than L0. This |
5018 | | // causes compaction scoring, which depends on level sizes, to favor L1+ |
5019 | | // at the expense of L0, which may fill up and stall. |
5020 | 29.0k | level_max_bytes_[i] = std::max(level_size, base_bytes_max); |
5021 | 29.0k | } |
5022 | 29.0k | } |
5023 | 233k | } |
5024 | 233k | } |
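
A minimal standalone sketch of the dynamic sizing above, assuming simplified inputs (LevelTargets, max_level_size, base_bytes_max and multiplier are hypothetical stand-ins for the options and members used in the real code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Walk down from the last level, dividing by the multiplier, to find the
// base level; then grow targets upward, never below base_bytes_max.
std::vector<uint64_t> LevelTargets(int num_levels, uint64_t max_level_size,
                                   uint64_t base_bytes_max, double multiplier) {
  std::vector<uint64_t> targets(num_levels, UINT64_MAX);  // "disallow" default
  uint64_t cur = max_level_size;
  int base_level = num_levels - 1;
  while (base_level > 1 && cur > base_bytes_max) {
    --base_level;
    cur = static_cast<uint64_t>(cur / multiplier);  // truncating cast
  }
  // If even L1 would be too large, clamp the base target to base_bytes_max.
  uint64_t level_size = std::max<uint64_t>(1, std::min(cur, base_bytes_max));
  for (int i = base_level; i < num_levels; i++) {
    if (i > base_level) {
      level_size = static_cast<uint64_t>(level_size * multiplier);
    }
    // Keep every target at or above base_bytes_max (no hourglass LSM).
    targets[i] = std::max(level_size, base_bytes_max);
  }
  return targets;
}

int main() {
  for (uint64_t t : LevelTargets(7, 64ULL << 30, 256ULL << 20, 10.0)) {
    std::printf("%llu\n", static_cast<unsigned long long>(t));
  }
  return 0;
}
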
5025 | | |
5026 | 0 | uint64_t VersionStorageInfo::EstimateLiveDataSize() const { |
5027 | | // Estimate the live data size by adding up the size of a maximal set of |
5028 | | // sst files with no range overlap in the same or a higher level. The less |
5029 | | // compacted, the more optimistic (smaller) this estimate is. Also, |
5030 | | // for multiple sorted runs within a level, file order will matter. |
5031 | 0 | uint64_t size = 0; |
5032 | |
5033 | 0 | auto ikey_lt = [this](InternalKey* x, InternalKey* y) { |
5034 | 0 | return internal_comparator_->Compare(*x, *y) < 0; |
5035 | 0 | }; |
5036 | | // (Ordered) map of largest keys in files being included in size estimate |
5037 | 0 | std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt); |
5038 | |
5039 | 0 | for (int l = num_levels_ - 1; l >= 0; l--) { |
5040 | 0 | bool found_end = false; |
5041 | 0 | for (auto file : files_[l]) { |
5042 | | // Find the first file already included whose largest key is larger than |
5043 | | // the smallest key of `file`. If that file does not overlap with the |
5044 | | // current file, none of the files in the map does. If there is |
5045 | | // no potential overlap, we can safely insert the rest of this level |
5046 | | // (if the level is not 0) into the map without checking again because |
5047 | | // the elements in the level are sorted and non-overlapping. |
5048 | 0 | auto lb = (found_end && l != 0) ? ranges.end() |
5049 | 0 | : ranges.lower_bound(&file->smallest); |
5050 | 0 | found_end = (lb == ranges.end()); |
5051 | 0 | if (found_end || internal_comparator_->Compare( |
5052 | 0 | file->largest, (*lb).second->smallest) < 0) { |
5053 | 0 | ranges.emplace_hint(lb, &file->largest, file); |
5054 | 0 | size += file->fd.file_size; |
5055 | 0 | } |
5056 | 0 | } |
5057 | 0 | } |
5058 | | |
5059 | | // For BlobDB, the result also includes the exact value of live bytes in the |
5060 | | // blob files of the version. |
5061 | 0 | for (const auto& meta : blob_files_) { |
5062 | 0 | assert(meta); |
5063 | |
5064 | 0 | size += meta->GetTotalBlobBytes(); |
5065 | 0 | size -= meta->GetGarbageBlobBytes(); |
5066 | 0 | } |
5067 | |
5068 | 0 | return size; |
5069 | 0 | } |
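
Keying the map by each included file's largest key turns the overlap test into a single lower_bound probe. The same pattern over plain integer ranges, as a hedged sketch (Range and EstimateLive are hypothetical stand-ins for FileMetaData and the method above):

#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

struct Range { uint64_t smallest, largest, size; };  // stand-in for an SST file

// Sum the sizes of a maximal set of ranges with no pairwise overlap,
// visiting candidates in order (the code above goes from the bottom level up).
uint64_t EstimateLive(const std::vector<Range>& files) {
  std::map<uint64_t, Range> by_largest;  // largest key -> included range
  uint64_t size = 0;
  for (const Range& f : files) {
    // First included range whose largest key >= f.smallest; if even that
    // one starts after f.largest, nothing in the map overlaps f.
    auto lb = by_largest.lower_bound(f.smallest);
    if (lb == by_largest.end() || f.largest < lb->second.smallest) {
      by_largest.emplace_hint(lb, f.largest, f);
      size += f.size;
    }
  }
  return size;
}

int main() {
  std::printf("%llu\n", static_cast<unsigned long long>(EstimateLive(
      {{0, 10, 100}, {5, 20, 50}, {30, 40, 70}})));  // prints 170
  return 0;
}
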
5070 | | |
5071 | | bool VersionStorageInfo::RangeMightExistAfterSortedRun( |
5072 | | const Slice& smallest_user_key, const Slice& largest_user_key, |
5073 | 196k | int last_level, int last_l0_idx) { |
5074 | 196k | assert((last_l0_idx != -1) == (last_level == 0)); |
5075 | | // TODO(ajkr): this preserves earlier behavior where we considered an L0 file |
5076 | | // bottommost only if it's the oldest L0 file and there are no files on older |
5077 | | // levels. It'd be better to consider it bottommost if there's no overlap in |
5078 | | // older levels/files. |
5079 | 196k | if (last_level == 0 && |
5080 | 117k | last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) { |
5081 | 56.8k | return true; |
5082 | 56.8k | } |
5083 | | |
5084 | | // Checks whether there are files living beyond the `last_level`. If lower |
5085 | | // levels have files, it checks for overlap between [`smallest_key`, |
5086 | | // `largest_key`] and those files. Bottom-level optimizations can be made if |
5087 | | // there are no files in lower levels or if there is no overlap with the files |
5088 | | // in the lower levels. |
5089 | 483k | for (int level = last_level + 1; level < num_levels(); level++) { |
5090 | | // The range is not in the bottommost level if there are files in lower |
5091 | | // levels when the `last_level` is 0 or if there are files in lower levels |
5092 | | // which overlap with [`smallest_key`, `largest_key`]. |
5093 | 365k | if (files_[level].size() > 0 && |
5094 | 22.4k | (last_level == 0 || |
5095 | 22.4k | OverlapInLevel(level, &smallest_user_key, &largest_user_key))) { |
5096 | 22.4k | return true; |
5097 | 22.4k | } |
5098 | 365k | } |
5099 | 117k | return false; |
5100 | 139k | } |
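
Stripped of the L0 special case, the loop above is an interval-overlap scan against every level below last_level. A toy version over integer user keys (hedged; KeyRange and RangeMightExistBelow are hypothetical, and the real OverlapInLevel binary-searches sorted files rather than scanning linearly):

#include <cassert>
#include <vector>

struct KeyRange { int smallest, largest; };  // stand-in for a file's key range

// True if [smallest, largest] might overlap data in any level below
// last_level, i.e. the range cannot be treated as bottommost.
bool RangeMightExistBelow(const std::vector<std::vector<KeyRange>>& levels,
                          int last_level, int smallest, int largest) {
  for (size_t level = static_cast<size_t>(last_level) + 1;
       level < levels.size(); level++) {
    for (const KeyRange& f : levels[level]) {
      if (f.smallest <= largest && smallest <= f.largest) {
        return true;  // closed-interval overlap test
      }
    }
  }
  return false;
}

int main() {
  std::vector<std::vector<KeyRange>> levels = {{}, {{10, 20}}, {{50, 60}}};
  assert(RangeMightExistBelow(levels, 0, 15, 30));   // overlaps L1's [10,20]
  assert(!RangeMightExistBelow(levels, 1, 25, 40));  // clear of L2's [50,60]
  return 0;
}
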
5101 | | |
5102 | | Env::WriteLifeTimeHint VersionStorageInfo::CalculateSSTWriteHint( |
5103 | 22.3k | int level, CompactionStyleSet compaction_style_set) const { |
5104 | 22.3k | if (!compaction_style_set.Contains(compaction_style_)) { |
5105 | 0 | return Env::WLTH_NOT_SET; |
5106 | 0 | } |
5107 | | |
5108 | 22.3k | switch (compaction_style_) { |
5109 | 22.3k | case kCompactionStyleLevel: |
5110 | 22.3k | if (level == 0) { |
5111 | 19.3k | return Env::WLTH_MEDIUM; |
5112 | 19.3k | } |
5113 | | |
5114 | | // L1: medium, L2: long, ... |
5115 | 3.00k | if (level - base_level_ >= 2) { |
5116 | 0 | return Env::WLTH_EXTREME; |
5117 | 3.00k | } else if (level < base_level_) { |
5118 | | // There is no restriction that prevents the level passed in from being |
5119 | | // smaller than base_level. |
5120 | 0 | return Env::WLTH_MEDIUM; |
5121 | 0 | } |
5122 | 3.00k | return static_cast<Env::WriteLifeTimeHint>( |
5123 | 3.00k | level - base_level_ + static_cast<int>(Env::WLTH_MEDIUM)); |
5124 | 0 | case kCompactionStyleUniversal: |
5125 | 0 | if (level == 0) { |
5126 | 0 | return Env::WLTH_SHORT; |
5127 | 0 | } |
5128 | 0 | if (level == 1) { |
5129 | 0 | return Env::WLTH_MEDIUM; |
5130 | 0 | } |
5131 | 0 | return Env::WLTH_LONG; |
5132 | 0 | default: |
5133 | 0 | return Env::WLTH_NOT_SET; |
5134 | 22.3k | } |
5135 | 22.3k | } |
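
For leveled compaction the hint scales with distance from the base level: data compacted further down is rewritten less often, so it can be marked longer-lived. A condensed sketch of that mapping (WriteHint and SstWriteHint are hypothetical names mirroring Env::WriteLifeTimeHint):

#include <cassert>

enum class WriteHint { kNotSet, kShort, kMedium, kLong, kExtreme };

// L0 and any level above the base level are medium-lived; each level past
// the base level is expected to live longer before being rewritten.
WriteHint SstWriteHint(int level, int base_level) {
  if (level == 0 || level < base_level) return WriteHint::kMedium;
  if (level - base_level >= 2) return WriteHint::kExtreme;
  return level == base_level ? WriteHint::kMedium : WriteHint::kLong;
}

int main() {
  assert(SstWriteHint(0, 1) == WriteHint::kMedium);
  assert(SstWriteHint(2, 1) == WriteHint::kLong);
  assert(SstWriteHint(4, 1) == WriteHint::kExtreme);
  return 0;
}
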
5136 | | |
5137 | | void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files, |
5138 | 130k | std::vector<uint64_t>* live_blob_files) const { |
5139 | 130k | assert(live_table_files); |
5140 | 130k | assert(live_blob_files); |
5141 | | |
5142 | 1.04M | for (int level = 0; level < storage_info_.num_levels(); ++level) { |
5143 | 916k | const auto& level_files = storage_info_.LevelFiles(level); |
5144 | 916k | for (const auto& meta : level_files) { |
5145 | 184k | assert(meta); |
5146 | | |
5147 | 184k | live_table_files->emplace_back(meta->fd.GetNumber()); |
5148 | 184k | } |
5149 | 916k | } |
5150 | | |
5151 | 130k | const auto& blob_files = storage_info_.GetBlobFiles(); |
5152 | 130k | for (const auto& meta : blob_files) { |
5153 | 0 | assert(meta); |
5154 | |
5155 | 0 | live_blob_files->emplace_back(meta->GetBlobFileNumber()); |
5156 | 0 | } |
5157 | 130k | } |
5158 | | |
5159 | | void Version::RemoveLiveFiles( |
5160 | | std::vector<ObsoleteFileInfo>& sst_delete_candidates, |
5161 | 26.8k | std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const { |
5162 | 26.8k | for (ObsoleteFileInfo& fi : sst_delete_candidates) { |
5163 | 15.1k | if (!fi.only_delete_metadata && |
5164 | 13.9k | storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) != |
5165 | 13.9k | VersionStorageInfo::FileLocation::Invalid()) { |
5166 | 5.71k | fi.only_delete_metadata = true; |
5167 | 5.71k | } |
5168 | 15.1k | } |
5169 | | |
5170 | 26.8k | blob_delete_candidates.erase( |
5171 | 26.8k | std::remove_if( |
5172 | 26.8k | blob_delete_candidates.begin(), blob_delete_candidates.end(), |
5173 | 26.8k | [this](ObsoleteBlobFileInfo& x) { |
5174 | 0 | return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber()); |
5175 | 0 | }), |
5176 | 26.8k | blob_delete_candidates.end()); |
5177 | 26.8k | } |
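
Blob candidates still referenced by this version are dropped with the erase/remove_if idiom, one linear pass instead of repeated single-element erases. The idiom in isolation (hedged; the predicate stands in for the GetBlobFileMetaData lookup above):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> candidates = {1, 2, 3, 4, 5, 6};
  // remove_if compacts the kept elements to the front and returns the new
  // logical end; erase then drops the tail in one shot.
  candidates.erase(
      std::remove_if(candidates.begin(), candidates.end(),
                     [](int n) { return n % 2 == 0; }),  // "still live" test
      candidates.end());
  assert((candidates == std::vector<int>{1, 3, 5}));
  return 0;
}
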
5178 | | |
5179 | 0 | std::string Version::DebugString(bool hex, bool print_stats) const { |
5180 | 0 | std::string r; |
5181 | 0 | for (int level = 0; level < storage_info_.num_levels_; level++) { |
5182 | | // E.g., |
5183 | | // --- level 1 --- |
5184 | | // 17:123[1 .. 124]['a' .. 'd'] |
5185 | | // 20:43[124 .. 128]['e' .. 'g'] |
5186 | | // |
5187 | | // if print_stats=true: |
5188 | | // 17:123[1 .. 124]['a' .. 'd'](4096) |
5189 | 0 | r.append("--- level "); |
5190 | 0 | AppendNumberTo(&r, level); |
5191 | 0 | r.append(" --- version# "); |
5192 | 0 | AppendNumberTo(&r, version_number_); |
5193 | 0 | if (storage_info_.compact_cursor_[level].Valid()) { |
5194 | 0 | r.append(" --- compact_cursor: "); |
5195 | 0 | r.append(storage_info_.compact_cursor_[level].DebugString(hex)); |
5196 | 0 | } |
5197 | 0 | r.append(" ---\n"); |
5198 | 0 | const std::vector<FileMetaData*>& files = storage_info_.files_[level]; |
5199 | 0 | for (size_t i = 0; i < files.size(); i++) { |
5200 | 0 | r.push_back(' '); |
5201 | 0 | AppendNumberTo(&r, files[i]->fd.GetNumber()); |
5202 | 0 | r.push_back(':'); |
5203 | 0 | AppendNumberTo(&r, files[i]->fd.GetFileSize()); |
5204 | 0 | r.append("["); |
5205 | 0 | AppendNumberTo(&r, files[i]->fd.smallest_seqno); |
5206 | 0 | r.append(" .. "); |
5207 | 0 | AppendNumberTo(&r, files[i]->fd.largest_seqno); |
5208 | 0 | r.append("]"); |
5209 | 0 | r.append("["); |
5210 | 0 | r.append(files[i]->smallest.DebugString(hex)); |
5211 | 0 | r.append(" .. "); |
5212 | 0 | r.append(files[i]->largest.DebugString(hex)); |
5213 | 0 | r.append("]"); |
5214 | 0 | if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) { |
5215 | 0 | r.append(" blob_file:"); |
5216 | 0 | AppendNumberTo(&r, files[i]->oldest_blob_file_number); |
5217 | 0 | } |
5218 | 0 | if (print_stats) { |
5219 | 0 | r.append("("); |
5220 | 0 | r.append(std::to_string( |
5221 | 0 | files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed))); |
5222 | 0 | r.append(")"); |
5223 | 0 | } |
5224 | 0 | r.append("\n"); |
5225 | 0 | } |
5226 | 0 | } |
5227 | |
5228 | 0 | const auto& blob_files = storage_info_.GetBlobFiles(); |
5229 | 0 | if (!blob_files.empty()) { |
5230 | 0 | r.append("--- blob files --- version# "); |
5231 | 0 | AppendNumberTo(&r, version_number_); |
5232 | 0 | r.append(" ---\n"); |
5233 | 0 | for (const auto& blob_file_meta : blob_files) { |
5234 | 0 | assert(blob_file_meta); |
5235 | |
5236 | 0 | r.append(blob_file_meta->DebugString()); |
5237 | 0 | r.push_back('\n'); |
5238 | 0 | } |
5239 | 0 | } |
5240 | |
5241 | 0 | return r; |
5242 | 0 | } |
5243 | | |
5244 | | // This is used to batch writes to the MANIFEST file. |
5245 | | struct VersionSet::ManifestWriter { |
5246 | | Status status; |
5247 | | bool done; |
5248 | | InstrumentedCondVar cv; |
5249 | | ColumnFamilyData* cfd; |
5250 | | const autovector<VersionEdit*>& edit_list; |
5251 | | const std::function<void(const Status&)> manifest_write_callback; |
5252 | | |
5253 | | explicit ManifestWriter( |
5254 | | InstrumentedMutex* mu, ColumnFamilyData* _cfd, |
5255 | | const autovector<VersionEdit*>& e, |
5256 | | const std::function<void(const Status&)>& manifest_wcb) |
5257 | 106k | : done(false), |
5258 | 106k | cv(mu), |
5259 | 106k | cfd(_cfd), |
5260 | 106k | edit_list(e), |
5261 | 106k | manifest_write_callback(manifest_wcb) {} |
5262 | 106k | ~ManifestWriter() { status.PermitUncheckedError(); } |
5263 | | |
5264 | 72.5k | bool IsAllWalEdits() const { |
5265 | 72.5k | bool all_wal_edits = true; |
5266 | 72.5k | for (const auto& e : edit_list) { |
5267 | 72.5k | if (!e->IsWalManipulation()) { |
5268 | 72.5k | all_wal_edits = false; |
5269 | 72.5k | break; |
5270 | 72.5k | } |
5271 | 72.5k | } |
5272 | 72.5k | return all_wal_edits; |
5273 | 72.5k | } |
5274 | | }; |
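
ManifestWriter is one node in a leader/follower queue: each writer waits until it is either done or at the front, and the front writer commits the whole batch before signaling the rest. A self-contained sketch of that shape (hedged; Writer, ManifestQueue and Commit are hypothetical, one condition variable is shared instead of one per writer, and the real code releases the DB mutex for the actual file write):

#include <condition_variable>
#include <deque>
#include <mutex>
#include <string>

struct Writer {
  std::string edit;
  bool done = false;
};

class ManifestQueue {
 public:
  // Returns once this writer's edit has been committed by some leader.
  void Commit(Writer* w) {
    std::unique_lock<std::mutex> lock(mu_);
    queue_.push_back(w);
    cv_.wait(lock, [&] { return w->done || queue_.front() == w; });
    if (w->done) return;  // a leader already committed our edit
    // We are the leader: commit every queued edit as one batch.
    for (Writer* q : queue_) {
      // ... append q->edit to the MANIFEST log, then sync once ...
      q->done = true;
    }
    queue_.clear();
    cv_.notify_all();  // wake the followers (and any newly queued leader)
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<Writer*> queue_;
};

Batching this way lets one sync cover every queued edit, which is the same reason ProcessManifestWrites below groups non-CF-manipulation edits.
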
5275 | | |
5276 | 381k | Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { |
5277 | 381k | assert(edit); |
5278 | 381k | if (edit->IsInAtomicGroup()) { |
5279 | 0 | TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); |
5280 | 0 | if (replay_buffer_.empty()) { |
5281 | 0 | replay_buffer_.resize(edit->GetRemainingEntries() + 1); |
5282 | 0 | TEST_SYNC_POINT_CALLBACK( |
5283 | 0 | "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); |
5284 | 0 | } |
5285 | 0 | read_edits_in_atomic_group_++; |
5286 | 0 | if (read_edits_in_atomic_group_ + edit->GetRemainingEntries() != |
5287 | 0 | static_cast<uint32_t>(replay_buffer_.size())) { |
5288 | 0 | TEST_SYNC_POINT_CALLBACK( |
5289 | 0 | "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); |
5290 | 0 | return Status::Corruption("corrupted atomic group"); |
5291 | 0 | } |
5292 | 0 | replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit; |
5293 | 0 | if (read_edits_in_atomic_group_ == replay_buffer_.size()) { |
5294 | 0 | TEST_SYNC_POINT_CALLBACK( |
5295 | 0 | "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); |
5296 | 0 | return Status::OK(); |
5297 | 0 | } |
5298 | 0 | return Status::OK(); |
5299 | 0 | } |
5300 | | |
5301 | | // A normal edit. |
5302 | 381k | if (!replay_buffer().empty()) { |
5303 | 0 | TEST_SYNC_POINT_CALLBACK( |
5304 | 0 | "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit); |
5305 | 0 | return Status::Corruption("corrupted atomic group"); |
5306 | 0 | } |
5307 | 381k | return Status::OK(); |
5308 | 381k | } |
5309 | | |
5310 | 0 | bool AtomicGroupReadBuffer::IsFull() const { |
5311 | 0 | return read_edits_in_atomic_group_ == replay_buffer_.size(); |
5312 | 0 | } |
5313 | | |
5314 | 0 | bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); } |
5315 | | |
5316 | 0 | void AtomicGroupReadBuffer::Clear() { |
5317 | 0 | read_edits_in_atomic_group_ = 0; |
5318 | 0 | replay_buffer_.clear(); |
5319 | 0 | } |
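
Every edit in an atomic group records how many entries remain after it, so the buffer can size itself from the first edit and cross-check each arrival. A trimmed sketch of the same accounting (Edit and GroupBuffer are hypothetical stand-ins):

#include <cstdio>
#include <vector>

struct Edit { bool in_group; unsigned remaining; };

class GroupBuffer {
 public:
  // Returns false on a corrupted group; sets *complete once the last
  // member of the group has been buffered.
  bool Add(const Edit& e, bool* complete) {
    *complete = false;
    if (!e.in_group) return buffered_.empty();  // no mixing with a group
    if (buffered_.empty()) expected_ = e.remaining + 1;
    buffered_.push_back(e);
    if (buffered_.size() + e.remaining != expected_) return false;
    *complete = (buffered_.size() == expected_);
    return true;
  }

 private:
  std::vector<Edit> buffered_;
  unsigned expected_ = 0;
};

int main() {
  GroupBuffer buf;
  bool complete = false;
  bool ok = buf.Add({true, 1}, &complete) && !complete;  // first of two
  ok = ok && buf.Add({true, 0}, &complete) && complete;  // group closes
  std::printf(ok ? "group ok\n" : "corrupted\n");
  return 0;
}
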
5320 | | |
5321 | | VersionSet::VersionSet( |
5322 | | const std::string& dbname, const ImmutableDBOptions* _db_options, |
5323 | | const FileOptions& storage_options, Cache* table_cache, |
5324 | | WriteBufferManager* write_buffer_manager, WriteController* write_controller, |
5325 | | BlockCacheTracer* const block_cache_tracer, |
5326 | | const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id, |
5327 | | const std::string& db_session_id, const std::string& daily_offpeak_time_utc, |
5328 | | ErrorHandler* error_handler, bool unchanging) |
5329 | 48.4k | : column_family_set_(new ColumnFamilySet( |
5330 | 48.4k | dbname, _db_options, storage_options, table_cache, |
5331 | 48.4k | write_buffer_manager, write_controller, block_cache_tracer, io_tracer, |
5332 | 48.4k | db_id, db_session_id)), |
5333 | 48.4k | table_cache_(table_cache), |
5334 | 48.4k | env_(_db_options->env), |
5335 | 48.4k | fs_(_db_options->fs, io_tracer), |
5336 | 48.4k | clock_(_db_options->clock), |
5337 | 48.4k | dbname_(dbname), |
5338 | 48.4k | db_id_(db_id), |
5339 | 48.4k | db_options_(_db_options), |
5340 | 48.4k | next_file_number_(2), |
5341 | 48.4k | manifest_file_number_(0), // Filled by Recover() |
5342 | 48.4k | options_file_number_(0), |
5343 | 48.4k | options_file_size_(0), |
5344 | 48.4k | pending_manifest_file_number_(0), |
5345 | 48.4k | last_sequence_(0), |
5346 | 48.4k | last_allocated_sequence_(0), |
5347 | 48.4k | last_published_sequence_(0), |
5348 | 48.4k | prev_log_number_(0), |
5349 | 48.4k | current_version_number_(0), |
5350 | 48.4k | manifest_file_size_(0), |
5351 | 48.4k | file_options_(storage_options), |
5352 | 48.4k | block_cache_tracer_(block_cache_tracer), |
5353 | 48.4k | io_tracer_(io_tracer), |
5354 | 48.4k | db_session_id_(db_session_id), |
5355 | 48.4k | offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)), |
5356 | 48.4k | error_handler_(error_handler), |
5357 | 48.4k | unchanging_(unchanging), |
5358 | 48.4k | closed_(false) {} |
5359 | | |
5360 | 48.4k | Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) { |
5361 | 48.4k | Status s; |
5362 | 48.4k | if (closed_ || unchanging_ || !manifest_file_number_ || !descriptor_log_) { |
5363 | 0 | return s; |
5364 | 0 | } |
5365 | | |
5366 | 48.4k | std::string manifest_file_name = |
5367 | 48.4k | DescriptorFileName(dbname_, manifest_file_number_); |
5368 | 48.4k | uint64_t size = 0; |
5369 | 48.4k | IOStatus io_s = descriptor_log_->Close(WriteOptions()); |
5370 | 48.4k | descriptor_log_.reset(); |
5371 | 48.4k | TEST_SYNC_POINT("VersionSet::Close:AfterClose"); |
5372 | 48.4k | if (io_s.ok()) { |
5373 | 48.4k | io_s = fs_->GetFileSize(manifest_file_name, IOOptions(), &size, nullptr); |
5374 | 48.4k | } |
5375 | 48.4k | if (!io_s.ok() || size != manifest_file_size_) { |
5376 | 0 | if (io_s.ok()) { |
5377 | | // This means the size is not as expected. So we treat it as a |
5378 | | // corruption and set io_s appropriately |
5379 | 0 | io_s = IOStatus::Corruption(); |
5380 | 0 | } |
5381 | 0 | ColumnFamilyData* cfd = GetColumnFamilySet()->GetDefault(); |
5382 | 0 | IOErrorInfo io_error_info(io_s, FileOperationType::kVerify, |
5383 | 0 | manifest_file_name, /*length=*/size, |
5384 | 0 | /*offset=*/0); |
5385 | |
5386 | 0 | for (auto& listener : cfd->ioptions().listeners) { |
5387 | 0 | listener->OnIOError(io_error_info); |
5388 | 0 | } |
5389 | 0 | io_s.PermitUncheckedError(); |
5390 | 0 | io_error_info.io_status.PermitUncheckedError(); |
5391 | 0 | ROCKS_LOG_ERROR(db_options_->info_log, |
5392 | 0 | "MANIFEST verification on Close, " |
5393 | 0 | "filename %s, expected size %" PRIu64 |
5394 | 0 | " failed with status %s and " |
5395 | 0 | "actual size %" PRIu64 "\n", |
5396 | 0 | manifest_file_name.c_str(), manifest_file_size_, |
5397 | 0 | io_s.ToString().c_str(), size); |
5398 | 0 | VersionEdit edit; |
5399 | 0 | assert(cfd); |
5400 | 0 | s = LogAndApply(cfd, ReadOptions(), WriteOptions(), &edit, mu, db_dir); |
5401 | 0 | } |
5402 | | |
5403 | 48.4k | closed_ = true; |
5404 | 48.4k | return s; |
5405 | 48.4k | } |
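
Close treats a size mismatch as corruption: the tracked manifest_file_size_ must equal what the filesystem reports after the writer is closed. The check in isolation, using std::filesystem as a stand-in for the FileSystem API (hedged sketch; VerifyManifestSize is a hypothetical name):

#include <cstdint>
#include <filesystem>
#include <system_error>

// True when the on-disk size matches what we believe we wrote; a mismatch
// after a clean close is treated as corruption in the code above.
bool VerifyManifestSize(const std::filesystem::path& manifest,
                        uint64_t expected_size) {
  std::error_code ec;
  const uint64_t actual = std::filesystem::file_size(manifest, ec);
  return !ec && actual == expected_size;
}
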
5406 | | |
5407 | 48.4k | VersionSet::~VersionSet() { |
5408 | | // Must clean up column families to make all files "obsolete" |
5409 | 48.4k | column_family_set_.reset(); |
5410 | | |
5411 | 90.3k | for (auto& file : obsolete_files_) { |
5412 | | // NOTE: DB is shutting down, so file is probably not obsolete, just |
5413 | | // no longer referenced by Versions in memory. |
5414 | | // For more context, see comment on "table_cache_->EraseUnRefEntries()" |
5415 | | // in DBImpl::CloseHelper(). |
5416 | | // Using uncache_aggressiveness=0 overrides any previous marking to |
5417 | | // attempt to uncache the file's blocks (which after cleaning up |
5418 | | // column families could cause use-after-free) |
5419 | 90.3k | TableCache::ReleaseObsolete(table_cache_, file.metadata->fd.GetNumber(), |
5420 | 90.3k | file.metadata->table_reader_handle, |
5421 | 90.3k | /*uncache_aggressiveness=*/0); |
5422 | 90.3k | file.DeleteMetadata(); |
5423 | 90.3k | } |
5424 | 48.4k | obsolete_files_.clear(); |
5425 | 48.4k | io_status_.PermitUncheckedError(); |
5426 | 48.4k | } |
5427 | | |
5428 | 0 | void VersionSet::Reset() { |
5429 | 0 | if (column_family_set_) { |
5430 | 0 | WriteBufferManager* wbm = column_family_set_->write_buffer_manager(); |
5431 | 0 | WriteController* wc = column_family_set_->write_controller(); |
5432 | | // db_id becomes the source of truth after DBImpl::Recover(): |
5433 | | // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527 |
5434 | | // Note: we may not be able to recover db_id from MANIFEST if |
5435 | | // options.write_dbid_to_manifest is false (default). |
5436 | 0 | column_family_set_.reset(new ColumnFamilySet( |
5437 | 0 | dbname_, db_options_, file_options_, table_cache_, wbm, wc, |
5438 | 0 | block_cache_tracer_, io_tracer_, db_id_, db_session_id_)); |
5439 | 0 | } |
5440 | 0 | db_id_.clear(); |
5441 | 0 | next_file_number_.store(2); |
5442 | 0 | min_log_number_to_keep_.store(0); |
5443 | 0 | manifest_file_number_ = 0; |
5444 | 0 | options_file_number_ = 0; |
5445 | 0 | pending_manifest_file_number_ = 0; |
5446 | 0 | last_sequence_.store(0); |
5447 | 0 | last_allocated_sequence_.store(0); |
5448 | 0 | last_published_sequence_.store(0); |
5449 | 0 | prev_log_number_ = 0; |
5450 | 0 | descriptor_log_.reset(); |
5451 | 0 | current_version_number_ = 0; |
5452 | 0 | manifest_writers_.clear(); |
5453 | 0 | manifest_file_size_ = 0; |
5454 | 0 | obsolete_files_.clear(); |
5455 | 0 | obsolete_manifests_.clear(); |
5456 | 0 | wals_.Reset(); |
5457 | 0 | } |
5458 | | |
5459 | | void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, |
5460 | 233k | Version* v) { |
5461 | | // compute new compaction score |
5462 | 233k | v->storage_info()->ComputeCompactionScore( |
5463 | 233k | column_family_data->ioptions(), |
5464 | 233k | column_family_data->GetLatestMutableCFOptions()); |
5465 | | |
5466 | | // Mark v finalized |
5467 | 233k | v->storage_info_.SetFinalized(); |
5468 | | |
5469 | | // Make "v" current |
5470 | 233k | assert(v->refs_ == 0); |
5471 | 233k | Version* current = column_family_data->current(); |
5472 | 233k | assert(v != current); |
5473 | 233k | if (current != nullptr) { |
5474 | 138k | assert(current->refs_ > 0); |
5475 | 138k | current->Unref(); |
5476 | 138k | } |
5477 | 233k | column_family_data->SetCurrent(v); |
5478 | 233k | v->Ref(); |
5479 | | |
5480 | | // Append to linked list |
5481 | 233k | v->prev_ = column_family_data->dummy_versions()->prev_; |
5482 | 233k | v->next_ = column_family_data->dummy_versions(); |
5483 | 233k | v->prev_->next_ = v; |
5484 | 233k | v->next_->prev_ = v; |
5485 | 233k | } |
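
AppendVersion splices the new Version into a circular doubly-linked list just before the per-CF dummy head, keeping versions in append order with O(1) insertion. The splice on its own (hedged sketch; reference counting elided):

#include <cassert>

struct Node {
  Node* prev;
  Node* next;
  Node() : prev(this), next(this) {}  // a dummy head links to itself
};

// Insert v at the tail, i.e. immediately before the dummy head.
void AppendBefore(Node* dummy, Node* v) {
  v->prev = dummy->prev;
  v->next = dummy;
  v->prev->next = v;
  v->next->prev = v;
}

int main() {
  Node dummy, a, b;
  AppendBefore(&dummy, &a);
  AppendBefore(&dummy, &b);
  assert(dummy.next == &a && a.next == &b && b.next == &dummy);
  return 0;
}
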
5486 | | |
5487 | | Status VersionSet::ProcessManifestWrites( |
5488 | | std::deque<ManifestWriter>& writers, InstrumentedMutex* mu, |
5489 | | FSDirectory* dir_contains_current_file, bool new_descriptor_log, |
5490 | | const ColumnFamilyOptions* new_cf_options, const ReadOptions& read_options, |
5491 | 89.5k | const WriteOptions& write_options) { |
5492 | 89.5k | mu->AssertHeld(); |
5493 | 89.5k | assert(!writers.empty()); |
5494 | 89.5k | ManifestWriter& first_writer = writers.front(); |
5495 | 89.5k | ManifestWriter* last_writer = &first_writer; |
5496 | | |
5497 | 89.5k | assert(!manifest_writers_.empty()); |
5498 | 89.5k | assert(manifest_writers_.front() == &first_writer); |
5499 | | |
5500 | 89.5k | autovector<VersionEdit*> batch_edits; |
5501 | | // This vector keeps track of the corresponding user-defined timestamp size |
5502 | | // for `batch_edits` side by side, which is only needed for encoding a |
5503 | | // `VersionEdit` that adds new SST files. |
5504 | | // Note that anytime `batch_edits` has a new element added or an existing |
5505 | | // element removed, `batch_edits_ts_sz` should be updated too. |
5506 | 89.5k | autovector<std::optional<size_t>> batch_edits_ts_sz; |
5507 | 89.5k | autovector<Version*> versions; |
5508 | 89.5k | std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards; |
5509 | 89.5k | autovector<const autovector<uint64_t>*> files_to_quarantine_if_commit_fail; |
5510 | 89.5k | autovector<uint64_t> limbo_descriptor_log_file_number; |
5511 | | |
5512 | | // Tracking `max_last_sequence` is needed to ensure we write |
5513 | | // `VersionEdit::last_sequence_`s in non-decreasing order according to the |
5514 | | // recovery code's requirement. It also allows us to defer updating |
5515 | | // `descriptor_last_sequence_` until the apply phase, after the log phase |
5516 | | // succeeds. |
5517 | 89.5k | SequenceNumber max_last_sequence = descriptor_last_sequence_; |
5518 | | |
5519 | 89.5k | bool skip_manifest_write = |
5520 | 89.5k | first_writer.edit_list.front()->IsNoManifestWriteDummy(); |
5521 | | |
5522 | 89.5k | if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) { |
5523 | | // No group commits for column family add or drop |
5524 | 34.0k | LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence); |
5525 | 34.0k | batch_edits.push_back(first_writer.edit_list.front()); |
5526 | 34.0k | batch_edits_ts_sz.push_back(std::nullopt); |
5527 | 55.5k | } else { |
5528 | 55.5k | auto it = manifest_writers_.cbegin(); |
5529 | 55.5k | size_t group_start = std::numeric_limits<size_t>::max(); |
5530 | 72.5k | for (;;) { |
5531 | 72.5k | assert(!(*it)->edit_list.front()->IsColumnFamilyManipulation()); |
5532 | 72.5k | last_writer = *it; |
5533 | 72.5k | assert(last_writer != nullptr); |
5534 | 72.5k | assert(last_writer->cfd != nullptr); |
5535 | 72.5k | if (last_writer->cfd->IsDropped()) { |
5536 | | // If we detect a dropped CF at this point, and the corresponding |
5537 | | // version edits belong to an atomic group, then we need to find out |
5538 | | // the preceding version edits in the same atomic group, and update |
5539 | | // their `remaining_entries_` member variable because we are NOT going |
5540 | | // to write the version edits of the dropped CF to the MANIFEST. If we |
5541 | | // don't update, then Recover can report corrupted atomic group because |
5542 | | // the `remaining_entries_` do not match. |
5543 | 0 | if (!batch_edits.empty()) { |
5544 | 0 | if (batch_edits.back()->IsInAtomicGroup() && |
5545 | 0 | batch_edits.back()->GetRemainingEntries() > 0) { |
5546 | 0 | assert(group_start < batch_edits.size()); |
5547 | 0 | const auto& edit_list = last_writer->edit_list; |
5548 | 0 | size_t k = 0; |
5549 | 0 | while (k < edit_list.size()) { |
5550 | 0 | if (!edit_list[k]->IsInAtomicGroup()) { |
5551 | 0 | break; |
5552 | 0 | } else if (edit_list[k]->GetRemainingEntries() == 0) { |
5553 | 0 | ++k; |
5554 | 0 | break; |
5555 | 0 | } |
5556 | 0 | ++k; |
5557 | 0 | } |
5558 | 0 | for (auto i = group_start; i < batch_edits.size(); ++i) { |
5559 | 0 | assert(static_cast<uint32_t>(k) <= |
5560 | 0 | batch_edits.back()->GetRemainingEntries()); |
5561 | 0 | batch_edits[i]->SetRemainingEntries( |
5562 | 0 | batch_edits[i]->GetRemainingEntries() - |
5563 | 0 | static_cast<uint32_t>(k)); |
5564 | 0 | } |
5565 | 0 | } |
5566 | 0 | } |
5567 | 72.5k | } else { |
5568 | | // We do a linear search on versions because versions is small. |
5569 | | // TODO(yanqin) maybe consider unordered_map |
5570 | 72.5k | Version* version = nullptr; |
5571 | 72.5k | VersionBuilder* builder = nullptr; |
5572 | 89.5k | for (int i = 0; i != static_cast<int>(versions.size()); ++i) { |
5573 | 17.0k | uint32_t cf_id = last_writer->cfd->GetID(); |
5574 | 17.0k | if (versions[i]->cfd()->GetID() == cf_id) { |
5575 | 0 | version = versions[i]; |
5576 | 0 | assert(!builder_guards.empty() && |
5577 | 0 | builder_guards.size() == versions.size()); |
5578 | 0 | builder = builder_guards[i]->version_builder(); |
5579 | 0 | TEST_SYNC_POINT_CALLBACK( |
5580 | 0 | "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id); |
5581 | 0 | break; |
5582 | 0 | } |
5583 | 17.0k | } |
5584 | 72.5k | if (version == nullptr) { |
5585 | | // WAL manipulations do not need to be applied to versions. |
5586 | 72.5k | if (!last_writer->IsAllWalEdits()) { |
5587 | 72.5k | version = new Version( |
5588 | 72.5k | last_writer->cfd, this, file_options_, |
5589 | 72.5k | last_writer->cfd ? last_writer->cfd->GetLatestMutableCFOptions() |
5590 | 72.5k | : MutableCFOptions(*new_cf_options), |
5591 | 72.5k | io_tracer_, current_version_number_++); |
5592 | 72.5k | versions.push_back(version); |
5593 | 72.5k | builder_guards.emplace_back( |
5594 | 72.5k | new BaseReferencedVersionBuilder(last_writer->cfd)); |
5595 | 72.5k | builder = builder_guards.back()->version_builder(); |
5596 | 72.5k | } |
5597 | 72.5k | assert(last_writer->IsAllWalEdits() || builder); |
5598 | 72.5k | assert(last_writer->IsAllWalEdits() || version); |
5599 | 72.5k | TEST_SYNC_POINT_CALLBACK( |
5600 | 72.5k | "VersionSet::ProcessManifestWrites:NewVersion", version); |
5601 | 72.5k | } |
5602 | 72.5k | const Comparator* ucmp = last_writer->cfd->user_comparator(); |
5603 | 72.5k | assert(ucmp); |
5604 | 72.5k | std::optional<size_t> edit_ts_sz = ucmp->timestamp_size(); |
5605 | 191k | for (const auto& e : last_writer->edit_list) { |
5606 | 191k | if (e->IsInAtomicGroup()) { |
5607 | 0 | if (batch_edits.empty() || !batch_edits.back()->IsInAtomicGroup() || |
5608 | 0 | (batch_edits.back()->IsInAtomicGroup() && |
5609 | 0 | batch_edits.back()->GetRemainingEntries() == 0)) { |
5610 | 0 | group_start = batch_edits.size(); |
5611 | 0 | } |
5612 | 191k | } else if (group_start != std::numeric_limits<size_t>::max()) { |
5613 | 0 | group_start = std::numeric_limits<size_t>::max(); |
5614 | 0 | } |
5615 | 191k | Status s = LogAndApplyHelper(last_writer->cfd, builder, e, |
5616 | 191k | &max_last_sequence, mu); |
5617 | 191k | if (!s.ok()) { |
5618 | | // free up the allocated memory |
5619 | 0 | for (auto v : versions) { |
5620 | 0 | delete v; |
5621 | 0 | } |
5622 | | // FIXME? manifest_writers_ still has requested updates |
5623 | 0 | return s; |
5624 | 0 | } |
5625 | 191k | batch_edits.push_back(e); |
5626 | 191k | batch_edits_ts_sz.push_back(edit_ts_sz); |
5627 | 191k | } |
5628 | 72.5k | } |
5629 | | // Loop increment/conditions |
5630 | 72.5k | ++it; |
5631 | 72.5k | if (it == manifest_writers_.cend()) { |
5632 | 55.5k | break; |
5633 | 55.5k | } |
5634 | 17.0k | if (skip_manifest_write) { |
5635 | | // no grouping when skipping manifest write |
5636 | 0 | break; |
5637 | 0 | } |
5638 | 17.0k | const auto* next = (*it)->edit_list.front(); |
5639 | 17.0k | if (next->IsColumnFamilyManipulation() || |
5640 | 17.0k | next->IsNoManifestWriteDummy()) { |
5641 | | // no group commits for column family add or drop |
5642 | | // nor for dummy skipping manifest write |
5643 | 0 | break; |
5644 | 0 | } |
5645 | 17.0k | } |
5646 | 128k | for (int i = 0; i < static_cast<int>(versions.size()); ++i) { |
5647 | 72.5k | assert(!builder_guards.empty() && |
5648 | 72.5k | builder_guards.size() == versions.size()); |
5649 | 72.5k | auto* builder = builder_guards[i]->version_builder(); |
5650 | 72.5k | Status s = builder->SaveTo(versions[i]->storage_info()); |
5651 | 72.5k | if (!s.ok()) { |
5652 | | // free up the allocated memory |
5653 | 0 | for (auto v : versions) { |
5654 | 0 | delete v; |
5655 | 0 | } |
5656 | | // FIXME? manifest_writers_ still has requested updates |
5657 | 0 | return s; |
5658 | 0 | } |
5659 | 72.5k | } |
5660 | 55.5k | } |
5661 | | |
5662 | | #ifndef NDEBUG |
5663 | | // Verify that version edits of atomic groups have correct |
5664 | | // remaining_entries_. |
5665 | | size_t k = 0; |
5666 | | while (k < batch_edits.size()) { |
5667 | | while (k < batch_edits.size() && !batch_edits[k]->IsInAtomicGroup()) { |
5668 | | ++k; |
5669 | | } |
5670 | | if (k == batch_edits.size()) { |
5671 | | break; |
5672 | | } |
5673 | | size_t i = k; |
5674 | | while (i < batch_edits.size()) { |
5675 | | if (!batch_edits[i]->IsInAtomicGroup()) { |
5676 | | break; |
5677 | | } |
5678 | | assert(i - k + batch_edits[i]->GetRemainingEntries() == |
5679 | | batch_edits[k]->GetRemainingEntries()); |
5680 | | if (batch_edits[i]->GetRemainingEntries() == 0) { |
5681 | | ++i; |
5682 | | break; |
5683 | | } |
5684 | | ++i; |
5685 | | } |
5686 | | assert(batch_edits[i - 1]->IsInAtomicGroup()); |
5687 | | assert(0 == batch_edits[i - 1]->GetRemainingEntries()); |
5688 | | std::vector<VersionEdit*> tmp; |
5689 | | for (size_t j = k; j != i; ++j) { |
5690 | | tmp.emplace_back(batch_edits[j]); |
5691 | | } |
5692 | | TEST_SYNC_POINT_CALLBACK( |
5693 | | "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp); |
5694 | | k = i; |
5695 | | } |
5696 | | if (skip_manifest_write) { |
5697 | | // no grouping when skipping manifest write |
5698 | | assert(last_writer == &first_writer); |
5699 | | } |
5700 | | #endif // NDEBUG |
5701 | | |
5702 | 89.5k | assert(pending_manifest_file_number_ == 0); |
5703 | 89.5k | if (!skip_manifest_write && |
5704 | 89.5k | (!descriptor_log_ || |
5705 | 48.4k | manifest_file_size_ > db_options_->max_manifest_file_size)) { |
5706 | 48.4k | TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest"); |
5707 | 48.4k | new_descriptor_log = true; |
5708 | 48.4k | } else { |
5709 | 41.1k | pending_manifest_file_number_ = manifest_file_number_; |
5710 | 41.1k | } |
5711 | | |
5712 | | // Local cached copy of state variable(s). WriteCurrentStateToManifest() |
5713 | | // reads its content after releasing db mutex to avoid race with |
5714 | | // SwitchMemtable(). |
5715 | 89.5k | std::unordered_map<uint32_t, MutableCFState> curr_state; |
5716 | 89.5k | VersionEdit wal_additions; |
5717 | 89.5k | if (new_descriptor_log) { |
5718 | 48.4k | pending_manifest_file_number_ = NewFileNumber(); |
5719 | 48.4k | batch_edits.back()->SetNextFile(next_file_number_.load()); |
5720 | | |
5721 | | // if we are writing out new snapshot make sure to persist max column |
5722 | | // family. |
5723 | 48.4k | if (column_family_set_->GetMaxColumnFamily() > 0) { |
5724 | 24.3k | first_writer.edit_list.front()->SetMaxColumnFamily( |
5725 | 24.3k | column_family_set_->GetMaxColumnFamily()); |
5726 | 24.3k | } |
5727 | 65.4k | for (const auto* cfd : *column_family_set_) { |
5728 | 65.4k | assert(curr_state.find(cfd->GetID()) == curr_state.end()); |
5729 | 65.4k | curr_state.emplace( |
5730 | 65.4k | cfd->GetID(), |
5731 | 65.4k | MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow())); |
5732 | 65.4k | } |
5733 | | |
5734 | 48.4k | for (const auto& wal : wals_.GetWals()) { |
5735 | 0 | wal_additions.AddWal(wal.first, wal.second); |
5736 | 0 | } |
5737 | 48.4k | } |
5738 | | |
5739 | 89.5k | uint64_t new_manifest_file_size = 0; |
5740 | 89.5k | Status s; |
5741 | 89.5k | IOStatus io_s; |
5742 | 89.5k | IOStatus manifest_io_status; |
5743 | 89.5k | manifest_io_status.PermitUncheckedError(); |
5744 | 89.5k | std::unique_ptr<log::Writer> new_desc_log_ptr; |
5745 | 89.5k | if (skip_manifest_write) { |
5746 | 0 | if (s.ok()) { |
5747 | 0 | constexpr bool update_stats = true; |
5748 | 0 | for (int i = 0; i < static_cast<int>(versions.size()); ++i) { |
5749 | | // NOTE: normally called with DB mutex released, but we don't |
5750 | | // want to release the DB mutex in this mode of LogAndApply |
5751 | 0 | versions[i]->PrepareAppend(read_options, update_stats); |
5752 | 0 | } |
5753 | 0 | } |
5754 | 89.5k | } else { |
5755 | 89.5k | FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); |
5756 | | // DB option (in file_options_) takes precedence when not kUnknown |
5757 | 89.5k | if (file_options_.temperature != Temperature::kUnknown) { |
5758 | 0 | opt_file_opts.temperature = file_options_.temperature; |
5759 | 0 | } |
5760 | 89.5k | mu->Unlock(); |
5761 | 89.5k | TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart"); |
5762 | 89.5k | TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr); |
5763 | 89.5k | if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { |
5764 | 128k | for (int i = 0; i < static_cast<int>(versions.size()); ++i) { |
5765 | 72.5k | assert(!builder_guards.empty() && |
5766 | 72.5k | builder_guards.size() == versions.size()); |
5767 | 72.5k | ColumnFamilyData* cfd = versions[i]->cfd_; |
5768 | 72.5k | s = builder_guards[i]->version_builder()->LoadTableHandlers( |
5769 | 72.5k | cfd->internal_stats(), 1 /* max_threads */, |
5770 | 72.5k | true /* prefetch_index_and_filter_in_cache */, |
5771 | 72.5k | false /* is_initial_load */, versions[i]->GetMutableCFOptions(), |
5772 | 72.5k | MaxFileSizeForL0MetaPin(versions[i]->GetMutableCFOptions()), |
5773 | 72.5k | read_options); |
5774 | 72.5k | if (!s.ok()) { |
5775 | 0 | if (db_options_->paranoid_checks) { |
5776 | 0 | break; |
5777 | 0 | } |
5778 | 0 | s = Status::OK(); |
5779 | 0 | } |
5780 | 72.5k | } |
5781 | 55.5k | } |
5782 | | |
5783 | 89.5k | log::Writer* raw_desc_log_ptr = descriptor_log_.get(); |
5784 | 89.5k | if (s.ok() && new_descriptor_log) { |
5785 | | // This is fine because everything inside this block is serialized -- |
5786 | | // only one thread can be here at the same time. |
5787 | | // Create the new manifest file. |
5788 | 48.4k | ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n", |
5789 | 48.4k | pending_manifest_file_number_); |
5790 | 48.4k | std::string descriptor_fname = |
5791 | 48.4k | DescriptorFileName(dbname_, pending_manifest_file_number_); |
5792 | 48.4k | std::unique_ptr<FSWritableFile> descriptor_file; |
5793 | 48.4k | io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file, |
5794 | 48.4k | opt_file_opts); |
5795 | 48.4k | if (io_s.ok()) { |
5796 | 48.4k | descriptor_file->SetPreallocationBlockSize( |
5797 | 48.4k | db_options_->manifest_preallocation_size); |
5798 | 48.4k | FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; |
5799 | 48.4k | std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( |
5800 | 48.4k | std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, |
5801 | 48.4k | io_tracer_, nullptr, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, |
5802 | 48.4k | db_options_->listeners, nullptr, |
5803 | 48.4k | tmp_set.Contains(FileType::kDescriptorFile), |
5804 | 48.4k | tmp_set.Contains(FileType::kDescriptorFile))); |
5805 | 48.4k | new_desc_log_ptr.reset( |
5806 | 48.4k | new log::Writer(std::move(file_writer), 0, false)); |
5807 | 48.4k | raw_desc_log_ptr = new_desc_log_ptr.get(); |
5808 | 48.4k | s = WriteCurrentStateToManifest(write_options, curr_state, |
5809 | 48.4k | wal_additions, raw_desc_log_ptr, io_s); |
5810 | 48.4k | assert(s == io_s); |
5811 | 48.4k | } |
5812 | 48.4k | if (!io_s.ok()) { |
5813 | 0 | manifest_io_status = io_s; |
5814 | 0 | s = io_s; |
5815 | 0 | } |
5816 | 48.4k | } |
5817 | | |
5818 | 89.5k | if (s.ok()) { |
5819 | 89.5k | if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { |
5820 | 55.5k | constexpr bool update_stats = true; |
5821 | | |
5822 | 128k | for (int i = 0; i < static_cast<int>(versions.size()); ++i) { |
5823 | 72.5k | versions[i]->PrepareAppend(read_options, update_stats); |
5824 | 72.5k | } |
5825 | 55.5k | } |
5826 | | |
5827 | | // Write new records to MANIFEST log |
5828 | | #ifndef NDEBUG |
5829 | | size_t idx = 0; |
5830 | | #endif |
5831 | 89.5k | assert(batch_edits.size() == batch_edits_ts_sz.size()); |
5832 | 315k | for (size_t bidx = 0; bidx < batch_edits.size(); bidx++) { |
5833 | 225k | auto& e = batch_edits[bidx]; |
5834 | 225k | files_to_quarantine_if_commit_fail.push_back( |
5835 | 225k | e->GetFilesToQuarantineIfCommitFail()); |
5836 | 225k | std::string record; |
5837 | 225k | if (!e->EncodeTo(&record, batch_edits_ts_sz[bidx])) { |
5838 | 0 | s = Status::Corruption("Unable to encode VersionEdit:" + |
5839 | 0 | e->DebugString(true)); |
5840 | 0 | break; |
5841 | 0 | } |
5842 | 225k | TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord", |
5843 | 225k | REDUCE_ODDS2); |
5844 | | #ifndef NDEBUG |
5845 | | if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { |
5846 | | TEST_SYNC_POINT_CALLBACK( |
5847 | | "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", |
5848 | | nullptr); |
5849 | | TEST_SYNC_POINT( |
5850 | | "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"); |
5851 | | } |
5852 | | ++idx; |
5853 | | #endif /* !NDEBUG */ |
5854 | 225k | io_s = raw_desc_log_ptr->AddRecord(write_options, record); |
5855 | 225k | if (!io_s.ok()) { |
5856 | 0 | s = io_s; |
5857 | 0 | manifest_io_status = io_s; |
5858 | 0 | break; |
5859 | 0 | } |
5860 | 225k | } |
5861 | | |
5862 | 89.5k | if (s.ok()) { |
5863 | 89.5k | io_s = |
5864 | 89.5k | SyncManifest(db_options_, write_options, raw_desc_log_ptr->file()); |
5865 | 89.5k | manifest_io_status = io_s; |
5866 | 89.5k | TEST_SYNC_POINT_CALLBACK( |
5867 | 89.5k | "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); |
5868 | 89.5k | } |
5869 | 89.5k | if (!io_s.ok()) { |
5870 | 0 | s = io_s; |
5871 | 0 | ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n", |
5872 | 0 | s.ToString().c_str()); |
5873 | 0 | } |
5874 | 89.5k | } |
5875 | | |
5876 | | // If we just created a new descriptor file, install it by writing a |
5877 | | // new CURRENT file that points to it. |
5878 | 89.5k | if (s.ok()) { |
5879 | 89.5k | assert(manifest_io_status.ok()); |
5880 | 89.5k | } |
5881 | 89.5k | if (s.ok() && new_descriptor_log) { |
5882 | 48.4k | io_s = SetCurrentFile( |
5883 | 48.4k | write_options, fs_.get(), dbname_, pending_manifest_file_number_, |
5884 | 48.4k | file_options_.temperature, dir_contains_current_file); |
5885 | 48.4k | if (!io_s.ok()) { |
5886 | 0 | s = io_s; |
5887 | | // Quarantine old manifest file in case new manifest file's CURRENT file |
5888 | | // wasn't created successfully and the old manifest is needed. |
5889 | 0 | limbo_descriptor_log_file_number.push_back(manifest_file_number_); |
5890 | 0 | files_to_quarantine_if_commit_fail.push_back( |
5891 | 0 | &limbo_descriptor_log_file_number); |
5892 | 0 | } |
5893 | 48.4k | } |
5894 | | |
5895 | 89.5k | if (s.ok()) { |
5896 | | // find offset in manifest file where this version is stored. |
5897 | 89.5k | new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize(); |
5898 | 89.5k | } |
5899 | | |
5900 | 89.5k | if (first_writer.edit_list.front()->IsColumnFamilyDrop()) { |
5901 | 17.0k | TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0"); |
5902 | 17.0k | TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1"); |
5903 | 17.0k | TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2"); |
5904 | 17.0k | } |
5905 | | |
5906 | 89.5k | LogFlush(db_options_->info_log); |
5907 | 89.5k | TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone"); |
5908 | 89.5k | mu->Lock(); |
5909 | 89.5k | } |
5910 | | |
5911 | 89.5k | if (s.ok()) { |
5912 | | // Apply WAL edits, DB mutex must be held. |
5913 | 225k | for (auto& e : batch_edits) { |
5914 | 225k | if (e->IsWalAddition()) { |
5915 | 0 | s = wals_.AddWals(e->GetWalAdditions()); |
5916 | 225k | } else if (e->IsWalDeletion()) { |
5917 | 0 | s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber()); |
5918 | 0 | } |
5919 | 225k | if (!s.ok()) { |
5920 | 0 | break; |
5921 | 0 | } |
5922 | 225k | } |
5923 | 89.5k | } |
5924 | | |
5925 | 89.5k | if (!io_s.ok()) { |
5926 | 0 | if (io_status_.ok()) { |
5927 | 0 | io_status_ = io_s; |
5928 | 0 | if (error_handler_) { |
5929 | 0 | error_handler_->AddFilesToQuarantine( |
5930 | 0 | files_to_quarantine_if_commit_fail); |
5931 | 0 | } |
5932 | 0 | } |
5933 | 89.5k | } else if (!io_status_.ok()) { |
5934 | 0 | io_status_ = io_s; |
5935 | 0 | if (error_handler_) { |
5936 | 0 | error_handler_->ClearFilesToQuarantine(); |
5937 | 0 | } |
5938 | 0 | } |
5939 | | |
5940 | | // Append the old manifest file to the obsolete_manifests_ list to be deleted |
5941 | | // by PurgeObsoleteFiles later. |
5942 | 89.5k | if (s.ok() && new_descriptor_log) { |
5943 | 48.4k | descriptor_log_ = std::move(new_desc_log_ptr); |
5944 | 48.4k | obsolete_manifests_.emplace_back( |
5945 | 48.4k | DescriptorFileName("", manifest_file_number_)); |
5946 | 48.4k | } |
5947 | | |
5948 | | // Install the new versions |
5949 | 89.5k | if (s.ok()) { |
5950 | 89.5k | if (first_writer.edit_list.front()->IsColumnFamilyAdd()) { |
5951 | 17.0k | assert(batch_edits.size() == 1); |
5952 | 17.0k | assert(new_cf_options != nullptr); |
5953 | 17.0k | assert(max_last_sequence == descriptor_last_sequence_); |
5954 | 17.0k | CreateColumnFamily(*new_cf_options, read_options, |
5955 | 17.0k | first_writer.edit_list.front(), |
5956 | 17.0k | /*read_only*/ false); |
5957 | 72.5k | } else if (first_writer.edit_list.front()->IsColumnFamilyDrop()) { |
5958 | 17.0k | assert(batch_edits.size() == 1); |
5959 | 17.0k | assert(max_last_sequence == descriptor_last_sequence_); |
5960 | 17.0k | first_writer.cfd->SetDropped(); |
5961 | 17.0k | first_writer.cfd->UnrefAndTryDelete(); |
5962 | 55.5k | } else { |
5963 | | // Each version in versions corresponds to a column family. |
5964 | | // For each column family, update its log number indicating that logs |
5965 | | // with number smaller than this should be ignored. |
5966 | 55.5k | uint64_t last_min_log_number_to_keep = 0; |
5967 | 191k | for (const auto& e : batch_edits) { |
5968 | 191k | ColumnFamilyData* cfd = nullptr; |
5969 | 191k | if (!e->IsColumnFamilyManipulation()) { |
5970 | 191k | cfd = column_family_set_->GetColumnFamily(e->GetColumnFamily()); |
5971 | | // e would not have been added to batch_edits if its corresponding |
5972 | | // column family is dropped. |
5973 | 191k | assert(cfd); |
5974 | 191k | } |
5975 | 191k | if (cfd) { |
5976 | 191k | if (e->HasLogNumber() && e->GetLogNumber() > cfd->GetLogNumber()) { |
5977 | 57.9k | cfd->SetLogNumber(e->GetLogNumber()); |
5978 | 57.9k | } |
5979 | 191k | if (e->HasFullHistoryTsLow()) { |
5980 | 0 | cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow()); |
5981 | 0 | } |
5982 | 191k | } |
5983 | 191k | if (e->HasMinLogNumberToKeep()) { |
5984 | 40.9k | last_min_log_number_to_keep = |
5985 | 40.9k | std::max(last_min_log_number_to_keep, e->GetMinLogNumberToKeep()); |
5986 | 40.9k | } |
5987 | 191k | } |
5988 | | |
5989 | 55.5k | if (last_min_log_number_to_keep != 0) { |
5990 | 40.9k | MarkMinLogNumberToKeep(last_min_log_number_to_keep); |
5991 | 40.9k | } |
5992 | | |
5993 | 128k | for (int i = 0; i < static_cast<int>(versions.size()); ++i) { |
5994 | 72.5k | ColumnFamilyData* cfd = versions[i]->cfd_; |
5995 | 72.5k | AppendVersion(cfd, versions[i]); |
5996 | 72.5k | } |
5997 | 55.5k | } |
5998 | 89.5k | if (!skip_manifest_write) { |
5999 | 89.5k | assert(max_last_sequence >= descriptor_last_sequence_); |
6000 | 89.5k | descriptor_last_sequence_ = max_last_sequence; |
6001 | 89.5k | manifest_file_number_ = pending_manifest_file_number_; |
6002 | 89.5k | manifest_file_size_ = new_manifest_file_size; |
6003 | 89.5k | prev_log_number_ = first_writer.edit_list.front()->GetPrevLogNumber(); |
6004 | 89.5k | } |
6005 | 89.5k | } else { |
6006 | 0 | std::string version_edits; |
6007 | 0 | for (auto& e : batch_edits) { |
6008 | 0 | version_edits += ("\n" + e->DebugString(true)); |
6009 | 0 | } |
6010 | 0 | ROCKS_LOG_ERROR(db_options_->info_log, |
6011 | 0 | "Error in committing version edit to MANIFEST: %s", |
6012 | 0 | version_edits.c_str()); |
6013 | 0 | for (auto v : versions) { |
6014 | 0 | delete v; |
6015 | 0 | } |
6016 | | // If manifest append failed for whatever reason, the file could be |
6017 | | // corrupted. So we need to force the next version update to start a |
6018 | | // new manifest file. |
6019 | 0 | descriptor_log_.reset(); |
6020 | 0 | new_desc_log_ptr.reset(); |
6021 | | // If manifest operations failed, then we know the CURRENT file still |
6022 | | // points to the original MANIFEST. Therefore, we can safely delete the |
6023 | | // new MANIFEST. |
6024 | | // If manifest operations succeeded, and we are here, then it is possible |
6025 | | // that renaming tmp file to CURRENT failed. |
6026 | | // |
6027 | | // On local POSIX-compliant FS, the CURRENT must point to the original |
6028 | | // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also |
6029 | | // keep it. Future recovery will ignore this MANIFEST. It's also ok for the |
6030 | | // process not to crash and continue using the db. Any future LogAndApply() |
6031 | | // call will switch to a new MANIFEST and update CURRENT, still ignoring |
6032 | | // this one. |
6033 | | // |
6034 | | // On non-local FS, it is |
6035 | | // possible that the rename operation succeeded on the server (remote) |
6036 | | // side, but the client somehow returns a non-ok status to RocksDB. Note |
6037 | | // that this does not violate atomicity. Should we delete the new MANIFEST |
6038 | | // successfully, a subsequent recovery attempt will likely see the CURRENT |
6039 | | // pointing to the new MANIFEST, thus fail. We will not be able to open the |
6040 | | // DB again. Therefore, if manifest operations succeed, we should keep the |
6041 | | // new MANIFEST. If the process proceeds, any future LogAndApply() call |
6042 | | // will switch to a new MANIFEST and update CURRENT. If user tries to |
6043 | | // re-open the DB, |
6044 | | // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present. |
6045 | | // b) CURRENT points to the original MANIFEST, and the original MANIFEST |
6046 | | // also exists. |
6047 | 0 | if (!manifest_io_status.ok() && new_descriptor_log) { |
6048 | 0 | ROCKS_LOG_INFO(db_options_->info_log, |
6049 | 0 | "Deleting manifest %" PRIu64 " current manifest %" PRIu64 |
6050 | 0 | "\n", |
6051 | 0 | pending_manifest_file_number_, manifest_file_number_); |
6052 | 0 | Status manifest_del_status = env_->DeleteFile( |
6053 | 0 | DescriptorFileName(dbname_, pending_manifest_file_number_)); |
6054 | 0 | if (!manifest_del_status.ok()) { |
6055 | 0 | ROCKS_LOG_WARN(db_options_->info_log, |
6056 | 0 | "Failed to delete manifest %" PRIu64 ": %s", |
6057 | 0 | pending_manifest_file_number_, |
6058 | 0 | manifest_del_status.ToString().c_str()); |
6059 | 0 | } |
6060 | 0 | } |
6061 | 0 | } |
6062 | | |
6063 | 89.5k | pending_manifest_file_number_ = 0; |
6064 | | |
6065 | | #ifndef NDEBUG |
6066 | | // This is here kind of awkwardly because there are no other consistency |
6067 | | // checks on `VersionSet`'s updates for the new `Version`s. We might want |
6068 | | // to move it to a dedicated function, or remove it if we gain enough |
6069 | | // confidence in `descriptor_last_sequence_`. |
6070 | | if (s.ok()) { |
6071 | | for (const auto* v : versions) { |
6072 | | const auto* vstorage = v->storage_info(); |
6073 | | for (int level = 0; level < vstorage->num_levels(); ++level) { |
6074 | | for (const auto& file : vstorage->LevelFiles(level)) { |
6075 | | assert(file->fd.largest_seqno <= descriptor_last_sequence_); |
6076 | | } |
6077 | | } |
6078 | | } |
6079 | | } |
6080 | | #endif // NDEBUG |
6081 | | |
6082 | | // wake up all the waiting writers |
6083 | 106k | while (true) { |
6084 | 106k | ManifestWriter* ready = manifest_writers_.front(); |
6085 | 106k | manifest_writers_.pop_front(); |
6086 | 106k | bool need_signal = true; |
6087 | 123k | for (const auto& w : writers) { |
6088 | 123k | if (&w == ready) { |
6089 | 106k | need_signal = false; |
6090 | 106k | break; |
6091 | 106k | } |
6092 | 123k | } |
6093 | 106k | ready->status = s; |
6094 | 106k | ready->done = true; |
6095 | 106k | if (ready->manifest_write_callback) { |
6096 | 72.5k | (ready->manifest_write_callback)(s); |
6097 | 72.5k | } |
6098 | 106k | if (need_signal) { |
6099 | 0 | ready->cv.Signal(); |
6100 | 0 | } |
6101 | 106k | if (ready == last_writer) { |
6102 | 89.5k | break; |
6103 | 89.5k | } |
6104 | 106k | } |
6105 | 89.5k | if (!manifest_writers_.empty()) { |
6106 | 939 | manifest_writers_.front()->cv.Signal(); |
6107 | 939 | } |
6108 | 89.5k | return s; |
6109 | 89.5k | } |
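
The commit order above is the crux: append all edits to the (possibly new) manifest, sync it, and only then flip CURRENT, so a crash at any step leaves a usable pointer. A minimal sketch of that ordering with C standard I/O (hedged; CommitManifest is hypothetical, fflush stands in for a real fsync, and SetCurrentFile is what actually writes a temp file and renames it):

#include <cstdio>
#include <string>
#include <vector>

// Commit protocol sketch: records -> flush/sync -> atomic CURRENT swap.
bool CommitManifest(const std::string& manifest_path,
                    const std::vector<std::string>& records,
                    const std::string& current_path) {
  std::FILE* f = std::fopen(manifest_path.c_str(), "ab");
  if (!f) return false;
  for (const std::string& r : records) {
    if (std::fwrite(r.data(), 1, r.size(), f) != r.size()) {
      std::fclose(f);
      return false;
    }
  }
  const bool flushed = (std::fflush(f) == 0);  // real code fsyncs here
  std::fclose(f);
  if (!flushed) return false;
  // Write the new manifest name to a temp file, then rename it over
  // CURRENT so the pointer flips atomically.
  const std::string tmp = current_path + ".tmp";
  std::FILE* c = std::fopen(tmp.c_str(), "wb");
  if (!c) return false;
  std::fputs((manifest_path + "\n").c_str(), c);
  std::fclose(c);
  return std::rename(tmp.c_str(), current_path.c_str()) == 0;
}
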
6110 | | |
6111 | 0 | void VersionSet::WakeUpWaitingManifestWriters() { |
6112 | | // Wake up the next waiting writer by notifying the new head of the |
6113 | | // manifest write queue. |
6114 | 0 | if (!manifest_writers_.empty()) { |
6115 | 0 | manifest_writers_.front()->cv.Signal(); |
6116 | 0 | } |
6117 | 0 | } |
6118 | | |
6119 | | // 'datas' is grammatically incorrect. We still use this notation to indicate |
6120 | | // that this variable represents a collection of column_family_data. |
6121 | | Status VersionSet::LogAndApply( |
6122 | | const autovector<ColumnFamilyData*>& column_family_datas, |
6123 | | const ReadOptions& read_options, const WriteOptions& write_options, |
6124 | | const autovector<autovector<VersionEdit*>>& edit_lists, |
6125 | | InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, |
6126 | | bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, |
6127 | | const std::vector<std::function<void(const Status&)>>& manifest_wcbs, |
6128 | 89.5k | const std::function<Status()>& pre_cb) { |
6129 | 89.5k | mu->AssertHeld(); |
6130 | 89.5k | int num_edits = 0; |
6131 | 106k | for (const auto& elist : edit_lists) { |
6132 | 106k | num_edits += static_cast<int>(elist.size()); |
6133 | 106k | } |
6134 | 89.5k | if (num_edits == 0) { |
6135 | 0 | return Status::OK(); |
6136 | 89.5k | } else if (num_edits > 1) { |
6137 | | #ifndef NDEBUG |
6138 | | for (const auto& edit_list : edit_lists) { |
6139 | | for (const auto& edit : edit_list) { |
6140 | | assert(!edit->IsColumnFamilyManipulation()); |
6141 | | assert(!edit->IsNoManifestWriteDummy()); |
6142 | | } |
6143 | | } |
6144 | | #endif /* ! NDEBUG */ |
6145 | 40.9k | } |
6146 | | |
6147 | 89.5k | int num_cfds = static_cast<int>(column_family_datas.size()); |
6148 | 89.5k | if (num_cfds == 1 && column_family_datas[0] == nullptr) { |
6149 | 17.0k | assert(edit_lists.size() == 1 && edit_lists[0].size() == 1); |
6150 | 17.0k | assert(edit_lists[0][0]->IsColumnFamilyAdd()); |
6151 | 17.0k | assert(new_cf_options != nullptr); |
6152 | 17.0k | } |
6153 | 89.5k | std::deque<ManifestWriter> writers; |
6154 | 89.5k | if (num_cfds > 0) { |
6155 | 89.5k | assert(static_cast<size_t>(num_cfds) == edit_lists.size()); |
6156 | 89.5k | } |
6157 | 196k | for (int i = 0; i < num_cfds; ++i) { |
6158 | 106k | const auto wcb = |
6159 | 106k | manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i]; |
6160 | 106k | writers.emplace_back(mu, column_family_datas[i], edit_lists[i], wcb); |
6161 | 106k | manifest_writers_.push_back(&writers[i]); |
6162 | 106k | } |
6163 | 89.5k | assert(!writers.empty()); |
6164 | 89.5k | ManifestWriter& first_writer = writers.front(); |
6165 | 89.5k | TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting", |
6166 | 89.5k | nullptr); |
6167 | 90.5k | while (!first_writer.done && &first_writer != manifest_writers_.front()) { |
6168 | 939 | first_writer.cv.Wait(); |
6169 | 939 | } |
6170 | 89.5k | if (first_writer.done) { |
6171 | | // All non-CF-manipulation operations can be grouped together and committed |
6172 | | // to MANIFEST. They should all have finished. The status code is stored in |
6173 | | // the first manifest writer. |
6174 | | #ifndef NDEBUG |
6175 | | for (const auto& writer : writers) { |
6176 | | assert(writer.done); |
6177 | | } |
6178 | | TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu); |
6179 | | #endif /* !NDEBUG */ |
6180 | | // FIXME: One MANIFEST write failure can cause all writes to SetBGError, |
6181 | | // should only SetBGError once. |
6182 | 0 | return first_writer.status; |
6183 | 0 | } |
6184 | 89.5k | TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndNotDone", mu); |
6185 | | |
6186 | 89.5k | int num_undropped_cfds = 0; |
6187 | 106k | for (auto cfd : column_family_datas) { |
6188 | | // if cfd == nullptr, it is a column family add. |
6189 | 106k | if (cfd == nullptr || !cfd->IsDropped()) { |
6190 | 106k | ++num_undropped_cfds; |
6191 | 106k | } |
6192 | 106k | } |
6193 | 89.5k | Status s; |
6194 | 89.5k | if (0 == num_undropped_cfds) { |
6195 | 0 | s = Status::ColumnFamilyDropped(); |
6196 | 0 | } |
6197 | | // Call pre_cb once we know we have work to do and are scheduled as the |
6198 | | // exclusive manifest writer (and new Version appender) |
6199 | 89.5k | if (s.ok() && pre_cb) { |
6200 | 0 | s = pre_cb(); |
6201 | 0 | } |
6202 | 89.5k | if (!s.ok()) { |
6203 | | // Revert manifest_writers_ |
6204 | 0 | for (int i = 0; i != num_cfds; ++i) { |
6205 | 0 | manifest_writers_.pop_front(); |
6206 | 0 | } |
6207 | | // Notify new head of manifest write queue. |
6208 | 0 | if (!manifest_writers_.empty()) { |
6209 | 0 | manifest_writers_.front()->cv.Signal(); |
6210 | 0 | } |
6211 | 0 | return s; |
6212 | 89.5k | } else { |
6213 | 89.5k | return ProcessManifestWrites(writers, mu, dir_contains_current_file, |
6214 | 89.5k | new_descriptor_log, new_cf_options, |
6215 | 89.5k | read_options, write_options); |
6216 | 89.5k | } |
6217 | 89.5k | } |
6218 | | |
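The tail of LogAndApply above is a classic leader/follower group commit: each caller enqueues a ManifestWriter and blocks until either a leader has already committed its batch (first_writer.done) or the writer reaches the head of manifest_writers_ and becomes the leader itself. A minimal standalone sketch of that wait loop, with a hypothetical Writer type standing in for ManifestWriter and a std::condition_variable in place of the InstrumentedMutex-based one used here:

    #include <condition_variable>
    #include <deque>
    #include <mutex>

    struct Writer {
      bool done = false;
      std::condition_variable cv;
    };

    // Block until a leader has finished our work, or we are the leader.
    // The lock is assumed held on entry, mirroring LogAndApply's mutex.
    void WaitUntilLeaderOrDone(Writer& w, std::deque<Writer*>& queue,
                               std::unique_lock<std::mutex>& lock) {
      while (!w.done && &w != queue.front()) {
        w.cv.wait(lock);
      }
    }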
6219 | | void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, |
6220 | 34.0k | SequenceNumber* max_last_sequence) { |
6221 | 34.0k | assert(max_last_sequence != nullptr); |
6222 | 34.0k | assert(edit->IsColumnFamilyManipulation()); |
6223 | 34.0k | edit->SetNextFile(next_file_number_.load()); |
6224 | 34.0k | assert(!edit->HasLastSequence()); |
6225 | 34.0k | edit->SetLastSequence(*max_last_sequence); |
6226 | 34.0k | if (edit->IsColumnFamilyDrop()) { |
6227 | | // If we drop a column family, we have to make sure to save the max |
6228 | | // column family ID, so that we don't reuse an existing ID. |
6229 | 17.0k | edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); |
6230 | 17.0k | } |
6231 | 34.0k | } |
6232 | | |
6233 | | Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, |
6234 | | VersionBuilder* builder, VersionEdit* edit, |
6235 | | SequenceNumber* max_last_sequence, |
6236 | 191k | InstrumentedMutex* mu) { |
6237 | 191k | #ifdef NDEBUG |
6238 | 191k | (void)cfd; |
6239 | 191k | #endif |
6240 | 191k | mu->AssertHeld(); |
6241 | 191k | assert(!edit->IsColumnFamilyManipulation()); |
6242 | 191k | assert(max_last_sequence != nullptr); |
6243 | | |
6244 | 191k | if (edit->HasLogNumber()) { |
6245 | 57.9k | assert(edit->GetLogNumber() >= cfd->GetLogNumber()); |
6246 | 57.9k | assert(edit->GetLogNumber() < next_file_number_.load()); |
6247 | 57.9k | } |
6248 | | |
6249 | 191k | if (!edit->HasPrevLogNumber()) { |
6250 | 190k | edit->SetPrevLogNumber(prev_log_number_); |
6251 | 190k | } |
6252 | 191k | edit->SetNextFile(next_file_number_.load()); |
6253 | 191k | if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) { |
6254 | 18.7k | *max_last_sequence = edit->GetLastSequence(); |
6255 | 173k | } else { |
6256 | 173k | edit->SetLastSequence(*max_last_sequence); |
6257 | 173k | } |
6258 | | |
6259 | | // The builder can be nullptr only if the edit is a WAL manipulation: |
6260 | | // WAL edits do not need to be applied to versions, so we return |
6261 | | // Status::OK() in this case. |
6262 | 191k | assert(builder || edit->IsWalManipulation()); |
6263 | 191k | return builder ? builder->Apply(edit) : Status::OK(); |
6264 | 191k | } |
6265 | | |
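LogAndApplyHelper's sequence handling keeps the manifest's last sequence monotonic across a batch: an edit carrying a newer last sequence raises the running maximum, and any other edit is stamped with the current maximum. A small sketch of that invariant, using std::optional<uint64_t> to model VersionEdit's optional field (hypothetical helper, not part of this file):

    #include <cstdint>
    #include <optional>

    // Returns the last sequence the edit ends up carrying.
    uint64_t StampLastSequence(std::optional<uint64_t>& edit_last_seq,
                               uint64_t& max_last_sequence) {
      if (edit_last_seq && *edit_last_seq > max_last_sequence) {
        max_last_sequence = *edit_last_seq;  // edit advances the running max
      } else {
        edit_last_seq = max_last_sequence;   // edit inherits the running max
      }
      return *edit_last_seq;
    }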
6266 | | Status VersionSet::Recover( |
6267 | | const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, |
6268 | | std::string* db_id, bool no_error_if_files_missing, bool is_retry, |
6269 | 48.4k | Status* log_status) { |
6270 | 48.4k | const ReadOptions read_options(Env::IOActivity::kDBOpen); |
6271 | | // Read "CURRENT" file, which contains a pointer to the current manifest |
6272 | | // file |
6273 | 48.4k | std::string manifest_path; |
6274 | 48.4k | Status s = GetCurrentManifestPath(dbname_, fs_.get(), is_retry, |
6275 | 48.4k | &manifest_path, &manifest_file_number_); |
6276 | 48.4k | if (!s.ok()) { |
6277 | 0 | return s; |
6278 | 0 | } |
6279 | | |
6280 | 48.4k | ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n", |
6281 | 48.4k | manifest_path.c_str()); |
6282 | | |
6283 | 48.4k | std::unique_ptr<SequentialFileReader> manifest_file_reader; |
6284 | 48.4k | { |
6285 | 48.4k | std::unique_ptr<FSSequentialFile> manifest_file; |
6286 | 48.4k | s = fs_->NewSequentialFile(manifest_path, |
6287 | 48.4k | fs_->OptimizeForManifestRead(file_options_), |
6288 | 48.4k | &manifest_file, nullptr); |
6289 | 48.4k | if (!s.ok()) { |
6290 | 0 | return s; |
6291 | 0 | } |
6292 | 48.4k | manifest_file_reader.reset(new SequentialFileReader( |
6293 | 48.4k | std::move(manifest_file), manifest_path, |
6294 | 48.4k | db_options_->log_readahead_size, io_tracer_, db_options_->listeners, |
6295 | 48.4k | /*rate_limiter=*/nullptr, is_retry)); |
6296 | 48.4k | } |
6297 | 0 | TEST_SYNC_POINT("VersionSet::Recover:StartManifestRead"); |
6298 | | |
6299 | 48.4k | uint64_t current_manifest_file_size = 0; |
6300 | 48.4k | uint64_t log_number = 0; |
6301 | 48.4k | { |
6302 | 48.4k | VersionSet::LogReporter reporter; |
6303 | 48.4k | Status log_read_status; |
6304 | 48.4k | reporter.status = &log_read_status; |
6305 | 48.4k | log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, |
6306 | 48.4k | true /* checksum */, 0 /* log_number */); |
6307 | 48.4k | VersionEditHandler handler( |
6308 | 48.4k | read_only, column_families, const_cast<VersionSet*>(this), |
6309 | 48.4k | /*track_found_and_missing_files=*/false, no_error_if_files_missing, |
6310 | 48.4k | io_tracer_, read_options, /*allow_incomplete_valid_version=*/false, |
6311 | 48.4k | EpochNumberRequirement::kMightMissing); |
6312 | 48.4k | handler.Iterate(reader, &log_read_status); |
6313 | 48.4k | s = handler.status(); |
6314 | 48.4k | if (s.ok()) { |
6315 | 48.4k | log_number = handler.GetVersionEditParams().GetLogNumber(); |
6316 | 48.4k | current_manifest_file_size = reader.GetReadOffset(); |
6317 | 48.4k | assert(current_manifest_file_size != 0); |
6318 | 48.4k | handler.GetDbId(db_id); |
6319 | 48.4k | } |
6320 | 48.4k | if (s.ok()) { |
6321 | 48.4k | RecoverEpochNumbers(); |
6322 | 48.4k | } |
6323 | 48.4k | if (log_status) { |
6324 | 48.4k | *log_status = log_read_status; |
6325 | 48.4k | } |
6326 | 48.4k | } |
6327 | | |
6328 | 48.4k | if (s.ok()) { |
6329 | 48.4k | manifest_file_size_ = current_manifest_file_size; |
6330 | 48.4k | ROCKS_LOG_INFO( |
6331 | 48.4k | db_options_->info_log, |
6332 | 48.4k | "Recovered from manifest file:%s succeeded," |
6333 | 48.4k | "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64 |
6334 | 48.4k | ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 |
6335 | 48.4k | ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 |
6336 | 48.4k | ",min_log_number_to_keep is %" PRIu64 "\n", |
6337 | 48.4k | manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), |
6338 | 48.4k | last_sequence_.load(), log_number, prev_log_number_, |
6339 | 48.4k | column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep()); |
6340 | | |
6341 | 65.4k | for (auto cfd : *column_family_set_) { |
6342 | 65.4k | if (cfd->IsDropped()) { |
6343 | 0 | continue; |
6344 | 0 | } |
6345 | 65.4k | ROCKS_LOG_INFO(db_options_->info_log, |
6346 | 65.4k | "Column family [%s] (ID %" PRIu32 |
6347 | 65.4k | "), log number is %" PRIu64 "\n", |
6348 | 65.4k | cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); |
6349 | 65.4k | } |
6350 | 48.4k | } |
6351 | | |
6352 | 48.4k | return s; |
6353 | 48.4k | } |
6354 | | |
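Recovery starts from the CURRENT file, which contains nothing but the file name of the active manifest followed by a newline. A hypothetical helper showing just that indirection; the real path goes through GetCurrentManifestPath() and the FileSystem abstraction:

    #include <fstream>
    #include <string>

    std::string ResolveManifestPath(const std::string& dbname) {
      std::ifstream current(dbname + "/CURRENT");
      std::string manifest_name;             // e.g. "MANIFEST-000123"
      std::getline(current, manifest_name);  // getline drops the newline
      return dbname + "/" + manifest_name;
    }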
6355 | | namespace { |
6356 | | class ManifestPicker { |
6357 | | public: |
6358 | | explicit ManifestPicker(const std::string& dbname, |
6359 | | const std::vector<std::string>& files_in_dbname); |
6360 | | // REQUIRES Valid() == true |
6361 | | std::string GetNextManifest(uint64_t* file_number, std::string* file_name); |
6362 | 0 | bool Valid() const { return manifest_file_iter_ != manifest_files_.end(); } |
6363 | | |
6364 | | private: |
6365 | | const std::string& dbname_; |
6366 | | // MANIFEST file name(s) |
6367 | | std::vector<std::string> manifest_files_; |
6368 | | std::vector<std::string>::const_iterator manifest_file_iter_; |
6369 | | }; |
6370 | | |
6371 | | ManifestPicker::ManifestPicker(const std::string& dbname, |
6372 | | const std::vector<std::string>& files_in_dbname) |
6373 | 0 | : dbname_(dbname) { |
6374 | | // populate manifest files |
6375 | 0 | assert(!files_in_dbname.empty()); |
6376 | 0 | for (const auto& fname : files_in_dbname) { |
6377 | 0 | uint64_t file_num = 0; |
6378 | 0 | FileType file_type; |
6379 | 0 | bool parse_ok = ParseFileName(fname, &file_num, &file_type); |
6380 | 0 | if (parse_ok && file_type == kDescriptorFile) { |
6381 | 0 | manifest_files_.push_back(fname); |
6382 | 0 | } |
6383 | 0 | } |
6384 | | // seek to first manifest |
6385 | 0 | std::sort(manifest_files_.begin(), manifest_files_.end(), |
6386 | 0 | [](const std::string& lhs, const std::string& rhs) { |
6387 | 0 | uint64_t num1 = 0; |
6388 | 0 | uint64_t num2 = 0; |
6389 | 0 | FileType type1; |
6390 | 0 | FileType type2; |
6391 | 0 | bool parse_ok1 = ParseFileName(lhs, &num1, &type1); |
6392 | 0 | bool parse_ok2 = ParseFileName(rhs, &num2, &type2); |
6393 | | #ifndef NDEBUG |
6394 | | assert(parse_ok1); |
6395 | | assert(parse_ok2); |
6396 | | #else |
6397 | 0 | (void)parse_ok1; |
6398 | 0 | (void)parse_ok2; |
6399 | 0 | #endif |
6400 | 0 | return num1 > num2; |
6401 | 0 | }); |
6402 | 0 | manifest_file_iter_ = manifest_files_.begin(); |
6403 | 0 | } |
6404 | | |
6405 | | std::string ManifestPicker::GetNextManifest(uint64_t* number, |
6406 | 0 | std::string* file_name) { |
6407 | 0 | assert(Valid()); |
6408 | 0 | std::string ret; |
6409 | 0 | if (manifest_file_iter_ != manifest_files_.end()) { |
6410 | 0 | ret.assign(dbname_); |
6411 | 0 | if (ret.back() != kFilePathSeparator) { |
6412 | 0 | ret.push_back(kFilePathSeparator); |
6413 | 0 | } |
6414 | 0 | ret.append(*manifest_file_iter_); |
6415 | 0 | if (number) { |
6416 | 0 | FileType type; |
6417 | 0 | bool parse = ParseFileName(*manifest_file_iter_, number, &type); |
6418 | 0 | assert(type == kDescriptorFile); |
6419 | | #ifndef NDEBUG |
6420 | | assert(parse); |
6421 | | #else |
6422 | 0 | (void)parse; |
6423 | 0 | #endif |
6424 | 0 | } |
6425 | 0 | if (file_name) { |
6426 | 0 | *file_name = *manifest_file_iter_; |
6427 | 0 | } |
6428 | 0 | ++manifest_file_iter_; |
6429 | 0 | } |
6430 | 0 | return ret; |
6431 | 0 | } |
6432 | | } // anonymous namespace |
6433 | | |
6434 | | Status VersionSet::TryRecover( |
6435 | | const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, |
6436 | | const std::vector<std::string>& files_in_dbname, std::string* db_id, |
6437 | 0 | bool* has_missing_table_file) { |
6438 | 0 | ManifestPicker manifest_picker(dbname_, files_in_dbname); |
6439 | 0 | if (!manifest_picker.Valid()) { |
6440 | 0 | return Status::Corruption("Cannot locate MANIFEST file in " + dbname_); |
6441 | 0 | } |
6442 | 0 | Status s; |
6443 | 0 | std::string manifest_path = |
6444 | 0 | manifest_picker.GetNextManifest(&manifest_file_number_, nullptr); |
6445 | 0 | while (!manifest_path.empty()) { |
6446 | 0 | s = TryRecoverFromOneManifest(manifest_path, column_families, read_only, |
6447 | 0 | db_id, has_missing_table_file); |
6448 | 0 | if (s.ok() || !manifest_picker.Valid()) { |
6449 | 0 | break; |
6450 | 0 | } |
6451 | 0 | Reset(); |
6452 | 0 | manifest_path = |
6453 | 0 | manifest_picker.GetNextManifest(&manifest_file_number_, nullptr); |
6454 | 0 | } |
6455 | 0 | return s; |
6456 | 0 | } |
6457 | | |
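TryRecover walks candidate manifests newest-first: ManifestPicker sorts them in descending file-number order, and the loop falls back to the next-older manifest (after Reset()) whenever point-in-time recovery from the current candidate fails. The control flow in sketch form, with a hypothetical recovery callback standing in for TryRecoverFromOneManifest:

    #include <string>
    #include <vector>

    bool RecoverNewestFirst(
        const std::vector<std::string>& manifests_newest_first,
        bool (*try_one)(const std::string& path)) {
      for (const auto& path : manifests_newest_first) {
        if (try_one(path)) {
          return true;  // recovered from the newest usable manifest
        }
        // otherwise reset in-memory state and try the next-older manifest
      }
      return false;
    }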
6458 | | Status VersionSet::TryRecoverFromOneManifest( |
6459 | | const std::string& manifest_path, |
6460 | | const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, |
6461 | 0 | std::string* db_id, bool* has_missing_table_file) { |
6462 | 0 | const ReadOptions read_options(Env::IOActivity::kDBOpen); |
6463 | 0 | ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n", |
6464 | 0 | manifest_path.c_str()); |
6465 | 0 | std::unique_ptr<SequentialFileReader> manifest_file_reader; |
6466 | 0 | Status s; |
6467 | 0 | { |
6468 | 0 | std::unique_ptr<FSSequentialFile> manifest_file; |
6469 | 0 | s = fs_->NewSequentialFile(manifest_path, |
6470 | 0 | fs_->OptimizeForManifestRead(file_options_), |
6471 | 0 | &manifest_file, nullptr); |
6472 | 0 | if (!s.ok()) { |
6473 | 0 | return s; |
6474 | 0 | } |
6475 | 0 | manifest_file_reader.reset(new SequentialFileReader( |
6476 | 0 | std::move(manifest_file), manifest_path, |
6477 | 0 | db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); |
6478 | 0 | } |
6479 | | |
6480 | 0 | assert(s.ok()); |
6481 | 0 | VersionSet::LogReporter reporter; |
6482 | 0 | reporter.status = &s; |
6483 | 0 | log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, |
6484 | 0 | /*checksum=*/true, /*log_num=*/0); |
6485 | 0 | VersionEditHandlerPointInTime handler_pit( |
6486 | 0 | read_only, column_families, const_cast<VersionSet*>(this), io_tracer_, |
6487 | 0 | read_options, /*allow_incomplete_valid_version=*/true, |
6488 | 0 | EpochNumberRequirement::kMightMissing); |
6489 | |
6490 | 0 | handler_pit.Iterate(reader, &s); |
6491 | |
6492 | 0 | handler_pit.GetDbId(db_id); |
6493 | |
6494 | 0 | assert(nullptr != has_missing_table_file); |
6495 | 0 | *has_missing_table_file = handler_pit.HasMissingFiles(); |
6496 | |
6497 | 0 | s = handler_pit.status(); |
6498 | 0 | if (s.ok()) { |
6499 | 0 | RecoverEpochNumbers(); |
6500 | 0 | } |
6501 | 0 | return s; |
6502 | 0 | } |
6503 | | |
6504 | 48.4k | void VersionSet::RecoverEpochNumbers() { |
6505 | 65.4k | for (auto cfd : *column_family_set_) { |
6506 | 65.4k | if (cfd->IsDropped()) { |
6507 | 0 | continue; |
6508 | 0 | } |
6509 | 65.4k | assert(cfd->initialized()); |
6510 | 65.4k | cfd->RecoverEpochNumbers(); |
6511 | 65.4k | } |
6512 | 48.4k | } |
6513 | | |
6514 | | Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families, |
6515 | | const std::string& dbname, |
6516 | 0 | FileSystem* fs) { |
6517 | | // Read "CURRENT" file, which contains a pointer to the current manifest file |
6518 | 0 | std::string manifest_path; |
6519 | 0 | uint64_t manifest_file_number; |
6520 | 0 | Status s = GetCurrentManifestPath(dbname, fs, /*is_retry=*/false, |
6521 | 0 | &manifest_path, &manifest_file_number); |
6522 | 0 | if (!s.ok()) { |
6523 | 0 | return s; |
6524 | 0 | } |
6525 | 0 | return ListColumnFamiliesFromManifest(manifest_path, fs, column_families); |
6526 | 0 | } |
6527 | | |
6528 | | Status VersionSet::ListColumnFamiliesFromManifest( |
6529 | | const std::string& manifest_path, FileSystem* fs, |
6530 | 0 | std::vector<std::string>* column_families) { |
6531 | | // TODO: plumb Env::IOActivity, Env::IOPriority |
6532 | 0 | const ReadOptions read_options; |
6533 | 0 | std::unique_ptr<SequentialFileReader> file_reader; |
6534 | 0 | Status s; |
6535 | 0 | { |
6536 | 0 | std::unique_ptr<FSSequentialFile> file; |
6537 | | // these are just for performance reasons, not correctness, |
6538 | | // so we're fine using the defaults |
6539 | 0 | s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr); |
6540 | 0 | if (!s.ok()) { |
6541 | 0 | return s; |
6542 | 0 | } |
6543 | 0 | file_reader = std::make_unique<SequentialFileReader>( |
6544 | 0 | std::move(file), manifest_path, /*io_tracer=*/nullptr); |
6545 | 0 | } |
6546 | | |
6547 | 0 | VersionSet::LogReporter reporter; |
6548 | 0 | reporter.status = &s; |
6549 | 0 | log::Reader reader(nullptr, std::move(file_reader), &reporter, |
6550 | 0 | true /* checksum */, 0 /* log_number */); |
6551 | |
6552 | 0 | ListColumnFamiliesHandler handler(read_options); |
6553 | 0 | handler.Iterate(reader, &s); |
6554 | |
6555 | 0 | assert(column_families); |
6556 | 0 | column_families->clear(); |
6557 | 0 | if (handler.status().ok()) { |
6558 | 0 | for (const auto& iter : handler.GetColumnFamilyNames()) { |
6559 | 0 | column_families->push_back(iter.second); |
6560 | 0 | } |
6561 | 0 | } |
6562 | |
6563 | 0 | return handler.status(); |
6564 | 0 | } |
6565 | | |
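Callers normally reach this code through the public static DB API rather than through VersionSet directly. A short usage sketch, assuming an existing database at the hypothetical path "/tmp/mydb":

    #include <cstdio>
    #include <string>
    #include <vector>

    #include "rocksdb/db.h"

    void PrintColumnFamilies() {
      std::vector<std::string> cfs;
      rocksdb::Status s = rocksdb::DB::ListColumnFamilies(
          rocksdb::DBOptions(), "/tmp/mydb", &cfs);
      if (s.ok()) {
        for (const auto& name : cfs) {
          std::printf("%s\n", name.c_str());  // includes "default"
        }
      }
    }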
6566 | | Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, |
6567 | | const Options* options, |
6568 | | const FileOptions& file_options, |
6569 | 0 | int new_levels) { |
6570 | 0 | if (new_levels <= 1) { |
6571 | 0 | return Status::InvalidArgument( |
6572 | 0 | "Number of levels needs to be bigger than 1"); |
6573 | 0 | } |
6574 | | |
6575 | | // TODO: plumb Env::IOActivity, Env::IOPriority |
6576 | 0 | const ReadOptions read_options; |
6577 | 0 | const WriteOptions write_options; |
6578 | |
6579 | 0 | ImmutableDBOptions db_options(*options); |
6580 | 0 | ColumnFamilyOptions cf_options(*options); |
6581 | 0 | std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10, |
6582 | 0 | options->table_cache_numshardbits)); |
6583 | 0 | WriteController wc(options->delayed_write_rate); |
6584 | 0 | WriteBufferManager wb(options->db_write_buffer_size); |
6585 | 0 | VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, |
6586 | 0 | nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, |
6587 | 0 | /*db_id*/ "", |
6588 | 0 | /*db_session_id*/ "", options->daily_offpeak_time_utc, |
6589 | 0 | /*error_handler_*/ nullptr, /*unchanging=*/false); |
6590 | 0 | Status status; |
6591 | |
6592 | 0 | std::vector<ColumnFamilyDescriptor> dummy; |
6593 | 0 | ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, |
6594 | 0 | ColumnFamilyOptions(*options)); |
6595 | 0 | dummy.push_back(dummy_descriptor); |
6596 | 0 | status = versions.Recover(dummy); |
6597 | 0 | if (!status.ok()) { |
6598 | 0 | return status; |
6599 | 0 | } |
6600 | | |
6601 | 0 | Version* current_version = |
6602 | 0 | versions.GetColumnFamilySet()->GetDefault()->current(); |
6603 | 0 | auto* vstorage = current_version->storage_info(); |
6604 | 0 | int current_levels = vstorage->num_levels(); |
6605 | |
6606 | 0 | if (current_levels <= new_levels) { |
6607 | 0 | return Status::OK(); |
6608 | 0 | } |
6609 | | |
6610 | | // Make sure there are files on only one level from |
6611 | | // (new_levels-1) to (current_levels-1) |
6612 | 0 | int first_nonempty_level = -1; |
6613 | 0 | int first_nonempty_level_filenum = 0; |
6614 | 0 | for (int i = new_levels - 1; i < current_levels; i++) { |
6615 | 0 | int file_num = vstorage->NumLevelFiles(i); |
6616 | 0 | if (file_num != 0) { |
6617 | 0 | if (first_nonempty_level < 0) { |
6618 | 0 | first_nonempty_level = i; |
6619 | 0 | first_nonempty_level_filenum = file_num; |
6620 | 0 | } else { |
6621 | 0 | char msg[255]; |
6622 | 0 | snprintf(msg, sizeof(msg), |
6623 | 0 | "Found at least two levels containing files: " |
6624 | 0 | "[%d:%d],[%d:%d].\n", |
6625 | 0 | first_nonempty_level, first_nonempty_level_filenum, i, |
6626 | 0 | file_num); |
6627 | 0 | return Status::InvalidArgument(msg); |
6628 | 0 | } |
6629 | 0 | } |
6630 | 0 | } |
6631 | | |
6632 | | // We need to allocate an array sized to the old number of levels to |
6633 | | // avoid a SIGSEGV in WriteCurrentStateToManifest(); however, all levels |
6634 | | // greater than or equal to new_levels will be empty. |
6635 | 0 | std::vector<FileMetaData*>* new_files_list = |
6636 | 0 | new std::vector<FileMetaData*>[current_levels]; |
6637 | 0 | for (int i = 0; i < new_levels - 1; i++) { |
6638 | 0 | new_files_list[i] = vstorage->LevelFiles(i); |
6639 | 0 | } |
6640 | |
6641 | 0 | if (first_nonempty_level > 0) { |
6642 | 0 | auto& new_last_level = new_files_list[new_levels - 1]; |
6643 | |
6644 | 0 | new_last_level = vstorage->LevelFiles(first_nonempty_level); |
6645 | |
6646 | 0 | for (size_t i = 0; i < new_last_level.size(); ++i) { |
6647 | 0 | const FileMetaData* const meta = new_last_level[i]; |
6648 | 0 | assert(meta); |
6649 | |
6650 | 0 | const uint64_t file_number = meta->fd.GetNumber(); |
6651 | |
6652 | 0 | vstorage->file_locations_[file_number] = |
6653 | 0 | VersionStorageInfo::FileLocation(new_levels - 1, i); |
6654 | 0 | } |
6655 | 0 | } |
6656 | |
6657 | 0 | delete[] vstorage->files_; |
6658 | 0 | vstorage->files_ = new_files_list; |
6659 | 0 | vstorage->num_levels_ = new_levels; |
6660 | 0 | vstorage->ResizeCompactCursors(new_levels); |
6661 | |
6662 | 0 | VersionEdit ve; |
6663 | 0 | InstrumentedMutex dummy_mutex; |
6664 | 0 | InstrumentedMutexLock l(&dummy_mutex); |
6665 | 0 | return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), |
6666 | 0 | read_options, write_options, &ve, &dummy_mutex, |
6667 | 0 | nullptr, true); |
6668 | 0 | } |
6669 | | |
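ReduceNumberOfLevels only proceeds when at most one level in the tail range [new_levels-1, current_levels-1] holds files, because all of those files are moved to the new last level as a unit. The precondition in isolation, over a hypothetical files-per-level vector (assumes new_levels > 1, as checked above):

    #include <cstddef>
    #include <vector>

    bool TailHasAtMostOneNonEmptyLevel(const std::vector<int>& files_per_level,
                                       int new_levels) {
      int non_empty = 0;
      for (size_t i = static_cast<size_t>(new_levels) - 1;
           i < files_per_level.size(); ++i) {
        non_empty += (files_per_level[i] != 0) ? 1 : 0;
      }
      return non_empty <= 1;
    }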
6670 | | // Get the checksum information, including the checksum and checksum function |
6671 | | // name, of all SST and blob files in the VersionSet. Store the information in |
6672 | | // FileChecksumList, which contains a map from file number to its checksum info. |
6673 | | // If the DB is not running, make sure to call VersionSet::Recover() to load the |
6674 | | // file metadata from the Manifest into the VersionSet before calling this function. |
6675 | 0 | Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { |
6676 | | // Clean the previously stored checksum information if any. |
6677 | 0 | Status s; |
6678 | 0 | if (checksum_list == nullptr) { |
6679 | 0 | s = Status::InvalidArgument("checksum_list is nullptr"); |
6680 | 0 | return s; |
6681 | 0 | } |
6682 | 0 | checksum_list->reset(); |
6683 | |
6684 | 0 | for (auto cfd : *column_family_set_) { |
6685 | 0 | assert(cfd); |
6686 | |
6687 | 0 | if (cfd->IsDropped() || !cfd->initialized()) { |
6688 | 0 | continue; |
6689 | 0 | } |
6690 | | |
6691 | 0 | const auto* current = cfd->current(); |
6692 | 0 | assert(current); |
6693 | |
6694 | 0 | const auto* vstorage = current->storage_info(); |
6695 | 0 | assert(vstorage); |
6696 | | |
6697 | | /* SST files */ |
6698 | 0 | for (int level = 0; level < cfd->NumberLevels(); level++) { |
6699 | 0 | const auto& level_files = vstorage->LevelFiles(level); |
6700 | |
6701 | 0 | for (const auto& file : level_files) { |
6702 | 0 | assert(file); |
6703 | |
6704 | 0 | s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), |
6705 | 0 | file->file_checksum, |
6706 | 0 | file->file_checksum_func_name); |
6707 | 0 | if (!s.ok()) { |
6708 | 0 | return s; |
6709 | 0 | } |
6710 | 0 | } |
6711 | 0 | } |
6712 | | |
6713 | | /* Blob files */ |
6714 | 0 | const auto& blob_files = vstorage->GetBlobFiles(); |
6715 | 0 | for (const auto& meta : blob_files) { |
6716 | 0 | assert(meta); |
6717 | |
6718 | 0 | std::string checksum_value = meta->GetChecksumValue(); |
6719 | 0 | std::string checksum_method = meta->GetChecksumMethod(); |
6720 | 0 | assert(checksum_value.empty() == checksum_method.empty()); |
6721 | 0 | if (meta->GetChecksumMethod().empty()) { |
6722 | 0 | checksum_value = kUnknownFileChecksum; |
6723 | 0 | checksum_method = kUnknownFileChecksumFuncName; |
6724 | 0 | } |
6725 | |
6726 | 0 | s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(), |
6727 | 0 | checksum_value, checksum_method); |
6728 | 0 | if (!s.ok()) { |
6729 | 0 | return s; |
6730 | 0 | } |
6731 | 0 | } |
6732 | 0 | } |
6733 | | |
6734 | 0 | return s; |
6735 | 0 | } |
6736 | | |
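From the application side this surfaces as DB::GetLiveFilesChecksumInfo. A hedged usage sketch, assuming the NewFileChecksumList() factory and the GetAllFileChecksums() accessor declared in rocksdb/file_checksum.h, and an already-open DB* db:

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <vector>

    #include "rocksdb/db.h"
    #include "rocksdb/file_checksum.h"

    void DumpLiveFileChecksums(rocksdb::DB* db) {
      std::unique_ptr<rocksdb::FileChecksumList> list(
          rocksdb::NewFileChecksumList());
      if (db->GetLiveFilesChecksumInfo(list.get()).ok()) {
        std::vector<uint64_t> file_numbers;
        std::vector<std::string> checksums, func_names;
        if (list->GetAllFileChecksums(&file_numbers, &checksums, &func_names)
                .ok()) {
          // file_numbers[i] pairs with checksums[i] and func_names[i]
        }
      }
    }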
6737 | | Status VersionSet::DumpManifest( |
6738 | | Options& options, std::string& dscname, bool verbose, bool hex, bool json, |
6739 | 0 | const std::vector<ColumnFamilyDescriptor>& cf_descs) { |
6740 | 0 | assert(options.env); |
6741 | | // TODO: plumb Env::IOActivity, Env::IOPriority |
6742 | 0 | const ReadOptions read_options; |
6743 | |
6744 | 0 | std::vector<std::string> column_families; |
6745 | 0 | Status s = ListColumnFamiliesFromManifest( |
6746 | 0 | dscname, options.env->GetFileSystem().get(), &column_families); |
6747 | 0 | if (!s.ok()) { |
6748 | 0 | return s; |
6749 | 0 | } |
6750 | | |
6751 | | // Open the specified manifest file. |
6752 | 0 | std::unique_ptr<SequentialFileReader> file_reader; |
6753 | 0 | { |
6754 | 0 | std::unique_ptr<FSSequentialFile> file; |
6755 | 0 | const std::shared_ptr<FileSystem>& fs = options.env->GetFileSystem(); |
6756 | 0 | s = fs->NewSequentialFile( |
6757 | 0 | dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr); |
6758 | 0 | if (!s.ok()) { |
6759 | 0 | return s; |
6760 | 0 | } |
6761 | 0 | file_reader = std::make_unique<SequentialFileReader>( |
6762 | 0 | std::move(file), dscname, db_options_->log_readahead_size, io_tracer_); |
6763 | 0 | } |
6764 | | |
6765 | 0 | std::map<std::string, const ColumnFamilyDescriptor*> cf_name_to_desc; |
6766 | 0 | for (const auto& cf_desc : cf_descs) { |
6767 | 0 | cf_name_to_desc[cf_desc.name] = &cf_desc; |
6768 | 0 | } |
6769 | 0 | std::vector<ColumnFamilyDescriptor> final_cf_descs; |
6770 | 0 | for (const auto& cf : column_families) { |
6771 | 0 | const auto iter = cf_name_to_desc.find(cf); |
6772 | 0 | if (iter != cf_name_to_desc.cend()) { |
6773 | 0 | final_cf_descs.push_back(*iter->second); |
6774 | 0 | } else { |
6775 | 0 | final_cf_descs.emplace_back(cf, options); |
6776 | 0 | } |
6777 | 0 | } |
6778 | |
6779 | 0 | DumpManifestHandler handler(final_cf_descs, this, io_tracer_, read_options, |
6780 | 0 | verbose, hex, json); |
6781 | 0 | { |
6782 | 0 | VersionSet::LogReporter reporter; |
6783 | 0 | reporter.status = &s; |
6784 | 0 | log::Reader reader(nullptr, std::move(file_reader), &reporter, |
6785 | 0 | true /* checksum */, 0 /* log_number */); |
6786 | 0 | handler.Iterate(reader, &s); |
6787 | 0 | } |
6788 | |
6789 | 0 | return handler.status(); |
6790 | 0 | } |
6791 | | |
6792 | 175k | void VersionSet::MarkFileNumberUsed(uint64_t number) { |
6793 | | // Only called during recovery and repair, which are single-threaded, so this |
6794 | | // works because there can't be concurrent calls. |
6795 | 175k | if (next_file_number_.load(std::memory_order_relaxed) <= number) { |
6796 | 31.7k | next_file_number_.store(number + 1, std::memory_order_relaxed); |
6797 | 31.7k | } |
6798 | 175k | } |
6799 | | // Called only either from ::LogAndApply which is protected by mutex or during |
6800 | | // recovery which is single-threaded. |
6801 | 89.3k | void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { |
6802 | 89.3k | if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) { |
6803 | 71.5k | min_log_number_to_keep_.store(number, std::memory_order_relaxed); |
6804 | 71.5k | } |
6805 | 89.3k | } |
6806 | | |
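Both markers follow the same idiom: a monotonically increasing watermark stored in an atomic but updated with relaxed ordering, which is only correct because writers are externally serialized (the DB mutex in LogAndApply, or single-threaded recovery and repair); the atomic exists so concurrent readers are well-defined. The idiom in isolation:

    #include <atomic>
    #include <cstdint>

    class Watermark {
     public:
      // Writers must be serialized by a mutex or single-threadedness;
      // the check-then-store below is not atomic as a whole.
      void RaiseTo(uint64_t n) {
        if (value_.load(std::memory_order_relaxed) < n) {
          value_.store(n, std::memory_order_relaxed);
        }
      }
      uint64_t Get() const { return value_.load(std::memory_order_relaxed); }

     private:
      std::atomic<uint64_t> value_{0};
    };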
6807 | | Status VersionSet::WriteCurrentStateToManifest( |
6808 | | const WriteOptions& write_options, |
6809 | | const std::unordered_map<uint32_t, MutableCFState>& curr_state, |
6810 | 48.4k | const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { |
6811 | | // TODO: Break up into multiple records to reduce memory usage on recovery? |
6812 | | |
6813 | | // WARNING: This method doesn't hold a mutex!! |
6814 | | |
6815 | | // This is done without the DB mutex held, but only within single-threaded |
6816 | | // LogAndApply. Column family manipulations can only happen within LogAndApply |
6817 | | // (the same single thread), so we're safe to iterate. |
6818 | | |
6819 | 48.4k | assert(io_s.ok()); |
6820 | 48.4k | if (db_options_->write_dbid_to_manifest) { |
6821 | 48.4k | VersionEdit edit_for_db_id; |
6822 | 48.4k | assert(!db_id_.empty()); |
6823 | 48.4k | edit_for_db_id.SetDBId(db_id_); |
6824 | 48.4k | std::string db_id_record; |
6825 | 48.4k | if (!edit_for_db_id.EncodeTo(&db_id_record)) { |
6826 | 0 | return Status::Corruption("Unable to Encode VersionEdit:" + |
6827 | 0 | edit_for_db_id.DebugString(true)); |
6828 | 0 | } |
6829 | 48.4k | io_s = log->AddRecord(write_options, db_id_record); |
6830 | 48.4k | if (!io_s.ok()) { |
6831 | 0 | return io_s; |
6832 | 0 | } |
6833 | 48.4k | } |
6834 | | |
6835 | | // Save WALs. |
6836 | 48.4k | if (!wal_additions.GetWalAdditions().empty()) { |
6837 | 0 | TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal", |
6838 | 0 | const_cast<VersionEdit*>(&wal_additions)); |
6839 | 0 | std::string record; |
6840 | 0 | if (!wal_additions.EncodeTo(&record)) { |
6841 | 0 | return Status::Corruption("Unable to Encode VersionEdit: " + |
6842 | 0 | wal_additions.DebugString(true)); |
6843 | 0 | } |
6844 | 0 | io_s = log->AddRecord(write_options, record); |
6845 | 0 | if (!io_s.ok()) { |
6846 | 0 | return io_s; |
6847 | 0 | } |
6848 | 0 | } |
6849 | | |
6850 | | // The new manifest should roll over the WAL deletion record from the previous |
6851 | | // manifest. Otherwise, when an addition record for a deleted WAL gets added to |
6852 | | // this new manifest later (which can happen in, e.g., SyncWAL()), the new |
6853 | | // manifest creates the illusion that such a WAL hasn't been deleted. |
6854 | 48.4k | VersionEdit wal_deletions; |
6855 | 48.4k | wal_deletions.DeleteWalsBefore(min_log_number_to_keep()); |
6856 | 48.4k | std::string wal_deletions_record; |
6857 | 48.4k | if (!wal_deletions.EncodeTo(&wal_deletions_record)) { |
6858 | 0 | return Status::Corruption("Unable to Encode VersionEdit: " + |
6859 | 0 | wal_deletions.DebugString(true)); |
6860 | 0 | } |
6861 | 48.4k | io_s = log->AddRecord(write_options, wal_deletions_record); |
6862 | 48.4k | if (!io_s.ok()) { |
6863 | 0 | return io_s; |
6864 | 0 | } |
6865 | | |
6866 | 65.4k | for (auto cfd : *column_family_set_) { |
6867 | 65.4k | assert(cfd); |
6868 | | |
6869 | 65.4k | if (cfd->IsDropped()) { |
6870 | 0 | continue; |
6871 | 0 | } |
6872 | 65.4k | assert(cfd->initialized()); |
6873 | 65.4k | { |
6874 | | // Store column family info |
6875 | 65.4k | VersionEdit edit; |
6876 | 65.4k | if (cfd->GetID() != 0) { |
6877 | | // default column family is always there, |
6878 | | // no need to explicitly write it |
6879 | 17.0k | edit.AddColumnFamily(cfd->GetName()); |
6880 | 17.0k | edit.SetColumnFamily(cfd->GetID()); |
6881 | 17.0k | } |
6882 | 65.4k | edit.SetComparatorName( |
6883 | 65.4k | cfd->internal_comparator().user_comparator()->Name()); |
6884 | 65.4k | edit.SetPersistUserDefinedTimestamps( |
6885 | 65.4k | cfd->ioptions().persist_user_defined_timestamps); |
6886 | 65.4k | std::string record; |
6887 | 65.4k | if (!edit.EncodeTo(&record)) { |
6888 | 0 | return Status::Corruption("Unable to Encode VersionEdit:" + |
6889 | 0 | edit.DebugString(true)); |
6890 | 0 | } |
6891 | 65.4k | io_s = log->AddRecord(write_options, record); |
6892 | 65.4k | if (!io_s.ok()) { |
6893 | 0 | return io_s; |
6894 | 0 | } |
6895 | 65.4k | } |
6896 | | |
6897 | 65.4k | { |
6898 | | // Save files |
6899 | 65.4k | VersionEdit edit; |
6900 | 65.4k | edit.SetColumnFamily(cfd->GetID()); |
6901 | | |
6902 | 65.4k | const auto* current = cfd->current(); |
6903 | 65.4k | assert(current); |
6904 | | |
6905 | 65.4k | const auto* vstorage = current->storage_info(); |
6906 | 65.4k | assert(vstorage); |
6907 | | |
6908 | 523k | for (int level = 0; level < cfd->NumberLevels(); level++) { |
6909 | 458k | const auto& level_files = vstorage->LevelFiles(level); |
6910 | | |
6911 | 458k | for (const auto& f : level_files) { |
6912 | 76.8k | assert(f); |
6913 | | |
6914 | 76.8k | edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), |
6915 | 76.8k | f->fd.GetFileSize(), f->smallest, f->largest, |
6916 | 76.8k | f->fd.smallest_seqno, f->fd.largest_seqno, |
6917 | 76.8k | f->marked_for_compaction, f->temperature, |
6918 | 76.8k | f->oldest_blob_file_number, f->oldest_ancester_time, |
6919 | 76.8k | f->file_creation_time, f->epoch_number, f->file_checksum, |
6920 | 76.8k | f->file_checksum_func_name, f->unique_id, |
6921 | 76.8k | f->compensated_range_deletion_size, f->tail_size, |
6922 | 76.8k | f->user_defined_timestamps_persisted); |
6923 | 76.8k | } |
6924 | 458k | } |
6925 | | |
6926 | 65.4k | edit.SetCompactCursors(vstorage->GetCompactCursors()); |
6927 | | |
6928 | 65.4k | const auto& blob_files = vstorage->GetBlobFiles(); |
6929 | 65.4k | for (const auto& meta : blob_files) { |
6930 | 0 | assert(meta); |
6931 | |
6932 | 0 | const uint64_t blob_file_number = meta->GetBlobFileNumber(); |
6933 | |
6934 | 0 | edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(), |
6935 | 0 | meta->GetTotalBlobBytes(), meta->GetChecksumMethod(), |
6936 | 0 | meta->GetChecksumValue()); |
6937 | 0 | if (meta->GetGarbageBlobCount() > 0) { |
6938 | 0 | edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(), |
6939 | 0 | meta->GetGarbageBlobBytes()); |
6940 | 0 | } |
6941 | 0 | } |
6942 | | |
6943 | 65.4k | const auto iter = curr_state.find(cfd->GetID()); |
6944 | 65.4k | assert(iter != curr_state.end()); |
6945 | 65.4k | uint64_t log_number = iter->second.log_number; |
6946 | 65.4k | edit.SetLogNumber(log_number); |
6947 | | |
6948 | 65.4k | if (cfd->GetID() == 0) { |
6949 | | // min_log_number_to_keep is for the whole DB, not for a specific column |
6950 | | // family, so it does not need to be set for every column family; it just |
6951 | | // needs to be set once. Since the default CF can never be dropped, we |
6952 | | // record the min_log on the default CF here. |
6953 | 48.4k | uint64_t min_log = min_log_number_to_keep(); |
6954 | 48.4k | if (min_log != 0) { |
6955 | 30.5k | edit.SetMinLogNumberToKeep(min_log); |
6956 | 30.5k | } |
6957 | 48.4k | } |
6958 | | |
6959 | 65.4k | const std::string& full_history_ts_low = iter->second.full_history_ts_low; |
6960 | 65.4k | if (!full_history_ts_low.empty()) { |
6961 | 0 | edit.SetFullHistoryTsLow(full_history_ts_low); |
6962 | 0 | } |
6963 | | |
6964 | 65.4k | edit.SetLastSequence(descriptor_last_sequence_); |
6965 | | |
6966 | 65.4k | const Comparator* ucmp = cfd->user_comparator(); |
6967 | 65.4k | assert(ucmp); |
6968 | 65.4k | std::string record; |
6969 | 65.4k | if (!edit.EncodeTo(&record, ucmp->timestamp_size())) { |
6970 | 0 | return Status::Corruption("Unable to Encode VersionEdit:" + |
6971 | 0 | edit.DebugString(true)); |
6972 | 0 | } |
6973 | 65.4k | io_s = log->AddRecord(write_options, record); |
6974 | 65.4k | if (!io_s.ok()) { |
6975 | 0 | return io_s; |
6976 | 0 | } |
6977 | 65.4k | } |
6978 | 65.4k | } |
6979 | 48.4k | return Status::OK(); |
6980 | 48.4k | } |
6981 | | |
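Every block in the snapshot writer above repeats one step: serialize a VersionEdit into a string record, append it to the manifest log, and fail fast on either error. Factored out as a hypothetical helper (not present in this file; names mirror the surrounding code, and the timestamp-aware EncodeTo overload used for per-CF records is omitted):

    Status AppendEdit(VersionEdit& edit, const WriteOptions& write_options,
                      log::Writer* log, IOStatus& io_s) {
      std::string record;
      if (!edit.EncodeTo(&record)) {
        return Status::Corruption("Unable to Encode VersionEdit:" +
                                  edit.DebugString(true));
      }
      io_s = log->AddRecord(write_options, record);
      if (!io_s.ok()) {
        return io_s;
      }
      return Status::OK();
    }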
6982 | | // TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this |
6983 | | // function is called repeatedly with consecutive pairs of slices. For example, |
6984 | | // if the slice list is [a, b, c, d], this function is called with arguments |
6985 | | // (a,b), then (b,c), then (c,d). Knowing this, an optimization is possible: |
6986 | | // avoid doing the binary search for the keys b and c twice by maintaining |
6987 | | // state about where they first appear in the files. |
6988 | | uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, |
6989 | | const ReadOptions& read_options, |
6990 | | Version* v, const Slice& start, |
6991 | | const Slice& end, int start_level, |
6992 | 135k | int end_level, TableReaderCaller caller) { |
6993 | 135k | const auto& icmp = v->cfd_->internal_comparator(); |
6994 | | |
6995 | | // pre-condition |
6996 | 135k | assert(icmp.Compare(start, end) <= 0); |
6997 | | |
6998 | 135k | uint64_t total_full_size = 0; |
6999 | 135k | const auto* vstorage = v->storage_info(); |
7000 | 135k | const int num_non_empty_levels = vstorage->num_non_empty_levels(); |
7001 | 135k | end_level = (end_level == -1) ? num_non_empty_levels |
7002 | 135k | : std::min(end_level, num_non_empty_levels); |
7003 | 135k | if (end_level <= start_level) { |
7004 | 135k | return 0; |
7005 | 135k | } |
7006 | | |
7007 | | // Outline of the optimization that uses options.files_size_error_margin. |
7008 | | // When approximating the total size of the files that store a key range, |
7009 | | // we first sum up the sizes of the files that fully fall into the range. |
7010 | | // Then we sum up the sizes of all the files that may intersect with the range |
7011 | | // (this includes all files in L0 as well). Then, if total_intersecting_size |
7012 | | // is smaller than total_full_size * options.files_size_error_margin, we can |
7013 | | // infer that the intersecting files make a sufficiently negligible |
7014 | | // contribution to the total size, and we can approximate the storage required |
7015 | | // for the keys in the range as just half of total_intersecting_size. |
7016 | | // E.g., if the value of files_size_error_margin is 0.1, then the error of the |
7017 | | // approximation is limited to only ~10% of the total size of the files that |
7018 | | // fully fall into the key range. In such a case, this helps to avoid the |
7019 | | // costly process of binary searching the intersecting files, which is |
7020 | | // required only for a more precise calculation of the total size. |
7021 | | |
7022 | 0 | autovector<FdWithKeyRange*, 32> first_files; |
7023 | 0 | autovector<FdWithKeyRange*, 16> last_files; |
7024 | | |
7025 | | // scan all the levels |
7026 | 0 | for (int level = start_level; level < end_level; ++level) { |
7027 | 0 | const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); |
7028 | 0 | if (files_brief.num_files == 0) { |
7029 | | // empty level, skip exploration |
7030 | 0 | continue; |
7031 | 0 | } |
7032 | | |
7033 | 0 | if (level == 0) { |
7034 | | // level 0 files are not in sorted order; we need to iterate through |
7035 | | // the list to compute the total bytes that require scanning, |
7036 | | // so handle the case explicitly (similarly to the first_files case) |
7037 | 0 | for (size_t i = 0; i < files_brief.num_files; i++) { |
7038 | 0 | first_files.push_back(&files_brief.files[i]); |
7039 | 0 | } |
7040 | 0 | continue; |
7041 | 0 | } |
7042 | | |
7043 | 0 | assert(level > 0); |
7044 | 0 | assert(files_brief.num_files > 0); |
7045 | | |
7046 | | // identify the file position for start key |
7047 | 0 | const int idx_start = |
7048 | 0 | FindFileInRange(icmp, files_brief, start, 0, |
7049 | 0 | static_cast<uint32_t>(files_brief.num_files - 1)); |
7050 | 0 | assert(static_cast<size_t>(idx_start) < files_brief.num_files); |
7051 | | |
7052 | | // identify the file position for end key |
7053 | 0 | int idx_end = idx_start; |
7054 | 0 | if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { |
7055 | 0 | idx_end = |
7056 | 0 | FindFileInRange(icmp, files_brief, end, idx_start, |
7057 | 0 | static_cast<uint32_t>(files_brief.num_files - 1)); |
7058 | 0 | } |
7059 | 0 | assert(idx_end >= idx_start && |
7060 | 0 | static_cast<size_t>(idx_end) < files_brief.num_files); |
7061 | | |
7062 | | // scan all files from the starting index to the ending index |
7063 | | // (inferred from the sorted order) |
7064 | | |
7065 | | // first scan all the intermediate full files (excluding first and last) |
7066 | 0 | for (int i = idx_start + 1; i < idx_end; ++i) { |
7067 | 0 | uint64_t file_size = files_brief.files[i].fd.GetFileSize(); |
7068 | | // The entire file falls into the range, so we can just take its size. |
7069 | 0 | assert(file_size == ApproximateSize(read_options, v, files_brief.files[i], |
7070 | 0 | start, end, caller)); |
7071 | 0 | total_full_size += file_size; |
7072 | 0 | } |
7073 | | |
7074 | | // save the first and the last files (which may be the same file), so we |
7075 | | // can scan them later. |
7076 | 0 | first_files.push_back(&files_brief.files[idx_start]); |
7077 | 0 | if (idx_start != idx_end) { |
7078 | | // we need to estimate size for both files, only if they are different |
7079 | 0 | last_files.push_back(&files_brief.files[idx_end]); |
7080 | 0 | } |
7081 | 0 | } |
7082 | | |
7083 | | // The sum of all file sizes that intersect the [start, end] keys range. |
7084 | 0 | uint64_t total_intersecting_size = 0; |
7085 | 0 | for (const auto* file_ptr : first_files) { |
7086 | 0 | total_intersecting_size += file_ptr->fd.GetFileSize(); |
7087 | 0 | } |
7088 | 0 | for (const auto* file_ptr : last_files) { |
7089 | 0 | total_intersecting_size += file_ptr->fd.GetFileSize(); |
7090 | 0 | } |
7091 | | |
7092 | | // Now scan all the first & last files at each level, and estimate their size. |
7093 | | // If the total_intersecting_size is less than X% of the total_full_size, we |
7094 | | // want to approximate the result in order to avoid the costly binary search |
7095 | | // inside ApproximateSize. We use half of the file size as an approximation below. |
7096 | |
7097 | 0 | const double margin = options.files_size_error_margin; |
7098 | 0 | if (margin > 0 && total_intersecting_size < |
7099 | 0 | static_cast<uint64_t>(total_full_size * margin)) { |
7100 | 0 | total_full_size += total_intersecting_size / 2; |
7101 | 0 | } else { |
7102 | | // Estimate for all the first files (might also be last files), at each |
7103 | | // level |
7104 | 0 | for (const auto file_ptr : first_files) { |
7105 | 0 | total_full_size += |
7106 | 0 | ApproximateSize(read_options, v, *file_ptr, start, end, caller); |
7107 | 0 | } |
7108 | | |
7109 | | // Estimate for all the last files, at each level |
7110 | 0 | for (const auto file_ptr : last_files) { |
7111 | | // We could use ApproximateSize here, but calling ApproximateOffsetOf |
7112 | | // directly is just more efficient. |
7113 | 0 | total_full_size += |
7114 | 0 | ApproximateOffsetOf(read_options, v, *file_ptr, end, caller); |
7115 | 0 | } |
7116 | 0 | } |
7117 | |
7118 | 0 | return total_full_size; |
7119 | 135k | } |
7120 | | |
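A worked instance of the margin cutoff above, with hypothetical sizes:

    #include <cstdint>

    // Files fully inside [start, end] sum to 100 MB; boundary and L0 files
    // that merely intersect the range sum to 8 MB; margin is 0.1.
    constexpr uint64_t kFull = 100ull << 20;         // total_full_size
    constexpr uint64_t kIntersecting = 8ull << 20;   // total_intersecting_size
    constexpr double kMargin = 0.1;                  // files_size_error_margin

    // 8 MB < 100 MB * 0.1 = 10 MB, so the per-file binary searches are
    // skipped and half the intersecting bytes are charged:
    // estimate = 100 MB + 8 MB / 2 = 104 MB.
    constexpr uint64_t kEstimate =
        kIntersecting < static_cast<uint64_t>(kFull * kMargin)
            ? kFull + kIntersecting / 2
            : 0;  // otherwise the precise path runs instead
    static_assert(kEstimate == (104ull << 20), "margin shortcut taken");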
7121 | | uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, |
7122 | | Version* v, const FdWithKeyRange& f, |
7123 | | const Slice& key, |
7124 | 0 | TableReaderCaller caller) { |
7125 | | // pre-condition |
7126 | 0 | assert(v); |
7127 | 0 | const auto& icmp = v->cfd_->internal_comparator(); |
7128 | |
7129 | 0 | uint64_t result = 0; |
7130 | 0 | if (icmp.Compare(f.largest_key, key) <= 0) { |
7131 | | // Entire file is before "key", so just add the file size |
7132 | 0 | result = f.fd.GetFileSize(); |
7133 | 0 | } else if (icmp.Compare(f.smallest_key, key) > 0) { |
7134 | | // Entire file is after "key", so ignore |
7135 | 0 | result = 0; |
7136 | 0 | } else { |
7137 | | // "key" falls in the range for this table. Add the |
7138 | | // approximate offset of "key" within the table. |
7139 | 0 | TableCache* table_cache = v->cfd_->table_cache(); |
7140 | 0 | const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); |
7141 | 0 | if (table_cache != nullptr) { |
7142 | 0 | result = table_cache->ApproximateOffsetOf( |
7143 | 0 | read_options, key, *f.file_metadata, caller, icmp, cf_opts); |
7144 | 0 | } |
7145 | 0 | } |
7146 | 0 | return result; |
7147 | 0 | } |
7148 | | |
7149 | | uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, |
7150 | | Version* v, const FdWithKeyRange& f, |
7151 | | const Slice& start, const Slice& end, |
7152 | 0 | TableReaderCaller caller) { |
7153 | | // pre-condition |
7154 | 0 | assert(v); |
7155 | 0 | const auto& icmp = v->cfd_->internal_comparator(); |
7156 | 0 | assert(icmp.Compare(start, end) <= 0); |
7157 | |
|
7158 | 0 | if (icmp.Compare(f.largest_key, start) <= 0 || |
7159 | 0 | icmp.Compare(f.smallest_key, end) > 0) { |
7160 | | // Entire file is before or after the start/end keys range |
7161 | 0 | return 0; |
7162 | 0 | } |
7163 | | |
7164 | 0 | if (icmp.Compare(f.smallest_key, start) >= 0) { |
7165 | | // Start of the range is before the file start - approximate by end offset |
7166 | 0 | return ApproximateOffsetOf(read_options, v, f, end, caller); |
7167 | 0 | } |
7168 | | |
7169 | 0 | if (icmp.Compare(f.largest_key, end) < 0) { |
7170 | | // End of the range is after the file end - approximate by subtracting |
7171 | | // start offset from the file size |
7172 | 0 | uint64_t start_offset = |
7173 | 0 | ApproximateOffsetOf(read_options, v, f, start, caller); |
7174 | 0 | assert(f.fd.GetFileSize() >= start_offset); |
7175 | 0 | return f.fd.GetFileSize() - start_offset; |
7176 | 0 | } |
7177 | | |
7178 | | // The interval falls entirely in the range for this file. |
7179 | 0 | TableCache* table_cache = v->cfd_->table_cache(); |
7180 | 0 | if (table_cache == nullptr) { |
7181 | 0 | return 0; |
7182 | 0 | } |
7183 | 0 | const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); |
7184 | 0 | return table_cache->ApproximateSize(read_options, start, end, |
7185 | 0 | *f.file_metadata, caller, icmp, cf_opts); |
7186 | 0 | } |
7187 | | |
7188 | | void VersionSet::RemoveLiveFiles( |
7189 | | std::vector<ObsoleteFileInfo>& sst_delete_candidates, |
7190 | 25.4k | std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const { |
7191 | 25.4k | assert(column_family_set_); |
7192 | 26.5k | for (auto cfd : *column_family_set_) { |
7193 | 26.5k | assert(cfd); |
7194 | 26.5k | if (!cfd->initialized()) { |
7195 | 0 | continue; |
7196 | 0 | } |
7197 | | |
7198 | 26.5k | auto* current = cfd->current(); |
7199 | 26.5k | bool found_current = false; |
7200 | | |
7201 | 26.5k | Version* const dummy_versions = cfd->dummy_versions(); |
7202 | 26.5k | assert(dummy_versions); |
7203 | | |
7204 | 53.4k | for (Version* v = dummy_versions->next_; v != dummy_versions; |
7205 | 26.8k | v = v->next_) { |
7206 | 26.8k | v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); |
7207 | 26.8k | if (v == current) { |
7208 | 26.5k | found_current = true; |
7209 | 26.5k | } |
7210 | 26.8k | } |
7211 | | |
7212 | 26.5k | if (!found_current && current != nullptr) { |
7213 | | // Should never happen unless it is a bug. |
7214 | 0 | assert(false); |
7215 | 0 | current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); |
7216 | 0 | } |
7217 | 26.5k | } |
7218 | 25.4k | } |
7219 | | |
7220 | | void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files, |
7221 | 96.8k | std::vector<uint64_t>* live_blob_files) const { |
7222 | 96.8k | assert(live_table_files); |
7223 | 96.8k | assert(live_blob_files); |
7224 | | |
7225 | | // pre-calculate space requirement |
7226 | 96.8k | size_t total_table_files = 0; |
7227 | 96.8k | size_t total_blob_files = 0; |
7228 | | |
7229 | 96.8k | assert(column_family_set_); |
7230 | 130k | for (auto cfd : *column_family_set_) { |
7231 | 130k | assert(cfd); |
7232 | | |
7233 | 130k | if (!cfd->initialized()) { |
7234 | 0 | continue; |
7235 | 0 | } |
7236 | | |
7237 | 130k | Version* const dummy_versions = cfd->dummy_versions(); |
7238 | 130k | assert(dummy_versions); |
7239 | | |
7240 | 261k | for (Version* v = dummy_versions->next_; v != dummy_versions; |
7241 | 130k | v = v->next_) { |
7242 | 130k | assert(v); |
7243 | | |
7244 | 130k | const auto* vstorage = v->storage_info(); |
7245 | 130k | assert(vstorage); |
7246 | | |
7247 | 1.04M | for (int level = 0; level < vstorage->num_levels(); ++level) { |
7248 | 916k | total_table_files += vstorage->LevelFiles(level).size(); |
7249 | 916k | } |
7250 | | |
7251 | 130k | total_blob_files += vstorage->GetBlobFiles().size(); |
7252 | 130k | } |
7253 | 130k | } |
7254 | | |
7255 | | // just a one-time extension to the right size |
7256 | 96.8k | live_table_files->reserve(live_table_files->size() + total_table_files); |
7257 | 96.8k | live_blob_files->reserve(live_blob_files->size() + total_blob_files); |
7258 | | |
7259 | 96.8k | assert(column_family_set_); |
7260 | 130k | for (auto cfd : *column_family_set_) { |
7261 | 130k | assert(cfd); |
7262 | 130k | if (!cfd->initialized()) { |
7263 | 0 | continue; |
7264 | 0 | } |
7265 | | |
7266 | 130k | auto* current = cfd->current(); |
7267 | 130k | bool found_current = false; |
7268 | | |
7269 | 130k | Version* const dummy_versions = cfd->dummy_versions(); |
7270 | 130k | assert(dummy_versions); |
7271 | | |
7272 | 261k | for (Version* v = dummy_versions->next_; v != dummy_versions; |
7273 | 130k | v = v->next_) { |
7274 | 130k | v->AddLiveFiles(live_table_files, live_blob_files); |
7275 | 130k | if (v == current) { |
7276 | 130k | found_current = true; |
7277 | 130k | } |
7278 | 130k | } |
7279 | | |
7280 | 130k | if (!found_current && current != nullptr) { |
7281 | | // Should never happen unless it is a bug. |
7282 | 0 | assert(false); |
7283 | 0 | current->AddLiveFiles(live_table_files, live_blob_files); |
7284 | 0 | } |
7285 | 130k | } |
7286 | 96.8k | } |
7287 | | |
7288 | | InternalIterator* VersionSet::MakeInputIterator( |
7289 | | const ReadOptions& read_options, const Compaction* c, |
7290 | | RangeDelAggregator* range_del_agg, |
7291 | | const FileOptions& file_options_compactions, |
7292 | | const std::optional<const Slice>& start, |
7293 | 3.68k | const std::optional<const Slice>& end) { |
7294 | 3.68k | auto cfd = c->column_family_data(); |
7295 | | // Level-0 files have to be merged together. For other levels, |
7296 | | // we will make a concatenating iterator per level. |
7297 | | // TODO(opt): use concatenating iterator for level-0 if there is no overlap |
7298 | 3.68k | const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files + |
7299 | 2.96k | c->num_input_levels() - 1 |
7300 | 3.68k | : c->num_input_levels()); |
7301 | 3.68k | InternalIterator** list = new InternalIterator*[space]; |
7302 | | // The first item in the pair is a pointer to range tombstones. |
7303 | | // The second item is a pointer to a member of a LevelIterator |
7304 | | // that will be initialized to where CompactionMergingIterator stores |
7305 | | // the pointer to its range tombstones. This is used by LevelIterator |
7306 | | // to update its range tombstone pointer as it traverses different SST files. |
7307 | 3.68k | std::vector<std::pair<std::unique_ptr<TruncatedRangeDelIterator>, |
7308 | 3.68k | std::unique_ptr<TruncatedRangeDelIterator>**>> |
7309 | 3.68k | range_tombstones; |
7310 | 3.68k | size_t num = 0; |
7311 | 3.68k | [[maybe_unused]] size_t num_input_files = 0; |
7312 | 9.41k | for (size_t which = 0; which < c->num_input_levels(); which++) { |
7313 | 5.72k | const LevelFilesBrief* flevel = c->input_levels(which); |
7314 | 5.72k | num_input_files += flevel->num_files; |
7315 | 5.72k | if (flevel->num_files != 0) { |
7316 | 5.72k | if (c->level(which) == 0) { |
7317 | 12.4k | for (size_t i = 0; i < flevel->num_files; i++) { |
7318 | 9.44k | const FileMetaData& fmd = *flevel->files[i].file_metadata; |
7319 | 9.44k | if (start.has_value() && |
7320 | 0 | cfd->user_comparator()->CompareWithoutTimestamp( |
7321 | 0 | *start, fmd.largest.user_key()) > 0) { |
7322 | 0 | continue; |
7323 | 0 | } |
7324 | | // We should be able to filter out the case where the end key |
7325 | | // equals the end boundary, since the end key is exclusive. |
7326 | | // We try to be extra safe here. |
7327 | 9.44k | if (end.has_value() && |
7328 | 0 | cfd->user_comparator()->CompareWithoutTimestamp( |
7329 | 0 | *end, fmd.smallest.user_key()) < 0) { |
7330 | 0 | continue; |
7331 | 0 | } |
7332 | 9.44k | std::unique_ptr<TruncatedRangeDelIterator> range_tombstone_iter = |
7333 | 9.44k | nullptr; |
7334 | 9.44k | list[num++] = cfd->table_cache()->NewIterator( |
7335 | 9.44k | read_options, file_options_compactions, |
7336 | 9.44k | cfd->internal_comparator(), fmd, range_del_agg, |
7337 | 9.44k | c->mutable_cf_options(), |
7338 | 9.44k | /*table_reader_ptr=*/nullptr, |
7339 | 9.44k | /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, |
7340 | 9.44k | /*arena=*/nullptr, |
7341 | 9.44k | /*skip_filters=*/false, |
7342 | 9.44k | /*level=*/static_cast<int>(c->level(which)), |
7343 | 9.44k | MaxFileSizeForL0MetaPin(c->mutable_cf_options()), |
7344 | 9.44k | /*smallest_compaction_key=*/nullptr, |
7345 | 9.44k | /*largest_compaction_key=*/nullptr, |
7346 | 9.44k | /*allow_unprepared_value=*/false, |
7347 | 9.44k | /*range_del_read_seqno=*/nullptr, |
7348 | 9.44k | /*range_del_iter=*/&range_tombstone_iter); |
7349 | 9.44k | range_tombstones.emplace_back(std::move(range_tombstone_iter), |
7350 | 9.44k | nullptr); |
7351 | 9.44k | } |
7352 | 2.96k | } else { |
7353 | | // Create concatenating iterator for the files from this level |
7354 | 2.76k | std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr = |
7355 | 2.76k | nullptr; |
7356 | 2.76k | list[num++] = new LevelIterator( |
7357 | 2.76k | cfd->table_cache(), read_options, file_options_compactions, |
7358 | 2.76k | cfd->internal_comparator(), flevel, c->mutable_cf_options(), |
7359 | 2.76k | /*should_sample=*/false, |
7360 | 2.76k | /*no per level latency histogram=*/nullptr, |
7361 | 2.76k | TableReaderCaller::kCompaction, /*skip_filters=*/false, |
7362 | 2.76k | /*level=*/static_cast<int>(c->level(which)), range_del_agg, |
7363 | 2.76k | c->boundaries(which), false, &tombstone_iter_ptr); |
7364 | 2.76k | range_tombstones.emplace_back(nullptr, tombstone_iter_ptr); |
7365 | 2.76k | } |
7366 | 5.72k | } |
7367 | 5.72k | } |
7368 | 3.68k | TEST_SYNC_POINT_CALLBACK( |
7369 | 3.68k | "VersionSet::MakeInputIterator:NewCompactionMergingIterator", |
7370 | 3.68k | &num_input_files); |
7371 | 3.68k | assert(num <= space); |
7372 | 3.68k | InternalIterator* result = NewCompactionMergingIterator( |
7373 | 3.68k | &c->column_family_data()->internal_comparator(), list, |
7374 | 3.68k | static_cast<int>(num), range_tombstones, /*arena=*/nullptr, |
7375 | 3.68k | c->column_family_data()->internal_stats()); |
7376 | 3.68k | delete[] list; |
7377 | 3.68k | return result; |
7378 | 3.68k | } |
7379 | | |
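The `space` computation above sizes the iterator array before any iterator is built: L0 contributes one table iterator per file (L0 files may overlap, so they cannot be concatenated), while every other input level contributes a single LevelIterator. A worked instance with a hypothetical compaction shape:

    #include <cstddef>

    // L0 -> L1 compaction with 4 overlapping L0 files and one lower level.
    // Mirrors: space = num_files(L0) + num_input_levels - 1.
    constexpr size_t kL0Files = 4;
    constexpr size_t kNumInputLevels = 2;  // L0 and L1
    constexpr size_t kSpace = kL0Files + kNumInputLevels - 1;
    static_assert(kSpace == 5,
                  "4 per-file iterators for L0 plus 1 LevelIterator for L1");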
7380 | | Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, |
7381 | | FileMetaData** meta, |
7382 | 0 | ColumnFamilyData** cfd) { |
7383 | 0 | for (auto cfd_iter : *column_family_set_) { |
7384 | 0 | if (!cfd_iter->initialized()) { |
7385 | 0 | continue; |
7386 | 0 | } |
7387 | 0 | Version* version = cfd_iter->current(); |
7388 | 0 | const auto* vstorage = version->storage_info(); |
7389 | 0 | for (int level = 0; level < vstorage->num_levels(); level++) { |
7390 | 0 | for (const auto& file : vstorage->LevelFiles(level)) { |
7391 | 0 | if (file->fd.GetNumber() == number) { |
7392 | 0 | *meta = file; |
7393 | 0 | *filelevel = level; |
7394 | 0 | *cfd = cfd_iter; |
7395 | 0 | return Status::OK(); |
7396 | 0 | } |
7397 | 0 | } |
7398 | 0 | } |
7399 | 0 | } |
7400 | 0 | return Status::NotFound("File not present in any level"); |
7401 | 0 | } |
7402 | | |
7403 | 0 | void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) { |
7404 | 0 | if (!metadata) { |
7405 | 0 | return; |
7406 | 0 | } |
7407 | 0 | assert(metadata); |
7408 | 0 | size_t count = 0; |
7409 | 0 | for (auto cfd : *column_family_set_) { |
7410 | 0 | if (cfd->IsDropped() || !cfd->initialized()) { |
7411 | 0 | continue; |
7412 | 0 | } |
7413 | 0 | for (int level = 0; level < cfd->NumberLevels(); level++) { |
7414 | 0 | count += cfd->current()->storage_info()->LevelFiles(level).size(); |
7415 | 0 | } |
7416 | 0 | } |
7417 | 0 | metadata->reserve(count); |
7418 | 0 | for (auto cfd : *column_family_set_) { |
7419 | 0 | if (cfd->IsDropped() || !cfd->initialized()) { |
7420 | 0 | continue; |
7421 | 0 | } |
7422 | 0 | for (int level = 0; level < cfd->NumberLevels(); level++) { |
7423 | 0 | for (const auto& file : |
7424 | 0 | cfd->current()->storage_info()->LevelFiles(level)) { |
7425 | 0 | LiveFileMetaData filemetadata; |
7426 | 0 | filemetadata.column_family_name = cfd->GetName(); |
7427 | 0 | uint32_t path_id = file->fd.GetPathId(); |
7428 | 0 | if (path_id < cfd->ioptions().cf_paths.size()) { |
7429 | 0 | filemetadata.db_path = cfd->ioptions().cf_paths[path_id].path; |
7430 | 0 | } else { |
7431 | 0 | assert(!cfd->ioptions().cf_paths.empty()); |
7432 | 0 | filemetadata.db_path = cfd->ioptions().cf_paths.back().path; |
7433 | 0 | } |
7434 | 0 | filemetadata.directory = filemetadata.db_path; |
7435 | 0 | const uint64_t file_number = file->fd.GetNumber(); |
7436 | 0 | filemetadata.name = MakeTableFileName("", file_number); |
7437 | 0 | filemetadata.relative_filename = filemetadata.name.substr(1); |
7438 | 0 | filemetadata.file_number = file_number; |
7439 | 0 | filemetadata.level = level; |
7440 | 0 | filemetadata.size = file->fd.GetFileSize(); |
7441 | 0 | filemetadata.smallestkey = file->smallest.user_key().ToString(); |
7442 | 0 | filemetadata.largestkey = file->largest.user_key().ToString(); |
7443 | 0 | filemetadata.smallest_seqno = file->fd.smallest_seqno; |
7444 | 0 | filemetadata.largest_seqno = file->fd.largest_seqno; |
7445 | 0 | filemetadata.num_reads_sampled = |
7446 | 0 | file->stats.num_reads_sampled.load(std::memory_order_relaxed); |
7447 | 0 | filemetadata.being_compacted = file->being_compacted; |
7448 | 0 | filemetadata.num_entries = file->num_entries; |
7449 | 0 | filemetadata.num_deletions = file->num_deletions; |
7450 | 0 | filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; |
7451 | 0 | filemetadata.file_checksum = file->file_checksum; |
7452 | 0 | filemetadata.file_checksum_func_name = file->file_checksum_func_name; |
7453 | 0 | filemetadata.temperature = file->temperature; |
7454 | 0 | filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); |
7455 | 0 | filemetadata.file_creation_time = file->TryGetFileCreationTime(); |
7456 | 0 | filemetadata.epoch_number = file->epoch_number; |
7457 | 0 | metadata->push_back(filemetadata); |
7458 | 0 | } |
7459 | 0 | } |
7460 | 0 | } |
7461 | 0 | } |
7462 | | |
7463 | | void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files, |
7464 | | std::vector<ObsoleteBlobFileInfo>* blob_files, |
7465 | | std::vector<std::string>* manifest_filenames, |
7466 | 122k | uint64_t min_pending_output) { |
7467 | 122k | assert(files); |
7468 | 122k | assert(blob_files); |
7469 | 122k | assert(manifest_filenames); |
7470 | 122k | assert(files->empty()); |
7471 | 122k | assert(blob_files->empty()); |
7472 | 122k | assert(manifest_filenames->empty()); |
7473 | | |
7474 | 122k | std::vector<ObsoleteFileInfo> pending_files; |
7475 | 122k | for (auto& f : obsolete_files_) { |
7476 | 12.3k | if (f.metadata->fd.GetNumber() < min_pending_output) { |
7477 | 12.3k | files->emplace_back(std::move(f)); |
7478 | 12.3k | } else { |
7479 | 0 | pending_files.emplace_back(std::move(f)); |
7480 | 0 | } |
7481 | 12.3k | } |
7482 | 122k | obsolete_files_.swap(pending_files); |
7483 | | |
7484 | 122k | std::vector<ObsoleteBlobFileInfo> pending_blob_files; |
7485 | 122k | for (auto& blob_file : obsolete_blob_files_) { |
7486 | 0 | if (blob_file.GetBlobFileNumber() < min_pending_output) { |
7487 | 0 | blob_files->emplace_back(std::move(blob_file)); |
7488 | 0 | } else { |
7489 | 0 | pending_blob_files.emplace_back(std::move(blob_file)); |
7490 | 0 | } |
7491 | 0 | } |
7492 | 122k | obsolete_blob_files_.swap(pending_blob_files); |
7493 | | |
7494 | 122k | obsolete_manifests_.swap(*manifest_filenames); |
7495 | 122k | } |
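// GetObsoleteFiles() above hands the caller only entries numbered below
// min_pending_output and keeps the rest queued. A standalone sketch of the
// same move-out-or-retain pattern, with hypothetical generic types:
#include <cstdint>
#include <utility>
#include <vector>

template <typename T, typename GetNumber>
void TakeBelowThreshold(std::vector<T>* source, std::vector<T>* taken,
                        uint64_t threshold, GetNumber get_number) {
  std::vector<T> pending;
  for (auto& item : *source) {
    if (get_number(item) < threshold) {
      taken->emplace_back(std::move(item));  // safe to hand out for deletion
    } else {
      pending.emplace_back(std::move(item));  // still pending; keep it
    }
  }
  source->swap(pending);  // source retains only the still-pending items
}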
7496 | | |
7497 | 0 | uint64_t VersionSet::GetObsoleteSstFilesSize() const { |
7498 | 0 | uint64_t ret = 0; |
7499 | 0 | for (auto& f : obsolete_files_) { |
7500 | 0 | if (f.metadata != nullptr) { |
7501 | 0 | ret += f.metadata->fd.GetFileSize(); |
7502 | 0 | } |
7503 | 0 | } |
7504 | 0 | return ret; |
7505 | 0 | } |
7506 | | |
7507 | | ColumnFamilyData* VersionSet::CreateColumnFamily( |
7508 | | const ColumnFamilyOptions& cf_options, const ReadOptions& read_options, |
7509 | 95.3k | const VersionEdit* edit, bool read_only) { |
7510 | 95.3k | assert(edit->IsColumnFamilyAdd()); |
7511 | | // Unchanging LSM tree implies no writes to the CF |
7512 | 95.3k | assert(!unchanging_ || read_only); |
7513 | | |
7514 | 95.3k | MutableCFOptions dummy_cf_options; |
7515 | 95.3k | Version* dummy_versions = |
7516 | 95.3k | new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_); |
7517 | | // Ref() the dummy version once so that it can later be freed via
7518 | | // Unref() rather than an explicit "delete" (~Version is private)
7519 | 95.3k | dummy_versions->Ref(); |
7520 | 95.3k | auto new_cfd = column_family_set_->CreateColumnFamily( |
7521 | 95.3k | edit->GetColumnFamilyName(), edit->GetColumnFamily(), dummy_versions, |
7522 | 95.3k | cf_options, read_only); |
7523 | | |
7524 | 95.3k | Version* v = new Version(new_cfd, this, file_options_, |
7525 | 95.3k | new_cfd->GetLatestMutableCFOptions(), io_tracer_, |
7526 | 95.3k | current_version_number_++); |
7527 | | |
7528 | 95.3k | constexpr bool update_stats = false; |
7529 | | |
7530 | 95.3k | v->PrepareAppend(read_options, update_stats); |
7531 | | |
7532 | 95.3k | AppendVersion(new_cfd, v); |
7533 | | // GetLatestMutableCFOptions() is safe here without the mutex since the
7534 | | // cfd is not yet visible to any client
7535 | 95.3k | new_cfd->CreateNewMemtable(LastSequence()); |
7536 | 95.3k | new_cfd->SetLogNumber(edit->GetLogNumber()); |
7537 | 95.3k | return new_cfd; |
7538 | 95.3k | } |
7539 | | |
7540 | 0 | uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) { |
7541 | 0 | uint64_t count = 0; |
7542 | 0 | for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { |
7543 | 0 | count++; |
7544 | 0 | } |
7545 | 0 | return count; |
7546 | 0 | } |
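// CreateColumnFamily() above anchors each column family's version list on a
// ref-counted dummy head, and GetNumLiveVersions() counts by walking that
// ring. A simplified, self-contained sketch of the structure (Node is an
// assumed illustrative type, not the real Version class):
#include <cstdint>

struct Node {
  Node* next_ = this;  // a fresh dummy head links to itself
  Node* prev_ = this;
};

// Append v just before the dummy head, making it the newest element.
void Append(Node* dummy, Node* v) {
  v->next_ = dummy;
  v->prev_ = dummy->prev_;
  dummy->prev_->next_ = v;
  dummy->prev_ = v;
}

// Count live elements: everything on the ring except the dummy itself.
uint64_t CountLive(Node* dummy) {
  uint64_t count = 0;
  for (Node* v = dummy->next_; v != dummy; v = v->next_) {
    ++count;
  }
  return count;
}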
7547 | | |
7548 | 0 | uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) { |
7549 | 0 | std::unordered_set<uint64_t> unique_files; |
7550 | 0 | uint64_t total_files_size = 0; |
7551 | 0 | for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { |
7552 | 0 | VersionStorageInfo* storage_info = v->storage_info(); |
7553 | 0 | for (int level = 0; level < storage_info->num_levels_; level++) { |
7554 | 0 | for (const auto& file_meta : storage_info->LevelFiles(level)) { |
7555 | 0 | if (unique_files.find(file_meta->fd.packed_number_and_path_id) == |
7556 | 0 | unique_files.end()) { |
7557 | 0 | unique_files.insert(file_meta->fd.packed_number_and_path_id); |
7558 | 0 | total_files_size += file_meta->fd.GetFileSize(); |
7559 | 0 | } |
7560 | 0 | } |
7561 | 0 | } |
7562 | 0 | } |
7563 | 0 | return total_files_size; |
7564 | 0 | } |
7565 | | |
7566 | 0 | uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { |
7567 | 0 | std::unordered_set<uint64_t> unique_blob_files; |
7568 | |
7569 | 0 | uint64_t all_versions_blob_file_size = 0; |
7570 | |
7571 | 0 | for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { |
7572 | | // iterate over all live versions
7573 | 0 | const auto* vstorage = v->storage_info(); |
7574 | 0 | assert(vstorage); |
7575 | |
7576 | 0 | const auto& blob_files = vstorage->GetBlobFiles(); |
7577 | |
7578 | 0 | for (const auto& meta : blob_files) { |
7579 | 0 | assert(meta); |
7580 | |
7581 | 0 | const uint64_t blob_file_number = meta->GetBlobFileNumber(); |
7582 | |
7583 | 0 | if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) { |
7584 | | // found a blob file that has not been counted yet
7585 | 0 | unique_blob_files.insert(blob_file_number); |
7586 | 0 | all_versions_blob_file_size += meta->GetBlobFileSize(); |
7587 | 0 | } |
7588 | 0 | } |
7589 | 0 | } |
7590 | |
7591 | 0 | return all_versions_blob_file_size; |
7592 | 0 | } |
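// GetTotalSstFilesSize() and GetTotalBlobFileSize() above share one idea: a
// file referenced by several live versions must be counted exactly once.
// A generic sketch of that dedupe-then-sum pattern (FileRef is an assumed
// illustrative type):
#include <cstdint>
#include <unordered_set>
#include <vector>

struct FileRef {
  uint64_t id;    // packed number/path id for SSTs, file number for blobs
  uint64_t size;  // on-disk size in bytes
};

uint64_t TotalUniqueSize(const std::vector<std::vector<FileRef>>& versions) {
  std::unordered_set<uint64_t> seen;
  uint64_t total = 0;
  for (const auto& files : versions) {
    for (const auto& f : files) {
      if (seen.insert(f.id).second) {  // true only on the first sighting
        total += f.size;
      }
    }
  }
  return total;
}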
7593 | | |
7594 | | Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, |
7595 | | ColumnFamilyData* cfd, |
7596 | | const std::string& fpath, int level, |
7597 | 0 | const FileMetaData& meta) { |
7598 | 0 | uint64_t fsize = 0; |
7599 | 0 | Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr); |
7600 | 0 | if (status.ok()) { |
7601 | 0 | if (fsize != meta.fd.GetFileSize()) { |
7602 | 0 | status = Status::Corruption("File size mismatch: " + fpath); |
7603 | 0 | } |
7604 | 0 | } |
7605 | 0 | if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) { |
7606 | 0 | assert(cfd); |
7607 | 0 | TableCache* table_cache = cfd->table_cache(); |
7608 | 0 | assert(table_cache); |
7609 | |
7610 | 0 | const auto& cf_opts = cfd->GetLatestMutableCFOptions(); |
7611 | 0 | size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(cf_opts); |
7612 | |
7613 | 0 | const FileOptions& file_opts = file_options(); |
7614 | |
7615 | 0 | Version* version = cfd->current(); |
7616 | 0 | assert(version); |
7617 | 0 | VersionStorageInfo& storage_info = version->storage_info_; |
7618 | 0 | const InternalKeyComparator* icmp = storage_info.InternalComparator(); |
7619 | 0 | assert(icmp); |
7620 | |
7621 | 0 | InternalStats* internal_stats = cfd->internal_stats(); |
7622 | |
7623 | 0 | TableCache::TypedHandle* handle = nullptr; |
7624 | 0 | FileMetaData meta_copy = meta; |
7625 | 0 | status = table_cache->FindTable( |
7626 | 0 | read_options, file_opts, *icmp, meta_copy, &handle, cf_opts, |
7627 | 0 | /*no_io=*/false, internal_stats->GetFileReadHist(level), false, level, |
7628 | 0 | /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, |
7629 | 0 | meta_copy.temperature); |
7630 | 0 | if (handle) { |
7631 | 0 | table_cache->get_cache().Release(handle); |
7632 | 0 | } |
7633 | 0 | } |
7634 | 0 | return status; |
7635 | 0 | } |
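// The first check in VerifyFileMetadata() compares the on-disk size against
// the size recorded in the MANIFEST. A self-contained sketch of that
// cross-check, using std::filesystem instead of the internal FileSystem API
// (SizeMatchesManifest is a hypothetical helper):
#include <cstdint>
#include <filesystem>
#include <string>
#include <system_error>

bool SizeMatchesManifest(const std::string& fpath, uint64_t expected_size,
                         std::string* error) {
  std::error_code ec;
  const std::uintmax_t actual = std::filesystem::file_size(fpath, ec);
  if (ec) {
    *error = "stat failed: " + fpath;
    return false;
  }
  if (static_cast<uint64_t>(actual) != expected_size) {
    *error = "File size mismatch: " + fpath;  // mirrors the status above
    return false;
  }
  return true;
}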
7636 | | |
7637 | | ReactiveVersionSet::ReactiveVersionSet( |
7638 | | const std::string& dbname, const ImmutableDBOptions* _db_options, |
7639 | | const FileOptions& _file_options, Cache* table_cache, |
7640 | | WriteBufferManager* write_buffer_manager, WriteController* write_controller, |
7641 | | const std::shared_ptr<IOTracer>& io_tracer) |
7642 | 0 | : VersionSet(dbname, _db_options, _file_options, table_cache, |
7643 | 0 | write_buffer_manager, write_controller, |
7644 | 0 | /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "", |
7645 | 0 | /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "", |
7646 | 0 | /*error_handler=*/nullptr, /*unchanging=*/false) {} |
7647 | | |
7648 | 0 | ReactiveVersionSet::~ReactiveVersionSet() = default; |
7649 | | |
7650 | | Status ReactiveVersionSet::Recover( |
7651 | | const std::vector<ColumnFamilyDescriptor>& column_families, |
7652 | | std::unique_ptr<log::FragmentBufferedReader>* manifest_reader, |
7653 | | std::unique_ptr<log::Reader::Reporter>* manifest_reporter, |
7654 | 0 | std::unique_ptr<Status>* manifest_reader_status) { |
7655 | 0 | assert(manifest_reader != nullptr); |
7656 | 0 | assert(manifest_reporter != nullptr); |
7657 | 0 | assert(manifest_reader_status != nullptr); |
7658 | |
7659 | 0 | manifest_reader_status->reset(new Status()); |
7660 | 0 | manifest_reporter->reset(new LogReporter()); |
7661 | 0 | static_cast_with_check<LogReporter>(manifest_reporter->get())->status = |
7662 | 0 | manifest_reader_status->get(); |
7663 | 0 | Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); |
7664 | 0 | if (!s.ok()) { |
7665 | 0 | return s; |
7666 | 0 | } |
7667 | 0 | log::Reader* reader = manifest_reader->get(); |
7668 | 0 | assert(reader); |
7669 | |
7670 | 0 | manifest_tailer_.reset(new ManifestTailer( |
7671 | 0 | column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_, |
7672 | 0 | read_options_, EpochNumberRequirement::kMightMissing)); |
7673 | |
7674 | 0 | manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); |
7675 | |
7676 | 0 | s = manifest_tailer_->status(); |
7677 | 0 | if (s.ok()) { |
7678 | 0 | RecoverEpochNumbers(); |
7679 | 0 | } |
7680 | 0 | return s; |
7681 | 0 | } |
7682 | | |
7683 | | Status ReactiveVersionSet::ReadAndApply( |
7684 | | InstrumentedMutex* mu, |
7685 | | std::unique_ptr<log::FragmentBufferedReader>* manifest_reader, |
7686 | | Status* manifest_read_status, |
7687 | | std::unordered_set<ColumnFamilyData*>* cfds_changed, |
7688 | 0 | std::vector<std::string>* files_to_delete) { |
7689 | 0 | assert(manifest_reader != nullptr); |
7690 | 0 | assert(cfds_changed != nullptr); |
7691 | 0 | mu->AssertHeld(); |
7692 | |
7693 | 0 | Status s; |
7694 | 0 | log::Reader* reader = manifest_reader->get(); |
7695 | 0 | assert(reader); |
7696 | 0 | s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); |
7697 | 0 | if (!s.ok()) { |
7698 | 0 | return s; |
7699 | 0 | } |
7700 | 0 | manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status); |
7701 | 0 | s = manifest_tailer_->status(); |
7702 | 0 | if (s.ok()) { |
7703 | 0 | *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies()); |
7704 | 0 | } |
7705 | 0 | if (files_to_delete) { |
7706 | 0 | *files_to_delete = manifest_tailer_->GetAndClearIntermediateFiles(); |
7707 | 0 | } |
7708 | |
7709 | 0 | return s; |
7710 | 0 | } |
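// Recover() and ReadAndApply() above are the tailing machinery behind a
// read-only secondary instance. From the public API they are driven by
// DB::OpenAsSecondary() and DB::TryCatchUpWithPrimary(). A usage sketch
// (paths are hypothetical):
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status RunSecondary() {
  rocksdb::Options options;
  options.max_open_files = -1;  // commonly recommended for secondaries
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/path/to/primary_db", "/path/to/secondary_dir", &db);
  if (!s.ok()) {
    return s;
  }
  std::unique_ptr<rocksdb::DB> guard(db);
  // Replays any new MANIFEST entries written by the primary; internally this
  // drives ReactiveVersionSet::ReadAndApply().
  return db->TryCatchUpWithPrimary();
}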
7711 | | |
7712 | | Status ReactiveVersionSet::MaybeSwitchManifest( |
7713 | | log::Reader::Reporter* reporter, |
7714 | 0 | std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) { |
7715 | 0 | assert(manifest_reader != nullptr); |
7716 | 0 | Status s; |
7717 | 0 | std::string manifest_path; |
7718 | 0 | s = GetCurrentManifestPath(dbname_, fs_.get(), /*is_retry=*/false, |
7719 | 0 | &manifest_path, &manifest_file_number_); |
7720 | 0 | if (!s.ok()) { |
7721 | 0 | return s; |
7722 | 0 | } |
7723 | 0 | std::unique_ptr<FSSequentialFile> manifest_file; |
7724 | 0 | if (manifest_reader->get() != nullptr && |
7725 | 0 | manifest_reader->get()->file()->file_name() == manifest_path) { |
7726 | | // CURRENT points to the same MANIFEST as before, no need to switch |
7727 | | // MANIFEST. |
7728 | 0 | return s; |
7729 | 0 | } |
7730 | 0 | assert(nullptr == manifest_reader->get() || |
7731 | 0 | manifest_reader->get()->file()->file_name() != manifest_path); |
7732 | 0 | s = fs_->FileExists(manifest_path, IOOptions(), nullptr); |
7733 | 0 | if (s.IsNotFound()) { |
7734 | 0 | return Status::TryAgain( |
7735 | 0 | "The primary may have switched to a new MANIFEST and deleted the old " |
7736 | 0 | "one."); |
7737 | 0 | } else if (!s.ok()) { |
7738 | 0 | return s; |
7739 | 0 | } |
7740 | 0 | TEST_SYNC_POINT( |
7741 | 0 | "ReactiveVersionSet::MaybeSwitchManifest:" |
7742 | 0 | "AfterGetCurrentManifestPath:0"); |
7743 | 0 | TEST_SYNC_POINT( |
7744 | 0 | "ReactiveVersionSet::MaybeSwitchManifest:" |
7745 | 0 | "AfterGetCurrentManifestPath:1"); |
7746 | | // The primary can also delete the MANIFEST while the secondary is reading
7747 | | // it. This is OK on POSIX. On other file systems, a hard link to the
7748 | | // MANIFEST may be needed; the secondary should clean it up later.
7749 | 0 | s = fs_->NewSequentialFile(manifest_path, |
7750 | 0 | fs_->OptimizeForManifestRead(file_options_), |
7751 | 0 | &manifest_file, nullptr); |
7752 | 0 | std::unique_ptr<SequentialFileReader> manifest_file_reader; |
7753 | 0 | if (s.ok()) { |
7754 | 0 | manifest_file_reader.reset(new SequentialFileReader( |
7755 | 0 | std::move(manifest_file), manifest_path, |
7756 | 0 | db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); |
7757 | 0 | manifest_reader->reset(new log::FragmentBufferedReader( |
7758 | 0 | nullptr, std::move(manifest_file_reader), reporter, true /* checksum */, |
7759 | 0 | 0 /* log_number */)); |
7760 | 0 | ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", |
7761 | 0 | manifest_path.c_str()); |
7762 | 0 | if (manifest_tailer_) { |
7763 | 0 | manifest_tailer_->PrepareToReadNewManifest(); |
7764 | 0 | } |
7765 | 0 | } else if (s.IsPathNotFound()) { |
7766 | | // This can happen if the primary switches to a new MANIFEST after the |
7767 | | // secondary reads the CURRENT file but before the secondary actually tries |
7768 | | // to open the MANIFEST. |
7769 | 0 | s = Status::TryAgain( |
7770 | 0 | "The primary may have switched to a new MANIFEST and deleted the old " |
7771 | 0 | "one."); |
7772 | 0 | } |
7773 | 0 | return s; |
7774 | 0 | } |
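// MaybeSwitchManifest() above reports a primary-side MANIFEST rollover as
// Status::TryAgain rather than a hard error. A generic sketch of how a
// caller can bound the retries (RetryOnManifestSwitch is a hypothetical
// helper):
#include <functional>
#include "rocksdb/status.h"

rocksdb::Status RetryOnManifestSwitch(
    const std::function<rocksdb::Status()>& attempt, int max_retries = 3) {
  rocksdb::Status s;
  for (int i = 0; i <= max_retries; ++i) {
    s = attempt();
    if (!s.IsTryAgain()) {
      break;  // success or a genuine error; only TryAgain is retried
    }
  }
  return s;
}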
7775 | | |
7776 | | #ifndef NDEBUG |
7777 | | uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const { |
7778 | | assert(manifest_tailer_); |
7779 | | return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group(); |
7780 | | } |
7781 | | #endif // !NDEBUG |
7782 | | |
7783 | 0 | std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() { |
7784 | | assert(manifest_tailer_); |
7785 | 0 | return manifest_tailer_->GetReadBuffer().replay_buffer(); |
7786 | 0 | } |
7787 | | |
7788 | | } // namespace ROCKSDB_NAMESPACE |