/src/rocksdb/db/db_impl/db_impl_open.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | #include <cinttypes> |
10 | | |
11 | | #include "db/builder.h" |
12 | | #include "db/db_impl/db_impl.h" |
13 | | #include "db/error_handler.h" |
14 | | #include "db/periodic_task_scheduler.h" |
15 | | #include "env/composite_env_wrapper.h" |
16 | | #include "file/filename.h" |
17 | | #include "file/read_write_util.h" |
18 | | #include "file/sst_file_manager_impl.h" |
19 | | #include "file/writable_file_writer.h" |
20 | | #include "logging/logging.h" |
21 | | #include "monitoring/persistent_stats_history.h" |
22 | | #include "monitoring/thread_status_util.h" |
23 | | #include "options/options_helper.h" |
24 | | #include "rocksdb/options.h" |
25 | | #include "rocksdb/table.h" |
26 | | #include "rocksdb/wal_filter.h" |
27 | | #include "test_util/sync_point.h" |
28 | | #include "util/rate_limiter_impl.h" |
29 | | #include "util/string_util.h" |
30 | | #include "util/udt_util.h" |
31 | | |
32 | | namespace ROCKSDB_NAMESPACE { |
33 | | Options SanitizeOptions(const std::string& dbname, const Options& src, |
34 | 10.2k | bool read_only, Status* logger_creation_s) { |
35 | 10.2k | auto db_options = |
36 | 10.2k | SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s); |
37 | 10.2k | ImmutableDBOptions immutable_db_options(db_options); |
38 | 10.2k | auto cf_options = SanitizeCfOptions(immutable_db_options, read_only, |
39 | 10.2k | ColumnFamilyOptions(src)); |
40 | 10.2k | return Options(db_options, cf_options); |
41 | 10.2k | } |
42 | | |
43 | | DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, |
44 | 68.5k | bool read_only, Status* logger_creation_s) { |
45 | 68.5k | DBOptions result(src); |
46 | | |
47 | 68.5k | if (result.env == nullptr) { |
48 | 0 | result.env = Env::Default(); |
49 | 0 | } |
50 | | |
51 | | // max_open_files == -1 means an "infinite" number of open files. |
52 | 68.5k | if (result.max_open_files != -1) { |
53 | 0 | int max_max_open_files = port::GetMaxOpenFiles(); |
54 | 0 | if (max_max_open_files == -1) { |
55 | 0 | max_max_open_files = 0x400000; |
56 | 0 | } |
57 | 0 | ClipToRange(&result.max_open_files, 20, max_max_open_files); |
58 | 0 | TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles", |
59 | 0 | &result.max_open_files); |
60 | 0 | } |
61 | | |
62 | 68.5k | if (result.info_log == nullptr && !read_only) { |
63 | 68.5k | Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); |
64 | 68.5k | if (!s.ok()) { |
65 | | // No place suitable for logging |
66 | 0 | result.info_log = nullptr; |
67 | 0 | if (logger_creation_s) { |
68 | 0 | *logger_creation_s = s; |
69 | 0 | } |
70 | 0 | } |
71 | 68.5k | } |
72 | | |
73 | 68.5k | if (!result.write_buffer_manager) { |
74 | 68.5k | result.write_buffer_manager.reset( |
75 | 68.5k | new WriteBufferManager(result.db_write_buffer_size)); |
76 | 68.5k | } |
77 | 68.5k | auto bg_job_limits = DBImpl::GetBGJobLimits( |
78 | 68.5k | result.max_background_flushes, result.max_background_compactions, |
79 | 68.5k | result.max_background_jobs, true /* parallelize_compactions */); |
80 | 68.5k | result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions, |
81 | 68.5k | Env::Priority::LOW); |
82 | 68.5k | result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes, |
83 | 68.5k | Env::Priority::HIGH); |
84 | | |
85 | 68.5k | if (result.rate_limiter.get() != nullptr) { |
86 | 0 | if (result.bytes_per_sync == 0) { |
87 | 0 | result.bytes_per_sync = 1024 * 1024; |
88 | 0 | } |
89 | 0 | } |
90 | | |
91 | 68.5k | if (result.delayed_write_rate == 0) { |
92 | 68.5k | if (result.rate_limiter.get() != nullptr) { |
93 | 0 | result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond(); |
94 | 0 | } |
95 | 68.5k | if (result.delayed_write_rate == 0) { |
96 | 68.5k | result.delayed_write_rate = 16 * 1024 * 1024; |
97 | 68.5k | } |
98 | 68.5k | } |
99 | | |
100 | 68.5k | if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) { |
101 | 0 | result.recycle_log_file_num = false; |
102 | 0 | } |
103 | | |
104 | 68.5k | if (result.recycle_log_file_num && |
105 | 68.5k | (result.wal_recovery_mode == |
106 | 0 | WALRecoveryMode::kTolerateCorruptedTailRecords || |
107 | 0 | result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { |
108 | | // - kTolerateCorruptedTailRecords is inconsistent with recycle log file |
109 | | // feature. WAL recycling expects recovery success upon encountering a |
110 | | // corrupt record at the point where new data ends and recycled data |
111 | | // remains at the tail. However, `kTolerateCorruptedTailRecords` must fail |
112 | | // upon encountering any such corrupt record, as it cannot differentiate |
113 | | // between this and a real corruption, which would cause committed updates |
114 | | // to be truncated -- a violation of the recovery guarantee. |
115 | | // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with |
116 | | // recycle log file feature temporarily due to a bug found introducing a |
117 | | // hole in the recovered data |
118 | | // (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236). |
119 | | // Besides this bug, we believe the features are fundamentally compatible. |
120 | 0 | result.recycle_log_file_num = 0; |
121 | 0 | } |
122 | | |
123 | 68.5k | if (result.db_paths.size() == 0) { |
124 | 68.5k | result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max()); |
125 | 68.5k | } else if (result.wal_dir.empty()) { |
126 | | // Use dbname as default |
127 | 0 | result.wal_dir = dbname; |
128 | 0 | } |
129 | 68.5k | if (!result.wal_dir.empty()) { |
130 | | // If there is a wal_dir already set, check to see if the wal_dir is the |
131 | | // same as the dbname AND the same as the db_path[0] (which must exist from |
132 | | // a few lines ago). If the wal_dir matches both of these values, then clear |
133 | | // the wal_dir value, which will make wal_dir == dbname. Most likely this |
134 | | // condition was the result of reading an old options file where we forced |
135 | | // wal_dir to be set (to dbname). |
136 | 0 | auto npath = NormalizePath(dbname + "/"); |
137 | 0 | if (npath == NormalizePath(result.wal_dir + "/") && |
138 | 0 | npath == NormalizePath(result.db_paths[0].path + "/")) { |
139 | 0 | result.wal_dir.clear(); |
140 | 0 | } |
141 | 0 | } |
142 | | |
143 | 68.5k | if (!result.wal_dir.empty() && result.wal_dir.back() == '/') { |
144 | 0 | result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); |
145 | 0 | } |
146 | | |
147 | | // Force flush on DB open if 2PC is enabled, since with 2PC we have no |
148 | | // guarantee that consecutive log files have consecutive sequence id, which |
149 | | // make recovery complicated. |
150 | 68.5k | if (result.allow_2pc) { |
151 | 0 | result.avoid_flush_during_recovery = false; |
152 | 0 | } |
153 | | |
154 | 68.5k | ImmutableDBOptions immutable_db_options(result); |
155 | 68.5k | if (!immutable_db_options.IsWalDirSameAsDBPath()) { |
156 | | // Either the WAL dir and db_paths[0]/db_name are not the same, or we |
157 | | // cannot tell for sure. In either case, assume they're different and |
158 | | // explicitly cleanup the trash log files (bypass DeleteScheduler) |
159 | | // Do this first so even if we end up calling |
160 | | // DeleteScheduler::CleanupDirectory on the same dir later, it will be |
161 | | // safe |
162 | 0 | std::vector<std::string> filenames; |
163 | 0 | IOOptions io_opts; |
164 | 0 | io_opts.do_not_recurse = true; |
165 | 0 | auto wal_dir = immutable_db_options.GetWalDir(); |
166 | 0 | Status s = immutable_db_options.fs->GetChildren( |
167 | 0 | wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr); |
168 | 0 | s.PermitUncheckedError(); //**TODO: What to do on error? |
169 | 0 | for (std::string& filename : filenames) { |
170 | 0 | if (filename.find(".log.trash", filename.length() - |
171 | 0 | std::string(".log.trash").length()) != |
172 | 0 | std::string::npos) { |
173 | 0 | std::string trash_file = wal_dir + "/" + filename; |
174 | 0 | result.env->DeleteFile(trash_file).PermitUncheckedError(); |
175 | 0 | } |
176 | 0 | } |
177 | 0 | } |
178 | | |
179 | | // Create a default SstFileManager for purposes of tracking compaction size |
180 | | // and facilitating recovery from out of space errors. |
181 | 68.5k | if (result.sst_file_manager.get() == nullptr) { |
182 | 68.5k | std::shared_ptr<SstFileManager> sst_file_manager( |
183 | 68.5k | NewSstFileManager(result.env, result.info_log)); |
184 | 68.5k | result.sst_file_manager = sst_file_manager; |
185 | 68.5k | } |
186 | | |
187 | | // Supported wal compression types |
188 | 68.5k | if (!StreamingCompressionTypeSupported(result.wal_compression)) { |
189 | 0 | result.wal_compression = kNoCompression; |
190 | 0 | ROCKS_LOG_WARN(result.info_log, |
191 | 0 | "wal_compression is disabled since only zstd is supported"); |
192 | 0 | } |
193 | | |
194 | 68.5k | return result; |
195 | 68.5k | } |
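
A minimal sketch (not part of this file, hypothetical path) of how these sanitized defaults surface to a caller: a DB opened with neither an explicit delayed_write_rate nor a rate limiter should report the 16MB/s fallback chosen above via GetDBOptions().

```cpp
#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // delayed_write_rate is left at its 0 default and no rate limiter is set,
  // so SanitizeOptions above should pick the 16MB/s fallback.

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/sanitize_demo", &db);
  assert(s.ok());
  // GetDBOptions() reflects the sanitized (not the caller-supplied) values.
  assert(db->GetDBOptions().delayed_write_rate == 16 * 1024 * 1024);
  delete db;
  return 0;
}
```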
196 | | |
197 | | namespace { |
198 | | Status ValidateOptionsByTable( |
199 | | const DBOptions& db_opts, |
200 | 58.3k | const std::vector<ColumnFamilyDescriptor>& column_families) { |
201 | 58.3k | Status s; |
202 | 80.3k | for (auto& cf : column_families) { |
203 | 80.3k | s = ValidateOptions(db_opts, cf.options); |
204 | 80.3k | if (!s.ok()) { |
205 | 0 | return s; |
206 | 0 | } |
207 | 80.3k | } |
208 | 58.3k | return Status::OK(); |
209 | 58.3k | } |
210 | | } // namespace |
211 | | |
212 | | Status DBImpl::ValidateOptions( |
213 | | const DBOptions& db_options, |
214 | 58.3k | const std::vector<ColumnFamilyDescriptor>& column_families) { |
215 | 58.3k | Status s; |
216 | 80.3k | for (auto& cfd : column_families) { |
217 | 80.3k | s = ColumnFamilyData::ValidateOptions(db_options, cfd.options); |
218 | 80.3k | if (!s.ok()) { |
219 | 0 | return s; |
220 | 0 | } |
221 | 80.3k | if (cfd.name == kDefaultColumnFamilyName) { |
222 | 58.3k | if (cfd.options.disallow_memtable_writes) { |
223 | 0 | return Status::InvalidArgument( |
224 | 0 | "Default column family cannot use disallow_memtable_writes=true"); |
225 | 0 | } |
226 | 58.3k | } |
227 | 80.3k | } |
228 | 58.3k | s = ValidateOptions(db_options); |
229 | 58.3k | return s; |
230 | 58.3k | } |
231 | | |
232 | 58.3k | Status DBImpl::ValidateOptions(const DBOptions& db_options) { |
233 | 58.3k | if (db_options.db_paths.size() > 4) { |
234 | 0 | return Status::NotSupported( |
235 | 0 | "More than four DB paths are not supported yet. "); |
236 | 0 | } |
237 | | |
238 | 58.3k | if (db_options.allow_mmap_reads && db_options.use_direct_reads) { |
239 | | // Protect against assert in PosixMMapReadableFile constructor |
240 | 0 | return Status::NotSupported( |
241 | 0 | "If memory mapped reads (allow_mmap_reads) are enabled " |
242 | 0 | "then direct I/O reads (use_direct_reads) must be disabled. "); |
243 | 0 | } |
244 | | |
245 | 58.3k | if (db_options.allow_mmap_writes && |
246 | 58.3k | db_options.use_direct_io_for_flush_and_compaction) { |
247 | 0 | return Status::NotSupported( |
248 | 0 | "If memory mapped writes (allow_mmap_writes) are enabled " |
249 | 0 | "then direct I/O writes (use_direct_io_for_flush_and_compaction) must " |
250 | 0 | "be disabled. "); |
251 | 0 | } |
252 | | |
253 | 58.3k | if (db_options.keep_log_file_num == 0) { |
254 | 0 | return Status::InvalidArgument("keep_log_file_num must be greater than 0"); |
255 | 0 | } |
256 | | |
257 | 58.3k | if (db_options.unordered_write && |
258 | 58.3k | !db_options.allow_concurrent_memtable_write) { |
259 | 0 | return Status::InvalidArgument( |
260 | 0 | "unordered_write is incompatible with " |
261 | 0 | "!allow_concurrent_memtable_write"); |
262 | 0 | } |
263 | | |
264 | 58.3k | if (db_options.unordered_write && db_options.enable_pipelined_write) { |
265 | 0 | return Status::InvalidArgument( |
266 | 0 | "unordered_write is incompatible with enable_pipelined_write"); |
267 | 0 | } |
268 | | |
269 | 58.3k | if (db_options.atomic_flush && db_options.enable_pipelined_write) { |
270 | 0 | return Status::InvalidArgument( |
271 | 0 | "atomic_flush is incompatible with enable_pipelined_write"); |
272 | 0 | } |
273 | | |
274 | 58.3k | if (db_options.use_direct_io_for_flush_and_compaction && |
275 | 58.3k | 0 == db_options.writable_file_max_buffer_size) { |
276 | 0 | return Status::InvalidArgument( |
277 | 0 | "writes in direct IO require writable_file_max_buffer_size > 0"); |
278 | 0 | } |
279 | | |
280 | 58.3k | if (db_options.daily_offpeak_time_utc != "") { |
281 | 0 | int start_time, end_time; |
282 | 0 | if (!TryParseTimeRangeString(db_options.daily_offpeak_time_utc, start_time, |
283 | 0 | end_time)) { |
284 | 0 | return Status::InvalidArgument( |
285 | 0 | "daily_offpeak_time_utc should be set in the format HH:mm-HH:mm " |
286 | 0 | "(e.g. 04:30-07:30)"); |
287 | 0 | } else if (start_time == end_time) { |
288 | 0 | return Status::InvalidArgument( |
289 | 0 | "start_time and end_time cannot be the same"); |
290 | 0 | } |
291 | 0 | } |
292 | | |
293 | 58.3k | if (!db_options.write_dbid_to_manifest && !db_options.write_identity_file) { |
294 | 0 | return Status::InvalidArgument( |
295 | 0 | "write_dbid_to_manifest and write_identity_file cannot both be false"); |
296 | 0 | } |
297 | 58.3k | return Status::OK(); |
298 | 58.3k | } |
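
As an illustration of these checks, a minimal sketch (hypothetical path) showing the mmap/direct-read conflict above being rejected at open time:

```cpp
#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.allow_mmap_reads = true;
  options.use_direct_reads = true;  // conflicts with allow_mmap_reads

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/validate_demo", &db);
  assert(s.IsNotSupported());  // rejected by DBImpl::ValidateOptions above
  return 0;
}
```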
299 | | |
300 | 10.2k | Status DBImpl::NewDB(std::vector<std::string>* new_filenames) { |
301 | 10.2k | VersionEdit new_db_edit; |
302 | 10.2k | const WriteOptions write_options(Env::IOActivity::kDBOpen); |
303 | 10.2k | Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true, |
304 | 10.2k | /*is_retry=*/false, &new_db_edit); |
305 | 10.2k | if (!s.ok()) { |
306 | 0 | return s; |
307 | 0 | } |
308 | 10.2k | new_db_edit.SetLogNumber(0); |
309 | 10.2k | new_db_edit.SetNextFile(2); |
310 | 10.2k | new_db_edit.SetLastSequence(0); |
311 | | |
312 | 10.2k | ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); |
313 | 10.2k | const std::string manifest = DescriptorFileName(dbname_, 1); |
314 | 10.2k | { |
315 | 10.2k | if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) { |
316 | 0 | fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); |
317 | 0 | } |
318 | 10.2k | std::unique_ptr<FSWritableFile> file; |
319 | 10.2k | FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); |
320 | | // DB option takes precedence when not kUnknown |
321 | 10.2k | if (immutable_db_options_.metadata_write_temperature != |
322 | 10.2k | Temperature::kUnknown) { |
323 | 0 | file_options.temperature = |
324 | 0 | immutable_db_options_.metadata_write_temperature; |
325 | 0 | } |
326 | 10.2k | s = NewWritableFile(fs_.get(), manifest, &file, file_options); |
327 | 10.2k | if (!s.ok()) { |
328 | 0 | return s; |
329 | 0 | } |
330 | 10.2k | FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; |
331 | 10.2k | file->SetPreallocationBlockSize( |
332 | 10.2k | immutable_db_options_.manifest_preallocation_size); |
333 | 10.2k | std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( |
334 | 10.2k | std::move(file), manifest, file_options, immutable_db_options_.clock, |
335 | 10.2k | io_tracer_, nullptr /* stats */, |
336 | 10.2k | Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, |
337 | 10.2k | immutable_db_options_.listeners, nullptr, |
338 | 10.2k | tmp_set.Contains(FileType::kDescriptorFile), |
339 | 10.2k | tmp_set.Contains(FileType::kDescriptorFile))); |
340 | 10.2k | log::Writer log(std::move(file_writer), 0, false); |
341 | 10.2k | std::string record; |
342 | 10.2k | new_db_edit.EncodeTo(&record); |
343 | 10.2k | s = log.AddRecord(write_options, record); |
344 | 10.2k | if (s.ok()) { |
345 | 10.2k | s = SyncManifest(&immutable_db_options_, write_options, log.file()); |
346 | 10.2k | } |
347 | 10.2k | } |
348 | 10.2k | if (s.ok()) { |
349 | | // Make "CURRENT" file that points to the new manifest file. |
350 | 10.2k | s = SetCurrentFile(write_options, fs_.get(), dbname_, 1, |
351 | 10.2k | immutable_db_options_.metadata_write_temperature, |
352 | 10.2k | directories_.GetDbDir()); |
353 | 10.2k | if (new_filenames) { |
354 | 10.2k | new_filenames->emplace_back( |
355 | 10.2k | manifest.substr(manifest.find_last_of("/\\") + 1)); |
356 | 10.2k | } |
357 | 10.2k | } else { |
358 | 0 | fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); |
359 | 0 | } |
360 | 10.2k | return s; |
361 | 10.2k | } |
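
A minimal sketch of the observable effect of NewDB(): after a fresh open, the DB directory holds MANIFEST-000001 plus a CURRENT file pointing at it. The path is hypothetical, and the exact file set varies with options and version.

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/env.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  if (!rocksdb::DB::Open(options, "/tmp/newdb_demo", &db).ok()) return 1;
  delete db;

  std::vector<std::string> children;
  rocksdb::Status s =
      rocksdb::Env::Default()->GetChildren("/tmp/newdb_demo", &children);
  if (!s.ok()) return 1;
  for (const auto& f : children) {
    std::printf("%s\n", f.c_str());  // expect CURRENT, MANIFEST-000001, ...
  }
  return 0;
}
```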
362 | | |
363 | | IOStatus DBImpl::CreateAndNewDirectory( |
364 | | FileSystem* fs, const std::string& dirname, |
365 | 138k | std::unique_ptr<FSDirectory>* directory) { |
366 | | // We call CreateDirIfMissing() as the directory may already exist (if we |
367 | | // are reopening a DB); when this happens we don't want creating the |
368 | | // directory to cause an error. However, we need to check if creating the |
369 | | // directory fails or else we may get an obscure message about the lock |
370 | | // file not existing. One real-world example of this occurring is if |
371 | | // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. |
372 | | // when dbname_ is "dir/db" but when "dir" doesn't exist. |
373 | 138k | IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr); |
374 | 138k | if (!io_s.ok()) { |
375 | 0 | return io_s; |
376 | 0 | } |
377 | 138k | return fs->NewDirectory(dirname, IOOptions(), directory, nullptr); |
378 | 138k | } |
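
A minimal sketch of the pitfall the comment above guards against: Env::CreateDirIfMissing() does not create intermediate directories, so opening a DB under a missing parent directory fails with a clear directory-creation error instead of an obscure lock-file one. Paths are hypothetical; "/tmp/no_such_parent" is assumed not to exist.

```cpp
#include <cstdio>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  // The CreateDirIfMissing() call inside CreateAndNewDirectory surfaces an
  // IO error from DB::Open because the parent directory is missing.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/no_such_parent/db", &db);
  std::printf("%s\n", s.ToString().c_str());
  return s.ok() ? 1 : 0;
}
```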
379 | | |
380 | | IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname, |
381 | | const std::string& wal_dir, |
382 | 58.3k | const std::vector<DbPath>& data_paths) { |
383 | 58.3k | IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_); |
384 | 58.3k | if (!io_s.ok()) { |
385 | 0 | return io_s; |
386 | 0 | } |
387 | 58.3k | if (!wal_dir.empty() && dbname != wal_dir) { |
388 | 0 | io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_); |
389 | 0 | if (!io_s.ok()) { |
390 | 0 | return io_s; |
391 | 0 | } |
392 | 0 | } |
393 | | |
394 | 58.3k | data_dirs_.clear(); |
395 | 58.3k | for (auto& p : data_paths) { |
396 | 58.3k | const std::string db_path = p.path; |
397 | 58.3k | if (db_path == dbname) { |
398 | 58.3k | data_dirs_.emplace_back(nullptr); |
399 | 58.3k | } else { |
400 | 0 | std::unique_ptr<FSDirectory> path_directory; |
401 | 0 | io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory); |
402 | 0 | if (!io_s.ok()) { |
403 | 0 | return io_s; |
404 | 0 | } |
405 | 0 | data_dirs_.emplace_back(path_directory.release()); |
406 | 0 | } |
407 | 58.3k | } |
408 | 58.3k | assert(data_dirs_.size() == data_paths.size()); |
409 | 58.3k | return IOStatus::OK(); |
410 | 58.3k | } |
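
A minimal sketch of the configuration this function consumes: a dedicated WAL directory plus several data paths, each of which gets its own FSDirectory handle above. Paths and target sizes are hypothetical.

```cpp
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.wal_dir = "/mnt/fast/wal";  // differs from dbname, so wal_dir_ opens
  options.db_paths.emplace_back("/mnt/fast/db", 64ull << 30);  // hot data
  options.db_paths.emplace_back("/mnt/slow/db", 1ull << 40);   // cold data

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/mnt/fast/db", &db);
  if (s.ok()) delete db;
  return s.ok() ? 0 : 1;
}
```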
411 | | |
412 | | Status DBImpl::Recover( |
413 | | const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, |
414 | | bool error_if_wal_file_exists, bool error_if_data_exists_in_wals, |
415 | | bool is_retry, uint64_t* recovered_seq, RecoveryContext* recovery_ctx, |
416 | 58.3k | bool* can_retry) { |
417 | 58.3k | mutex_.AssertHeld(); |
418 | | |
419 | 58.3k | const WriteOptions write_options(Env::IOActivity::kDBOpen); |
420 | 58.3k | bool tmp_is_new_db = false; |
421 | 58.3k | bool& is_new_db = recovery_ctx ? recovery_ctx->is_new_db_ : tmp_is_new_db; |
422 | 58.3k | assert(db_lock_ == nullptr); |
423 | 58.3k | std::vector<std::string> files_in_dbname; |
424 | 58.3k | if (!read_only) { |
425 | 58.3k | Status s = directories_.SetDirectories(fs_.get(), dbname_, |
426 | 58.3k | immutable_db_options_.wal_dir, |
427 | 58.3k | immutable_db_options_.db_paths); |
428 | 58.3k | if (!s.ok()) { |
429 | 0 | return s; |
430 | 0 | } |
431 | | |
432 | 58.3k | s = env_->LockFile(LockFileName(dbname_), &db_lock_); |
433 | 58.3k | if (!s.ok()) { |
434 | 0 | return s; |
435 | 0 | } |
436 | | |
437 | 58.3k | std::string current_fname = CurrentFileName(dbname_); |
438 | | // Path to any MANIFEST file in the db dir. It does not matter which one. |
439 | | // Since best-efforts recovery ignores CURRENT file, existence of a |
440 | | // MANIFEST indicates the recovery to recover existing db. If no MANIFEST |
441 | | // can be found, a new db will be created. |
442 | 58.3k | std::string manifest_path; |
443 | 58.3k | if (!immutable_db_options_.best_efforts_recovery) { |
444 | 58.3k | s = env_->FileExists(current_fname); |
445 | 58.3k | } else { |
446 | 0 | s = Status::NotFound(); |
447 | 0 | IOOptions io_opts; |
448 | 0 | io_opts.do_not_recurse = true; |
449 | 0 | Status io_s = immutable_db_options_.fs->GetChildren( |
450 | 0 | dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr); |
451 | 0 | if (!io_s.ok()) { |
452 | 0 | s = io_s; |
453 | 0 | files_in_dbname.clear(); |
454 | 0 | } |
455 | 0 | for (const std::string& file : files_in_dbname) { |
456 | 0 | uint64_t number = 0; |
457 | 0 | FileType type = kWalFile; // initialize |
458 | 0 | if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { |
459 | 0 | uint64_t bytes; |
460 | 0 | s = env_->GetFileSize(DescriptorFileName(dbname_, number), &bytes); |
461 | 0 | if (s.ok() && bytes != 0) { |
462 | | // Found non-empty MANIFEST (descriptor log), thus best-efforts |
463 | | // recovery does not have to treat the db as empty. |
464 | 0 | manifest_path = dbname_ + "/" + file; |
465 | 0 | break; |
466 | 0 | } |
467 | 0 | } |
468 | 0 | } |
469 | 0 | } |
470 | 58.3k | if (s.IsNotFound()) { |
471 | 10.2k | if (immutable_db_options_.create_if_missing) { |
472 | 10.2k | s = NewDB(&files_in_dbname); |
473 | 10.2k | is_new_db = true; |
474 | 10.2k | if (!s.ok()) { |
475 | 0 | return s; |
476 | 0 | } |
477 | 10.2k | } else { |
478 | 0 | return Status::InvalidArgument( |
479 | 0 | current_fname, "does not exist (create_if_missing is false)"); |
480 | 0 | } |
481 | 48.1k | } else if (s.ok()) { |
482 | 48.1k | if (immutable_db_options_.error_if_exists) { |
483 | 0 | return Status::InvalidArgument(dbname_, |
484 | 0 | "exists (error_if_exists is true)"); |
485 | 0 | } |
486 | 48.1k | } else { |
487 | | // Unexpected error reading file |
488 | 0 | assert(s.IsIOError()); |
489 | 0 | return s; |
490 | 0 | } |
491 | | // Verify compatibility of file_options_ and filesystem |
492 | 58.3k | { |
493 | 58.3k | std::unique_ptr<FSRandomAccessFile> idfile; |
494 | 58.3k | FileOptions customized_fs(file_options_); |
495 | 58.3k | customized_fs.use_direct_reads |= |
496 | 58.3k | immutable_db_options_.use_direct_io_for_flush_and_compaction; |
497 | 58.3k | const std::string& fname = |
498 | 58.3k | manifest_path.empty() ? current_fname : manifest_path; |
499 | 58.3k | s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr); |
500 | 58.3k | if (!s.ok()) { |
501 | 0 | std::string error_str = s.ToString(); |
502 | | // Check if unsupported Direct I/O is the root cause |
503 | 0 | customized_fs.use_direct_reads = false; |
504 | 0 | s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr); |
505 | 0 | if (s.ok()) { |
506 | 0 | return Status::InvalidArgument( |
507 | 0 | "Direct I/O is not supported by the specified DB."); |
508 | 0 | } else { |
509 | 0 | return Status::InvalidArgument( |
510 | 0 | "Found options incompatible with filesystem", error_str.c_str()); |
511 | 0 | } |
512 | 0 | } |
513 | 58.3k | } |
514 | 58.3k | } else if (immutable_db_options_.best_efforts_recovery) { |
515 | 0 | assert(files_in_dbname.empty()); |
516 | 0 | IOOptions io_opts; |
517 | 0 | io_opts.do_not_recurse = true; |
518 | 0 | Status s = immutable_db_options_.fs->GetChildren( |
519 | 0 | dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr); |
520 | 0 | if (s.IsNotFound()) { |
521 | 0 | return Status::InvalidArgument(dbname_, |
522 | 0 | "does not exist (open for read only)"); |
523 | 0 | } else if (s.IsIOError()) { |
524 | 0 | return s; |
525 | 0 | } |
526 | 0 | assert(s.ok()); |
527 | 0 | } |
528 | 58.3k | assert(is_new_db || db_id_.empty()); |
529 | 58.3k | Status s; |
530 | 58.3k | bool missing_table_file = false; |
531 | 58.3k | if (!immutable_db_options_.best_efforts_recovery) { |
532 | | // Status of reading the descriptor file |
533 | 58.3k | Status desc_status; |
534 | 58.3k | s = versions_->Recover(column_families, read_only, &db_id_, |
535 | 58.3k | /*no_error_if_files_missing=*/false, is_retry, |
536 | 58.3k | &desc_status); |
537 | 58.3k | desc_status.PermitUncheckedError(); |
538 | 58.3k | if (is_retry) { |
539 | 0 | RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT); |
540 | 0 | if (desc_status.ok()) { |
541 | 0 | RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); |
542 | 0 | } |
543 | 0 | } |
544 | 58.3k | if (can_retry) { |
545 | | // If we're opening for the first time and the failure is likely due to |
546 | | // a corrupt MANIFEST file (could result in either the log::Reader |
547 | | // detecting a corrupt record, or SST files not found error due to |
548 | | // discarding badly formed tail records) |
549 | 58.3k | if (!is_retry && |
550 | 58.3k | (desc_status.IsCorruption() || s.IsNotFound() || s.IsCorruption()) && |
551 | 58.3k | CheckFSFeatureSupport(fs_.get(), |
552 | 0 | FSSupportedOps::kVerifyAndReconstructRead)) { |
553 | 0 | *can_retry = true; |
554 | 0 | ROCKS_LOG_ERROR( |
555 | 0 | immutable_db_options_.info_log, |
556 | 0 | "Possible corruption detected while replaying MANIFEST %s, %s. " |
557 | 0 | "Will be retried.", |
558 | 0 | desc_status.ToString().c_str(), s.ToString().c_str()); |
559 | 58.3k | } else { |
560 | 58.3k | *can_retry = false; |
561 | 58.3k | } |
562 | 58.3k | } |
563 | 58.3k | } else { |
564 | 0 | assert(!files_in_dbname.empty()); |
565 | 0 | s = versions_->TryRecover(column_families, read_only, files_in_dbname, |
566 | 0 | &db_id_, &missing_table_file); |
567 | 0 | if (s.ok()) { |
568 | | // TryRecover may delete previous column_family_set_. |
569 | 0 | column_family_memtables_.reset( |
570 | 0 | new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); |
571 | 0 | } |
572 | 0 | } |
573 | 58.3k | if (!s.ok()) { |
574 | 0 | return s; |
575 | 0 | } |
576 | 58.3k | if (s.ok() && !read_only) { |
577 | 80.3k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
578 | 80.3k | const auto& moptions = cfd->GetLatestMutableCFOptions(); |
579 | | // Try to trivially move files down the LSM tree to start from bottommost |
580 | | // level when level_compaction_dynamic_level_bytes is enabled. This should |
581 | | // only be useful when user is migrating to turning on this option. |
582 | | // If a user is migrating from Level Compaction with a smaller level |
583 | | // multiplier or from Universal Compaction, there may be too many |
584 | | // non-empty levels and the trivial moves here do not suffice for |
585 | | // migration. Additional compactions are needed to drain unnecessary |
586 | | // levels. |
587 | | // |
588 | | // Note that this step moves files down LSM without consulting |
589 | | // SSTPartitioner. Further compactions are still needed if |
590 | | // the user wants to partition SST files. |
591 | | // Note that files moved in this step may not respect the compression |
592 | | // option in target level. |
593 | 80.3k | if (cfd->ioptions().compaction_style == |
594 | 80.3k | CompactionStyle::kCompactionStyleLevel && |
595 | 80.3k | cfd->ioptions().level_compaction_dynamic_level_bytes && |
596 | 80.3k | !moptions.disable_auto_compactions) { |
597 | 80.3k | int to_level = cfd->ioptions().num_levels - 1; |
598 | | // last level is reserved |
599 | | // allow_ingest_behind does not support Level Compaction, |
600 | | // and per_key_placement can have infinite compaction loop for Level |
601 | | // Compaction. Adjust to_level here just to be safe. |
602 | 80.3k | if (cfd->ioptions().allow_ingest_behind || |
603 | 80.3k | moptions.preclude_last_level_data_seconds > 0) { |
604 | 0 | to_level -= 1; |
605 | 0 | } |
606 | | // Whether this column family has a level trivially moved |
607 | 80.3k | bool moved = false; |
608 | | // Fill the LSM starting from to_level and going up one level at a time. |
609 | | // Some loop invariants (when last level is not reserved): |
610 | | // - levels in (from_level, to_level] are empty, and |
611 | | // - levels in (to_level, last_level] are non-empty. |
612 | 642k | for (int from_level = to_level; from_level >= 0; --from_level) { |
613 | 562k | const std::vector<FileMetaData*>& level_files = |
614 | 562k | cfd->current()->storage_info()->LevelFiles(from_level); |
615 | 562k | if (level_files.empty() || from_level == 0) { |
616 | 547k | continue; |
617 | 547k | } |
618 | 15.0k | assert(from_level <= to_level); |
619 | | // Trivial move files from `from_level` to `to_level` |
620 | 15.0k | if (from_level < to_level) { |
621 | 0 | if (!moved) { |
622 | | // lsm_state will look like "[1,2,3,4,5,6,0]" for an LSM with |
623 | | // 7 levels |
624 | 0 | std::string lsm_state = "["; |
625 | 0 | for (int i = 0; i < cfd->ioptions().num_levels; ++i) { |
626 | 0 | lsm_state += std::to_string( |
627 | 0 | cfd->current()->storage_info()->NumLevelFiles(i)); |
628 | 0 | if (i < cfd->ioptions().num_levels - 1) { |
629 | 0 | lsm_state += ","; |
630 | 0 | } |
631 | 0 | } |
632 | 0 | lsm_state += "]"; |
633 | 0 | ROCKS_LOG_WARN(immutable_db_options_.info_log, |
634 | 0 | "[%s] Trivially move files down the LSM when open " |
635 | 0 | "with level_compaction_dynamic_level_bytes=true," |
636 | 0 | " lsm_state: %s (Files are moved only if DB " |
637 | 0 | "Recovery is successful).", |
638 | 0 | cfd->GetName().c_str(), lsm_state.c_str()); |
639 | 0 | moved = true; |
640 | 0 | } |
641 | 0 | ROCKS_LOG_WARN( |
642 | 0 | immutable_db_options_.info_log, |
643 | 0 | "[%s] Moving %zu files from from_level-%d to from_level-%d", |
644 | 0 | cfd->GetName().c_str(), level_files.size(), from_level, |
645 | 0 | to_level); |
646 | 0 | VersionEdit edit; |
647 | 0 | edit.SetColumnFamily(cfd->GetID()); |
648 | 0 | for (const FileMetaData* f : level_files) { |
649 | 0 | edit.DeleteFile(from_level, f->fd.GetNumber()); |
650 | 0 | edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), |
651 | 0 | f->fd.GetFileSize(), f->smallest, f->largest, |
652 | 0 | f->fd.smallest_seqno, f->fd.largest_seqno, |
653 | 0 | f->marked_for_compaction, |
654 | 0 | f->temperature, // this can be different from |
655 | | // `last_level_temperature` |
656 | 0 | f->oldest_blob_file_number, f->oldest_ancester_time, |
657 | 0 | f->file_creation_time, f->epoch_number, |
658 | 0 | f->file_checksum, f->file_checksum_func_name, |
659 | 0 | f->unique_id, f->compensated_range_deletion_size, |
660 | 0 | f->tail_size, f->user_defined_timestamps_persisted); |
661 | 0 | ROCKS_LOG_WARN(immutable_db_options_.info_log, |
662 | 0 | "[%s] Moving #%" PRIu64 |
663 | 0 | " from from_level-%d to from_level-%d %" PRIu64 |
664 | 0 | " bytes\n", |
665 | 0 | cfd->GetName().c_str(), f->fd.GetNumber(), |
666 | 0 | from_level, to_level, f->fd.GetFileSize()); |
667 | 0 | } |
668 | 0 | recovery_ctx->UpdateVersionEdits(cfd, edit); |
669 | 0 | } |
670 | 15.0k | --to_level; |
671 | 15.0k | } |
672 | 80.3k | } |
673 | 80.3k | } |
674 | 58.3k | } |
675 | 58.3k | if (is_new_db) { |
676 | | // Already set up DB ID in NewDB |
677 | 48.1k | } else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) { |
678 | 48.1k | VersionEdit edit; |
679 | 48.1k | s = SetupDBId(write_options, read_only, is_new_db, is_retry, &edit); |
680 | 48.1k | recovery_ctx->UpdateVersionEdits( |
681 | 48.1k | versions_->GetColumnFamilySet()->GetDefault(), edit); |
682 | 48.1k | } else { |
683 | 0 | s = SetupDBId(write_options, read_only, is_new_db, is_retry, nullptr); |
684 | 0 | } |
685 | 58.3k | assert(!s.ok() || !db_id_.empty()); |
686 | 58.3k | ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str()); |
687 | 58.3k | if (s.ok() && !read_only) { |
688 | 58.3k | s = MaybeUpdateNextFileNumber(recovery_ctx); |
689 | 58.3k | } |
690 | | |
691 | 58.3k | if (s.ok() && !read_only) { |
692 | | // TODO: share file descriptors (FSDirectory) with SetDirectories above |
693 | 58.3k | std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs; |
694 | 80.3k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
695 | 80.3k | s = cfd->AddDirectories(&created_dirs); |
696 | 80.3k | if (!s.ok()) { |
697 | 0 | return s; |
698 | 0 | } |
699 | 80.3k | } |
700 | 58.3k | } |
701 | | |
702 | 58.3k | std::vector<std::string> files_in_wal_dir; |
703 | 58.3k | if (s.ok()) { |
704 | | // Initial max_total_in_memory_state_ before recovery wals. Log recovery |
705 | | // may check this value to decide whether to flush. |
706 | 58.3k | max_total_in_memory_state_ = 0; |
707 | 80.3k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
708 | 80.3k | const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions(); |
709 | 80.3k | max_total_in_memory_state_ += mutable_cf_options.write_buffer_size * |
710 | 80.3k | mutable_cf_options.max_write_buffer_number; |
711 | 80.3k | } |
712 | | |
713 | 58.3k | SequenceNumber next_sequence(kMaxSequenceNumber); |
714 | 58.3k | default_cf_handle_ = new ColumnFamilyHandleImpl( |
715 | 58.3k | versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); |
716 | 58.3k | default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); |
717 | | |
718 | | // Recover from all newer log files than the ones named in the |
719 | | // descriptor (new log files may have been added by the previous |
720 | | // incarnation without registering them in the descriptor). |
721 | | // |
722 | | // Note that prev_log_number() is no longer used, but we pay |
723 | | // attention to it in case we are recovering a database |
724 | | // produced by an older version of rocksdb. |
725 | 58.3k | auto wal_dir = immutable_db_options_.GetWalDir(); |
726 | 58.3k | if (!immutable_db_options_.best_efforts_recovery) { |
727 | 58.3k | IOOptions io_opts; |
728 | 58.3k | io_opts.do_not_recurse = true; |
729 | 58.3k | s = immutable_db_options_.fs->GetChildren( |
730 | 58.3k | wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr); |
731 | 58.3k | } |
732 | 58.3k | if (s.IsNotFound()) { |
733 | 0 | return Status::InvalidArgument("wal_dir not found", wal_dir); |
734 | 58.3k | } else if (!s.ok()) { |
735 | 0 | return s; |
736 | 0 | } |
737 | | |
738 | 58.3k | std::unordered_map<uint64_t, std::string> wal_files; |
739 | 1.06M | for (const auto& file : files_in_wal_dir) { |
740 | 1.06M | uint64_t number; |
741 | 1.06M | FileType type; |
742 | 1.06M | if (ParseFileName(file, &number, &type) && type == kWalFile) { |
743 | 48.1k | if (is_new_db) { |
744 | 0 | return Status::Corruption( |
745 | 0 | "While creating a new Db, wal_dir contains " |
746 | 0 | "existing log file: ", |
747 | 0 | file); |
748 | 48.1k | } else { |
749 | 48.1k | wal_files[number] = LogFileName(wal_dir, number); |
750 | 48.1k | } |
751 | 48.1k | } |
752 | 1.06M | } |
753 | | |
754 | 58.3k | if (immutable_db_options_.track_and_verify_wals && !is_new_db && |
755 | 58.3k | !immutable_db_options_.best_efforts_recovery && wal_files.empty()) { |
756 | 0 | return Status::Corruption("Opening an existing DB with no WAL files"); |
757 | 0 | } |
758 | | |
759 | 58.3k | if (immutable_db_options_.track_and_verify_wals_in_manifest) { |
760 | 0 | if (!immutable_db_options_.best_efforts_recovery) { |
761 | | // Verify WALs in MANIFEST. |
762 | 0 | s = versions_->GetWalSet().CheckWals(env_, wal_files); |
763 | 0 | } // else since best effort recovery does not recover from WALs, no need |
764 | | // to check WALs. |
765 | 58.3k | } else if (!versions_->GetWalSet().GetWals().empty()) { |
766 | | // Tracking is disabled, clear previously tracked WALs from MANIFEST, |
767 | | // otherwise, in the future, if WAL tracking is enabled again, |
768 | | // since the WALs deleted when WAL tracking is disabled are not persisted |
769 | | // into MANIFEST, WAL check may fail. |
770 | 0 | VersionEdit edit; |
771 | 0 | WalNumber max_wal_number = |
772 | 0 | versions_->GetWalSet().GetWals().rbegin()->first; |
773 | 0 | edit.DeleteWalsBefore(max_wal_number + 1); |
774 | 0 | assert(recovery_ctx != nullptr); |
775 | 0 | assert(versions_->GetColumnFamilySet() != nullptr); |
776 | 0 | recovery_ctx->UpdateVersionEdits( |
777 | 0 | versions_->GetColumnFamilySet()->GetDefault(), edit); |
778 | 0 | } |
779 | 58.3k | if (!s.ok()) { |
780 | 0 | return s; |
781 | 0 | } |
782 | | |
783 | 58.3k | if (!wal_files.empty()) { |
784 | 48.1k | if (error_if_wal_file_exists) { |
785 | 0 | return Status::Corruption( |
786 | 0 | "The db was opened in readonly mode with error_if_wal_file_exists" |
787 | 0 | "flag but a WAL file already exists"); |
788 | 48.1k | } else if (error_if_data_exists_in_wals) { |
789 | 0 | for (auto& wal_file : wal_files) { |
790 | 0 | uint64_t bytes; |
791 | 0 | s = env_->GetFileSize(wal_file.second, &bytes); |
792 | 0 | if (s.ok()) { |
793 | 0 | if (bytes > 0) { |
794 | 0 | return Status::Corruption( |
795 | 0 | "error_if_data_exists_in_wals is set but there are data " |
796 | 0 | " in WAL files."); |
797 | 0 | } |
798 | 0 | } |
799 | 0 | } |
800 | 0 | } |
801 | 48.1k | } |
802 | | |
803 | 58.3k | if (!wal_files.empty()) { |
804 | | // Recover in the order in which the wals were generated |
805 | 48.1k | std::vector<uint64_t> wals; |
806 | 48.1k | wals.reserve(wal_files.size()); |
807 | 48.1k | for (const auto& wal_file : wal_files) { |
808 | 48.1k | wals.push_back(wal_file.first); |
809 | 48.1k | } |
810 | 48.1k | std::sort(wals.begin(), wals.end()); |
811 | | |
812 | 48.1k | bool corrupted_wal_found = false; |
813 | 48.1k | s = RecoverLogFiles(wals, &next_sequence, read_only, is_retry, |
814 | 48.1k | &corrupted_wal_found, recovery_ctx); |
815 | 48.1k | if (corrupted_wal_found && recovered_seq != nullptr) { |
816 | 0 | *recovered_seq = next_sequence; |
817 | 0 | } |
818 | 48.1k | if (!s.ok()) { |
819 | | // Clear memtables if recovery failed |
820 | 0 | for (auto cfd : *versions_->GetColumnFamilySet()) { |
821 | 0 | cfd->CreateNewMemtable(kMaxSequenceNumber); |
822 | 0 | } |
823 | 0 | } |
824 | 48.1k | } |
825 | 58.3k | } |
826 | | |
827 | 58.3k | if (read_only) { |
828 | | // If we are opening as read-only, we need to update options_file_number_ |
829 | | // to reflect the most recent OPTIONS file. It does not matter for regular |
830 | | // read-write db instance because options_file_number_ will later be |
831 | | // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile. |
832 | 0 | std::vector<std::string> filenames; |
833 | 0 | if (s.ok()) { |
834 | 0 | const std::string normalized_dbname = NormalizePath(dbname_); |
835 | 0 | const std::string normalized_wal_dir = |
836 | 0 | NormalizePath(immutable_db_options_.GetWalDir()); |
837 | 0 | if (immutable_db_options_.best_efforts_recovery) { |
838 | 0 | filenames = std::move(files_in_dbname); |
839 | 0 | } else if (normalized_dbname == normalized_wal_dir) { |
840 | 0 | filenames = std::move(files_in_wal_dir); |
841 | 0 | } else { |
842 | 0 | IOOptions io_opts; |
843 | 0 | io_opts.do_not_recurse = true; |
844 | 0 | s = immutable_db_options_.fs->GetChildren( |
845 | 0 | GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr); |
846 | 0 | } |
847 | 0 | } |
848 | 0 | if (s.ok()) { |
849 | 0 | uint64_t number = 0; |
850 | 0 | uint64_t options_file_number = 0; |
851 | 0 | FileType type; |
852 | 0 | for (const auto& fname : filenames) { |
853 | 0 | if (ParseFileName(fname, &number, &type) && type == kOptionsFile) { |
854 | 0 | options_file_number = std::max(number, options_file_number); |
855 | 0 | } |
856 | 0 | } |
857 | 0 | versions_->options_file_number_ = options_file_number; |
858 | 0 | uint64_t options_file_size = 0; |
859 | 0 | if (options_file_number > 0) { |
860 | 0 | s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number), |
861 | 0 | &options_file_size); |
862 | 0 | } |
863 | 0 | versions_->options_file_size_ = options_file_size; |
864 | 0 | } |
865 | 0 | } |
866 | 58.3k | return s; |
867 | 58.3k | } |
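
To make the create_if_missing branch above concrete, a minimal sketch (hypothetical path, assumed empty): opening a directory with no CURRENT file fails with InvalidArgument unless create_if_missing is set.

```cpp
#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = false;  // the default, shown for emphasis

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/no_db_here", &db);
  // Recover() returns InvalidArgument(current_fname,
  //     "does not exist (create_if_missing is false)") in this case.
  assert(s.IsInvalidArgument());
  return 0;
}
```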
868 | | |
869 | 0 | Status DBImpl::PersistentStatsProcessFormatVersion() { |
870 | 0 | mutex_.AssertHeld(); |
871 | 0 | Status s; |
872 | | // persist version when stats CF doesn't exist |
873 | 0 | bool should_persist_format_version = !persistent_stats_cfd_exists_; |
874 | 0 | mutex_.Unlock(); |
875 | 0 | if (persistent_stats_cfd_exists_) { |
876 | | // Check persistent stats format version compatibility. Drop and recreate |
877 | | // persistent stats CF if format version is incompatible |
878 | 0 | uint64_t format_version_recovered = 0; |
879 | 0 | Status s_format = DecodePersistentStatsVersionNumber( |
880 | 0 | this, StatsVersionKeyType::kFormatVersion, &format_version_recovered); |
881 | 0 | uint64_t compatible_version_recovered = 0; |
882 | 0 | Status s_compatible = DecodePersistentStatsVersionNumber( |
883 | 0 | this, StatsVersionKeyType::kCompatibleVersion, |
884 | 0 | &compatible_version_recovered); |
885 | | // abort reading from existing stats CF if any of following is true: |
886 | | // 1. failed to read format version or compatible version from disk |
887 | | // 2. sst's format version is greater than current format version, meaning |
888 | | // this sst is encoded with a newer RocksDB release, and current compatible |
889 | | // version is below the sst's compatible version |
890 | 0 | if (!s_format.ok() || !s_compatible.ok() || |
891 | 0 | (kStatsCFCurrentFormatVersion < format_version_recovered && |
892 | 0 | kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { |
893 | 0 | if (!s_format.ok() || !s_compatible.ok()) { |
894 | 0 | ROCKS_LOG_WARN( |
895 | 0 | immutable_db_options_.info_log, |
896 | 0 | "Recreating persistent stats column family since reading " |
897 | 0 | "persistent stats version key failed. Format key: %s, compatible " |
898 | 0 | "key: %s", |
899 | 0 | s_format.ToString().c_str(), s_compatible.ToString().c_str()); |
900 | 0 | } else { |
901 | 0 | ROCKS_LOG_WARN( |
902 | 0 | immutable_db_options_.info_log, |
903 | 0 | "Recreating persistent stats column family due to corrupted or " |
904 | 0 | "incompatible format version. Recovered format: %" PRIu64 |
905 | 0 | "; recovered format compatible since: %" PRIu64 "\n", |
906 | 0 | format_version_recovered, compatible_version_recovered); |
907 | 0 | } |
908 | 0 | s = DropColumnFamily(persist_stats_cf_handle_); |
909 | 0 | if (s.ok()) { |
910 | 0 | s = DestroyColumnFamilyHandle(persist_stats_cf_handle_); |
911 | 0 | } |
912 | 0 | ColumnFamilyHandle* handle = nullptr; |
913 | 0 | if (s.ok()) { |
914 | 0 | ColumnFamilyOptions cfo; |
915 | 0 | OptimizeForPersistentStats(&cfo); |
916 | 0 | s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen), |
917 | 0 | WriteOptions(Env::IOActivity::kDBOpen), cfo, |
918 | 0 | kPersistentStatsColumnFamilyName, &handle); |
919 | 0 | } |
920 | 0 | if (s.ok()) { |
921 | 0 | persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle); |
922 | | // should also persist version here because old stats CF is discarded |
923 | 0 | should_persist_format_version = true; |
924 | 0 | } |
925 | 0 | } |
926 | 0 | } |
927 | 0 | if (should_persist_format_version) { |
928 | | // Persistent stats CF being created for the first time, need to write |
929 | | // format version key |
930 | 0 | WriteBatch batch; |
931 | 0 | if (s.ok()) { |
932 | 0 | s = batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, |
933 | 0 | std::to_string(kStatsCFCurrentFormatVersion)); |
934 | 0 | } |
935 | 0 | if (s.ok()) { |
936 | 0 | s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, |
937 | 0 | std::to_string(kStatsCFCompatibleFormatVersion)); |
938 | 0 | } |
939 | 0 | if (s.ok()) { |
940 | | // TODO: plumb Env::IOActivity, Env::IOPriority |
941 | 0 | WriteOptions wo; |
942 | 0 | wo.low_pri = true; |
943 | 0 | wo.no_slowdown = true; |
944 | 0 | wo.sync = false; |
945 | 0 | s = Write(wo, &batch); |
946 | 0 | } |
947 | 0 | } |
948 | 0 | mutex_.Lock(); |
949 | 0 | return s; |
950 | 0 | } |
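
A minimal sketch of the feature this function maintains: with persist_stats_to_disk enabled, periodic stats snapshots land in the hidden persistent-stats column family and can be read back through GetStatsHistory(). The path, cadence, and time range are hypothetical.

```cpp
#include <cstdint>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/statistics.h"
#include "rocksdb/stats_history.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  options.persist_stats_to_disk = true;   // use the persistent stats CF
  options.stats_persist_period_sec = 60;  // snapshot cadence

  rocksdb::DB* db = nullptr;
  if (!rocksdb::DB::Open(options, "/tmp/stats_demo", &db).ok()) return 1;

  std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  rocksdb::Status s = db->GetStatsHistory(0, UINT64_MAX, &it);
  for (; s.ok() && it->Valid(); it->Next()) {
    // it->GetStatsTime() / it->GetStatsMap() expose each snapshot.
  }
  delete db;
  return 0;
}
```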
951 | | |
952 | 0 | Status DBImpl::InitPersistStatsColumnFamily() { |
953 | 0 | mutex_.AssertHeld(); |
954 | 0 | assert(!persist_stats_cf_handle_); |
955 | 0 | ColumnFamilyData* persistent_stats_cfd = |
956 | 0 | versions_->GetColumnFamilySet()->GetColumnFamily( |
957 | 0 | kPersistentStatsColumnFamilyName); |
958 | 0 | persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr; |
959 | |
960 | 0 | Status s; |
961 | 0 | if (persistent_stats_cfd != nullptr) { |
962 | | // We are recovering from a DB which already contains persistent stats CF, |
963 | | // the CF is already created in VersionSet::ApplyOneVersionEdit, but |
964 | | // column family handle was not. Need to explicitly create handle here. |
965 | 0 | persist_stats_cf_handle_ = |
966 | 0 | new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); |
967 | 0 | } else { |
968 | 0 | mutex_.Unlock(); |
969 | 0 | ColumnFamilyHandle* handle = nullptr; |
970 | 0 | ColumnFamilyOptions cfo; |
971 | 0 | OptimizeForPersistentStats(&cfo); |
972 | 0 | s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen), |
973 | 0 | WriteOptions(Env::IOActivity::kDBOpen), cfo, |
974 | 0 | kPersistentStatsColumnFamilyName, &handle); |
975 | 0 | persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle); |
976 | 0 | mutex_.Lock(); |
977 | 0 | } |
978 | 0 | return s; |
979 | 0 | } |
980 | | |
981 | 58.3k | Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { |
982 | 58.3k | mutex_.AssertHeld(); |
983 | 58.3k | assert(versions_->descriptor_log_ == nullptr); |
984 | 58.3k | const ReadOptions read_options(Env::IOActivity::kDBOpen); |
985 | 58.3k | const WriteOptions write_options(Env::IOActivity::kDBOpen); |
986 | | |
987 | 58.3k | Status s = versions_->LogAndApply(recovery_ctx.cfds_, read_options, |
988 | 58.3k | write_options, recovery_ctx.edit_lists_, |
989 | 58.3k | &mutex_, directories_.GetDbDir()); |
990 | 58.3k | return s; |
991 | 58.3k | } |
992 | | |
993 | 48.1k | void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() { |
994 | 48.1k | if (immutable_db_options_.wal_filter == nullptr) { |
995 | 48.1k | return; |
996 | 48.1k | } |
997 | 0 | assert(immutable_db_options_.wal_filter != nullptr); |
998 | 0 | WalFilter& wal_filter = *(immutable_db_options_.wal_filter); |
999 | |
1000 | 0 | std::map<std::string, uint32_t> cf_name_id_map; |
1001 | 0 | std::map<uint32_t, uint64_t> cf_lognumber_map; |
1002 | 0 | assert(versions_); |
1003 | 0 | assert(versions_->GetColumnFamilySet()); |
1004 | 0 | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1005 | 0 | assert(cfd); |
1006 | 0 | cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID())); |
1007 | 0 | cf_lognumber_map.insert(std::make_pair(cfd->GetID(), cfd->GetLogNumber())); |
1008 | 0 | } |
1009 | |
1010 | 0 | wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map); |
1011 | 0 | } |
1012 | | |
1013 | | bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number, |
1014 | | const std::string& wal_fname, |
1015 | | log::Reader::Reporter& reporter, |
1016 | | Status& status, |
1017 | | bool& stop_replay, |
1018 | 1.01M | WriteBatch& batch) { |
1019 | 1.01M | if (immutable_db_options_.wal_filter == nullptr) { |
1020 | 1.01M | return true; |
1021 | 1.01M | } |
1022 | 0 | assert(immutable_db_options_.wal_filter != nullptr); |
1023 | 0 | WalFilter& wal_filter = *(immutable_db_options_.wal_filter); |
1024 | |
1025 | 0 | WriteBatch new_batch; |
1026 | 0 | bool batch_changed = false; |
1027 | |
1028 | 0 | bool process_current_record = true; |
1029 | |
1030 | 0 | WalFilter::WalProcessingOption wal_processing_option = |
1031 | 0 | wal_filter.LogRecordFound(wal_number, wal_fname, batch, &new_batch, |
1032 | 0 | &batch_changed); |
1033 | |
|
1034 | 0 | switch (wal_processing_option) { |
1035 | 0 | case WalFilter::WalProcessingOption::kContinueProcessing: |
1036 | | // do nothing, proceed normally |
1037 | 0 | break; |
1038 | 0 | case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: |
1039 | | // skip current record |
1040 | 0 | process_current_record = false; |
1041 | 0 | break; |
1042 | 0 | case WalFilter::WalProcessingOption::kStopReplay: |
1043 | | // skip current record and stop replay |
1044 | 0 | process_current_record = false; |
1045 | 0 | stop_replay = true; |
1046 | 0 | break; |
1047 | 0 | case WalFilter::WalProcessingOption::kCorruptedRecord: { |
1048 | 0 | status = Status::Corruption("Corruption reported by Wal Filter ", |
1049 | 0 | wal_filter.Name()); |
1050 | 0 | MaybeIgnoreError(&status); |
1051 | 0 | if (!status.ok()) { |
1052 | 0 | process_current_record = false; |
1053 | 0 | reporter.Corruption(batch.GetDataSize(), status); |
1054 | 0 | } |
1055 | 0 | break; |
1056 | 0 | } |
1057 | 0 | default: { |
1058 | | // logical error which should not happen. If RocksDB throws, we would |
1059 | | // just do `throw std::logic_error`. |
1060 | 0 | assert(false); |
1061 | 0 | status = Status::NotSupported( |
1062 | 0 | "Unknown WalProcessingOption returned by Wal Filter ", |
1063 | 0 | wal_filter.Name()); |
1064 | 0 | MaybeIgnoreError(&status); |
1065 | 0 | if (!status.ok()) { |
1066 | | // Ignore the error with current record processing. |
1067 | 0 | stop_replay = true; |
1068 | 0 | } |
1069 | 0 | break; |
1070 | 0 | } |
1071 | 0 | } |
1072 | | |
1073 | 0 | if (!process_current_record) { |
1074 | 0 | return false; |
1075 | 0 | } |
1076 | | |
1077 | 0 | if (batch_changed) { |
1078 | | // Make sure that the count in the new batch is |
1079 | | // within the original count. |
1080 | 0 | int new_count = WriteBatchInternal::Count(&new_batch); |
1081 | 0 | int original_count = WriteBatchInternal::Count(&batch); |
1082 | 0 | if (new_count > original_count) { |
1083 | 0 | ROCKS_LOG_FATAL( |
1084 | 0 | immutable_db_options_.info_log, |
1085 | 0 | "Recovering log #%" PRIu64 |
1086 | 0 | " mode %d log filter %s returned " |
1087 | 0 | "more records (%d) than original (%d) which is not allowed. " |
1088 | 0 | "Aborting recovery.", |
1089 | 0 | wal_number, static_cast<int>(immutable_db_options_.wal_recovery_mode), |
1090 | 0 | wal_filter.Name(), new_count, original_count); |
1091 | 0 | status = Status::NotSupported( |
1092 | 0 | "More than original # of records " |
1093 | 0 | "returned by Wal Filter ", |
1094 | 0 | wal_filter.Name()); |
1095 | 0 | return false; |
1096 | 0 | } |
1097 | | // Set the same sequence number in the new_batch |
1098 | | // as the original batch. |
1099 | 0 | WriteBatchInternal::SetSequence(&new_batch, |
1100 | 0 | WriteBatchInternal::Sequence(&batch)); |
1101 | 0 | batch = new_batch; |
1102 | 0 | } |
1103 | 0 | return true; |
1104 | 0 | } |
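
To show what the replay hook above invokes, a minimal WalFilter sketch built on the public rocksdb/wal_filter.h interface: it skips records from WALs older than a hypothetical cutoff number and keeps everything else. The cutoff value is illustrative only.

```cpp
#include <string>
#include "rocksdb/wal_filter.h"
#include "rocksdb/write_batch.h"

class DropOldWalsFilter : public rocksdb::WalFilter {
 public:
  WalProcessingOption LogRecordFound(unsigned long long log_number,
                                     const std::string& /*log_file_name*/,
                                     const rocksdb::WriteBatch& /*batch*/,
                                     rocksdb::WriteBatch* /*new_batch*/,
                                     bool* batch_changed) override {
    *batch_changed = false;  // never rewrites batches, only skips records
    return log_number < kCutoffWalNumber
               ? WalProcessingOption::kIgnoreCurrentRecord
               : WalProcessingOption::kContinueProcessing;
  }

  const char* Name() const override { return "DropOldWalsFilter"; }

 private:
  static constexpr unsigned long long kCutoffWalNumber = 42;  // hypothetical
};

// Installed before open via the raw-pointer option (the DB does not take
// ownership): options.wal_filter = &my_filter;
```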
1105 | | |
1106 | | void DBOpenLogRecordReadReporter::Corruption(size_t bytes, const Status& s, |
1107 | 0 | uint64_t log_number) { |
1108 | 0 | ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", |
1109 | 0 | (status == nullptr ? "(ignoring error) " : ""), fname, |
1110 | 0 | static_cast<int>(bytes), s.ToString().c_str()); |
1111 | 0 | if (status != nullptr && status->ok()) { |
1112 | 0 | *status = s; |
1113 | 0 | corrupted_wal_number_ = log_number; |
1114 | 0 | } |
1115 | 0 | } |
1116 | | |
1117 | 0 | void DBOpenLogRecordReadReporter::OldLogRecord(size_t bytes) { |
1118 | 0 | if (old_log_record != nullptr) { |
1119 | 0 | *old_log_record = true; |
1120 | 0 | } |
1121 | 0 | ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes; possibly recycled", fname, |
1122 | 0 | static_cast<int>(bytes)); |
1123 | 0 | } |
1124 | | |
1125 | | // REQUIRES: wal_numbers are sorted in ascending order |
1126 | | Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers, |
1127 | | SequenceNumber* next_sequence, bool read_only, |
1128 | | bool is_retry, bool* corrupted_wal_found, |
1129 | 48.1k | RecoveryContext* recovery_ctx) { |
1130 | 48.1k | mutex_.AssertHeld(); |
1131 | | |
1132 | 48.1k | std::unordered_map<int, VersionEdit> version_edits; |
1133 | 48.1k | int job_id = 0; |
1134 | 48.1k | uint64_t min_wal_number = 0; |
1135 | 48.1k | SetupLogFilesRecovery(wal_numbers, &version_edits, &job_id, &min_wal_number); |
1136 | | |
1137 | 48.1k | Status status = ProcessLogFiles( |
1138 | 48.1k | wal_numbers, read_only, is_retry, min_wal_number, job_id, next_sequence, |
1139 | 48.1k | &version_edits, corrupted_wal_found, recovery_ctx); |
1140 | | |
1141 | 48.1k | FinishLogFilesRecovery(job_id, status); |
1142 | 48.1k | return status; |
1143 | 48.1k | } |
1144 | | |
1145 | | void DBImpl::SetupLogFilesRecovery( |
1146 | | const std::vector<uint64_t>& wal_numbers, |
1147 | | std::unordered_map<int, VersionEdit>* version_edits, int* job_id, |
1148 | 48.1k | uint64_t* min_wal_number) { |
1149 | 48.1k | assert(version_edits); |
1150 | 48.1k | assert(job_id); |
1151 | 48.1k | assert(min_wal_number); |
1152 | | // No need to refcount because iteration is under mutex |
1153 | 70.1k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1154 | 70.1k | VersionEdit edit; |
1155 | 70.1k | edit.SetColumnFamily(cfd->GetID()); |
1156 | 70.1k | version_edits->insert({cfd->GetID(), edit}); |
1157 | 70.1k | } |
1158 | | |
1159 | 48.1k | *job_id = next_job_id_.fetch_add(1); |
1160 | 48.1k | { |
1161 | 48.1k | auto stream = event_logger_.Log(); |
1162 | 48.1k | stream << "job" << *job_id; |
1163 | 48.1k | stream << "event" << "recovery_started"; |
1164 | 48.1k | stream << "wal_files"; |
1165 | 48.1k | stream.StartArray(); |
1166 | 48.1k | for (auto wal_number : wal_numbers) { |
1167 | 48.1k | stream << wal_number; |
1168 | 48.1k | } |
1169 | 48.1k | stream.EndArray(); |
1170 | 48.1k | } |
1171 | | |
1172 | | // No-op for immutable_db_options_.wal_filter == nullptr. |
1173 | 48.1k | InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap(); |
1174 | | |
1175 | 48.1k | *min_wal_number = MinLogNumberToKeep(); |
1176 | 48.1k | if (!allow_2pc()) { |
1177 | | // In non-2pc mode, we skip WALs that do not back unflushed data. |
1178 | 48.1k | *min_wal_number = |
1179 | 48.1k | std::max(*min_wal_number, versions_->MinLogNumberWithUnflushedData()); |
1180 | 48.1k | } |
1181 | 48.1k | } |
1182 | | |
1183 | | Status DBImpl::ProcessLogFiles( |
1184 | | const std::vector<uint64_t>& wal_numbers, bool read_only, bool is_retry, |
1185 | | uint64_t min_wal_number, int job_id, SequenceNumber* next_sequence, |
1186 | | std::unordered_map<int, VersionEdit>* version_edits, |
1187 | 48.1k | bool* corrupted_wal_found, RecoveryContext* recovery_ctx) { |
1188 | 48.1k | Status status; |
1189 | | |
1190 | 48.1k | bool stop_replay_by_wal_filter = false; |
1191 | 48.1k | bool stop_replay_for_corruption = false; |
1192 | 48.1k | bool flushed = false; |
1193 | 48.1k | uint64_t corrupted_wal_number = kMaxSequenceNumber; |
1194 | 48.1k | PredecessorWALInfo predecessor_wal_info; |
1195 | | |
1196 | 48.1k | for (auto wal_number : wal_numbers) { |
1197 | | // Detect an early break at the top of the next iteration, after |
1198 | | // `wal_number` has advanced, since the current `wal_number` does not |
1199 | | // affect the follow-up handling once we break out of the for loop. |
1200 | 48.1k | if (!status.ok()) { |
1201 | 0 | break; |
1202 | 0 | } |
1203 | 48.1k | SequenceNumber prev_next_sequence = *next_sequence; |
1204 | 48.1k | if (status.ok()) { |
1205 | 48.1k | status = ProcessLogFile( |
1206 | 48.1k | wal_number, min_wal_number, is_retry, read_only, job_id, |
1207 | 48.1k | next_sequence, &stop_replay_for_corruption, |
1208 | 48.1k | &stop_replay_by_wal_filter, &corrupted_wal_number, |
1209 | 48.1k | corrupted_wal_found, version_edits, &flushed, predecessor_wal_info); |
1210 | 48.1k | } |
1211 | 48.1k | if (status.ok()) { |
1212 | 48.1k | status = CheckSeqnoNotSetBackDuringRecovery(prev_next_sequence, |
1213 | 48.1k | *next_sequence); |
1214 | 48.1k | } |
1215 | 48.1k | } |
1216 | | |
1217 | 48.1k | if (status.ok()) { |
1218 | 48.1k | status = MaybeHandleStopReplayForCorruptionForInconsistency( |
1219 | 48.1k | stop_replay_for_corruption, corrupted_wal_number); |
1220 | 48.1k | } |
1221 | | |
1222 | 48.1k | if (status.ok()) { |
1223 | 48.1k | status = MaybeFlushFinalMemtableOrRestoreActiveLogFiles( |
1224 | 48.1k | wal_numbers, read_only, job_id, flushed, version_edits, recovery_ctx); |
1225 | 48.1k | } |
1226 | 48.1k | return status; |
1227 | 48.1k | } |
1228 | | |
1229 | | Status DBImpl::ProcessLogFile( |
1230 | | uint64_t wal_number, uint64_t min_wal_number, bool is_retry, bool read_only, |
1231 | | int job_id, SequenceNumber* next_sequence, bool* stop_replay_for_corruption, |
1232 | | bool* stop_replay_by_wal_filter, uint64_t* corrupted_wal_number, |
1233 | | bool* corrupted_wal_found, |
1234 | | std::unordered_map<int, VersionEdit>* version_edits, bool* flushed, |
1235 | 48.1k | PredecessorWALInfo& predecessor_wal_info) { |
1236 | 48.1k | assert(stop_replay_by_wal_filter); |
1237 | | |
1238 | | // Variable initialization starts |
1239 | 48.1k | Status status; |
1240 | 48.1k | bool old_log_record = false; |
1241 | | |
1242 | 48.1k | DBOpenLogRecordReadReporter reporter; |
1243 | 48.1k | std::unique_ptr<log::Reader> reader; |
1244 | | |
1245 | 48.1k | std::string fname = |
1246 | 48.1k | LogFileName(immutable_db_options_.GetWalDir(), wal_number); |
1247 | | |
1248 | 48.1k | auto logFileDropped = [this, &fname]() { |
1249 | 0 | uint64_t bytes; |
1250 | 0 | if (env_->GetFileSize(fname, &bytes).ok()) { |
1251 | 0 | auto info_log = immutable_db_options_.info_log.get(); |
1252 | 0 | ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(), |
1253 | 0 | static_cast<int>(bytes)); |
1254 | 0 | } |
1255 | 0 | }; |
1256 | | |
1257 | 48.1k | std::string scratch; |
1258 | 48.1k | Slice record; |
1259 | 48.1k | uint64_t record_checksum; |
1260 | 48.1k | const UnorderedMap<uint32_t, size_t>& running_ts_sz = |
1261 | 48.1k | versions_->GetRunningColumnFamiliesTimestampSize(); |
1262 | | |
1263 | | // We need to track `last_seqno_observed` in addition to `next_sequence` since |
1264 | | // `last_seqno_observed != *next_sequence` when there are multiple key-value |
1265 | | // pairs in one WAL entry |
1266 | 48.1k | SequenceNumber last_seqno_observed = 0; |
1267 | | // Variable initialization ends |
1268 | | |
1269 | 48.1k | if (wal_number < min_wal_number) { |
1270 | 0 | ROCKS_LOG_INFO(immutable_db_options_.info_log, |
1271 | 0 | "Skipping log #%" PRIu64 |
1272 | 0 | " since it is older than min log to keep #%" PRIu64, |
1273 | 0 | wal_number, min_wal_number); |
1274 | 0 | assert(status.ok()); |
1275 | 0 | return status; |
1276 | 0 | } |
1277 | | |
1278 | 48.1k | SetupLogFileProcessing(wal_number); |
1279 | | |
1280 | 48.1k | if (*stop_replay_by_wal_filter) { |
1281 | 0 | logFileDropped(); |
1282 | 0 | assert(status.ok()); |
1283 | 0 | return status; |
1284 | 0 | } |
1285 | | |
1286 | 48.1k | Status init_status = InitializeLogReader( |
1287 | 48.1k | wal_number, is_retry, fname, *stop_replay_for_corruption, min_wal_number, |
1288 | 48.1k | predecessor_wal_info, &old_log_record, &status, &reporter, reader); |
1289 | | |
1290 | | // FIXME(hx235): Consolidate `!init_status.ok()` and `reader == nullptr` cases |
1291 | 48.1k | if (!init_status.ok()) { |
1292 | 0 | assert(status.ok()); |
1293 | 0 | status.PermitUncheckedError(); |
1294 | 0 | return init_status; |
1295 | 48.1k | } else if (reader == nullptr) { |
1296 | | // TODO(hx235): remove this case since it's confusing |
1297 | 0 | assert(status.ok()); |
1298 | | // Initializing the log reader for this log file failed with an ok
1299 | | // status; try the next one.
1300 | 0 | return status; |
1301 | 0 | } |
1302 | | |
1303 | 48.1k | TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal", |
1304 | 48.1k | /*cb_arg=*/nullptr); |
1305 | 1.06M | while (true) { |
1306 | 1.06M | if (*stop_replay_by_wal_filter) { |
1307 | 0 | break; |
1308 | 0 | } |
1309 | | |
1310 | 1.06M | bool read_record = reader->ReadRecord( |
1311 | 1.06M | &record, &scratch, immutable_db_options_.wal_recovery_mode, |
1312 | 1.06M | &record_checksum); |
1313 | | |
1314 | | // `reader->ReadRecord` will change `status` through reporter in `reader` |
1315 | | // when a corruption is encountered |
1316 | | // FIXME(hx235): consolidate `read_record` and `status` |
1317 | 1.06M | if (!read_record || !status.ok()) { |
1318 | 48.1k | break; |
1319 | 48.1k | } |
1320 | | |
1321 | | // FIXME(hx235): consolidate `process_status` and `status` |
1322 | 1.01M | SequenceNumber prev_next_sequence = *next_sequence; |
1323 | 1.01M | Status process_status = ProcessLogRecord( |
1324 | 1.01M | record, reader, running_ts_sz, wal_number, fname, read_only, job_id, |
1325 | 1.01M | logFileDropped, &reporter, &record_checksum, &last_seqno_observed, |
1326 | 1.01M | next_sequence, stop_replay_for_corruption, &status, |
1327 | 1.01M | stop_replay_by_wal_filter, version_edits, flushed); |
1328 | | |
1329 | 1.01M | if (!process_status.ok()) { |
1330 | 0 | return process_status; |
1331 | 1.01M | } else if (Status seqno_check_status = CheckSeqnoNotSetBackDuringRecovery( |
1332 | 1.01M | prev_next_sequence, *next_sequence); |
1333 | 1.01M | !seqno_check_status.ok()) { |
1334 | | // A sequence number being set back indicates a serious software bug; the
1335 | | // DB should not be opened in this case.
1336 | 0 | return seqno_check_status; |
1337 | 1.01M | } else if (*stop_replay_for_corruption) { |
1338 | 0 | break; |
1339 | 0 | } |
1340 | 1.01M | } |
1341 | | |
1342 | 48.1k | ROCKS_LOG_INFO(immutable_db_options_.info_log, |
1343 | 48.1k | "Recovered to log #%" PRIu64 " next seq #%" PRIu64, wal_number, |
1344 | 48.1k | *next_sequence); |
1345 | | |
1346 | 48.1k | if (status.ok()) { |
1347 | 48.1k | status = UpdatePredecessorWALInfo(wal_number, last_seqno_observed, fname, |
1348 | 48.1k | predecessor_wal_info); |
1349 | 48.1k | } |
1350 | | |
1351 | 48.1k | if (!status.ok() || old_log_record) { |
1352 | 0 | status = HandleNonOkStatusOrOldLogRecord( |
1353 | 0 | wal_number, next_sequence, status, reporter, &old_log_record, |
1354 | 0 | stop_replay_for_corruption, corrupted_wal_number, corrupted_wal_found); |
1355 | 0 | } |
1356 | | |
1357 | 48.1k | FinishLogFileProcessing(status, next_sequence); |
1358 | | |
1359 | 48.1k | return status; |
1360 | 48.1k | } |
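// A condensed sketch of the per-WAL work ProcessLogFile() drives, with error
// reporting, WAL filtering, and flush scheduling elided (illustrative only,
// using the same internal calls shown above and below):
//
//   std::string scratch;
//   Slice record;
//   uint64_t record_checksum;
//   while (reader->ReadRecord(&record, &scratch,
//                             immutable_db_options_.wal_recovery_mode,
//                             &record_checksum)) {
//     WriteBatch batch;
//     Status s = WriteBatchInternal::SetContents(&batch, record);
//     if (!s.ok()) break;  // corrupt record payload
//     // Replay into the memtables; advances *next_sequence.
//     s = InsertLogRecordToMemtable(&batch, wal_number, next_sequence,
//                                   &has_valid_writes);
//     if (!s.ok()) break;
//   }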
1361 | | |
1362 | 48.1k | void DBImpl::SetupLogFileProcessing(uint64_t wal_number) { |
1363 | | // The previous incarnation may not have written any MANIFEST |
1364 | | // records after allocating this log number. So we manually |
1365 | | // update the file number allocation counter in VersionSet. |
1366 | 48.1k | versions_->MarkFileNumberUsed(wal_number); |
1367 | | |
1368 | 48.1k | ROCKS_LOG_INFO(immutable_db_options_.info_log, |
1369 | 48.1k | "Recovering log #%" PRIu64 " mode %d", wal_number, |
1370 | 48.1k | static_cast<int>(immutable_db_options_.wal_recovery_mode)); |
1371 | 48.1k | } |
1372 | | |
1373 | | Status DBImpl::InitializeLogReader( |
1374 | | uint64_t wal_number, bool is_retry, std::string& fname, |
1375 | | bool stop_replay_for_corruption, uint64_t min_wal_number, |
1376 | | const PredecessorWALInfo& predecessor_wal_info, bool* const old_log_record, |
1377 | | Status* const reporter_status, DBOpenLogRecordReadReporter* reporter, |
1378 | 48.1k | std::unique_ptr<log::Reader>& reader) { |
1379 | 48.1k | assert(old_log_record); |
1380 | 48.1k | assert(reporter_status); |
1381 | 48.1k | assert(reporter); |
1382 | | |
1383 | 48.1k | Status status; |
1384 | | |
1385 | 48.1k | std::unique_ptr<SequentialFileReader> file_reader; |
1386 | 48.1k | { |
1387 | 48.1k | std::unique_ptr<FSSequentialFile> file; |
1388 | 48.1k | status = fs_->NewSequentialFile( |
1389 | 48.1k | fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr); |
1390 | 48.1k | if (!status.ok()) { |
1391 | 0 | MaybeIgnoreError(&status); |
1392 | 0 | return status; |
1393 | 0 | } |
1394 | 48.1k | file_reader.reset(new SequentialFileReader( |
1395 | 48.1k | std::move(file), fname, immutable_db_options_.log_readahead_size, |
1396 | 48.1k | io_tracer_, /*listeners=*/{}, /*rate_limiter=*/nullptr, |
1397 | 48.1k | /*verify_and_reconstruct_read=*/is_retry)); |
1398 | 48.1k | } |
1399 | | |
1400 | | // Create the log reader. |
1401 | 0 | reporter->env = env_; |
1402 | 48.1k | reporter->info_log = immutable_db_options_.info_log.get(); |
1403 | 48.1k | reporter->fname = fname.c_str(); |
1404 | 48.1k | reporter->old_log_record = old_log_record; |
1405 | 48.1k | if (!immutable_db_options_.paranoid_checks || |
1406 | 48.1k | immutable_db_options_.wal_recovery_mode == |
1407 | 48.1k | WALRecoveryMode::kSkipAnyCorruptedRecords) { |
1408 | 0 | reporter->status = nullptr; |
1409 | 48.1k | } else { |
1410 | 48.1k | reporter->status = reporter_status; |
1411 | 48.1k | } |
1412 | | // We intentionally make log::Reader do checksumming even if
1413 | | // paranoid_checks==false so that corruptions cause entire commits |
1414 | | // to be skipped instead of propagating bad information (like overly |
1415 | | // large sequence numbers). |
1416 | 48.1k | reader.reset(new log::Reader( |
1417 | 48.1k | immutable_db_options_.info_log, std::move(file_reader), reporter, |
1418 | 48.1k | true /*checksum*/, wal_number, |
1419 | 48.1k | immutable_db_options_.track_and_verify_wals, stop_replay_for_corruption, |
1420 | 48.1k | min_wal_number, predecessor_wal_info)); |
1421 | 48.1k | return status; |
1422 | 48.1k | } |
1423 | | |
1424 | | Status DBImpl::ProcessLogRecord( |
1425 | | Slice record, const std::unique_ptr<log::Reader>& reader, |
1426 | | const UnorderedMap<uint32_t, size_t>& running_ts_sz, uint64_t wal_number, |
1427 | | const std::string& fname, bool read_only, int job_id, |
1428 | | const std::function<void()>& logFileDropped, |
1429 | | DBOpenLogRecordReadReporter* reporter, uint64_t* record_checksum, |
1430 | | SequenceNumber* last_seqno_observed, SequenceNumber* next_sequence, |
1431 | | bool* stop_replay_for_corruption, Status* status, |
1432 | | bool* stop_replay_by_wal_filter, |
1433 | 1.01M | std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) { |
1434 | 1.01M | assert(reporter); |
1435 | 1.01M | assert(last_seqno_observed); |
1436 | 1.01M | assert(stop_replay_for_corruption); |
1437 | 1.01M | assert(status); |
1438 | 1.01M | assert(stop_replay_by_wal_filter); |
1439 | | |
1440 | 1.01M | Status process_status; |
1441 | 1.01M | bool has_valid_writes = false; |
1442 | 1.01M | WriteBatch batch; |
1443 | 1.01M | std::unique_ptr<WriteBatch> new_batch; |
1444 | 1.01M | WriteBatch* batch_to_use = nullptr; |
1445 | | |
1446 | 1.01M | if (record.size() < WriteBatchInternal::kHeader) { |
1447 | 0 | reporter->Corruption(record.size(), |
1448 | 0 | Status::Corruption("log record too small")); |
1449 | 0 | assert(process_status.ok()); |
1450 | 0 | return process_status; |
1451 | 0 | } |
1452 | | |
1453 | 1.01M | process_status = InitializeWriteBatchForLogRecord( |
1454 | 1.01M | record, reader, running_ts_sz, &batch, new_batch, batch_to_use, |
1455 | 1.01M | record_checksum); |
1456 | 1.01M | if (!process_status.ok()) { |
1457 | 0 | return process_status; |
1458 | 0 | } |
1459 | 1.01M | assert(batch_to_use); |
1460 | | |
1461 | 1.01M | *last_seqno_observed = WriteBatchInternal::Sequence(batch_to_use); |
1462 | | |
1463 | 1.01M | if (*last_seqno_observed > kMaxSequenceNumber) { |
1464 | 0 | reporter->Corruption( |
1465 | 0 | record.size(), |
1466 | 0 | Status::Corruption("sequence " + std::to_string(*last_seqno_observed) + |
1467 | 0 | " is too large")); |
1468 | 0 | assert(process_status.ok()); |
1469 | 0 | return process_status; |
1470 | 0 | } |
1471 | | |
1472 | 1.01M | MaybeReviseStopReplayForCorruption(*last_seqno_observed, next_sequence, |
1473 | 1.01M | stop_replay_for_corruption); |
1474 | 1.01M | if (*stop_replay_for_corruption) { |
1475 | 0 | logFileDropped(); |
1476 | 0 | assert(process_status.ok()); |
1477 | 0 | return process_status; |
1478 | 0 | } |
1479 | | |
1480 | | // For the default case of wal_filter == nullptr, this call always
1481 | | // performs a no-op and returns true.
1482 | 1.01M | if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, *reporter, *status, |
1483 | 1.01M | *stop_replay_by_wal_filter, |
1484 | 1.01M | *batch_to_use)) { |
1485 | 0 | assert(process_status.ok()); |
1486 | 0 | return process_status; |
1487 | 1.01M | } else { |
1488 | | // FIXME(hx235): Handle the potential non-okay `status` when |
1489 | | // `InvokeWalFilterIfNeededOnWalRecord()` returns true |
1490 | 1.01M | status->PermitUncheckedError(); |
1491 | 1.01M | } |
1492 | | |
1493 | 1.01M | assert(process_status.ok()); |
1494 | 1.01M | process_status = InsertLogRecordToMemtable(batch_to_use, wal_number, |
1495 | 1.01M | next_sequence, &has_valid_writes); |
1496 | 1.01M | MaybeIgnoreError(&process_status); |
1497 | | // We are treating this as a failure while reading since we read valid |
1498 | | // blocks that do not form coherent data |
1499 | 1.01M | if (!process_status.ok()) { |
1500 | | // FIXME(hx235): `reporter->Corruption()` will override the non-ok status |
1501 | | // set in `InvokeWalFilterIfNeededOnWalRecord` through passing `*status` |
1502 | 0 | reporter->Corruption(record.size(), process_status); |
1503 | 0 | process_status = Status::OK(); |
1504 | 0 | return process_status; |
1505 | 0 | } |
1506 | | |
1507 | 1.01M | process_status = MaybeWriteLevel0TableForRecovery( |
1508 | 1.01M | has_valid_writes, read_only, wal_number, job_id, next_sequence, |
1509 | 1.01M | version_edits, flushed); |
1510 | | |
1511 | 1.01M | return process_status; |
1512 | 1.01M | } |
1513 | | |
1514 | | // We create a new batch and initialize it with a valid prot_info_ to
1515 | | // store the data checksum
1516 | | Status DBImpl::InitializeWriteBatchForLogRecord( |
1517 | | Slice record, const std::unique_ptr<log::Reader>& reader, |
1518 | | const UnorderedMap<uint32_t, size_t>& running_ts_sz, WriteBatch* batch, |
1519 | | std::unique_ptr<WriteBatch>& new_batch, WriteBatch*& batch_to_use, |
1520 | 1.01M | uint64_t* record_checksum) { |
1521 | 1.01M | assert(batch); |
1522 | 1.01M | assert(record_checksum); |
1523 | | |
1524 | 1.01M | Status status = WriteBatchInternal::SetContents(batch, record); |
1525 | 1.01M | if (!status.ok()) { |
1526 | 0 | return status; |
1527 | 0 | } |
1528 | | |
1529 | 1.01M | const UnorderedMap<uint32_t, size_t>& record_ts_sz = |
1530 | 1.01M | reader->GetRecordedTimestampSize(); |
1531 | 1.01M | status = HandleWriteBatchTimestampSizeDifference( |
1532 | 1.01M | batch, running_ts_sz, record_ts_sz, |
1533 | 1.01M | TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_, |
1534 | 1.01M | batch_per_txn_, &new_batch); |
1535 | 1.01M | if (!status.ok()) { |
1536 | 0 | return status; |
1537 | 0 | } |
1538 | | |
1539 | 1.01M | bool batch_updated = new_batch != nullptr; |
1540 | 1.01M | batch_to_use = batch_updated ? new_batch.get() : batch; |
1541 | 1.01M | TEST_SYNC_POINT_CALLBACK( |
1542 | 1.01M | "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", batch_to_use); |
1543 | 1.01M | TEST_SYNC_POINT_CALLBACK( |
1544 | 1.01M | "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum", |
1545 | 1.01M | record_checksum); |
1546 | 1.01M | status = WriteBatchInternal::UpdateProtectionInfo( |
1547 | 1.01M | batch_to_use, 8 /* bytes_per_key */, |
1548 | 1.01M | batch_updated ? nullptr : record_checksum); |
1549 | | |
1550 | 1.01M | return status; |
1551 | 1.01M | } |
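// The 8 bytes_per_key passed to UpdateProtectionInfo() above attaches per-key
// integrity protection to the replayed batch, so corruption introduced
// between the WAL read and the memtable insert is detectable. A hedged sketch
// of the user-facing analogue for live writes (assuming the
// protection_bytes_per_key field on WriteOptions in recent releases):
//
//   ROCKSDB_NAMESPACE::WriteOptions wo;
//   wo.protection_bytes_per_key = 8;  // 0 disables per-key protection
//   // db->Put(wo, "key", "value");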
1552 | | |
1553 | | void DBImpl::MaybeReviseStopReplayForCorruption( |
1554 | | SequenceNumber sequence, SequenceNumber const* const next_sequence, |
1555 | 1.01M | bool* stop_replay_for_corruption) { |
1556 | 1.01M | if (immutable_db_options_.wal_recovery_mode == |
1557 | 1.01M | WALRecoveryMode::kPointInTimeRecovery) { |
1558 | 1.01M | assert(next_sequence); |
1559 | 1.01M | assert(stop_replay_for_corruption); |
1560 | | // In point-in-time recovery mode, if the sequence ids of log files are
1561 | | // consecutive, we continue recovery despite corruption. This could
1562 | | // happen when we open and write to a corrupted DB, where the sequence id
1563 | | // will start from the last sequence id we recovered.
1564 | 1.01M | if (sequence == *next_sequence) { |
1565 | 914k | *stop_replay_for_corruption = false; |
1566 | 914k | } |
1567 | 1.01M | } |
1568 | 1.01M | } |
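// A minimal sketch of opting into the point-in-time recovery mode that this
// function reasons about; the DB path is hypothetical:

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void OpenWithPointInTimeRecovery() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  // Replay WALs up to the first corruption and stop there; replay resumes
  // past the corruption point only if sequence numbers stay consecutive
  // (see MaybeReviseStopReplayForCorruption above).
  options.wal_recovery_mode =
      ROCKSDB_NAMESPACE::WALRecoveryMode::kPointInTimeRecovery;
  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/pitr_example_db", &db);
  if (!s.ok()) {
    // Recovery stopped at a corruption it could not reconcile.
  }
}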
1569 | | |
1570 | | Status DBImpl::InsertLogRecordToMemtable(WriteBatch* batch_to_use, |
1571 | | uint64_t wal_number, |
1572 | | SequenceNumber* next_sequence, |
1573 | 1.01M | bool* has_valid_writes) { |
1574 | | // If a column family was not found, it might mean that the WAL write
1575 | | // batch references a column family that was dropped after the
1576 | | // insert. We don't want to fail the whole write batch in that case --
1577 | | // we just ignore the update.
1578 | | // That's why we set ignore_missing_column_families to true.
1579 | 1.01M | assert(batch_to_use); |
1580 | 1.01M | assert(has_valid_writes); |
1581 | 1.01M | Status status = WriteBatchInternal::InsertInto( |
1582 | 1.01M | batch_to_use, column_family_memtables_.get(), &flush_scheduler_, |
1583 | 1.01M | &trim_history_scheduler_, true, wal_number, this, |
1584 | 1.01M | false /* concurrent_memtable_writes */, next_sequence, has_valid_writes, |
1585 | 1.01M | seq_per_batch_, batch_per_txn_); |
1586 | 1.01M | return status; |
1587 | 1.01M | } |
1588 | | |
1589 | | Status DBImpl::MaybeWriteLevel0TableForRecovery( |
1590 | | bool has_valid_writes, bool read_only, uint64_t wal_number, int job_id, |
1591 | | SequenceNumber const* const next_sequence, |
1592 | 1.01M | std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) { |
1593 | 1.01M | assert(next_sequence); |
1594 | 1.01M | assert(version_edits); |
1595 | 1.01M | assert(flushed); |
1596 | | |
1597 | 1.01M | Status status; |
1598 | 1.01M | if (has_valid_writes && !read_only) { |
1599 | | // we can do this because this is called before the client has access to
1600 | | // the DB and there is only a single thread operating on the DB
1601 | 997k | ColumnFamilyData* cfd; |
1602 | | |
1603 | 997k | while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { |
1604 | 0 | cfd->UnrefAndTryDelete(); |
1605 | | // If this asserts, it means that InsertInto failed in |
1606 | | // filtering updates to already-flushed column families |
1607 | 0 | assert(cfd->GetLogNumber() <= wal_number); |
1608 | 0 | (void)wal_number; |
1609 | 0 | auto iter = version_edits->find(cfd->GetID()); |
1610 | 0 | assert(iter != version_edits->end()); |
1611 | 0 | VersionEdit* edit = &iter->second; |
1612 | 0 | status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); |
1613 | 0 | if (!status.ok()) { |
1614 | | // Reflect errors immediately so that conditions like full |
1615 | | // file-systems cause the DB::Open() to fail. |
1616 | 0 | return status; |
1617 | 0 | } |
1618 | 0 | *flushed = true; |
1619 | |
1620 | 0 | cfd->CreateNewMemtable(*next_sequence - 1); |
1621 | 0 | } |
1622 | 997k | } |
1623 | 1.01M | return status; |
1624 | 1.01M | } |
1625 | | |
1626 | | Status DBImpl::HandleNonOkStatusOrOldLogRecord( |
1627 | | uint64_t wal_number, SequenceNumber const* const next_sequence, |
1628 | | Status status, const DBOpenLogRecordReadReporter& reporter, |
1629 | | bool* old_log_record, bool* stop_replay_for_corruption, |
1630 | 0 | uint64_t* corrupted_wal_number, bool* corrupted_wal_found) { |
1631 | 0 | assert(!status.ok() || *old_log_record); |
1632 | |
1633 | 0 | assert(next_sequence); |
1634 | 0 | assert(old_log_record); |
1635 | 0 | assert(stop_replay_for_corruption); |
1636 | 0 | assert(corrupted_wal_number); |
1637 | |
1638 | 0 | if (status.IsNotSupported()) { |
1639 | | // We should not treat NotSupported as corruption. It is rather a clear |
1640 | | // sign that we are processing a WAL that is produced by an incompatible |
1641 | | // version of the code. |
1642 | 0 | return status; |
1643 | 0 | } |
1644 | | |
1645 | 0 | if (immutable_db_options_.wal_recovery_mode == |
1646 | 0 | WALRecoveryMode::kSkipAnyCorruptedRecords) { |
1647 | | // We should ignore all errors unconditionally |
1648 | 0 | return Status::OK(); |
1649 | 0 | } else if (immutable_db_options_.wal_recovery_mode == |
1650 | 0 | WALRecoveryMode::kPointInTimeRecovery) { |
1651 | 0 | if (status.IsIOError()) { |
1652 | 0 | ROCKS_LOG_ERROR(immutable_db_options_.info_log, |
1653 | 0 | "IOError during point-in-time reading log #%" PRIu64 |
1654 | 0 | " seq #%" PRIu64 |
1655 | 0 | ". %s. This likely means loss of synced WAL, "
1656 | 0 | "thus recovery fails.", |
1657 | 0 | wal_number, *next_sequence, status.ToString().c_str()); |
1658 | 0 | return status; |
1659 | 0 | } |
1660 | | // We should ignore the error but not continue replaying |
1661 | 0 | *old_log_record = false; |
1662 | 0 | *stop_replay_for_corruption = true; |
1663 | | // TODO(hx235): have a single source of corrupted WAL number once we |
1664 | | // consolidate the statuses |
1665 | 0 | uint64_t reporter_corrupted_wal_number = reporter.GetCorruptedLogNumber(); |
1666 | 0 | *corrupted_wal_number = reporter_corrupted_wal_number != kMaxSequenceNumber |
1667 | 0 | ? reporter_corrupted_wal_number |
1668 | 0 | : wal_number; |
1669 | 0 | if (corrupted_wal_found != nullptr) { |
1670 | 0 | *corrupted_wal_found = true; |
1671 | 0 | } |
1672 | 0 | return Status::OK(); |
1673 | 0 | } else { |
1674 | 0 | assert(immutable_db_options_.wal_recovery_mode == |
1675 | 0 | WALRecoveryMode::kTolerateCorruptedTailRecords || |
1676 | 0 | immutable_db_options_.wal_recovery_mode == |
1677 | 0 | WALRecoveryMode::kAbsoluteConsistency); |
1678 | 0 | return status; |
1679 | 0 | } |
1680 | 0 | } |
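// A restatement of the branches above, mapping each WALRecoveryMode to its
// outcome when a non-ok status or old log record is seen (no new behavior):
//
//   kSkipAnyCorruptedRecords      -> swallow the error, keep replaying
//   kPointInTimeRecovery          -> stop replay at the corruption point and
//                                    open with the data up to it; IOError
//                                    still fails Open() (likely a lost
//                                    synced WAL)
//   kTolerateCorruptedTailRecords -> propagate the error, Open() fails
//   kAbsoluteConsistency          -> propagate the error, Open() fails
//   (NotSupported is always propagated: the WAL came from an incompatible
//   version of the code.)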
1681 | | |
1682 | | Status DBImpl::UpdatePredecessorWALInfo( |
1683 | | uint64_t wal_number, const SequenceNumber last_seqno_observed, |
1684 | 48.1k | const std::string& fname, PredecessorWALInfo& predecessor_wal_info) { |
1685 | 48.1k | uint64_t bytes; |
1686 | | |
1687 | 48.1k | Status s = env_->GetFileSize(fname, &bytes); |
1688 | 48.1k | if (!s.ok()) { |
1689 | 0 | return s; |
1690 | 0 | } |
1691 | | |
1692 | 48.1k | SequenceNumber mock_seqno = kMaxSequenceNumber; |
1693 | 48.1k | [[maybe_unused]] std::pair<uint64_t, SequenceNumber*> pair = |
1694 | 48.1k | std::make_pair(wal_number, &mock_seqno); |
1695 | 48.1k | TEST_SYNC_POINT_CALLBACK("DBImpl::UpdatePredecessorWALInfo", &pair); |
1696 | 48.1k | predecessor_wal_info = PredecessorWALInfo( |
1697 | 48.1k | wal_number, bytes, |
1698 | 48.1k | mock_seqno != kMaxSequenceNumber ? mock_seqno : last_seqno_observed); |
1699 | | |
1700 | 48.1k | return s; |
1701 | 48.1k | } |
1702 | | |
1703 | | void DBImpl::FinishLogFileProcessing(const Status& status, |
1704 | 48.1k | const SequenceNumber* next_sequence) { |
1705 | 48.1k | if (status.ok()) { |
1706 | 48.1k | assert(next_sequence); |
1707 | 48.1k | flush_scheduler_.Clear(); |
1708 | 48.1k | trim_history_scheduler_.Clear(); |
1709 | 48.1k | auto last_sequence = *next_sequence - 1; |
1710 | 48.1k | if ((*next_sequence != kMaxSequenceNumber) && |
1711 | 48.1k | (versions_->LastSequence() <= last_sequence)) { |
1712 | 36.1k | versions_->SetLastAllocatedSequence(last_sequence); |
1713 | 36.1k | versions_->SetLastPublishedSequence(last_sequence); |
1714 | 36.1k | versions_->SetLastSequence(last_sequence); |
1715 | 36.1k | } |
1716 | 48.1k | } |
1717 | 48.1k | } |
1718 | | |
1719 | | Status DBImpl::MaybeHandleStopReplayForCorruptionForInconsistency( |
1720 | 48.1k | bool stop_replay_for_corruption, uint64_t corrupted_wal_number) { |
1721 | 48.1k | Status status; |
1722 | | |
1723 | | // Compare the corrupted log number to each column family's current log
1724 | | // number. Abort Open() if any column family's log number is greater than
1725 | | // the corrupted log number, which means the CF contains data beyond the
1726 | | // point of corruption. This could happen during PIT recovery when the WAL
1727 | | // is corrupted and some (but not all) CFs are flushed.
1728 | | // Exclude the PIT case where no log is dropped after the corruption point.
1729 | | // This is to cover the case of empty WALs after the corrupted log, in which
1730 | | // we don't reset stop_replay_for_corruption.
1731 | 48.1k | if (stop_replay_for_corruption == true && |
1732 | 48.1k | (immutable_db_options_.wal_recovery_mode == |
1733 | 0 | WALRecoveryMode::kPointInTimeRecovery || |
1734 | 0 | immutable_db_options_.wal_recovery_mode == |
1735 | 0 | WALRecoveryMode::kTolerateCorruptedTailRecords)) { |
1736 | 0 | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1737 | | // One special case causes cfd->GetLogNumber() > corrupted_wal_number while
1738 | | // the CF is still consistent: if a new column family is created during
1739 | | // the flush and the WAL sync fails at the same time, the new CF points to
1740 | | // the new WAL but the old WAL is corrupted. Since the new CF is empty, it
1741 | | // is still consistent. We add the check of the CF's SST file size to avoid
1742 | | // a false positive alert.
1743 | |
1744 | | // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to
1745 | | // missing a very rare inconsistency case caused by data cancellation: a CF
1746 | | // is empty due to KV deletion, but those operations are in the WAL. If the
1747 | | // WAL is corrupted, the status of this CF might not be consistent with
1748 | | // others. However, the consistency check will be bypassed because the CF
1749 | | // is empty.
1750 | | // TODO: a better and complete implementation is needed to ensure a strict
1751 | | // consistency check in WAL recovery, including handling the tailing
1752 | | // issues.
1753 | 0 | if (cfd->GetLogNumber() > corrupted_wal_number && |
1754 | 0 | cfd->GetLiveSstFilesSize() > 0) { |
1755 | 0 | ROCKS_LOG_ERROR(immutable_db_options_.info_log, |
1756 | 0 | "Column family inconsistency: SST file contains data" |
1757 | 0 | " beyond the point of corruption."); |
1758 | 0 | status = Status::Corruption("SST file is ahead of WALs in CF " + |
1759 | 0 | cfd->GetName()); |
1760 | 0 | return status; |
1761 | 0 | } |
1762 | 0 | } |
1763 | 0 | } |
1764 | 48.1k | return status; |
1765 | 48.1k | } |
1766 | | |
1767 | | Status DBImpl::MaybeFlushFinalMemtableOrRestoreActiveLogFiles( |
1768 | | const std::vector<uint64_t>& wal_numbers, bool read_only, int job_id, |
1769 | | bool flushed, std::unordered_map<int, VersionEdit>* version_edits, |
1770 | 48.1k | RecoveryContext* recovery_ctx) { |
1771 | 48.1k | assert(version_edits); |
1772 | | |
1773 | 48.1k | Status status; |
1774 | | // True if there's any data in the WALs; if not, we can skip re-processing |
1775 | | // them later |
1776 | 48.1k | bool data_seen = false; |
1777 | 48.1k | if (!read_only) { |
1778 | | // no need to refcount since client still doesn't have access |
1779 | | // to the DB and can not drop column families while we iterate |
1780 | 48.1k | const WalNumber max_wal_number = wal_numbers.back(); |
1781 | 70.1k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1782 | 70.1k | auto iter = version_edits->find(cfd->GetID()); |
1783 | 70.1k | assert(iter != version_edits->end()); |
1784 | 70.1k | VersionEdit* edit = &iter->second; |
1785 | | |
1786 | 70.1k | if (cfd->GetLogNumber() > max_wal_number) { |
1787 | | // Column family cfd has already flushed the data |
1788 | | // from all wals. Memtable has to be empty because |
1789 | | // we filter the updates based on wal_number |
1790 | | // (in WriteBatch::InsertInto) |
1791 | 0 | assert(cfd->mem()->GetFirstSequenceNumber() == 0); |
1792 | 0 | assert(edit->NumEntries() == 0); |
1793 | 0 | continue; |
1794 | 0 | } |
1795 | | |
1796 | 70.1k | TEST_SYNC_POINT_CALLBACK( |
1797 | 70.1k | "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr); |
1798 | | |
1799 | | // flush the final memtable (if non-empty) |
1800 | 70.1k | if (cfd->mem()->GetFirstSequenceNumber() != 0) { |
1801 | | // If a flush happened in the middle of recovery (e.g. due to the memtable
1802 | | // being full), we flush at the end. Otherwise we'd need to record
1803 | | // where we were at the last flush, which makes the logic complicated.
1804 | 20.0k | if (flushed || !immutable_db_options_.avoid_flush_during_recovery) { |
1805 | 20.0k | status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); |
1806 | 20.0k | if (!status.ok()) { |
1807 | | // Recovery failed |
1808 | 0 | break; |
1809 | 0 | } |
1810 | 20.0k | flushed = true; |
1811 | | |
1812 | 20.0k | cfd->CreateNewMemtable(versions_->LastSequence()); |
1813 | 20.0k | } |
1814 | 20.0k | data_seen = true; |
1815 | 20.0k | } |
1816 | | |
1817 | | // Update the log number info in the version edit corresponding to this |
1818 | | // column family. Note that the version edits will be written to MANIFEST |
1819 | | // together later. |
1820 | | // Writing wal_number in the manifest means that any log file
1821 | | // with a number strictly less than (wal_number + 1) is already
1822 | | // recovered and should be ignored on the next reincarnation.
1823 | | // Since we already recovered max_wal_number, we want all WALs
1824 | | // with numbers `<= max_wal_number` (including this one) to be ignored.
1825 | 70.1k | if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { |
1826 | 70.1k | edit->SetLogNumber(max_wal_number + 1); |
1827 | 70.1k | } |
1828 | 70.1k | } |
1829 | 48.1k | if (status.ok()) { |
1830 | | // We must mark the next log number as used, even though it's
1831 | | // not actually used. That is because VersionSet assumes
1832 | | // VersionSet::next_file_number_ to always be strictly greater than any
1833 | | // log number
1834 | 48.1k | versions_->MarkFileNumberUsed(max_wal_number + 1); |
1835 | 48.1k | assert(recovery_ctx != nullptr); |
1836 | | |
1837 | 70.1k | for (auto* cfd : *versions_->GetColumnFamilySet()) { |
1838 | 70.1k | auto iter = version_edits->find(cfd->GetID()); |
1839 | 70.1k | assert(iter != version_edits->end()); |
1840 | 70.1k | recovery_ctx->UpdateVersionEdits(cfd, iter->second); |
1841 | 70.1k | } |
1842 | | |
1843 | 48.1k | if (flushed || !data_seen) { |
1844 | 48.1k | VersionEdit wal_deletion; |
1845 | 48.1k | if (immutable_db_options_.track_and_verify_wals_in_manifest) { |
1846 | 0 | wal_deletion.DeleteWalsBefore(max_wal_number + 1); |
1847 | 0 | } |
1848 | 48.1k | if (!allow_2pc()) { |
1849 | | // In non-2pc mode, flushing the memtables of the column families |
1850 | | // means we can advance min_log_number_to_keep. |
1851 | 48.1k | wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1); |
1852 | 48.1k | } |
1853 | 48.1k | assert(versions_->GetColumnFamilySet() != nullptr); |
1854 | 48.1k | recovery_ctx->UpdateVersionEdits( |
1855 | 48.1k | versions_->GetColumnFamilySet()->GetDefault(), wal_deletion); |
1856 | 48.1k | } |
1857 | 48.1k | } |
1858 | 48.1k | } |
1859 | | |
1860 | 48.1k | if (status.ok()) { |
1861 | 48.1k | if (data_seen && !flushed) { |
1862 | 0 | status = RestoreAliveLogFiles(wal_numbers); |
1863 | 48.1k | } else if (!wal_numbers.empty()) { // If there's no data in the WAL, or we |
1864 | | // flushed all the data, still |
1865 | | // truncate the log file. If the process goes into a crash loop before |
1866 | | // the file is deleted, the preallocated space will never get freed. |
1867 | 48.1k | const bool truncate = !read_only; |
1868 | 48.1k | GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr) |
1869 | 48.1k | .PermitUncheckedError(); |
1870 | 48.1k | } |
1871 | 48.1k | } |
1872 | 48.1k | return status; |
1873 | 48.1k | } |
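// A minimal sketch of the option that steers the flush-vs-restore decision
// above; the DB path is hypothetical:

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void OpenWithoutRecoveryFlush() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  // Keep recovered data in the memtables instead of writing level-0 tables;
  // the recovered WALs then stay alive (see RestoreAliveLogFiles below).
  options.avoid_flush_during_recovery = true;
  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/no_flush_example_db", &db);
  // Handle s.
}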
1874 | | |
1875 | | Status DBImpl::CheckSeqnoNotSetBackDuringRecovery( |
1876 | 1.06M | SequenceNumber prev_next_seqno, SequenceNumber current_next_seqno) { |
1877 | 1.06M | if (prev_next_seqno == kMaxSequenceNumber || |
1878 | 1.06M | prev_next_seqno <= current_next_seqno) { |
1879 | 1.06M | return Status::OK(); |
1880 | 1.06M | } |
1881 | 0 | std::string msg = |
1882 | 0 | "Sequence number is being set backwards during recovery; this is likely "
1883 | 0 | "a software bug or data corruption. Prev next seqno: " +
1884 | 0 | std::to_string(prev_next_seqno) +
1885 | 0 | ", current next seqno: " + std::to_string(current_next_seqno);
1886 | 0 | return Status::Corruption(msg); |
1887 | 1.06M | } |
1888 | | |
1889 | 48.1k | void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) { |
1890 | 48.1k | event_logger_.Log() << "job" << job_id << "event" |
1891 | 48.1k | << (status.ok() ? "recovery_finished" : "recovery_failed") |
1892 | 48.1k | << "status" << status.ToString(); |
1893 | 48.1k | } |
1894 | | |
1895 | | Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, |
1896 | 48.1k | WalFileNumberSize* log_ptr) { |
1897 | 48.1k | WalFileNumberSize log(wal_number); |
1898 | 48.1k | std::string fname = |
1899 | 48.1k | LogFileName(immutable_db_options_.GetWalDir(), wal_number); |
1900 | 48.1k | Status s; |
1901 | | // This gets the apparent size of the WALs, not including preallocated space.
1902 | 48.1k | s = env_->GetFileSize(fname, &log.size); |
1903 | 48.1k | TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s); |
1904 | 48.1k | if (s.ok() && truncate) { |
1905 | 48.1k | std::unique_ptr<FSWritableFile> last_log; |
1906 | 48.1k | Status truncate_status = fs_->ReopenWritableFile( |
1907 | 48.1k | fname, |
1908 | 48.1k | fs_->OptimizeForLogWrite( |
1909 | 48.1k | file_options_, |
1910 | 48.1k | BuildDBOptions(immutable_db_options_, mutable_db_options_)), |
1911 | 48.1k | &last_log, nullptr); |
1912 | 48.1k | if (truncate_status.ok()) { |
1913 | 48.1k | truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); |
1914 | 48.1k | } |
1915 | 48.1k | if (truncate_status.ok()) { |
1916 | 48.1k | truncate_status = last_log->Close(IOOptions(), nullptr); |
1917 | 48.1k | } |
1918 | | // Not a critical error if fail to truncate. |
1919 | 48.1k | if (!truncate_status.ok() && !truncate_status.IsNotSupported()) { |
1920 | 0 | ROCKS_LOG_WARN(immutable_db_options_.info_log, |
1921 | 0 | "Failed to truncate log #%" PRIu64 ": %s", wal_number, |
1922 | 0 | truncate_status.ToString().c_str()); |
1923 | 0 | } |
1924 | 48.1k | } |
1925 | 48.1k | if (log_ptr) { |
1926 | 0 | *log_ptr = log; |
1927 | 0 | } |
1928 | 48.1k | return s; |
1929 | 48.1k | } |
1930 | | |
1931 | 0 | Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) { |
1932 | 0 | if (wal_numbers.empty()) { |
1933 | 0 | return Status::OK(); |
1934 | 0 | } |
1935 | 0 | Status s; |
1936 | 0 | mutex_.AssertHeld(); |
1937 | 0 | assert(immutable_db_options_.avoid_flush_during_recovery); |
1938 | | // Mark these as alive so they'll be considered for deletion later by |
1939 | | // FindObsoleteFiles() |
1940 | 0 | wals_total_size_.StoreRelaxed(0); |
1941 | 0 | wal_empty_ = false; |
1942 | 0 | uint64_t min_wal_with_unflushed_data = |
1943 | 0 | versions_->MinLogNumberWithUnflushedData(); |
1944 | 0 | for (auto wal_number : wal_numbers) { |
1945 | 0 | if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) { |
1946 | | // In non-2pc mode, the WAL files not backing unflushed data are not |
1947 | | // alive, thus should not be added to the alive_wal_files_. |
1948 | 0 | continue; |
1949 | 0 | } |
1950 | | // We preallocate space for WALs, but after a crash and restart that
1951 | | // preallocated space is not needed anymore. It is likely only the last
1952 | | // log has such preallocated space, so we only truncate for the last log.
1953 | 0 | WalFileNumberSize log; |
1954 | 0 | s = GetLogSizeAndMaybeTruncate( |
1955 | 0 | wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); |
1956 | 0 | if (!s.ok()) { |
1957 | 0 | break; |
1958 | 0 | } |
1959 | 0 | wals_total_size_.FetchAddRelaxed(log.size); |
1960 | 0 | alive_wal_files_.push_back(log); |
1961 | 0 | } |
1962 | 0 | return s; |
1963 | 0 | } |
1964 | | |
1965 | | Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, |
1966 | 20.0k | MemTable* mem, VersionEdit* edit) { |
1967 | 20.0k | mutex_.AssertHeld(); |
1968 | 20.0k | assert(cfd); |
1969 | 20.0k | assert(cfd->imm()); |
1970 | | // The immutable memtable list must be empty. |
1971 | 20.0k | assert(std::numeric_limits<uint64_t>::max() == |
1972 | 20.0k | cfd->imm()->GetEarliestMemTableID()); |
1973 | | |
1974 | 20.0k | const uint64_t start_micros = immutable_db_options_.clock->NowMicros(); |
1975 | | |
1976 | 20.0k | FileMetaData meta; |
1977 | 20.0k | std::vector<BlobFileAddition> blob_file_additions; |
1978 | | |
1979 | 20.0k | std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem( |
1980 | 20.0k | new std::list<uint64_t>::iterator( |
1981 | 20.0k | CaptureCurrentFileNumberInPendingOutputs())); |
1982 | 20.0k | meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); |
1983 | 20.0k | ReadOptions ro; |
1984 | 20.0k | ro.total_order_seek = true; |
1985 | 20.0k | ro.io_activity = Env::IOActivity::kDBOpen; |
1986 | 20.0k | Arena arena; |
1987 | 20.0k | Status s; |
1988 | 20.0k | TableProperties table_properties; |
1989 | 20.0k | const auto* ucmp = cfd->internal_comparator().user_comparator(); |
1990 | 20.0k | assert(ucmp); |
1991 | 20.0k | const size_t ts_sz = ucmp->timestamp_size(); |
1992 | 20.0k | const bool logical_strip_timestamp = |
1993 | 20.0k | ts_sz > 0 && !cfd->ioptions().persist_user_defined_timestamps; |
1994 | | // Note that here we treat flush as level 0 compaction in internal stats |
1995 | 20.0k | InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, |
1996 | 20.0k | 1 /* count */); |
1997 | 20.0k | { |
1998 | 20.0k | ScopedArenaPtr<InternalIterator> iter( |
1999 | 20.0k | logical_strip_timestamp |
2000 | 20.0k | ? mem->NewTimestampStrippingIterator( |
2001 | 0 | ro, /*seqno_to_time_mapping=*/nullptr, &arena, |
2002 | 0 | /*prefix_extractor=*/nullptr, ts_sz) |
2003 | 20.0k | : mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, |
2004 | 20.0k | /*prefix_extractor=*/nullptr, |
2005 | 20.0k | /*for_flush=*/true)); |
2006 | 20.0k | ROCKS_LOG_DEBUG(immutable_db_options_.info_log, |
2007 | 20.0k | "[%s] [WriteLevel0TableForRecovery]" |
2008 | 20.0k | " Level-0 table #%" PRIu64 ": started", |
2009 | 20.0k | cfd->GetName().c_str(), meta.fd.GetNumber()); |
2010 | | |
2011 | | // Get the latest mutable cf options while the mutex is still locked |
2012 | 20.0k | const MutableCFOptions mutable_cf_options_copy = |
2013 | 20.0k | cfd->GetLatestMutableCFOptions(); |
2014 | 20.0k | bool paranoid_file_checks = |
2015 | 20.0k | cfd->GetLatestMutableCFOptions().paranoid_file_checks; |
2016 | | |
2017 | 20.0k | int64_t _current_time = 0; |
2018 | 20.0k | immutable_db_options_.clock->GetCurrentTime(&_current_time) |
2019 | 20.0k | .PermitUncheckedError(); // ignore error |
2020 | 20.0k | const uint64_t current_time = static_cast<uint64_t>(_current_time); |
2021 | 20.0k | meta.oldest_ancester_time = current_time; |
2022 | 20.0k | meta.epoch_number = cfd->NewEpochNumber(); |
2023 | 20.0k | { |
2024 | 20.0k | auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint( |
2025 | 20.0k | /*level=*/0, |
2026 | 20.0k | immutable_db_options_.calculate_sst_write_lifetime_hint_set); |
2027 | 20.0k | mutex_.Unlock(); |
2028 | | |
2029 | 20.0k | SequenceNumber earliest_write_conflict_snapshot; |
2030 | 20.0k | std::vector<SequenceNumber> snapshot_seqs = |
2031 | 20.0k | snapshots_.GetAll(&earliest_write_conflict_snapshot); |
2032 | 20.0k | SequenceNumber earliest_snapshot = |
2033 | 20.0k | (snapshot_seqs.empty() ? kMaxSequenceNumber : snapshot_seqs.at(0)); |
2034 | 20.0k | auto snapshot_checker = snapshot_checker_.get(); |
2035 | 20.0k | if (use_custom_gc_ && snapshot_checker == nullptr) { |
2036 | 0 | snapshot_checker = DisableGCSnapshotChecker::Instance(); |
2037 | 0 | } |
2038 | 20.0k | std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> |
2039 | 20.0k | range_del_iters; |
2040 | 20.0k | auto range_del_iter = |
2041 | 20.0k | logical_strip_timestamp |
2042 | 20.0k | ? mem->NewTimestampStrippingRangeTombstoneIterator( |
2043 | 0 | ro, kMaxSequenceNumber, ts_sz) |
2044 | | // This is called during recovery, where a live memtable is |
2045 | | // flushed directly. In this case, no fragmented tombstone list is |
2046 | | // cached in this memtable yet. |
2047 | 20.0k | : mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber, |
2048 | 20.0k | false /* immutable_memtable */); |
2049 | 20.0k | if (range_del_iter != nullptr) { |
2050 | 3.72k | range_del_iters.emplace_back(range_del_iter); |
2051 | 3.72k | } |
2052 | | |
2053 | 20.0k | IOStatus io_s; |
2054 | 20.0k | const ReadOptions read_option(Env::IOActivity::kDBOpen); |
2055 | 20.0k | const WriteOptions write_option(Env::IO_HIGH, Env::IOActivity::kDBOpen); |
2056 | | |
2057 | 20.0k | TableBuilderOptions tboptions( |
2058 | 20.0k | cfd->ioptions(), mutable_cf_options_copy, read_option, write_option, |
2059 | 20.0k | cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(), |
2060 | 20.0k | GetCompressionFlush(cfd->ioptions(), mutable_cf_options_copy), |
2061 | 20.0k | mutable_cf_options_copy.compression_opts, cfd->GetID(), |
2062 | 20.0k | cfd->GetName(), 0 /* level */, current_time /* newest_key_time */, |
2063 | 20.0k | false /* is_bottommost */, TableFileCreationReason::kRecovery, |
2064 | 20.0k | 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_, |
2065 | 20.0k | db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(), |
2066 | 20.0k | kMaxSequenceNumber); |
2067 | 20.0k | Version* version = cfd->current(); |
2068 | 20.0k | version->Ref(); |
2069 | 20.0k | TableProperties temp_table_proerties; |
2070 | 20.0k | s = BuildTable( |
2071 | 20.0k | dbname_, versions_.get(), immutable_db_options_, tboptions, |
2072 | 20.0k | file_options_for_compaction_, cfd->table_cache(), iter.get(), |
2073 | 20.0k | std::move(range_del_iters), &meta, &blob_file_additions, |
2074 | 20.0k | snapshot_seqs, earliest_snapshot, earliest_write_conflict_snapshot, |
2075 | 20.0k | kMaxSequenceNumber, snapshot_checker, paranoid_file_checks, |
2076 | 20.0k | cfd->internal_stats(), &io_s, io_tracer_, |
2077 | 20.0k | BlobFileCreationReason::kRecovery, |
2078 | 20.0k | nullptr /* seqno_to_time_mapping */, &event_logger_, job_id, |
2079 | 20.0k | &temp_table_proerties /* table_properties */, write_hint, |
2080 | 20.0k | nullptr /*full_history_ts_low*/, &blob_callback_, version, |
2081 | 20.0k | nullptr /* memtable_payload_bytes */, |
2082 | 20.0k | nullptr /* memtable_garbage_bytes */, &flush_stats); |
2083 | 20.0k | version->Unref(); |
2084 | 20.0k | LogFlush(immutable_db_options_.info_log); |
2085 | 20.0k | ROCKS_LOG_DEBUG(immutable_db_options_.info_log, |
2086 | 20.0k | "[%s] [WriteLevel0TableForRecovery]" |
2087 | 20.0k | " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", |
2088 | 20.0k | cfd->GetName().c_str(), meta.fd.GetNumber(), |
2089 | 20.0k | meta.fd.GetFileSize(), s.ToString().c_str()); |
2090 | 20.0k | mutex_.Lock(); |
2091 | | |
2092 | | // TODO(AR) is this ok? |
2093 | 20.0k | if (!io_s.ok() && s.ok()) { |
2094 | 0 | s = io_s; |
2095 | 0 | } |
2096 | | |
2097 | 20.0k | uint64_t total_num_entries = mem->NumEntries(); |
2098 | 20.0k | if (s.ok() && total_num_entries != flush_stats.num_input_records) { |
2099 | 0 | std::string msg = "Expected " + std::to_string(total_num_entries) + |
2100 | 0 | " entries in memtable, but read " + |
2101 | 0 | std::to_string(flush_stats.num_input_records); |
2102 | 0 | ROCKS_LOG_WARN(immutable_db_options_.info_log, |
2103 | 0 | "[%s] [JOB %d] Level-0 flush during recover: %s", |
2104 | 0 | cfd->GetName().c_str(), job_id, msg.c_str()); |
2105 | 0 | if (immutable_db_options_.flush_verify_memtable_count) { |
2106 | 0 | s = Status::Corruption(msg); |
2107 | 0 | } |
2108 | 0 | } |
2109 | | // Only verify on table formats that collect table properties
2110 | 20.0k | const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions(); |
2111 | 20.0k | if (s.ok() && |
2112 | 20.0k | (mutable_cf_options.table_factory->IsInstanceOf( |
2113 | 20.0k | TableFactory::kBlockBasedTableName()) || |
2114 | 20.0k | mutable_cf_options.table_factory->IsInstanceOf( |
2115 | 0 | TableFactory::kPlainTableName())) && |
2116 | 20.0k | flush_stats.num_output_records != temp_table_proerties.num_entries) { |
2117 | 0 | std::string msg = |
2118 | 0 | "Number of keys in flush output SST files does not match " |
2119 | 0 | "number of keys added to the table. Expected " + |
2120 | 0 | std::to_string(flush_stats.num_output_records) + " but there are " + |
2121 | 0 | std::to_string(temp_table_proerties.num_entries) + |
2122 | 0 | " in output SST files"; |
2123 | 0 | ROCKS_LOG_WARN(immutable_db_options_.info_log, |
2124 | 0 | "[%s] [JOB %d] Level-0 flush during recover: %s", |
2125 | 0 | cfd->GetName().c_str(), job_id, msg.c_str()); |
2126 | 0 | if (immutable_db_options_.flush_verify_memtable_count) { |
2127 | 0 | s = Status::Corruption(msg); |
2128 | 0 | } |
2129 | 0 | } |
2130 | 20.0k | } |
2131 | 20.0k | } |
2132 | 20.0k | ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); |
2133 | | |
2134 | | // Note that if file_size is zero, the file has been deleted and |
2135 | | // should not be added to the manifest. |
2136 | 20.0k | const bool has_output = meta.fd.GetFileSize() > 0; |
2137 | | |
2138 | 20.0k | constexpr int level = 0; |
2139 | | |
2140 | 20.0k | if (s.ok() && has_output) { |
2141 | 20.0k | edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), |
2142 | 20.0k | meta.fd.GetFileSize(), meta.smallest, meta.largest, |
2143 | 20.0k | meta.fd.smallest_seqno, meta.fd.largest_seqno, |
2144 | 20.0k | meta.marked_for_compaction, meta.temperature, |
2145 | 20.0k | meta.oldest_blob_file_number, meta.oldest_ancester_time, |
2146 | 20.0k | meta.file_creation_time, meta.epoch_number, |
2147 | 20.0k | meta.file_checksum, meta.file_checksum_func_name, |
2148 | 20.0k | meta.unique_id, meta.compensated_range_deletion_size, |
2149 | 20.0k | meta.tail_size, meta.user_defined_timestamps_persisted); |
2150 | | |
2151 | 20.0k | for (const auto& blob : blob_file_additions) { |
2152 | 0 | edit->AddBlobFile(blob); |
2153 | 0 | } |
2154 | | |
2155 | | // For the UDT-in-memtable-only feature, move up the cutoff timestamp
2156 | | // whenever a flush happens.
2157 | 20.0k | if (logical_strip_timestamp) { |
2158 | 0 | Slice mem_newest_udt = mem->GetNewestUDT(); |
2159 | 0 | std::string full_history_ts_low = cfd->GetFullHistoryTsLow(); |
2160 | 0 | if (full_history_ts_low.empty() || |
2161 | 0 | ucmp->CompareTimestamp(mem_newest_udt, full_history_ts_low) >= 0) { |
2162 | 0 | std::string new_full_history_ts_low; |
2163 | 0 | GetFullHistoryTsLowFromU64CutoffTs(&mem_newest_udt, |
2164 | 0 | &new_full_history_ts_low); |
2165 | 0 | edit->SetFullHistoryTsLow(new_full_history_ts_low); |
2166 | 0 | } |
2167 | 0 | } |
2168 | 20.0k | } |
2169 | | |
2170 | 20.0k | flush_stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; |
2171 | | |
2172 | 20.0k | if (has_output) { |
2173 | 20.0k | flush_stats.bytes_written = meta.fd.GetFileSize(); |
2174 | 20.0k | flush_stats.num_output_files = 1; |
2175 | 20.0k | } |
2176 | | |
2177 | 20.0k | const auto& blobs = edit->GetBlobFileAdditions(); |
2178 | 20.0k | for (const auto& blob : blobs) { |
2179 | 0 | flush_stats.bytes_written_blob += blob.GetTotalBlobBytes(); |
2180 | 0 | } |
2181 | | |
2182 | 20.0k | flush_stats.num_output_files_blob = static_cast<int>(blobs.size()); |
2183 | | |
2184 | 20.0k | cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, |
2185 | 20.0k | flush_stats); |
2186 | 20.0k | cfd->internal_stats()->AddCFStats( |
2187 | 20.0k | InternalStats::BYTES_FLUSHED, |
2188 | 20.0k | flush_stats.bytes_written + flush_stats.bytes_written_blob); |
2189 | 20.0k | RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); |
2190 | 20.0k | return s; |
2191 | 20.0k | } |
2192 | | |
2193 | | Status DB::Open(const Options& options, const std::string& dbname, |
2194 | 36.3k | std::unique_ptr<DB>* dbptr) { |
2195 | 36.3k | DBOptions db_options(options); |
2196 | 36.3k | ColumnFamilyOptions cf_options(options); |
2197 | 36.3k | std::vector<ColumnFamilyDescriptor> column_families; |
2198 | 36.3k | column_families.emplace_back(kDefaultColumnFamilyName, cf_options); |
2199 | 36.3k | if (db_options.persist_stats_to_disk) { |
2200 | 0 | column_families.emplace_back(kPersistentStatsColumnFamilyName, cf_options); |
2201 | 0 | } |
2202 | 36.3k | std::vector<ColumnFamilyHandle*> handles; |
2203 | 36.3k | Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); |
2204 | 36.3k | if (s.ok()) { |
2205 | 36.3k | if (db_options.persist_stats_to_disk) { |
2206 | 0 | assert(handles.size() == 2); |
2207 | 36.3k | } else { |
2208 | 36.3k | assert(handles.size() == 1); |
2209 | 36.3k | } |
2210 | | // We can delete the handle since DBImpl always holds a reference to the
2211 | | // default column family
2212 | 36.3k | if (db_options.persist_stats_to_disk && handles[1] != nullptr) { |
2213 | 0 | delete handles[1]; |
2214 | 0 | } |
2215 | 36.3k | delete handles[0]; |
2216 | 36.3k | } |
2217 | 36.3k | return s; |
2218 | 36.3k | } |
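// A minimal usage sketch of this convenience overload (hypothetical path):

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int OpenExample() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/open_example_db", &db);
  if (!s.ok()) {
    return 1;
  }
  // The default column family handle is owned by the DB, as noted above.
  s = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "key", "value");
  return s.ok() ? 0 : 1;
}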
2219 | | |
2220 | | Status DB::Open(const DBOptions& db_options, const std::string& dbname, |
2221 | | const std::vector<ColumnFamilyDescriptor>& column_families, |
2222 | | std::vector<ColumnFamilyHandle*>* handles, |
2223 | 58.3k | std::unique_ptr<DB>* dbptr) { |
2224 | 58.3k | const bool kSeqPerBatch = true; |
2225 | 58.3k | const bool kBatchPerTxn = true; |
2226 | 58.3k | ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking); |
2227 | 58.3k | ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_DBOPEN); |
2228 | 58.3k | bool can_retry = false; |
2229 | 58.3k | Status s; |
2230 | 58.3k | do { |
2231 | 58.3k | s = DBImpl::Open(db_options, dbname, column_families, handles, dbptr, |
2232 | 58.3k | !kSeqPerBatch, kBatchPerTxn, can_retry, &can_retry); |
2233 | 58.3k | } while (!s.ok() && can_retry); |
2234 | 58.3k | ThreadStatusUtil::ResetThreadStatus(); |
2235 | 58.3k | return s; |
2236 | 58.3k | } |
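// A sketch of the column-family overload, mirroring what the convenience
// overload above assembles internally. Hypothetical path and CF name; a
// non-default column family must already exist in the DB (created earlier
// via CreateColumnFamily), otherwise Open() fails:

#include <memory>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void OpenWithColumnFamilies() {
  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
  column_families.emplace_back(ROCKSDB_NAMESPACE::kDefaultColumnFamilyName,
                               ROCKSDB_NAMESPACE::ColumnFamilyOptions());
  column_families.emplace_back("my_cf",
                               ROCKSDB_NAMESPACE::ColumnFamilyOptions());
  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles;
  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DB::Open(
      ROCKSDB_NAMESPACE::DBOptions(), "/tmp/cf_example_db", column_families,
      &handles, &db);
  if (s.ok()) {
    // Unlike the single-options overload, the caller owns these handles and
    // must delete them before the DB is destroyed.
    for (auto* h : handles) {
      delete h;
    }
  }
}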
2237 | | |
2238 | | // TODO: Implement the trimming in flush code path. |
2239 | | // TODO: Perform trimming before inserting into memtable during recovery. |
2240 | | // TODO: Pick files with max_timestamp > trim_ts by each file's timestamp meta |
2241 | | // info, and handle only these files to reduce io. |
2242 | | Status DB::OpenAndTrimHistory( |
2243 | | const DBOptions& db_options, const std::string& dbname, |
2244 | | const std::vector<ColumnFamilyDescriptor>& column_families, |
2245 | | std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr, |
2246 | 0 | std::string trim_ts) { |
2247 | 0 | assert(dbptr != nullptr); |
2248 | 0 | assert(handles != nullptr); |
2249 | 0 | auto validate_options = [&db_options] { |
2250 | 0 | if (db_options.avoid_flush_during_recovery) { |
2251 | 0 | return Status::InvalidArgument( |
2252 | 0 | "avoid_flush_during_recovery incompatible with " |
2253 | 0 | "OpenAndTrimHistory"); |
2254 | 0 | } |
2255 | 0 | return Status::OK(); |
2256 | 0 | }; |
2257 | 0 | auto s = validate_options(); |
2258 | 0 | if (!s.ok()) { |
2259 | 0 | return s; |
2260 | 0 | } |
2261 | | |
2262 | 0 | DB* db = nullptr; |
2263 | 0 | s = DB::Open(db_options, dbname, column_families, handles, &db); |
2264 | 0 | if (!s.ok()) { |
2265 | 0 | return s; |
2266 | 0 | } |
2267 | 0 | assert(db); |
2268 | 0 | CompactRangeOptions options; |
2269 | 0 | options.bottommost_level_compaction = |
2270 | 0 | BottommostLevelCompaction::kForceOptimized; |
2271 | 0 | auto db_impl = static_cast_with_check<DBImpl>(db); |
2272 | 0 | for (auto handle : *handles) { |
2273 | 0 | assert(handle != nullptr); |
2274 | 0 | auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle); |
2275 | 0 | auto cfd = cfh->cfd(); |
2276 | 0 | assert(cfd != nullptr); |
2277 | | // Only compact column families with timestamp enabled |
2278 | 0 | if (cfd->user_comparator() != nullptr && |
2279 | 0 | cfd->user_comparator()->timestamp_size() > 0) { |
2280 | 0 | s = db_impl->CompactRangeInternal(options, handle, nullptr, nullptr, |
2281 | 0 | trim_ts); |
2282 | 0 | if (!s.ok()) { |
2283 | 0 | break; |
2284 | 0 | } |
2285 | 0 | } |
2286 | 0 | } |
2287 | 0 | auto clean_op = [&handles, &db] { |
2288 | 0 | for (auto handle : *handles) { |
2289 | 0 | auto temp_s = db->DestroyColumnFamilyHandle(handle); |
2290 | 0 | assert(temp_s.ok()); |
2291 | 0 | } |
2292 | 0 | handles->clear(); |
2293 | 0 | delete db; |
2294 | 0 | }; |
2295 | 0 | if (!s.ok()) { |
2296 | 0 | clean_op(); |
2297 | 0 | return s; |
2298 | 0 | } |
2299 | | |
2300 | 0 | dbptr->reset(db); |
2301 | 0 | return s; |
2302 | 0 | } |
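// A hedged outline of using OpenAndTrimHistory(); it is only meaningful for
// column families whose comparator has a nonzero timestamp size:
//
//   1. Set ColumnFamilyOptions::comparator to a timestamp-aware comparator.
//   2. Leave avoid_flush_during_recovery at false (validated above).
//   3. Call DB::OpenAndTrimHistory(db_options, dbname, column_families,
//      &handles, &dbptr, trim_ts) with trim_ts encoded in the comparator's
//      timestamp format; history with timestamps above trim_ts is compacted
//      away via CompactRangeInternal as shown above.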
2303 | | |
2304 | | IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, |
2305 | | uint64_t log_file_num, uint64_t recycle_log_number, |
2306 | | size_t preallocate_block_size, |
2307 | | const PredecessorWALInfo& predecessor_wal_info, |
2308 | 60.6k | log::Writer** new_log) { |
2309 | 60.6k | IOStatus io_s; |
2310 | 60.6k | std::unique_ptr<FSWritableFile> lfile; |
2311 | | |
2312 | 60.6k | DBOptions db_options = |
2313 | 60.6k | BuildDBOptions(immutable_db_options_, mutable_db_options_); |
2314 | 60.6k | FileOptions opt_file_options = |
2315 | 60.6k | fs_->OptimizeForLogWrite(file_options_, db_options); |
2316 | 60.6k | opt_file_options.write_hint = CalculateWALWriteHint(); |
2317 | | // DB option takes precedence when not kUnknown |
2318 | 60.6k | if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) { |
2319 | 0 | opt_file_options.temperature = immutable_db_options_.wal_write_temperature; |
2320 | 0 | } |
2321 | 60.6k | std::string wal_dir = immutable_db_options_.GetWalDir(); |
2322 | 60.6k | std::string log_fname = LogFileName(wal_dir, log_file_num); |
2323 | | |
2324 | 60.6k | if (recycle_log_number) { |
2325 | 0 | ROCKS_LOG_INFO(immutable_db_options_.info_log, |
2326 | 0 | "reusing log %" PRIu64 " from recycle list\n", |
2327 | 0 | recycle_log_number); |
2328 | 0 | std::string old_log_fname = LogFileName(wal_dir, recycle_log_number); |
2329 | 0 | TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1"); |
2330 | 0 | TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2"); |
2331 | 0 | io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, |
2332 | 0 | &lfile, /*dbg=*/nullptr); |
2333 | 60.6k | } else { |
2334 | 60.6k | io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); |
2335 | 60.6k | } |
2336 | | |
2337 | 60.6k | if (io_s.ok()) { |
2338 | | // Subsequent attempts to override the hint via SetWriteLifeTimeHint |
2339 | | // with the very same value will be ignored by the fs. |
2340 | 60.6k | lfile->SetWriteLifeTimeHint(opt_file_options.write_hint); |
2341 | 60.6k | lfile->SetPreallocationBlockSize(preallocate_block_size); |
2342 | | |
2343 | 60.6k | const auto& listeners = immutable_db_options_.listeners; |
2344 | 60.6k | FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; |
2345 | 60.6k | std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( |
2346 | 60.6k | std::move(lfile), log_fname, opt_file_options, |
2347 | 60.6k | immutable_db_options_.clock, io_tracer_, nullptr /* stats */, |
2348 | 60.6k | Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, listeners, nullptr, |
2349 | 60.6k | tmp_set.Contains(FileType::kWalFile), |
2350 | 60.6k | tmp_set.Contains(FileType::kWalFile))); |
2351 | 60.6k | *new_log = new log::Writer(std::move(file_writer), log_file_num, |
2352 | 60.6k | immutable_db_options_.recycle_log_file_num > 0, |
2353 | 60.6k | immutable_db_options_.manual_wal_flush, |
2354 | 60.6k | immutable_db_options_.wal_compression, |
2355 | 60.6k | immutable_db_options_.track_and_verify_wals); |
2356 | 60.6k | io_s = (*new_log)->AddCompressionTypeRecord(write_options); |
2357 | 60.6k | if (io_s.ok()) { |
2358 | 60.6k | io_s = (*new_log)->MaybeAddPredecessorWALInfo(write_options, |
2359 | 60.6k | predecessor_wal_info); |
2360 | 60.6k | } |
2361 | 60.6k | } |
2362 | | |
2363 | 60.6k | return io_s; |
2364 | 60.6k | } |
2365 | | |
2366 | | void DBImpl::TrackExistingDataFiles( |
2367 | 58.3k | const std::vector<std::string>& existing_data_files) { |
2368 | 58.3k | TrackOrUntrackFiles(existing_data_files, /*track=*/true); |
2369 | 58.3k | } |
2370 | | |
2371 | | Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, |
2372 | | const std::vector<ColumnFamilyDescriptor>& column_families, |
2373 | | std::vector<ColumnFamilyHandle*>* handles, |
2374 | | std::unique_ptr<DB>* dbptr, const bool seq_per_batch, |
2375 | | const bool batch_per_txn, const bool is_retry, |
2376 | 58.3k | bool* can_retry) { |
2377 | 58.3k | const WriteOptions write_options(Env::IOActivity::kDBOpen); |
2378 | 58.3k | const ReadOptions read_options(Env::IOActivity::kDBOpen); |
2379 | | |
2380 | 58.3k | Status s = ValidateOptionsByTable(db_options, column_families); |
2381 | 58.3k | if (!s.ok()) { |
2382 | 0 | return s; |
2383 | 0 | } |
2384 | | |
2385 | 58.3k | s = ValidateOptions(db_options, column_families); |
2386 | 58.3k | if (!s.ok()) { |
2387 | 0 | return s; |
2388 | 0 | } |
2389 | | |
2390 | 58.3k | *dbptr = nullptr; |
2391 | 58.3k | assert(handles); |
2392 | 58.3k | handles->clear(); |
2393 | | |
2394 | 58.3k | size_t max_write_buffer_size = 0; |
2395 | 58.3k | MinAndMaxPreserveSeconds preserve_info; |
2396 | 80.3k | for (const auto& cf : column_families) { |
2397 | 80.3k | max_write_buffer_size = |
2398 | 80.3k | std::max(max_write_buffer_size, cf.options.write_buffer_size); |
2399 | 80.3k | preserve_info.Combine(cf.options); |
2400 | 80.3k | } |
2401 | | |
2402 | 58.3k | auto impl = std::make_unique<DBImpl>(db_options, dbname, seq_per_batch, |
2403 | 58.3k | batch_per_txn); |
2404 | 58.3k | if (!impl->immutable_db_options_.info_log) { |
2405 | 0 | s = impl->init_logger_creation_s_; |
2406 | 0 | return s; |
2407 | 58.3k | } else { |
2408 | 58.3k | assert(impl->init_logger_creation_s_.ok()); |
2409 | 58.3k | } |
2410 | 58.3k | s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir()); |
2411 | 58.3k | if (s.ok()) { |
2412 | 58.3k | std::vector<std::string> paths; |
2413 | 58.3k | for (auto& db_path : impl->immutable_db_options_.db_paths) { |
2414 | 58.3k | paths.emplace_back(db_path.path); |
2415 | 58.3k | } |
2416 | 80.3k | for (auto& cf : column_families) { |
2417 | 80.3k | for (auto& cf_path : cf.options.cf_paths) { |
2418 | 0 | paths.emplace_back(cf_path.path); |
2419 | 0 | } |
2420 | 80.3k | } |
2421 | 58.3k | for (const auto& path : paths) { |
2422 | 58.3k | s = impl->env_->CreateDirIfMissing(path); |
2423 | 58.3k | if (!s.ok()) { |
2424 | 0 | break; |
2425 | 0 | } |
2426 | 58.3k | } |
2427 | | |
2428 | | // For recovery from a NoSpace() error, we can only handle
2429 | | // the case where the database is stored in a single path.
2430 | 58.3k | if (paths.size() <= 1) { |
2431 | 58.3k | impl->error_handler_.EnableAutoRecovery(); |
2432 | 58.3k | } |
2433 | 58.3k | } |
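Auto recovery from NoSpace() is only armed above when all data lives under a single path. A sketch, assuming illustrative paths and target sizes, of a multi-path `db_paths` configuration that would leave auto recovery disabled:

```cpp
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

DBOptions MakeMultiPathDbOptions() {
  DBOptions db_opts;
  // Two data paths means paths.size() > 1 in Open() above, so
  // EnableAutoRecovery() is skipped and ENOSPC is not auto-recovered.
  db_opts.db_paths = {DbPath("/fast_ssd/db", /*target_size=*/64ull << 30),
                      DbPath("/big_hdd/db", /*target_size=*/1ull << 40)};
  return db_opts;
}
```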
2434 | 58.3k | if (s.ok()) { |
2435 | 58.3k | s = impl->CreateArchivalDirectory(); |
2436 | 58.3k | } |
2437 | 58.3k | if (!s.ok()) { |
2438 | 0 | return s; |
2439 | 0 | } |
2440 | | |
2441 | 58.3k | impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); |
2442 | 58.3k | RecoveryContext recovery_ctx; |
2443 | 58.3k | impl->options_mutex_.Lock(); |
2444 | 58.3k | impl->mutex_.Lock(); |
2445 | | |
2446 | | // Handles create_if_missing, error_if_exists |
2447 | 58.3k | uint64_t recovered_seq(kMaxSequenceNumber); |
2448 | 58.3k | s = impl->Recover(column_families, false /* read_only */, |
2449 | 58.3k | false /* error_if_wal_file_exists */, |
2450 | 58.3k | false /* error_if_data_exists_in_wals */, is_retry, |
2451 | 58.3k | &recovered_seq, &recovery_ctx, can_retry); |
2452 | 58.3k | if (s.ok()) { |
2453 | 58.3k | uint64_t new_log_number = impl->versions_->NewFileNumber(); |
2454 | 58.3k | log::Writer* new_log = nullptr; |
2455 | 58.3k | const size_t preallocate_block_size = |
2456 | 58.3k | impl->GetWalPreallocateBlockSize(max_write_buffer_size); |
2457 | | // TODO(hx235): Pass in the correct `predecessor_wal_info` for the first WAL
2458 | | // created during DB open when predecessor WALs remain from the previous DB
2459 | | // session due to `avoid_flush_during_recovery == true`. This would protect
2460 | | // the last recovered WAL.
2461 | 58.3k | s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/, |
2462 | 58.3k | preallocate_block_size, |
2463 | 58.3k | PredecessorWALInfo() /* predecessor_wal_info */, |
2464 | 58.3k | &new_log); |
2465 | 58.3k | if (s.ok()) { |
2466 | | // Prevent log files created by a previous instance from being recycled.
2467 | | // They might be in alive_wal_files_, and might get recycled otherwise.
2468 | 58.3k | impl->min_wal_number_to_recycle_ = new_log_number; |
2469 | 58.3k | } |
2470 | 58.3k | if (s.ok()) { |
2471 | 58.3k | InstrumentedMutexLock wl(&impl->wal_write_mutex_); |
2472 | 58.3k | impl->cur_wal_number_ = new_log_number; |
2473 | 58.3k | assert(new_log != nullptr); |
2474 | 58.3k | assert(impl->logs_.empty()); |
2475 | 58.3k | impl->logs_.emplace_back(new_log_number, new_log); |
2476 | 58.3k | } |
2477 | | |
2478 | 58.3k | if (s.ok()) { |
2479 | 58.3k | impl->alive_wal_files_.emplace_back(impl->cur_wal_number_); |
2480 | | // In WritePrepared there could be gaps in sequence numbers. This breaks
2481 | | // the trick we use in kPointInTimeRecovery, which assumes the first seq in
2482 | | // the log right after the corrupted log is one larger than the last seq
2483 | | // we read from the WALs. To keep this trick working, we add a dummy
2484 | | // entry with the expected sequence to the first log right after recovery.
2485 | | // In the non-WritePrepared case the new log after recovery could also be
2486 | | // empty, and thus missing the consecutive-seq hint needed to distinguish
2487 | | // middle-of-log corruption from a corrupted log that remained after
2488 | | // recovery. This case is also addressed by the dummy write.
2489 | 58.3k | if (recovered_seq != kMaxSequenceNumber) { |
2490 | 0 | WriteBatch empty_batch; |
2491 | 0 | WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); |
2492 | 0 | uint64_t wal_used, log_size; |
2493 | 0 | log::Writer* log_writer = impl->logs_.back().writer; |
2494 | 0 | WalFileNumberSize& wal_file_number_size = impl->alive_wal_files_.back(); |
2495 | |
2496 | 0 | assert(log_writer->get_log_number() == wal_file_number_size.number); |
2497 | 0 | impl->mutex_.AssertHeld(); |
2498 | 0 | s = impl->WriteToWAL(empty_batch, write_options, log_writer, &wal_used, |
2499 | 0 | &log_size, wal_file_number_size, recovered_seq); |
2500 | 0 | if (s.ok()) { |
2501 | | // Need to fsync; otherwise the entry might get lost after a power reset.
2502 | 0 | s = impl->FlushWAL(write_options, false); |
2503 | 0 | TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s); |
2504 | 0 | IOOptions opts; |
2505 | 0 | if (s.ok()) { |
2506 | 0 | s = WritableFileWriter::PrepareIOOptions(write_options, opts); |
2507 | 0 | } |
2508 | 0 | if (s.ok()) { |
2509 | 0 | s = log_writer->file()->Sync(opts, |
2510 | 0 | impl->immutable_db_options_.use_fsync); |
2511 | 0 | } |
2512 | 0 | } |
2513 | 0 | } |
2514 | 58.3k | } |
2515 | 58.3k | } |
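The dummy-entry trick described in the comment above reduces to an empty WriteBatch whose header carries the recovered sequence number. A sketch using the internal WriteBatchInternal helper (db/write_batch_internal.h); illustrative only, not a public API:

```cpp
#include "db/write_batch_internal.h"  // internal helper
#include "rocksdb/types.h"
#include "rocksdb/write_batch.h"

using namespace ROCKSDB_NAMESPACE;

WriteBatch MakeSeqnoHintBatch(SequenceNumber recovered_seq) {
  WriteBatch empty_batch;
  // No user keys: only the batch header (8-byte seqno + 4-byte count)
  // matters, giving the new WAL its "next expected sequence" hint.
  WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
  return empty_batch;
}
```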
2516 | 58.3k | if (s.ok()) { |
2517 | 58.3k | s = impl->LogAndApplyForRecovery(recovery_ctx); |
2518 | 58.3k | } |
2519 | | |
2520 | 58.3k | if (s.ok() && !impl->immutable_db_options_.write_identity_file) { |
2521 | | // On successful recovery, delete an obsolete IDENTITY file to avoid DB ID
2522 | | // inconsistency.
2523 | 0 | impl->env_->DeleteFile(IdentityFileName(impl->dbname_)) |
2524 | 0 | .PermitUncheckedError(); |
2525 | 0 | } |
2526 | | |
2527 | 58.3k | if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { |
2528 | 0 | impl->mutex_.AssertHeld(); |
2529 | 0 | s = impl->InitPersistStatsColumnFamily(); |
2530 | 0 | } |
2531 | | |
2532 | | // After reaching the post-recovery seqno but before creating SuperVersions,
2533 | | // ensure the seqno-to-time mapping is pre-populated as needed.
2534 | 58.3k | if (s.ok() && recovery_ctx.is_new_db_ && preserve_info.IsEnabled()) { |
2535 | 0 | impl->PrepopulateSeqnoToTimeMapping(preserve_info); |
2536 | 0 | } |
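PrepopulateSeqnoToTimeMapping() above only runs for a brand-new DB when at least one column family enables time tracking. A sketch of the two AdvancedColumnFamilyOptions fields that make `preserve_info.IsEnabled()` true; the one-day value is illustrative:

```cpp
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

ColumnFamilyOptions MakeTimeTrackingCfOptions() {
  ColumnFamilyOptions cf_opts;
  // Record seqno-to-time history so data age can be estimated for a day.
  cf_opts.preserve_internal_time_seconds = 24 * 60 * 60;
  // With tiered storage, additionally keep day-old data off the last level.
  cf_opts.preclude_last_level_data_seconds = 24 * 60 * 60;
  return cf_opts;
}
```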
2537 | | |
2538 | 58.3k | if (s.ok()) { |
2539 | | // set column family handles |
2540 | 80.3k | for (const auto& cf : column_families) { |
2541 | 80.3k | auto cfd = |
2542 | 80.3k | impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); |
2543 | 80.3k | if (cfd != nullptr) { |
2544 | 80.3k | handles->push_back( |
2545 | 80.3k | new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_)); |
2546 | 80.3k | impl->NewThreadStatusCfInfo(cfd); |
2547 | 80.3k | SuperVersionContext sv_context(/* create_superversion */ true); |
2548 | 80.3k | impl->InstallSuperVersionForConfigChange(cfd, &sv_context); |
2549 | 80.3k | sv_context.Clean(); |
2550 | 80.3k | } else { |
2551 | 0 | if (db_options.create_missing_column_families) { |
2552 | | // missing column family, create it |
2553 | 0 | ColumnFamilyHandle* handle = nullptr; |
2554 | 0 | impl->mutex_.Unlock(); |
2555 | | // NOTE: the work normally done in WrapUpCreateColumnFamilies will |
2556 | | // be done separately below. |
2557 | | // This includes InstallSuperVersionForConfigChange. |
2558 | 0 | s = impl->CreateColumnFamilyImpl(read_options, write_options, |
2559 | 0 | cf.options, cf.name, &handle); |
2560 | 0 | impl->mutex_.Lock(); |
2561 | 0 | if (s.ok()) { |
2562 | 0 | handles->push_back(handle); |
2563 | 0 | } else { |
2564 | 0 | break; |
2565 | 0 | } |
2566 | 0 | } else { |
2567 | 0 | s = Status::InvalidArgument("Column family not found", cf.name); |
2568 | 0 | break; |
2569 | 0 | } |
2570 | 0 | } |
2571 | 80.3k | } |
2572 | 58.3k | } |
2573 | | |
2574 | 58.3k | if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { |
2575 | | // Install SuperVersion for hidden column family |
2576 | 0 | assert(impl->persist_stats_cf_handle_); |
2577 | 0 | assert(impl->persist_stats_cf_handle_->cfd()); |
2578 | 0 | SuperVersionContext sv_context(/* create_superversion */ true); |
2579 | 0 | impl->InstallSuperVersionForConfigChange( |
2580 | 0 | impl->persist_stats_cf_handle_->cfd(), &sv_context); |
2581 | 0 | sv_context.Clean(); |
2582 | | // try to read format version |
2583 | 0 | s = impl->PersistentStatsProcessFormatVersion(); |
2584 | 0 | } |
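The hidden column family handled above exists only when stats persistence is enabled. A sketch of the DBOptions that trigger it; the ten-minute period is simply the default made explicit:

```cpp
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

DBOptions MakeStatsPersistingDbOptions() {
  DBOptions db_opts;
  db_opts.create_if_missing = true;
  // Store periodic statistics snapshots in a hidden column family instead
  // of in memory; InitPersistStatsColumnFamily() above sets it up.
  db_opts.persist_stats_to_disk = true;
  db_opts.stats_persist_period_sec = 600;  // one snapshot every 10 minutes
  return db_opts;
}
```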
2585 | | |
2586 | 58.3k | if (s.ok()) { |
2587 | 80.3k | for (auto cfd : *impl->versions_->GetColumnFamilySet()) { |
2588 | 80.3k | if (!cfd->mem()->IsSnapshotSupported()) { |
2589 | 0 | impl->is_snapshot_supported_ = false; |
2590 | 0 | } |
2591 | 80.3k | if (cfd->ioptions().merge_operator != nullptr && |
2592 | 80.3k | !cfd->mem()->IsMergeOperatorSupported()) { |
2593 | 0 | s = Status::InvalidArgument( |
2594 | 0 | "The memtable of column family %s does not support merge operator, "
2595 | 0 | "yet its options.merge_operator is non-null",
2596 | 0 | cfd->GetName().c_str()); |
2597 | 0 | } |
2598 | 80.3k | if (!s.ok()) { |
2599 | 0 | break; |
2600 | 0 | } |
2601 | 80.3k | } |
2602 | 58.3k | } |
2603 | 58.3k | TEST_SYNC_POINT("DBImpl::Open:Opened"); |
2604 | 58.3k | Status persist_options_status; |
2605 | 58.3k | if (s.ok()) { |
2606 | | // Persist RocksDB options before scheduling the compaction.
2607 | | // WriteOptionsFile() will release and re-acquire the mutex internally.
2608 | 58.3k | persist_options_status = |
2609 | 58.3k | impl->WriteOptionsFile(write_options, true /*db_mutex_already_held*/); |
2610 | 58.3k | impl->opened_successfully_ = true; |
2611 | 58.3k | } else { |
2612 | 0 | persist_options_status.PermitUncheckedError(); |
2613 | 0 | } |
2614 | 58.3k | impl->mutex_.Unlock(); |
2615 | | |
2616 | 58.3k | auto sfm = static_cast<SstFileManagerImpl*>( |
2617 | 58.3k | impl->immutable_db_options_.sst_file_manager.get()); |
2618 | 58.3k | if (s.ok() && sfm) { |
2619 | | // Set Statistics ptr for SstFileManager to dump the stats of |
2620 | | // DeleteScheduler. |
2621 | 58.3k | sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics); |
2622 | 58.3k | ROCKS_LOG_INFO(impl->immutable_db_options_.info_log, |
2623 | 58.3k | "SstFileManager instance %p", sfm); |
2624 | | |
2625 | 58.3k | impl->TrackExistingDataFiles(recovery_ctx.existing_data_files_); |
2626 | | |
2627 | | // Reserve some disk buffer space. This is a heuristic: when we run out
2628 | | // of disk space, it ensures that there is at least write_buffer_size
2629 | | // worth of free space before we resume DB writes. In low-disk-space
2630 | | // conditions, we want to avoid many small L0 files caused by frequent
2631 | | // WAL write failures and the resulting forced flushes.
2632 | 58.3k | sfm->ReserveDiskBuffer(max_write_buffer_size, |
2633 | 58.3k | impl->immutable_db_options_.db_paths[0].path); |
2634 | 58.3k | } |
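The SstFileManager consulted above is supplied by the user through DBOptions. A sketch of wiring one up with a rate-limited delete scheduler via the public NewSstFileManager() factory; the 64 MB/s rate is an arbitrary illustrative value:

```cpp
#include <memory>

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

using namespace ROCKSDB_NAMESPACE;

void AttachSstFileManager(DBOptions* db_opts) {
  // Rate-limit deletions so trash cleanup does not saturate the device.
  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(
      Env::Default(), /*info_log=*/nullptr, /*trash_dir=*/"",
      /*rate_bytes_per_sec=*/64 << 20));
  db_opts->sst_file_manager = sfm;
}
```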
2635 | | |
2636 | 58.3k | if (s.ok()) { |
2637 | | // When the DB is stopped, it's possible that some .trash files were not
2638 | | // deleted yet; when we open the DB we will find these .trash files and
2639 | | // schedule them for deletion (or delete them immediately if
2640 | | // SstFileManager was not used).
2641 | | // Note that we only start doing this, and the obsolete-file deletion
2642 | | // below, after `TrackExistingDataFiles` is called; otherwise
2643 | | // `max_trash_db_ratio` is ineffective and these files' deletion won't be
2644 | | // rate limited, which can cause a discard stall.
2645 | 58.3k | for (const auto& path : impl->CollectAllDBPaths()) { |
2646 | 58.3k | DeleteScheduler::CleanupDirectory(impl->immutable_db_options_.env, sfm, |
2647 | 58.3k | path) |
2648 | 58.3k | .PermitUncheckedError(); |
2649 | 58.3k | } |
2650 | 58.3k | impl->mutex_.Lock(); |
2651 | | // This will do a full scan. |
2652 | 58.3k | impl->DeleteObsoleteFiles(); |
2653 | 58.3k | TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles"); |
2654 | 58.3k | impl->MaybeScheduleFlushOrCompaction(); |
2655 | 58.3k | impl->mutex_.Unlock(); |
2656 | 58.3k | } |
2657 | | |
2658 | 58.3k | if (s.ok()) { |
2659 | 58.3k | ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p", |
2660 | 58.3k | impl.get()); |
2661 | 58.3k | LogFlush(impl->immutable_db_options_.info_log); |
2662 | 58.3k | if (!impl->WALBufferIsEmpty()) { |
2663 | 0 | s = impl->FlushWAL(write_options, false); |
2664 | 0 | if (s.ok()) { |
2665 | | // Sync is needed; otherwise buffered WAL data might get lost after a
2666 | | // power reset.
2667 | 0 | log::Writer* log_writer = impl->logs_.back().writer; |
2668 | 0 | IOOptions opts; |
2669 | 0 | s = WritableFileWriter::PrepareIOOptions(write_options, opts); |
2670 | 0 | if (s.ok()) { |
2671 | 0 | s = log_writer->file()->Sync(opts, |
2672 | 0 | impl->immutable_db_options_.use_fsync); |
2673 | 0 | } |
2674 | 0 | } |
2675 | 0 | } |
2676 | 58.3k | if (s.ok() && !persist_options_status.ok()) { |
2677 | 0 | s = Status::IOError( |
2678 | 0 | "DB::Open() failed --- Unable to persist Options file", |
2679 | 0 | persist_options_status.ToString()); |
2680 | 0 | } |
2681 | 58.3k | } |
2682 | 58.3k | if (!s.ok()) { |
2683 | 0 | ROCKS_LOG_WARN(impl->immutable_db_options_.info_log, |
2684 | 0 | "DB::Open() failed: %s", s.ToString().c_str()); |
2685 | 0 | } |
2686 | 58.3k | if (s.ok()) { |
2687 | 58.3k | s = impl->StartPeriodicTaskScheduler(); |
2688 | 58.3k | } |
2689 | 58.3k | if (s.ok()) { |
2690 | 58.3k | s = impl->RegisterRecordSeqnoTimeWorker(); |
2691 | 58.3k | } |
2692 | 58.3k | impl->options_mutex_.Unlock(); |
2693 | 58.3k | if (s.ok()) { |
2694 | 58.3k | *dbptr = std::move(impl); |
2695 | 58.3k | } else { |
2696 | 0 | for (auto* h : *handles) { |
2697 | 0 | delete h; |
2698 | 0 | } |
2699 | 0 | handles->clear(); |
2700 | 0 | } |
2701 | 58.3k | return s; |
2702 | 58.3k | } |
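For reference, a caller-side sketch of the multi-column-family open path whose implementation ends above, using the long-standing DB** overload; the path "/tmp/example_db" and CF name "cf1" are placeholders:

```cpp
#include <vector>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

Status OpenExampleDb(DB** db, std::vector<ColumnFamilyHandle*>* handles) {
  DBOptions db_opts;
  db_opts.create_if_missing = true;
  // Exercises the CreateColumnFamilyImpl() branch in Open() above when a
  // listed column family does not exist yet.
  db_opts.create_missing_column_families = true;

  // The default column family must always be listed.
  std::vector<ColumnFamilyDescriptor> cfs = {
      {kDefaultColumnFamilyName, ColumnFamilyOptions()},
      {"cf1", ColumnFamilyOptions()},
  };
  // On failure Open() deletes any handles it created and clears the vector.
  return DB::Open(db_opts, "/tmp/example_db", cfs, handles, db);
}
```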
2703 | | } // namespace ROCKSDB_NAMESPACE |