Coverage Report

Created: 2025-07-23 07:17

/src/rocksdb/db/db_impl/db_impl_open.cc
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
#include <cinttypes>
10
11
#include "db/builder.h"
12
#include "db/db_impl/db_impl.h"
13
#include "db/error_handler.h"
14
#include "db/periodic_task_scheduler.h"
15
#include "env/composite_env_wrapper.h"
16
#include "file/filename.h"
17
#include "file/read_write_util.h"
18
#include "file/sst_file_manager_impl.h"
19
#include "file/writable_file_writer.h"
20
#include "logging/logging.h"
21
#include "monitoring/persistent_stats_history.h"
22
#include "monitoring/thread_status_util.h"
23
#include "options/options_helper.h"
24
#include "rocksdb/options.h"
25
#include "rocksdb/table.h"
26
#include "rocksdb/wal_filter.h"
27
#include "test_util/sync_point.h"
28
#include "util/rate_limiter_impl.h"
29
#include "util/string_util.h"
30
#include "util/udt_util.h"
31
32
namespace ROCKSDB_NAMESPACE {
33
Options SanitizeOptions(const std::string& dbname, const Options& src,
34
10.2k
                        bool read_only, Status* logger_creation_s) {
35
10.2k
  auto db_options =
36
10.2k
      SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
37
10.2k
  ImmutableDBOptions immutable_db_options(db_options);
38
10.2k
  auto cf_options = SanitizeCfOptions(immutable_db_options, read_only,
39
10.2k
                                      ColumnFamilyOptions(src));
40
10.2k
  return Options(db_options, cf_options);
41
10.2k
}
42
43
DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
44
68.5k
                          bool read_only, Status* logger_creation_s) {
45
68.5k
  DBOptions result(src);
46
47
68.5k
  if (result.env == nullptr) {
48
0
    result.env = Env::Default();
49
0
  }
50
51
  // result.max_open_files means an "infinite" open files.
52
68.5k
  if (result.max_open_files != -1) {
53
0
    int max_max_open_files = port::GetMaxOpenFiles();
54
0
    if (max_max_open_files == -1) {
55
0
      max_max_open_files = 0x400000;
56
0
    }
57
0
    ClipToRange(&result.max_open_files, 20, max_max_open_files);
58
0
    TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
59
0
                             &result.max_open_files);
60
0
  }
61
62
68.5k
  if (result.info_log == nullptr && !read_only) {
63
68.5k
    Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
64
68.5k
    if (!s.ok()) {
65
      // No place suitable for logging
66
0
      result.info_log = nullptr;
67
0
      if (logger_creation_s) {
68
0
        *logger_creation_s = s;
69
0
      }
70
0
    }
71
68.5k
  }
72
73
68.5k
  if (!result.write_buffer_manager) {
74
68.5k
    result.write_buffer_manager.reset(
75
68.5k
        new WriteBufferManager(result.db_write_buffer_size));
76
68.5k
  }
77
68.5k
  auto bg_job_limits = DBImpl::GetBGJobLimits(
78
68.5k
      result.max_background_flushes, result.max_background_compactions,
79
68.5k
      result.max_background_jobs, true /* parallelize_compactions */);
80
68.5k
  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
81
68.5k
                                           Env::Priority::LOW);
82
68.5k
  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
83
68.5k
                                           Env::Priority::HIGH);
84
85
68.5k
  if (result.rate_limiter.get() != nullptr) {
86
0
    if (result.bytes_per_sync == 0) {
87
0
      result.bytes_per_sync = 1024 * 1024;
88
0
    }
89
0
  }
90
91
68.5k
  if (result.delayed_write_rate == 0) {
92
68.5k
    if (result.rate_limiter.get() != nullptr) {
93
0
      result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
94
0
    }
95
68.5k
    if (result.delayed_write_rate == 0) {
96
68.5k
      result.delayed_write_rate = 16 * 1024 * 1024;
97
68.5k
    }
98
68.5k
  }
99
100
68.5k
  if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
101
0
    result.recycle_log_file_num = false;
102
0
  }
103
104
68.5k
  if (result.recycle_log_file_num &&
105
68.5k
      (result.wal_recovery_mode ==
106
0
           WALRecoveryMode::kTolerateCorruptedTailRecords ||
107
0
       result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
108
    // - kTolerateCorruptedTailRecords is inconsistent with recycle log file
109
    //   feature. WAL recycling expects recovery success upon encountering a
110
    //   corrupt record at the point where new data ends and recycled data
111
    //   remains at the tail. However, `kTolerateCorruptedTailRecords` must fail
112
    //   upon encountering any such corrupt record, as it cannot differentiate
113
    //   between this and a real corruption, which would cause committed updates
114
    //   to be truncated -- a violation of the recovery guarantee.
115
    // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with
116
    //   recycle log file feature temporarily due to a bug found introducing a
117
    //   hole in the recovered data
118
    //   (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236).
119
    //   Besides this bug, we believe the features are fundamentally compatible.
120
0
    result.recycle_log_file_num = 0;
121
0
  }
122
123
68.5k
  if (result.db_paths.size() == 0) {
124
68.5k
    result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
125
68.5k
  } else if (result.wal_dir.empty()) {
126
    // Use dbname as default
127
0
    result.wal_dir = dbname;
128
0
  }
129
68.5k
  if (!result.wal_dir.empty()) {
130
    // If there is a wal_dir already set, check to see if the wal_dir is the
131
    // same as the dbname AND the same as the db_path[0] (which must exist from
132
    // a few lines ago). If the wal_dir matches both of these values, then clear
133
    // the wal_dir value, which will make wal_dir == dbname.  Most likely this
134
    // condition was the result of reading an old options file where we forced
135
    // wal_dir to be set (to dbname).
136
0
    auto npath = NormalizePath(dbname + "/");
137
0
    if (npath == NormalizePath(result.wal_dir + "/") &&
138
0
        npath == NormalizePath(result.db_paths[0].path + "/")) {
139
0
      result.wal_dir.clear();
140
0
    }
141
0
  }
142
143
68.5k
  if (!result.wal_dir.empty() && result.wal_dir.back() == '/') {
144
0
    result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
145
0
  }
146
147
  // Force flush on DB open if 2PC is enabled, since with 2PC we have no
148
  // guarantee that consecutive log files have consecutive sequence id, which
149
  // make recovery complicated.
150
68.5k
  if (result.allow_2pc) {
151
0
    result.avoid_flush_during_recovery = false;
152
0
  }
153
154
68.5k
  ImmutableDBOptions immutable_db_options(result);
155
68.5k
  if (!immutable_db_options.IsWalDirSameAsDBPath()) {
156
    // Either the WAL dir and db_paths[0]/db_name are not the same, or we
157
    // cannot tell for sure. In either case, assume they're different and
158
    // explicitly cleanup the trash log files (bypass DeleteScheduler)
159
    // Do this first so even if we end up calling
160
    // DeleteScheduler::CleanupDirectory on the same dir later, it will be
161
    // safe
162
0
    std::vector<std::string> filenames;
163
0
    IOOptions io_opts;
164
0
    io_opts.do_not_recurse = true;
165
0
    auto wal_dir = immutable_db_options.GetWalDir();
166
0
    Status s = immutable_db_options.fs->GetChildren(
167
0
        wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr);
168
0
    s.PermitUncheckedError();  //**TODO: What to do on error?
169
0
    for (std::string& filename : filenames) {
170
0
      if (filename.find(".log.trash", filename.length() -
171
0
                                          std::string(".log.trash").length()) !=
172
0
          std::string::npos) {
173
0
        std::string trash_file = wal_dir + "/" + filename;
174
0
        result.env->DeleteFile(trash_file).PermitUncheckedError();
175
0
      }
176
0
    }
177
0
  }
178
179
  // Create a default SstFileManager for purposes of tracking compaction size
180
  // and facilitating recovery from out of space errors.
181
68.5k
  if (result.sst_file_manager.get() == nullptr) {
182
68.5k
    std::shared_ptr<SstFileManager> sst_file_manager(
183
68.5k
        NewSstFileManager(result.env, result.info_log));
184
68.5k
    result.sst_file_manager = sst_file_manager;
185
68.5k
  }
186
187
  // Supported wal compression types
188
68.5k
  if (!StreamingCompressionTypeSupported(result.wal_compression)) {
189
0
    result.wal_compression = kNoCompression;
190
0
    ROCKS_LOG_WARN(result.info_log,
191
0
                   "wal_compression is disabled since only zstd is supported");
192
0
  }
193
194
68.5k
  return result;
195
68.5k
}
196
197
namespace {
198
Status ValidateOptionsByTable(
199
    const DBOptions& db_opts,
200
58.3k
    const std::vector<ColumnFamilyDescriptor>& column_families) {
201
58.3k
  Status s;
202
80.3k
  for (auto& cf : column_families) {
203
80.3k
    s = ValidateOptions(db_opts, cf.options);
204
80.3k
    if (!s.ok()) {
205
0
      return s;
206
0
    }
207
80.3k
  }
208
58.3k
  return Status::OK();
209
58.3k
}
210
}  // namespace
211
212
Status DBImpl::ValidateOptions(
213
    const DBOptions& db_options,
214
58.3k
    const std::vector<ColumnFamilyDescriptor>& column_families) {
215
58.3k
  Status s;
216
80.3k
  for (auto& cfd : column_families) {
217
80.3k
    s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
218
80.3k
    if (!s.ok()) {
219
0
      return s;
220
0
    }
221
80.3k
    if (cfd.name == kDefaultColumnFamilyName) {
222
58.3k
      if (cfd.options.disallow_memtable_writes) {
223
0
        return Status::InvalidArgument(
224
0
            "Default column family cannot use disallow_memtable_writes=true");
225
0
      }
226
58.3k
    }
227
80.3k
  }
228
58.3k
  s = ValidateOptions(db_options);
229
58.3k
  return s;
230
58.3k
}
231
232
58.3k
Status DBImpl::ValidateOptions(const DBOptions& db_options) {
233
58.3k
  if (db_options.db_paths.size() > 4) {
234
0
    return Status::NotSupported(
235
0
        "More than four DB paths are not supported yet. ");
236
0
  }
237
238
58.3k
  if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
239
    // Protect against assert in PosixMMapReadableFile constructor
240
0
    return Status::NotSupported(
241
0
        "If memory mapped reads (allow_mmap_reads) are enabled "
242
0
        "then direct I/O reads (use_direct_reads) must be disabled. ");
243
0
  }
244
245
58.3k
  if (db_options.allow_mmap_writes &&
246
58.3k
      db_options.use_direct_io_for_flush_and_compaction) {
247
0
    return Status::NotSupported(
248
0
        "If memory mapped writes (allow_mmap_writes) are enabled "
249
0
        "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
250
0
        "be disabled. ");
251
0
  }
252
253
58.3k
  if (db_options.keep_log_file_num == 0) {
254
0
    return Status::InvalidArgument("keep_log_file_num must be greater than 0");
255
0
  }
256
257
58.3k
  if (db_options.unordered_write &&
258
58.3k
      !db_options.allow_concurrent_memtable_write) {
259
0
    return Status::InvalidArgument(
260
0
        "unordered_write is incompatible with "
261
0
        "!allow_concurrent_memtable_write");
262
0
  }
263
264
58.3k
  if (db_options.unordered_write && db_options.enable_pipelined_write) {
265
0
    return Status::InvalidArgument(
266
0
        "unordered_write is incompatible with enable_pipelined_write");
267
0
  }
268
269
58.3k
  if (db_options.atomic_flush && db_options.enable_pipelined_write) {
270
0
    return Status::InvalidArgument(
271
0
        "atomic_flush is incompatible with enable_pipelined_write");
272
0
  }
273
274
58.3k
  if (db_options.use_direct_io_for_flush_and_compaction &&
275
58.3k
      0 == db_options.writable_file_max_buffer_size) {
276
0
    return Status::InvalidArgument(
277
0
        "writes in direct IO require writable_file_max_buffer_size > 0");
278
0
  }
279
280
58.3k
  if (db_options.daily_offpeak_time_utc != "") {
281
0
    int start_time, end_time;
282
0
    if (!TryParseTimeRangeString(db_options.daily_offpeak_time_utc, start_time,
283
0
                                 end_time)) {
284
0
      return Status::InvalidArgument(
285
0
          "daily_offpeak_time_utc should be set in the format HH:mm-HH:mm "
286
0
          "(e.g. 04:30-07:30)");
287
0
    } else if (start_time == end_time) {
288
0
      return Status::InvalidArgument(
289
0
          "start_time and end_time cannot be the same");
290
0
    }
291
0
  }
292
293
58.3k
  if (!db_options.write_dbid_to_manifest && !db_options.write_identity_file) {
294
0
    return Status::InvalidArgument(
295
0
        "write_dbid_to_manifest and write_identity_file cannot both be false");
296
0
  }
297
58.3k
  return Status::OK();
298
58.3k
}
299
300
10.2k
Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
301
10.2k
  VersionEdit new_db_edit;
302
10.2k
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
303
10.2k
  Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true,
304
10.2k
                       /*is_retry=*/false, &new_db_edit);
305
10.2k
  if (!s.ok()) {
306
0
    return s;
307
0
  }
308
10.2k
  new_db_edit.SetLogNumber(0);
309
10.2k
  new_db_edit.SetNextFile(2);
310
10.2k
  new_db_edit.SetLastSequence(0);
311
312
10.2k
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
313
10.2k
  const std::string manifest = DescriptorFileName(dbname_, 1);
314
10.2k
  {
315
10.2k
    if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) {
316
0
      fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
317
0
    }
318
10.2k
    std::unique_ptr<FSWritableFile> file;
319
10.2k
    FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
320
    // DB option takes precedence when not kUnknown
321
10.2k
    if (immutable_db_options_.metadata_write_temperature !=
322
10.2k
        Temperature::kUnknown) {
323
0
      file_options.temperature =
324
0
          immutable_db_options_.metadata_write_temperature;
325
0
    }
326
10.2k
    s = NewWritableFile(fs_.get(), manifest, &file, file_options);
327
10.2k
    if (!s.ok()) {
328
0
      return s;
329
0
    }
330
10.2k
    FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
331
10.2k
    file->SetPreallocationBlockSize(
332
10.2k
        immutable_db_options_.manifest_preallocation_size);
333
10.2k
    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
334
10.2k
        std::move(file), manifest, file_options, immutable_db_options_.clock,
335
10.2k
        io_tracer_, nullptr /* stats */,
336
10.2k
        Histograms::HISTOGRAM_ENUM_MAX /* hist_type */,
337
10.2k
        immutable_db_options_.listeners, nullptr,
338
10.2k
        tmp_set.Contains(FileType::kDescriptorFile),
339
10.2k
        tmp_set.Contains(FileType::kDescriptorFile)));
340
10.2k
    log::Writer log(std::move(file_writer), 0, false);
341
10.2k
    std::string record;
342
10.2k
    new_db_edit.EncodeTo(&record);
343
10.2k
    s = log.AddRecord(write_options, record);
344
10.2k
    if (s.ok()) {
345
10.2k
      s = SyncManifest(&immutable_db_options_, write_options, log.file());
346
10.2k
    }
347
10.2k
  }
348
10.2k
  if (s.ok()) {
349
    // Make "CURRENT" file that points to the new manifest file.
350
10.2k
    s = SetCurrentFile(write_options, fs_.get(), dbname_, 1,
351
10.2k
                       immutable_db_options_.metadata_write_temperature,
352
10.2k
                       directories_.GetDbDir());
353
10.2k
    if (new_filenames) {
354
10.2k
      new_filenames->emplace_back(
355
10.2k
          manifest.substr(manifest.find_last_of("/\\") + 1));
356
10.2k
    }
357
10.2k
  } else {
358
0
    fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
359
0
  }
360
10.2k
  return s;
361
10.2k
}
362
363
IOStatus DBImpl::CreateAndNewDirectory(
364
    FileSystem* fs, const std::string& dirname,
365
138k
    std::unique_ptr<FSDirectory>* directory) {
366
  // We call CreateDirIfMissing() as the directory may already exist (if we
367
  // are reopening a DB), when this happens we don't want creating the
368
  // directory to cause an error. However, we need to check if creating the
369
  // directory fails or else we may get an obscure message about the lock
370
  // file not existing. One real-world example of this occurring is if
371
  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
372
  // when dbname_ is "dir/db" but when "dir" doesn't exist.
373
138k
  IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr);
374
138k
  if (!io_s.ok()) {
375
0
    return io_s;
376
0
  }
377
138k
  return fs->NewDirectory(dirname, IOOptions(), directory, nullptr);
378
138k
}
379
380
IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
381
                                     const std::string& wal_dir,
382
58.3k
                                     const std::vector<DbPath>& data_paths) {
383
58.3k
  IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_);
384
58.3k
  if (!io_s.ok()) {
385
0
    return io_s;
386
0
  }
387
58.3k
  if (!wal_dir.empty() && dbname != wal_dir) {
388
0
    io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_);
389
0
    if (!io_s.ok()) {
390
0
      return io_s;
391
0
    }
392
0
  }
393
394
58.3k
  data_dirs_.clear();
395
58.3k
  for (auto& p : data_paths) {
396
58.3k
    const std::string db_path = p.path;
397
58.3k
    if (db_path == dbname) {
398
58.3k
      data_dirs_.emplace_back(nullptr);
399
58.3k
    } else {
400
0
      std::unique_ptr<FSDirectory> path_directory;
401
0
      io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory);
402
0
      if (!io_s.ok()) {
403
0
        return io_s;
404
0
      }
405
0
      data_dirs_.emplace_back(path_directory.release());
406
0
    }
407
58.3k
  }
408
58.3k
  assert(data_dirs_.size() == data_paths.size());
409
58.3k
  return IOStatus::OK();
410
58.3k
}
411
412
Status DBImpl::Recover(
413
    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
414
    bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
415
    bool is_retry, uint64_t* recovered_seq, RecoveryContext* recovery_ctx,
416
58.3k
    bool* can_retry) {
417
58.3k
  mutex_.AssertHeld();
418
419
58.3k
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
420
58.3k
  bool tmp_is_new_db = false;
421
58.3k
  bool& is_new_db = recovery_ctx ? recovery_ctx->is_new_db_ : tmp_is_new_db;
422
58.3k
  assert(db_lock_ == nullptr);
423
58.3k
  std::vector<std::string> files_in_dbname;
424
58.3k
  if (!read_only) {
425
58.3k
    Status s = directories_.SetDirectories(fs_.get(), dbname_,
426
58.3k
                                           immutable_db_options_.wal_dir,
427
58.3k
                                           immutable_db_options_.db_paths);
428
58.3k
    if (!s.ok()) {
429
0
      return s;
430
0
    }
431
432
58.3k
    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
433
58.3k
    if (!s.ok()) {
434
0
      return s;
435
0
    }
436
437
58.3k
    std::string current_fname = CurrentFileName(dbname_);
438
    // Path to any MANIFEST file in the db dir. It does not matter which one.
439
    // Since best-efforts recovery ignores CURRENT file, existence of a
440
    // MANIFEST indicates the recovery to recover existing db. If no MANIFEST
441
    // can be found, a new db will be created.
442
58.3k
    std::string manifest_path;
443
58.3k
    if (!immutable_db_options_.best_efforts_recovery) {
444
58.3k
      s = env_->FileExists(current_fname);
445
58.3k
    } else {
446
0
      s = Status::NotFound();
447
0
      IOOptions io_opts;
448
0
      io_opts.do_not_recurse = true;
449
0
      Status io_s = immutable_db_options_.fs->GetChildren(
450
0
          dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
451
0
      if (!io_s.ok()) {
452
0
        s = io_s;
453
0
        files_in_dbname.clear();
454
0
      }
455
0
      for (const std::string& file : files_in_dbname) {
456
0
        uint64_t number = 0;
457
0
        FileType type = kWalFile;  // initialize
458
0
        if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
459
0
          uint64_t bytes;
460
0
          s = env_->GetFileSize(DescriptorFileName(dbname_, number), &bytes);
461
0
          if (s.ok() && bytes != 0) {
462
            // Found non-empty MANIFEST (descriptor log), thus best-efforts
463
            // recovery does not have to treat the db as empty.
464
0
            manifest_path = dbname_ + "/" + file;
465
0
            break;
466
0
          }
467
0
        }
468
0
      }
469
0
    }
470
58.3k
    if (s.IsNotFound()) {
471
10.2k
      if (immutable_db_options_.create_if_missing) {
472
10.2k
        s = NewDB(&files_in_dbname);
473
10.2k
        is_new_db = true;
474
10.2k
        if (!s.ok()) {
475
0
          return s;
476
0
        }
477
10.2k
      } else {
478
0
        return Status::InvalidArgument(
479
0
            current_fname, "does not exist (create_if_missing is false)");
480
0
      }
481
48.1k
    } else if (s.ok()) {
482
48.1k
      if (immutable_db_options_.error_if_exists) {
483
0
        return Status::InvalidArgument(dbname_,
484
0
                                       "exists (error_if_exists is true)");
485
0
      }
486
48.1k
    } else {
487
      // Unexpected error reading file
488
0
      assert(s.IsIOError());
489
0
      return s;
490
0
    }
491
    // Verify compatibility of file_options_ and filesystem
492
58.3k
    {
493
58.3k
      std::unique_ptr<FSRandomAccessFile> idfile;
494
58.3k
      FileOptions customized_fs(file_options_);
495
58.3k
      customized_fs.use_direct_reads |=
496
58.3k
          immutable_db_options_.use_direct_io_for_flush_and_compaction;
497
58.3k
      const std::string& fname =
498
58.3k
          manifest_path.empty() ? current_fname : manifest_path;
499
58.3k
      s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
500
58.3k
      if (!s.ok()) {
501
0
        std::string error_str = s.ToString();
502
        // Check if unsupported Direct I/O is the root cause
503
0
        customized_fs.use_direct_reads = false;
504
0
        s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
505
0
        if (s.ok()) {
506
0
          return Status::InvalidArgument(
507
0
              "Direct I/O is not supported by the specified DB.");
508
0
        } else {
509
0
          return Status::InvalidArgument(
510
0
              "Found options incompatible with filesystem", error_str.c_str());
511
0
        }
512
0
      }
513
58.3k
    }
514
58.3k
  } else if (immutable_db_options_.best_efforts_recovery) {
515
0
    assert(files_in_dbname.empty());
516
0
    IOOptions io_opts;
517
0
    io_opts.do_not_recurse = true;
518
0
    Status s = immutable_db_options_.fs->GetChildren(
519
0
        dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
520
0
    if (s.IsNotFound()) {
521
0
      return Status::InvalidArgument(dbname_,
522
0
                                     "does not exist (open for read only)");
523
0
    } else if (s.IsIOError()) {
524
0
      return s;
525
0
    }
526
0
    assert(s.ok());
527
0
  }
528
58.3k
  assert(is_new_db || db_id_.empty());
529
58.3k
  Status s;
530
58.3k
  bool missing_table_file = false;
531
58.3k
  if (!immutable_db_options_.best_efforts_recovery) {
532
    // Status of reading the descriptor file
533
58.3k
    Status desc_status;
534
58.3k
    s = versions_->Recover(column_families, read_only, &db_id_,
535
58.3k
                           /*no_error_if_files_missing=*/false, is_retry,
536
58.3k
                           &desc_status);
537
58.3k
    desc_status.PermitUncheckedError();
538
58.3k
    if (is_retry) {
539
0
      RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT);
540
0
      if (desc_status.ok()) {
541
0
        RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
542
0
      }
543
0
    }
544
58.3k
    if (can_retry) {
545
      // If we're opening for the first time and the failure is likely due to
546
      // a corrupt MANIFEST file (could result in either the log::Reader
547
      // detecting a corrupt record, or SST files not found error due to
548
      // discarding badly formed tail records)
549
58.3k
      if (!is_retry &&
550
58.3k
          (desc_status.IsCorruption() || s.IsNotFound() || s.IsCorruption()) &&
551
58.3k
          CheckFSFeatureSupport(fs_.get(),
552
0
                                FSSupportedOps::kVerifyAndReconstructRead)) {
553
0
        *can_retry = true;
554
0
        ROCKS_LOG_ERROR(
555
0
            immutable_db_options_.info_log,
556
0
            "Possible corruption detected while replaying MANIFEST %s, %s. "
557
0
            "Will be retried.",
558
0
            desc_status.ToString().c_str(), s.ToString().c_str());
559
58.3k
      } else {
560
58.3k
        *can_retry = false;
561
58.3k
      }
562
58.3k
    }
563
58.3k
  } else {
564
0
    assert(!files_in_dbname.empty());
565
0
    s = versions_->TryRecover(column_families, read_only, files_in_dbname,
566
0
                              &db_id_, &missing_table_file);
567
0
    if (s.ok()) {
568
      // TryRecover may delete previous column_family_set_.
569
0
      column_family_memtables_.reset(
570
0
          new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
571
0
    }
572
0
  }
573
58.3k
  if (!s.ok()) {
574
0
    return s;
575
0
  }
576
58.3k
  if (s.ok() && !read_only) {
577
80.3k
    for (auto cfd : *versions_->GetColumnFamilySet()) {
578
80.3k
      const auto& moptions = cfd->GetLatestMutableCFOptions();
579
      // Try to trivially move files down the LSM tree to start from bottommost
580
      // level when level_compaction_dynamic_level_bytes is enabled. This should
581
      // only be useful when user is migrating to turning on this option.
582
      // If a user is migrating from Level Compaction with a smaller level
583
      // multiplier or from Universal Compaction, there may be too many
584
      // non-empty levels and the trivial moves here are not sufficed for
585
      // migration. Additional compactions are needed to drain unnecessary
586
      // levels.
587
      //
588
      // Note that this step moves files down LSM without consulting
589
      // SSTPartitioner. Further compactions are still needed if
590
      // the user wants to partition SST files.
591
      // Note that files moved in this step may not respect the compression
592
      // option in target level.
593
80.3k
      if (cfd->ioptions().compaction_style ==
594
80.3k
              CompactionStyle::kCompactionStyleLevel &&
595
80.3k
          cfd->ioptions().level_compaction_dynamic_level_bytes &&
596
80.3k
          !moptions.disable_auto_compactions) {
597
80.3k
        int to_level = cfd->ioptions().num_levels - 1;
598
        // last level is reserved
599
        // allow_ingest_behind does not support Level Compaction,
600
        // and per_key_placement can have infinite compaction loop for Level
601
        // Compaction. Adjust to_level here just to be safe.
602
80.3k
        if (cfd->ioptions().allow_ingest_behind ||
603
80.3k
            moptions.preclude_last_level_data_seconds > 0) {
604
0
          to_level -= 1;
605
0
        }
606
        // Whether this column family has a level trivially moved
607
80.3k
        bool moved = false;
608
        // Fill the LSM starting from to_level and going up one level at a time.
609
        // Some loop invariants (when last level is not reserved):
610
        // - levels in (from_level, to_level] are empty, and
611
        // - levels in (to_level, last_level] are non-empty.
612
642k
        for (int from_level = to_level; from_level >= 0; --from_level) {
613
562k
          const std::vector<FileMetaData*>& level_files =
614
562k
              cfd->current()->storage_info()->LevelFiles(from_level);
615
562k
          if (level_files.empty() || from_level == 0) {
616
547k
            continue;
617
547k
          }
618
15.0k
          assert(from_level <= to_level);
619
          // Trivial move files from `from_level` to `to_level`
620
15.0k
          if (from_level < to_level) {
621
0
            if (!moved) {
622
              // lsm_state will look like "[1,2,3,4,5,6,0]" for an LSM with
623
              // 7 levels
624
0
              std::string lsm_state = "[";
625
0
              for (int i = 0; i < cfd->ioptions().num_levels; ++i) {
626
0
                lsm_state += std::to_string(
627
0
                    cfd->current()->storage_info()->NumLevelFiles(i));
628
0
                if (i < cfd->ioptions().num_levels - 1) {
629
0
                  lsm_state += ",";
630
0
                }
631
0
              }
632
0
              lsm_state += "]";
633
0
              ROCKS_LOG_WARN(immutable_db_options_.info_log,
634
0
                             "[%s] Trivially move files down the LSM when open "
635
0
                             "with level_compaction_dynamic_level_bytes=true,"
636
0
                             " lsm_state: %s (Files are moved only if DB "
637
0
                             "Recovery is successful).",
638
0
                             cfd->GetName().c_str(), lsm_state.c_str());
639
0
              moved = true;
640
0
            }
641
0
            ROCKS_LOG_WARN(
642
0
                immutable_db_options_.info_log,
643
0
                "[%s] Moving %zu files from from_level-%d to from_level-%d",
644
0
                cfd->GetName().c_str(), level_files.size(), from_level,
645
0
                to_level);
646
0
            VersionEdit edit;
647
0
            edit.SetColumnFamily(cfd->GetID());
648
0
            for (const FileMetaData* f : level_files) {
649
0
              edit.DeleteFile(from_level, f->fd.GetNumber());
650
0
              edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
651
0
                           f->fd.GetFileSize(), f->smallest, f->largest,
652
0
                           f->fd.smallest_seqno, f->fd.largest_seqno,
653
0
                           f->marked_for_compaction,
654
0
                           f->temperature,  // this can be different from
655
                           // `last_level_temperature`
656
0
                           f->oldest_blob_file_number, f->oldest_ancester_time,
657
0
                           f->file_creation_time, f->epoch_number,
658
0
                           f->file_checksum, f->file_checksum_func_name,
659
0
                           f->unique_id, f->compensated_range_deletion_size,
660
0
                           f->tail_size, f->user_defined_timestamps_persisted);
661
0
              ROCKS_LOG_WARN(immutable_db_options_.info_log,
662
0
                             "[%s] Moving #%" PRIu64
663
0
                             " from from_level-%d to from_level-%d %" PRIu64
664
0
                             " bytes\n",
665
0
                             cfd->GetName().c_str(), f->fd.GetNumber(),
666
0
                             from_level, to_level, f->fd.GetFileSize());
667
0
            }
668
0
            recovery_ctx->UpdateVersionEdits(cfd, edit);
669
0
          }
670
15.0k
          --to_level;
671
15.0k
        }
672
80.3k
      }
673
80.3k
    }
674
58.3k
  }
675
58.3k
  if (is_new_db) {
676
    // Already set up DB ID in NewDB
677
48.1k
  } else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) {
678
48.1k
    VersionEdit edit;
679
48.1k
    s = SetupDBId(write_options, read_only, is_new_db, is_retry, &edit);
680
48.1k
    recovery_ctx->UpdateVersionEdits(
681
48.1k
        versions_->GetColumnFamilySet()->GetDefault(), edit);
682
48.1k
  } else {
683
0
    s = SetupDBId(write_options, read_only, is_new_db, is_retry, nullptr);
684
0
  }
685
58.3k
  assert(!s.ok() || !db_id_.empty());
686
58.3k
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
687
58.3k
  if (s.ok() && !read_only) {
688
58.3k
    s = MaybeUpdateNextFileNumber(recovery_ctx);
689
58.3k
  }
690
691
58.3k
  if (s.ok() && !read_only) {
692
    // TODO: share file descriptors (FSDirectory) with SetDirectories above
693
58.3k
    std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
694
80.3k
    for (auto cfd : *versions_->GetColumnFamilySet()) {
695
80.3k
      s = cfd->AddDirectories(&created_dirs);
696
80.3k
      if (!s.ok()) {
697
0
        return s;
698
0
      }
699
80.3k
    }
700
58.3k
  }
701
702
58.3k
  std::vector<std::string> files_in_wal_dir;
703
58.3k
  if (s.ok()) {
704
    // Initial max_total_in_memory_state_ before recovery wals. Log recovery
705
    // may check this value to decide whether to flush.
706
58.3k
    max_total_in_memory_state_ = 0;
707
80.3k
    for (auto cfd : *versions_->GetColumnFamilySet()) {
708
80.3k
      const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
709
80.3k
      max_total_in_memory_state_ += mutable_cf_options.write_buffer_size *
710
80.3k
                                    mutable_cf_options.max_write_buffer_number;
711
80.3k
    }
712
713
58.3k
    SequenceNumber next_sequence(kMaxSequenceNumber);
714
58.3k
    default_cf_handle_ = new ColumnFamilyHandleImpl(
715
58.3k
        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
716
58.3k
    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
717
718
    // Recover from all newer log files than the ones named in the
719
    // descriptor (new log files may have been added by the previous
720
    // incarnation without registering them in the descriptor).
721
    //
722
    // Note that prev_log_number() is no longer used, but we pay
723
    // attention to it in case we are recovering a database
724
    // produced by an older version of rocksdb.
725
58.3k
    auto wal_dir = immutable_db_options_.GetWalDir();
726
58.3k
    if (!immutable_db_options_.best_efforts_recovery) {
727
58.3k
      IOOptions io_opts;
728
58.3k
      io_opts.do_not_recurse = true;
729
58.3k
      s = immutable_db_options_.fs->GetChildren(
730
58.3k
          wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr);
731
58.3k
    }
732
58.3k
    if (s.IsNotFound()) {
733
0
      return Status::InvalidArgument("wal_dir not found", wal_dir);
734
58.3k
    } else if (!s.ok()) {
735
0
      return s;
736
0
    }
737
738
58.3k
    std::unordered_map<uint64_t, std::string> wal_files;
739
1.06M
    for (const auto& file : files_in_wal_dir) {
740
1.06M
      uint64_t number;
741
1.06M
      FileType type;
742
1.06M
      if (ParseFileName(file, &number, &type) && type == kWalFile) {
743
48.1k
        if (is_new_db) {
744
0
          return Status::Corruption(
745
0
              "While creating a new Db, wal_dir contains "
746
0
              "existing log file: ",
747
0
              file);
748
48.1k
        } else {
749
48.1k
          wal_files[number] = LogFileName(wal_dir, number);
750
48.1k
        }
751
48.1k
      }
752
1.06M
    }
753
754
58.3k
    if (immutable_db_options_.track_and_verify_wals && !is_new_db &&
755
58.3k
        !immutable_db_options_.best_efforts_recovery && wal_files.empty()) {
756
0
      return Status::Corruption("Opening an existing DB with no WAL files");
757
0
    }
758
759
58.3k
    if (immutable_db_options_.track_and_verify_wals_in_manifest) {
760
0
      if (!immutable_db_options_.best_efforts_recovery) {
761
        // Verify WALs in MANIFEST.
762
0
        s = versions_->GetWalSet().CheckWals(env_, wal_files);
763
0
      }  // else since best effort recovery does not recover from WALs, no need
764
         // to check WALs.
765
58.3k
    } else if (!versions_->GetWalSet().GetWals().empty()) {
766
      // Tracking is disabled, clear previously tracked WALs from MANIFEST,
767
      // otherwise, in the future, if WAL tracking is enabled again,
768
      // since the WALs deleted when WAL tracking is disabled are not persisted
769
      // into MANIFEST, WAL check may fail.
770
0
      VersionEdit edit;
771
0
      WalNumber max_wal_number =
772
0
          versions_->GetWalSet().GetWals().rbegin()->first;
773
0
      edit.DeleteWalsBefore(max_wal_number + 1);
774
0
      assert(recovery_ctx != nullptr);
775
0
      assert(versions_->GetColumnFamilySet() != nullptr);
776
0
      recovery_ctx->UpdateVersionEdits(
777
0
          versions_->GetColumnFamilySet()->GetDefault(), edit);
778
0
    }
779
58.3k
    if (!s.ok()) {
780
0
      return s;
781
0
    }
782
783
58.3k
    if (!wal_files.empty()) {
784
48.1k
      if (error_if_wal_file_exists) {
785
0
        return Status::Corruption(
786
0
            "The db was opened in readonly mode with error_if_wal_file_exists"
787
0
            "flag but a WAL file already exists");
788
48.1k
      } else if (error_if_data_exists_in_wals) {
789
0
        for (auto& wal_file : wal_files) {
790
0
          uint64_t bytes;
791
0
          s = env_->GetFileSize(wal_file.second, &bytes);
792
0
          if (s.ok()) {
793
0
            if (bytes > 0) {
794
0
              return Status::Corruption(
795
0
                  "error_if_data_exists_in_wals is set but there are data "
796
0
                  " in WAL files.");
797
0
            }
798
0
          }
799
0
        }
800
0
      }
801
48.1k
    }
802
803
58.3k
    if (!wal_files.empty()) {
804
      // Recover in the order in which the wals were generated
805
48.1k
      std::vector<uint64_t> wals;
806
48.1k
      wals.reserve(wal_files.size());
807
48.1k
      for (const auto& wal_file : wal_files) {
808
48.1k
        wals.push_back(wal_file.first);
809
48.1k
      }
810
48.1k
      std::sort(wals.begin(), wals.end());
811
812
48.1k
      bool corrupted_wal_found = false;
813
48.1k
      s = RecoverLogFiles(wals, &next_sequence, read_only, is_retry,
814
48.1k
                          &corrupted_wal_found, recovery_ctx);
815
48.1k
      if (corrupted_wal_found && recovered_seq != nullptr) {
816
0
        *recovered_seq = next_sequence;
817
0
      }
818
48.1k
      if (!s.ok()) {
819
        // Clear memtables if recovery failed
820
0
        for (auto cfd : *versions_->GetColumnFamilySet()) {
821
0
          cfd->CreateNewMemtable(kMaxSequenceNumber);
822
0
        }
823
0
      }
824
48.1k
    }
825
58.3k
  }
826
827
58.3k
  if (read_only) {
828
    // If we are opening as read-only, we need to update options_file_number_
829
    // to reflect the most recent OPTIONS file. It does not matter for regular
830
    // read-write db instance because options_file_number_ will later be
831
    // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
832
0
    std::vector<std::string> filenames;
833
0
    if (s.ok()) {
834
0
      const std::string normalized_dbname = NormalizePath(dbname_);
835
0
      const std::string normalized_wal_dir =
836
0
          NormalizePath(immutable_db_options_.GetWalDir());
837
0
      if (immutable_db_options_.best_efforts_recovery) {
838
0
        filenames = std::move(files_in_dbname);
839
0
      } else if (normalized_dbname == normalized_wal_dir) {
840
0
        filenames = std::move(files_in_wal_dir);
841
0
      } else {
842
0
        IOOptions io_opts;
843
0
        io_opts.do_not_recurse = true;
844
0
        s = immutable_db_options_.fs->GetChildren(
845
0
            GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr);
846
0
      }
847
0
    }
848
0
    if (s.ok()) {
849
0
      uint64_t number = 0;
850
0
      uint64_t options_file_number = 0;
851
0
      FileType type;
852
0
      for (const auto& fname : filenames) {
853
0
        if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
854
0
          options_file_number = std::max(number, options_file_number);
855
0
        }
856
0
      }
857
0
      versions_->options_file_number_ = options_file_number;
858
0
      uint64_t options_file_size = 0;
859
0
      if (options_file_number > 0) {
860
0
        s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number),
861
0
                              &options_file_size);
862
0
      }
863
0
      versions_->options_file_size_ = options_file_size;
864
0
    }
865
0
  }
866
58.3k
  return s;
867
58.3k
}
868
869
0
Status DBImpl::PersistentStatsProcessFormatVersion() {
870
0
  mutex_.AssertHeld();
871
0
  Status s;
872
  // persist version when stats CF doesn't exist
873
0
  bool should_persist_format_version = !persistent_stats_cfd_exists_;
874
0
  mutex_.Unlock();
875
0
  if (persistent_stats_cfd_exists_) {
876
    // Check persistent stats format version compatibility. Drop and recreate
877
    // persistent stats CF if format version is incompatible
878
0
    uint64_t format_version_recovered = 0;
879
0
    Status s_format = DecodePersistentStatsVersionNumber(
880
0
        this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
881
0
    uint64_t compatible_version_recovered = 0;
882
0
    Status s_compatible = DecodePersistentStatsVersionNumber(
883
0
        this, StatsVersionKeyType::kCompatibleVersion,
884
0
        &compatible_version_recovered);
885
    // abort reading from existing stats CF if any of following is true:
886
    // 1. failed to read format version or compatible version from disk
887
    // 2. sst's format version is greater than current format version, meaning
888
    // this sst is encoded with a newer RocksDB release, and current compatible
889
    // version is below the sst's compatible version
890
0
    if (!s_format.ok() || !s_compatible.ok() ||
891
0
        (kStatsCFCurrentFormatVersion < format_version_recovered &&
892
0
         kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
893
0
      if (!s_format.ok() || !s_compatible.ok()) {
894
0
        ROCKS_LOG_WARN(
895
0
            immutable_db_options_.info_log,
896
0
            "Recreating persistent stats column family since reading "
897
0
            "persistent stats version key failed. Format key: %s, compatible "
898
0
            "key: %s",
899
0
            s_format.ToString().c_str(), s_compatible.ToString().c_str());
900
0
      } else {
901
0
        ROCKS_LOG_WARN(
902
0
            immutable_db_options_.info_log,
903
0
            "Recreating persistent stats column family due to corrupted or "
904
0
            "incompatible format version. Recovered format: %" PRIu64
905
0
            "; recovered format compatible since: %" PRIu64 "\n",
906
0
            format_version_recovered, compatible_version_recovered);
907
0
      }
908
0
      s = DropColumnFamily(persist_stats_cf_handle_);
909
0
      if (s.ok()) {
910
0
        s = DestroyColumnFamilyHandle(persist_stats_cf_handle_);
911
0
      }
912
0
      ColumnFamilyHandle* handle = nullptr;
913
0
      if (s.ok()) {
914
0
        ColumnFamilyOptions cfo;
915
0
        OptimizeForPersistentStats(&cfo);
916
0
        s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen),
917
0
                                   WriteOptions(Env::IOActivity::kDBOpen), cfo,
918
0
                                   kPersistentStatsColumnFamilyName, &handle);
919
0
      }
920
0
      if (s.ok()) {
921
0
        persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
922
        // should also persist version here because old stats CF is discarded
923
0
        should_persist_format_version = true;
924
0
      }
925
0
    }
926
0
  }
927
0
  if (should_persist_format_version) {
928
    // Persistent stats CF being created for the first time, need to write
929
    // format version key
930
0
    WriteBatch batch;
931
0
    if (s.ok()) {
932
0
      s = batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
933
0
                    std::to_string(kStatsCFCurrentFormatVersion));
934
0
    }
935
0
    if (s.ok()) {
936
0
      s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
937
0
                    std::to_string(kStatsCFCompatibleFormatVersion));
938
0
    }
939
0
    if (s.ok()) {
940
      // TODO: plumb Env::IOActivity, Env::IOPriority
941
0
      WriteOptions wo;
942
0
      wo.low_pri = true;
943
0
      wo.no_slowdown = true;
944
0
      wo.sync = false;
945
0
      s = Write(wo, &batch);
946
0
    }
947
0
  }
948
0
  mutex_.Lock();
949
0
  return s;
950
0
}
951
952
0
Status DBImpl::InitPersistStatsColumnFamily() {
953
0
  mutex_.AssertHeld();
954
0
  assert(!persist_stats_cf_handle_);
955
0
  ColumnFamilyData* persistent_stats_cfd =
956
0
      versions_->GetColumnFamilySet()->GetColumnFamily(
957
0
          kPersistentStatsColumnFamilyName);
958
0
  persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
959
960
0
  Status s;
961
0
  if (persistent_stats_cfd != nullptr) {
962
    // We are recovering from a DB which already contains persistent stats CF,
963
    // the CF is already created in VersionSet::ApplyOneVersionEdit, but
964
    // column family handle was not. Need to explicitly create handle here.
965
0
    persist_stats_cf_handle_ =
966
0
        new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
967
0
  } else {
968
0
    mutex_.Unlock();
969
0
    ColumnFamilyHandle* handle = nullptr;
970
0
    ColumnFamilyOptions cfo;
971
0
    OptimizeForPersistentStats(&cfo);
972
0
    s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen),
973
0
                               WriteOptions(Env::IOActivity::kDBOpen), cfo,
974
0
                               kPersistentStatsColumnFamilyName, &handle);
975
0
    persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
976
0
    mutex_.Lock();
977
0
  }
978
0
  return s;
979
0
}
980
981
58.3k
Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
982
58.3k
  mutex_.AssertHeld();
983
58.3k
  assert(versions_->descriptor_log_ == nullptr);
984
58.3k
  const ReadOptions read_options(Env::IOActivity::kDBOpen);
985
58.3k
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
986
987
58.3k
  Status s = versions_->LogAndApply(recovery_ctx.cfds_, read_options,
988
58.3k
                                    write_options, recovery_ctx.edit_lists_,
989
58.3k
                                    &mutex_, directories_.GetDbDir());
990
58.3k
  return s;
991
58.3k
}
992
993
48.1k
void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() {
994
48.1k
  if (immutable_db_options_.wal_filter == nullptr) {
995
48.1k
    return;
996
48.1k
  }
997
0
  assert(immutable_db_options_.wal_filter != nullptr);
998
0
  WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
999
1000
0
  std::map<std::string, uint32_t> cf_name_id_map;
1001
0
  std::map<uint32_t, uint64_t> cf_lognumber_map;
1002
0
  assert(versions_);
1003
0
  assert(versions_->GetColumnFamilySet());
1004
0
  for (auto cfd : *versions_->GetColumnFamilySet()) {
1005
0
    assert(cfd);
1006
0
    cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
1007
0
    cf_lognumber_map.insert(std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
1008
0
  }
1009
1010
0
  wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map);
1011
0
}
1012
1013
bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
1014
                                                const std::string& wal_fname,
1015
                                                log::Reader::Reporter& reporter,
1016
                                                Status& status,
1017
                                                bool& stop_replay,
1018
1.01M
                                                WriteBatch& batch) {
1019
1.01M
  if (immutable_db_options_.wal_filter == nullptr) {
1020
1.01M
    return true;
1021
1.01M
  }
1022
0
  assert(immutable_db_options_.wal_filter != nullptr);
1023
0
  WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
1024
1025
0
  WriteBatch new_batch;
1026
0
  bool batch_changed = false;
1027
1028
0
  bool process_current_record = true;
1029
1030
0
  WalFilter::WalProcessingOption wal_processing_option =
1031
0
      wal_filter.LogRecordFound(wal_number, wal_fname, batch, &new_batch,
1032
0
                                &batch_changed);
1033
1034
0
  switch (wal_processing_option) {
1035
0
    case WalFilter::WalProcessingOption::kContinueProcessing:
1036
      // do nothing, proceeed normally
1037
0
      break;
1038
0
    case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
1039
      // skip current record
1040
0
      process_current_record = false;
1041
0
      break;
1042
0
    case WalFilter::WalProcessingOption::kStopReplay:
1043
      // skip current record and stop replay
1044
0
      process_current_record = false;
1045
0
      stop_replay = true;
1046
0
      break;
1047
0
    case WalFilter::WalProcessingOption::kCorruptedRecord: {
1048
0
      status = Status::Corruption("Corruption reported by Wal Filter ",
1049
0
                                  wal_filter.Name());
1050
0
      MaybeIgnoreError(&status);
1051
0
      if (!status.ok()) {
1052
0
        process_current_record = false;
1053
0
        reporter.Corruption(batch.GetDataSize(), status);
1054
0
      }
1055
0
      break;
1056
0
    }
1057
0
    default: {
1058
      // logical error which should not happen. If RocksDB throws, we would
1059
      // just do `throw std::logic_error`.
1060
0
      assert(false);
1061
0
      status = Status::NotSupported(
1062
0
          "Unknown WalProcessingOption returned by Wal Filter ",
1063
0
          wal_filter.Name());
1064
0
      MaybeIgnoreError(&status);
1065
0
      if (!status.ok()) {
1066
        // Ignore the error with current record processing.
1067
0
        stop_replay = true;
1068
0
      }
1069
0
      break;
1070
0
    }
1071
0
  }
1072
1073
0
  if (!process_current_record) {
1074
0
    return false;
1075
0
  }
1076
1077
0
  if (batch_changed) {
1078
    // Make sure that the count in the new batch is
1079
    // within the orignal count.
1080
0
    int new_count = WriteBatchInternal::Count(&new_batch);
1081
0
    int original_count = WriteBatchInternal::Count(&batch);
1082
0
    if (new_count > original_count) {
1083
0
      ROCKS_LOG_FATAL(
1084
0
          immutable_db_options_.info_log,
1085
0
          "Recovering log #%" PRIu64
1086
0
          " mode %d log filter %s returned "
1087
0
          "more records (%d) than original (%d) which is not allowed. "
1088
0
          "Aborting recovery.",
1089
0
          wal_number, static_cast<int>(immutable_db_options_.wal_recovery_mode),
1090
0
          wal_filter.Name(), new_count, original_count);
1091
0
      status = Status::NotSupported(
1092
0
          "More than original # of records "
1093
0
          "returned by Wal Filter ",
1094
0
          wal_filter.Name());
1095
0
      return false;
1096
0
    }
1097
    // Set the same sequence number in the new_batch
1098
    // as the original batch.
1099
0
    WriteBatchInternal::SetSequence(&new_batch,
1100
0
                                    WriteBatchInternal::Sequence(&batch));
1101
0
    batch = new_batch;
1102
0
  }
1103
0
  return true;
1104
0
}
1105
1106
void DBOpenLogRecordReadReporter::Corruption(size_t bytes, const Status& s,
1107
0
                                             uint64_t log_number) {
1108
0
  ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
1109
0
                 (status == nullptr ? "(ignoring error) " : ""), fname,
1110
0
                 static_cast<int>(bytes), s.ToString().c_str());
1111
0
  if (status != nullptr && status->ok()) {
1112
0
    *status = s;
1113
0
    corrupted_wal_number_ = log_number;
1114
0
  }
1115
0
}
1116
1117
0
void DBOpenLogRecordReadReporter::OldLogRecord(size_t bytes) {
1118
0
  if (old_log_record != nullptr) {
1119
0
    *old_log_record = true;
1120
0
  }
1121
0
  ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes; possibly recycled", fname,
1122
0
                 static_cast<int>(bytes));
1123
0
}
1124
1125
// REQUIRES: wal_numbers are sorted in ascending order
1126
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1127
                               SequenceNumber* next_sequence, bool read_only,
1128
                               bool is_retry, bool* corrupted_wal_found,
1129
48.1k
                               RecoveryContext* recovery_ctx) {
1130
48.1k
  mutex_.AssertHeld();
1131
1132
48.1k
  std::unordered_map<int, VersionEdit> version_edits;
1133
48.1k
  int job_id = 0;
1134
48.1k
  uint64_t min_wal_number = 0;
1135
48.1k
  SetupLogFilesRecovery(wal_numbers, &version_edits, &job_id, &min_wal_number);
1136
1137
48.1k
  Status status = ProcessLogFiles(
1138
48.1k
      wal_numbers, read_only, is_retry, min_wal_number, job_id, next_sequence,
1139
48.1k
      &version_edits, corrupted_wal_found, recovery_ctx);
1140
1141
48.1k
  FinishLogFilesRecovery(job_id, status);
1142
48.1k
  return status;
1143
48.1k
}
1144
1145
void DBImpl::SetupLogFilesRecovery(
1146
    const std::vector<uint64_t>& wal_numbers,
1147
    std::unordered_map<int, VersionEdit>* version_edits, int* job_id,
1148
48.1k
    uint64_t* min_wal_number) {
1149
48.1k
  assert(version_edits);
1150
48.1k
  assert(job_id);
1151
48.1k
  assert(min_wal_number);
1152
  // No need to refcount because iteration is under mutex
1153
70.1k
  for (auto cfd : *versions_->GetColumnFamilySet()) {
1154
70.1k
    VersionEdit edit;
1155
70.1k
    edit.SetColumnFamily(cfd->GetID());
1156
70.1k
    version_edits->insert({cfd->GetID(), edit});
1157
70.1k
  }
1158
1159
48.1k
  *job_id = next_job_id_.fetch_add(1);
1160
48.1k
  {
1161
48.1k
    auto stream = event_logger_.Log();
1162
48.1k
    stream << "job" << *job_id;
1163
48.1k
    stream << "event" << "recovery_started";
1164
48.1k
    stream << "wal_files";
1165
48.1k
    stream.StartArray();
1166
48.1k
    for (auto wal_number : wal_numbers) {
1167
48.1k
      stream << wal_number;
1168
48.1k
    }
1169
48.1k
    stream.EndArray();
1170
48.1k
  }
1171
1172
  // No-op for immutable_db_options_.wal_filter == nullptr.
1173
48.1k
  InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
1174
1175
48.1k
  *min_wal_number = MinLogNumberToKeep();
1176
48.1k
  if (!allow_2pc()) {
1177
    // In non-2pc mode, we skip WALs that do not back unflushed data.
1178
48.1k
    *min_wal_number =
1179
48.1k
        std::max(*min_wal_number, versions_->MinLogNumberWithUnflushedData());
1180
48.1k
  }
1181
48.1k
}
1182
1183
Status DBImpl::ProcessLogFiles(
1184
    const std::vector<uint64_t>& wal_numbers, bool read_only, bool is_retry,
1185
    uint64_t min_wal_number, int job_id, SequenceNumber* next_sequence,
1186
    std::unordered_map<int, VersionEdit>* version_edits,
1187
48.1k
    bool* corrupted_wal_found, RecoveryContext* recovery_ctx) {
1188
48.1k
  Status status;
1189
1190
48.1k
  bool stop_replay_by_wal_filter = false;
1191
48.1k
  bool stop_replay_for_corruption = false;
1192
48.1k
  bool flushed = false;
1193
48.1k
  uint64_t corrupted_wal_number = kMaxSequenceNumber;
1194
48.1k
  PredecessorWALInfo predecessor_wal_info;
1195
1196
48.1k
  for (auto wal_number : wal_numbers) {
1197
    // Detecting early break on the next iteration after `wal_number` has been
1198
    // advanced since this `wal_number` doesn't affect follow-up handling after
1199
    // breaking out of the for loop.
1200
48.1k
    if (!status.ok()) {
1201
0
      break;
1202
0
    }
1203
48.1k
    SequenceNumber prev_next_sequence = *next_sequence;
1204
48.1k
    if (status.ok()) {
1205
48.1k
      status = ProcessLogFile(
1206
48.1k
          wal_number, min_wal_number, is_retry, read_only, job_id,
1207
48.1k
          next_sequence, &stop_replay_for_corruption,
1208
48.1k
          &stop_replay_by_wal_filter, &corrupted_wal_number,
1209
48.1k
          corrupted_wal_found, version_edits, &flushed, predecessor_wal_info);
1210
48.1k
    }
1211
48.1k
    if (status.ok()) {
1212
48.1k
      status = CheckSeqnoNotSetBackDuringRecovery(prev_next_sequence,
1213
48.1k
                                                  *next_sequence);
1214
48.1k
    }
1215
48.1k
  }
1216
1217
48.1k
  if (status.ok()) {
1218
48.1k
    status = MaybeHandleStopReplayForCorruptionForInconsistency(
1219
48.1k
        stop_replay_for_corruption, corrupted_wal_number);
1220
48.1k
  }
1221
1222
48.1k
  if (status.ok()) {
1223
48.1k
    status = MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
1224
48.1k
        wal_numbers, read_only, job_id, flushed, version_edits, recovery_ctx);
1225
48.1k
  }
1226
48.1k
  return status;
1227
48.1k
}
1228
1229
Status DBImpl::ProcessLogFile(
1230
    uint64_t wal_number, uint64_t min_wal_number, bool is_retry, bool read_only,
1231
    int job_id, SequenceNumber* next_sequence, bool* stop_replay_for_corruption,
1232
    bool* stop_replay_by_wal_filter, uint64_t* corrupted_wal_number,
1233
    bool* corrupted_wal_found,
1234
    std::unordered_map<int, VersionEdit>* version_edits, bool* flushed,
1235
48.1k
    PredecessorWALInfo& predecessor_wal_info) {
1236
48.1k
  assert(stop_replay_by_wal_filter);
1237
1238
  // Variable initialization starts
1239
48.1k
  Status status;
1240
48.1k
  bool old_log_record = false;
1241
1242
48.1k
  DBOpenLogRecordReadReporter reporter;
1243
48.1k
  std::unique_ptr<log::Reader> reader;
1244
1245
48.1k
  std::string fname =
1246
48.1k
      LogFileName(immutable_db_options_.GetWalDir(), wal_number);
1247
1248
48.1k
  auto logFileDropped = [this, &fname]() {
1249
0
    uint64_t bytes;
1250
0
    if (env_->GetFileSize(fname, &bytes).ok()) {
1251
0
      auto info_log = immutable_db_options_.info_log.get();
1252
0
      ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
1253
0
                     static_cast<int>(bytes));
1254
0
    }
1255
0
  };
1256
1257
48.1k
  std::string scratch;
1258
48.1k
  Slice record;
1259
48.1k
  uint64_t record_checksum;
1260
48.1k
  const UnorderedMap<uint32_t, size_t>& running_ts_sz =
1261
48.1k
      versions_->GetRunningColumnFamiliesTimestampSize();
1262
1263
  // We need to track `last_seqno_observed` in addition to `next_sequence` since
1264
  // `last_seqno_observed != *next_sequence` when there are multiple key-value
1265
  // pairs in one WAL entry
1266
48.1k
  SequenceNumber last_seqno_observed = 0;
1267
  // Variable initialization ends
1268
1269
48.1k
  if (wal_number < min_wal_number) {
1270
0
    ROCKS_LOG_INFO(immutable_db_options_.info_log,
1271
0
                   "Skipping log #%" PRIu64
1272
0
                   " since it is older than min log to keep #%" PRIu64,
1273
0
                   wal_number, min_wal_number);
1274
0
    assert(status.ok());
1275
0
    return status;
1276
0
  }
1277
1278
48.1k
  SetupLogFileProcessing(wal_number);
1279
1280
48.1k
  if (*stop_replay_by_wal_filter) {
1281
0
    logFileDropped();
1282
0
    assert(status.ok());
1283
0
    return status;
1284
0
  }
1285
1286
48.1k
  Status init_status = InitializeLogReader(
1287
48.1k
      wal_number, is_retry, fname, *stop_replay_for_corruption, min_wal_number,
1288
48.1k
      predecessor_wal_info, &old_log_record, &status, &reporter, reader);
1289
1290
  // FIXME(hx235): Consolidate `!init_status.ok()` and `reader == nullptr` cases
1291
48.1k
  if (!init_status.ok()) {
1292
0
    assert(status.ok());
1293
0
    status.PermitUncheckedError();
1294
0
    return init_status;
1295
48.1k
  } else if (reader == nullptr) {
1296
    // TODO(hx235): remove this case since it's confusing
1297
0
    assert(status.ok());
1298
    // Fail initializing log reader for one log file with an ok status.
1299
    // Try next one.
1300
0
    return status;
1301
0
  }
1302
1303
48.1k
  TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
1304
48.1k
                           /*cb_arg=*/nullptr);
1305
1.06M
  while (true) {
1306
1.06M
    if (*stop_replay_by_wal_filter) {
1307
0
      break;
1308
0
    }
1309
1310
1.06M
    bool read_record = reader->ReadRecord(
1311
1.06M
        &record, &scratch, immutable_db_options_.wal_recovery_mode,
1312
1.06M
        &record_checksum);
1313
1314
    // `reader->ReadRecord` will change `status` through reporter in `reader`
1315
    // when a corruption is encountered
1316
    // FIXME(hx235): consolidate `read_record` and `status`
1317
1.06M
    if (!read_record || !status.ok()) {
1318
48.1k
      break;
1319
48.1k
    }
1320
1321
    // FIXME(hx235): consolidate `process_status` and `status`
1322
1.01M
    SequenceNumber prev_next_sequence = *next_sequence;
1323
1.01M
    Status process_status = ProcessLogRecord(
1324
1.01M
        record, reader, running_ts_sz, wal_number, fname, read_only, job_id,
1325
1.01M
        logFileDropped, &reporter, &record_checksum, &last_seqno_observed,
1326
1.01M
        next_sequence, stop_replay_for_corruption, &status,
1327
1.01M
        stop_replay_by_wal_filter, version_edits, flushed);
1328
1329
1.01M
    if (!process_status.ok()) {
1330
0
      return process_status;
1331
1.01M
    } else if (Status seqno_check_status = CheckSeqnoNotSetBackDuringRecovery(
1332
1.01M
                   prev_next_sequence, *next_sequence);
1333
1.01M
               !seqno_check_status.ok()) {
1334
      // Sequence number being set back indicates a serious software bug, the DB
1335
      // should not be opened in this case.
1336
0
      return seqno_check_status;
1337
1.01M
    } else if (*stop_replay_for_corruption) {
1338
0
      break;
1339
0
    }
1340
1.01M
  }
1341
1342
48.1k
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
1343
48.1k
                 "Recovered to log #%" PRIu64 " next seq #%" PRIu64, wal_number,
1344
48.1k
                 *next_sequence);
1345
1346
48.1k
  if (status.ok()) {
1347
48.1k
    status = UpdatePredecessorWALInfo(wal_number, last_seqno_observed, fname,
1348
48.1k
                                      predecessor_wal_info);
1349
48.1k
  }
1350
1351
48.1k
  if (!status.ok() || old_log_record) {
1352
0
    status = HandleNonOkStatusOrOldLogRecord(
1353
0
        wal_number, next_sequence, status, reporter, &old_log_record,
1354
0
        stop_replay_for_corruption, corrupted_wal_number, corrupted_wal_found);
1355
0
  }
1356
1357
48.1k
  FinishLogFileProcessing(status, next_sequence);
1358
1359
48.1k
  return status;
1360
48.1k
}
1361
1362
48.1k
void DBImpl::SetupLogFileProcessing(uint64_t wal_number) {
1363
  // The previous incarnation may not have written any MANIFEST
1364
  // records after allocating this log number.  So we manually
1365
  // update the file number allocation counter in VersionSet.
1366
48.1k
  versions_->MarkFileNumberUsed(wal_number);
1367
1368
48.1k
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
1369
48.1k
                 "Recovering log #%" PRIu64 " mode %d", wal_number,
1370
48.1k
                 static_cast<int>(immutable_db_options_.wal_recovery_mode));
1371
48.1k
}
1372
1373
Status DBImpl::InitializeLogReader(
1374
    uint64_t wal_number, bool is_retry, std::string& fname,
1375
    bool stop_replay_for_corruption, uint64_t min_wal_number,
1376
    const PredecessorWALInfo& predecessor_wal_info, bool* const old_log_record,
1377
    Status* const reporter_status, DBOpenLogRecordReadReporter* reporter,
1378
48.1k
    std::unique_ptr<log::Reader>& reader) {
1379
48.1k
  assert(old_log_record);
1380
48.1k
  assert(reporter_status);
1381
48.1k
  assert(reporter);
1382
1383
48.1k
  Status status;
1384
1385
48.1k
  std::unique_ptr<SequentialFileReader> file_reader;
1386
48.1k
  {
1387
48.1k
    std::unique_ptr<FSSequentialFile> file;
1388
48.1k
    status = fs_->NewSequentialFile(
1389
48.1k
        fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
1390
48.1k
    if (!status.ok()) {
1391
0
      MaybeIgnoreError(&status);
1392
0
      return status;
1393
0
    }
1394
48.1k
    file_reader.reset(new SequentialFileReader(
1395
48.1k
        std::move(file), fname, immutable_db_options_.log_readahead_size,
1396
48.1k
        io_tracer_, /*listeners=*/{}, /*rate_limiter=*/nullptr,
1397
48.1k
        /*verify_and_reconstruct_read=*/is_retry));
1398
48.1k
  }
1399
1400
  // Create the log reader.
1401
0
  reporter->env = env_;
1402
48.1k
  reporter->info_log = immutable_db_options_.info_log.get();
1403
48.1k
  reporter->fname = fname.c_str();
1404
48.1k
  reporter->old_log_record = old_log_record;
1405
48.1k
  if (!immutable_db_options_.paranoid_checks ||
1406
48.1k
      immutable_db_options_.wal_recovery_mode ==
1407
48.1k
          WALRecoveryMode::kSkipAnyCorruptedRecords) {
1408
0
    reporter->status = nullptr;
1409
48.1k
  } else {
1410
48.1k
    reporter->status = reporter_status;
1411
48.1k
  }
1412
  // We intentially make log::Reader do checksumming even if
1413
  // paranoid_checks==false so that corruptions cause entire commits
1414
  // to be skipped instead of propagating bad information (like overly
1415
  // large sequence numbers).
1416
48.1k
  reader.reset(new log::Reader(
1417
48.1k
      immutable_db_options_.info_log, std::move(file_reader), reporter,
1418
48.1k
      true /*checksum*/, wal_number,
1419
48.1k
      immutable_db_options_.track_and_verify_wals, stop_replay_for_corruption,
1420
48.1k
      min_wal_number, predecessor_wal_info));
1421
48.1k
  return status;
1422
48.1k
}
1423
1424
Status DBImpl::ProcessLogRecord(
1425
    Slice record, const std::unique_ptr<log::Reader>& reader,
1426
    const UnorderedMap<uint32_t, size_t>& running_ts_sz, uint64_t wal_number,
1427
    const std::string& fname, bool read_only, int job_id,
1428
    const std::function<void()>& logFileDropped,
1429
    DBOpenLogRecordReadReporter* reporter, uint64_t* record_checksum,
1430
    SequenceNumber* last_seqno_observed, SequenceNumber* next_sequence,
1431
    bool* stop_replay_for_corruption, Status* status,
1432
    bool* stop_replay_by_wal_filter,
1433
1.01M
    std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) {
1434
1.01M
  assert(reporter);
1435
1.01M
  assert(last_seqno_observed);
1436
1.01M
  assert(stop_replay_for_corruption);
1437
1.01M
  assert(status);
1438
1.01M
  assert(stop_replay_by_wal_filter);
1439
1440
1.01M
  Status process_status;
1441
1.01M
  bool has_valid_writes = false;
1442
1.01M
  WriteBatch batch;
1443
1.01M
  std::unique_ptr<WriteBatch> new_batch;
1444
1.01M
  WriteBatch* batch_to_use = nullptr;
1445
1446
1.01M
  if (record.size() < WriteBatchInternal::kHeader) {
1447
0
    reporter->Corruption(record.size(),
1448
0
                         Status::Corruption("log record too small"));
1449
0
    assert(process_status.ok());
1450
0
    return process_status;
1451
0
  }
1452
1453
1.01M
  process_status = InitializeWriteBatchForLogRecord(
1454
1.01M
      record, reader, running_ts_sz, &batch, new_batch, batch_to_use,
1455
1.01M
      record_checksum);
1456
1.01M
  if (!process_status.ok()) {
1457
0
    return process_status;
1458
0
  }
1459
1.01M
  assert(batch_to_use);
1460
1461
1.01M
  *last_seqno_observed = WriteBatchInternal::Sequence(batch_to_use);
1462
1463
1.01M
  if (*last_seqno_observed > kMaxSequenceNumber) {
1464
0
    reporter->Corruption(
1465
0
        record.size(),
1466
0
        Status::Corruption("sequence " + std::to_string(*last_seqno_observed) +
1467
0
                           " is too large"));
1468
0
    assert(process_status.ok());
1469
0
    return process_status;
1470
0
  }
1471
1472
1.01M
  MaybeReviseStopReplayForCorruption(*last_seqno_observed, next_sequence,
1473
1.01M
                                     stop_replay_for_corruption);
1474
1.01M
  if (*stop_replay_for_corruption) {
1475
0
    logFileDropped();
1476
0
    assert(process_status.ok());
1477
0
    return process_status;
1478
0
  }
1479
1480
  // For the default case of wal_filter == nullptr, always performs no-op
1481
  // and returns true.
1482
1.01M
  if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, *reporter, *status,
1483
1.01M
                                          *stop_replay_by_wal_filter,
1484
1.01M
                                          *batch_to_use)) {
1485
0
    assert(process_status.ok());
1486
0
    return process_status;
1487
1.01M
  } else {
1488
    // FIXME(hx235): Handle the potential non-okay `status` when
1489
    // `InvokeWalFilterIfNeededOnWalRecord()` returns true
1490
1.01M
    status->PermitUncheckedError();
1491
1.01M
  }
1492
1493
1.01M
  assert(process_status.ok());
1494
1.01M
  process_status = InsertLogRecordToMemtable(batch_to_use, wal_number,
1495
1.01M
                                             next_sequence, &has_valid_writes);
1496
1.01M
  MaybeIgnoreError(&process_status);
1497
  // We are treating this as a failure while reading since we read valid
1498
  // blocks that do not form coherent data
1499
1.01M
  if (!process_status.ok()) {
1500
    // FIXME(hx235): `reporter->Corruption()` will override the non-ok status
1501
    // set in `InvokeWalFilterIfNeededOnWalRecord` through passing `*status`
1502
0
    reporter->Corruption(record.size(), process_status);
1503
0
    process_status = Status::OK();
1504
0
    return process_status;
1505
0
  }
1506
1507
1.01M
  process_status = MaybeWriteLevel0TableForRecovery(
1508
1.01M
      has_valid_writes, read_only, wal_number, job_id, next_sequence,
1509
1.01M
      version_edits, flushed);
1510
1511
1.01M
  return process_status;
1512
1.01M
}
1513
1514
// We create a new batch and initialize with a valid prot_info_ to store
1515
// the data checksum
1516
Status DBImpl::InitializeWriteBatchForLogRecord(
1517
    Slice record, const std::unique_ptr<log::Reader>& reader,
1518
    const UnorderedMap<uint32_t, size_t>& running_ts_sz, WriteBatch* batch,
1519
    std::unique_ptr<WriteBatch>& new_batch, WriteBatch*& batch_to_use,
1520
1.01M
    uint64_t* record_checksum) {
1521
1.01M
  assert(batch);
1522
1.01M
  assert(record_checksum);
1523
1524
1.01M
  Status status = WriteBatchInternal::SetContents(batch, record);
1525
1.01M
  if (!status.ok()) {
1526
0
    return status;
1527
0
  }
1528
1529
1.01M
  const UnorderedMap<uint32_t, size_t>& record_ts_sz =
1530
1.01M
      reader->GetRecordedTimestampSize();
1531
1.01M
  status = HandleWriteBatchTimestampSizeDifference(
1532
1.01M
      batch, running_ts_sz, record_ts_sz,
1533
1.01M
      TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
1534
1.01M
      batch_per_txn_, &new_batch);
1535
1.01M
  if (!status.ok()) {
1536
0
    return status;
1537
0
  }
1538
1539
1.01M
  bool batch_updated = new_batch != nullptr;
1540
1.01M
  batch_to_use = batch_updated ? new_batch.get() : batch;
1541
1.01M
  TEST_SYNC_POINT_CALLBACK(
1542
1.01M
      "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", batch_to_use);
1543
1.01M
  TEST_SYNC_POINT_CALLBACK(
1544
1.01M
      "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
1545
1.01M
      record_checksum);
1546
1.01M
  status = WriteBatchInternal::UpdateProtectionInfo(
1547
1.01M
      batch_to_use, 8 /* bytes_per_key */,
1548
1.01M
      batch_updated ? nullptr : record_checksum);
1549
1550
1.01M
  return status;
1551
1.01M
}
1552
1553
void DBImpl::MaybeReviseStopReplayForCorruption(
1554
    SequenceNumber sequence, SequenceNumber const* const next_sequence,
1555
1.01M
    bool* stop_replay_for_corruption) {
1556
1.01M
  if (immutable_db_options_.wal_recovery_mode ==
1557
1.01M
      WALRecoveryMode::kPointInTimeRecovery) {
1558
1.01M
    assert(next_sequence);
1559
1.01M
    assert(stop_replay_for_corruption);
1560
    // In point-in-time recovery mode, if sequence id of log files are
1561
    // consecutive, we continue recovery despite corruption. This could
1562
    // happen when we open and write to a corrupted DB, where sequence id
1563
    // will start from the last sequence id we recovered.
1564
1.01M
    if (sequence == *next_sequence) {
1565
914k
      *stop_replay_for_corruption = false;
1566
914k
    }
1567
1.01M
  }
1568
1.01M
}
1569
1570
Status DBImpl::InsertLogRecordToMemtable(WriteBatch* batch_to_use,
1571
                                         uint64_t wal_number,
1572
                                         SequenceNumber* next_sequence,
1573
1.01M
                                         bool* has_valid_writes) {
1574
  // If column family was not found, it might mean that the WAL write
1575
  // batch references to the column family that was dropped after the
1576
  // insert. We don't want to fail the whole write batch in that case --
1577
  // we just ignore the update.
1578
  // That's why we set ignore missing column families to true
1579
1.01M
  assert(batch_to_use);
1580
1.01M
  assert(has_valid_writes);
1581
1.01M
  Status status = WriteBatchInternal::InsertInto(
1582
1.01M
      batch_to_use, column_family_memtables_.get(), &flush_scheduler_,
1583
1.01M
      &trim_history_scheduler_, true, wal_number, this,
1584
1.01M
      false /* concurrent_memtable_writes */, next_sequence, has_valid_writes,
1585
1.01M
      seq_per_batch_, batch_per_txn_);
1586
1.01M
  return status;
1587
1.01M
}
1588
1589
Status DBImpl::MaybeWriteLevel0TableForRecovery(
1590
    bool has_valid_writes, bool read_only, uint64_t wal_number, int job_id,
1591
    SequenceNumber const* const next_sequence,
1592
1.01M
    std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) {
1593
1.01M
  assert(next_sequence);
1594
1.01M
  assert(version_edits);
1595
1.01M
  assert(flushed);
1596
1597
1.01M
  Status status;
1598
1.01M
  if (has_valid_writes && !read_only) {
1599
    // we can do this because this is called before client has access to the
1600
    // DB and there is only a single thread operating on DB
1601
997k
    ColumnFamilyData* cfd;
1602
1603
997k
    while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
1604
0
      cfd->UnrefAndTryDelete();
1605
      // If this asserts, it means that InsertInto failed in
1606
      // filtering updates to already-flushed column families
1607
0
      assert(cfd->GetLogNumber() <= wal_number);
1608
0
      (void)wal_number;
1609
0
      auto iter = version_edits->find(cfd->GetID());
1610
0
      assert(iter != version_edits->end());
1611
0
      VersionEdit* edit = &iter->second;
1612
0
      status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
1613
0
      if (!status.ok()) {
1614
        // Reflect errors immediately so that conditions like full
1615
        // file-systems cause the DB::Open() to fail.
1616
0
        return status;
1617
0
      }
1618
0
      *flushed = true;
1619
1620
0
      cfd->CreateNewMemtable(*next_sequence - 1);
1621
0
    }
1622
997k
  }
1623
1.01M
  return status;
1624
1.01M
}
1625
1626
Status DBImpl::HandleNonOkStatusOrOldLogRecord(
1627
    uint64_t wal_number, SequenceNumber const* const next_sequence,
1628
    Status status, const DBOpenLogRecordReadReporter& reporter,
1629
    bool* old_log_record, bool* stop_replay_for_corruption,
1630
0
    uint64_t* corrupted_wal_number, bool* corrupted_wal_found) {
1631
0
  assert(!status.ok() || *old_log_record);
1632
1633
0
  assert(next_sequence);
1634
0
  assert(old_log_record);
1635
0
  assert(stop_replay_for_corruption);
1636
0
  assert(corrupted_wal_number);
1637
1638
0
  if (status.IsNotSupported()) {
1639
    // We should not treat NotSupported as corruption. It is rather a clear
1640
    // sign that we are processing a WAL that is produced by an incompatible
1641
    // version of the code.
1642
0
    return status;
1643
0
  }
1644
1645
0
  if (immutable_db_options_.wal_recovery_mode ==
1646
0
      WALRecoveryMode::kSkipAnyCorruptedRecords) {
1647
    // We should ignore all errors unconditionally
1648
0
    return Status::OK();
1649
0
  } else if (immutable_db_options_.wal_recovery_mode ==
1650
0
             WALRecoveryMode::kPointInTimeRecovery) {
1651
0
    if (status.IsIOError()) {
1652
0
      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
1653
0
                      "IOError during point-in-time reading log #%" PRIu64
1654
0
                      " seq #%" PRIu64
1655
0
                      ". %s. This likely mean loss of synced WAL, "
1656
0
                      "thus recovery fails.",
1657
0
                      wal_number, *next_sequence, status.ToString().c_str());
1658
0
      return status;
1659
0
    }
1660
    // We should ignore the error but not continue replaying
1661
0
    *old_log_record = false;
1662
0
    *stop_replay_for_corruption = true;
1663
    // TODO(hx235): have a single source of corrupted WAL number once we
1664
    // consolidate the statuses
1665
0
    uint64_t reporter_corrupted_wal_number = reporter.GetCorruptedLogNumber();
1666
0
    *corrupted_wal_number = reporter_corrupted_wal_number != kMaxSequenceNumber
1667
0
                                ? reporter_corrupted_wal_number
1668
0
                                : wal_number;
1669
0
    if (corrupted_wal_found != nullptr) {
1670
0
      *corrupted_wal_found = true;
1671
0
    }
1672
0
    return Status::OK();
1673
0
  } else {
1674
0
    assert(immutable_db_options_.wal_recovery_mode ==
1675
0
               WALRecoveryMode::kTolerateCorruptedTailRecords ||
1676
0
           immutable_db_options_.wal_recovery_mode ==
1677
0
               WALRecoveryMode::kAbsoluteConsistency);
1678
0
    return status;
1679
0
  }
1680
0
}
1681
1682
Status DBImpl::UpdatePredecessorWALInfo(
1683
    uint64_t wal_number, const SequenceNumber last_seqno_observed,
1684
48.1k
    const std::string& fname, PredecessorWALInfo& predecessor_wal_info) {
1685
48.1k
  uint64_t bytes;
1686
1687
48.1k
  Status s = env_->GetFileSize(fname, &bytes);
1688
48.1k
  if (!s.ok()) {
1689
0
    return s;
1690
0
  }
1691
1692
48.1k
  SequenceNumber mock_seqno = kMaxSequenceNumber;
1693
48.1k
  [[maybe_unused]] std::pair<uint64_t, SequenceNumber*> pair =
1694
48.1k
      std::make_pair(wal_number, &mock_seqno);
1695
48.1k
  TEST_SYNC_POINT_CALLBACK("DBImpl::UpdatePredecessorWALInfo", &pair);
1696
48.1k
  predecessor_wal_info = PredecessorWALInfo(
1697
48.1k
      wal_number, bytes,
1698
48.1k
      mock_seqno != kMaxSequenceNumber ? mock_seqno : last_seqno_observed);
1699
1700
48.1k
  return s;
1701
48.1k
}
1702
1703
void DBImpl::FinishLogFileProcessing(const Status& status,
1704
48.1k
                                     const SequenceNumber* next_sequence) {
1705
48.1k
  if (status.ok()) {
1706
48.1k
    assert(next_sequence);
1707
48.1k
    flush_scheduler_.Clear();
1708
48.1k
    trim_history_scheduler_.Clear();
1709
48.1k
    auto last_sequence = *next_sequence - 1;
1710
48.1k
    if ((*next_sequence != kMaxSequenceNumber) &&
1711
48.1k
        (versions_->LastSequence() <= last_sequence)) {
1712
36.1k
      versions_->SetLastAllocatedSequence(last_sequence);
1713
36.1k
      versions_->SetLastPublishedSequence(last_sequence);
1714
36.1k
      versions_->SetLastSequence(last_sequence);
1715
36.1k
    }
1716
48.1k
  }
1717
48.1k
}
1718
1719
Status DBImpl::MaybeHandleStopReplayForCorruptionForInconsistency(
1720
48.1k
    bool stop_replay_for_corruption, uint64_t corrupted_wal_number) {
1721
48.1k
  Status status;
1722
1723
  // Compare the corrupted log number to all columnfamily's current log number.
1724
  // Abort Open() if any column family's log number is greater than
1725
  // the corrupted log number, which means CF contains data beyond the point of
1726
  // corruption. This could during PIT recovery when the WAL is corrupted and
1727
  // some (but not all) CFs are flushed
1728
  // Exclude the PIT case where no log is dropped after the corruption point.
1729
  // This is to cover the case for empty wals after corrupted log, in which we
1730
  // don't reset stop_replay_for_corruption.
1731
48.1k
  if (stop_replay_for_corruption == true &&
1732
48.1k
      (immutable_db_options_.wal_recovery_mode ==
1733
0
           WALRecoveryMode::kPointInTimeRecovery ||
1734
0
       immutable_db_options_.wal_recovery_mode ==
1735
0
           WALRecoveryMode::kTolerateCorruptedTailRecords)) {
1736
0
    for (auto cfd : *versions_->GetColumnFamilySet()) {
1737
      // One special case cause cfd->GetLogNumber() > corrupted_wal_number but
1738
      // the CF is still consistent: If a new column family is created during
1739
      // the flush and the WAL sync fails at the same time, the new CF points to
1740
      // the new WAL but the old WAL is curropted. Since the new CF is empty, it
1741
      // is still consistent. We add the check of CF sst file size to avoid the
1742
      // false positive alert.
1743
1744
      // Note that, the check of (cfd->GetLiveSstFilesSize() > 0) may leads to
1745
      // the ignorance of a very rare inconsistency case caused in data
1746
      // canclation. One CF is empty due to KV deletion. But those operations
1747
      // are in the WAL. If the WAL is corrupted, the status of this CF might
1748
      // not be consistent with others. However, the consistency check will be
1749
      // bypassed due to empty CF.
1750
      // TODO: a better and complete implementation is needed to ensure strict
1751
      // consistency check in WAL recovery including hanlding the tailing
1752
      // issues.
1753
0
      if (cfd->GetLogNumber() > corrupted_wal_number &&
1754
0
          cfd->GetLiveSstFilesSize() > 0) {
1755
0
        ROCKS_LOG_ERROR(immutable_db_options_.info_log,
1756
0
                        "Column family inconsistency: SST file contains data"
1757
0
                        " beyond the point of corruption.");
1758
0
        status = Status::Corruption("SST file is ahead of WALs in CF " +
1759
0
                                    cfd->GetName());
1760
0
        return status;
1761
0
      }
1762
0
    }
1763
0
  }
1764
48.1k
  return status;
1765
48.1k
}
1766
1767
Status DBImpl::MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
1768
    const std::vector<uint64_t>& wal_numbers, bool read_only, int job_id,
1769
    bool flushed, std::unordered_map<int, VersionEdit>* version_edits,
1770
48.1k
    RecoveryContext* recovery_ctx) {
1771
48.1k
  assert(version_edits);
1772
1773
48.1k
  Status status;
1774
  // True if there's any data in the WALs; if not, we can skip re-processing
1775
  // them later
1776
48.1k
  bool data_seen = false;
1777
48.1k
  if (!read_only) {
1778
    // no need to refcount since client still doesn't have access
1779
    // to the DB and can not drop column families while we iterate
1780
48.1k
    const WalNumber max_wal_number = wal_numbers.back();
1781
70.1k
    for (auto cfd : *versions_->GetColumnFamilySet()) {
1782
70.1k
      auto iter = version_edits->find(cfd->GetID());
1783
70.1k
      assert(iter != version_edits->end());
1784
70.1k
      VersionEdit* edit = &iter->second;
1785
1786
70.1k
      if (cfd->GetLogNumber() > max_wal_number) {
1787
        // Column family cfd has already flushed the data
1788
        // from all wals. Memtable has to be empty because
1789
        // we filter the updates based on wal_number
1790
        // (in WriteBatch::InsertInto)
1791
0
        assert(cfd->mem()->GetFirstSequenceNumber() == 0);
1792
0
        assert(edit->NumEntries() == 0);
1793
0
        continue;
1794
0
      }
1795
1796
70.1k
      TEST_SYNC_POINT_CALLBACK(
1797
70.1k
          "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
1798
1799
      // flush the final memtable (if non-empty)
1800
70.1k
      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
1801
        // If flush happened in the middle of recovery (e.g. due to memtable
1802
        // being full), we flush at the end. Otherwise we'll need to record
1803
        // where we were on last flush, which make the logic complicated.
1804
20.0k
        if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
1805
20.0k
          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
1806
20.0k
          if (!status.ok()) {
1807
            // Recovery failed
1808
0
            break;
1809
0
          }
1810
20.0k
          flushed = true;
1811
1812
20.0k
          cfd->CreateNewMemtable(versions_->LastSequence());
1813
20.0k
        }
1814
20.0k
        data_seen = true;
1815
20.0k
      }
1816
1817
      // Update the log number info in the version edit corresponding to this
1818
      // column family. Note that the version edits will be written to MANIFEST
1819
      // together later.
1820
      // writing wal_number in the manifest means that any log file
1821
      // with number strongly less than (wal_number + 1) is already
1822
      // recovered and should be ignored on next reincarnation.
1823
      // Since we already recovered max_wal_number, we want all wals
1824
      // with numbers `<= max_wal_number` (includes this one) to be ignored
1825
70.1k
      if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
1826
70.1k
        edit->SetLogNumber(max_wal_number + 1);
1827
70.1k
      }
1828
70.1k
    }
1829
48.1k
    if (status.ok()) {
1830
      // we must mark the next log number as used, even though it's
1831
      // not actually used. that is because VersionSet assumes
1832
      // VersionSet::next_file_number_ always to be strictly greater than any
1833
      // log number
1834
48.1k
      versions_->MarkFileNumberUsed(max_wal_number + 1);
1835
48.1k
      assert(recovery_ctx != nullptr);
1836
1837
70.1k
      for (auto* cfd : *versions_->GetColumnFamilySet()) {
1838
70.1k
        auto iter = version_edits->find(cfd->GetID());
1839
70.1k
        assert(iter != version_edits->end());
1840
70.1k
        recovery_ctx->UpdateVersionEdits(cfd, iter->second);
1841
70.1k
      }
1842
1843
48.1k
      if (flushed || !data_seen) {
1844
48.1k
        VersionEdit wal_deletion;
1845
48.1k
        if (immutable_db_options_.track_and_verify_wals_in_manifest) {
1846
0
          wal_deletion.DeleteWalsBefore(max_wal_number + 1);
1847
0
        }
1848
48.1k
        if (!allow_2pc()) {
1849
          // In non-2pc mode, flushing the memtables of the column families
1850
          // means we can advance min_log_number_to_keep.
1851
48.1k
          wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
1852
48.1k
        }
1853
48.1k
        assert(versions_->GetColumnFamilySet() != nullptr);
1854
48.1k
        recovery_ctx->UpdateVersionEdits(
1855
48.1k
            versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
1856
48.1k
      }
1857
48.1k
    }
1858
48.1k
  }
1859
1860
48.1k
  if (status.ok()) {
1861
48.1k
    if (data_seen && !flushed) {
1862
0
      status = RestoreAliveLogFiles(wal_numbers);
1863
48.1k
    } else if (!wal_numbers.empty()) {  // If there's no data in the WAL, or we
1864
                                        // flushed all the data, still
1865
      // truncate the log file. If the process goes into a crash loop before
1866
      // the file is deleted, the preallocated space will never get freed.
1867
48.1k
      const bool truncate = !read_only;
1868
48.1k
      GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr)
1869
48.1k
          .PermitUncheckedError();
1870
48.1k
    }
1871
48.1k
  }
1872
48.1k
  return status;
1873
48.1k
}
1874
1875
Status DBImpl::CheckSeqnoNotSetBackDuringRecovery(
1876
1.06M
    SequenceNumber prev_next_seqno, SequenceNumber current_next_seqno) {
1877
1.06M
  if (prev_next_seqno == kMaxSequenceNumber ||
1878
1.06M
      prev_next_seqno <= current_next_seqno) {
1879
1.06M
    return Status::OK();
1880
1.06M
  }
1881
0
  std::string msg =
1882
0
      "Sequence number is being set backwards during recovery, this is likely "
1883
0
      "a software bug or a data corruption. Prev next seqno: " +
1884
0
      std::to_string(prev_next_seqno) +
1885
0
      " , current next seqno: " + std::to_string(current_next_seqno);
1886
0
  return Status::Corruption(msg);
1887
1.06M
}
1888
1889
48.1k
void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) {
1890
48.1k
  event_logger_.Log() << "job" << job_id << "event"
1891
48.1k
                      << (status.ok() ? "recovery_finished" : "recovery_failed")
1892
48.1k
                      << "status" << status.ToString();
1893
48.1k
}
1894
1895
Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
1896
48.1k
                                          WalFileNumberSize* log_ptr) {
1897
48.1k
  WalFileNumberSize log(wal_number);
1898
48.1k
  std::string fname =
1899
48.1k
      LogFileName(immutable_db_options_.GetWalDir(), wal_number);
1900
48.1k
  Status s;
1901
  // This gets the appear size of the wals, not including preallocated space.
1902
48.1k
  s = env_->GetFileSize(fname, &log.size);
1903
48.1k
  TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
1904
48.1k
  if (s.ok() && truncate) {
1905
48.1k
    std::unique_ptr<FSWritableFile> last_log;
1906
48.1k
    Status truncate_status = fs_->ReopenWritableFile(
1907
48.1k
        fname,
1908
48.1k
        fs_->OptimizeForLogWrite(
1909
48.1k
            file_options_,
1910
48.1k
            BuildDBOptions(immutable_db_options_, mutable_db_options_)),
1911
48.1k
        &last_log, nullptr);
1912
48.1k
    if (truncate_status.ok()) {
1913
48.1k
      truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
1914
48.1k
    }
1915
48.1k
    if (truncate_status.ok()) {
1916
48.1k
      truncate_status = last_log->Close(IOOptions(), nullptr);
1917
48.1k
    }
1918
    // Not a critical error if fail to truncate.
1919
48.1k
    if (!truncate_status.ok() && !truncate_status.IsNotSupported()) {
1920
0
      ROCKS_LOG_WARN(immutable_db_options_.info_log,
1921
0
                     "Failed to truncate log #%" PRIu64 ": %s", wal_number,
1922
0
                     truncate_status.ToString().c_str());
1923
0
    }
1924
48.1k
  }
1925
48.1k
  if (log_ptr) {
1926
0
    *log_ptr = log;
1927
0
  }
1928
48.1k
  return s;
1929
48.1k
}
1930
1931
0
Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
1932
0
  if (wal_numbers.empty()) {
1933
0
    return Status::OK();
1934
0
  }
1935
0
  Status s;
1936
0
  mutex_.AssertHeld();
1937
0
  assert(immutable_db_options_.avoid_flush_during_recovery);
1938
  // Mark these as alive so they'll be considered for deletion later by
1939
  // FindObsoleteFiles()
1940
0
  wals_total_size_.StoreRelaxed(0);
1941
0
  wal_empty_ = false;
1942
0
  uint64_t min_wal_with_unflushed_data =
1943
0
      versions_->MinLogNumberWithUnflushedData();
1944
0
  for (auto wal_number : wal_numbers) {
1945
0
    if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
1946
      // In non-2pc mode, the WAL files not backing unflushed data are not
1947
      // alive, thus should not be added to the alive_wal_files_.
1948
0
      continue;
1949
0
    }
1950
    // We preallocate space for wals, but then after a crash and restart, those
1951
    // preallocated space are not needed anymore. It is likely only the last
1952
    // log has such preallocated space, so we only truncate for the last log.
1953
0
    WalFileNumberSize log;
1954
0
    s = GetLogSizeAndMaybeTruncate(
1955
0
        wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
1956
0
    if (!s.ok()) {
1957
0
      break;
1958
0
    }
1959
0
    wals_total_size_.FetchAddRelaxed(log.size);
1960
0
    alive_wal_files_.push_back(log);
1961
0
  }
1962
0
  return s;
1963
0
}
1964
1965
Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1966
20.0k
                                           MemTable* mem, VersionEdit* edit) {
1967
20.0k
  mutex_.AssertHeld();
1968
20.0k
  assert(cfd);
1969
20.0k
  assert(cfd->imm());
1970
  // The immutable memtable list must be empty.
1971
20.0k
  assert(std::numeric_limits<uint64_t>::max() ==
1972
20.0k
         cfd->imm()->GetEarliestMemTableID());
1973
1974
20.0k
  const uint64_t start_micros = immutable_db_options_.clock->NowMicros();
1975
1976
20.0k
  FileMetaData meta;
1977
20.0k
  std::vector<BlobFileAddition> blob_file_additions;
1978
1979
20.0k
  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
1980
20.0k
      new std::list<uint64_t>::iterator(
1981
20.0k
          CaptureCurrentFileNumberInPendingOutputs()));
1982
20.0k
  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
1983
20.0k
  ReadOptions ro;
1984
20.0k
  ro.total_order_seek = true;
1985
20.0k
  ro.io_activity = Env::IOActivity::kDBOpen;
1986
20.0k
  Arena arena;
1987
20.0k
  Status s;
1988
20.0k
  TableProperties table_properties;
1989
20.0k
  const auto* ucmp = cfd->internal_comparator().user_comparator();
1990
20.0k
  assert(ucmp);
1991
20.0k
  const size_t ts_sz = ucmp->timestamp_size();
1992
20.0k
  const bool logical_strip_timestamp =
1993
20.0k
      ts_sz > 0 && !cfd->ioptions().persist_user_defined_timestamps;
1994
  // Note that here we treat flush as level 0 compaction in internal stats
1995
20.0k
  InternalStats::CompactionStats flush_stats(CompactionReason::kFlush,
1996
20.0k
                                             1 /* count */);
1997
20.0k
  {
1998
20.0k
    ScopedArenaPtr<InternalIterator> iter(
1999
20.0k
        logical_strip_timestamp
2000
20.0k
            ? mem->NewTimestampStrippingIterator(
2001
0
                  ro, /*seqno_to_time_mapping=*/nullptr, &arena,
2002
0
                  /*prefix_extractor=*/nullptr, ts_sz)
2003
20.0k
            : mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
2004
20.0k
                               /*prefix_extractor=*/nullptr,
2005
20.0k
                               /*for_flush=*/true));
2006
20.0k
    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
2007
20.0k
                    "[%s] [WriteLevel0TableForRecovery]"
2008
20.0k
                    " Level-0 table #%" PRIu64 ": started",
2009
20.0k
                    cfd->GetName().c_str(), meta.fd.GetNumber());
2010
2011
    // Get the latest mutable cf options while the mutex is still locked
2012
20.0k
    const MutableCFOptions mutable_cf_options_copy =
2013
20.0k
        cfd->GetLatestMutableCFOptions();
2014
20.0k
    bool paranoid_file_checks =
2015
20.0k
        cfd->GetLatestMutableCFOptions().paranoid_file_checks;
2016
2017
20.0k
    int64_t _current_time = 0;
2018
20.0k
    immutable_db_options_.clock->GetCurrentTime(&_current_time)
2019
20.0k
        .PermitUncheckedError();  // ignore error
2020
20.0k
    const uint64_t current_time = static_cast<uint64_t>(_current_time);
2021
20.0k
    meta.oldest_ancester_time = current_time;
2022
20.0k
    meta.epoch_number = cfd->NewEpochNumber();
2023
20.0k
    {
2024
20.0k
      auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint(
2025
20.0k
          /*level=*/0,
2026
20.0k
          immutable_db_options_.calculate_sst_write_lifetime_hint_set);
2027
20.0k
      mutex_.Unlock();
2028
2029
20.0k
      SequenceNumber earliest_write_conflict_snapshot;
2030
20.0k
      std::vector<SequenceNumber> snapshot_seqs =
2031
20.0k
          snapshots_.GetAll(&earliest_write_conflict_snapshot);
2032
20.0k
      SequenceNumber earliest_snapshot =
2033
20.0k
          (snapshot_seqs.empty() ? kMaxSequenceNumber : snapshot_seqs.at(0));
2034
20.0k
      auto snapshot_checker = snapshot_checker_.get();
2035
20.0k
      if (use_custom_gc_ && snapshot_checker == nullptr) {
2036
0
        snapshot_checker = DisableGCSnapshotChecker::Instance();
2037
0
      }
2038
20.0k
      std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
2039
20.0k
          range_del_iters;
2040
20.0k
      auto range_del_iter =
2041
20.0k
          logical_strip_timestamp
2042
20.0k
              ? mem->NewTimestampStrippingRangeTombstoneIterator(
2043
0
                    ro, kMaxSequenceNumber, ts_sz)
2044
              // This is called during recovery, where a live memtable is
2045
              // flushed directly. In this case, no fragmented tombstone list is
2046
              // cached in this memtable yet.
2047
20.0k
              : mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
2048
20.0k
                                               false /* immutable_memtable */);
2049
20.0k
      if (range_del_iter != nullptr) {
2050
3.72k
        range_del_iters.emplace_back(range_del_iter);
2051
3.72k
      }
2052
2053
20.0k
      IOStatus io_s;
2054
20.0k
      const ReadOptions read_option(Env::IOActivity::kDBOpen);
2055
20.0k
      const WriteOptions write_option(Env::IO_HIGH, Env::IOActivity::kDBOpen);
2056
2057
20.0k
      TableBuilderOptions tboptions(
2058
20.0k
          cfd->ioptions(), mutable_cf_options_copy, read_option, write_option,
2059
20.0k
          cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(),
2060
20.0k
          GetCompressionFlush(cfd->ioptions(), mutable_cf_options_copy),
2061
20.0k
          mutable_cf_options_copy.compression_opts, cfd->GetID(),
2062
20.0k
          cfd->GetName(), 0 /* level */, current_time /* newest_key_time */,
2063
20.0k
          false /* is_bottommost */, TableFileCreationReason::kRecovery,
2064
20.0k
          0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
2065
20.0k
          db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(),
2066
20.0k
          kMaxSequenceNumber);
2067
20.0k
      Version* version = cfd->current();
2068
20.0k
      version->Ref();
2069
20.0k
      TableProperties temp_table_proerties;
2070
20.0k
      s = BuildTable(
2071
20.0k
          dbname_, versions_.get(), immutable_db_options_, tboptions,
2072
20.0k
          file_options_for_compaction_, cfd->table_cache(), iter.get(),
2073
20.0k
          std::move(range_del_iters), &meta, &blob_file_additions,
2074
20.0k
          snapshot_seqs, earliest_snapshot, earliest_write_conflict_snapshot,
2075
20.0k
          kMaxSequenceNumber, snapshot_checker, paranoid_file_checks,
2076
20.0k
          cfd->internal_stats(), &io_s, io_tracer_,
2077
20.0k
          BlobFileCreationReason::kRecovery,
2078
20.0k
          nullptr /* seqno_to_time_mapping */, &event_logger_, job_id,
2079
20.0k
          &temp_table_proerties /* table_properties */, write_hint,
2080
20.0k
          nullptr /*full_history_ts_low*/, &blob_callback_, version,
2081
20.0k
          nullptr /* memtable_payload_bytes */,
2082
20.0k
          nullptr /* memtable_garbage_bytes */, &flush_stats);
2083
20.0k
      version->Unref();
2084
20.0k
      LogFlush(immutable_db_options_.info_log);
2085
20.0k
      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
2086
20.0k
                      "[%s] [WriteLevel0TableForRecovery]"
2087
20.0k
                      " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
2088
20.0k
                      cfd->GetName().c_str(), meta.fd.GetNumber(),
2089
20.0k
                      meta.fd.GetFileSize(), s.ToString().c_str());
2090
20.0k
      mutex_.Lock();
2091
2092
      // TODO(AR) is this ok?
2093
20.0k
      if (!io_s.ok() && s.ok()) {
2094
0
        s = io_s;
2095
0
      }
2096
2097
20.0k
      uint64_t total_num_entries = mem->NumEntries();
2098
20.0k
      if (s.ok() && total_num_entries != flush_stats.num_input_records) {
2099
0
        std::string msg = "Expected " + std::to_string(total_num_entries) +
2100
0
                          " entries in memtable, but read " +
2101
0
                          std::to_string(flush_stats.num_input_records);
2102
0
        ROCKS_LOG_WARN(immutable_db_options_.info_log,
2103
0
                       "[%s] [JOB %d] Level-0 flush during recover: %s",
2104
0
                       cfd->GetName().c_str(), job_id, msg.c_str());
2105
0
        if (immutable_db_options_.flush_verify_memtable_count) {
2106
0
          s = Status::Corruption(msg);
2107
0
        }
2108
0
      }
2109
      // Only verify on table with format collects table properties
2110
20.0k
      const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
2111
20.0k
      if (s.ok() &&
2112
20.0k
          (mutable_cf_options.table_factory->IsInstanceOf(
2113
20.0k
               TableFactory::kBlockBasedTableName()) ||
2114
20.0k
           mutable_cf_options.table_factory->IsInstanceOf(
2115
0
               TableFactory::kPlainTableName())) &&
2116
20.0k
          flush_stats.num_output_records != temp_table_proerties.num_entries) {
2117
0
        std::string msg =
2118
0
            "Number of keys in flush output SST files does not match "
2119
0
            "number of keys added to the table. Expected " +
2120
0
            std::to_string(flush_stats.num_output_records) + " but there are " +
2121
0
            std::to_string(temp_table_proerties.num_entries) +
2122
0
            " in output SST files";
2123
0
        ROCKS_LOG_WARN(immutable_db_options_.info_log,
2124
0
                       "[%s] [JOB %d] Level-0 flush during recover: %s",
2125
0
                       cfd->GetName().c_str(), job_id, msg.c_str());
2126
0
        if (immutable_db_options_.flush_verify_memtable_count) {
2127
0
          s = Status::Corruption(msg);
2128
0
        }
2129
0
      }
2130
20.0k
    }
2131
20.0k
  }
2132
20.0k
  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
2133
2134
  // Note that if file_size is zero, the file has been deleted and
2135
  // should not be added to the manifest.
2136
20.0k
  const bool has_output = meta.fd.GetFileSize() > 0;
2137
2138
20.0k
  constexpr int level = 0;
2139
2140
20.0k
  if (s.ok() && has_output) {
2141
20.0k
    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
2142
20.0k
                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
2143
20.0k
                  meta.fd.smallest_seqno, meta.fd.largest_seqno,
2144
20.0k
                  meta.marked_for_compaction, meta.temperature,
2145
20.0k
                  meta.oldest_blob_file_number, meta.oldest_ancester_time,
2146
20.0k
                  meta.file_creation_time, meta.epoch_number,
2147
20.0k
                  meta.file_checksum, meta.file_checksum_func_name,
2148
20.0k
                  meta.unique_id, meta.compensated_range_deletion_size,
2149
20.0k
                  meta.tail_size, meta.user_defined_timestamps_persisted);
2150
2151
20.0k
    for (const auto& blob : blob_file_additions) {
2152
0
      edit->AddBlobFile(blob);
2153
0
    }
2154
2155
    // For UDT in memtable only feature, move up the cutoff timestamp whenever
2156
    // a flush happens.
2157
20.0k
    if (logical_strip_timestamp) {
2158
0
      Slice mem_newest_udt = mem->GetNewestUDT();
2159
0
      std::string full_history_ts_low = cfd->GetFullHistoryTsLow();
2160
0
      if (full_history_ts_low.empty() ||
2161
0
          ucmp->CompareTimestamp(mem_newest_udt, full_history_ts_low) >= 0) {
2162
0
        std::string new_full_history_ts_low;
2163
0
        GetFullHistoryTsLowFromU64CutoffTs(&mem_newest_udt,
2164
0
                                           &new_full_history_ts_low);
2165
0
        edit->SetFullHistoryTsLow(new_full_history_ts_low);
2166
0
      }
2167
0
    }
2168
20.0k
  }
2169
2170
20.0k
  flush_stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
2171
2172
20.0k
  if (has_output) {
2173
20.0k
    flush_stats.bytes_written = meta.fd.GetFileSize();
2174
20.0k
    flush_stats.num_output_files = 1;
2175
20.0k
  }
2176
2177
20.0k
  const auto& blobs = edit->GetBlobFileAdditions();
2178
20.0k
  for (const auto& blob : blobs) {
2179
0
    flush_stats.bytes_written_blob += blob.GetTotalBlobBytes();
2180
0
  }
2181
2182
20.0k
  flush_stats.num_output_files_blob = static_cast<int>(blobs.size());
2183
2184
20.0k
  cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER,
2185
20.0k
                                            flush_stats);
2186
20.0k
  cfd->internal_stats()->AddCFStats(
2187
20.0k
      InternalStats::BYTES_FLUSHED,
2188
20.0k
      flush_stats.bytes_written + flush_stats.bytes_written_blob);
2189
20.0k
  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
2190
20.0k
  return s;
2191
20.0k
}
2192
2193
Status DB::Open(const Options& options, const std::string& dbname,
2194
36.3k
                std::unique_ptr<DB>* dbptr) {
2195
36.3k
  DBOptions db_options(options);
2196
36.3k
  ColumnFamilyOptions cf_options(options);
2197
36.3k
  std::vector<ColumnFamilyDescriptor> column_families;
2198
36.3k
  column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
2199
36.3k
  if (db_options.persist_stats_to_disk) {
2200
0
    column_families.emplace_back(kPersistentStatsColumnFamilyName, cf_options);
2201
0
  }
2202
36.3k
  std::vector<ColumnFamilyHandle*> handles;
2203
36.3k
  Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
2204
36.3k
  if (s.ok()) {
2205
36.3k
    if (db_options.persist_stats_to_disk) {
2206
0
      assert(handles.size() == 2);
2207
36.3k
    } else {
2208
36.3k
      assert(handles.size() == 1);
2209
36.3k
    }
2210
    // i can delete the handle since DBImpl is always holding a reference to
2211
    // default column family
2212
36.3k
    if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
2213
0
      delete handles[1];
2214
0
    }
2215
36.3k
    delete handles[0];
2216
36.3k
  }
2217
36.3k
  return s;
2218
36.3k
}
2219
2220
Status DB::Open(const DBOptions& db_options, const std::string& dbname,
2221
                const std::vector<ColumnFamilyDescriptor>& column_families,
2222
                std::vector<ColumnFamilyHandle*>* handles,
2223
58.3k
                std::unique_ptr<DB>* dbptr) {
2224
58.3k
  const bool kSeqPerBatch = true;
2225
58.3k
  const bool kBatchPerTxn = true;
2226
58.3k
  ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking);
2227
58.3k
  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_DBOPEN);
2228
58.3k
  bool can_retry = false;
2229
58.3k
  Status s;
2230
58.3k
  do {
2231
58.3k
    s = DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
2232
58.3k
                     !kSeqPerBatch, kBatchPerTxn, can_retry, &can_retry);
2233
58.3k
  } while (!s.ok() && can_retry);
2234
58.3k
  ThreadStatusUtil::ResetThreadStatus();
2235
58.3k
  return s;
2236
58.3k
}
2237
2238
// TODO: Implement the trimming in flush code path.
2239
// TODO: Perform trimming before inserting into memtable during recovery.
2240
// TODO: Pick files with max_timestamp > trim_ts by each file's timestamp meta
2241
// info, and handle only these files to reduce io.
2242
Status DB::OpenAndTrimHistory(
2243
    const DBOptions& db_options, const std::string& dbname,
2244
    const std::vector<ColumnFamilyDescriptor>& column_families,
2245
    std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
2246
0
    std::string trim_ts) {
2247
0
  assert(dbptr != nullptr);
2248
0
  assert(handles != nullptr);
2249
0
  auto validate_options = [&db_options] {
2250
0
    if (db_options.avoid_flush_during_recovery) {
2251
0
      return Status::InvalidArgument(
2252
0
          "avoid_flush_during_recovery incompatible with "
2253
0
          "OpenAndTrimHistory");
2254
0
    }
2255
0
    return Status::OK();
2256
0
  };
2257
0
  auto s = validate_options();
2258
0
  if (!s.ok()) {
2259
0
    return s;
2260
0
  }
2261
2262
0
  DB* db = nullptr;
2263
0
  s = DB::Open(db_options, dbname, column_families, handles, &db);
2264
0
  if (!s.ok()) {
2265
0
    return s;
2266
0
  }
2267
0
  assert(db);
2268
0
  CompactRangeOptions options;
2269
0
  options.bottommost_level_compaction =
2270
0
      BottommostLevelCompaction::kForceOptimized;
2271
0
  auto db_impl = static_cast_with_check<DBImpl>(db);
2272
0
  for (auto handle : *handles) {
2273
0
    assert(handle != nullptr);
2274
0
    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
2275
0
    auto cfd = cfh->cfd();
2276
0
    assert(cfd != nullptr);
2277
    // Only compact column families with timestamp enabled
2278
0
    if (cfd->user_comparator() != nullptr &&
2279
0
        cfd->user_comparator()->timestamp_size() > 0) {
2280
0
      s = db_impl->CompactRangeInternal(options, handle, nullptr, nullptr,
2281
0
                                        trim_ts);
2282
0
      if (!s.ok()) {
2283
0
        break;
2284
0
      }
2285
0
    }
2286
0
  }
2287
0
  auto clean_op = [&handles, &db] {
2288
0
    for (auto handle : *handles) {
2289
0
      auto temp_s = db->DestroyColumnFamilyHandle(handle);
2290
0
      assert(temp_s.ok());
2291
0
    }
2292
0
    handles->clear();
2293
0
    delete db;
2294
0
  };
2295
0
  if (!s.ok()) {
2296
0
    clean_op();
2297
0
    return s;
2298
0
  }
2299
2300
0
  dbptr->reset(db);
2301
0
  return s;
2302
0
}
2303
2304
IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
2305
                           uint64_t log_file_num, uint64_t recycle_log_number,
2306
                           size_t preallocate_block_size,
2307
                           const PredecessorWALInfo& predecessor_wal_info,
2308
60.6k
                           log::Writer** new_log) {
2309
60.6k
  IOStatus io_s;
2310
60.6k
  std::unique_ptr<FSWritableFile> lfile;
2311
2312
60.6k
  DBOptions db_options =
2313
60.6k
      BuildDBOptions(immutable_db_options_, mutable_db_options_);
2314
60.6k
  FileOptions opt_file_options =
2315
60.6k
      fs_->OptimizeForLogWrite(file_options_, db_options);
2316
60.6k
  opt_file_options.write_hint = CalculateWALWriteHint();
2317
  // DB option takes precedence when not kUnknown
2318
60.6k
  if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
2319
0
    opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
2320
0
  }
2321
60.6k
  std::string wal_dir = immutable_db_options_.GetWalDir();
2322
60.6k
  std::string log_fname = LogFileName(wal_dir, log_file_num);
2323
2324
60.6k
  if (recycle_log_number) {
2325
0
    ROCKS_LOG_INFO(immutable_db_options_.info_log,
2326
0
                   "reusing log %" PRIu64 " from recycle list\n",
2327
0
                   recycle_log_number);
2328
0
    std::string old_log_fname = LogFileName(wal_dir, recycle_log_number);
2329
0
    TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
2330
0
    TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
2331
0
    io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
2332
0
                                  &lfile, /*dbg=*/nullptr);
2333
60.6k
  } else {
2334
60.6k
    io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
2335
60.6k
  }
2336
2337
60.6k
  if (io_s.ok()) {
2338
    // Subsequent attempts to override the hint via SetWriteLifeTimeHint
2339
    // with the very same value will be ignored by the fs.
2340
60.6k
    lfile->SetWriteLifeTimeHint(opt_file_options.write_hint);
2341
60.6k
    lfile->SetPreallocationBlockSize(preallocate_block_size);
2342
2343
60.6k
    const auto& listeners = immutable_db_options_.listeners;
2344
60.6k
    FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
2345
60.6k
    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
2346
60.6k
        std::move(lfile), log_fname, opt_file_options,
2347
60.6k
        immutable_db_options_.clock, io_tracer_, nullptr /* stats */,
2348
60.6k
        Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, listeners, nullptr,
2349
60.6k
        tmp_set.Contains(FileType::kWalFile),
2350
60.6k
        tmp_set.Contains(FileType::kWalFile)));
2351
60.6k
    *new_log = new log::Writer(std::move(file_writer), log_file_num,
2352
60.6k
                               immutable_db_options_.recycle_log_file_num > 0,
2353
60.6k
                               immutable_db_options_.manual_wal_flush,
2354
60.6k
                               immutable_db_options_.wal_compression,
2355
60.6k
                               immutable_db_options_.track_and_verify_wals);
2356
60.6k
    io_s = (*new_log)->AddCompressionTypeRecord(write_options);
2357
60.6k
    if (io_s.ok()) {
2358
60.6k
      io_s = (*new_log)->MaybeAddPredecessorWALInfo(write_options,
2359
60.6k
                                                    predecessor_wal_info);
2360
60.6k
    }
2361
60.6k
  }
2362
2363
60.6k
  return io_s;
2364
60.6k
}
2365
2366
void DBImpl::TrackExistingDataFiles(
2367
58.3k
    const std::vector<std::string>& existing_data_files) {
2368
58.3k
  TrackOrUntrackFiles(existing_data_files, /*track=*/true);
2369
58.3k
}
2370
2371
Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2372
                    const std::vector<ColumnFamilyDescriptor>& column_families,
2373
                    std::vector<ColumnFamilyHandle*>* handles,
2374
                    std::unique_ptr<DB>* dbptr, const bool seq_per_batch,
2375
                    const bool batch_per_txn, const bool is_retry,
2376
58.3k
                    bool* can_retry) {
2377
58.3k
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
2378
58.3k
  const ReadOptions read_options(Env::IOActivity::kDBOpen);
2379
2380
58.3k
  Status s = ValidateOptionsByTable(db_options, column_families);
2381
58.3k
  if (!s.ok()) {
2382
0
    return s;
2383
0
  }
2384
2385
58.3k
  s = ValidateOptions(db_options, column_families);
2386
58.3k
  if (!s.ok()) {
2387
0
    return s;
2388
0
  }
2389
2390
58.3k
  *dbptr = nullptr;
2391
58.3k
  assert(handles);
2392
58.3k
  handles->clear();
2393
2394
58.3k
  size_t max_write_buffer_size = 0;
2395
58.3k
  MinAndMaxPreserveSeconds preserve_info;
2396
80.3k
  for (const auto& cf : column_families) {
2397
80.3k
    max_write_buffer_size =
2398
80.3k
        std::max(max_write_buffer_size, cf.options.write_buffer_size);
2399
80.3k
    preserve_info.Combine(cf.options);
2400
80.3k
  }
2401
2402
58.3k
  auto impl = std::make_unique<DBImpl>(db_options, dbname, seq_per_batch,
2403
58.3k
                                       batch_per_txn);
2404
58.3k
  if (!impl->immutable_db_options_.info_log) {
2405
0
    s = impl->init_logger_creation_s_;
2406
0
    return s;
2407
58.3k
  } else {
2408
58.3k
    assert(impl->init_logger_creation_s_.ok());
2409
58.3k
  }
2410
58.3k
  s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir());
2411
58.3k
  if (s.ok()) {
2412
58.3k
    std::vector<std::string> paths;
2413
58.3k
    for (auto& db_path : impl->immutable_db_options_.db_paths) {
2414
58.3k
      paths.emplace_back(db_path.path);
2415
58.3k
    }
2416
80.3k
    for (auto& cf : column_families) {
2417
80.3k
      for (auto& cf_path : cf.options.cf_paths) {
2418
0
        paths.emplace_back(cf_path.path);
2419
0
      }
2420
80.3k
    }
2421
58.3k
    for (const auto& path : paths) {
2422
58.3k
      s = impl->env_->CreateDirIfMissing(path);
2423
58.3k
      if (!s.ok()) {
2424
0
        break;
2425
0
      }
2426
58.3k
    }
2427
2428
    // For recovery from NoSpace() error, we can only handle
2429
    // the case where the database is stored in a single path
2430
58.3k
    if (paths.size() <= 1) {
2431
58.3k
      impl->error_handler_.EnableAutoRecovery();
2432
58.3k
    }
2433
58.3k
  }
2434
58.3k
  if (s.ok()) {
2435
58.3k
    s = impl->CreateArchivalDirectory();
2436
58.3k
  }
2437
58.3k
  if (!s.ok()) {
2438
0
    return s;
2439
0
  }
2440
2441
58.3k
  impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
2442
58.3k
  RecoveryContext recovery_ctx;
2443
58.3k
  impl->options_mutex_.Lock();
2444
58.3k
  impl->mutex_.Lock();
2445
2446
  // Handles create_if_missing, error_if_exists
2447
58.3k
  uint64_t recovered_seq(kMaxSequenceNumber);
2448
58.3k
  s = impl->Recover(column_families, false /* read_only */,
2449
58.3k
                    false /* error_if_wal_file_exists */,
2450
58.3k
                    false /* error_if_data_exists_in_wals */, is_retry,
2451
58.3k
                    &recovered_seq, &recovery_ctx, can_retry);
2452
58.3k
  if (s.ok()) {
2453
58.3k
    uint64_t new_log_number = impl->versions_->NewFileNumber();
2454
58.3k
    log::Writer* new_log = nullptr;
2455
58.3k
    const size_t preallocate_block_size =
2456
58.3k
        impl->GetWalPreallocateBlockSize(max_write_buffer_size);
2457
    // TODO(hx235): Pass in the correct `predecessor_wal_info` for the first WAL
2458
    // created during DB open with predecessor WALs from previous DB session due
2459
    // to `avoid_flush_during_recovery == true`. This can protect the last WAL
2460
    // recovered.
2461
58.3k
    s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/,
2462
58.3k
                        preallocate_block_size,
2463
58.3k
                        PredecessorWALInfo() /* predecessor_wal_info */,
2464
58.3k
                        &new_log);
2465
58.3k
    if (s.ok()) {
2466
      // Prevent log files created by previous instance from being recycled.
2467
      // They might be in alive_log_file_, and might get recycled otherwise.
2468
58.3k
      impl->min_wal_number_to_recycle_ = new_log_number;
2469
58.3k
    }
2470
58.3k
    if (s.ok()) {
2471
58.3k
      InstrumentedMutexLock wl(&impl->wal_write_mutex_);
2472
58.3k
      impl->cur_wal_number_ = new_log_number;
2473
58.3k
      assert(new_log != nullptr);
2474
58.3k
      assert(impl->logs_.empty());
2475
58.3k
      impl->logs_.emplace_back(new_log_number, new_log);
2476
58.3k
    }
2477
2478
58.3k
    if (s.ok()) {
2479
58.3k
      impl->alive_wal_files_.emplace_back(impl->cur_wal_number_);
2480
      // In WritePrepared there could be gap in sequence numbers. This breaks
2481
      // the trick we use in kPointInTimeRecovery which assumes the first seq in
2482
      // the log right after the corrupted log is one larger than the last seq
2483
      // we read from the wals. To let this trick keep working, we add a dummy
2484
      // entry with the expected sequence to the first log right after recovery.
2485
      // In non-WritePrepared case also the new log after recovery could be
2486
      // empty, and thus missing the consecutive seq hint to distinguish
2487
      // middle-log corruption to corrupted-log-remained-after-recovery. This
2488
      // case also will be addressed by a dummy write.
2489
58.3k
      if (recovered_seq != kMaxSequenceNumber) {
2490
0
        WriteBatch empty_batch;
2491
0
        WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
2492
0
        uint64_t wal_used, log_size;
2493
0
        log::Writer* log_writer = impl->logs_.back().writer;
2494
0
        WalFileNumberSize& wal_file_number_size = impl->alive_wal_files_.back();
2495
2496
0
        assert(log_writer->get_log_number() == wal_file_number_size.number);
2497
0
        impl->mutex_.AssertHeld();
2498
0
        s = impl->WriteToWAL(empty_batch, write_options, log_writer, &wal_used,
2499
0
                             &log_size, wal_file_number_size, recovered_seq);
2500
0
        if (s.ok()) {
2501
          // Need to fsync, otherwise it might get lost after a power reset.
2502
0
          s = impl->FlushWAL(write_options, false);
2503
0
          TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s);
2504
0
          IOOptions opts;
2505
0
          if (s.ok()) {
2506
0
            s = WritableFileWriter::PrepareIOOptions(write_options, opts);
2507
0
          }
2508
0
          if (s.ok()) {
2509
0
            s = log_writer->file()->Sync(opts,
2510
0
                                         impl->immutable_db_options_.use_fsync);
2511
0
          }
2512
0
        }
2513
0
      }
2514
58.3k
    }
2515
58.3k
  }
2516
58.3k
  if (s.ok()) {
2517
58.3k
    s = impl->LogAndApplyForRecovery(recovery_ctx);
2518
58.3k
  }
2519
2520
58.3k
  if (s.ok() && !impl->immutable_db_options_.write_identity_file) {
2521
    // On successful recovery, delete an obsolete IDENTITY file to avoid DB ID
2522
    // inconsistency
2523
0
    impl->env_->DeleteFile(IdentityFileName(impl->dbname_))
2524
0
        .PermitUncheckedError();
2525
0
  }
2526
2527
58.3k
  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
2528
0
    impl->mutex_.AssertHeld();
2529
0
    s = impl->InitPersistStatsColumnFamily();
2530
0
  }
2531
2532
  // After reaching the post-recovery seqno but before creating SuperVersions
2533
  // ensure seqno to time mapping is pre-populated as needed.
2534
58.3k
  if (s.ok() && recovery_ctx.is_new_db_ && preserve_info.IsEnabled()) {
2535
0
    impl->PrepopulateSeqnoToTimeMapping(preserve_info);
2536
0
  }
2537
2538
58.3k
  if (s.ok()) {
2539
    // set column family handles
2540
80.3k
    for (const auto& cf : column_families) {
2541
80.3k
      auto cfd =
2542
80.3k
          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
2543
80.3k
      if (cfd != nullptr) {
2544
80.3k
        handles->push_back(
2545
80.3k
            new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_));
2546
80.3k
        impl->NewThreadStatusCfInfo(cfd);
2547
80.3k
        SuperVersionContext sv_context(/* create_superversion */ true);
2548
80.3k
        impl->InstallSuperVersionForConfigChange(cfd, &sv_context);
2549
80.3k
        sv_context.Clean();
2550
80.3k
      } else {
2551
0
        if (db_options.create_missing_column_families) {
2552
          // missing column family, create it
2553
0
          ColumnFamilyHandle* handle = nullptr;
2554
0
          impl->mutex_.Unlock();
2555
          // NOTE: the work normally done in WrapUpCreateColumnFamilies will
2556
          // be done separately below.
2557
          // This includes InstallSuperVersionForConfigChange.
2558
0
          s = impl->CreateColumnFamilyImpl(read_options, write_options,
2559
0
                                           cf.options, cf.name, &handle);
2560
0
          impl->mutex_.Lock();
2561
0
          if (s.ok()) {
2562
0
            handles->push_back(handle);
2563
0
          } else {
2564
0
            break;
2565
0
          }
2566
0
        } else {
2567
0
          s = Status::InvalidArgument("Column family not found", cf.name);
2568
0
          break;
2569
0
        }
2570
0
      }
2571
80.3k
    }
2572
58.3k
  }
2573
2574
58.3k
  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
2575
    // Install SuperVersion for hidden column family
2576
0
    assert(impl->persist_stats_cf_handle_);
2577
0
    assert(impl->persist_stats_cf_handle_->cfd());
2578
0
    SuperVersionContext sv_context(/* create_superversion */ true);
2579
0
    impl->InstallSuperVersionForConfigChange(
2580
0
        impl->persist_stats_cf_handle_->cfd(), &sv_context);
2581
0
    sv_context.Clean();
2582
    // try to read format version
2583
0
    s = impl->PersistentStatsProcessFormatVersion();
2584
0
  }
2585
2586
58.3k
  if (s.ok()) {
2587
80.3k
    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
2588
80.3k
      if (!cfd->mem()->IsSnapshotSupported()) {
2589
0
        impl->is_snapshot_supported_ = false;
2590
0
      }
2591
80.3k
      if (cfd->ioptions().merge_operator != nullptr &&
2592
80.3k
          !cfd->mem()->IsMergeOperatorSupported()) {
2593
0
        s = Status::InvalidArgument(
2594
0
            "The memtable of column family %s does not support merge operator "
2595
0
            "its options.merge_operator is non-null",
2596
0
            cfd->GetName().c_str());
2597
0
      }
2598
80.3k
      if (!s.ok()) {
2599
0
        break;
2600
0
      }
2601
80.3k
    }
2602
58.3k
  }
2603
58.3k
  TEST_SYNC_POINT("DBImpl::Open:Opened");
2604
58.3k
  Status persist_options_status;
2605
58.3k
  if (s.ok()) {
2606
    // Persist RocksDB Options before scheduling the compaction.
2607
    // The WriteOptionsFile() will release and lock the mutex internally.
2608
58.3k
    persist_options_status =
2609
58.3k
        impl->WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
2610
58.3k
    impl->opened_successfully_ = true;
2611
58.3k
  } else {
2612
0
    persist_options_status.PermitUncheckedError();
2613
0
  }
2614
58.3k
  impl->mutex_.Unlock();
2615
2616
58.3k
  auto sfm = static_cast<SstFileManagerImpl*>(
2617
58.3k
      impl->immutable_db_options_.sst_file_manager.get());
2618
58.3k
  if (s.ok() && sfm) {
2619
    // Set Statistics ptr for SstFileManager to dump the stats of
2620
    // DeleteScheduler.
2621
58.3k
    sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics);
2622
58.3k
    ROCKS_LOG_INFO(impl->immutable_db_options_.info_log,
2623
58.3k
                   "SstFileManager instance %p", sfm);
2624
2625
58.3k
    impl->TrackExistingDataFiles(recovery_ctx.existing_data_files_);
2626
2627
    // Reserve some disk buffer space. This is a heuristic - when we run out
2628
    // of disk space, this ensures that there is at least write_buffer_size
2629
    // amount of free space before we resume DB writes. In low disk space
2630
    // conditions, we want to avoid a lot of small L0 files due to frequent
2631
    // WAL write failures and resultant forced flushes
2632
58.3k
    sfm->ReserveDiskBuffer(max_write_buffer_size,
2633
58.3k
                           impl->immutable_db_options_.db_paths[0].path);
2634
58.3k
  }
2635
2636
58.3k
  if (s.ok()) {
2637
    // When the DB is stopped, it's possible that there are some .trash files
2638
    // that were not deleted yet, when we open the DB we will find these .trash
2639
    // files and schedule them to be deleted (or delete immediately if
2640
    // SstFileManager was not used).
2641
    // Note that we only start doing this and below delete obsolete file after
2642
    // `TrackExistingDataFiles` are called, the `max_trash_db_ratio` is
2643
    // ineffective otherwise and these files' deletion won't be rate limited
2644
    // which can cause discard stall.
2645
58.3k
    for (const auto& path : impl->CollectAllDBPaths()) {
2646
58.3k
      DeleteScheduler::CleanupDirectory(impl->immutable_db_options_.env, sfm,
2647
58.3k
                                        path)
2648
58.3k
          .PermitUncheckedError();
2649
58.3k
    }
2650
58.3k
    impl->mutex_.Lock();
2651
    // This will do a full scan.
2652
58.3k
    impl->DeleteObsoleteFiles();
2653
58.3k
    TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles");
2654
58.3k
    impl->MaybeScheduleFlushOrCompaction();
2655
58.3k
    impl->mutex_.Unlock();
2656
58.3k
  }
2657
2658
58.3k
  if (s.ok()) {
2659
58.3k
    ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
2660
58.3k
                     impl.get());
2661
58.3k
    LogFlush(impl->immutable_db_options_.info_log);
2662
58.3k
    if (!impl->WALBufferIsEmpty()) {
2663
0
      s = impl->FlushWAL(write_options, false);
2664
0
      if (s.ok()) {
2665
        // Sync is needed otherwise WAL buffered data might get lost after a
2666
        // power reset.
2667
0
        log::Writer* log_writer = impl->logs_.back().writer;
2668
0
        IOOptions opts;
2669
0
        s = WritableFileWriter::PrepareIOOptions(write_options, opts);
2670
0
        if (s.ok()) {
2671
0
          s = log_writer->file()->Sync(opts,
2672
0
                                       impl->immutable_db_options_.use_fsync);
2673
0
        }
2674
0
      }
2675
0
    }
2676
58.3k
    if (s.ok() && !persist_options_status.ok()) {
2677
0
      s = Status::IOError(
2678
0
          "DB::Open() failed --- Unable to persist Options file",
2679
0
          persist_options_status.ToString());
2680
0
    }
2681
58.3k
  }
2682
58.3k
  if (!s.ok()) {
2683
0
    ROCKS_LOG_WARN(impl->immutable_db_options_.info_log,
2684
0
                   "DB::Open() failed: %s", s.ToString().c_str());
2685
0
  }
2686
58.3k
  if (s.ok()) {
2687
58.3k
    s = impl->StartPeriodicTaskScheduler();
2688
58.3k
  }
2689
58.3k
  if (s.ok()) {
2690
58.3k
    s = impl->RegisterRecordSeqnoTimeWorker();
2691
58.3k
  }
2692
58.3k
  impl->options_mutex_.Unlock();
2693
58.3k
  if (s.ok()) {
2694
58.3k
    *dbptr = std::move(impl);
2695
58.3k
  } else {
2696
0
    for (auto* h : *handles) {
2697
0
      delete h;
2698
0
    }
2699
0
    handles->clear();
2700
0
  }
2701
58.3k
  return s;
2702
58.3k
}
2703
}  // namespace ROCKSDB_NAMESPACE