Coverage Report

Created: 2026-05-31 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/db/db_filesnapshot.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
7
#include <algorithm>
8
#include <cstdint>
9
#include <memory>
10
#include <string>
11
#include <vector>
12
13
#include "db/db_impl/db_impl.h"
14
#include "db/job_context.h"
15
#include "db/version_set.h"
16
#include "file/file_util.h"
17
#include "file/filename.h"
18
#include "logging/logging.h"
19
#include "port/port.h"
20
#include "rocksdb/db.h"
21
#include "rocksdb/env.h"
22
#include "rocksdb/metadata.h"
23
#include "rocksdb/transaction_log.h"
24
#include "rocksdb/types.h"
25
#include "test_util/sync_point.h"
26
#include "util/file_checksum_helper.h"
27
#include "util/mutexlock.h"
28
29
namespace ROCKSDB_NAMESPACE {
30
31
0
// Requests a flush on behalf of the live-file capture paths in this file.
//
// Builds a FlushOptions whose force_atomic_flush field mirrors
// `force_atomic_flush` and delegates to DBImpl::FlushAllColumnFamilies with
// FlushReason::kGetLiveFiles. The resulting Status is returned unchanged.
Status DBImpl::FlushForGetLiveFiles(bool force_atomic_flush) {
  FlushOptions options_for_flush;
  options_for_flush.force_atomic_flush = force_atomic_flush;
  return DBImpl::FlushAllColumnFamilies(options_for_flush,
                                        FlushReason::kGetLiveFiles);
}
36
37
// Fills `ret` with the names of the live table and blob files of every
// non-dropped column family, followed by CURRENT, the MANIFEST and (when one
// exists) the OPTIONS file. Names are produced with an empty directory prefix,
// i.e. they are relative to dbname_ rather than absolute paths.
//
// `manifest_file_size` must be non-null. It is zeroed on entry and, on
// success, receives the manifest size sampled while mutex_ is still held.
// When `flush_memtable` is true a flush is attempted first; a failed flush is
// logged and its Status returned. Note that in that case `ret` has not been
// touched yet.
Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
                            uint64_t* manifest_file_size, bool flush_memtable) {
  *manifest_file_size = 0;

  mutex_.Lock();

  if (flush_memtable) {
    Status flush_status = FlushForGetLiveFiles();
    if (!flush_status.ok()) {
      // Release the mutex before logging and bailing out.
      mutex_.Unlock();
      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
                      flush_status.ToString().c_str());
      return flush_status;
    }
  }

  // Gather the file numbers of all live table and blob files.
  std::vector<uint64_t> table_numbers;
  std::vector<uint64_t> blob_numbers;
  for (auto cfd : *versions_->GetColumnFamilySet()) {
    if (!cfd->IsDropped()) {
      cfd->current()->AddLiveFiles(&table_numbers, &blob_numbers);
    }
  }

  // Room for CURRENT + MANIFEST + OPTIONS on top of the data files.
  const size_t kNumMetadataFiles = 3;
  ret.clear();
  ret.reserve(table_numbers.size() + blob_numbers.size() + kNumMetadataFiles);

  // Turn file numbers into names. The names are not absolute paths; they are
  // relative to dbname_.
  for (const uint64_t number : table_numbers) {
    ret.emplace_back(MakeTableFileName("", number));
  }
  for (const uint64_t number : blob_numbers) {
    ret.emplace_back(BlobFileName("", number));
  }

  ret.emplace_back(CurrentFileName(""));
  ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));

  // In read-only mode the OPTIONS file number is zero when no OPTIONS file
  // exists at all. In that case no OPTIONS file is recorded in the live file
  // list.
  const auto options_number = versions_->options_file_number();
  if (options_number != 0) {
    ret.emplace_back(OptionsFileName("", options_number));
  }

  // Sample the manifest length while still holding the mutex.
  *manifest_file_size = versions_->manifest_file_size();

  mutex_.Unlock();
  return Status::OK();
}
92
93
0
// Public entry point for listing WAL files: forwards to
// GetSortedWalFilesImpl() with sequence numbers requested.
Status DBImpl::GetSortedWalFiles(VectorWalPtr& files) {
  const bool kNeedSeqnos = true;
  return GetSortedWalFilesImpl(files, kNeedSeqnos);
}
97
98
0
// Lists WAL files (including archived ones) into `files` via the WalManager
// and cross-checks the listing against the WALs tracked in the manifest.
//
// `need_seqnos` is passed through to WalManager::GetSortedWalFiles.
// Returns the WalManager status on failure, Status::Corruption when a WAL
// tracked by the manifest is absent from the listing, and OK otherwise.
Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) {
  // WAL numbers tracked by the manifest; used as a (minimum) cross-check for
  // the directory scan below.
  std::vector<uint64_t> manifest_wal_numbers;

  // If caller disabled deletions, this function should return files that are
  // guaranteed not to be deleted until deletions are re-enabled. We need to
  // wait for pending purges to finish since WalManager doesn't know which
  // files are going to be purged. Additional purges won't be scheduled as
  // long as deletions are disabled (so the below loop must terminate).
  // Also note that we disable deletions anyway to avoid the case where a
  // file is deleted in the middle of the scan, causing IO error.
  Status disable_status = DisableFileDeletions();
  {
    InstrumentedMutexLock guard(&mutex_);
    while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
      TEST_SYNC_POINT("DBImpl::GetSortedWalFilesImpl:WaitPurge");
      bg_cv_.Wait();
    }

    const auto& tracked_wals = versions_->GetWalSet().GetWals();
    manifest_wal_numbers.reserve(tracked_wals.size());
    for (const auto& entry : tracked_wals) {
      manifest_wal_numbers.push_back(entry.first);
    }
  }

  // NOTE: need to include archived WALs because needed WALs might have been
  // archived since the manifest-tracked set was captured above
  Status s = wal_manager_.GetSortedWalFiles(files, need_seqnos,
                                            /*include_archived*/ true);

  // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
  if (!disable_status.ok()) {
    assert(disable_status.IsNotSupported());
  } else {
    Status enable_status = EnableFileDeletions();
    assert(enable_status.ok());
    enable_status.PermitUncheckedError();
  }

  if (!s.ok()) {
    return s;
  }

  // Verify the listing includes every WAL required by the manifest (one
  // sorted list must be a superset of the other).
  const auto missing_wal = [](uint64_t number) {
    return Status::Corruption(
        "WAL file " + std::to_string(number) +
        " required by manifest but not in directory list");
  };
  auto listed = files.begin();
  for (auto needed = manifest_wal_numbers.begin();
       needed != manifest_wal_numbers.end(); ++listed) {
    if (listed == files.end()) {
      // FAIL - ran out of listed files
      return missing_wal(*needed);
    }
    const uint64_t listed_number = (*listed)->LogNumber();
    if (*needed < listed_number) {
      // FAIL - did not find
      return missing_wal(*needed);
    }
    if (*needed == listed_number) {
      ++needed;
    } else {
      // Listed file is older than the one being looked for; skip past it.
      assert(*needed > listed_number);
    }
  }

  size_t wal_count = files.size();
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
                 "Number of WAL files %" ROCKSDB_PRIszt " (%" ROCKSDB_PRIszt
                 " required by manifest)",
                 wal_count, manifest_wal_numbers.size());
#ifndef NDEBUG
  std::ostringstream listed_names;
  for (const auto& wal : files) {
    listed_names << wal->PathName() << " ";
  }

  std::ostringstream required_names;
  for (const auto& number : manifest_wal_numbers) {
    required_names << number << ".log ";
  }

  ROCKS_LOG_INFO(immutable_db_options_.info_log,
                 "Log files : %s .Log files required by manifest: %s.",
                 listed_names.str().c_str(), required_names.str().c_str());
#endif  // NDEBUG
  return s;
}
187
188
0
// Looks up the WAL file whose number is the current value of cur_wal_number_.
//
// The number is read under mutex_; the lookup itself
// (WalManager::GetLiveWalFile) runs after the mutex has been released and
// writes its result through `current_wal_file`.
Status DBImpl::GetCurrentWalFile(std::unique_ptr<WalFile>* current_wal_file) {
  const uint64_t wal_number = [this] {
    InstrumentedMutexLock guard(&mutex_);
    return cur_wal_number_;
  }();

  return wal_manager_.GetLiveWalFile(wal_number, current_wal_file);
}
197
198
// Collects storage metadata for the live files of this DB: table and blob
// files of every non-dropped column family, the MANIFEST, CURRENT, the OPTIONS
// file (when one exists) and the qualifying alive WAL files.
//
// opts.wal_size_for_flush decides whether a flush is attempted first,
// opts.atomic_flush is forwarded to that flush, and
// opts.include_checksum_info controls whether checksum fields are populated.
// `files` must be non-null. It is cleared on entry and receives the collected
// entries only on success, so callers never see partial results.
Status DBImpl::GetLiveFilesStorageInfo(
    const LiveFilesStorageInfoOptions& opts,
    std::vector<LiveFileStorageInfo>* files) {
  // To avoid returning partial results, only move results to files on success.
  assert(files);
  files->clear();
  std::vector<LiveFileStorageInfo> results;

  // Appends a default-constructed entry and hands back a reference to it. The
  // reference is only valid until the next append.
  auto append_entry = [&results]() -> LiveFileStorageInfo& {
    results.emplace_back();
    return results.back();
  };
  // Fills in the placeholder checksum for files without a recorded one.
  auto mark_checksum_unknown = [](LiveFileStorageInfo& entry) {
    entry.file_checksum_func_name = kUnknownFileChecksumFuncName;
    entry.file_checksum = kUnknownFileChecksum;
  };

  // NOTE: This implementation was largely migrated from Checkpoint.

  VectorWalPtr live_wal_files;
  bool flush_memtable = true;
  if (!immutable_db_options_.allow_2pc) {
    const auto flush_threshold = opts.wal_size_for_flush;
    if (flush_threshold == std::numeric_limits<uint64_t>::max()) {
      flush_memtable = false;
    } else if (flush_threshold > 0) {
      // FIXME: avoid querying the filesystem for current WAL state
      // If the outstanding WAL files are small, we skip the flush.
      // Don't take archived log size into account when calculating wal
      // size for flush, and don't need to verify consistency with manifest
      // here & now.
      Status wal_s = wal_manager_.GetSortedWalFiles(live_wal_files,
                                                    /* need_seqnos */ false,
                                                    /*include_archived*/ false);
      if (!wal_s.ok()) {
        return wal_s;
      }

      // Don't flush column families if total log size is smaller than
      // log_size_for_flush. We copy the log files instead.
      // We may be able to cover 2PC case too.
      uint64_t outstanding_wal_bytes = 0;
      for (const auto& wal : live_wal_files) {
        assert(wal->Type() == kAliveLogFile);
        outstanding_wal_bytes += wal->SizeFileBytes();
      }
      if (outstanding_wal_bytes < flush_threshold) {
        flush_memtable = false;
      }
      live_wal_files.clear();
    }
  }

  // This is a modified version of GetLiveFiles, to get access to more
  // metadata.
  mutex_.Lock();
  const bool needs_blob_direct_write_flush =
      HasInFlightBlobDirectWriteFilesWithLockHeld();
  if (needs_blob_direct_write_flush && !flush_memtable) {
    mutex_.Unlock();
    return Status::NotSupported(
        "Blob direct write requires flushing active blob files before "
        "capturing live files. Retry with flush enabled.");
  }
  if (flush_memtable) {
    const bool wal_locked = lock_wal_count_ > 0;
    if (!wal_locked) {
      Status flush_status = FlushForGetLiveFiles(opts.atomic_flush);
      if (!flush_status.ok()) {
        mutex_.Unlock();
        ROCKS_LOG_ERROR(immutable_db_options_.info_log,
                        "Cannot Flush data %s\n",
                        flush_status.ToString().c_str());
        return flush_status;
      }
    } else if (needs_blob_direct_write_flush) {
      // The flush is mandatory here but cannot run while the WAL is locked.
      mutex_.Unlock();
      return Status::NotSupported(
          "Blob direct write requires flushing active blob files before "
          "capturing live files. Retry with WAL unlocked.");
    } else {
      ROCKS_LOG_INFO(immutable_db_options_.info_log,
                     "Can't FlushForGetLiveFiles while WAL is locked");
    }
  }

  // Make a set of all of the live table and blob files
  for (auto cfd : *versions_->GetColumnFamilySet()) {
    if (cfd->IsDropped()) {
      continue;
    }
    VersionStorageInfo& storage = *cfd->current()->storage_info();
    auto& cf_paths = cfd->ioptions().cf_paths;

    // Matching TableFileName() behavior: an out-of-range path id falls back
    // to the last configured path.
    auto dir_for_path_id = [&cf_paths](size_t path_id) {
      if (path_id < cf_paths.size()) {
        return cf_paths[path_id].path;
      }
      assert(false);
      return cf_paths.back().path;
    };

    for (int level = 0; level < storage.num_levels(); ++level) {
      for (const auto& table_meta : storage.LevelFiles(level)) {
        assert(table_meta);

        LiveFileStorageInfo& info = append_entry();

        info.relative_filename = MakeTableFileName(table_meta->fd.GetNumber());
        info.directory = dir_for_path_id(table_meta->fd.GetPathId());
        info.file_number = table_meta->fd.GetNumber();
        info.file_type = kTableFile;
        info.size = table_meta->fd.GetFileSize();
        if (opts.include_checksum_info) {
          if (table_meta->file_checksum_func_name.empty()) {
            mark_checksum_unknown(info);
          } else {
            info.file_checksum_func_name = table_meta->file_checksum_func_name;
            info.file_checksum = table_meta->file_checksum;
          }
        }
        info.temperature = table_meta->temperature;
      }
    }

    for (const auto& blob_meta : storage.GetBlobFiles()) {
      assert(blob_meta);

      LiveFileStorageInfo& info = append_entry();

      info.relative_filename = BlobFileName(blob_meta->GetBlobFileNumber());
      info.directory = dir_for_path_id(/* path_id */ 0);
      info.file_number = blob_meta->GetBlobFileNumber();
      info.file_type = kBlobFile;
      info.size = blob_meta->GetBlobFileSize();
      if (opts.include_checksum_info) {
        const auto& checksum_method = blob_meta->GetChecksumMethod();
        if (checksum_method.empty()) {
          mark_checksum_unknown(info);
        } else {
          info.file_checksum_func_name = checksum_method;
          info.file_checksum = blob_meta->GetChecksumValue();
        }
      }
      // TODO?: info.temperature
    }
  }

  // Capture some final info before releasing mutex
  const uint64_t manifest_number = versions_->manifest_file_number();
  const uint64_t manifest_size = versions_->manifest_file_size();
  const uint64_t options_number = versions_->options_file_number();
  const uint64_t options_size = versions_->options_file_size_;
  const uint64_t min_log_num = MinLogNumberToKeep();
  // Ensure consistency with manifest for track_and_verify_wals_in_manifest
  const uint64_t max_log_num = cur_wal_number_;

  mutex_.Unlock();

  std::string manifest_fname = DescriptorFileName(manifest_number);
  {  // MANIFEST
    LiveFileStorageInfo& info = append_entry();

    info.relative_filename = manifest_fname;
    info.directory = GetName();
    info.file_number = manifest_number;
    info.file_type = kDescriptorFile;
    info.size = manifest_size;
    info.trim_to_size = true;
    if (opts.include_checksum_info) {
      mark_checksum_unknown(info);
    }
  }

  {  // CURRENT
    LiveFileStorageInfo& info = append_entry();

    info.relative_filename = kCurrentFileName;
    info.directory = GetName();
    info.file_type = kCurrentFile;
    // CURRENT could be replaced so we have to record the contents as needed.
    info.replacement_contents = manifest_fname + "\n";
    info.size = info.replacement_contents.size();
    if (opts.include_checksum_info) {
      mark_checksum_unknown(info);
    }
  }

  // In read-only mode the OPTIONS file number is zero when no OPTIONS file
  // exists at all. In that case no OPTIONS file is recorded in the live file
  // list.
  if (options_number != 0) {
    LiveFileStorageInfo& info = append_entry();

    info.relative_filename = OptionsFileName(options_number);
    info.directory = GetName();
    info.file_number = options_number;
    info.file_type = kOptionsFile;
    info.size = options_size;
    if (opts.include_checksum_info) {
      mark_checksum_unknown(info);
    }
  }

  // Some legacy testing stuff  TODO: carefully clean up obsolete parts
  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");

  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");

  // FlushWAL is required to ensure we can physically copy everything
  // logically written to the WAL. (Sync not strictly required for
  // active WAL to be copied rather than hard linked, even when
  // Checkpoint guarantees that the copied-to file is sync-ed. Plus we can't
  // help track_and_verify_wals_in_manifest after manifest_size is
  // already determined.)
  Status s = FlushWAL(/*sync=*/false);
  if (s.IsNotSupported()) {  // read-only DB or similar
    s = Status::OK();
  }

  TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
  TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");

  // Even after WAL flush, there could be multiple WALs that are not
  // fully synced. Although the output DB of a Checkpoint or Backup needs
  // to be fully synced on return, we don't strictly need to sync this
  // DB (the input DB). If we allow Checkpoint to hard link an inactive
  // WAL that isn't fully synced, that could result in an insufficiently
  // sync-ed Checkpoint. Here we get the set of WALs that are potentially
  // unsynced or still being written to, to prevent them from being hard
  // linked. Enforcing max_log_num from above ensures any new WALs after
  // GetOpenWalSizes() and before GetSortedWalFiles() are not included in
  // the results.
  // NOTE: we might still hard link a file that is open for writing, even
  // if we don't do any more writes to it.
  //
  // In a step toward reducing unnecessary file metadata queries, we also
  // get and use our known flushed sizes for those WALs.
  // FIXME: eventually we should not be using filesystem queries at all for
  // the required set of WAL files.
  //
  // However for recycled log files, we just copy the whole file,
  // for better or worse.
  //
  std::map<uint64_t, uint64_t> open_wal_number_to_size;
  const bool recycling_log_files =
      immutable_db_options_.recycle_log_file_num > 0;
  if (s.ok() && !recycling_log_files) {
    s = GetOpenWalSizes(open_wal_number_to_size);
  }

  // [old comment] If we have more than one column family, we also need to get
  // WAL files.
  if (s.ok()) {
    // FIXME: avoid querying the filesystem for current WAL state
    s = GetSortedWalFilesImpl(live_wal_files,
                              /* need_seqnos */ false);
  }
  if (!s.ok()) {
    return s;
  }

  // Link WAL files. Copy exact size of last one because it is the only one
  // that has changes after the last flush.
  auto wal_dir = immutable_db_options_.GetWalDir();
  for (const auto& wal : live_wal_files) {
    // Only alive WALs inside the [min_log_num, max_log_num] window qualify;
    // the lower bound applies only when a flush was requested.
    if (wal->Type() != kAliveLogFile) {
      continue;
    }
    if (flush_memtable && wal->LogNumber() < min_log_num) {
      continue;
    }
    if (wal->LogNumber() > max_log_num) {
      continue;
    }

    LiveFileStorageInfo& info = append_entry();
    auto wal_path = wal->PathName();
    assert(!wal_path.empty() && wal_path[0] == '/');
    info.relative_filename = wal_path.substr(1);
    info.directory = wal_dir;
    info.file_number = wal->LogNumber();
    info.file_type = kWalFile;
    if (recycling_log_files) {
      info.size = wal->SizeFileBytes();
      // Recyclable WAL files must be copied instead of hard linked
      info.trim_to_size = true;
    } else {
      auto open_wal = open_wal_number_to_size.find(info.file_number);
      if (open_wal != open_wal_number_to_size.end()) {
        // Marked as (possibly) still open -> use our known flushed size
        // and force file copy instead of hard link
        const uint64_t flushed_size = open_wal->second;
        info.trim_to_size = true;
        // FIXME: this is needed as long as db_stress uses
        // SetReadUnsyncedData(false), because it will only be able to
        // copy the synced portion of the WAL, which under
        // SetReadUnsyncedData(false) is given by the reported file size.
        info.size = std::min(flushed_size, wal->SizeFileBytes());
      } else {
        // Known fully synced and no future writes (in part from
        // max_log_num check). Ok to hard link
        info.size = wal->SizeFileBytes();
        assert(!info.trim_to_size);
      }
    }
    if (opts.include_checksum_info) {
      mark_checksum_unknown(info);
    }
  }

  if (s.ok()) {
    // Only move results to output on success.
    *files = std::move(results);
  }
  return s;
}
514
515
}  // namespace ROCKSDB_NAMESPACE