Coverage Report

Created: 2026-04-10 07:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/db/external_sst_file_ingestion_job.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
6
#include "db/external_sst_file_ingestion_job.h"
7
8
#include <algorithm>
9
#include <cinttypes>
10
#include <string>
11
#include <unordered_set>
12
#include <vector>
13
14
#include "db/builder.h"
15
#include "db/db_impl/db_impl.h"
16
#include "db/version_edit.h"
17
#include "file/file_util.h"
18
#include "file/random_access_file_reader.h"
19
#include "logging/logging.h"
20
#include "table/merging_iterator.h"
21
#include "table/sst_file_writer_collectors.h"
22
#include "table/table_builder.h"
23
#include "table/unique_id_impl.h"
24
#include "test_util/sync_point.h"
25
#include "util/udt_util.h"
26
27
namespace ROCKSDB_NAMESPACE {
28
29
// Prepares the ingestion job: reads metadata of every external file,
// validates it, links or copies the files into the DB directory, syncs the
// touched data directories, and generates and/or verifies file checksums.
//
// Parameters:
//   external_files_paths      - paths of the files to ingest, in ingestion
//                               order.
//   files_checksums           - optional per-file checksums; must be empty or
//                               have one entry per file.
//   files_checksum_func_names - optional per-file checksum function names;
//                               must be empty or have one entry per file.
//   atomic_replace_range      - optional range to replace atomically; either
//                               both bounds or neither must be set.
//   file_temperature          - temperature hint initially assumed for every
//                               source file.
//   next_file_number          - file number given to the first file; it is
//                               incremented for each subsequent file.
//   sv                        - SuperVersion passed to GetIngestedFileInfo()
//                               and used to pick the destination temperature
//                               when copying.
//
// Returns OK on success, in which case DivideInputFilesIntoBatches() has been
// called. On failure a non-OK status is returned; files that were already
// linked/copied are not removed here.
Status ExternalSstFileIngestionJob::Prepare(
    const std::vector<std::string>& external_files_paths,
    const std::vector<std::string>& files_checksums,
    const std::vector<std::string>& files_checksum_func_names,
    const std::optional<RangeOpt>& atomic_replace_range,
    const Temperature& file_temperature, uint64_t next_file_number,
    SuperVersion* sv) {
  Status status;

  // Read the information of files we are ingesting
  for (const std::string& file_path : external_files_paths) {
    IngestedFileInfo file_to_ingest;
    // For temperature, first assume it matches provided hint
    file_to_ingest.file_temperature = file_temperature;
    status =
        GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to get ingested file info: %s: %s",
                     file_path.c_str(), status.ToString().c_str());
      return status;
    }

    // Files generated in another DB or CF may have a different column family
    // ID, so we let it pass here.
    if (file_to_ingest.cf_id !=
            TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
        file_to_ingest.cf_id != cfd_->GetID() &&
        !ingestion_options_.allow_db_generated_files) {
      return Status::InvalidArgument(
          "External file column family id don't match");
    }

    if (file_to_ingest.num_entries == 0 &&
        file_to_ingest.num_range_deletions == 0) {
      return Status::InvalidArgument("File contain no entries");
    }

    if (!file_to_ingest.smallest_internal_key.Valid() ||
        !file_to_ingest.largest_internal_key.Valid()) {
      return Status::Corruption("Generated table have corrupted keys");
    }

    files_to_ingest_.emplace_back(std::move(file_to_ingest));
  }

  auto num_files = files_to_ingest_.size();
  if (num_files == 0) {
    return Status::InvalidArgument("The list of files is empty");
  } else if (num_files > 1) {
    // Verify that passed files don't have overlapping ranges. Sort pointers
    // (not the files themselves) so that files_to_ingest_ keeps the
    // user-provided order.
    autovector<const IngestedFileInfo*> sorted_files;
    for (size_t i = 0; i < num_files; i++) {
      sorted_files.push_back(&files_to_ingest_[i]);
    }

    std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_);

    // After sorting, only neighbouring files need to be compared.
    for (size_t i = 0; i + 1 < num_files; i++) {
      if (file_range_checker_.Overlaps(*sorted_files[i], *sorted_files[i + 1],
                                       /* known_sorted= */ true)) {
        files_overlap_ = true;
        break;
      }
    }
  }

  if (atomic_replace_range.has_value()) {
    atomic_replace_range_.emplace();

    if (atomic_replace_range->start && atomic_replace_range->limit) {
      // User keys to internal keys (with timestamps)
      const size_t ts_sz = ucmp_->timestamp_size();
      std::string start_with_ts, limit_with_ts;
      auto [start, limit] = MaybeAddTimestampsToRange(
          atomic_replace_range->start, atomic_replace_range->limit, ts_sz,
          &start_with_ts, &limit_with_ts);
      assert(start.has_value());
      assert(limit.has_value());
      atomic_replace_range_->smallest_internal_key.Set(
          *start, kMaxSequenceNumber, kValueTypeForSeek);
      atomic_replace_range_->largest_internal_key.Set(
          *limit, kMaxSequenceNumber, kValueTypeForSeek);
      // Check files to ingest against replace range
      for (size_t i = 0; i < num_files; i++) {
        if (!file_range_checker_.Contains(*atomic_replace_range_,
                                          files_to_ingest_[i])) {
          return Status::InvalidArgument(
              "Atomic replace range does not contain all files");
        }
      }
    } else {
      // Currently if either bound is not present, both must be
      assert(atomic_replace_range->start.has_value() == false);
      assert(atomic_replace_range->limit.has_value() == false);
      // The asserts above compile out in release builds. Without an explicit
      // check, a range with only one bound would leave the internal range
      // unset, which the rest of this job interprets as "replace the whole
      // column family". Reject it instead.
      if (atomic_replace_range->start.has_value() ||
          atomic_replace_range->limit.has_value()) {
        return Status::InvalidArgument(
            "Atomic replace range must specify both bounds or neither");
      }
      assert(atomic_replace_range_->smallest_internal_key.unset());
      assert(atomic_replace_range_->largest_internal_key.unset());
    }
  }

  if (files_overlap_) {
    if (ingestion_options_.ingest_behind) {
      return Status::NotSupported(
          "Files with overlapping ranges cannot be ingested with ingestion "
          "behind mode.");
    }

    // Overlapping files need at least two different sequence numbers. If
    // settings disables global seqno, ingestion will fail anyway, so fail
    // fast in prepare.
    if (!ingestion_options_.allow_global_seqno &&
        !ingestion_options_.allow_db_generated_files) {
      return Status::InvalidArgument(
          "Global seqno is required, but disabled (because external files key "
          "range overlaps).");
    }

    if (ucmp_->timestamp_size() > 0) {
      return Status::NotSupported(
          "Files with overlapping ranges cannot be ingested to column "
          "family with user-defined timestamp enabled.");
    }
  }

  // Copy/Move external files into DB
  std::unordered_set<size_t> ingestion_path_ids;
  for (IngestedFileInfo& f : files_to_ingest_) {
    f.copy_file = false;
    // Bound by reference: the path is only read inside this iteration.
    const std::string& path_outside_db = f.external_file_path;
    const std::string path_inside_db = TableFileName(
        cfd_->ioptions().cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
    if (ingestion_options_.move_files || ingestion_options_.link_files) {
      status =
          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
      if (status.ok()) {
        // It is unsafe to assume application had sync the file and file
        // directory before ingest the file. For integrity of RocksDB we need
        // to sync the file.

        // TODO(xingbo), We should in general be moving away from production
        // uses of ReuseWritableFile (except explicitly for WAL recycling),
        // ReopenWritableFile, and NewRandomRWFile. We should create a
        // FileSystem::SyncFile/FsyncFile API that by default does the
        // re-open+sync+close combo but can (a) be reused easily, and (b) be
        // overridden to do that more cleanly, e.g. in EncryptedEnv.
        // https://github.com/facebook/rocksdb/issues/13741
        std::unique_ptr<FSWritableFile> file_to_sync;
        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
                                           &file_to_sync, nullptr);
        TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
                                 &s);
        // Some file systems (especially remote/distributed) don't support
        // reopening a file for writing and don't require reopening and
        // syncing the file. Ignore the NotSupported error in that case.
        if (!s.IsNotSupported()) {
          status = s;
          if (status.ok()) {
            TEST_SYNC_POINT(
                "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
            status = SyncIngestedFile(file_to_sync.get());
            TEST_SYNC_POINT(
                "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
            if (!status.ok()) {
              ROCKS_LOG_WARN(db_options_.info_log,
                             "Failed to sync ingested file %s: %s",
                             path_inside_db.c_str(), status.ToString().c_str());
            }
          }
        }
      } else if (status.IsNotSupported() &&
                 ingestion_options_.failed_move_fall_back_to_copy) {
        // Original file is on a different FS, use copy instead of hard linking.
        f.copy_file = true;
        ROCKS_LOG_INFO(db_options_.info_log,
                       "Tried to link file %s but it's not supported : %s",
                       path_outside_db.c_str(), status.ToString().c_str());
      } else {
        ROCKS_LOG_WARN(db_options_.info_log, "Failed to link file %s to %s: %s",
                       path_outside_db.c_str(), path_inside_db.c_str(),
                       status.ToString().c_str());
      }
    } else {
      f.copy_file = true;
    }

    if (f.copy_file) {
      TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
                               nullptr);
      // Always determining the destination temperature from the ingested-to
      // level would be difficult because in general we only find out the level
      // ingested to later, during Run().
      // However, we can guarantee "last level" temperature for when the user
      // requires ingestion to the last level.
      Temperature dst_temp =
          (ingestion_options_.ingest_behind ||
           ingestion_options_.fail_if_not_bottommost_level)
              ? sv->mutable_cf_options.last_level_temperature
              : sv->mutable_cf_options.default_write_temperature;
      // Note: CopyFile also syncs the new file.
      status = CopyFile(fs_.get(), path_outside_db, f.file_temperature,
                        path_inside_db, dst_temp, 0, db_options_.use_fsync,
                        io_tracer_);
      // The destination of the copy will be ingested
      f.file_temperature = dst_temp;

      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log, "Failed to copy file %s to %s: %s",
                       path_outside_db.c_str(), path_inside_db.c_str(),
                       status.ToString().c_str());
      }
    } else {
      // Note: we currently assume that linking files does not cross
      // temperatures, so no need to change f.file_temperature
    }
    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
    if (!status.ok()) {
      // Stop at the first file that could not be linked/copied/synced.
      break;
    }
    f.internal_file_path = path_inside_db;
    // Initialize the checksum information of ingested files.
    f.file_checksum = kUnknownFileChecksum;
    f.file_checksum_func_name = kUnknownFileChecksumFuncName;
    ingestion_path_ids.insert(f.fd.GetPathId());
  }

  TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
  if (status.ok()) {
    // Sync every data directory that received a new file.
    for (auto path_id : ingestion_path_ids) {
      status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
          IOOptions(), nullptr,
          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to sync directory %" ROCKSDB_PRIszt
                       " while ingest file: %s",
                       path_id, status.ToString().c_str());
        break;
      }
    }
  }
  TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");

  // Generate and check the sst file checksum. Note that, if
  // IngestExternalFileOptions::write_global_seqno is true, we will not update
  // the checksum information in the files_to_ingests_ here, since the file is
  // updated with the new global_seqno. After global_seqno is updated, DB will
  // generate the new checksum and store it in the Manifest. In all other cases
  // if ingestion_options_.write_global_seqno == true and
  // verify_file_checksum is false, we only check the checksum function name.
  if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) {
    if (ingestion_options_.verify_file_checksum == false &&
        files_checksums.size() == files_to_ingest_.size() &&
        files_checksum_func_names.size() == files_to_ingest_.size()) {
      // Only when verify_file_checksum == false and the checksum for ingested
      // files are provided, DB will use the provided checksum and does not
      // generate the checksum for ingested files.
      need_generate_file_checksum_ = false;
    } else {
      need_generate_file_checksum_ = true;
    }
    std::vector<std::string> generated_checksums;
    std::vector<std::string> generated_checksum_func_names;
    // Step 1: generate the checksum for ingested sst file.
    if (need_generate_file_checksum_) {
      generated_checksums.reserve(files_to_ingest_.size());
      generated_checksum_func_names.reserve(files_to_ingest_.size());
      for (size_t i = 0; i < files_to_ingest_.size(); i++) {
        std::string generated_checksum;
        std::string generated_checksum_func_name;
        std::string requested_checksum_func_name =
            i < files_checksum_func_names.size() ? files_checksum_func_names[i]
                                                 : "";
        // TODO: rate limit file reads for checksum calculation during file
        // ingestion.
        // TODO: plumb Env::IOActivity
        ReadOptions ro;
        // Pass user-provided checksums through FileOptions when available.
        // The caller may not have provided checksums at all (empty vectors),
        // so we guard with a bounds check.
        FileOptions fopts;
        if (i < files_checksums.size()) {
          fopts.file_checksum = files_checksums[i];
        }
        if (i < files_checksum_func_names.size()) {
          fopts.file_checksum_func_name = files_checksum_func_names[i];
        } else {
          fopts.file_checksum_func_name = kNoFileChecksumFuncName;
        }
        IOStatus io_s = GenerateOneFileChecksum(
            fs_.get(), files_to_ingest_[i].internal_file_path,
            db_options_.file_checksum_gen_factory.get(),
            requested_checksum_func_name, &generated_checksum,
            &generated_checksum_func_name,
            ingestion_options_.verify_checksums_readahead_size,
            db_options_.allow_mmap_reads, io_tracer_,
            db_options_.rate_limiter.get(), ro, db_options_.stats,
            db_options_.clock, fopts);
        if (!io_s.ok()) {
          status = io_s;
          ROCKS_LOG_WARN(db_options_.info_log,
                         "Sst file checksum generation of file: %s failed: %s",
                         files_to_ingest_[i].internal_file_path.c_str(),
                         status.ToString().c_str());
          break;
        }
        if (ingestion_options_.write_global_seqno == false) {
          files_to_ingest_[i].file_checksum = generated_checksum;
          files_to_ingest_[i].file_checksum_func_name =
              generated_checksum_func_name;
        }
        generated_checksums.push_back(generated_checksum);
        generated_checksum_func_names.push_back(generated_checksum_func_name);
      }
    }

    // Step 2: based on the verify_file_checksum and ingested checksum
    // information, do the verification.
    if (status.ok()) {
      if (files_checksums.size() == files_to_ingest_.size() &&
          files_checksum_func_names.size() == files_to_ingest_.size()) {
        // Verify the checksum and checksum function name.
        if (ingestion_options_.verify_file_checksum) {
          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
            if (files_checksum_func_names[i] !=
                generated_checksum_func_names[i]) {
              status = Status::InvalidArgument(
                  "DB file checksum gen factory " +
                  std::string(db_options_.file_checksum_gen_factory->Name()) +
                  " generated checksum function name " +
                  generated_checksum_func_names[i] + " for file " +
                  external_files_paths[i] +
                  " which does not match requested/provided " +
                  files_checksum_func_names[i]);
              break;
            }
            if (files_checksums[i] != generated_checksums[i]) {
              status = Status::Corruption(
                  "Checksum verification mismatch for ingestion file " +
                  external_files_paths[i] + " using function " +
                  generated_checksum_func_names[i] + ". Expected: " +
                  Slice(files_checksums[i]).ToString(/*hex=*/true) +
                  " Computed: " +
                  Slice(generated_checksums[i]).ToString(/*hex=*/true));
              break;
            }
          }
        } else {
          // If verify_file_checksum is not enabled, we only verify the factory
          // recognizes the checksum function name. If it does not match, fail
          // the ingestion. If matches, we trust the ingested checksum
          // information and store in the Manifest.
          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
            FileChecksumGenContext gen_context;
            gen_context.file_name = files_to_ingest_[i].internal_file_path;
            gen_context.requested_checksum_func_name =
                files_checksum_func_names[i];
            auto file_checksum_gen =
                db_options_.file_checksum_gen_factory
                    ->CreateFileChecksumGenerator(gen_context);

            if (file_checksum_gen == nullptr ||
                files_checksum_func_names[i] != file_checksum_gen->Name()) {
              // Note the trailing space after "factory" so that the factory
              // name does not run into the preceding word.
              status = Status::InvalidArgument(
                  "Checksum function name " + files_checksum_func_names[i] +
                  " for file " + external_files_paths[i] +
                  " not recognized by DB checksum gen factory " +
                  db_options_.file_checksum_gen_factory->Name() +
                  (file_checksum_gen ? (" Returned function " +
                                        std::string(file_checksum_gen->Name()))
                                     : ""));
              break;
            }
            files_to_ingest_[i].file_checksum = files_checksums[i];
            files_to_ingest_[i].file_checksum_func_name =
                files_checksum_func_names[i];
          }
        }
      } else if (files_checksums.size() != files_checksum_func_names.size() ||
                 files_checksums.size() != 0) {
        // The checksum or checksum function name vector are not both empty
        // and they are incomplete.
        status = Status::InvalidArgument(
            "The checksum information of ingested sst files are nonempty and "
            "the size of checksums or the size of the checksum function "
            "names does not match with the number of ingested sst files");
      }
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log, "Ingestion failed: %s",
                       status.ToString().c_str());
      }
    }
  }

  if (status.ok()) {
    DivideInputFilesIntoBatches();
  }

  return status;
}
426
427
0
void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() {
428
0
  if (!files_overlap_) {
429
    // No overlap, treat as one batch without the need of tracking overall batch
430
    // range.
431
0
    file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ false);
432
0
    for (auto& file : files_to_ingest_) {
433
0
      file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
434
0
    }
435
0
    return;
436
0
  }
437
438
0
  file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
439
0
  for (auto& file : files_to_ingest_) {
440
0
    if (!file_batches_to_ingest_.back().unset() &&
441
0
        file_range_checker_.Overlaps(file_batches_to_ingest_.back(), file,
442
0
                                     /* known_sorted= */ false)) {
443
0
      file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
444
0
    }
445
0
    file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
446
0
  }
447
0
}
448
449
// Determines whether memtables must be flushed before the ingestion can run.
//
// On an OK check, `*flush_needed` tells whether the ranges being ingested (or
// the atomic replace range, if one is set) overlap with memtable contents.
// Returns a non-OK status if the overlap check itself fails, or
// InvalidArgument if a flush is needed but blocking flush is disallowed or the
// column family uses user-defined timestamps.
Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
                                               SuperVersion* super_version) {
  Status s;
  const bool replacing_whole_cf =
      atomic_replace_range_.has_value() && atomic_replace_range_->unset();

  if (replacing_whole_cf) {
    // For replacing whole CF, we can simply check whether memtable is empty
    *flush_needed = !super_version->mem->IsEmpty();
  } else {
    autovector<UserKeyRange> key_ranges;
    if (!atomic_replace_range_.has_value()) {
      // One user-key range per file to ingest.
      key_ranges.reserve(files_to_ingest_.size());
      for (const IngestedFileInfo& f : files_to_ingest_) {
        key_ranges.emplace_back(f.start_ukey, f.limit_ukey);
      }
    } else {
      assert(!atomic_replace_range_->smallest_internal_key.unset());
      assert(!atomic_replace_range_->largest_internal_key.unset());
      // NOTE: we already checked in Prepare() that the atomic_replace_range
      // covers all the files_to_ingest
      // FIXME: need to make upper bound key exclusive (not easy here because
      // the existing internal APIs deal in inclusive upper bound user keys)
      key_ranges.emplace_back(
          atomic_replace_range_->smallest_internal_key.user_key(),
          atomic_replace_range_->largest_internal_key.user_key());
    }

    s = cfd_->RangesOverlapWithMemtables(key_ranges, super_version,
                                         db_options_.allow_data_in_errors,
                                         flush_needed);
    if (!s.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to check ranges overlap with memtables: %s",
                     s.ToString().c_str());
    }
  }

  if (!s.ok() || !*flush_needed) {
    return s;
  }

  // A flush is required; report why it cannot be performed, if so. The
  // timestamp message is assigned last, so it is the one returned when both
  // conditions hold.
  if (!ingestion_options_.allow_blocking_flush) {
    s = Status::InvalidArgument("External file requires flush");
  }
  if (ucmp_->timestamp_size() > 0) {
    s = Status::InvalidArgument(
        "Column family enables user-defined timestamps, please make "
        "sure the key range (without timestamp) of external file does not "
        "overlap with key range in the memtables.");
  }
  return s;
}
495
496
// REQUIRES: we have become the only writer by entering both write_thread_ and
497
// nonmem_write_thread_
498
0
Status ExternalSstFileIngestionJob::Run() {
499
0
  SuperVersion* super_version = cfd_->GetSuperVersion();
500
  // If column family is flushed after Prepare and before Run, we should have a
501
  // specific state of Memtables. The mutable Memtable should be empty, and the
502
  // immutable Memtable list should be empty.
503
0
  if (flushed_before_run_ && (super_version->imm->NumNotFlushed() != 0 ||
504
0
                              !super_version->mem->IsEmpty())) {
505
0
    return Status::TryAgain(
506
0
        "Inconsistent memtable state detected when flushed before run.");
507
0
  }
508
0
  Status status;
509
#ifndef NDEBUG
510
  // We should never run the job with a memtable that is overlapping
511
  // with the files we are ingesting
512
  bool need_flush = false;
513
  status = NeedsFlush(&need_flush, super_version);
514
  if (!status.ok()) {
515
    ROCKS_LOG_WARN(db_options_.info_log,
516
                   "Failed to check if flush is needed: %s",
517
                   status.ToString().c_str());
518
    return status;
519
  }
520
  if (need_flush) {
521
    return Status::TryAgain("need_flush");
522
  }
523
  assert(status.ok() && need_flush == false);
524
#endif
525
526
0
  bool force_global_seqno = false;
527
528
0
  if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
529
    // We need to assign a global sequence number to all the files even
530
    // if the don't overlap with any ranges since we have snapshots
531
0
    force_global_seqno = true;
532
0
  }
533
  // It is safe to use this instead of LastAllocatedSequence since we are
534
  // the only active writer, and hence they are equal
535
0
  SequenceNumber last_seqno = versions_->LastSequence();
536
0
  edit_.SetColumnFamily(cfd_->GetID());
537
538
0
  if (atomic_replace_range_.has_value()) {
539
0
    auto* vstorage = super_version->current->storage_info();
540
0
    if (atomic_replace_range_->unset()) {
541
0
      if (cfd_->compaction_picker()->IsCompactionInProgress()) {
542
0
        return Status::InvalidArgument(
543
0
            "Atomic replace range (full) overlaps with pending compaction");
544
0
      }
545
0
      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
546
0
        for (auto file : vstorage->LevelFiles(lvl)) {
547
          // Set up to delete file to be replaced
548
0
          edit_.DeleteFile(lvl, file->fd.GetNumber());
549
0
        }
550
0
      }
551
0
    } else {
552
0
      assert(!atomic_replace_range_->smallest_internal_key.unset());
553
0
      assert(!atomic_replace_range_->largest_internal_key.unset());
554
0
      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
555
0
        if (cfd_->RangeOverlapWithCompaction(
556
0
                atomic_replace_range_->smallest_internal_key.user_key(),
557
0
                atomic_replace_range_->largest_internal_key.user_key(), lvl)) {
558
0
          return Status::InvalidArgument(
559
0
              "Atomic replace range overlaps with pending compaction");
560
0
        }
561
0
        for (auto file : vstorage->LevelFiles(lvl)) {
562
0
          if (file_range_checker_.Overlaps(*atomic_replace_range_,
563
0
                                           file->smallest, file->largest)) {
564
0
            if (file_range_checker_.Contains(*atomic_replace_range_,
565
0
                                             file->smallest, file->largest)) {
566
              // Set up to delete file to be replaced
567
0
              edit_.DeleteFile(lvl, file->fd.GetNumber());
568
0
            } else {
569
              // TODO: generate and ingest a tombstone file also
570
0
              return Status::InvalidArgument(
571
0
                  "Atomic replace range partially overlaps with existing file");
572
0
            }
573
0
          }
574
0
        }
575
0
      }
576
0
    }
577
0
  }
578
579
  // Find levels to ingest into
580
0
  std::optional<int> prev_batch_uppermost_level;
581
  // batches at the front of file_batches_to_ingest_ contains older updates and
582
  // are placed in smaller levels.
583
0
  for (auto& batch : file_batches_to_ingest_) {
584
0
    int batch_uppermost_level = 0;
585
0
    status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno,
586
0
                                     &last_seqno, &batch_uppermost_level,
587
0
                                     prev_batch_uppermost_level);
588
0
    if (!status.ok()) {
589
0
      ROCKS_LOG_WARN(db_options_.info_log,
590
0
                     "Failed to assign levels for one batch: %s",
591
0
                     status.ToString().c_str());
592
0
      return status;
593
0
    }
594
595
0
    prev_batch_uppermost_level = batch_uppermost_level;
596
0
  }
597
598
0
  CreateEquivalentFileIngestingCompactions();
599
0
  return status;
600
0
}
601
602
// Assigns a level and (possibly) a global sequence number to every file in
// `batch`, and records each file in edit_.
//
// Parameters:
//   batch                      - the files to process, in order.
//   super_version              - passed to the level/seqno assignment.
//   force_global_seqno         - passed to the level/seqno assignment.
//   last_seqno                 - in/out; raised to the largest sequence
//                                number assigned so far.
//   batch_uppermost_level      - out; the smallest level picked for any file
//                                in this batch (INT_MAX if the batch has no
//                                files).
//   prev_batch_uppermost_level - uppermost level of the previous batch, if
//                                any; passed to the level/seqno assignment.
//
// Returns OK if every file was processed, otherwise the first failure.
Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
    FileBatchInfo& batch, SuperVersion* super_version, bool force_global_seqno,
    SequenceNumber* last_seqno, int* batch_uppermost_level,
    std::optional<int> prev_batch_uppermost_level) {
  Status status;
  assert(batch_uppermost_level);
  *batch_uppermost_level = std::numeric_limits<int>::max();
  for (IngestedFileInfo* file : batch.files) {
    assert(file);
    SequenceNumber assigned_seqno = 0;
    if (ingestion_options_.ingest_behind) {
      status = CheckLevelForIngestedBehindFile(file);
    } else {
      status = AssignLevelAndSeqnoForIngestedFile(
          super_version, force_global_seqno, cfd_->ioptions().compaction_style,
          *last_seqno, file, &assigned_seqno, prev_batch_uppermost_level);
    }
    if (!status.ok()) {
      // Report level-assignment failures under their own message instead of
      // letting them fall through to the key-parsing error log below.
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to assign level for ingested file: %s",
                     status.ToString().c_str());
      return status;
    }

    // Modify the smallest/largest internal key to include the sequence number
    // that we just learned. Only overwrite sequence number zero. There could
    // be a nonzero sequence number already to indicate a range tombstone's
    // exclusive endpoint.
    ParsedInternalKey smallest_parsed, largest_parsed;
    status = ParseInternalKey(*(file->smallest_internal_key.rep()),
                              &smallest_parsed, false /* log_err_key */);
    if (status.ok()) {
      status = ParseInternalKey(*(file->largest_internal_key.rep()),
                                &largest_parsed, false /* log_err_key */);
    }
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log, "Failed to parse internal key: %s",
                     status.ToString().c_str());
      return status;
    }

    // If any ingested file overlaps with the DB, it will fail here.
    if (ingestion_options_.allow_db_generated_files && assigned_seqno != 0) {
      return Status::InvalidArgument(
          "An ingested file overlaps with existing data in the DB and has been "
          "assigned a non-zero sequence number, which is not allowed when "
          "'allow_db_generated_files' is enabled.");
    }

    if (smallest_parsed.sequence == 0 && assigned_seqno != 0) {
      UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno,
                        smallest_parsed.type);
    }
    if (largest_parsed.sequence == 0 && assigned_seqno != 0) {
      UpdateInternalKey(file->largest_internal_key.rep(), assigned_seqno,
                        largest_parsed.type);
    }

    status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno);
    if (!status.ok()) {
      ROCKS_LOG_WARN(
          db_options_.info_log,
          "Failed to assign global sequence number for ingested file: %s",
          status.ToString().c_str());
      return status;
    }
    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
                             &assigned_seqno);
    assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1);
    if (assigned_seqno > *last_seqno) {
      *last_seqno = assigned_seqno;
    }
    max_assigned_seqno_ = std::max(max_assigned_seqno_, assigned_seqno);

    status = GenerateChecksumForIngestedFile(file);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to generate checksum for ingested file: %s",
                     status.ToString().c_str());
      return status;
    }

    // We use the import time as the ancester time. This is the time the data
    // is written to the database.
    int64_t temp_current_time = 0;
    uint64_t current_time = kUnknownFileCreationTime;
    uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
    if (clock_->GetCurrentTime(&temp_current_time).ok()) {
      current_time = oldest_ancester_time =
          static_cast<uint64_t>(temp_current_time);
    }
    uint64_t tail_size = FileMetaData::CalculateTailSize(
        file->fd.GetFileSize(), file->table_properties);

    // A file whose only entry is a single range deletion is marked for
    // compaction.
    bool marked_for_compaction =
        file->table_properties.num_range_deletions == 1 &&
        (file->table_properties.num_entries ==
         file->table_properties.num_range_deletions);
    SequenceNumber smallest_seqno = file->assigned_seqno;
    SequenceNumber largest_seqno = file->assigned_seqno;
    if (ingestion_options_.allow_db_generated_files) {
      // DB-generated files keep the sequence number range recorded in the
      // file rather than an assigned global sequence number.
      assert(file->assigned_seqno == 0);
      assert(file->smallest_seqno != kMaxSequenceNumber);
      assert(file->largest_seqno != kMaxSequenceNumber);
      smallest_seqno = file->smallest_seqno;
      largest_seqno = file->largest_seqno;
      max_assigned_seqno_ = std::max(max_assigned_seqno_, file->largest_seqno);
    }
    FileMetaData f_metadata(
        file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(),
        file->smallest_internal_key, file->largest_internal_key, smallest_seqno,
        largest_seqno, false, file->file_temperature, kInvalidBlobFileNumber,
        oldest_ancester_time, current_time,
        ingestion_options_.ingest_behind
            ? kReservedEpochNumberForFileIngestedBehind
            : cfd_->NewEpochNumber(),  // orders files ingested to L0
        file->file_checksum, file->file_checksum_func_name, file->unique_id, 0,
        tail_size, file->user_defined_timestamps_persisted, "", "");
    f_metadata.temperature = file->file_temperature;
    f_metadata.marked_for_compaction = marked_for_compaction;
    // Extract min/max timestamps from table properties for UDT support.
    // This ensures ingested files have proper timestamp ranges in FileMetaData,
    // similar to files created by flush and compaction.
    ExtractTimestampFromTableProperties(file->table_properties, &f_metadata);
    edit_.AddFile(file->picked_level, f_metadata);

    *batch_uppermost_level =
        std::min(*batch_uppermost_level, file->picked_level);
  }

  return Status::OK();
}
730
731
0
void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
732
  // A map from output level to input of compactions equivalent to this
733
  // ingestion job.
734
  // TODO: simplify below logic to creating compaction per ingested file
735
  // instead of per output level, once we figure out how to treat ingested files
736
  // with adjacent range deletion tombstones to same output level in the same
737
  // job as non-overlapping compactions.
738
0
  std::map<int, CompactionInputFiles>
739
0
      output_level_to_file_ingesting_compaction_input;
740
741
0
  for (const auto& pair : edit_.GetNewFiles()) {
742
0
    int output_level = pair.first;
743
0
    const FileMetaData& f_metadata = pair.second;
744
745
0
    CompactionInputFiles& input =
746
0
        output_level_to_file_ingesting_compaction_input[output_level];
747
0
    if (input.files.empty()) {
748
      // Treat the source level of ingested files to be level 0
749
0
      input.level = 0;
750
0
    }
751
752
0
    compaction_input_metdatas_.push_back(new FileMetaData(f_metadata));
753
0
    input.files.push_back(compaction_input_metdatas_.back());
754
0
  }
755
756
0
  for (const auto& pair : output_level_to_file_ingesting_compaction_input) {
757
0
    int output_level = pair.first;
758
0
    const CompactionInputFiles& input = pair.second;
759
760
0
    const auto& mutable_cf_options = cfd_->GetLatestMutableCFOptions();
761
0
    file_ingesting_compactions_.push_back(new Compaction(
762
0
        cfd_->current()->storage_info(), cfd_->ioptions(), mutable_cf_options,
763
0
        mutable_db_options_, {input}, output_level,
764
        /* output file size limit not applicable */
765
0
        MaxFileSizeForLevel(mutable_cf_options, output_level,
766
0
                            cfd_->ioptions().compaction_style),
767
0
        LLONG_MAX /* max compaction bytes, not applicable */,
768
0
        0 /* output path ID, not applicable */, mutable_cf_options.compression,
769
0
        mutable_cf_options.compression_opts, Temperature::kUnknown,
770
0
        0 /* max_subcompaction, not applicable */,
771
0
        {} /* grandparents, not applicable */,
772
0
        std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
773
0
        CompactionReason::kExternalSstIngestion, "" /* trim_ts */,
774
0
        -1 /* score, not applicable */,
775
0
        files_overlap_ /* l0_files_might_overlap, not applicable */));
776
0
  }
777
0
}
778
779
0
void ExternalSstFileIngestionJob::RegisterRange() {
780
0
  for (const auto& c : file_ingesting_compactions_) {
781
0
    cfd_->compaction_picker()->RegisterCompaction(c);
782
0
  }
783
0
}
784
785
0
void ExternalSstFileIngestionJob::UnregisterRange() {
786
0
  for (const auto& c : file_ingesting_compactions_) {
787
0
    cfd_->compaction_picker()->UnregisterCompaction(c);
788
0
    delete c;
789
0
  }
790
0
  file_ingesting_compactions_.clear();
791
792
0
  for (const auto& f : compaction_input_metdatas_) {
793
0
    delete f;
794
0
  }
795
0
  compaction_input_metdatas_.clear();
796
0
}
797
798
0
void ExternalSstFileIngestionJob::UpdateStats() {
799
  // Update internal stats for new ingested files
800
0
  uint64_t total_keys = 0;
801
0
  uint64_t total_l0_files = 0;
802
0
  uint64_t total_time = clock_->NowMicros() - job_start_time_;
803
804
0
  EventLoggerStream stream = event_logger_->Log();
805
0
  stream << "event" << "ingest_finished";
806
0
  stream << "files_ingested";
807
0
  stream.StartArray();
808
809
0
  for (IngestedFileInfo& f : files_to_ingest_) {
810
0
    InternalStats::CompactionStats stats(
811
0
        CompactionReason::kExternalSstIngestion, 1);
812
0
    stats.micros = total_time;
813
    // If actual copy occurred for this file, then we need to count the file
814
    // size as the actual bytes written. If the file was linked, then we ignore
815
    // the bytes written for file metadata.
816
    // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
817
0
    if (f.copy_file) {
818
0
      stats.bytes_written = f.fd.GetFileSize();
819
0
    } else {
820
0
      stats.bytes_moved = f.fd.GetFileSize();
821
0
    }
822
0
    stats.num_output_files = 1;
823
0
    cfd_->internal_stats()->AddCompactionStats(f.picked_level,
824
0
                                               Env::Priority::USER, stats);
825
0
    cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
826
0
                                       f.fd.GetFileSize());
827
0
    total_keys += f.num_entries;
828
0
    if (f.picked_level == 0) {
829
0
      total_l0_files += 1;
830
0
    }
831
0
    ROCKS_LOG_INFO(
832
0
        db_options_.info_log,
833
0
        "[AddFile] External SST file %s was ingested in L%d with path %s "
834
0
        "(global_seqno=%" PRIu64 ")\n",
835
0
        f.external_file_path.c_str(), f.picked_level,
836
0
        f.internal_file_path.c_str(), f.assigned_seqno);
837
0
    stream << "file" << f.internal_file_path << "level" << f.picked_level;
838
0
  }
839
0
  stream.EndArray();
840
841
0
  stream << "lsm_state";
842
0
  stream.StartArray();
843
0
  auto vstorage = cfd_->current()->storage_info();
844
0
  for (int level = 0; level < vstorage->num_levels(); ++level) {
845
0
    stream << vstorage->NumLevelFiles(level);
846
0
  }
847
0
  stream.EndArray();
848
849
0
  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
850
0
                                     total_keys);
851
0
  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
852
0
                                     files_to_ingest_.size());
853
0
  cfd_->internal_stats()->AddCFStats(
854
0
      InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
855
0
}
856
857
0
void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
858
0
  IOOptions io_opts;
859
0
  if (!status.ok()) {
860
    // We failed to add the files to the database
861
    // remove all the files we copied
862
0
    DeleteInternalFiles();
863
0
    files_overlap_ = false;
864
0
  } else if (status.ok() && ingestion_options_.move_files) {
865
    // The files were moved and added successfully, remove original file links
866
0
    for (IngestedFileInfo& f : files_to_ingest_) {
867
0
      Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
868
0
      if (!s.ok()) {
869
0
        ROCKS_LOG_WARN(
870
0
            db_options_.info_log,
871
0
            "%s was added to DB successfully but failed to remove original "
872
0
            "file link : %s",
873
0
            f.external_file_path.c_str(), s.ToString().c_str());
874
0
      }
875
0
    }
876
0
  }
877
0
}
878
879
0
void ExternalSstFileIngestionJob::DeleteInternalFiles() {
880
0
  IOOptions io_opts;
881
0
  for (IngestedFileInfo& f : files_to_ingest_) {
882
0
    if (f.internal_file_path.empty()) {
883
0
      continue;
884
0
    }
885
0
    Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
886
0
    if (!s.ok()) {
887
0
      ROCKS_LOG_WARN(db_options_.info_log,
888
0
                     "AddFile() clean up for file %s failed : %s",
889
0
                     f.internal_file_path.c_str(), s.ToString().c_str());
890
0
    }
891
0
  }
892
0
}
893
894
// Opens `external_file` and (re)creates `*table_reader` for it.
//
// The file is opened with `file_to_ingest->file_temperature` as the
// temperature hint; if the opened file reports a temperature other than
// kUnknown, `file_to_ingest->file_temperature` is overwritten with it.
// `user_defined_timestamps_persisted` is forwarded to the table reader
// options. Returns the status of opening the file or of creating the reader.
Status ExternalSstFileIngestionJob::ResetTableReader(
    const std::string& external_file, uint64_t new_file_number,
    bool user_defined_timestamps_persisted, SuperVersion* sv,
    IngestedFileInfo* file_to_ingest,
    std::unique_ptr<TableReader>* table_reader) {
  FileOptions file_opts{env_options_};
  file_opts.temperature = file_to_ingest->file_temperature;

  std::unique_ptr<FSRandomAccessFile> raw_file;
  Status open_status =
      fs_->NewRandomAccessFile(external_file, file_opts, &raw_file, nullptr);
  if (!open_status.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "Failed to create random access file for external file %s: %s",
        external_file.c_str(), open_status.ToString().c_str());
    return open_status;
  }

  // The hint may have been missing or wrong. Track the temperature reported
  // by storage whenever it is known (assigning an equal value is a no-op).
  const Temperature reported_temp = raw_file->GetTemperature();
  if (reported_temp != Temperature::kUnknown) {
    file_to_ingest->file_temperature = reported_temp;
  }

  std::unique_ptr<RandomAccessFileReader> file_reader(
      new RandomAccessFileReader(std::move(raw_file), external_file,
                                 nullptr /*Env*/, io_tracer_));
  table_reader->reset();

  ReadOptions read_opts;
  read_opts.fill_cache = ingestion_options_.fill_cache;

  TableReaderOptions reader_opts(
      cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
      sv->mutable_cf_options.compression_manager.get(), env_options_,
      cfd_->internal_comparator(),
      sv->mutable_cf_options.block_protection_bytes_per_key,
      /*skip_filters*/ false, /*immortal*/ false,
      /*force_direct_prefetch*/ false, /*level*/ -1,
      /*block_cache_tracer*/ nullptr,
      /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
      /*cur_file_num*/ new_file_number,
      /* unique_id */ {}, /* largest_seqno */ 0,
      /* tail_size */ 0, user_defined_timestamps_persisted);

  return sv->mutable_cf_options.table_factory->NewTableReader(
      read_opts, reader_opts, std::move(file_reader),
      file_to_ingest->file_size, table_reader,
      // No need to prefetch index/filter if caching is not needed.
      /*prefetch_index_and_filter_in_cache=*/ingestion_options_.fill_cache);
}
942
943
// Validates the table properties of an external file and copies the relevant
// ones into `file_to_ingest`.
//
// Populates `version`, `original_seqno`, `global_seqno_offset` (for versions
// 0 and 2), `cf_id`, `table_properties`, `num_entries`,
// `num_range_deletions` and `user_defined_timestamps_persisted`.
//
// Returns Corruption when a required property is missing or too short to
// decode, InvalidArgument for an unsupported version or option combination,
// or the failure from the user-defined-timestamp validation. If the column
// family uses timestamps but the file turns out not to persist them,
// `*table_reader` is recreated via ResetTableReader() and that status is
// returned.
Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
    const std::string& external_file, uint64_t new_file_number,
    SuperVersion* sv, IngestedFileInfo* file_to_ingest,
    std::unique_ptr<TableReader>* table_reader) {
  // Get the external file properties
  auto props = table_reader->get()->GetTableProperties();
  assert(props.get());
  const auto& uprops = props->user_collected_properties;

  // Get table version
  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
  if (version_iter == uprops.end()) {
    assert(!SstFileWriter::CreatedBySstFileWriter(*props));
    if (!ingestion_options_.allow_db_generated_files) {
      return Status::Corruption("External file version not found");
    } else {
      // 0 is special version for when a file from live DB does not have the
      // version table property
      file_to_ingest->version = 0;
    }
  } else {
    assert(SstFileWriter::CreatedBySstFileWriter(*props));
    // The property value comes from the external file; make sure it holds a
    // full fixed32 before decoding so we never read past the string.
    if (version_iter->second.size() < sizeof(uint32_t)) {
      return Status::Corruption("External file version property is malformed");
    }
    file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
  }

  auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
  if (file_to_ingest->version == 2) {
    // version 2 imply that we have global sequence number
    if (seqno_iter == uprops.end()) {
      return Status::Corruption(
          "External file global sequence number not found");
    }
    // Same bounds concern as for the version property: a fixed64 is decoded.
    if (seqno_iter->second.size() < sizeof(uint64_t)) {
      return Status::Corruption(
          "External file global sequence number property is malformed");
    }

    // Set the global sequence number
    file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
    if (props->external_sst_file_global_seqno_offset == 0) {
      file_to_ingest->global_seqno_offset = 0;
      return Status::Corruption("Was not able to find file global seqno field");
    }
    file_to_ingest->global_seqno_offset =
        static_cast<size_t>(props->external_sst_file_global_seqno_offset);
  } else if (file_to_ingest->version == 1) {
    // SST file V1 should not have global seqno field
    assert(seqno_iter == uprops.end());
    file_to_ingest->original_seqno = 0;
    if (ingestion_options_.allow_blocking_flush ||
        ingestion_options_.allow_global_seqno) {
      return Status::InvalidArgument(
          "External SST file V1 does not support global seqno");
    }
  } else if (file_to_ingest->version == 0) {
    // allow_db_generated_files is true
    assert(seqno_iter == uprops.end());
    file_to_ingest->original_seqno = 0;
    file_to_ingest->global_seqno_offset = 0;
  } else {
    return Status::InvalidArgument("External file version " +
                                   std::to_string(file_to_ingest->version) +
                                   " is not supported");
  }

  file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
  // This assignment works fine even though `table_reader` may later be reset,
  // since that will not affect how table properties are parsed, and this
  // assignment is making a copy.
  file_to_ingest->table_properties = *props;

  // Get number of entries in table
  file_to_ingest->num_entries = props->num_entries;
  file_to_ingest->num_range_deletions = props->num_range_deletions;

  // Validate table properties related to comparator name and user defined
  // timestamps persisted flag.
  file_to_ingest->user_defined_timestamps_persisted =
      static_cast<bool>(props->user_defined_timestamps_persisted);
  bool mark_sst_file_has_no_udt = false;
  Status s = ValidateUserDefinedTimestampsOptions(
      cfd_->user_comparator(), props->comparator_name,
      cfd_->ioptions().persist_user_defined_timestamps,
      file_to_ingest->user_defined_timestamps_persisted,
      &mark_sst_file_has_no_udt);
  if (s.ok() && mark_sst_file_has_no_udt) {
    // A column family that enables user-defined timestamps in Memtable only
    // feature can also ingest external files created by a setting that disables
    // user-defined timestamps. In that case, we need to re-mark the
    // user_defined_timestamps_persisted flag for the file.
    file_to_ingest->user_defined_timestamps_persisted = false;
  } else if (!s.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "ValidateUserDefinedTimestampsOptions failed for external file %s: %s",
        external_file.c_str(), s.ToString().c_str());
    return s;
  }

  // `TableReader` is initialized with `user_defined_timestamps_persisted` flag
  // to be true. If its value changed to false after this sanity check, we
  // need to reset the `TableReader`.
  if (ucmp_->timestamp_size() > 0 &&
      !file_to_ingest->user_defined_timestamps_persisted) {
    s = ResetTableReader(external_file, new_file_number,
                         file_to_ingest->user_defined_timestamps_persisted, sv,
                         file_to_ingest, table_reader);
  }
  return s;
}
1049
1050
// Reads `external_file` and fills `file_to_ingest` with everything gathered
// here: path, size, file descriptor (numbered `new_file_number`), validated
// table properties, smallest/largest internal keys (widened through
// `file_range_checker_` for each range tombstone), the user-key bounds
// `start_ukey`/`limit_ukey`, and the SST unique id.
//
// Unless `allow_db_generated_files` is set, any key or range tombstone with a
// non-zero sequence number is rejected as Corruption. A failure to derive the
// unique id is only logged; the id is then set to `kNullUniqueId64x2`.
//
// Returns the first error encountered, otherwise OK.
Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
    const std::string& external_file, uint64_t new_file_number,
    IngestedFileInfo* file_to_ingest, SuperVersion* sv) {
  file_to_ingest->external_file_path = external_file;

  // Get external file size
  Status status = fs_->GetFileSize(external_file, IOOptions(),
                                   &file_to_ingest->file_size, nullptr);
  if (!status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to get file size for external file %s: %s",
                   external_file.c_str(), status.ToString().c_str());
    return status;
  }

  // Assign FD with number
  file_to_ingest->fd =
      FileDescriptor(new_file_number, 0, file_to_ingest->file_size);

  // Create TableReader for external file
  std::unique_ptr<TableReader> table_reader;
  // Initially create the `TableReader` with flag
  // `user_defined_timestamps_persisted` to be true since that's the most common
  // case
  status = ResetTableReader(external_file, new_file_number,
                            /*user_defined_timestamps_persisted=*/true, sv,
                            file_to_ingest, &table_reader);
  if (!status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to reset table reader for external file %s: %s",
                   external_file.c_str(), status.ToString().c_str());
    return status;
  }

  status = SanityCheckTableProperties(external_file, new_file_number, sv,
                                      file_to_ingest, &table_reader);
  if (!status.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "Failed to sanity check table properties for external file %s: %s",
        external_file.c_str(), status.ToString().c_str());
    return status;
  }

  const bool allow_data_in_errors = db_options_.allow_data_in_errors;
  ParsedInternalKey key;
  if (ingestion_options_.allow_db_generated_files) {
    // We are ingesting a DB generated SST file for which we don't reassign
    // sequence numbers. We need its smallest sequence number and largest
    // sequence number for FileMetaData.
    Status seqno_status = GetSeqnoBoundaryForFile(
        table_reader.get(), sv, file_to_ingest, allow_data_in_errors);

    if (!seqno_status.ok()) {
      ROCKS_LOG_WARN(
          db_options_.info_log,
          "Failed to get sequence number boundary for external file %s: %s",
          external_file.c_str(), seqno_status.ToString().c_str());
      return seqno_status;
    }
    assert(file_to_ingest->smallest_seqno <= file_to_ingest->largest_seqno);
    assert(file_to_ingest->largest_seqno < kMaxSequenceNumber);
  } else {
    SequenceNumber largest_seqno =
        table_reader.get()->GetTableProperties()->key_largest_seqno;
    // UINT64_MAX means unknown and the file is generated before table property
    // `key_largest_seqno` is introduced.
    if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
      return Status::Corruption(
          "External file has non zero largest sequence number " +
          std::to_string(largest_seqno));
    }
  }

  if (ingestion_options_.verify_checksums_before_ingest) {
    // If customized readahead size is needed, we can pass a user option
    // all the way to here. Right now we just rely on the default readahead
    // to keep things simple.
    // TODO: plumb Env::IOActivity, Env::IOPriority
    ReadOptions ro;
    ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
    ro.fill_cache = ingestion_options_.fill_cache;
    status = table_reader->VerifyChecksum(
        ro, TableReaderCaller::kExternalSSTIngestion);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to verify checksum for table reader: %s",
                     status.ToString().c_str());
      return status;
    }
  }

  // TODO: plumb Env::IOActivity, Env::IOPriority
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));

  // Get first (smallest) and last (largest) key from file.
  iter->SeekToFirst();
  if (iter->Valid()) {
    Status pik_status =
        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
    if (!pik_status.ok()) {
      return Status::Corruption("Corrupted key in external file. ",
                                pik_status.getState());
    }
    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
      return Status::Corruption("External file has non zero sequence number");
    }
    file_to_ingest->smallest_internal_key.SetFrom(key);

    Slice largest;
    // Owns the bytes of `largest` on the PlainTable path. The slice returned
    // by `iter->key()` must not be relied upon after the iterator is advanced,
    // so the running maximum is copied instead of kept as a borrowed Slice.
    std::string largest_storage;
    if (strcmp(sv->mutable_cf_options.table_factory->Name(), "PlainTable") ==
        0) {
      // PlainTable iterator does not support SeekToLast().
      const Slice first_key = iter->key();
      largest_storage.assign(first_key.data(), first_key.size());
      for (; iter->Valid(); iter->Next()) {
        const Slice current_key = iter->key();
        if (cfd_->internal_comparator().Compare(current_key,
                                                Slice(largest_storage)) > 0) {
          largest_storage.assign(current_key.data(), current_key.size());
        }
      }
      if (!iter->status().ok()) {
        return iter->status();
      }
      largest = Slice(largest_storage);
    } else {
      iter->SeekToLast();
      if (!iter->Valid()) {
        if (iter->status().ok()) {
          // The file contains at least 1 key since iter is valid after
          // SeekToFirst().
          return Status::Corruption("Can not find largest key in sst file");
        } else {
          return iter->status();
        }
      }
      // The iterator is not moved again before `largest` is consumed below.
      largest = iter->key();
    }

    pik_status = ParseInternalKey(largest, &key, allow_data_in_errors);
    if (!pik_status.ok()) {
      return Status::Corruption("Corrupted key in external file. ",
                                pik_status.getState());
    }
    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
      return Status::Corruption("External file has non zero sequence number");
    }
    file_to_ingest->largest_internal_key.SetFrom(key);
  } else if (!iter->status().ok()) {
    return iter->status();
  }

  std::unique_ptr<InternalIterator> range_del_iter(
      table_reader->NewRangeTombstoneIterator(ro));
  // We may need to adjust these key bounds, depending on whether any range
  // deletion tombstones extend past them.
  if (range_del_iter != nullptr) {
    for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
         range_del_iter->Next()) {
      Status pik_status =
          ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
      if (!pik_status.ok()) {
        return Status::Corruption("Corrupted key in external file. ",
                                  pik_status.getState());
      }
      if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
        return Status::Corruption(
            "External file has a range deletion with non zero sequence "
            "number.");
      }
      RangeTombstone tombstone(key, range_del_iter->value());
      file_range_checker_.MaybeUpdateRange(tombstone.SerializeKey(),
                                           tombstone.SerializeEndKey(),
                                           file_to_ingest);
    }
  }

  // Derive the user-key bounds. With user-defined timestamps enabled, the
  // start bound gets the max timestamp and the limit bound the min timestamp.
  const size_t ts_sz = ucmp_->timestamp_size();
  Slice smallest = file_to_ingest->smallest_internal_key.user_key();
  Slice largest = file_to_ingest->largest_internal_key.user_key();
  if (ts_sz > 0) {
    AppendUserKeyWithMaxTimestamp(&file_to_ingest->start_ukey, smallest, ts_sz);
    AppendUserKeyWithMinTimestamp(&file_to_ingest->limit_ukey, largest, ts_sz);
  } else {
    file_to_ingest->start_ukey.assign(smallest.data(), smallest.size());
    file_to_ingest->limit_ukey.assign(largest.data(), largest.size());
  }

  auto s =
      GetSstInternalUniqueId(file_to_ingest->table_properties.db_id,
                             file_to_ingest->table_properties.db_session_id,
                             file_to_ingest->table_properties.orig_file_number,
                             &(file_to_ingest->unique_id));
  if (!s.ok()) {
    // `internal_file_path` is not populated by this function, so identify the
    // file by its external path and include the reason for the failure.
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to get SST unique id for file %s: %s",
                   external_file.c_str(), s.ToString().c_str());
    file_to_ingest->unique_id = kNullUniqueId64x2;
  }

  return status;
}
1253
1254
// Chooses the level for `file_to_ingest` (stored in `picked_level`) and the
// global sequence number it needs (stored in `*assigned_seqno`; 0 means none).
//
// `*assigned_seqno` becomes `last_seqno + 1` when `force_global_seqno` is set,
// when the file is not DB-generated and either `files_overlap_` holds or the
// file must go to L0, or when the file overlaps existing data or the output
// of a running compaction.
// The file must go to L0 when `prev_batch_uppermost_level` is 0 or
// `compaction_style` is FIFO. Otherwise, if `prev_batch_uppermost_level` is
// set, only levels above it are candidates.
//
// Returns TryAgain when `fail_if_not_bottommost_level` is set and the file
// cannot be placed in the last level, InvalidArgument when the column family
// uses user-defined timestamps and the file overlaps the DB, or the error
// from the per-level overlap check.
Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
    SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
    SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
    SequenceNumber* assigned_seqno,
    std::optional<int> prev_batch_uppermost_level) {
  Status status;
  *assigned_seqno = 0;
  const size_t ts_sz = ucmp_->timestamp_size();
  assert(!prev_batch_uppermost_level.has_value() ||
         prev_batch_uppermost_level.value() < cfd_->NumberLevels());
  bool must_assign_to_l0 = (prev_batch_uppermost_level.has_value() &&
                            prev_batch_uppermost_level.value() == 0) ||
                           compaction_style == kCompactionStyleFIFO;

  if (force_global_seqno || (!ingestion_options_.allow_db_generated_files &&
                             (files_overlap_ || must_assign_to_l0))) {
    *assigned_seqno = last_seqno + 1;
    if (must_assign_to_l0) {
      assert(ts_sz == 0);
      file_to_ingest->picked_level = 0;
      if (ingestion_options_.fail_if_not_bottommost_level &&
          cfd_->NumberLevels() > 1) {
        status = Status::TryAgain(
            "Files cannot be ingested to Lmax. Please make sure key range of "
            "Lmax does not overlap with files to ingest.");
      }
      return status;
    }
  }

  bool overlap_with_db = false;
  // TODO: plumb Env::IOActivity, Env::IOPriority
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  ro.total_order_seek = true;
  int target_level = 0;
  auto* vstorage = cfd_->current()->storage_info();
  assert(!must_assign_to_l0 || ingestion_options_.allow_db_generated_files);
  // Levels in [0, assigned_level_exclusive_end) are candidates for placement.
  int assigned_level_exclusive_end = cfd_->NumberLevels();
  if (must_assign_to_l0) {
    assigned_level_exclusive_end = 0;
  } else if (prev_batch_uppermost_level.has_value()) {
    assigned_level_exclusive_end = prev_batch_uppermost_level.value();
  }

  // When ingesting db generated files, we require that ingested files do not
  // overlap with any file in the DB. So we need to check all levels.
  int overlap_checking_exclusive_end =
      ingestion_options_.allow_db_generated_files
          ? cfd_->NumberLevels()
          : assigned_level_exclusive_end;
  for (int lvl = 0; lvl < overlap_checking_exclusive_end; lvl++) {
    // Levels strictly between L0 and the base level are skipped.
    if (lvl > 0 && lvl < vstorage->base_level()) {
      continue;
    }
    // With an atomic replace range, a candidate level is accepted without
    // running the overlap and fit checks below.
    if (lvl < assigned_level_exclusive_end &&
        atomic_replace_range_.has_value()) {
      target_level = lvl;
      continue;
    }
    if (cfd_->RangeOverlapWithCompaction(file_to_ingest->start_ukey,
                                         file_to_ingest->limit_ukey, lvl)) {
      // We must use L0 or any level higher than `lvl` to be able to overwrite
      // the compaction output keys that we overlap with in this level, We also
      // need to assign this file a seqno to overwrite the compaction output
      // keys in level `lvl`
      overlap_with_db = true;
      break;
    } else if (vstorage->NumLevelFiles(lvl) > 0) {
      bool overlap_with_level = false;
      status = sv->current->OverlapWithLevelIterator(
          ro, env_options_, file_to_ingest->start_ukey,
          file_to_ingest->limit_ukey, lvl, &overlap_with_level);
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to check overlap with level iterator: %s",
                       status.ToString().c_str());
        return status;
      }
      if (overlap_with_level) {
        // We must use L0 or any level higher than `lvl` to be able to overwrite
        // the keys that we overlap with in this level, We also need to assign
        // this file a seqno to overwrite the existing keys in level `lvl`
        overlap_with_db = true;
        break;
      }
    }

    // We don't overlap with any keys in this level, but we still need to check
    // if our file can fit in it
    if (lvl < assigned_level_exclusive_end &&
        IngestedFileFitInLevel(file_to_ingest, lvl)) {
      target_level = lvl;
    }
  }

  if (ingestion_options_.fail_if_not_bottommost_level &&
      target_level < cfd_->NumberLevels() - 1) {
    status = Status::TryAgain(
        "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
        "and ongoing compaction's output to Lmax does not overlap with files "
        "to ingest. Input files overlapping with each other can cause some "
        "file to be assigned to non Lmax level.");
    return status;
  }

  TEST_SYNC_POINT_CALLBACK(
      "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
      &overlap_with_db);
  file_to_ingest->picked_level = target_level;
  if (overlap_with_db) {
    if (ts_sz > 0) {
      status = Status::InvalidArgument(
          "Column family enables user-defined timestamps, please make sure the "
          "key range (without timestamp) of external file does not overlap "
          "with key range (without timestamp) in the db");
      return status;
    }
    // Keep a seqno that was already assigned above; otherwise assign one now.
    if (*assigned_seqno == 0) {
      *assigned_seqno = last_seqno + 1;
    }
  }

  return status;
}
1380
1381
// Validates that `file_to_ingest` can be ingested behind existing data and, if
// so, sets its `picked_level` to the last level.
//
// Returns InvalidArgument when the file does not fit in the last level, or
// when any file in a level above the last one has smallest seqno 0.
Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
    IngestedFileInfo* file_to_ingest) {
  assert(!atomic_replace_range_.has_value());

  auto* storage = cfd_->current()->storage_info();

  // First, check if new files fit in the last level
  const int bottom_level = cfd_->NumberLevels() - 1;
  if (!IngestedFileFitInLevel(file_to_ingest, bottom_level)) {
    return Status::InvalidArgument(
        "Can't ingest_behind file as it doesn't fit "
        "at the last level!");
  }

  // Second, check if despite cf_allow_ingest_behind=true we still have 0
  // seqnums at some upper level
  for (int upper = 0; upper < bottom_level; ++upper) {
    const auto& level_files = storage->LevelFiles(upper);
    const bool has_zero_seqno = std::any_of(
        level_files.begin(), level_files.end(),
        [](const auto& f) { return f->fd.smallest_seqno == 0; });
    if (has_zero_seqno) {
      return Status::InvalidArgument(
          "Can't ingest_behind file as despite cf_allow_ingest_behind=true "
          "there are files with 0 seqno in database at upper levels!");
    }
  }

  file_to_ingest->picked_level = bottom_level;
  return Status::OK();
}
1409
1410
// Assigns `seqno` as the global sequence number of `file_to_ingest`.
//
// Nothing is done if the file's original seqno already equals `seqno`.
// Otherwise a global seqno must be allowed by the ingestion options. When
// `write_global_seqno` is set, the value is additionally written into the
// file at `global_seqno_offset` and the file is synced; a file system that
// reports NotSupported for random read/write files skips that write.
// On success `file_to_ingest->assigned_seqno` is set to `seqno`.
Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
    IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
  if (ingestion_options_.allow_db_generated_files) {
    assert(seqno == 0);
    assert(file_to_ingest->original_seqno == 0);
  }

  // The file already carries the requested global seqno.
  if (file_to_ingest->original_seqno == seqno) {
    return Status::OK();
  }
  if (!ingestion_options_.allow_global_seqno) {
    return Status::InvalidArgument("Global seqno is required, but disabled");
  }

  const bool persist_seqno = ingestion_options_.write_global_seqno;
  if (persist_seqno && file_to_ingest->global_seqno_offset == 0) {
    return Status::InvalidArgument(
        "Trying to set global seqno for a file that don't have a global seqno "
        "field");
  }

  if (persist_seqno) {
    // Whether global_seqno can be written at an offset of the file depends on
    // the file system supporting random writes; if it does not, the write is
    // skipped.
    std::unique_ptr<FSRandomRWFile> rw_file;
    Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path,
                                         env_options_, &rw_file, nullptr);
    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile",
                             &status);
    if (!status.ok()) {
      if (!status.IsNotSupported()) {
        ROCKS_LOG_WARN(
            db_options_.info_log,
            "Failed to open ingested file %s for random read/write: %s",
            file_to_ingest->internal_file_path.c_str(),
            status.ToString().c_str());
        return status;
      }
      // NotSupported: fall through without touching the file.
    } else {
      FSRandomRWFilePtr fsptr(std::move(rw_file), io_tracer_,
                              file_to_ingest->internal_file_path);
      std::string encoded_seqno;
      PutFixed64(&encoded_seqno, seqno);
      status = fsptr->Write(file_to_ingest->global_seqno_offset, encoded_seqno,
                            IOOptions(), nullptr);
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to write global seqno to %s: %s",
                       file_to_ingest->internal_file_path.c_str(),
                       status.ToString().c_str());
        return status;
      }

      TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
      status = SyncIngestedFile(fsptr.get());
      TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to sync ingested file %s after writing global "
                       "sequence number: %s",
                       file_to_ingest->internal_file_path.c_str(),
                       status.ToString().c_str());
        return status;
      }
    }
  }

  file_to_ingest->assigned_seqno = seqno;
  return Status::OK();
}
1480
1481
// Computes the checksum of the ingested file at
// `file_to_ingest->internal_file_path` and stores the checksum and the
// checksum function name in `file_to_ingest`.
//
// The step is skipped (returning OK) when no checksum generator factory is
// configured, when checksum generation is not needed, or when
// `write_global_seqno` is false.
IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
    IngestedFileInfo* file_to_ingest) {
  // Without file_checksum_gen_factory the checksum cannot be generated. With
  // write_global_seqno disabled, the file checksum generated during Prepare()
  // is used instead, so this step is skipped.
  const bool has_generator = db_options_.file_checksum_gen_factory != nullptr;
  if (!has_generator || !need_generate_file_checksum_ ||
      !ingestion_options_.write_global_seqno) {
    return IOStatus::OK();
  }

  std::string checksum;
  std::string checksum_func_name;
  std::string requested_func_name;
  // TODO: rate limit file reads for checksum calculation during file ingestion.
  // TODO: plumb Env::IOActivity
  ReadOptions read_opts;
  FileOptions checksum_fopts;
  checksum_fopts.file_checksum_func_name = kNoFileChecksumFuncName;

  IOStatus io_s = GenerateOneFileChecksum(
      fs_.get(), file_to_ingest->internal_file_path,
      db_options_.file_checksum_gen_factory.get(), requested_func_name,
      &checksum, &checksum_func_name,
      ingestion_options_.verify_checksums_readahead_size,
      db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
      read_opts, db_options_.stats, db_options_.clock, checksum_fopts);
  if (io_s.ok()) {
    file_to_ingest->file_checksum = std::move(checksum);
    file_to_ingest->file_checksum_func_name = std::move(checksum_func_name);
    return IOStatus::OK();
  }

  ROCKS_LOG_WARN(
      db_options_.info_log, "Failed to generate checksum for %s: %s",
      file_to_ingest->internal_file_path.c_str(), io_s.ToString().c_str());
  return io_s;
}
1516
1517
bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
1518
0
    const IngestedFileInfo* file_to_ingest, int level) {
1519
0
  if (level == 0) {
1520
    // Files can always fit in L0
1521
0
    return true;
1522
0
  }
1523
1524
0
  auto* vstorage = cfd_->current()->storage_info();
1525
0
  Slice file_smallest_user_key(file_to_ingest->start_ukey);
1526
0
  Slice file_largest_user_key(file_to_ingest->limit_ukey);
1527
1528
0
  if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
1529
0
                               &file_largest_user_key)) {
1530
    // File overlap with another files in this level, we cannot
1531
    // add it to this level
1532
0
    return false;
1533
0
  }
1534
1535
  // File did not overlap with level files, nor compaction output
1536
0
  return true;
1537
0
}
1538
1539
template <typename TWritableFile>
1540
0
Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
1541
0
  assert(file != nullptr);
1542
0
  if (db_options_.use_fsync) {
1543
0
    return file->Fsync(IOOptions(), nullptr);
1544
0
  } else {
1545
0
    return file->Sync(IOOptions(), nullptr);
1546
0
  }
1547
0
}
Unexecuted instantiation: rocksdb::Status rocksdb::ExternalSstFileIngestionJob::SyncIngestedFile<rocksdb::FSWritableFile>(rocksdb::FSWritableFile*)
Unexecuted instantiation: rocksdb::Status rocksdb::ExternalSstFileIngestionJob::SyncIngestedFile<rocksdb::FSRandomRWFile>(rocksdb::FSRandomRWFile*)
1548
1549
// Determines the smallest and largest sequence numbers stored in the file
// behind `table_reader` and records them in `file_to_ingest->smallest_seqno`
// and `file_to_ingest->largest_seqno`.
//
// Fast path: if the table properties record the largest seqno and it is 0,
// both bounds are 0; if the properties record both bounds, they are used
// directly. Otherwise every point key and (if present) every range tombstone
// is scanned to compute the bounds.
//
// Returns:
//   - OK on success.
//   - Corruption if an internal key in the file cannot be parsed.
//   - The iterator's status if iteration fails.
//   - InvalidArgument if a bound is still kMaxSequenceNumber after the scan
//     (e.g. no key was seen, so the smallest seqno was never updated).
Status ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile(
    TableReader* table_reader, SuperVersion* sv,
    IngestedFileInfo* file_to_ingest, bool allow_data_in_errors) {
  const auto tp = table_reader->GetTableProperties();
  const bool has_largest_seqno = tp->HasKeyLargestSeqno();
  SequenceNumber largest_seqno = tp->key_largest_seqno;
  if (has_largest_seqno) {
    file_to_ingest->largest_seqno = largest_seqno;
    if (largest_seqno == 0) {
      // Largest seqno is 0, so the smallest must be 0 as well.
      file_to_ingest->smallest_seqno = 0;
      return Status::OK();
    }
    if (tp->HasKeySmallestSeqno()) {
      file_to_ingest->smallest_seqno = tp->key_smallest_seqno;
      return Status::OK();
    }
  }

  // For older SST files they may not be recorded in table properties, so
  // we scan the file to find out.
  TEST_SYNC_POINT(
      "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan");
  SequenceNumber smallest_seqno = kMaxSequenceNumber;
  SequenceNumber largest_seqno_from_iter = 0;
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
  ParsedInternalKey key;

  // Pass 1: point keys.
  iter->SeekToFirst();
  while (iter->Valid()) {
    Status pik_status =
        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
    if (!pik_status.ok()) {
      return Status::Corruption("Corrupted key in external file. ",
                                pik_status.getState());
    }
    smallest_seqno = std::min(smallest_seqno, key.sequence);
    largest_seqno_from_iter = std::max(largest_seqno_from_iter, key.sequence);
    iter->Next();
  }
  if (!iter->status().ok()) {
    return iter->status();
  }

  // Pass 2: range tombstones, only when the properties report any.
  if (tp->num_range_deletions > 0) {
    std::unique_ptr<InternalIterator> range_del_iter(
        table_reader->NewRangeTombstoneIterator(ro));
    if (range_del_iter != nullptr) {
      for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
           range_del_iter->Next()) {
        Status pik_status =
            ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
        if (!pik_status.ok()) {
          return Status::Corruption("Corrupted key in external file. ",
                                    pik_status.getState());
        }
        smallest_seqno = std::min(smallest_seqno, key.sequence);
        largest_seqno_from_iter =
            std::max(largest_seqno_from_iter, key.sequence);
      }
      if (!range_del_iter->status().ok()) {
        return range_del_iter->status();
      }
    }
  }

  file_to_ingest->smallest_seqno = smallest_seqno;
  if (!has_largest_seqno) {
    file_to_ingest->largest_seqno = largest_seqno_from_iter;
  } else {
    // The recorded property must agree with what the scan observed.
    assert(largest_seqno == largest_seqno_from_iter);
    file_to_ingest->largest_seqno = largest_seqno;
  }

  // Each message names the bound that is actually unknown (the original had
  // the two messages swapped).
  if (file_to_ingest->largest_seqno == kMaxSequenceNumber) {
    return Status::InvalidArgument(
        "Unknown largest seqno for db generated file.");
  }
  if (file_to_ingest->smallest_seqno == kMaxSequenceNumber) {
    return Status::InvalidArgument(
        "Unknown smallest seqno for db generated file.");
  }
  return Status::OK();
}
1635
1636
}  // namespace ROCKSDB_NAMESPACE