/src/rocksdb/db/external_sst_file_ingestion_job.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | |
6 | | #include "db/external_sst_file_ingestion_job.h" |
7 | | |
8 | | #include <algorithm> |
9 | | #include <cinttypes> |
10 | | #include <string> |
11 | | #include <unordered_set> |
12 | | #include <vector> |
13 | | |
14 | | #include "db/builder.h" |
15 | | #include "db/db_impl/db_impl.h" |
16 | | #include "db/version_edit.h" |
17 | | #include "file/file_util.h" |
18 | | #include "file/random_access_file_reader.h" |
19 | | #include "logging/logging.h" |
20 | | #include "table/merging_iterator.h" |
21 | | #include "table/sst_file_writer_collectors.h" |
22 | | #include "table/table_builder.h" |
23 | | #include "table/unique_id_impl.h" |
24 | | #include "test_util/sync_point.h" |
25 | | #include "util/udt_util.h" |
26 | | |
27 | | namespace ROCKSDB_NAMESPACE { |
28 | | |
// Prepares external SST files for ingestion, before any LSM-tree mutation:
//   1) Reads per-file metadata (via GetIngestedFileInfo) and validates each
//      file (column family id, non-emptiness, key validity).
//   2) Detects key-range overlap among the files and validates the optional
//      atomic_replace_range against them.
//   3) Brings each file into the DB directory (hard link when
//      move_files/link_files, otherwise copy) and syncs files + directories.
//   4) Generates and/or verifies file checksums per the ingestion options.
//   5) Divides the files into ingestion batches for Run().
// Returns non-OK on the first validation or I/O failure; on success the
// files_to_ingest_ / file_batches_to_ingest_ members are populated.
Status ExternalSstFileIngestionJob::Prepare(
    const std::vector<std::string>& external_files_paths,
    const std::vector<std::string>& files_checksums,
    const std::vector<std::string>& files_checksum_func_names,
    const std::optional<RangeOpt>& atomic_replace_range,
    const Temperature& file_temperature, uint64_t next_file_number,
    SuperVersion* sv) {
  Status status;

  // Read the information of files we are ingesting
  for (const std::string& file_path : external_files_paths) {
    IngestedFileInfo file_to_ingest;
    // For temperature, first assume it matches provided hint
    file_to_ingest.file_temperature = file_temperature;
    status =
        GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to get ingested file info: %s: %s",
                     file_path.c_str(), status.ToString().c_str());
      return status;
    }

    // Files generated in another DB or CF may have a different column family
    // ID, so we let it pass here.
    if (file_to_ingest.cf_id !=
            TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
        file_to_ingest.cf_id != cfd_->GetID() &&
        !ingestion_options_.allow_db_generated_files) {
      return Status::InvalidArgument(
          "External file column family id don't match");
    }

    // A file with neither point entries nor range deletions has nothing to
    // ingest and is rejected.
    if (file_to_ingest.num_entries == 0 &&
        file_to_ingest.num_range_deletions == 0) {
      return Status::InvalidArgument("File contain no entries");
    }

    if (!file_to_ingest.smallest_internal_key.Valid() ||
        !file_to_ingest.largest_internal_key.Valid()) {
      return Status::Corruption("Generated table have corrupted keys");
    }

    files_to_ingest_.emplace_back(std::move(file_to_ingest));
  }

  auto num_files = files_to_ingest_.size();
  if (num_files == 0) {
    return Status::InvalidArgument("The list of files is empty");
  } else if (num_files > 1) {
    // Verify that passed files don't have overlapping ranges. Sort by range
    // (file_range_checker_ doubles as the comparator) so overlap detection
    // only needs to look at adjacent pairs.
    autovector<const IngestedFileInfo*> sorted_files;
    for (size_t i = 0; i < num_files; i++) {
      sorted_files.push_back(&files_to_ingest_[i]);
    }

    std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_);

    for (size_t i = 0; i + 1 < num_files; i++) {
      if (file_range_checker_.Overlaps(*sorted_files[i], *sorted_files[i + 1],
                                       /* known_sorted= */ true)) {
        files_overlap_ = true;
        break;
      }
    }
  }

  if (atomic_replace_range.has_value()) {
    atomic_replace_range_.emplace();

    if (atomic_replace_range->start && atomic_replace_range->limit) {
      // User keys to internal keys (with timestamps)
      const size_t ts_sz = ucmp_->timestamp_size();
      std::string start_with_ts, limit_with_ts;
      auto [start, limit] = MaybeAddTimestampsToRange(
          atomic_replace_range->start, atomic_replace_range->limit, ts_sz,
          &start_with_ts, &limit_with_ts);
      assert(start.has_value());
      assert(limit.has_value());
      // kMaxSequenceNumber + kValueTypeForSeek make these bounds sort before
      // any real entry on the same user key (standard "seek key" encoding).
      atomic_replace_range_->smallest_internal_key.Set(
          *start, kMaxSequenceNumber, kValueTypeForSeek);
      atomic_replace_range_->largest_internal_key.Set(
          *limit, kMaxSequenceNumber, kValueTypeForSeek);
      // Check files to ingest against replace range
      for (size_t i = 0; i < num_files; i++) {
        if (!file_range_checker_.Contains(*atomic_replace_range_,
                                          files_to_ingest_[i])) {
          return Status::InvalidArgument(
              "Atomic replace range does not contain all files");
        }
      }
    } else {
      // Currently if either bound is not present, both must be
      assert(atomic_replace_range->start.has_value() == false);
      assert(atomic_replace_range->limit.has_value() == false);
      assert(atomic_replace_range_->smallest_internal_key.unset());
      assert(atomic_replace_range_->largest_internal_key.unset());
    }
  }

  if (files_overlap_) {
    if (ingestion_options_.ingest_behind) {
      return Status::NotSupported(
          "Files with overlapping ranges cannot be ingested with ingestion "
          "behind mode.");
    }

    // Overlapping files need at least two different sequence numbers. If
    // settings disables global seqno, ingestion will fail anyway, so fail
    // fast in prepare.
    if (!ingestion_options_.allow_global_seqno &&
        !ingestion_options_.allow_db_generated_files) {
      return Status::InvalidArgument(
          "Global seqno is required, but disabled (because external files key "
          "range overlaps).");
    }

    if (ucmp_->timestamp_size() > 0) {
      return Status::NotSupported(
          "Files with overlapping ranges cannot be ingested to column "
          "family with user-defined timestamp enabled.");
    }
  }

  // Copy/Move external files into DB
  std::unordered_set<size_t> ingestion_path_ids;
  for (IngestedFileInfo& f : files_to_ingest_) {
    f.copy_file = false;
    const std::string path_outside_db = f.external_file_path;
    const std::string path_inside_db = TableFileName(
        cfd_->ioptions().cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
    if (ingestion_options_.move_files || ingestion_options_.link_files) {
      status =
          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
      if (status.ok()) {
        // It is unsafe to assume application had sync the file and file
        // directory before ingest the file. For integrity of RocksDB we need
        // to sync the file.

        // TODO(xingbo), We should in general be moving away from production
        // uses of ReuseWritableFile (except explicitly for WAL recycling),
        // ReopenWritableFile, and NewRandomRWFile. We should create a
        // FileSystem::SyncFile/FsyncFile API that by default does the
        // re-open+sync+close combo but can (a) be reused easily, and (b) be
        // overridden to do that more cleanly, e.g. in EncryptedEnv.
        // https://github.com/facebook/rocksdb/issues/13741
        std::unique_ptr<FSWritableFile> file_to_sync;
        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
                                           &file_to_sync, nullptr);
        TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
                                 &s);
        // Some file systems (especially remote/distributed) don't support
        // reopening a file for writing and don't require reopening and
        // syncing the file. Ignore the NotSupported error in that case.
        if (!s.IsNotSupported()) {
          status = s;
          if (status.ok()) {
            TEST_SYNC_POINT(
                "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
            status = SyncIngestedFile(file_to_sync.get());
            TEST_SYNC_POINT(
                "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
            if (!status.ok()) {
              ROCKS_LOG_WARN(db_options_.info_log,
                             "Failed to sync ingested file %s: %s",
                             path_inside_db.c_str(), status.ToString().c_str());
            }
          }
        }
      } else if (status.IsNotSupported() &&
                 ingestion_options_.failed_move_fall_back_to_copy) {
        // Original file is on a different FS, use copy instead of hard linking.
        f.copy_file = true;
        ROCKS_LOG_INFO(db_options_.info_log,
                       "Tried to link file %s but it's not supported : %s",
                       path_outside_db.c_str(), status.ToString().c_str());
      } else {
        ROCKS_LOG_WARN(db_options_.info_log, "Failed to link file %s to %s: %s",
                       path_outside_db.c_str(), path_inside_db.c_str(),
                       status.ToString().c_str());
      }
    } else {
      f.copy_file = true;
    }

    if (f.copy_file) {
      TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
                               nullptr);
      // Always determining the destination temperature from the ingested-to
      // level would be difficult because in general we only find out the level
      // ingested to later, during Run().
      // However, we can guarantee "last level" temperature for when the user
      // requires ingestion to the last level.
      Temperature dst_temp =
          (ingestion_options_.ingest_behind ||
           ingestion_options_.fail_if_not_bottommost_level)
              ? sv->mutable_cf_options.last_level_temperature
              : sv->mutable_cf_options.default_write_temperature;
      // Note: CopyFile also syncs the new file.
      status = CopyFile(fs_.get(), path_outside_db, f.file_temperature,
                        path_inside_db, dst_temp, 0, db_options_.use_fsync,
                        io_tracer_);
      // The destination of the copy will be ingested
      f.file_temperature = dst_temp;

      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log, "Failed to copy file %s to %s: %s",
                       path_outside_db.c_str(), path_inside_db.c_str(),
                       status.ToString().c_str());
      }
    } else {
      // Note: we currently assume that linking files does not cross
      // temperatures, so no need to change f.file_temperature
    }
    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
    if (!status.ok()) {
      break;
    }
    f.internal_file_path = path_inside_db;
    // Initialize the checksum information of ingested files.
    f.file_checksum = kUnknownFileChecksum;
    f.file_checksum_func_name = kUnknownFileChecksumFuncName;
    ingestion_path_ids.insert(f.fd.GetPathId());
  }

  // Sync every data directory that received a new file so the links/copies
  // are durable before ingestion proceeds.
  TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
  if (status.ok()) {
    for (auto path_id : ingestion_path_ids) {
      status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
          IOOptions(), nullptr,
          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to sync directory %" ROCKSDB_PRIszt
                       " while ingest file: %s",
                       path_id, status.ToString().c_str());
        break;
      }
    }
  }
  TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");

  // Generate and check the sst file checksum. Note that, if
  // IngestExternalFileOptions::write_global_seqno is true, we will not update
  // the checksum information in the files_to_ingests_ here, since the file is
  // updated with the new global_seqno. After global_seqno is updated, DB will
  // generate the new checksum and store it in the Manifest. In all other cases
  // if ingestion_options_.write_global_seqno == true and
  // verify_file_checksum is false, we only check the checksum function name.
  if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) {
    if (ingestion_options_.verify_file_checksum == false &&
        files_checksums.size() == files_to_ingest_.size() &&
        files_checksum_func_names.size() == files_to_ingest_.size()) {
      // Only when verify_file_checksum == false and the checksum for ingested
      // files are provided, DB will use the provided checksum and does not
      // generate the checksum for ingested files.
      need_generate_file_checksum_ = false;
    } else {
      need_generate_file_checksum_ = true;
    }
    std::vector<std::string> generated_checksums;
    std::vector<std::string> generated_checksum_func_names;
    // Step 1: generate the checksum for ingested sst file.
    if (need_generate_file_checksum_) {
      for (size_t i = 0; i < files_to_ingest_.size(); i++) {
        std::string generated_checksum;
        std::string generated_checksum_func_name;
        std::string requested_checksum_func_name =
            i < files_checksum_func_names.size() ? files_checksum_func_names[i]
                                                 : "";
        // TODO: rate limit file reads for checksum calculation during file
        // ingestion.
        // TODO: plumb Env::IOActivity
        ReadOptions ro;
        // Pass user-provided checksums through FileOptions when available.
        // The caller may not have provided checksums at all (empty vectors),
        // so we guard with a bounds check.
        FileOptions fopts;
        if (i < files_checksums.size()) {
          fopts.file_checksum = files_checksums[i];
        }
        if (i < files_checksum_func_names.size()) {
          fopts.file_checksum_func_name = files_checksum_func_names[i];
        } else {
          fopts.file_checksum_func_name = kNoFileChecksumFuncName;
        }
        IOStatus io_s = GenerateOneFileChecksum(
            fs_.get(), files_to_ingest_[i].internal_file_path,
            db_options_.file_checksum_gen_factory.get(),
            requested_checksum_func_name, &generated_checksum,
            &generated_checksum_func_name,
            ingestion_options_.verify_checksums_readahead_size,
            db_options_.allow_mmap_reads, io_tracer_,
            db_options_.rate_limiter.get(), ro, db_options_.stats,
            db_options_.clock, fopts);
        if (!io_s.ok()) {
          status = io_s;
          ROCKS_LOG_WARN(db_options_.info_log,
                         "Sst file checksum generation of file: %s failed: %s",
                         files_to_ingest_[i].internal_file_path.c_str(),
                         status.ToString().c_str());
          break;
        }
        if (ingestion_options_.write_global_seqno == false) {
          files_to_ingest_[i].file_checksum = generated_checksum;
          files_to_ingest_[i].file_checksum_func_name =
              generated_checksum_func_name;
        }
        generated_checksums.push_back(generated_checksum);
        generated_checksum_func_names.push_back(generated_checksum_func_name);
      }
    }

    // Step 2: based on the verify_file_checksum and ingested checksum
    // information, do the verification.
    if (status.ok()) {
      if (files_checksums.size() == files_to_ingest_.size() &&
          files_checksum_func_names.size() == files_to_ingest_.size()) {
        // Verify the checksum and checksum function name.
        if (ingestion_options_.verify_file_checksum) {
          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
            if (files_checksum_func_names[i] !=
                generated_checksum_func_names[i]) {
              status = Status::InvalidArgument(
                  "DB file checksum gen factory " +
                  std::string(db_options_.file_checksum_gen_factory->Name()) +
                  " generated checksum function name " +
                  generated_checksum_func_names[i] + " for file " +
                  external_files_paths[i] +
                  " which does not match requested/provided " +
                  files_checksum_func_names[i]);
              break;
            }
            if (files_checksums[i] != generated_checksums[i]) {
              status = Status::Corruption(
                  "Checksum verification mismatch for ingestion file " +
                  external_files_paths[i] + " using function " +
                  generated_checksum_func_names[i] + ". Expected: " +
                  Slice(files_checksums[i]).ToString(/*hex=*/true) +
                  " Computed: " +
                  Slice(generated_checksums[i]).ToString(/*hex=*/true));
              break;
            }
          }
        } else {
          // If verify_file_checksum is not enabled, we only verify the factory
          // recognizes the checksum function name. If it does not match, fail
          // the ingestion. If matches, we trust the ingested checksum
          // information and store in the Manifest.
          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
            FileChecksumGenContext gen_context;
            gen_context.file_name = files_to_ingest_[i].internal_file_path;
            gen_context.requested_checksum_func_name =
                files_checksum_func_names[i];
            auto file_checksum_gen =
                db_options_.file_checksum_gen_factory
                    ->CreateFileChecksumGenerator(gen_context);

            if (file_checksum_gen == nullptr ||
                files_checksum_func_names[i] != file_checksum_gen->Name()) {
              status = Status::InvalidArgument(
                  "Checksum function name " + files_checksum_func_names[i] +
                  " for file " + external_files_paths[i] +
                  " not recognized by DB checksum gen factory" +
                  db_options_.file_checksum_gen_factory->Name() +
                  (file_checksum_gen ? (" Returned function " +
                                        std::string(file_checksum_gen->Name()))
                                     : ""));
              break;
            }
            files_to_ingest_[i].file_checksum = files_checksums[i];
            files_to_ingest_[i].file_checksum_func_name =
                files_checksum_func_names[i];
          }
        }
      } else if (files_checksums.size() != files_checksum_func_names.size() ||
                 files_checksums.size() != 0) {
        // The checksum or checksum function name vector are not both empty
        // and they are incomplete.
        status = Status::InvalidArgument(
            "The checksum information of ingested sst files are nonempty and "
            "the size of checksums or the size of the checksum function "
            "names does not match with the number of ingested sst files");
      }
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log, "Ingestion failed: %s",
                       status.ToString().c_str());
      }
    }
  }

  if (status.ok()) {
    DivideInputFilesIntoBatches();
  }

  return status;
}
426 | | |
427 | 0 | void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() { |
428 | 0 | if (!files_overlap_) { |
429 | | // No overlap, treat as one batch without the need of tracking overall batch |
430 | | // range. |
431 | 0 | file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ false); |
432 | 0 | for (auto& file : files_to_ingest_) { |
433 | 0 | file_batches_to_ingest_.back().AddFile(&file, file_range_checker_); |
434 | 0 | } |
435 | 0 | return; |
436 | 0 | } |
437 | | |
438 | 0 | file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true); |
439 | 0 | for (auto& file : files_to_ingest_) { |
440 | 0 | if (!file_batches_to_ingest_.back().unset() && |
441 | 0 | file_range_checker_.Overlaps(file_batches_to_ingest_.back(), file, |
442 | 0 | /* known_sorted= */ false)) { |
443 | 0 | file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true); |
444 | 0 | } |
445 | 0 | file_batches_to_ingest_.back().AddFile(&file, file_range_checker_); |
446 | 0 | } |
447 | 0 | } |
448 | | |
449 | | Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, |
450 | 0 | SuperVersion* super_version) { |
451 | 0 | Status status; |
452 | 0 | if (atomic_replace_range_.has_value() && atomic_replace_range_->unset()) { |
453 | | // For replacing whole CF, we can simply check whether memtable is empty |
454 | 0 | *flush_needed = !super_version->mem->IsEmpty(); |
455 | 0 | } else { |
456 | 0 | autovector<UserKeyRange> ranges; |
457 | 0 | if (atomic_replace_range_.has_value()) { |
458 | 0 | assert(!atomic_replace_range_->smallest_internal_key.unset()); |
459 | 0 | assert(!atomic_replace_range_->largest_internal_key.unset()); |
460 | | // NOTE: we already checked in Prepare() that the atomic_replace_range |
461 | | // covers all the files_to_ingest |
462 | | // FIXME: need to make upper bound key exclusive (not easy here because |
463 | | // the existing internal APIs deal in inclusive upper bound user keys) |
464 | 0 | ranges.emplace_back( |
465 | 0 | atomic_replace_range_->smallest_internal_key.user_key(), |
466 | 0 | atomic_replace_range_->largest_internal_key.user_key()); |
467 | 0 | } else { |
468 | 0 | ranges.reserve(files_to_ingest_.size()); |
469 | 0 | for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { |
470 | 0 | ranges.emplace_back(file_to_ingest.start_ukey, |
471 | 0 | file_to_ingest.limit_ukey); |
472 | 0 | } |
473 | 0 | } |
474 | 0 | status = cfd_->RangesOverlapWithMemtables( |
475 | 0 | ranges, super_version, db_options_.allow_data_in_errors, flush_needed); |
476 | 0 | if (!status.ok()) { |
477 | 0 | ROCKS_LOG_WARN(db_options_.info_log, |
478 | 0 | "Failed to check ranges overlap with memtables: %s", |
479 | 0 | status.ToString().c_str()); |
480 | 0 | } |
481 | 0 | } |
482 | 0 | if (status.ok() && *flush_needed) { |
483 | 0 | if (!ingestion_options_.allow_blocking_flush) { |
484 | 0 | status = Status::InvalidArgument("External file requires flush"); |
485 | 0 | } |
486 | 0 | if (ucmp_->timestamp_size() > 0) { |
487 | 0 | status = Status::InvalidArgument( |
488 | 0 | "Column family enables user-defined timestamps, please make " |
489 | 0 | "sure the key range (without timestamp) of external file does not " |
490 | 0 | "overlap with key range in the memtables."); |
491 | 0 | } |
492 | 0 | } |
493 | 0 | return status; |
494 | 0 | } |
495 | | |
// Applies the prepared ingestion to the LSM tree state (builds edit_):
// validates memtable state, sets up deletions for an atomic replace range,
// assigns a level and sequence number to every batch of files, and records
// equivalent compactions for stats/registration purposes.
// REQUIRES: we have become the only writer by entering both write_thread_ and
// nonmem_write_thread_
Status ExternalSstFileIngestionJob::Run() {
  SuperVersion* super_version = cfd_->GetSuperVersion();
  // If column family is flushed after Prepare and before Run, we should have a
  // specific state of Memtables. The mutable Memtable should be empty, and the
  // immutable Memtable list should be empty.
  if (flushed_before_run_ && (super_version->imm->NumNotFlushed() != 0 ||
                              !super_version->mem->IsEmpty())) {
    return Status::TryAgain(
        "Inconsistent memtable state detected when flushed before run.");
  }
  Status status;
#ifndef NDEBUG
  // We should never run the job with a memtable that is overlapping
  // with the files we are ingesting
  bool need_flush = false;
  status = NeedsFlush(&need_flush, super_version);
  if (!status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to check if flush is needed: %s",
                   status.ToString().c_str());
    return status;
  }
  if (need_flush) {
    return Status::TryAgain("need_flush");
  }
  assert(status.ok() && need_flush == false);
#endif

  bool force_global_seqno = false;

  if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
    // We need to assign a global sequence number to all the files even
    // if the don't overlap with any ranges since we have snapshots
    force_global_seqno = true;
  }
  // It is safe to use this instead of LastAllocatedSequence since we are
  // the only active writer, and hence they are equal
  SequenceNumber last_seqno = versions_->LastSequence();
  edit_.SetColumnFamily(cfd_->GetID());

  // For an atomic replace, queue deletions (in edit_) for the existing files
  // being replaced, refusing if a pending compaction could race with them.
  if (atomic_replace_range_.has_value()) {
    auto* vstorage = super_version->current->storage_info();
    if (atomic_replace_range_->unset()) {
      // Unset bounds mean "replace the whole column family": delete every
      // file in every level.
      if (cfd_->compaction_picker()->IsCompactionInProgress()) {
        return Status::InvalidArgument(
            "Atomic replace range (full) overlaps with pending compaction");
      }
      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
        for (auto file : vstorage->LevelFiles(lvl)) {
          // Set up to delete file to be replaced
          edit_.DeleteFile(lvl, file->fd.GetNumber());
        }
      }
    } else {
      assert(!atomic_replace_range_->smallest_internal_key.unset());
      assert(!atomic_replace_range_->largest_internal_key.unset());
      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
        if (cfd_->RangeOverlapWithCompaction(
                atomic_replace_range_->smallest_internal_key.user_key(),
                atomic_replace_range_->largest_internal_key.user_key(), lvl)) {
          return Status::InvalidArgument(
              "Atomic replace range overlaps with pending compaction");
        }
        for (auto file : vstorage->LevelFiles(lvl)) {
          if (file_range_checker_.Overlaps(*atomic_replace_range_,
                                           file->smallest, file->largest)) {
            if (file_range_checker_.Contains(*atomic_replace_range_,
                                             file->smallest, file->largest)) {
              // Set up to delete file to be replaced
              edit_.DeleteFile(lvl, file->fd.GetNumber());
            } else {
              // An existing file straddling the range boundary cannot be
              // atomically replaced without dropping keys outside the range.
              // TODO: generate and ingest a tombstone file also
              return Status::InvalidArgument(
                  "Atomic replace range partially overlaps with existing file");
            }
          }
        }
      }
    }
  }

  // Find levels to ingest into
  std::optional<int> prev_batch_uppermost_level;
  // batches at the front of file_batches_to_ingest_ contains older updates and
  // are placed in smaller levels.
  for (auto& batch : file_batches_to_ingest_) {
    int batch_uppermost_level = 0;
    status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno,
                                     &last_seqno, &batch_uppermost_level,
                                     prev_batch_uppermost_level);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to assign levels for one batch: %s",
                     status.ToString().c_str());
      return status;
    }

    prev_batch_uppermost_level = batch_uppermost_level;
  }

  CreateEquivalentFileIngestingCompactions();
  return status;
}
601 | | |
602 | | Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( |
603 | | FileBatchInfo& batch, SuperVersion* super_version, bool force_global_seqno, |
604 | | SequenceNumber* last_seqno, int* batch_uppermost_level, |
605 | 0 | std::optional<int> prev_batch_uppermost_level) { |
606 | 0 | Status status; |
607 | 0 | assert(batch_uppermost_level); |
608 | 0 | *batch_uppermost_level = std::numeric_limits<int>::max(); |
609 | 0 | for (IngestedFileInfo* file : batch.files) { |
610 | 0 | assert(file); |
611 | 0 | SequenceNumber assigned_seqno = 0; |
612 | 0 | if (ingestion_options_.ingest_behind) { |
613 | 0 | status = CheckLevelForIngestedBehindFile(file); |
614 | 0 | } else { |
615 | 0 | status = AssignLevelAndSeqnoForIngestedFile( |
616 | 0 | super_version, force_global_seqno, cfd_->ioptions().compaction_style, |
617 | 0 | *last_seqno, file, &assigned_seqno, prev_batch_uppermost_level); |
618 | 0 | } |
619 | | |
620 | | // Modify the smallest/largest internal key to include the sequence number |
621 | | // that we just learned. Only overwrite sequence number zero. There could |
622 | | // be a nonzero sequence number already to indicate a range tombstone's |
623 | | // exclusive endpoint. |
624 | 0 | ParsedInternalKey smallest_parsed, largest_parsed; |
625 | 0 | if (status.ok()) { |
626 | 0 | status = ParseInternalKey(*(file->smallest_internal_key.rep()), |
627 | 0 | &smallest_parsed, false /* log_err_key */); |
628 | 0 | } |
629 | 0 | if (status.ok()) { |
630 | 0 | status = ParseInternalKey(*(file->largest_internal_key.rep()), |
631 | 0 | &largest_parsed, false /* log_err_key */); |
632 | 0 | } |
633 | 0 | if (!status.ok()) { |
634 | 0 | ROCKS_LOG_WARN(db_options_.info_log, "Failed to parse internal key: %s", |
635 | 0 | status.ToString().c_str()); |
636 | 0 | return status; |
637 | 0 | } |
638 | | |
639 | | // If any ingested file overlaps with the DB, it will fail here. |
640 | 0 | if (ingestion_options_.allow_db_generated_files && assigned_seqno != 0) { |
641 | 0 | return Status::InvalidArgument( |
642 | 0 | "An ingested file overlaps with existing data in the DB and has been " |
643 | 0 | "assigned a non-zero sequence number, which is not allowed when " |
644 | 0 | "'allow_db_generated_files' is enabled."); |
645 | 0 | } |
646 | | |
647 | 0 | if (smallest_parsed.sequence == 0 && assigned_seqno != 0) { |
648 | 0 | UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno, |
649 | 0 | smallest_parsed.type); |
650 | 0 | } |
651 | 0 | if (largest_parsed.sequence == 0 && assigned_seqno != 0) { |
652 | 0 | UpdateInternalKey(file->largest_internal_key.rep(), assigned_seqno, |
653 | 0 | largest_parsed.type); |
654 | 0 | } |
655 | |
|
656 | 0 | status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno); |
657 | 0 | if (!status.ok()) { |
658 | 0 | ROCKS_LOG_WARN( |
659 | 0 | db_options_.info_log, |
660 | 0 | "Failed to assign global sequence number for ingested file: %s", |
661 | 0 | status.ToString().c_str()); |
662 | 0 | return status; |
663 | 0 | } |
664 | 0 | TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", |
665 | 0 | &assigned_seqno); |
666 | 0 | assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1); |
667 | 0 | if (assigned_seqno > *last_seqno) { |
668 | 0 | *last_seqno = assigned_seqno; |
669 | 0 | } |
670 | 0 | max_assigned_seqno_ = std::max(max_assigned_seqno_, assigned_seqno); |
671 | |
|
672 | 0 | status = GenerateChecksumForIngestedFile(file); |
673 | 0 | if (!status.ok()) { |
674 | 0 | ROCKS_LOG_WARN(db_options_.info_log, |
675 | 0 | "Failed to generate checksum for ingested file: %s", |
676 | 0 | status.ToString().c_str()); |
677 | 0 | return status; |
678 | 0 | } |
679 | | |
680 | | // We use the import time as the ancester time. This is the time the data |
681 | | // is written to the database. |
682 | 0 | int64_t temp_current_time = 0; |
683 | 0 | uint64_t current_time = kUnknownFileCreationTime; |
684 | 0 | uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; |
685 | 0 | if (clock_->GetCurrentTime(&temp_current_time).ok()) { |
686 | 0 | current_time = oldest_ancester_time = |
687 | 0 | static_cast<uint64_t>(temp_current_time); |
688 | 0 | } |
689 | 0 | uint64_t tail_size = FileMetaData::CalculateTailSize( |
690 | 0 | file->fd.GetFileSize(), file->table_properties); |
691 | |
|
692 | 0 | bool marked_for_compaction = |
693 | 0 | file->table_properties.num_range_deletions == 1 && |
694 | 0 | (file->table_properties.num_entries == |
695 | 0 | file->table_properties.num_range_deletions); |
696 | 0 | SequenceNumber smallest_seqno = file->assigned_seqno; |
697 | 0 | SequenceNumber largest_seqno = file->assigned_seqno; |
698 | 0 | if (ingestion_options_.allow_db_generated_files) { |
699 | 0 | assert(file->assigned_seqno == 0); |
700 | 0 | assert(file->smallest_seqno != kMaxSequenceNumber); |
701 | 0 | assert(file->largest_seqno != kMaxSequenceNumber); |
702 | 0 | smallest_seqno = file->smallest_seqno; |
703 | 0 | largest_seqno = file->largest_seqno; |
704 | 0 | max_assigned_seqno_ = std::max(max_assigned_seqno_, file->largest_seqno); |
705 | 0 | } |
706 | 0 | FileMetaData f_metadata( |
707 | 0 | file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(), |
708 | 0 | file->smallest_internal_key, file->largest_internal_key, smallest_seqno, |
709 | 0 | largest_seqno, false, file->file_temperature, kInvalidBlobFileNumber, |
710 | 0 | oldest_ancester_time, current_time, |
711 | 0 | ingestion_options_.ingest_behind |
712 | 0 | ? kReservedEpochNumberForFileIngestedBehind |
713 | 0 | : cfd_->NewEpochNumber(), // orders files ingested to L0 |
714 | 0 | file->file_checksum, file->file_checksum_func_name, file->unique_id, 0, |
715 | 0 | tail_size, file->user_defined_timestamps_persisted, "", ""); |
716 | 0 | f_metadata.temperature = file->file_temperature; |
717 | 0 | f_metadata.marked_for_compaction = marked_for_compaction; |
718 | | // Extract min/max timestamps from table properties for UDT support. |
719 | | // This ensures ingested files have proper timestamp ranges in FileMetaData, |
720 | | // similar to files created by flush and compaction. |
721 | 0 | ExtractTimestampFromTableProperties(file->table_properties, &f_metadata); |
722 | 0 | edit_.AddFile(file->picked_level, f_metadata); |
723 | |
|
724 | 0 | *batch_uppermost_level = |
725 | 0 | std::min(*batch_uppermost_level, file->picked_level); |
726 | 0 | } |
727 | | |
728 | 0 | return Status::OK(); |
729 | 0 | } |
730 | | |
731 | 0 | void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() { |
732 | | // A map from output level to input of compactions equivalent to this |
733 | | // ingestion job. |
734 | | // TODO: simplify below logic to creating compaction per ingested file |
735 | | // instead of per output level, once we figure out how to treat ingested files |
736 | | // with adjacent range deletion tombstones to same output level in the same |
737 | | // job as non-overlapping compactions. |
738 | 0 | std::map<int, CompactionInputFiles> |
739 | 0 | output_level_to_file_ingesting_compaction_input; |
740 | |
|
741 | 0 | for (const auto& pair : edit_.GetNewFiles()) { |
742 | 0 | int output_level = pair.first; |
743 | 0 | const FileMetaData& f_metadata = pair.second; |
744 | |
|
745 | 0 | CompactionInputFiles& input = |
746 | 0 | output_level_to_file_ingesting_compaction_input[output_level]; |
747 | 0 | if (input.files.empty()) { |
748 | | // Treat the source level of ingested files to be level 0 |
749 | 0 | input.level = 0; |
750 | 0 | } |
751 | |
|
752 | 0 | compaction_input_metdatas_.push_back(new FileMetaData(f_metadata)); |
753 | 0 | input.files.push_back(compaction_input_metdatas_.back()); |
754 | 0 | } |
755 | |
|
756 | 0 | for (const auto& pair : output_level_to_file_ingesting_compaction_input) { |
757 | 0 | int output_level = pair.first; |
758 | 0 | const CompactionInputFiles& input = pair.second; |
759 | |
|
760 | 0 | const auto& mutable_cf_options = cfd_->GetLatestMutableCFOptions(); |
761 | 0 | file_ingesting_compactions_.push_back(new Compaction( |
762 | 0 | cfd_->current()->storage_info(), cfd_->ioptions(), mutable_cf_options, |
763 | 0 | mutable_db_options_, {input}, output_level, |
764 | | /* output file size limit not applicable */ |
765 | 0 | MaxFileSizeForLevel(mutable_cf_options, output_level, |
766 | 0 | cfd_->ioptions().compaction_style), |
767 | 0 | LLONG_MAX /* max compaction bytes, not applicable */, |
768 | 0 | 0 /* output path ID, not applicable */, mutable_cf_options.compression, |
769 | 0 | mutable_cf_options.compression_opts, Temperature::kUnknown, |
770 | 0 | 0 /* max_subcompaction, not applicable */, |
771 | 0 | {} /* grandparents, not applicable */, |
772 | 0 | std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */, |
773 | 0 | CompactionReason::kExternalSstIngestion, "" /* trim_ts */, |
774 | 0 | -1 /* score, not applicable */, |
775 | 0 | files_overlap_ /* l0_files_might_overlap, not applicable */)); |
776 | 0 | } |
777 | 0 | } |
778 | | |
779 | 0 | void ExternalSstFileIngestionJob::RegisterRange() { |
780 | 0 | for (const auto& c : file_ingesting_compactions_) { |
781 | 0 | cfd_->compaction_picker()->RegisterCompaction(c); |
782 | 0 | } |
783 | 0 | } |
784 | | |
785 | 0 | void ExternalSstFileIngestionJob::UnregisterRange() { |
786 | 0 | for (const auto& c : file_ingesting_compactions_) { |
787 | 0 | cfd_->compaction_picker()->UnregisterCompaction(c); |
788 | 0 | delete c; |
789 | 0 | } |
790 | 0 | file_ingesting_compactions_.clear(); |
791 | |
|
792 | 0 | for (const auto& f : compaction_input_metdatas_) { |
793 | 0 | delete f; |
794 | 0 | } |
795 | 0 | compaction_input_metdatas_.clear(); |
796 | 0 | } |
797 | | |
798 | 0 | void ExternalSstFileIngestionJob::UpdateStats() { |
799 | | // Update internal stats for new ingested files |
800 | 0 | uint64_t total_keys = 0; |
801 | 0 | uint64_t total_l0_files = 0; |
802 | 0 | uint64_t total_time = clock_->NowMicros() - job_start_time_; |
803 | |
|
804 | 0 | EventLoggerStream stream = event_logger_->Log(); |
805 | 0 | stream << "event" << "ingest_finished"; |
806 | 0 | stream << "files_ingested"; |
807 | 0 | stream.StartArray(); |
808 | |
|
809 | 0 | for (IngestedFileInfo& f : files_to_ingest_) { |
810 | 0 | InternalStats::CompactionStats stats( |
811 | 0 | CompactionReason::kExternalSstIngestion, 1); |
812 | 0 | stats.micros = total_time; |
813 | | // If actual copy occurred for this file, then we need to count the file |
814 | | // size as the actual bytes written. If the file was linked, then we ignore |
815 | | // the bytes written for file metadata. |
816 | | // TODO (yanqin) maybe account for file metadata bytes for exact accuracy? |
817 | 0 | if (f.copy_file) { |
818 | 0 | stats.bytes_written = f.fd.GetFileSize(); |
819 | 0 | } else { |
820 | 0 | stats.bytes_moved = f.fd.GetFileSize(); |
821 | 0 | } |
822 | 0 | stats.num_output_files = 1; |
823 | 0 | cfd_->internal_stats()->AddCompactionStats(f.picked_level, |
824 | 0 | Env::Priority::USER, stats); |
825 | 0 | cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE, |
826 | 0 | f.fd.GetFileSize()); |
827 | 0 | total_keys += f.num_entries; |
828 | 0 | if (f.picked_level == 0) { |
829 | 0 | total_l0_files += 1; |
830 | 0 | } |
831 | 0 | ROCKS_LOG_INFO( |
832 | 0 | db_options_.info_log, |
833 | 0 | "[AddFile] External SST file %s was ingested in L%d with path %s " |
834 | 0 | "(global_seqno=%" PRIu64 ")\n", |
835 | 0 | f.external_file_path.c_str(), f.picked_level, |
836 | 0 | f.internal_file_path.c_str(), f.assigned_seqno); |
837 | 0 | stream << "file" << f.internal_file_path << "level" << f.picked_level; |
838 | 0 | } |
839 | 0 | stream.EndArray(); |
840 | |
|
841 | 0 | stream << "lsm_state"; |
842 | 0 | stream.StartArray(); |
843 | 0 | auto vstorage = cfd_->current()->storage_info(); |
844 | 0 | for (int level = 0; level < vstorage->num_levels(); ++level) { |
845 | 0 | stream << vstorage->NumLevelFiles(level); |
846 | 0 | } |
847 | 0 | stream.EndArray(); |
848 | |
|
849 | 0 | cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL, |
850 | 0 | total_keys); |
851 | 0 | cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL, |
852 | 0 | files_to_ingest_.size()); |
853 | 0 | cfd_->internal_stats()->AddCFStats( |
854 | 0 | InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files); |
855 | 0 | } |
856 | | |
857 | 0 | void ExternalSstFileIngestionJob::Cleanup(const Status& status) { |
858 | 0 | IOOptions io_opts; |
859 | 0 | if (!status.ok()) { |
860 | | // We failed to add the files to the database |
861 | | // remove all the files we copied |
862 | 0 | DeleteInternalFiles(); |
863 | 0 | files_overlap_ = false; |
864 | 0 | } else if (status.ok() && ingestion_options_.move_files) { |
865 | | // The files were moved and added successfully, remove original file links |
866 | 0 | for (IngestedFileInfo& f : files_to_ingest_) { |
867 | 0 | Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); |
868 | 0 | if (!s.ok()) { |
869 | 0 | ROCKS_LOG_WARN( |
870 | 0 | db_options_.info_log, |
871 | 0 | "%s was added to DB successfully but failed to remove original " |
872 | 0 | "file link : %s", |
873 | 0 | f.external_file_path.c_str(), s.ToString().c_str()); |
874 | 0 | } |
875 | 0 | } |
876 | 0 | } |
877 | 0 | } |
878 | | |
879 | 0 | void ExternalSstFileIngestionJob::DeleteInternalFiles() { |
880 | 0 | IOOptions io_opts; |
881 | 0 | for (IngestedFileInfo& f : files_to_ingest_) { |
882 | 0 | if (f.internal_file_path.empty()) { |
883 | 0 | continue; |
884 | 0 | } |
885 | 0 | Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); |
886 | 0 | if (!s.ok()) { |
887 | 0 | ROCKS_LOG_WARN(db_options_.info_log, |
888 | 0 | "AddFile() clean up for file %s failed : %s", |
889 | 0 | f.internal_file_path.c_str(), s.ToString().c_str()); |
890 | 0 | } |
891 | 0 | } |
892 | 0 | } |
893 | | |
// Opens `external_file` and (re)creates `*table_reader` for it via the column
// family's table factory. Also refines `file_to_ingest->file_temperature`
// when the storage layer reports a concrete temperature that differs from
// the caller-provided hint.
// Returns non-OK if the file cannot be opened or the reader cannot be built.
Status ExternalSstFileIngestionJob::ResetTableReader(
    const std::string& external_file, uint64_t new_file_number,
    bool user_defined_timestamps_persisted, SuperVersion* sv,
    IngestedFileInfo* file_to_ingest,
    std::unique_ptr<TableReader>* table_reader) {
  std::unique_ptr<FSRandomAccessFile> sst_file;
  FileOptions fo{env_options_};
  // Pass the current temperature belief as a hint when opening the file.
  fo.temperature = file_to_ingest->file_temperature;
  Status status =
      fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr);
  if (!status.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "Failed to create random access file for external file %s: %s",
        external_file.c_str(), status.ToString().c_str());
    return status;
  }
  Temperature updated_temp = sst_file->GetTemperature();
  if (updated_temp != Temperature::kUnknown &&
      updated_temp != file_to_ingest->file_temperature) {
    // The hint was missing or wrong. Track temperature reported by storage.
    file_to_ingest->file_temperature = updated_temp;
  }
  std::unique_ptr<RandomAccessFileReader> sst_file_reader(
      new RandomAccessFileReader(std::move(sst_file), external_file,
                                 nullptr /*Env*/, io_tracer_));
  // Drop any previously-built reader before constructing the replacement.
  table_reader->reset();
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  status = sv->mutable_cf_options.table_factory->NewTableReader(
      ro,
      TableReaderOptions(
          cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
          sv->mutable_cf_options.compression_manager.get(), env_options_,
          cfd_->internal_comparator(),
          sv->mutable_cf_options.block_protection_bytes_per_key,
          /*skip_filters*/ false, /*immortal*/ false,
          /*force_direct_prefetch*/ false, /*level*/ -1,
          /*block_cache_tracer*/ nullptr,
          /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
          /*cur_file_num*/ new_file_number,
          /* unique_id */ {}, /* largest_seqno */ 0,
          /* tail_size */ 0, user_defined_timestamps_persisted),
      std::move(sst_file_reader), file_to_ingest->file_size, table_reader,
      // No need to prefetch index/filter if caching is not needed.
      /*prefetch_index_and_filter_in_cache=*/ingestion_options_.fill_cache);
  return status;
}
942 | | |
// Validates the external file's table properties and populates the
// corresponding fields of `file_to_ingest` (version, original/global seqno
// info, cf_id, entry counts, UDT-persisted flag, cached properties).
// May rebuild `*table_reader` if the user-defined-timestamps-persisted flag
// turns out to be false after validation.
Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
    const std::string& external_file, uint64_t new_file_number,
    SuperVersion* sv, IngestedFileInfo* file_to_ingest,
    std::unique_ptr<TableReader>* table_reader) {
  // Get the external file properties
  auto props = table_reader->get()->GetTableProperties();
  assert(props.get());
  const auto& uprops = props->user_collected_properties;

  // Get table version
  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
  if (version_iter == uprops.end()) {
    // No version property => the file was not produced by SstFileWriter.
    assert(!SstFileWriter::CreatedBySstFileWriter(*props));
    if (!ingestion_options_.allow_db_generated_files) {
      return Status::Corruption("External file version not found");
    } else {
      // 0 is special version for when a file from live DB does not have the
      // version table property
      file_to_ingest->version = 0;
    }
  } else {
    assert(SstFileWriter::CreatedBySstFileWriter(*props));
    file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
  }

  auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
  if (file_to_ingest->version == 2) {
    // version 2 imply that we have global sequence number
    if (seqno_iter == uprops.end()) {
      return Status::Corruption(
          "External file global sequence number not found");
    }

    // Set the global sequence number
    file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
    // Offset 0 means the writer never recorded where the seqno field lives,
    // so it could not be overwritten in place later.
    if (props->external_sst_file_global_seqno_offset == 0) {
      file_to_ingest->global_seqno_offset = 0;
      return Status::Corruption("Was not able to find file global seqno field");
    }
    file_to_ingest->global_seqno_offset =
        static_cast<size_t>(props->external_sst_file_global_seqno_offset);
  } else if (file_to_ingest->version == 1) {
    // SST file V1 should not have global seqno field
    assert(seqno_iter == uprops.end());
    file_to_ingest->original_seqno = 0;
    if (ingestion_options_.allow_blocking_flush ||
        ingestion_options_.allow_global_seqno) {
      return Status::InvalidArgument(
          "External SST file V1 does not support global seqno");
    }
  } else if (file_to_ingest->version == 0) {
    // allow_db_generated_files is true
    assert(seqno_iter == uprops.end());
    file_to_ingest->original_seqno = 0;
    file_to_ingest->global_seqno_offset = 0;
  } else {
    return Status::InvalidArgument("External file version " +
                                   std::to_string(file_to_ingest->version) +
                                   " is not supported");
  }

  file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
  // This assignment works fine even though `table_reader` may later be reset,
  // since that will not affect how table properties are parsed, and this
  // assignment is making a copy.
  file_to_ingest->table_properties = *props;

  // Get number of entries in table
  file_to_ingest->num_entries = props->num_entries;
  file_to_ingest->num_range_deletions = props->num_range_deletions;

  // Validate table properties related to comparator name and user defined
  // timestamps persisted flag.
  file_to_ingest->user_defined_timestamps_persisted =
      static_cast<bool>(props->user_defined_timestamps_persisted);
  bool mark_sst_file_has_no_udt = false;
  Status s = ValidateUserDefinedTimestampsOptions(
      cfd_->user_comparator(), props->comparator_name,
      cfd_->ioptions().persist_user_defined_timestamps,
      file_to_ingest->user_defined_timestamps_persisted,
      &mark_sst_file_has_no_udt);
  if (s.ok() && mark_sst_file_has_no_udt) {
    // A column family that enables user-defined timestamps in Memtable only
    // feature can also ingest external files created by a setting that disables
    // user-defined timestamps. In that case, we need to re-mark the
    // user_defined_timestamps_persisted flag for the file.
    file_to_ingest->user_defined_timestamps_persisted = false;
  } else if (!s.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "ValidateUserDefinedTimestampsOptions failed for external file %s: %s",
        external_file.c_str(), s.ToString().c_str());
    return s;
  }

  // `TableReader` is initialized with `user_defined_timestamps_persisted` flag
  // to be true. If its value changed to false after this sanity check, we
  // need to reset the `TableReader`.
  if (ucmp_->timestamp_size() > 0 &&
      !file_to_ingest->user_defined_timestamps_persisted) {
    s = ResetTableReader(external_file, new_file_number,
                         file_to_ingest->user_defined_timestamps_persisted, sv,
                         file_to_ingest, table_reader);
  }
  return s;
}
1049 | | |
// Inspects one external file and fills `file_to_ingest` with everything the
// ingestion job needs: file size, FileDescriptor, validated table properties,
// smallest/largest internal keys (possibly widened by range tombstones),
// user-key bounds (timestamp-aware), and the SST's internal unique id.
// Optionally verifies the file checksum first. Returns non-OK on any
// validation or I/O failure.
Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
    const std::string& external_file, uint64_t new_file_number,
    IngestedFileInfo* file_to_ingest, SuperVersion* sv) {
  file_to_ingest->external_file_path = external_file;

  // Get external file size
  Status status = fs_->GetFileSize(external_file, IOOptions(),
                                   &file_to_ingest->file_size, nullptr);
  if (!status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to get file size for external file %s: %s",
                   external_file.c_str(), status.ToString().c_str());
    return status;
  }

  // Assign FD with number
  file_to_ingest->fd =
      FileDescriptor(new_file_number, 0, file_to_ingest->file_size);

  // Create TableReader for external file
  std::unique_ptr<TableReader> table_reader;
  // Initially create the `TableReader` with flag
  // `user_defined_timestamps_persisted` to be true since that's the most common
  // case
  status = ResetTableReader(external_file, new_file_number,
                            /*user_defined_timestamps_persisted=*/true, sv,
                            file_to_ingest, &table_reader);
  if (!status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to reset table reader for external file %s: %s",
                   external_file.c_str(), status.ToString().c_str());
    return status;
  }

  // Validate properties; may rebuild `table_reader` for the no-UDT case.
  status = SanityCheckTableProperties(external_file, new_file_number, sv,
                                      file_to_ingest, &table_reader);
  if (!status.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "Failed to sanity check table properties for external file %s: %s",
        external_file.c_str(), status.ToString().c_str());
    return status;
  }

  const bool allow_data_in_errors = db_options_.allow_data_in_errors;
  ParsedInternalKey key;
  if (ingestion_options_.allow_db_generated_files) {
    // We are ingesting a DB generated SST file for which we don't reassign
    // sequence numbers. We need its smallest sequence number and largest
    // sequence number for FileMetaData.
    Status seqno_status = GetSeqnoBoundaryForFile(
        table_reader.get(), sv, file_to_ingest, allow_data_in_errors);

    if (!seqno_status.ok()) {
      ROCKS_LOG_WARN(
          db_options_.info_log,
          "Failed to get sequence number boundary for external file %s: %s",
          external_file.c_str(), seqno_status.ToString().c_str());
      return seqno_status;
    }
    assert(file_to_ingest->smallest_seqno <= file_to_ingest->largest_seqno);
    assert(file_to_ingest->largest_seqno < kMaxSequenceNumber);
  } else {
    // Regular ingestion requires every key in the file to have seqno 0.
    // Use the `key_largest_seqno` property as a cheap whole-file check.
    SequenceNumber largest_seqno =
        table_reader.get()->GetTableProperties()->key_largest_seqno;
    // UINT64_MAX means unknown and the file is generated before table property
    // `key_largest_seqno` is introduced.
    if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
      return Status::Corruption(
          "External file has non zero largest sequence number " +
          std::to_string(largest_seqno));
    }
  }

  if (ingestion_options_.verify_checksums_before_ingest) {
    // If customized readahead size is needed, we can pass a user option
    // all the way to here. Right now we just rely on the default readahead
    // to keep things simple.
    // TODO: plumb Env::IOActivity, Env::IOPriority
    ReadOptions ro;
    ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
    ro.fill_cache = ingestion_options_.fill_cache;
    status = table_reader->VerifyChecksum(
        ro, TableReaderCaller::kExternalSSTIngestion);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to verify checksum for table reader: %s",
                     status.ToString().c_str());
      return status;
    }
  }

  // TODO: plumb Env::IOActivity, Env::IOPriority
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));

  // Get first (smallest) and last (largest) key from file.
  iter->SeekToFirst();
  if (iter->Valid()) {
    Status pik_status =
        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
    if (!pik_status.ok()) {
      return Status::Corruption("Corrupted key in external file. ",
                                pik_status.getState());
    }
    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
      return Status::Corruption("External file has non zero sequence number");
    }
    file_to_ingest->smallest_internal_key.SetFrom(key);

    Slice largest;
    if (strcmp(sv->mutable_cf_options.table_factory->Name(), "PlainTable") ==
        0) {
      // PlainTable iterator does not support SeekToLast().
      // Fall back to a full forward scan to locate the largest key.
      largest = iter->key();
      for (; iter->Valid(); iter->Next()) {
        if (cfd_->internal_comparator().Compare(iter->key(), largest) > 0) {
          largest = iter->key();
        }
      }
      if (!iter->status().ok()) {
        return iter->status();
      }
    } else {
      iter->SeekToLast();
      if (!iter->Valid()) {
        if (iter->status().ok()) {
          // The file contains at least 1 key since iter is valid after
          // SeekToFirst().
          return Status::Corruption("Can not find largest key in sst file");
        } else {
          return iter->status();
        }
      }
      largest = iter->key();
    }

    pik_status = ParseInternalKey(largest, &key, allow_data_in_errors);
    if (!pik_status.ok()) {
      return Status::Corruption("Corrupted key in external file. ",
                                pik_status.getState());
    }
    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
      return Status::Corruption("External file has non zero sequence number");
    }
    file_to_ingest->largest_internal_key.SetFrom(key);
  } else if (!iter->status().ok()) {
    return iter->status();
  }

  std::unique_ptr<InternalIterator> range_del_iter(
      table_reader->NewRangeTombstoneIterator(ro));
  // We may need to adjust these key bounds, depending on whether any range
  // deletion tombstones extend past them.
  if (range_del_iter != nullptr) {
    for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
         range_del_iter->Next()) {
      Status pik_status =
          ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
      if (!pik_status.ok()) {
        return Status::Corruption("Corrupted key in external file. ",
                                  pik_status.getState());
      }
      if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
        return Status::Corruption(
            "External file has a range deletion with non zero sequence "
            "number.");
      }
      RangeTombstone tombstone(key, range_del_iter->value());
      file_range_checker_.MaybeUpdateRange(tombstone.SerializeKey(),
                                           tombstone.SerializeEndKey(),
                                           file_to_ingest);
    }
  }

  // Derive the user-key bounds used later for overlap checks. With
  // user-defined timestamps, widen the range with max/min timestamps so the
  // bounds cover every timestamped version of the boundary keys.
  const size_t ts_sz = ucmp_->timestamp_size();
  Slice smallest = file_to_ingest->smallest_internal_key.user_key();
  Slice largest = file_to_ingest->largest_internal_key.user_key();
  if (ts_sz > 0) {
    AppendUserKeyWithMaxTimestamp(&file_to_ingest->start_ukey, smallest, ts_sz);
    AppendUserKeyWithMinTimestamp(&file_to_ingest->limit_ukey, largest, ts_sz);
  } else {
    file_to_ingest->start_ukey.assign(smallest.data(), smallest.size());
    file_to_ingest->limit_ukey.assign(largest.data(), largest.size());
  }

  // Unique id failure is non-fatal: fall back to the null id and continue.
  auto s =
      GetSstInternalUniqueId(file_to_ingest->table_properties.db_id,
                             file_to_ingest->table_properties.db_session_id,
                             file_to_ingest->table_properties.orig_file_number,
                             &(file_to_ingest->unique_id));
  if (!s.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to get SST unique id for file %s",
                   file_to_ingest->internal_file_path.c_str());
    file_to_ingest->unique_id = kNullUniqueId64x2;
  }

  return status;
}
1253 | | |
// Picks the target LSM level (`file_to_ingest->picked_level`) and global
// sequence number (`*assigned_seqno`) for one ingested file. A seqno of 0
// means no reassignment is needed; a non-zero seqno (last_seqno + 1) is
// required whenever the file overlaps existing data or must land in L0.
// `prev_batch_uppermost_level`, when set, caps how deep this batch may go so
// it stays newer than the previous batch. Returns TryAgain when
// `fail_if_not_bottommost_level` cannot be satisfied.
Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
    SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
    SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
    SequenceNumber* assigned_seqno,
    std::optional<int> prev_batch_uppermost_level) {
  Status status;
  *assigned_seqno = 0;
  const size_t ts_sz = ucmp_->timestamp_size();
  assert(!prev_batch_uppermost_level.has_value() ||
         prev_batch_uppermost_level.value() < cfd_->NumberLevels());
  // L0 is forced when the previous batch already used L0 (this batch must be
  // newer) or under FIFO compaction (which only has L0).
  bool must_assign_to_l0 = (prev_batch_uppermost_level.has_value() &&
                            prev_batch_uppermost_level.value() == 0) ||
                           compaction_style == kCompactionStyleFIFO;

  if (force_global_seqno || (!ingestion_options_.allow_db_generated_files &&
                             (files_overlap_ || must_assign_to_l0))) {
    *assigned_seqno = last_seqno + 1;
    if (must_assign_to_l0) {
      assert(ts_sz == 0);
      file_to_ingest->picked_level = 0;
      if (ingestion_options_.fail_if_not_bottommost_level &&
          cfd_->NumberLevels() > 1) {
        status = Status::TryAgain(
            "Files cannot be ingested to Lmax. Please make sure key range of "
            "Lmax does not overlap with files to ingest.");
      }
      return status;
    }
  }

  bool overlap_with_db = false;
  Arena arena;
  // TODO: plumb Env::IOActivity, Env::IOPriority
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  ro.total_order_seek = true;
  int target_level = 0;
  auto* vstorage = cfd_->current()->storage_info();
  assert(!must_assign_to_l0 || ingestion_options_.allow_db_generated_files);
  // Levels at or below `assigned_level_exclusive_end` are not candidates for
  // placement (but may still need overlap checking, see below).
  int assigned_level_exclusive_end = cfd_->NumberLevels();
  if (must_assign_to_l0) {
    assigned_level_exclusive_end = 0;
  } else if (prev_batch_uppermost_level.has_value()) {
    assigned_level_exclusive_end = prev_batch_uppermost_level.value();
  }

  // When ingesting db generated files, we require that ingested files do not
  // overlap with any file in the DB. So we need to check all levels.
  int overlap_checking_exclusive_end =
      ingestion_options_.allow_db_generated_files
          ? cfd_->NumberLevels()
          : assigned_level_exclusive_end;
  for (int lvl = 0; lvl < overlap_checking_exclusive_end; lvl++) {
    // Skip the empty levels between L0 and the base level.
    if (lvl > 0 && lvl < vstorage->base_level()) {
      continue;
    }
    // With an atomic replace range, candidate levels are taken as-is without
    // overlap checking; keep descending to the deepest candidate level.
    if (lvl < assigned_level_exclusive_end &&
        atomic_replace_range_.has_value()) {
      target_level = lvl;
      continue;
    }
    if (cfd_->RangeOverlapWithCompaction(file_to_ingest->start_ukey,
                                         file_to_ingest->limit_ukey, lvl)) {
      // We must use L0 or any level higher than `lvl` to be able to overwrite
      // the compaction output keys that we overlap with in this level, We also
      // need to assign this file a seqno to overwrite the compaction output
      // keys in level `lvl`
      overlap_with_db = true;
      break;
    } else if (vstorage->NumLevelFiles(lvl) > 0) {
      bool overlap_with_level = false;
      status = sv->current->OverlapWithLevelIterator(
          ro, env_options_, file_to_ingest->start_ukey,
          file_to_ingest->limit_ukey, lvl, &overlap_with_level);
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to check overlap with level iterator: %s",
                       status.ToString().c_str());
        return status;
      }
      if (overlap_with_level) {
        // We must use L0 or any level higher than `lvl` to be able to overwrite
        // the keys that we overlap with in this level, We also need to assign
        // this file a seqno to overwrite the existing keys in level `lvl`
        overlap_with_db = true;
        break;
      }
    }

    // We don't overlap with any keys in this level, but we still need to check
    // if our file can fit in it
    if (lvl < assigned_level_exclusive_end &&
        IngestedFileFitInLevel(file_to_ingest, lvl)) {
      target_level = lvl;
    }
  }

  if (ingestion_options_.fail_if_not_bottommost_level &&
      target_level < cfd_->NumberLevels() - 1) {
    status = Status::TryAgain(
        "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
        "and ongoing compaction's output to Lmax does not overlap with files "
        "to ingest. Input files overlapping with each other can cause some "
        "file to be assigned to non Lmax level.");
    return status;
  }

  TEST_SYNC_POINT_CALLBACK(
      "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
      &overlap_with_db);
  file_to_ingest->picked_level = target_level;
  if (overlap_with_db) {
    // Overlap with existing data requires a fresh seqno so the ingested keys
    // shadow the overlapped ones. Not supported with user-defined timestamps.
    if (ts_sz > 0) {
      status = Status::InvalidArgument(
          "Column family enables user-defined timestamps, please make sure the "
          "key range (without timestamp) of external file does not overlap "
          "with key range (without timestamp) in the db");
      return status;
    }
    if (*assigned_seqno == 0) {
      *assigned_seqno = last_seqno + 1;
    }
  }

  return status;
}
1380 | | |
1381 | | Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( |
1382 | 0 | IngestedFileInfo* file_to_ingest) { |
1383 | 0 | assert(!atomic_replace_range_.has_value()); |
1384 | |
|
1385 | 0 | auto* vstorage = cfd_->current()->storage_info(); |
1386 | | // First, check if new files fit in the last level |
1387 | 0 | int last_lvl = cfd_->NumberLevels() - 1; |
1388 | 0 | if (!IngestedFileFitInLevel(file_to_ingest, last_lvl)) { |
1389 | 0 | return Status::InvalidArgument( |
1390 | 0 | "Can't ingest_behind file as it doesn't fit " |
1391 | 0 | "at the last level!"); |
1392 | 0 | } |
1393 | | |
1394 | | // Second, check if despite cf_allow_ingest_behind=true we still have 0 |
1395 | | // seqnums at some upper level |
1396 | 0 | for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) { |
1397 | 0 | for (auto file : vstorage->LevelFiles(lvl)) { |
1398 | 0 | if (file->fd.smallest_seqno == 0) { |
1399 | 0 | return Status::InvalidArgument( |
1400 | 0 | "Can't ingest_behind file as despite cf_allow_ingest_behind=true " |
1401 | 0 | "there are files with 0 seqno in database at upper levels!"); |
1402 | 0 | } |
1403 | 0 | } |
1404 | 0 | } |
1405 | | |
1406 | 0 | file_to_ingest->picked_level = last_lvl; |
1407 | 0 | return Status::OK(); |
1408 | 0 | } |
1409 | | |
1410 | | Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( |
1411 | 0 | IngestedFileInfo* file_to_ingest, SequenceNumber seqno) { |
1412 | 0 | if (ingestion_options_.allow_db_generated_files) { |
1413 | 0 | assert(seqno == 0); |
1414 | 0 | assert(file_to_ingest->original_seqno == 0); |
1415 | 0 | } |
1416 | 0 | if (file_to_ingest->original_seqno == seqno) { |
1417 | | // This file already has the correct global seqno. |
1418 | 0 | return Status::OK(); |
1419 | 0 | } else if (!ingestion_options_.allow_global_seqno) { |
1420 | 0 | return Status::InvalidArgument("Global seqno is required, but disabled"); |
1421 | 0 | } else if (ingestion_options_.write_global_seqno && |
1422 | 0 | file_to_ingest->global_seqno_offset == 0) { |
1423 | 0 | return Status::InvalidArgument( |
1424 | 0 | "Trying to set global seqno for a file that don't have a global seqno " |
1425 | 0 | "field"); |
1426 | 0 | } |
1427 | | |
1428 | 0 | if (ingestion_options_.write_global_seqno) { |
1429 | | // Determine if we can write global_seqno to a given offset of file. |
1430 | | // If the file system does not support random write, then we should not. |
1431 | | // Otherwise we should. |
1432 | 0 | std::unique_ptr<FSRandomRWFile> rwfile; |
1433 | 0 | Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path, |
1434 | 0 | env_options_, &rwfile, nullptr); |
1435 | 0 | TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile", |
1436 | 0 | &status); |
1437 | 0 | if (status.ok()) { |
1438 | 0 | FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_, |
1439 | 0 | file_to_ingest->internal_file_path); |
1440 | 0 | std::string seqno_val; |
1441 | 0 | PutFixed64(&seqno_val, seqno); |
1442 | 0 | status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, |
1443 | 0 | IOOptions(), nullptr); |
1444 | 0 | if (!status.ok()) { |
1445 | 0 | ROCKS_LOG_WARN(db_options_.info_log, |
1446 | 0 | "Failed to write global seqno to %s: %s", |
1447 | 0 | file_to_ingest->internal_file_path.c_str(), |
1448 | 0 | status.ToString().c_str()); |
1449 | 0 | return status; |
1450 | 0 | } |
1451 | | |
1452 | 0 | if (status.ok()) { |
1453 | 0 | TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); |
1454 | 0 | status = SyncIngestedFile(fsptr.get()); |
1455 | 0 | TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); |
1456 | 0 | if (!status.ok()) { |
1457 | 0 | ROCKS_LOG_WARN(db_options_.info_log, |
1458 | 0 | "Failed to sync ingested file %s after writing global " |
1459 | 0 | "sequence number: %s", |
1460 | 0 | file_to_ingest->internal_file_path.c_str(), |
1461 | 0 | status.ToString().c_str()); |
1462 | 0 | } |
1463 | 0 | } |
1464 | 0 | if (!status.ok()) { |
1465 | 0 | return status; |
1466 | 0 | } |
1467 | 0 | } else if (!status.IsNotSupported()) { |
1468 | 0 | ROCKS_LOG_WARN( |
1469 | 0 | db_options_.info_log, |
1470 | 0 | "Failed to open ingested file %s for random read/write: %s", |
1471 | 0 | file_to_ingest->internal_file_path.c_str(), |
1472 | 0 | status.ToString().c_str()); |
1473 | 0 | return status; |
1474 | 0 | } |
1475 | 0 | } |
1476 | | |
1477 | 0 | file_to_ingest->assigned_seqno = seqno; |
1478 | 0 | return Status::OK(); |
1479 | 0 | } |
1480 | | |
1481 | | IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( |
1482 | 0 | IngestedFileInfo* file_to_ingest) { |
1483 | 0 | if (db_options_.file_checksum_gen_factory == nullptr || |
1484 | 0 | need_generate_file_checksum_ == false || |
1485 | 0 | ingestion_options_.write_global_seqno == false) { |
1486 | | // If file_checksum_gen_factory is not set, we are not able to generate |
1487 | | // the checksum. if write_global_seqno is false, it means we will use |
1488 | | // file checksum generated during Prepare(). This step will be skipped. |
1489 | 0 | return IOStatus::OK(); |
1490 | 0 | } |
1491 | 0 | std::string file_checksum; |
1492 | 0 | std::string file_checksum_func_name; |
1493 | 0 | std::string requested_checksum_func_name; |
1494 | | // TODO: rate limit file reads for checksum calculation during file ingestion. |
1495 | | // TODO: plumb Env::IOActivity |
1496 | 0 | ReadOptions ro; |
1497 | 0 | FileOptions gen_fopts; |
1498 | 0 | gen_fopts.file_checksum_func_name = kNoFileChecksumFuncName; |
1499 | 0 | IOStatus io_s = GenerateOneFileChecksum( |
1500 | 0 | fs_.get(), file_to_ingest->internal_file_path, |
1501 | 0 | db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, |
1502 | 0 | &file_checksum, &file_checksum_func_name, |
1503 | 0 | ingestion_options_.verify_checksums_readahead_size, |
1504 | 0 | db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(), |
1505 | 0 | ro, db_options_.stats, db_options_.clock, gen_fopts); |
1506 | 0 | if (!io_s.ok()) { |
1507 | 0 | ROCKS_LOG_WARN( |
1508 | 0 | db_options_.info_log, "Failed to generate checksum for %s: %s", |
1509 | 0 | file_to_ingest->internal_file_path.c_str(), io_s.ToString().c_str()); |
1510 | 0 | return io_s; |
1511 | 0 | } |
1512 | 0 | file_to_ingest->file_checksum = std::move(file_checksum); |
1513 | 0 | file_to_ingest->file_checksum_func_name = std::move(file_checksum_func_name); |
1514 | 0 | return IOStatus::OK(); |
1515 | 0 | } |
1516 | | |
1517 | | bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( |
1518 | 0 | const IngestedFileInfo* file_to_ingest, int level) { |
1519 | 0 | if (level == 0) { |
1520 | | // Files can always fit in L0 |
1521 | 0 | return true; |
1522 | 0 | } |
1523 | | |
1524 | 0 | auto* vstorage = cfd_->current()->storage_info(); |
1525 | 0 | Slice file_smallest_user_key(file_to_ingest->start_ukey); |
1526 | 0 | Slice file_largest_user_key(file_to_ingest->limit_ukey); |
1527 | |
|
1528 | 0 | if (vstorage->OverlapInLevel(level, &file_smallest_user_key, |
1529 | 0 | &file_largest_user_key)) { |
1530 | | // File overlap with another files in this level, we cannot |
1531 | | // add it to this level |
1532 | 0 | return false; |
1533 | 0 | } |
1534 | | |
1535 | | // File did not overlap with level files, nor compaction output |
1536 | 0 | return true; |
1537 | 0 | } |
1538 | | |
1539 | | template <typename TWritableFile> |
1540 | 0 | Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { |
1541 | 0 | assert(file != nullptr); |
1542 | 0 | if (db_options_.use_fsync) { |
1543 | 0 | return file->Fsync(IOOptions(), nullptr); |
1544 | 0 | } else { |
1545 | 0 | return file->Sync(IOOptions(), nullptr); |
1546 | 0 | } |
1547 | 0 | } Unexecuted instantiation: rocksdb::Status rocksdb::ExternalSstFileIngestionJob::SyncIngestedFile<rocksdb::FSWritableFile>(rocksdb::FSWritableFile*) Unexecuted instantiation: rocksdb::Status rocksdb::ExternalSstFileIngestionJob::SyncIngestedFile<rocksdb::FSRandomRWFile>(rocksdb::FSRandomRWFile*) |
1548 | | |
1549 | | Status ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile( |
1550 | | TableReader* table_reader, SuperVersion* sv, |
1551 | 0 | IngestedFileInfo* file_to_ingest, bool allow_data_in_errors) { |
1552 | 0 | const auto tp = table_reader->GetTableProperties(); |
1553 | 0 | const bool has_largest_seqno = tp->HasKeyLargestSeqno(); |
1554 | 0 | SequenceNumber largest_seqno = tp->key_largest_seqno; |
1555 | 0 | if (has_largest_seqno) { |
1556 | 0 | file_to_ingest->largest_seqno = largest_seqno; |
1557 | 0 | if (largest_seqno == 0) { |
1558 | 0 | file_to_ingest->smallest_seqno = 0; |
1559 | 0 | return Status::OK(); |
1560 | 0 | } |
1561 | 0 | if (tp->HasKeySmallestSeqno()) { |
1562 | 0 | file_to_ingest->smallest_seqno = tp->key_smallest_seqno; |
1563 | 0 | return Status::OK(); |
1564 | 0 | } |
1565 | 0 | } |
1566 | | |
1567 | | // For older SST files they may not be recorded in table properties, so |
1568 | | // we scan the file to find out. |
1569 | 0 | TEST_SYNC_POINT( |
1570 | 0 | "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan"); |
1571 | 0 | SequenceNumber smallest_seqno = kMaxSequenceNumber; |
1572 | 0 | SequenceNumber largest_seqno_from_iter = 0; |
1573 | 0 | ReadOptions ro; |
1574 | 0 | ro.fill_cache = ingestion_options_.fill_cache; |
1575 | 0 | std::unique_ptr<InternalIterator> iter(table_reader->NewIterator( |
1576 | 0 | ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, |
1577 | 0 | /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); |
1578 | 0 | ParsedInternalKey key; |
1579 | 0 | iter->SeekToFirst(); |
1580 | 0 | while (iter->Valid()) { |
1581 | 0 | Status pik_status = |
1582 | 0 | ParseInternalKey(iter->key(), &key, allow_data_in_errors); |
1583 | 0 | if (!pik_status.ok()) { |
1584 | 0 | return Status::Corruption("Corrupted key in external file. ", |
1585 | 0 | pik_status.getState()); |
1586 | 0 | } |
1587 | 0 | smallest_seqno = std::min(smallest_seqno, key.sequence); |
1588 | 0 | largest_seqno_from_iter = std::max(largest_seqno_from_iter, key.sequence); |
1589 | 0 | iter->Next(); |
1590 | 0 | } |
1591 | 0 | if (!iter->status().ok()) { |
1592 | 0 | return iter->status(); |
1593 | 0 | } |
1594 | | |
1595 | 0 | if (table_reader->GetTableProperties()->num_range_deletions > 0) { |
1596 | 0 | std::unique_ptr<InternalIterator> range_del_iter( |
1597 | 0 | table_reader->NewRangeTombstoneIterator(ro)); |
1598 | 0 | if (range_del_iter != nullptr) { |
1599 | 0 | for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); |
1600 | 0 | range_del_iter->Next()) { |
1601 | 0 | Status pik_status = |
1602 | 0 | ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); |
1603 | 0 | if (!pik_status.ok()) { |
1604 | 0 | return Status::Corruption("Corrupted key in external file. ", |
1605 | 0 | pik_status.getState()); |
1606 | 0 | } |
1607 | 0 | smallest_seqno = std::min(smallest_seqno, key.sequence); |
1608 | 0 | largest_seqno_from_iter = |
1609 | 0 | std::max(largest_seqno_from_iter, key.sequence); |
1610 | 0 | } |
1611 | 0 | if (!range_del_iter->status().ok()) { |
1612 | 0 | return range_del_iter->status(); |
1613 | 0 | } |
1614 | 0 | } |
1615 | 0 | } |
1616 | | |
1617 | 0 | file_to_ingest->smallest_seqno = smallest_seqno; |
1618 | 0 | if (!has_largest_seqno) { |
1619 | 0 | file_to_ingest->largest_seqno = largest_seqno_from_iter; |
1620 | 0 | } else { |
1621 | 0 | assert(largest_seqno == largest_seqno_from_iter); |
1622 | 0 | file_to_ingest->largest_seqno = largest_seqno; |
1623 | 0 | } |
1624 | |
|
1625 | 0 | if (file_to_ingest->largest_seqno == kMaxSequenceNumber) { |
1626 | 0 | return Status::InvalidArgument( |
1627 | 0 | "Unknown smallest seqno for db generated file."); |
1628 | 0 | } |
1629 | 0 | if (file_to_ingest->smallest_seqno == kMaxSequenceNumber) { |
1630 | 0 | return Status::InvalidArgument( |
1631 | 0 | "Unknown largest seqno for db generated file."); |
1632 | 0 | } |
1633 | 0 | return Status::OK(); |
1634 | 0 | } |
1635 | | |
1636 | | } // namespace ROCKSDB_NAMESPACE |