Coverage Report

Created: 2026-02-14 06:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/db/compaction/compaction_picker_level.cc
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#include "db/compaction/compaction_picker_level.h"
11
12
#include <string>
13
#include <utility>
14
#include <vector>
15
16
#include "db/version_edit.h"
17
#include "logging/log_buffer.h"
18
#include "test_util/sync_point.h"
19
20
namespace ROCKSDB_NAMESPACE {
21
22
bool LevelCompactionPicker::NeedsCompaction(
23
44.7k
    const VersionStorageInfo* vstorage) const {
24
44.7k
  if (!vstorage->ExpiredTtlFiles().empty()) {
25
0
    return true;
26
0
  }
27
44.7k
  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
28
0
    return true;
29
0
  }
30
44.7k
  if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
31
754
    return true;
32
754
  }
33
44.0k
  if (!vstorage->FilesMarkedForCompaction().empty()) {
34
0
    return true;
35
0
  }
36
44.0k
  if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
37
0
    return true;
38
0
  }
39
301k
  for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
40
258k
    if (vstorage->CompactionScore(i) >= 1) {
41
1.18k
      return true;
42
1.18k
    }
43
258k
  }
44
42.8k
  return false;
45
44.0k
}
46
47
namespace {
48
49
enum class CompactToNextLevel {
50
  kNo,   // compact to the same level as the input file
51
  kYes,  // compact to the next level except the last level to the same level
52
  kSkipLastLevel,  // compact to the next level but skip the last level
53
};
54
55
// A class to build a leveled compaction step-by-step.
56
class LevelCompactionBuilder {
57
 public:
58
  LevelCompactionBuilder(const std::string& cf_name,
59
                         VersionStorageInfo* vstorage,
60
                         CompactionPicker* compaction_picker,
61
                         LogBuffer* log_buffer,
62
                         const MutableCFOptions& mutable_cf_options,
63
                         const ImmutableOptions& ioptions,
64
                         const MutableDBOptions& mutable_db_options,
65
                         const std::string& full_history_ts_low)
66
1.86k
      : cf_name_(cf_name),
67
1.86k
        vstorage_(vstorage),
68
1.86k
        compaction_picker_(compaction_picker),
69
1.86k
        log_buffer_(log_buffer),
70
1.86k
        mutable_cf_options_(mutable_cf_options),
71
1.86k
        ioptions_(ioptions),
72
1.86k
        mutable_db_options_(mutable_db_options),
73
1.86k
        full_history_ts_low_(full_history_ts_low) {}
74
75
  // Pick and return a compaction.
76
  Compaction* PickCompaction();
77
78
  // Pick the initial files to compact to the next level. (or together
79
  // in Intra-L0 compactions)
80
  void SetupInitialFiles();
81
82
  // If the initial files are from L0 level, pick other L0
83
  // files if needed.
84
  bool SetupOtherL0FilesIfNeeded();
85
86
  // Compaction with round-robin compaction priority allows more files to be
87
  // picked to form a large compaction
88
  void SetupOtherFilesWithRoundRobinExpansion();
89
  // Based on initial files, setup other files need to be compacted
90
  // in this compaction, accordingly.
91
  bool SetupOtherInputsIfNeeded();
92
93
  Compaction* GetCompaction();
94
95
  // From `start_level_`, pick files to compact to `output_level_`.
96
  // Returns false if there is no file to compact.
97
  // If it returns true, inputs->files.size() will be exactly one for
98
  // all compaction priorities except round-robin. For round-robin,
99
  // multiple consecutive files may be put into inputs->files.
100
  // If level is 0 and there is already a compaction on that level, this
101
  // function will return false.
102
  bool PickFileToCompact();
103
104
  // Return true if a L0 trivial move is picked up.
105
  bool TryPickL0TrivialMove();
106
107
  // For L0->L0, picks the longest span of files that aren't currently
108
  // undergoing compaction for which work-per-deleted-file decreases. The span
109
  // always starts from the newest L0 file.
110
  //
111
  // Intra-L0 compaction is independent of all other files, so it can be
112
  // performed even when L0->base_level compactions are blocked.
113
  //
114
  // Returns true if `inputs` is populated with a span of files to be compacted;
115
  // otherwise, returns false.
116
  bool PickIntraL0Compaction();
117
118
  // When total L0 size is small compared to Lbase, try to pick intra-L0
119
  // compaction starting from the newest L0 file. This helps to prevent
120
  // L0->Lbase compaction with large write-amp.
121
  //
122
  // Returns true iff an intra-L0 compaction is picked.
123
  // `start_level_inputs_` and `output_level_` will be updated accordingly if
124
  // a compaction is picked.
125
  bool PickSizeBasedIntraL0Compaction();
126
127
  // Return true if TrivialMove is extended. `start_index` is the index of
128
  // the initial file picked, which should already be in `start_level_inputs_`.
129
  bool TryExtendNonL0TrivialMove(int start_index,
130
                                 bool only_expand_right = false);
131
132
  // Picks a file from level_files to compact.
133
  // level_files is a vector of (level, file metadata) in ascending order of
134
  // level. If compact_to_next_level is true, compact the file to the next
135
  // level, otherwise, compact to the same level as the input file.
136
  // If skip_last_level is true, skip the last level.
137
  void PickFileToCompact(
138
      const autovector<std::pair<int, FileMetaData*>>& level_files,
139
      CompactToNextLevel compact_to_next_level);
140
141
  const std::string& cf_name_;
142
  VersionStorageInfo* vstorage_;
143
  CompactionPicker* compaction_picker_;
144
  LogBuffer* log_buffer_;
145
  int start_level_ = -1;
146
  int output_level_ = -1;
147
  int parent_index_ = -1;
148
  int base_index_ = -1;
149
  double start_level_score_ = 0;
150
  bool is_l0_trivial_move_ = false;
151
  CompactionInputFiles start_level_inputs_;
152
  std::vector<CompactionInputFiles> compaction_inputs_;
153
  CompactionInputFiles output_level_inputs_;
154
  std::vector<FileMetaData*> grandparents_;
155
  CompactionReason compaction_reason_ = CompactionReason::kUnknown;
156
157
  const MutableCFOptions& mutable_cf_options_;
158
  const ImmutableOptions& ioptions_;
159
  const MutableDBOptions& mutable_db_options_;
160
  const std::string& full_history_ts_low_;
161
  // Pick a path ID to place a newly generated file, with its level
162
  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
163
                            const MutableCFOptions& mutable_cf_options,
164
                            int level);
165
166
  static const int kMinFilesForIntraL0Compaction = 4;
167
};
168
169
void LevelCompactionBuilder::PickFileToCompact(
170
    const autovector<std::pair<int, FileMetaData*>>& level_files,
171
786
    CompactToNextLevel compact_to_next_level) {
172
786
  for (auto& level_file : level_files) {
173
    // If it's being compacted it has nothing to do here.
174
    // If this assert() fails that means that some function marked some
175
    // files as being_compacted, but didn't call ComputeCompactionScore()
176
659
    assert(!level_file.second->being_compacted);
177
659
    start_level_ = level_file.first;
178
659
    if ((compact_to_next_level == CompactToNextLevel::kSkipLastLevel &&
179
0
         start_level_ == vstorage_->num_non_empty_levels() - 1) ||
180
659
        (start_level_ == 0 &&
181
403
         !compaction_picker_->level0_compactions_in_progress()->empty())) {
182
5
      continue;
183
5
    }
184
185
    // Compact to the next level only if the file is not in the last level and
186
    // compact_to_next_level is kYes or kSkipLastLevel.
187
654
    if (compact_to_next_level != CompactToNextLevel::kNo &&
188
0
        (start_level_ < vstorage_->num_non_empty_levels() - 1)) {
189
0
      output_level_ =
190
0
          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
191
654
    } else {
192
654
      output_level_ = start_level_;
193
654
    }
194
654
    start_level_inputs_.files = {level_file.second};
195
654
    start_level_inputs_.level = start_level_;
196
654
    if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
197
654
                                                   &start_level_inputs_)) {
198
654
      return;
199
654
    }
200
654
  }
201
132
  start_level_inputs_.files.clear();
202
132
}
203
204
1.86k
void LevelCompactionBuilder::SetupInitialFiles() {
205
  // Find the compactions by size on all levels.
206
1.86k
  bool skipped_l0_to_base = false;
207
1.86k
  for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
208
1.86k
    start_level_score_ = vstorage_->CompactionScore(i);
209
1.86k
    start_level_ = vstorage_->CompactionScoreLevel(i);
210
1.86k
    assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
211
1.86k
    if (start_level_score_ >= 1) {
212
1.17k
      if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
213
        // If L0->base_level compaction is pending, don't schedule further
214
        // compaction from base level. Otherwise L0->base_level compaction
215
        // may starve.
216
0
        continue;
217
0
      }
218
1.17k
      output_level_ =
219
1.17k
          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
220
1.17k
      bool picked_file_to_compact = PickFileToCompact();
221
1.17k
      TEST_SYNC_POINT_CALLBACK("PostPickFileToCompact",
222
1.17k
                               &picked_file_to_compact);
223
1.17k
      if (picked_file_to_compact) {
224
        // found the compaction!
225
1.17k
        if (start_level_ == 0) {
226
          // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
227
1.17k
          compaction_reason_ = CompactionReason::kLevelL0FilesNum;
228
1.17k
        } else {
229
          // L1+ score = `Level files size` / `MaxBytesForLevel`
230
0
          compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
231
0
        }
232
1.17k
        break;
233
1.17k
      } else {
234
        // didn't find the compaction, clear the inputs
235
0
        start_level_inputs_.clear();
236
0
        if (start_level_ == 0) {
237
0
          skipped_l0_to_base = true;
238
          // L0->base_level may be blocked due to ongoing L0->base_level
239
          // compactions. It may also be blocked by an ongoing compaction from
240
          // base_level downwards.
241
          //
242
          // In these cases, to reduce L0 file count and thus reduce likelihood
243
          // of write stalls, we can attempt compacting a span of files within
244
          // L0.
245
0
          if (PickIntraL0Compaction()) {
246
0
            output_level_ = 0;
247
0
            compaction_reason_ = CompactionReason::kLevelL0FilesNum;
248
0
            break;
249
0
          }
250
0
        }
251
0
      }
252
1.17k
    } else {
253
      // Compaction scores are sorted in descending order, no further scores
254
      // will be >= 1.
255
687
      break;
256
687
    }
257
1.86k
  }
258
1.86k
  if (!start_level_inputs_.empty()) {
259
1.17k
    return;
260
1.17k
  }
261
262
  // if we didn't find a compaction, check if there are any files marked for
263
  // compaction
264
687
  parent_index_ = base_index_ = -1;
265
266
687
  compaction_picker_->PickFilesMarkedForCompaction(
267
687
      cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_,
268
687
      /*skip_marked_file*/ [](const FileMetaData* /* file */) {
269
0
        return false;
270
0
      });
271
687
  if (!start_level_inputs_.empty()) {
272
0
    compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
273
0
    return;
274
0
  }
275
276
  // Bottommost Files Compaction on deleting tombstones
277
687
  PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(),
278
687
                    CompactToNextLevel::kNo);
279
687
  if (!start_level_inputs_.empty()) {
280
654
    compaction_reason_ = CompactionReason::kBottommostFiles;
281
654
    return;
282
654
  }
283
284
  // TTL Compaction
285
33
  if (ioptions_.compaction_pri == kRoundRobin &&
286
0
      !vstorage_->ExpiredTtlFiles().empty()) {
287
0
    auto expired_files = vstorage_->ExpiredTtlFiles();
288
    // the expired files list should already be sorted by level
289
0
    start_level_ = expired_files.front().first;
290
#ifndef NDEBUG
291
    for (const auto& file : expired_files) {
292
      assert(start_level_ <= file.first);
293
    }
294
#endif
295
0
    if (start_level_ > 0) {
296
0
      output_level_ = start_level_ + 1;
297
0
      if (PickFileToCompact()) {
298
0
        compaction_reason_ = CompactionReason::kRoundRobinTtl;
299
0
        return;
300
0
      }
301
0
    }
302
0
  }
303
304
33
  PickFileToCompact(vstorage_->ExpiredTtlFiles(),
305
33
                    CompactToNextLevel::kSkipLastLevel);
306
33
  if (!start_level_inputs_.empty()) {
307
0
    compaction_reason_ = CompactionReason::kTtl;
308
0
    return;
309
0
  }
310
311
  // Periodic Compaction
312
33
  PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(),
313
33
                    ioptions_.level_compaction_dynamic_level_bytes
314
33
                        ? CompactToNextLevel::kYes
315
33
                        : CompactToNextLevel::kNo);
316
33
  if (!start_level_inputs_.empty()) {
317
0
    compaction_reason_ = CompactionReason::kPeriodicCompaction;
318
0
    return;
319
0
  }
320
321
  // Forced blob garbage collection
322
33
  PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(),
323
33
                    CompactToNextLevel::kNo);
324
33
  if (!start_level_inputs_.empty()) {
325
0
    compaction_reason_ = CompactionReason::kForcedBlobGC;
326
0
    return;
327
0
  }
328
33
}
329
330
1.83k
bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
331
1.83k
  if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
332
877
    return compaction_picker_->GetOverlappingL0Files(
333
877
        vstorage_, &start_level_inputs_, output_level_, &parent_index_);
334
877
  }
335
955
  return true;
336
1.83k
}
337
338
0
void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
339
  // We only expand when the start level is not L0 under round robin
340
0
  assert(start_level_ >= 1);
341
342
  // For round-robin compaction priority, we have 3 constraints when picking
343
  // multiple files.
344
  // Constraint 1: We can only pick consecutive files
345
  //  -> Constraint 1a: When a file is being compacted (or some input files
346
  //                    are being compacted after expanding, we cannot
347
  //                    choose it and have to stop choosing more files
348
  //  -> Constraint 1b: When we reach the last file (with largest keys), we
349
  //                    cannot choose more files (the next file will be the
350
  //                    first one)
351
  // Constraint 2: We should ensure the total compaction bytes (including the
352
  //               overlapped files from the next level) is no more than
353
  //               mutable_cf_options_.max_compaction_bytes
354
  // Constraint 3: We try our best to pick as many files as possible so that
355
  //               the post-compaction level size is less than
356
  //               MaxBytesForLevel(start_level_)
357
  // Constraint 4: We do not expand if it is possible to apply a trivial move
358
  // Constraint 5 (TODO): Try to pick minimal files to split into the target
359
  //               number of subcompactions
360
0
  TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
361
362
  // Only expand the inputs when we have selected a file in start_level_inputs_
363
0
  if (start_level_inputs_.size() == 0) {
364
0
    return;
365
0
  }
366
367
0
  uint64_t start_lvl_bytes_no_compacting = 0;
368
0
  uint64_t curr_bytes_to_compact = 0;
369
0
  uint64_t start_lvl_max_bytes_to_compact = 0;
370
0
  const std::vector<FileMetaData*>& level_files =
371
0
      vstorage_->LevelFiles(start_level_);
372
  // Constraint 3 (pre-calculate the ideal max bytes to compact)
373
0
  for (auto f : level_files) {
374
0
    if (!f->being_compacted) {
375
0
      start_lvl_bytes_no_compacting += f->fd.GetFileSize();
376
0
    }
377
0
  }
378
0
  if (start_lvl_bytes_no_compacting >
379
0
      vstorage_->MaxBytesForLevel(start_level_)) {
380
0
    start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
381
0
                                     vstorage_->MaxBytesForLevel(start_level_);
382
0
  }
383
384
0
  size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
385
0
  InternalKey smallest, largest;
386
  // Constraint 4 (No need to check again later)
387
0
  compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
388
0
  CompactionInputFiles output_level_inputs;
389
0
  output_level_inputs.level = output_level_;
390
0
  vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
391
0
                                  &output_level_inputs.files);
392
0
  if (output_level_inputs.empty()) {
393
0
    if (TryExtendNonL0TrivialMove((int)start_index,
394
0
                                  true /* only_expand_right */)) {
395
0
      return;
396
0
    }
397
0
  }
398
  // Constraint 3
399
0
  if (start_level_inputs_[0]->fd.GetFileSize() >=
400
0
      start_lvl_max_bytes_to_compact) {
401
0
    return;
402
0
  }
403
0
  CompactionInputFiles tmp_start_level_inputs;
404
0
  tmp_start_level_inputs = start_level_inputs_;
405
  // TODO (zichen): Future parallel round-robin may also need to update this
406
  // Constraint 1b (only expand till the end)
407
0
  for (size_t i = start_index + 1; i < level_files.size(); i++) {
408
0
    auto* f = level_files[i];
409
0
    if (f->being_compacted) {
410
      // Constraint 1a
411
0
      return;
412
0
    }
413
414
0
    tmp_start_level_inputs.files.push_back(f);
415
0
    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
416
0
                                                    &tmp_start_level_inputs) ||
417
0
        compaction_picker_->FilesRangeOverlapWithCompaction(
418
0
            {tmp_start_level_inputs}, output_level_,
419
0
            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
420
0
                                              ioptions_, start_level_,
421
0
                                              output_level_))) {
422
      // Constraint 1a
423
0
      tmp_start_level_inputs.clear();
424
0
      return;
425
0
    }
426
427
0
    curr_bytes_to_compact = 0;
428
0
    for (auto start_lvl_f : tmp_start_level_inputs.files) {
429
0
      curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
430
0
    }
431
432
    // Check whether any output level files are locked
433
0
    compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
434
0
    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
435
0
                                    &output_level_inputs.files);
436
0
    if (!output_level_inputs.empty() &&
437
0
        !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
438
0
                                                    &output_level_inputs)) {
439
      // Constraint 1a
440
0
      tmp_start_level_inputs.clear();
441
0
      return;
442
0
    }
443
444
0
    uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
445
0
    for (auto output_lvl_f : output_level_inputs.files) {
446
0
      curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
447
0
    }
448
0
    if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
449
      // Constraint 2
450
0
      tmp_start_level_inputs.clear();
451
0
      return;
452
0
    }
453
454
0
    start_level_inputs_.files = tmp_start_level_inputs.files;
455
    // Constraint 3
456
0
    if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
457
0
      return;
458
0
    }
459
0
  }
460
0
}
461
462
1.83k
bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
463
  // Setup input files from output level. For output to L0, we only compact
464
  // spans of files that do not interact with any pending compactions, so don't
465
  // need to consider other levels.
466
1.83k
  if (output_level_ != 0) {
467
1.43k
    output_level_inputs_.level = output_level_;
468
1.43k
    bool round_robin_expanding =
469
1.43k
        ioptions_.compaction_pri == kRoundRobin &&
470
0
        compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
471
1.43k
    if (round_robin_expanding) {
472
0
      SetupOtherFilesWithRoundRobinExpansion();
473
0
    }
474
1.43k
    if (!is_l0_trivial_move_ &&
475
1.13k
        !compaction_picker_->SetupOtherInputs(
476
1.13k
            cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
477
1.13k
            &output_level_inputs_, &parent_index_, base_index_,
478
1.13k
            round_robin_expanding)) {
479
0
      return false;
480
0
    }
481
482
1.43k
    compaction_inputs_.push_back(start_level_inputs_);
483
1.43k
    if (!output_level_inputs_.empty()) {
484
221
      compaction_inputs_.push_back(output_level_inputs_);
485
221
    }
486
487
    // In some edge cases we could pick a compaction that will be compacting
488
    // a key range that overlap with another running compaction, and both
489
    // of them have the same output level. This could happen if
490
    // (1) we are running a non-exclusive manual compaction
491
    // (2) AddFile ingest a new file into the LSM tree
492
    // We need to disallow this from happening.
493
1.43k
    if (compaction_picker_->FilesRangeOverlapWithCompaction(
494
1.43k
            compaction_inputs_, output_level_,
495
1.43k
            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
496
1.43k
                                              ioptions_, start_level_,
497
1.43k
                                              output_level_))) {
498
      // This compaction output could potentially conflict with the output
499
      // of a currently running compaction, we cannot run it.
500
0
      return false;
501
0
    }
502
1.43k
    if (!is_l0_trivial_move_) {
503
1.13k
      compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
504
1.13k
                                          output_level_inputs_, &grandparents_);
505
1.13k
    }
506
1.43k
  } else {
507
398
    compaction_inputs_.push_back(start_level_inputs_);
508
398
  }
509
1.83k
  return true;
510
1.83k
}
511
512
1.86k
Compaction* LevelCompactionBuilder::PickCompaction() {
513
  // Pick up the first file to start compaction. It may have been extended
514
  // to a clean cut.
515
1.86k
  SetupInitialFiles();
516
1.86k
  if (start_level_inputs_.empty()) {
517
33
    return nullptr;
518
33
  }
519
1.86k
  assert(start_level_ >= 0 && output_level_ >= 0);
520
521
  // If it is a L0 -> base level compaction, we need to set up other L0
522
  // files if needed.
523
1.83k
  if (!SetupOtherL0FilesIfNeeded()) {
524
0
    return nullptr;
525
0
  }
526
527
  // Pick files in the output level and expand more files in the start level
528
  // if needed.
529
1.83k
  if (!SetupOtherInputsIfNeeded()) {
530
0
    return nullptr;
531
0
  }
532
533
  // Form a compaction object containing the files we picked.
534
1.83k
  Compaction* c = GetCompaction();
535
536
1.83k
  TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
537
538
1.83k
  return c;
539
1.83k
}
540
541
1.83k
Compaction* LevelCompactionBuilder::GetCompaction() {
542
  // TryPickL0TrivialMove() does not apply to the case when compacting L0 to an
543
  // empty output level. So L0 files is picked in PickFileToCompact() by
544
  // compaction score. We may still be able to do trivial move when this file
545
  // does not overlap with other L0s. This happens when
546
  // compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not
547
  // pull in more L0s.
548
1.83k
  assert(!compaction_inputs_.empty());
549
1.83k
  bool l0_files_might_overlap =
550
1.83k
      start_level_ == 0 && !is_l0_trivial_move_ &&
551
1.27k
      (compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1);
552
1.83k
  auto c = new Compaction(
553
1.83k
      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
554
1.83k
      std::move(compaction_inputs_), output_level_,
555
1.83k
      MaxFileSizeForLevel(mutable_cf_options_, output_level_,
556
1.83k
                          ioptions_.compaction_style, vstorage_->base_level(),
557
1.83k
                          ioptions_.level_compaction_dynamic_level_bytes),
558
1.83k
      mutable_cf_options_.max_compaction_bytes,
559
1.83k
      GetPathId(ioptions_, mutable_cf_options_, output_level_),
560
1.83k
      GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
561
1.83k
                         vstorage_->base_level()),
562
1.83k
      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
563
1.83k
      Temperature::kUnknown,
564
1.83k
      /* max_subcompactions */ 0, std::move(grandparents_),
565
1.83k
      /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
566
1.83k
      compaction_reason_,
567
1.83k
      /* trim_ts */ "", start_level_score_, l0_files_might_overlap);
568
569
  // If it's level 0 compaction, make sure we don't execute any other level 0
570
  // compactions in parallel
571
1.83k
  compaction_picker_->RegisterCompaction(c);
572
573
  // Creating a compaction influences the compaction score because the score
574
  // takes running compactions into account (by skipping files that are already
575
  // being compacted). Since we just changed compaction score, we recalculate it
576
  // here
577
1.83k
  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
578
1.83k
                                    full_history_ts_low_);
579
1.83k
  return c;
580
1.83k
}
581
582
/*
583
 * Find the optimal path to place a file
584
 * Given a level, finds the path where levels up to it will fit in levels
585
 * up to and including this path
586
 */
587
uint32_t LevelCompactionBuilder::GetPathId(
588
    const ImmutableCFOptions& ioptions,
589
1.83k
    const MutableCFOptions& mutable_cf_options, int level) {
590
1.83k
  uint32_t p = 0;
591
1.83k
  assert(!ioptions.cf_paths.empty());
592
593
  // size remaining in the most recent path
594
1.83k
  uint64_t current_path_size = ioptions.cf_paths[0].target_size;
595
596
1.83k
  uint64_t level_size;
597
1.83k
  int cur_level = 0;
598
599
  // max_bytes_for_level_base denotes L1 size.
600
  // We estimate L0 size to be the same as L1.
601
1.83k
  level_size = mutable_cf_options.max_bytes_for_level_base;
602
603
  // Last path is the fallback
604
1.83k
  while (p < ioptions.cf_paths.size() - 1) {
605
0
    if (level_size <= current_path_size) {
606
0
      if (cur_level == level) {
607
        // Does desired level fit in this path?
608
0
        return p;
609
0
      } else {
610
0
        current_path_size -= level_size;
611
0
        if (cur_level > 0) {
612
0
          if (ioptions.level_compaction_dynamic_level_bytes) {
613
            // Currently, level_compaction_dynamic_level_bytes is ignored when
614
            // multiple db paths are specified. https://github.com/facebook/
615
            // rocksdb/blob/main/db/column_family.cc.
616
            // Still, adding this check to avoid accidentally using
617
            // max_bytes_for_level_multiplier_additional
618
0
            level_size = static_cast<uint64_t>(
619
0
                level_size * mutable_cf_options.max_bytes_for_level_multiplier);
620
0
          } else {
621
0
            level_size = static_cast<uint64_t>(
622
0
                level_size * mutable_cf_options.max_bytes_for_level_multiplier *
623
0
                mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
624
0
          }
625
0
        }
626
0
        cur_level++;
627
0
        continue;
628
0
      }
629
0
    }
630
0
    p++;
631
0
    current_path_size = ioptions.cf_paths[p].target_size;
632
0
  }
633
1.83k
  return p;
634
1.83k
}
635
636
1.17k
bool LevelCompactionBuilder::TryPickL0TrivialMove() {
637
1.17k
  if (vstorage_->base_level() <= 0) {
638
0
    return false;
639
0
  }
640
1.17k
  if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
641
1.17k
      !vstorage_->LevelFiles(output_level_).empty() &&
642
547
      ioptions_.db_paths.size() <= 1) {
643
    // Try to pick trivial move from L0 to L1. We start from the oldest
644
    // file. We keep expanding to newer files if it would form a
645
    // trivial move.
646
    // For now we don't support it with
647
    // mutable_cf_options_.compression_per_level to prevent the logic
648
    // of determining whether L0 can be trivial moved to the next level.
649
    // We skip the case where output level is empty, since in this case, at
650
    // least the oldest file would qualify for trivial move, and this would
651
    // be a surprising behavior with few benefits.
652
653
    // We search from the oldest file from the newest. In theory, there are
654
    // files in the middle can form trivial move too, but it is probably
655
    // uncommon and we ignore these cases for simplicity.
656
547
    const std::vector<FileMetaData*>& level_files =
657
547
        vstorage_->LevelFiles(start_level_);
658
659
547
    InternalKey my_smallest, my_largest;
660
1.20k
    for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
661
1.17k
      CompactionInputFiles output_level_inputs;
662
1.17k
      output_level_inputs.level = output_level_;
663
1.17k
      FileMetaData* file = *it;
664
1.17k
      if (it == level_files.rbegin()) {
665
547
        my_smallest = file->smallest;
666
547
        my_largest = file->largest;
667
623
      } else {
668
623
        if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
669
623
            0) {
670
422
          my_smallest = file->smallest;
671
422
        } else if (compaction_picker_->icmp()->Compare(file->smallest,
672
201
                                                       my_largest) > 0) {
673
143
          my_largest = file->largest;
674
143
        } else {
675
58
          break;
676
58
        }
677
623
      }
678
1.11k
      vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
679
1.11k
                                      &output_level_inputs.files);
680
1.11k
      if (output_level_inputs.empty()) {
681
654
        assert(!file->being_compacted);
682
654
        start_level_inputs_.files.push_back(file);
683
654
      } else {
684
458
        break;
685
458
      }
686
1.11k
    }
687
547
  }
688
689
1.17k
  if (!start_level_inputs_.empty()) {
690
    // Sort files by key range. Not sure it's 100% necessary but it's cleaner
691
    // to always keep files sorted by key the key ranges don't overlap.
692
301
    std::sort(start_level_inputs_.files.begin(),
693
301
              start_level_inputs_.files.end(),
694
301
              [icmp = compaction_picker_->icmp()](FileMetaData* f1,
695
459
                                                  FileMetaData* f2) -> bool {
696
459
                return (icmp->Compare(f1->smallest, f2->smallest) < 0);
697
459
              });
698
699
301
    is_l0_trivial_move_ = true;
700
301
    return true;
701
301
  }
702
877
  return false;
703
1.17k
}
704
705
bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
706
0
                                                       bool only_expand_right) {
707
0
  if (start_level_inputs_.size() == 1 &&
708
0
      (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
709
0
      (mutable_cf_options_.compression_per_level.empty())) {
710
    // Only file of `index`, and it is likely a trivial move. Try to
711
    // expand if it is still a trivial move, but not beyond
712
    // max_compaction_bytes or 4 files, so that we don't create too
713
    // much compaction pressure for the next level.
714
    // Ignore if there are more than one DB path, as it would be hard
715
    // to predict whether it is a trivial move.
716
0
    const std::vector<FileMetaData*>& level_files =
717
0
        vstorage_->LevelFiles(start_level_);
718
0
    const size_t kMaxMultiTrivialMove = 4;
719
0
    FileMetaData* initial_file = start_level_inputs_.files[0];
720
0
    size_t total_size = initial_file->fd.GetFileSize();
721
0
    CompactionInputFiles output_level_inputs;
722
0
    output_level_inputs.level = output_level_;
723
    // Expand towards right
724
0
    for (int i = start_index + 1;
725
0
         i < static_cast<int>(level_files.size()) &&
726
0
         start_level_inputs_.size() < kMaxMultiTrivialMove;
727
0
         i++) {
728
0
      FileMetaData* next_file = level_files[i];
729
0
      if (next_file->being_compacted) {
730
0
        break;
731
0
      }
732
0
      vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
733
0
                                      &(next_file->largest),
734
0
                                      &output_level_inputs.files);
735
0
      if (!output_level_inputs.empty()) {
736
0
        break;
737
0
      }
738
0
      if (i < static_cast<int>(level_files.size()) - 1 &&
739
0
          compaction_picker_->icmp()
740
0
                  ->user_comparator()
741
0
                  ->CompareWithoutTimestamp(
742
0
                      next_file->largest.user_key(),
743
0
                      level_files[i + 1]->smallest.user_key()) == 0) {
744
0
        TEST_SYNC_POINT_CALLBACK(
745
0
            "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
746
0
            nullptr);
747
        // Not a clean up after adding the next file. Skip.
748
0
        break;
749
0
      }
750
0
      total_size += next_file->fd.GetFileSize();
751
0
      if (total_size > mutable_cf_options_.max_compaction_bytes) {
752
0
        break;
753
0
      }
754
0
      start_level_inputs_.files.push_back(next_file);
755
0
    }
756
    // Expand towards left
757
0
    if (!only_expand_right) {
758
0
      for (int i = start_index - 1;
759
0
           i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) {
760
0
        FileMetaData* next_file = level_files[i];
761
0
        if (next_file->being_compacted) {
762
0
          break;
763
0
        }
764
0
        vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest),
765
0
                                        &(initial_file->largest),
766
0
                                        &output_level_inputs.files);
767
0
        if (!output_level_inputs.empty()) {
768
0
          break;
769
0
        }
770
0
        if (i > 0 && compaction_picker_->icmp()
771
0
                             ->user_comparator()
772
0
                             ->CompareWithoutTimestamp(
773
0
                                 next_file->smallest.user_key(),
774
0
                                 level_files[i - 1]->largest.user_key()) == 0) {
775
          // Not a clean up after adding the next file. Skip.
776
0
          break;
777
0
        }
778
0
        total_size += next_file->fd.GetFileSize();
779
0
        if (total_size > mutable_cf_options_.max_compaction_bytes) {
780
0
          break;
781
0
        }
782
        // keep `files` sorted in increasing order by key range
783
0
        start_level_inputs_.files.insert(start_level_inputs_.files.begin(),
784
0
                                         next_file);
785
0
      }
786
0
    }
787
0
    return start_level_inputs_.size() > 1;
788
0
  }
789
0
  return false;
790
0
}
791
792
1.17k
bool LevelCompactionBuilder::PickFileToCompact() {
793
  // level 0 files are overlapping. So we cannot pick more
794
  // than one concurrent compactions at this level. This
795
  // could be made better by looking at key-ranges that are
796
  // being compacted at level 0.
797
1.17k
  if (start_level_ == 0 &&
798
1.17k
      !compaction_picker_->level0_compactions_in_progress()->empty()) {
799
0
    if (PickSizeBasedIntraL0Compaction()) {
800
0
      return true;
801
0
    }
802
0
    TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
803
0
    return false;
804
0
  }
805
806
1.17k
  start_level_inputs_.clear();
807
1.17k
  start_level_inputs_.level = start_level_;
808
809
1.17k
  assert(start_level_ >= 0);
810
811
1.17k
  if (TryPickL0TrivialMove()) {
812
301
    return true;
813
301
  }
814
877
  if (start_level_ == 0 && PickSizeBasedIntraL0Compaction()) {
815
0
    return true;
816
0
  }
817
818
877
  const std::vector<FileMetaData*>& level_files =
819
877
      vstorage_->LevelFiles(start_level_);
820
821
  // Pick the file with the highest score in this level that is not already
822
  // being compacted.
823
877
  const std::vector<int>& file_scores =
824
877
      vstorage_->FilesByCompactionPri(start_level_);
825
826
877
  unsigned int cmp_idx;
827
877
  for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
828
877
       cmp_idx < file_scores.size(); cmp_idx++) {
829
877
    int index = file_scores[cmp_idx];
830
877
    auto* f = level_files[index];
831
832
    // do not pick a file to compact if it is being compacted
833
    // from n-1 level.
834
877
    if (f->being_compacted) {
835
0
      if (ioptions_.compaction_pri == kRoundRobin) {
836
        // TODO(zichen): this file may be involved in one compaction from
837
        // an upper level, cannot advance the cursor for round-robin policy.
838
        // Currently, we do not pick any file to compact in this case. We
839
        // should fix this later to ensure a compaction is picked but the
840
        // cursor shall not be advanced.
841
0
        return false;
842
0
      }
843
0
      continue;
844
0
    }
845
846
877
    start_level_inputs_.files.push_back(f);
847
877
    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
848
877
                                                    &start_level_inputs_) ||
849
877
        compaction_picker_->FilesRangeOverlapWithCompaction(
850
877
            {start_level_inputs_}, output_level_,
851
877
            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
852
877
                                              ioptions_, start_level_,
853
877
                                              output_level_))) {
854
      // A locked (pending compaction) input-level file was pulled in due to
855
      // user-key overlap.
856
0
      start_level_inputs_.clear();
857
858
0
      if (ioptions_.compaction_pri == kRoundRobin) {
859
0
        return false;
860
0
      }
861
0
      continue;
862
0
    }
863
864
    // Now that input level is fully expanded, we check whether any output
865
    // files are locked due to pending compaction.
866
    //
867
    // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
868
    // level files are locked, not just the extra ones pulled in for user-key
869
    // overlap.
870
877
    InternalKey smallest, largest;
871
877
    compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
872
877
    CompactionInputFiles output_level_inputs;
873
877
    output_level_inputs.level = output_level_;
874
877
    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
875
877
                                    &output_level_inputs.files);
876
877
    if (output_level_inputs.empty()) {
877
656
      if (start_level_ > 0 &&
878
0
          TryExtendNonL0TrivialMove(index,
879
0
                                    ioptions_.compaction_pri ==
880
0
                                        kRoundRobin /* only_expand_right */)) {
881
0
        break;
882
0
      }
883
656
    } else {
884
221
      if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
885
221
                                                      &output_level_inputs)) {
886
0
        start_level_inputs_.clear();
887
0
        if (ioptions_.compaction_pri == kRoundRobin) {
888
0
          return false;
889
0
        }
890
0
        continue;
891
0
      }
892
221
    }
893
894
877
    base_index_ = index;
895
877
    break;
896
877
  }
897
898
  // store where to start the iteration in the next call to PickCompaction
899
877
  if (ioptions_.compaction_pri != kRoundRobin) {
900
877
    vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
901
877
  }
902
877
  return start_level_inputs_.size() > 0;
903
877
}
904
905
0
bool LevelCompactionBuilder::PickIntraL0Compaction() {
906
0
  start_level_inputs_.clear();
907
0
  const std::vector<FileMetaData*>& level_files =
908
0
      vstorage_->LevelFiles(0 /* level */);
909
0
  if (level_files.size() <
910
0
          static_cast<size_t>(
911
0
              mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
912
0
      level_files[0]->being_compacted) {
913
    // If L0 isn't accumulating much files beyond the regular trigger, don't
914
    // resort to L0->L0 compaction yet.
915
0
    return false;
916
0
  }
917
0
  return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
918
0
                               std::numeric_limits<uint64_t>::max(),
919
0
                               mutable_cf_options_.max_compaction_bytes,
920
0
                               &start_level_inputs_);
921
0
}
922
923
877
bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
924
877
  assert(start_level_ == 0);
925
877
  int base_level = vstorage_->base_level();
926
877
  if (base_level <= 0) {
927
0
    return false;
928
0
  }
929
877
  const std::vector<FileMetaData*>& l0_files =
930
877
      vstorage_->LevelFiles(/*level=*/0);
931
877
  size_t min_num_file =
932
877
      std::max(2, mutable_cf_options_.level0_file_num_compaction_trigger);
933
877
  if (l0_files.size() < min_num_file) {
934
0
    return false;
935
0
  }
936
877
  uint64_t l0_size = 0;
937
3.73k
  for (const auto& file : l0_files) {
938
3.73k
    assert(file->compensated_file_size >= file->fd.GetFileSize());
939
    // Compact down L0s with more deletions.
940
3.73k
    l0_size += file->compensated_file_size;
941
3.73k
  }
942
943
  // Avoid L0->Lbase compactions that are inefficient for write-amp.
944
877
  const double kMultiplier =
945
877
      std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
946
877
  const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
947
877
  assert(min_lbase_size >= l0_size);
948
877
  const std::vector<FileMetaData*>& lbase_files =
949
877
      vstorage_->LevelFiles(/*level=*/base_level);
950
877
  uint64_t lbase_size = 0;
951
877
  for (const auto& file : lbase_files) {
952
780
    lbase_size += file->fd.GetFileSize();
953
780
    if (lbase_size > min_lbase_size) {
954
0
      break;
955
0
    }
956
780
  }
957
877
  if (lbase_size <= min_lbase_size) {
958
877
    return false;
959
877
  }
960
961
0
  start_level_inputs_.clear();
962
0
  start_level_inputs_.level = 0;
963
0
  for (const auto& file : l0_files) {
964
0
    if (file->being_compacted) {
965
0
      break;
966
0
    }
967
0
    start_level_inputs_.files.push_back(file);
968
0
  }
969
0
  if (start_level_inputs_.files.size() < min_num_file) {
970
0
    start_level_inputs_.clear();
971
0
    return false;
972
0
  }
973
0
  output_level_ = 0;
974
0
  return true /* picked an intra-L0 compaction */;
975
0
}
976
}  // namespace
977
978
Compaction* LevelCompactionPicker::PickCompaction(
979
    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
980
    const MutableDBOptions& mutable_db_options,
981
    const std::vector<SequenceNumber>& /*existing_snapshots */,
982
    const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
983
    LogBuffer* log_buffer, const std::string& full_history_ts_low,
984
1.86k
    bool /* require_max_output_level*/) {
985
1.86k
  LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
986
1.86k
                                 mutable_cf_options, ioptions_,
987
1.86k
                                 mutable_db_options, full_history_ts_low);
988
1.86k
  return builder.PickCompaction();
989
1.86k
}
990
}  // namespace ROCKSDB_NAMESPACE