Coverage Report

Created: 2025-07-18 07:11

/src/leveldb/db/version_set.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5
#include "db/version_set.h"
6
7
#include <algorithm>
8
#include <cstdio>
9
10
#include "db/filename.h"
11
#include "db/log_reader.h"
12
#include "db/log_writer.h"
13
#include "db/memtable.h"
14
#include "db/table_cache.h"
15
#include "leveldb/env.h"
16
#include "leveldb/table_builder.h"
17
#include "table/merger.h"
18
#include "table/two_level_iterator.h"
19
#include "util/coding.h"
20
#include "util/logging.h"
21
22
namespace leveldb {
23
24
1.34M
static size_t TargetFileSize(const Options* options) {
25
1.34M
  return options->max_file_size;
26
1.34M
}
27
28
// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
29
// stop building a single file in a level->level+1 compaction.
30
1.30M
static int64_t MaxGrandParentOverlapBytes(const Options* options) {
31
1.30M
  return 10 * TargetFileSize(options);
32
1.30M
}
33
34
// Maximum number of bytes in all compacted files.  We avoid expanding
35
// the lower level file set of a compaction if it would make the
36
// total compaction cover more than this many bytes.
37
3.75k
static int64_t ExpandedCompactionByteSizeLimit(const Options* options) {
38
3.75k
  return 25 * TargetFileSize(options);
39
3.75k
}
40
41
1.46M
static double MaxBytesForLevel(const Options* options, int level) {
42
  // Note: the result for level zero is not really used since we set
43
  // the level-0 compaction threshold based on number of files.
44
45
  // Result for both level-0 and level-1
46
1.46M
  double result = 10. * 1048576.0;
47
4.40M
  while (level > 1) {
48
2.93M
    result *= 10;
49
2.93M
    level--;
50
2.93M
  }
51
1.46M
  return result;
52
1.46M
}
53
54
41.5k
static uint64_t MaxFileSizeForLevel(const Options* options, int level) {
55
  // We could vary per level to reduce number of files?
56
41.5k
  return TargetFileSize(options);
57
41.5k
}
58
59
1.54M
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
60
1.54M
  int64_t sum = 0;
61
3.18M
  for (size_t i = 0; i < files.size(); i++) {
62
1.63M
    sum += files[i]->file_size;
63
1.63M
  }
64
1.54M
  return sum;
65
1.54M
}
66
67
529k
Version::~Version() {
68
529k
  assert(refs_ == 0);
69
70
  // Remove from linked list
71
529k
  prev_->next_ = next_;
72
529k
  next_->prev_ = prev_;
73
74
  // Drop references to files
75
4.23M
  for (int level = 0; level < config::kNumLevels; level++) {
76
5.88M
    for (size_t i = 0; i < files_[level].size(); i++) {
77
2.17M
      FileMetaData* f = files_[level][i];
78
2.17M
      assert(f->refs > 0);
79
2.17M
      f->refs--;
80
2.17M
      if (f->refs <= 0) {
81
894k
        delete f;
82
894k
      }
83
2.17M
    }
84
3.70M
  }
85
529k
}
86
87
int FindFile(const InternalKeyComparator& icmp,
88
229k
             const std::vector<FileMetaData*>& files, const Slice& key) {
89
229k
  uint32_t left = 0;
90
229k
  uint32_t right = files.size();
91
350k
  while (left < right) {
92
120k
    uint32_t mid = (left + right) / 2;
93
120k
    const FileMetaData* f = files[mid];
94
120k
    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
95
      // Key at "mid.largest" is < "target".  Therefore all
96
      // files at or before "mid" are uninteresting.
97
24.9k
      left = mid + 1;
98
95.5k
    } else {
99
      // Key at "mid.largest" is >= "target".  Therefore all files
100
      // after "mid" are uninteresting.
101
95.5k
      right = mid;
102
95.5k
    }
103
120k
  }
104
229k
  return right;
105
229k
}
106
107
static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
108
9.83k
                      const FileMetaData* f) {
109
  // null user_key occurs before all keys and is therefore never after *f
110
9.83k
  return (user_key != nullptr &&
111
9.83k
          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
112
9.83k
}
113
114
static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
115
45.2k
                       const FileMetaData* f) {
116
  // null user_key occurs after all keys and is therefore never before *f
117
45.2k
  return (user_key != nullptr &&
118
45.2k
          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
119
45.2k
}
120
121
bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
122
                           bool disjoint_sorted_files,
123
                           const std::vector<FileMetaData*>& files,
124
                           const Slice* smallest_user_key,
125
213k
                           const Slice* largest_user_key) {
126
213k
  const Comparator* ucmp = icmp.user_comparator();
127
213k
  if (!disjoint_sorted_files) {
128
    // Need to check against all files
129
21.1k
    for (size_t i = 0; i < files.size(); i++) {
130
9.83k
      const FileMetaData* f = files[i];
131
9.83k
      if (AfterFile(ucmp, smallest_user_key, f) ||
132
9.83k
          BeforeFile(ucmp, largest_user_key, f)) {
133
        // No overlap
134
6.69k
      } else {
135
3.14k
        return true;  // Overlap
136
3.14k
      }
137
9.83k
    }
138
11.3k
    return false;
139
14.4k
  }
140
141
  // Binary search over file list
142
198k
  uint32_t index = 0;
143
198k
  if (smallest_user_key != nullptr) {
144
    // Find the earliest possible internal key for smallest_user_key
145
198k
    InternalKey small_key(*smallest_user_key, kMaxSequenceNumber,
146
198k
                          kValueTypeForSeek);
147
198k
    index = FindFile(icmp, files, small_key.Encode());
148
198k
  }
149
150
198k
  if (index >= files.size()) {
151
    // beginning of range is after all files, so no overlap.
152
159k
    return false;
153
159k
  }
154
155
39.2k
  return !BeforeFile(ucmp, largest_user_key, files[index]);
156
198k
}
157
158
// An internal iterator.  For a given version/level pair, yields
159
// information about the files in the level.  For a given entry, key()
160
// is the largest key that occurs in the file, and value() is an
161
// 16-byte value containing the file number and file size, both
162
// encoded using EncodeFixed64.
163
class Version::LevelFileNumIterator : public Iterator {
164
 public:
165
  LevelFileNumIterator(const InternalKeyComparator& icmp,
166
                       const std::vector<FileMetaData*>* flist)
167
152k
      : icmp_(icmp), flist_(flist), index_(flist->size()) {  // Marks as invalid
168
152k
  }
169
587k
  bool Valid() const override { return index_ < flist_->size(); }
170
0
  void Seek(const Slice& target) override {
171
0
    index_ = FindFile(icmp_, *flist_, target);
172
0
  }
173
82.2k
  void SeekToFirst() override { index_ = 0; }
174
0
  void SeekToLast() override {
175
0
    index_ = flist_->empty() ? 0 : flist_->size() - 1;
176
0
  }
177
351k
  void Next() override {
178
351k
    assert(Valid());
179
351k
    index_++;
180
351k
  }
181
0
  void Prev() override {
182
0
    assert(Valid());
183
0
    if (index_ == 0) {
184
0
      index_ = flist_->size();  // Marks as invalid
185
0
    } else {
186
0
      index_--;
187
0
    }
188
0
  }
189
358k
  Slice key() const override {
190
358k
    assert(Valid());
191
358k
    return (*flist_)[index_]->largest.Encode();
192
358k
  }
193
358k
  Slice value() const override {
194
358k
    assert(Valid());
195
358k
    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
196
358k
    EncodeFixed64(value_buf_ + 8, (*flist_)[index_]->file_size);
197
358k
    return Slice(value_buf_, sizeof(value_buf_));
198
358k
  }
199
36.5k
  Status status() const override { return Status::OK(); }
200
201
 private:
202
  const InternalKeyComparator icmp_;
203
  const std::vector<FileMetaData*>* const flist_;
204
  uint32_t index_;
205
206
  // Backing store for value().  Holds the file number and size.
207
  mutable char value_buf_[16];
208
};
209
210
static Iterator* GetFileIterator(void* arg, const ReadOptions& options,
211
358k
                                 const Slice& file_value) {
212
358k
  TableCache* cache = reinterpret_cast<TableCache*>(arg);
213
358k
  if (file_value.size() != 16) {
214
0
    return NewErrorIterator(
215
0
        Status::Corruption("FileReader invoked with unexpected value"));
216
358k
  } else {
217
358k
    return cache->NewIterator(options, DecodeFixed64(file_value.data()),
218
358k
                              DecodeFixed64(file_value.data() + 8));
219
358k
  }
220
358k
}
221
222
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
223
126k
                                            int level) const {
224
126k
  return NewTwoLevelIterator(
225
126k
      new LevelFileNumIterator(vset_->icmp_, &files_[level]), &GetFileIterator,
226
126k
      vset_->table_cache_, options);
227
126k
}
228
229
void Version::AddIterators(const ReadOptions& options,
230
148k
                           std::vector<Iterator*>* iters) {
231
  // Merge all level zero files together since they may overlap
232
480k
  for (size_t i = 0; i < files_[0].size(); i++) {
233
331k
    iters->push_back(vset_->table_cache_->NewIterator(
234
331k
        options, files_[0][i]->number, files_[0][i]->file_size));
235
331k
  }
236
237
  // For levels > 0, we can use a concatenating iterator that sequentially
238
  // walks through the non-overlapping files in the level, opening them
239
  // lazily.
240
1.04M
  for (int level = 1; level < config::kNumLevels; level++) {
241
893k
    if (!files_[level].empty()) {
242
126k
      iters->push_back(NewConcatenatingIterator(options, level));
243
126k
    }
244
893k
  }
245
148k
}
246
247
// Callback from TableCache::Get()
248
namespace {
249
enum SaverState {
250
  kNotFound,
251
  kFound,
252
  kDeleted,
253
  kCorrupt,
254
};
255
struct Saver {
256
  SaverState state;
257
  const Comparator* ucmp;
258
  Slice user_key;
259
  std::string* value;
260
};
261
}  // namespace
262
47.0k
static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
263
47.0k
  Saver* s = reinterpret_cast<Saver*>(arg);
264
47.0k
  ParsedInternalKey parsed_key;
265
47.0k
  if (!ParseInternalKey(ikey, &parsed_key)) {
266
0
    s->state = kCorrupt;
267
47.0k
  } else {
268
47.0k
    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
269
16.4k
      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
270
16.4k
      if (s->state == kFound) {
271
10.5k
        s->value->assign(v.data(), v.size());
272
10.5k
      }
273
16.4k
    }
274
47.0k
  }
275
47.0k
}
276
277
39.7k
static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
278
39.7k
  return a->number > b->number;
279
39.7k
}
280
281
void Version::ForEachOverlapping(Slice user_key, Slice internal_key, void* arg,
282
46.3k
                                 bool (*func)(void*, int, FileMetaData*)) {
283
46.3k
  const Comparator* ucmp = vset_->icmp_.user_comparator();
284
285
  // Search level-0 in order from newest to oldest.
286
46.3k
  std::vector<FileMetaData*> tmp;
287
46.3k
  tmp.reserve(files_[0].size());
288
163k
  for (uint32_t i = 0; i < files_[0].size(); i++) {
289
116k
    FileMetaData* f = files_[0][i];
290
116k
    if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
291
116k
        ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
292
55.4k
      tmp.push_back(f);
293
55.4k
    }
294
116k
  }
295
46.3k
  if (!tmp.empty()) {
296
24.6k
    std::sort(tmp.begin(), tmp.end(), NewestFirst);
297
45.5k
    for (uint32_t i = 0; i < tmp.size(); i++) {
298
33.0k
      if (!(*func)(arg, 0, tmp[i])) {
299
12.1k
        return;
300
12.1k
      }
301
33.0k
    }
302
24.6k
  }
303
304
  // Search other levels.
305
206k
  for (int level = 1; level < config::kNumLevels; level++) {
306
177k
    size_t num_files = files_[level].size();
307
177k
    if (num_files == 0) continue;
308
309
    // Binary search to find earliest index whose largest key >= internal_key.
310
31.0k
    uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
311
31.0k
    if (index < num_files) {
312
27.9k
      FileMetaData* f = files_[level][index];
313
27.9k
      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
314
        // All of "f" is past any data for user_key
315
22.2k
      } else {
316
22.2k
        if (!(*func)(arg, level, f)) {
317
5.83k
          return;
318
5.83k
        }
319
22.2k
      }
320
27.9k
    }
321
31.0k
  }
322
34.1k
}
323
324
Status Version::Get(const ReadOptions& options, const LookupKey& k,
325
39.8k
                    std::string* value, GetStats* stats) {
326
39.8k
  stats->seek_file = nullptr;
327
39.8k
  stats->seek_file_level = -1;
328
329
39.8k
  struct State {
330
39.8k
    Saver saver;
331
39.8k
    GetStats* stats;
332
39.8k
    const ReadOptions* options;
333
39.8k
    Slice ikey;
334
39.8k
    FileMetaData* last_file_read;
335
39.8k
    int last_file_read_level;
336
337
39.8k
    VersionSet* vset;
338
39.8k
    Status s;
339
39.8k
    bool found;
340
341
47.4k
    static bool Match(void* arg, int level, FileMetaData* f) {
342
47.4k
      State* state = reinterpret_cast<State*>(arg);
343
344
47.4k
      if (state->stats->seek_file == nullptr &&
345
47.4k
          state->last_file_read != nullptr) {
346
        // We have had more than one seek for this read.  Charge the 1st file.
347
10.9k
        state->stats->seek_file = state->last_file_read;
348
10.9k
        state->stats->seek_file_level = state->last_file_read_level;
349
10.9k
      }
350
351
47.4k
      state->last_file_read = f;
352
47.4k
      state->last_file_read_level = level;
353
354
47.4k
      state->s = state->vset->table_cache_->Get(*state->options, f->number,
355
47.4k
                                                f->file_size, state->ikey,
356
47.4k
                                                &state->saver, SaveValue);
357
47.4k
      if (!state->s.ok()) {
358
0
        state->found = true;
359
0
        return false;
360
0
      }
361
47.4k
      switch (state->saver.state) {
362
30.9k
        case kNotFound:
363
30.9k
          return true;  // Keep searching in other files
364
10.5k
        case kFound:
365
10.5k
          state->found = true;
366
10.5k
          return false;
367
5.94k
        case kDeleted:
368
5.94k
          return false;
369
0
        case kCorrupt:
370
0
          state->s =
371
0
              Status::Corruption("corrupted key for ", state->saver.user_key);
372
0
          state->found = true;
373
0
          return false;
374
47.4k
      }
375
376
      // Not reached. Added to avoid false compilation warnings of
377
      // "control reaches end of non-void function".
378
0
      return false;
379
47.4k
    }
380
39.8k
  };
381
382
39.8k
  State state;
383
39.8k
  state.found = false;
384
39.8k
  state.stats = stats;
385
39.8k
  state.last_file_read = nullptr;
386
39.8k
  state.last_file_read_level = -1;
387
388
39.8k
  state.options = &options;
389
39.8k
  state.ikey = k.internal_key();
390
39.8k
  state.vset = vset_;
391
392
39.8k
  state.saver.state = kNotFound;
393
39.8k
  state.saver.ucmp = vset_->icmp_.user_comparator();
394
39.8k
  state.saver.user_key = k.user_key();
395
39.8k
  state.saver.value = value;
396
397
39.8k
  ForEachOverlapping(state.saver.user_key, state.ikey, &state, &State::Match);
398
399
39.8k
  return state.found ? state.s : Status::NotFound(Slice());
400
39.8k
}
401
402
41.3k
bool Version::UpdateStats(const GetStats& stats) {
403
41.3k
  FileMetaData* f = stats.seek_file;
404
41.3k
  if (f != nullptr) {
405
12.4k
    f->allowed_seeks--;
406
12.4k
    if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) {
407
45
      file_to_compact_ = f;
408
45
      file_to_compact_level_ = stats.seek_file_level;
409
45
      return true;
410
45
    }
411
12.4k
  }
412
41.3k
  return false;
413
41.3k
}
414
415
6.43k
bool Version::RecordReadSample(Slice internal_key) {
416
6.43k
  ParsedInternalKey ikey;
417
6.43k
  if (!ParseInternalKey(internal_key, &ikey)) {
418
0
    return false;
419
0
  }
420
421
6.43k
  struct State {
422
6.43k
    GetStats stats;  // Holds first matching file
423
6.43k
    int matches;
424
425
7.81k
    static bool Match(void* arg, int level, FileMetaData* f) {
426
7.81k
      State* state = reinterpret_cast<State*>(arg);
427
7.81k
      state->matches++;
428
7.81k
      if (state->matches == 1) {
429
        // Remember first match.
430
6.31k
        state->stats.seek_file = f;
431
6.31k
        state->stats.seek_file_level = level;
432
6.31k
      }
433
      // We can stop iterating once we have a second match.
434
7.81k
      return state->matches < 2;
435
7.81k
    }
436
6.43k
  };
437
438
6.43k
  State state;
439
6.43k
  state.matches = 0;
440
6.43k
  ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
441
442
  // Must have at least two matches since we want to merge across
443
  // files. But what if we have a single file that contains many
444
  // overwrites and deletions?  Should we have another mechanism for
445
  // finding such files?
446
6.43k
  if (state.matches >= 2) {
447
    // 1MB cost is about 1 seek (see comment in Builder::Apply).
448
1.50k
    return UpdateStats(state.stats);
449
1.50k
  }
450
4.93k
  return false;
451
6.43k
}
452
453
965k
void Version::Ref() { ++refs_; }
454
455
965k
void Version::Unref() {
456
965k
  assert(this != &vset_->dummy_versions_);
457
965k
  assert(refs_ >= 1);
458
965k
  --refs_;
459
965k
  if (refs_ == 0) {
460
411k
    delete this;
461
411k
  }
462
965k
}
463
464
bool Version::OverlapInLevel(int level, const Slice* smallest_user_key,
465
213k
                             const Slice* largest_user_key) {
466
213k
  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
467
213k
                               smallest_user_key, largest_user_key);
468
213k
}
469
470
int Version::PickLevelForMemTableOutput(const Slice& smallest_user_key,
471
14.4k
                                        const Slice& largest_user_key) {
472
14.4k
  int level = 0;
473
14.4k
  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
474
    // Push to next level if there is no overlap in next level,
475
    // and the #bytes overlapping in the level after that are limited.
476
11.3k
    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
477
11.3k
    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
478
11.3k
    std::vector<FileMetaData*> overlaps;
479
24.6k
    while (level < config::kMaxMemCompactLevel) {
480
20.0k
      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
481
6.76k
        break;
482
6.76k
      }
483
13.2k
      if (level + 2 < config::kNumLevels) {
484
        // Check that file does not overlap too many grandparent bytes.
485
13.2k
        GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
486
13.2k
        const int64_t sum = TotalFileSize(overlaps);
487
13.2k
        if (sum > MaxGrandParentOverlapBytes(vset_->options_)) {
488
0
          break;
489
0
        }
490
13.2k
      }
491
13.2k
      level++;
492
13.2k
    }
493
11.3k
  }
494
14.4k
  return level;
495
14.4k
}
496
497
// Store in "*inputs" all files in "level" that overlap [begin,end]
498
void Version::GetOverlappingInputs(int level, const InternalKey* begin,
499
                                   const InternalKey* end,
500
183k
                                   std::vector<FileMetaData*>* inputs) {
501
183k
  assert(level >= 0);
502
183k
  assert(level < config::kNumLevels);
503
183k
  inputs->clear();
504
183k
  Slice user_begin, user_end;
505
183k
  if (begin != nullptr) {
506
183k
    user_begin = begin->user_key();
507
183k
  }
508
183k
  if (end != nullptr) {
509
183k
    user_end = end->user_key();
510
183k
  }
511
183k
  const Comparator* user_cmp = vset_->icmp_.user_comparator();
512
814k
  for (size_t i = 0; i < files_[level].size();) {
513
630k
    FileMetaData* f = files_[level][i++];
514
630k
    const Slice file_start = f->smallest.user_key();
515
630k
    const Slice file_limit = f->largest.user_key();
516
630k
    if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
517
      // "f" is completely before specified range; skip it
518
453k
    } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
519
      // "f" is completely after specified range; skip it
520
252k
    } else {
521
201k
      inputs->push_back(f);
522
201k
      if (level == 0) {
523
        // Level-0 files may overlap each other.  So check if the newly
524
        // added file has expanded the range.  If so, restart search.
525
136k
        if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
526
1.28k
          user_begin = file_start;
527
1.28k
          inputs->clear();
528
1.28k
          i = 0;
529
135k
        } else if (end != nullptr &&
530
135k
                   user_cmp->Compare(file_limit, user_end) > 0) {
531
3.84k
          user_end = file_limit;
532
3.84k
          inputs->clear();
533
3.84k
          i = 0;
534
3.84k
        }
535
136k
      }
536
201k
    }
537
630k
  }
538
183k
}
539
540
298
std::string Version::DebugString() const {
541
298
  std::string r;
542
2.38k
  for (int level = 0; level < config::kNumLevels; level++) {
543
    // E.g.,
544
    //   --- level 1 ---
545
    //   17:123['a' .. 'd']
546
    //   20:43['e' .. 'g']
547
2.08k
    r.append("--- level ");
548
2.08k
    AppendNumberTo(&r, level);
549
2.08k
    r.append(" ---\n");
550
2.08k
    const std::vector<FileMetaData*>& files = files_[level];
551
4.17k
    for (size_t i = 0; i < files.size(); i++) {
552
2.08k
      r.push_back(' ');
553
2.08k
      AppendNumberTo(&r, files[i]->number);
554
2.08k
      r.push_back(':');
555
2.08k
      AppendNumberTo(&r, files[i]->file_size);
556
2.08k
      r.append("[");
557
2.08k
      r.append(files[i]->smallest.DebugString());
558
2.08k
      r.append(" .. ");
559
2.08k
      r.append(files[i]->largest.DebugString());
560
2.08k
      r.append("]\n");
561
2.08k
    }
562
2.08k
  }
563
298
  return r;
564
298
}
565
566
// A helper class so we can efficiently apply a whole sequence
567
// of edits to a particular state without creating intermediate
568
// Versions that contain full copies of the intermediate state.
569
class VersionSet::Builder {
570
 private:
571
  // Helper to sort by v->files_[file_number].smallest
572
  struct BySmallestKey {
573
    const InternalKeyComparator* internal_comparator;
574
575
6.12M
    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
576
6.12M
      int r = internal_comparator->Compare(f1->smallest, f2->smallest);
577
6.12M
      if (r != 0) {
578
6.10M
        return (r < 0);
579
6.10M
      } else {
580
        // Break ties by file number
581
16.3k
        return (f1->number < f2->number);
582
16.3k
      }
583
6.12M
    }
584
  };
585
586
  typedef std::set<FileMetaData*, BySmallestKey> FileSet;
587
  struct LevelState {
588
    std::set<uint64_t> deleted_files;
589
    FileSet* added_files;
590
  };
591
592
  VersionSet* vset_;
593
  Version* base_;
594
  LevelState levels_[config::kNumLevels];
595
596
 public:
597
  // Initialize a builder with the files from *base and other info from *vset
598
293k
  Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) {
599
293k
    base_->Ref();
600
293k
    BySmallestKey cmp;
601
293k
    cmp.internal_comparator = &vset_->icmp_;
602
2.34M
    for (int level = 0; level < config::kNumLevels; level++) {
603
2.05M
      levels_[level].added_files = new FileSet(cmp);
604
2.05M
    }
605
293k
  }
606
607
293k
  ~Builder() {
608
2.34M
    for (int level = 0; level < config::kNumLevels; level++) {
609
2.05M
      const FileSet* added = levels_[level].added_files;
610
2.05M
      std::vector<FileMetaData*> to_unref;
611
2.05M
      to_unref.reserve(added->size());
612
3.01M
      for (FileSet::const_iterator it = added->begin(); it != added->end();
613
2.05M
           ++it) {
614
958k
        to_unref.push_back(*it);
615
958k
      }
616
2.05M
      delete added;
617
3.01M
      for (uint32_t i = 0; i < to_unref.size(); i++) {
618
958k
        FileMetaData* f = to_unref[i];
619
958k
        f->refs--;
620
958k
        if (f->refs <= 0) {
621
64.0k
          delete f;
622
64.0k
        }
623
958k
      }
624
2.05M
    }
625
293k
    base_->Unref();
626
293k
  }
627
628
  // Apply all of the edits in *edit to the current state.
629
458k
  void Apply(const VersionEdit* edit) {
630
    // Update compaction pointers
631
600k
    for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
632
142k
      const int level = edit->compact_pointers_[i].first;
633
142k
      vset_->compact_pointer_[level] =
634
142k
          edit->compact_pointers_[i].second.Encode().ToString();
635
142k
    }
636
637
    // Delete files
638
458k
    for (const auto& deleted_file_set_kvp : edit->deleted_files_) {
639
132k
      const int level = deleted_file_set_kvp.first;
640
132k
      const uint64_t number = deleted_file_set_kvp.second;
641
132k
      levels_[level].deleted_files.insert(number);
642
132k
    }
643
644
    // Add new files
645
1.41M
    for (size_t i = 0; i < edit->new_files_.size(); i++) {
646
958k
      const int level = edit->new_files_[i].first;
647
958k
      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
648
958k
      f->refs = 1;
649
650
      // We arrange to automatically compact this file after
651
      // a certain number of seeks.  Let's assume:
652
      //   (1) One seek costs 10ms
653
      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
654
      //   (3) A compaction of 1MB does 25MB of IO:
655
      //         1MB read from this level
656
      //         10-12MB read from next level (boundaries may be misaligned)
657
      //         10-12MB written to next level
658
      // This implies that 25 seeks cost the same as the compaction
659
      // of 1MB of data.  I.e., one seek costs approximately the
660
      // same as the compaction of 40KB of data.  We are a little
661
      // conservative and allow approximately one seek for every 16KB
662
      // of data before triggering a compaction.
663
958k
      f->allowed_seeks = static_cast<int>((f->file_size / 16384U));
664
958k
      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
665
666
958k
      levels_[level].deleted_files.erase(f->number);
667
958k
      levels_[level].added_files->insert(f);
668
958k
    }
669
458k
  }
670
671
  // Save the current state in *v.
672
293k
  void SaveTo(Version* v) {
673
293k
    BySmallestKey cmp;
674
293k
    cmp.internal_comparator = &vset_->icmp_;
675
2.34M
    for (int level = 0; level < config::kNumLevels; level++) {
676
      // Merge the set of added files with the set of pre-existing files.
677
      // Drop any deleted files.  Store the result in *v.
678
2.05M
      const std::vector<FileMetaData*>& base_files = base_->files_[level];
679
2.05M
      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
680
2.05M
      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
681
2.05M
      const FileSet* added_files = levels_[level].added_files;
682
2.05M
      v->files_[level].reserve(base_files.size() + added_files->size());
683
2.05M
      for (const auto& added_file : *added_files) {
684
        // Add all smaller files listed in base_
685
958k
        for (std::vector<FileMetaData*>::const_iterator bpos =
686
958k
                 std::upper_bound(base_iter, base_end, added_file, cmp);
687
1.13M
             base_iter != bpos; ++base_iter) {
688
171k
          MaybeAddFile(v, level, *base_iter);
689
171k
        }
690
691
958k
        MaybeAddFile(v, level, added_file);
692
958k
      }
693
694
      // Add remaining base files
695
3.23M
      for (; base_iter != base_end; ++base_iter) {
696
1.18M
        MaybeAddFile(v, level, *base_iter);
697
1.18M
      }
698
699
#ifndef NDEBUG
700
      // Make sure there is no overlap in levels > 0
701
      if (level > 0) {
702
        for (uint32_t i = 1; i < v->files_[level].size(); i++) {
703
          const InternalKey& prev_end = v->files_[level][i - 1]->largest;
704
          const InternalKey& this_begin = v->files_[level][i]->smallest;
705
          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
706
            std::fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
707
                         prev_end.DebugString().c_str(),
708
                         this_begin.DebugString().c_str());
709
            std::abort();
710
          }
711
        }
712
      }
713
#endif
714
2.05M
    }
715
293k
  }
716
717
2.31M
  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
718
2.31M
    if (levels_[level].deleted_files.count(f->number) > 0) {
719
      // File is deleted: do nothing
720
2.17M
    } else {
721
2.17M
      std::vector<FileMetaData*>* files = &v->files_[level];
722
2.17M
      if (level > 0 && !files->empty()) {
723
        // Must not overlap
724
1.22M
        assert(vset_->icmp_.Compare((*files)[files->size() - 1]->largest,
725
1.22M
                                    f->smallest) < 0);
726
1.22M
      }
727
2.17M
      f->refs++;
728
2.17M
      files->push_back(f);
729
2.17M
    }
730
2.31M
  }
731
};
732
733
VersionSet::VersionSet(const std::string& dbname, const Options* options,
734
                       TableCache* table_cache,
735
                       const InternalKeyComparator* cmp)
736
117k
    : env_(options->env),
737
117k
      dbname_(dbname),
738
117k
      options_(options),
739
117k
      table_cache_(table_cache),
740
117k
      icmp_(*cmp),
741
117k
      next_file_number_(2),
742
117k
      manifest_file_number_(0),  // Filled by Recover()
743
117k
      last_sequence_(0),
744
117k
      log_number_(0),
745
117k
      prev_log_number_(0),
746
117k
      descriptor_file_(nullptr),
747
117k
      descriptor_log_(nullptr),
748
117k
      dummy_versions_(this),
749
117k
      current_(nullptr) {
750
117k
  AppendVersion(new Version(this));
751
117k
}
752
753
117k
VersionSet::~VersionSet() {
754
117k
  current_->Unref();
755
117k
  assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
756
117k
  delete descriptor_log_;
757
117k
  delete descriptor_file_;
758
117k
}
759
760
411k
void VersionSet::AppendVersion(Version* v) {
761
  // Make "v" current
762
411k
  assert(v->refs_ == 0);
763
411k
  assert(v != current_);
764
411k
  if (current_ != nullptr) {
765
293k
    current_->Unref();
766
293k
  }
767
411k
  current_ = v;
768
411k
  v->Ref();
769
770
  // Append to linked list
771
411k
  v->prev_ = dummy_versions_.prev_;
772
411k
  v->next_ = &dummy_versions_;
773
411k
  v->prev_->next_ = v;
774
411k
  v->next_->prev_ = v;
775
411k
}
776
777
175k
Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
778
175k
  if (edit->has_log_number_) {
779
147k
    assert(edit->log_number_ >= log_number_);
780
147k
    assert(edit->log_number_ < next_file_number_);
781
147k
  } else {
782
27.8k
    edit->SetLogNumber(log_number_);
783
27.8k
  }
784
785
175k
  if (!edit->has_prev_log_number_) {
786
27.8k
    edit->SetPrevLogNumber(prev_log_number_);
787
27.8k
  }
788
789
175k
  edit->SetNextFile(next_file_number_);
790
175k
  edit->SetLastSequence(last_sequence_);
791
792
175k
  Version* v = new Version(this);
793
175k
  {
794
175k
    Builder builder(this, current_);
795
175k
    builder.Apply(edit);
796
175k
    builder.SaveTo(v);
797
175k
  }
798
175k
  Finalize(v);
799
800
  // Initialize new descriptor log file if necessary by creating
801
  // a temporary file that contains a snapshot of the current version.
802
175k
  std::string new_manifest_file;
803
175k
  Status s;
804
175k
  if (descriptor_log_ == nullptr) {
805
    // No reason to unlock *mu here since we only hit this path in the
806
    // first call to LogAndApply (when opening the database).
807
117k
    assert(descriptor_file_ == nullptr);
808
117k
    new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
809
117k
    s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
810
117k
    if (s.ok()) {
811
117k
      descriptor_log_ = new log::Writer(descriptor_file_);
812
117k
      s = WriteSnapshot(descriptor_log_);
813
117k
    }
814
117k
  }
815
816
  // Unlock during expensive MANIFEST log write
817
175k
  {
818
175k
    mu->Unlock();
819
820
    // Write new record to MANIFEST log
821
175k
    if (s.ok()) {
822
175k
      std::string record;
823
175k
      edit->EncodeTo(&record);
824
175k
      s = descriptor_log_->AddRecord(record);
825
175k
      if (s.ok()) {
826
175k
        s = descriptor_file_->Sync();
827
175k
      }
828
175k
      if (!s.ok()) {
829
0
        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
830
0
      }
831
175k
    }
832
833
    // If we just created a new descriptor file, install it by writing a
834
    // new CURRENT file that points to it.
835
175k
    if (s.ok() && !new_manifest_file.empty()) {
836
117k
      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
837
117k
    }
838
839
175k
    mu->Lock();
840
175k
  }
841
842
  // Install the new version
843
175k
  if (s.ok()) {
844
175k
    AppendVersion(v);
845
175k
    log_number_ = edit->log_number_;
846
175k
    prev_log_number_ = edit->prev_log_number_;
847
175k
  } else {
848
0
    delete v;
849
0
    if (!new_manifest_file.empty()) {
850
0
      delete descriptor_log_;
851
0
      delete descriptor_file_;
852
0
      descriptor_log_ = nullptr;
853
0
      descriptor_file_ = nullptr;
854
0
      env_->RemoveFile(new_manifest_file);
855
0
    }
856
0
  }
857
858
175k
  return s;
859
175k
}
860
861
117k
Status VersionSet::Recover(bool* save_manifest) {
862
117k
  struct LogReporter : public log::Reader::Reporter {
863
117k
    Status* status;
864
117k
    void Corruption(size_t bytes, const Status& s) override {
865
0
      if (this->status->ok()) *this->status = s;
866
0
    }
867
117k
  };
868
869
  // Read "CURRENT" file, which contains a pointer to the current manifest file
870
117k
  std::string current;
871
117k
  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
872
117k
  if (!s.ok()) {
873
0
    return s;
874
0
  }
875
117k
  if (current.empty() || current[current.size() - 1] != '\n') {
876
0
    return Status::Corruption("CURRENT file does not end with newline");
877
0
  }
878
117k
  current.resize(current.size() - 1);
879
880
117k
  std::string dscname = dbname_ + "/" + current;
881
117k
  SequentialFile* file;
882
117k
  s = env_->NewSequentialFile(dscname, &file);
883
117k
  if (!s.ok()) {
884
0
    if (s.IsNotFound()) {
885
0
      return Status::Corruption("CURRENT points to a non-existent file",
886
0
                                s.ToString());
887
0
    }
888
0
    return s;
889
0
  }
890
891
117k
  bool have_log_number = false;
892
117k
  bool have_prev_log_number = false;
893
117k
  bool have_next_file = false;
894
117k
  bool have_last_sequence = false;
895
117k
  uint64_t next_file = 0;
896
117k
  uint64_t last_sequence = 0;
897
117k
  uint64_t log_number = 0;
898
117k
  uint64_t prev_log_number = 0;
899
117k
  Builder builder(this, current_);
900
117k
  int read_records = 0;
901
902
117k
  {
903
117k
    LogReporter reporter;
904
117k
    reporter.status = &s;
905
117k
    log::Reader reader(file, &reporter, true /*checksum*/,
906
117k
                       0 /*initial_offset*/);
907
117k
    Slice record;
908
117k
    std::string scratch;
909
400k
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
910
282k
      ++read_records;
911
282k
      VersionEdit edit;
912
282k
      s = edit.DecodeFrom(record);
913
282k
      if (s.ok()) {
914
282k
        if (edit.has_comparator_ &&
915
282k
            edit.comparator_ != icmp_.user_comparator()->Name()) {
916
0
          s = Status::InvalidArgument(
917
0
              edit.comparator_ + " does not match existing comparator ",
918
0
              icmp_.user_comparator()->Name());
919
0
        }
920
282k
      }
921
922
282k
      if (s.ok()) {
923
282k
        builder.Apply(&edit);
924
282k
      }
925
926
282k
      if (edit.has_log_number_) {
927
169k
        log_number = edit.log_number_;
928
169k
        have_log_number = true;
929
169k
      }
930
931
282k
      if (edit.has_prev_log_number_) {
932
164k
        prev_log_number = edit.prev_log_number_;
933
164k
        have_prev_log_number = true;
934
164k
      }
935
936
282k
      if (edit.has_next_file_number_) {
937
169k
        next_file = edit.next_file_number_;
938
169k
        have_next_file = true;
939
169k
      }
940
941
282k
      if (edit.has_last_sequence_) {
942
169k
        last_sequence = edit.last_sequence_;
943
169k
        have_last_sequence = true;
944
169k
      }
945
282k
    }
946
117k
  }
947
117k
  delete file;
948
117k
  file = nullptr;
949
950
117k
  if (s.ok()) {
951
117k
    if (!have_next_file) {
952
0
      s = Status::Corruption("no meta-nextfile entry in descriptor");
953
117k
    } else if (!have_log_number) {
954
0
      s = Status::Corruption("no meta-lognumber entry in descriptor");
955
117k
    } else if (!have_last_sequence) {
956
0
      s = Status::Corruption("no last-sequence-number entry in descriptor");
957
0
    }
958
959
117k
    if (!have_prev_log_number) {
960
5.38k
      prev_log_number = 0;
961
5.38k
    }
962
963
117k
    MarkFileNumberUsed(prev_log_number);
964
117k
    MarkFileNumberUsed(log_number);
965
117k
  }
966
967
117k
  if (s.ok()) {
968
117k
    Version* v = new Version(this);
969
117k
    builder.SaveTo(v);
970
    // Install recovered version
971
117k
    Finalize(v);
972
117k
    AppendVersion(v);
973
117k
    manifest_file_number_ = next_file;
974
117k
    next_file_number_ = next_file + 1;
975
117k
    last_sequence_ = last_sequence;
976
117k
    log_number_ = log_number;
977
117k
    prev_log_number_ = prev_log_number;
978
979
    // See if we can reuse the existing MANIFEST file.
980
117k
    if (ReuseManifest(dscname, current)) {
981
      // No need to save new manifest
982
117k
    } else {
983
117k
      *save_manifest = true;
984
117k
    }
985
117k
  } else {
986
0
    std::string error = s.ToString();
987
0
    Log(options_->info_log, "Error recovering version set with %d records: %s",
988
0
        read_records, error.c_str());
989
0
  }
990
991
117k
  return s;
992
117k
}
993
994
bool VersionSet::ReuseManifest(const std::string& dscname,
995
117k
                               const std::string& dscbase) {
996
117k
  if (!options_->reuse_logs) {
997
117k
    return false;
998
117k
  }
999
0
  FileType manifest_type;
1000
0
  uint64_t manifest_number;
1001
0
  uint64_t manifest_size;
1002
0
  if (!ParseFileName(dscbase, &manifest_number, &manifest_type) ||
1003
0
      manifest_type != kDescriptorFile ||
1004
0
      !env_->GetFileSize(dscname, &manifest_size).ok() ||
1005
      // Make new compacted MANIFEST if old one is too big
1006
0
      manifest_size >= TargetFileSize(options_)) {
1007
0
    return false;
1008
0
  }
1009
1010
0
  assert(descriptor_file_ == nullptr);
1011
0
  assert(descriptor_log_ == nullptr);
1012
0
  Status r = env_->NewAppendableFile(dscname, &descriptor_file_);
1013
0
  if (!r.ok()) {
1014
0
    Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str());
1015
0
    assert(descriptor_file_ == nullptr);
1016
0
    return false;
1017
0
  }
1018
1019
0
  Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str());
1020
0
  descriptor_log_ = new log::Writer(descriptor_file_, manifest_size);
1021
0
  manifest_file_number_ = manifest_number;
1022
0
  return true;
1023
0
}
1024
1025
348k
void VersionSet::MarkFileNumberUsed(uint64_t number) {
1026
348k
  if (next_file_number_ <= number) {
1027
112k
    next_file_number_ = number + 1;
1028
112k
  }
1029
348k
}
1030
1031
293k
void VersionSet::Finalize(Version* v) {
1032
  // Precomputed best level for next compaction
1033
293k
  int best_level = -1;
1034
293k
  double best_score = -1;
1035
1036
2.05M
  for (int level = 0; level < config::kNumLevels - 1; level++) {
1037
1.76M
    double score;
1038
1.76M
    if (level == 0) {
1039
      // We treat level-0 specially by bounding the number of files
1040
      // instead of number of bytes for two reasons:
1041
      //
1042
      // (1) With larger write-buffer sizes, it is nice not to do too
1043
      // many level-0 compactions.
1044
      //
1045
      // (2) The files in level-0 are merged on every read and
1046
      // therefore we wish to avoid too many files when the individual
1047
      // file size is small (perhaps because of a small write-buffer
1048
      // setting, or very high compression ratios, or lots of
1049
      // overwrites/deletions).
1050
293k
      score = v->files_[level].size() /
1051
293k
              static_cast<double>(config::kL0_CompactionTrigger);
1052
1.46M
    } else {
1053
      // Compute the ratio of current size to size limit.
1054
1.46M
      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
1055
1.46M
      score =
1056
1.46M
          static_cast<double>(level_bytes) / MaxBytesForLevel(options_, level);
1057
1.46M
    }
1058
1059
1.76M
    if (score > best_score) {
1060
351k
      best_level = level;
1061
351k
      best_score = score;
1062
351k
    }
1063
1.76M
  }
1064
1065
293k
  v->compaction_level_ = best_level;
1066
293k
  v->compaction_score_ = best_score;
1067
293k
}
1068
1069
117k
Status VersionSet::WriteSnapshot(log::Writer* log) {
1070
  // TODO: Break up into multiple records to reduce memory usage on recovery?
1071
1072
  // Save metadata
1073
117k
  VersionEdit edit;
1074
117k
  edit.SetComparatorName(icmp_.user_comparator()->Name());
1075
1076
  // Save compaction pointers
1077
943k
  for (int level = 0; level < config::kNumLevels; level++) {
1078
825k
    if (!compact_pointer_[level].empty()) {
1079
90.8k
      InternalKey key;
1080
90.8k
      key.DecodeFrom(compact_pointer_[level]);
1081
90.8k
      edit.SetCompactPointer(level, key);
1082
90.8k
    }
1083
825k
  }
1084
1085
  // Save files
1086
943k
  for (int level = 0; level < config::kNumLevels; level++) {
1087
825k
    const std::vector<FileMetaData*>& files = current_->files_[level];
1088
1.63M
    for (size_t i = 0; i < files.size(); i++) {
1089
813k
      const FileMetaData* f = files[i];
1090
813k
      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
1091
813k
    }
1092
825k
  }
1093
1094
117k
  std::string record;
1095
117k
  edit.EncodeTo(&record);
1096
117k
  return log->AddRecord(record);
1097
117k
}
1098
1099
2.10M
int VersionSet::NumLevelFiles(int level) const {
1100
2.10M
  assert(level >= 0);
1101
2.10M
  assert(level < config::kNumLevels);
1102
2.10M
  return current_->files_[level].size();
1103
2.10M
}
1104
1105
35.3k
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
1106
  // Update code if kNumLevels changes
1107
35.3k
  static_assert(config::kNumLevels == 7, "");
1108
35.3k
  std::snprintf(
1109
35.3k
      scratch->buffer, sizeof(scratch->buffer), "files[ %d %d %d %d %d %d %d ]",
1110
35.3k
      int(current_->files_[0].size()), int(current_->files_[1].size()),
1111
35.3k
      int(current_->files_[2].size()), int(current_->files_[3].size()),
1112
35.3k
      int(current_->files_[4].size()), int(current_->files_[5].size()),
1113
35.3k
      int(current_->files_[6].size()));
1114
35.3k
  return scratch->buffer;
1115
35.3k
}
1116
1117
0
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
1118
0
  uint64_t result = 0;
1119
0
  for (int level = 0; level < config::kNumLevels; level++) {
1120
0
    const std::vector<FileMetaData*>& files = v->files_[level];
1121
0
    for (size_t i = 0; i < files.size(); i++) {
1122
0
      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
1123
        // Entire file is before "ikey", so just add the file size
1124
0
        result += files[i]->file_size;
1125
0
      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
1126
        // Entire file is after "ikey", so ignore
1127
0
        if (level > 0) {
1128
          // Files other than level 0 are sorted by meta->smallest, so
1129
          // no further files in this level will contain data for
1130
          // "ikey".
1131
0
          break;
1132
0
        }
1133
0
      } else {
1134
        // "ikey" falls in the range for this table.  Add the
1135
        // approximate offset of "ikey" within the table.
1136
0
        Table* tableptr;
1137
0
        Iterator* iter = table_cache_->NewIterator(
1138
0
            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
1139
0
        if (tableptr != nullptr) {
1140
0
          result += tableptr->ApproximateOffsetOf(ikey.Encode());
1141
0
        }
1142
0
        delete iter;
1143
0
      }
1144
0
    }
1145
0
  }
1146
0
  return result;
1147
0
}
1148
1149
285k
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
1150
571k
  for (Version* v = dummy_versions_.next_; v != &dummy_versions_;
1151
285k
       v = v->next_) {
1152
2.28M
    for (int level = 0; level < config::kNumLevels; level++) {
1153
1.99M
      const std::vector<FileMetaData*>& files = v->files_[level];
1154
3.94M
      for (size_t i = 0; i < files.size(); i++) {
1155
1.94M
        live->insert(files[i]->number);
1156
1.94M
      }
1157
1.99M
    }
1158
285k
  }
1159
285k
}
1160
1161
0
int64_t VersionSet::NumLevelBytes(int level) const {
1162
0
  assert(level >= 0);
1163
0
  assert(level < config::kNumLevels);
1164
0
  return TotalFileSize(current_->files_[level]);
1165
0
}
1166
1167
0
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
1168
0
  int64_t result = 0;
1169
0
  std::vector<FileMetaData*> overlaps;
1170
0
  for (int level = 1; level < config::kNumLevels - 1; level++) {
1171
0
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
1172
0
      const FileMetaData* f = current_->files_[level][i];
1173
0
      current_->GetOverlappingInputs(level + 1, &f->smallest, &f->largest,
1174
0
                                     &overlaps);
1175
0
      const int64_t sum = TotalFileSize(overlaps);
1176
0
      if (sum > result) {
1177
0
        result = sum;
1178
0
      }
1179
0
    }
1180
0
  }
1181
0
  return result;
1182
0
}
1183
1184
// Stores the minimal range that covers all entries in inputs in
1185
// *smallest, *largest.
1186
// REQUIRES: inputs is not empty
1187
void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
1188
99.2k
                          InternalKey* smallest, InternalKey* largest) {
1189
99.2k
  assert(!inputs.empty());
1190
99.2k
  smallest->Clear();
1191
99.2k
  largest->Clear();
1192
343k
  for (size_t i = 0; i < inputs.size(); i++) {
1193
243k
    FileMetaData* f = inputs[i];
1194
243k
    if (i == 0) {
1195
99.2k
      *smallest = f->smallest;
1196
99.2k
      *largest = f->largest;
1197
144k
    } else {
1198
144k
      if (icmp_.Compare(f->smallest, *smallest) < 0) {
1199
11.3k
        *smallest = f->smallest;
1200
11.3k
      }
1201
144k
      if (icmp_.Compare(f->largest, *largest) > 0) {
1202
97.5k
        *largest = f->largest;
1203
97.5k
      }
1204
144k
    }
1205
243k
  }
1206
99.2k
}
1207
1208
// Stores the minimal range that covers all entries in inputs1 and inputs2
1209
// in *smallest, *largest.
1210
// REQUIRES: inputs is not empty
1211
void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
1212
                           const std::vector<FileMetaData*>& inputs2,
1213
39.0k
                           InternalKey* smallest, InternalKey* largest) {
1214
39.0k
  std::vector<FileMetaData*> all = inputs1;
1215
39.0k
  all.insert(all.end(), inputs2.begin(), inputs2.end());
1216
39.0k
  GetRange(all, smallest, largest);
1217
39.0k
}
1218
1219
27.3k
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
1220
27.3k
  ReadOptions options;
1221
27.3k
  options.verify_checksums = options_->paranoid_checks;
1222
27.3k
  options.fill_cache = false;
1223
1224
  // Level-0 files have to be merged together.  For other levels,
1225
  // we will make a concatenating iterator per level.
1226
  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
1227
27.3k
  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
1228
27.3k
  Iterator** list = new Iterator*[space];
1229
27.3k
  int num = 0;
1230
82.0k
  for (int which = 0; which < 2; which++) {
1231
54.6k
    if (!c->inputs_[which].empty()) {
1232
47.4k
      if (c->level() + which == 0) {
1233
21.0k
        const std::vector<FileMetaData*>& files = c->inputs_[which];
1234
94.1k
        for (size_t i = 0; i < files.size(); i++) {
1235
73.0k
          list[num++] = table_cache_->NewIterator(options, files[i]->number,
1236
73.0k
                                                  files[i]->file_size);
1237
73.0k
        }
1238
26.3k
      } else {
1239
        // Create concatenating iterator for the files from this level
1240
26.3k
        list[num++] = NewTwoLevelIterator(
1241
26.3k
            new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
1242
26.3k
            &GetFileIterator, table_cache_, options);
1243
26.3k
      }
1244
47.4k
    }
1245
54.6k
  }
1246
27.3k
  assert(num <= space);
1247
27.3k
  Iterator* result = NewMergingIterator(&icmp_, list, num);
1248
27.3k
  delete[] list;
1249
27.3k
  return result;
1250
27.3k
}
1251
1252
21.1k
Compaction* VersionSet::PickCompaction() {
1253
21.1k
  Compaction* c;
1254
21.1k
  int level;
1255
1256
  // We prefer compactions triggered by too much data in a level over
1257
  // the compactions triggered by seeks.
1258
21.1k
  const bool size_compaction = (current_->compaction_score_ >= 1);
1259
21.1k
  const bool seek_compaction = (current_->file_to_compact_ != nullptr);
1260
21.1k
  if (size_compaction) {
1261
21.0k
    level = current_->compaction_level_;
1262
21.0k
    assert(level >= 0);
1263
21.0k
    assert(level + 1 < config::kNumLevels);
1264
21.0k
    c = new Compaction(options_, level);
1265
1266
    // Pick the first file that comes after compact_pointer_[level]
1267
84.5k
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
1268
76.9k
      FileMetaData* f = current_->files_[level][i];
1269
76.9k
      if (compact_pointer_[level].empty() ||
1270
76.9k
          icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
1271
13.4k
        c->inputs_[0].push_back(f);
1272
13.4k
        break;
1273
13.4k
      }
1274
76.9k
    }
1275
21.0k
    if (c->inputs_[0].empty()) {
1276
      // Wrap-around to the beginning of the key space
1277
7.65k
      c->inputs_[0].push_back(current_->files_[level][0]);
1278
7.65k
    }
1279
21.0k
  } else if (seek_compaction) {
1280
25
    level = current_->file_to_compact_level_;
1281
25
    c = new Compaction(options_, level);
1282
25
    c->inputs_[0].push_back(current_->file_to_compact_);
1283
25
  } else {
1284
0
    return nullptr;
1285
0
  }
1286
1287
21.1k
  c->input_version_ = current_;
1288
21.1k
  c->input_version_->Ref();
1289
1290
  // Files in level 0 may overlap each other, so pick up all overlapping ones
1291
21.1k
  if (level == 0) {
1292
21.0k
    InternalKey smallest, largest;
1293
21.0k
    GetRange(c->inputs_[0], &smallest, &largest);
1294
    // Note that the next call will discard the file we placed in
1295
    // c->inputs_[0] earlier and replace it with an overlapping set
1296
    // which will include the picked file.
1297
21.0k
    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
1298
21.0k
    assert(!c->inputs_[0].empty());
1299
21.0k
  }
1300
1301
21.1k
  SetupOtherInputs(c);
1302
1303
21.1k
  return c;
1304
21.1k
}
1305
1306
// Finds the largest key in a vector of files. Returns true if files is not
1307
// empty.
1308
bool FindLargestKey(const InternalKeyComparator& icmp,
1309
                    const std::vector<FileMetaData*>& files,
1310
94.5k
                    InternalKey* largest_key) {
1311
94.5k
  if (files.empty()) {
1312
15.2k
    return false;
1313
15.2k
  }
1314
79.3k
  *largest_key = files[0]->largest;
1315
171k
  for (size_t i = 1; i < files.size(); ++i) {
1316
92.2k
    FileMetaData* f = files[i];
1317
92.2k
    if (icmp.Compare(f->largest, *largest_key) > 0) {
1318
60.3k
      *largest_key = f->largest;
1319
60.3k
    }
1320
92.2k
  }
1321
79.3k
  return true;
1322
94.5k
}
1323
1324
// Finds minimum file b2=(l2, u2) in level file for which l2 > u1 and
1325
// user_key(l2) = user_key(u1)
1326
FileMetaData* FindSmallestBoundaryFile(
1327
    const InternalKeyComparator& icmp,
1328
    const std::vector<FileMetaData*>& level_files,
1329
79.3k
    const InternalKey& largest_key) {
1330
79.3k
  const Comparator* user_cmp = icmp.user_comparator();
1331
79.3k
  FileMetaData* smallest_boundary_file = nullptr;
1332
408k
  for (size_t i = 0; i < level_files.size(); ++i) {
1333
329k
    FileMetaData* f = level_files[i];
1334
329k
    if (icmp.Compare(f->smallest, largest_key) > 0 &&
1335
329k
        user_cmp->Compare(f->smallest.user_key(), largest_key.user_key()) ==
1336
102k
            0) {
1337
0
      if (smallest_boundary_file == nullptr ||
1338
0
          icmp.Compare(f->smallest, smallest_boundary_file->smallest) < 0) {
1339
0
        smallest_boundary_file = f;
1340
0
      }
1341
0
    }
1342
329k
  }
1343
79.3k
  return smallest_boundary_file;
1344
79.3k
}
1345
1346
// Extracts the largest file b1 from |compaction_files| and then searches for a
1347
// b2 in |level_files| for which user_key(u1) = user_key(l2). If it finds such a
1348
// file b2 (known as a boundary file) it adds it to |compaction_files| and then
1349
// searches again using this new upper bound.
1350
//
1351
// If there are two blocks, b1=(l1, u1) and b2=(l2, u2) and
1352
// user_key(u1) = user_key(l2), and if we compact b1 but not b2 then a
1353
// subsequent get operation will yield an incorrect result because it will
1354
// return the record from b2 in level i rather than from b1 because it searches
1355
// level by level for records matching the supplied user key.
1356
//
1357
// parameters:
1358
//   in     level_files:      List of files to search for boundary files.
1359
//   in/out compaction_files: List of files to extend by adding boundary files.
1360
void AddBoundaryInputs(const InternalKeyComparator& icmp,
1361
                       const std::vector<FileMetaData*>& level_files,
1362
94.5k
                       std::vector<FileMetaData*>* compaction_files) {
1363
94.5k
  InternalKey largest_key;
1364
1365
  // Quick return if compaction_files is empty.
1366
94.5k
  if (!FindLargestKey(icmp, *compaction_files, &largest_key)) {
1367
15.2k
    return;
1368
15.2k
  }
1369
1370
79.3k
  bool continue_searching = true;
1371
158k
  while (continue_searching) {
1372
79.3k
    FileMetaData* smallest_boundary_file =
1373
79.3k
        FindSmallestBoundaryFile(icmp, level_files, largest_key);
1374
1375
    // If a boundary file was found advance largest_key, otherwise we're done.
1376
79.3k
    if (smallest_boundary_file != NULL) {
1377
0
      compaction_files->push_back(smallest_boundary_file);
1378
0
      largest_key = smallest_boundary_file->largest;
1379
79.3k
    } else {
1380
79.3k
      continue_searching = false;
1381
79.3k
    }
1382
79.3k
  }
1383
79.3k
}
1384
1385
35.3k
void VersionSet::SetupOtherInputs(Compaction* c) {
1386
35.3k
  const int level = c->level();
1387
35.3k
  InternalKey smallest, largest;
1388
1389
35.3k
  AddBoundaryInputs(icmp_, current_->files_[level], &c->inputs_[0]);
1390
35.3k
  GetRange(c->inputs_[0], &smallest, &largest);
1391
1392
35.3k
  current_->GetOverlappingInputs(level + 1, &smallest, &largest,
1393
35.3k
                                 &c->inputs_[1]);
1394
35.3k
  AddBoundaryInputs(icmp_, current_->files_[level + 1], &c->inputs_[1]);
1395
1396
  // Get entire range covered by compaction
1397
35.3k
  InternalKey all_start, all_limit;
1398
35.3k
  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
1399
1400
  // See if we can grow the number of inputs in "level" without
1401
  // changing the number of "level+1" files we pick up.
1402
35.3k
  if (!c->inputs_[1].empty()) {
1403
20.1k
    std::vector<FileMetaData*> expanded0;
1404
20.1k
    current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
1405
20.1k
    AddBoundaryInputs(icmp_, current_->files_[level], &expanded0);
1406
20.1k
    const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
1407
20.1k
    const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
1408
20.1k
    const int64_t expanded0_size = TotalFileSize(expanded0);
1409
20.1k
    if (expanded0.size() > c->inputs_[0].size() &&
1410
20.1k
        inputs1_size + expanded0_size <
1411
3.75k
            ExpandedCompactionByteSizeLimit(options_)) {
1412
3.75k
      InternalKey new_start, new_limit;
1413
3.75k
      GetRange(expanded0, &new_start, &new_limit);
1414
3.75k
      std::vector<FileMetaData*> expanded1;
1415
3.75k
      current_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
1416
3.75k
                                     &expanded1);
1417
3.75k
      AddBoundaryInputs(icmp_, current_->files_[level + 1], &expanded1);
1418
3.75k
      if (expanded1.size() == c->inputs_[1].size()) {
1419
3.68k
        Log(options_->info_log,
1420
3.68k
            "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
1421
3.68k
            level, int(c->inputs_[0].size()), int(c->inputs_[1].size()),
1422
3.68k
            long(inputs0_size), long(inputs1_size), int(expanded0.size()),
1423
3.68k
            int(expanded1.size()), long(expanded0_size), long(inputs1_size));
1424
3.68k
        smallest = new_start;
1425
3.68k
        largest = new_limit;
1426
3.68k
        c->inputs_[0] = expanded0;
1427
3.68k
        c->inputs_[1] = expanded1;
1428
3.68k
        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
1429
3.68k
      }
1430
3.75k
    }
1431
20.1k
  }
1432
1433
  // Compute the set of grandparent files that overlap this compaction
1434
  // (parent == level+1; grandparent == level+2)
1435
35.3k
  if (level + 2 < config::kNumLevels) {
1436
35.3k
    current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
1437
35.3k
                                   &c->grandparents_);
1438
35.3k
  }
1439
1440
  // Update the place where we will do the next compaction for this level.
1441
  // We update this immediately instead of waiting for the VersionEdit
1442
  // to be applied so that if the compaction fails, we will try a different
1443
  // key range next time.
1444
35.3k
  compact_pointer_[level] = largest.Encode().ToString();
1445
35.3k
  c->edit_.SetCompactPointer(level, largest);
1446
35.3k
}
1447
1448
Compaction* VersionSet::CompactRange(int level, const InternalKey* begin,
1449
54.9k
                                     const InternalKey* end) {
1450
54.9k
  std::vector<FileMetaData*> inputs;
1451
54.9k
  current_->GetOverlappingInputs(level, begin, end, &inputs);
1452
54.9k
  if (inputs.empty()) {
1453
40.7k
    return nullptr;
1454
40.7k
  }
1455
1456
  // Avoid compacting too much in one shot in case the range is large.
1457
  // But we cannot do this for level-0 since level-0 files can overlap
1458
  // and we must not pick one file and drop another older file if the
1459
  // two files overlap.
1460
14.2k
  if (level > 0) {
1461
6.24k
    const uint64_t limit = MaxFileSizeForLevel(options_, level);
1462
6.24k
    uint64_t total = 0;
1463
13.5k
    for (size_t i = 0; i < inputs.size(); i++) {
1464
7.30k
      uint64_t s = inputs[i]->file_size;
1465
7.30k
      total += s;
1466
7.30k
      if (total >= limit) {
1467
0
        inputs.resize(i + 1);
1468
0
        break;
1469
0
      }
1470
7.30k
    }
1471
6.24k
  }
1472
1473
14.2k
  Compaction* c = new Compaction(options_, level);
1474
14.2k
  c->input_version_ = current_;
1475
14.2k
  c->input_version_->Ref();
1476
14.2k
  c->inputs_[0] = inputs;
1477
14.2k
  SetupOtherInputs(c);
1478
14.2k
  return c;
1479
54.9k
}
1480
1481
Compaction::Compaction(const Options* options, int level)
1482
35.3k
    : level_(level),
1483
35.3k
      max_output_file_size_(MaxFileSizeForLevel(options, level)),
1484
35.3k
      input_version_(nullptr),
1485
35.3k
      grandparent_index_(0),
1486
35.3k
      seen_key_(false),
1487
35.3k
      overlapped_bytes_(0) {
1488
282k
  for (int i = 0; i < config::kNumLevels; i++) {
1489
247k
    level_ptrs_[i] = 0;
1490
247k
  }
1491
35.3k
}
1492
1493
35.3k
Compaction::~Compaction() {
1494
35.3k
  if (input_version_ != nullptr) {
1495
8.00k
    input_version_->Unref();
1496
8.00k
  }
1497
35.3k
}
1498
1499
21.1k
bool Compaction::IsTrivialMove() const {
1500
21.1k
  const VersionSet* vset = input_version_->vset_;
1501
  // Avoid a move if there is lots of overlapping grandparent data.
1502
  // Otherwise, the move could create a parent file that will require
1503
  // a very expensive merge later on.
1504
21.1k
  return (num_input_files(0) == 1 && num_input_files(1) == 0 &&
1505
21.1k
          TotalFileSize(grandparents_) <=
1506
8.00k
              MaxGrandParentOverlapBytes(vset->options_));
1507
21.1k
}
1508
1509
19.8k
void Compaction::AddInputDeletions(VersionEdit* edit) {
1510
59.6k
  for (int which = 0; which < 2; which++) {
1511
100k
    for (size_t i = 0; i < inputs_[which].size(); i++) {
1512
60.5k
      edit->RemoveFile(level_ + which, inputs_[which][i]->number);
1513
60.5k
    }
1514
39.7k
  }
1515
19.8k
}
1516
1517
48.9k
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
1518
  // Maybe use binary search to find right entry instead of linear search?
1519
48.9k
  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
1520
177k
  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
1521
150k
    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
1522
154k
    while (level_ptrs_[lvl] < files.size()) {
1523
29.8k
      FileMetaData* f = files[level_ptrs_[lvl]];
1524
29.8k
      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
1525
        // We've advanced far enough
1526
25.5k
        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
1527
          // Key falls in this file's range, so definitely not base level
1528
21.9k
          return false;
1529
21.9k
        }
1530
3.60k
        break;
1531
25.5k
      }
1532
4.31k
      level_ptrs_[lvl]++;
1533
4.31k
    }
1534
150k
  }
1535
26.9k
  return true;
1536
48.9k
}
1537
1538
1.27M
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
1539
1.27M
  const VersionSet* vset = input_version_->vset_;
1540
  // Scan to find earliest grandparent file that contains key.
1541
1.27M
  const InternalKeyComparator* icmp = &vset->icmp_;
1542
1.28M
  while (grandparent_index_ < grandparents_.size() &&
1543
1.28M
         icmp->Compare(internal_key,
1544
96.2k
                       grandparents_[grandparent_index_]->largest.Encode()) >
1545
96.2k
             0) {
1546
4.19k
    if (seen_key_) {
1547
4.19k
      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
1548
4.19k
    }
1549
4.19k
    grandparent_index_++;
1550
4.19k
  }
1551
1.27M
  seen_key_ = true;
1552
1553
1.27M
  if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) {
1554
    // Too much overlap for current output; start new output
1555
0
    overlapped_bytes_ = 0;
1556
0
    return true;
1557
1.27M
  } else {
1558
1.27M
    return false;
1559
1.27M
  }
1560
1.27M
}
1561
1562
27.3k
void Compaction::ReleaseInputs() {
1563
27.3k
  if (input_version_ != nullptr) {
1564
27.3k
    input_version_->Unref();
1565
27.3k
    input_version_ = nullptr;
1566
27.3k
  }
1567
27.3k
}
1568
1569
}  // namespace leveldb