Coverage Report

Created: 2025-10-28 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/leveldb/db/version_set.cc
Line
Count
Source
1
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5
#include "db/version_set.h"
6
7
#include <algorithm>
8
#include <cstdio>
9
10
#include "db/filename.h"
11
#include "db/log_reader.h"
12
#include "db/log_writer.h"
13
#include "db/memtable.h"
14
#include "db/table_cache.h"
15
#include "leveldb/env.h"
16
#include "leveldb/table_builder.h"
17
#include "table/merger.h"
18
#include "table/two_level_iterator.h"
19
#include "util/coding.h"
20
#include "util/logging.h"
21
22
namespace leveldb {
23
24
1.34M
static size_t TargetFileSize(const Options* options) {
25
1.34M
  return options->max_file_size;
26
1.34M
}
27
28
// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
29
// stop building a single file in a level->level+1 compaction.
30
1.29M
static int64_t MaxGrandParentOverlapBytes(const Options* options) {
31
1.29M
  return 10 * TargetFileSize(options);
32
1.29M
}
33
34
// Maximum number of bytes in all compacted files.  We avoid expanding
35
// the lower level file set of a compaction if it would make the
36
// total compaction cover more than this many bytes.
37
3.82k
static int64_t ExpandedCompactionByteSizeLimit(const Options* options) {
38
3.82k
  return 25 * TargetFileSize(options);
39
3.82k
}
40
41
1.46M
static double MaxBytesForLevel(const Options* options, int level) {
42
  // Note: the result for level zero is not really used since we set
43
  // the level-0 compaction threshold based on number of files.
44
45
  // Result for both level-0 and level-1
46
1.46M
  double result = 10. * 1048576.0;
47
4.38M
  while (level > 1) {
48
2.92M
    result *= 10;
49
2.92M
    level--;
50
2.92M
  }
51
1.46M
  return result;
52
1.46M
}
53
54
43.2k
static uint64_t MaxFileSizeForLevel(const Options* options, int level) {
55
  // We could vary per level to reduce number of files?
56
43.2k
  return TargetFileSize(options);
57
43.2k
}
58
59
1.54M
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
60
1.54M
  int64_t sum = 0;
61
3.12M
  for (size_t i = 0; i < files.size(); i++) {
62
1.58M
    sum += files[i]->file_size;
63
1.58M
  }
64
1.54M
  return sum;
65
1.54M
}
66
67
523k
Version::~Version() {
68
523k
  assert(refs_ == 0);
69
70
  // Remove from linked list
71
523k
  prev_->next_ = next_;
72
523k
  next_->prev_ = prev_;
73
74
  // Drop references to files
75
4.19M
  for (int level = 0; level < config::kNumLevels; level++) {
76
5.73M
    for (size_t i = 0; i < files_[level].size(); i++) {
77
2.07M
      FileMetaData* f = files_[level][i];
78
2.07M
      assert(f->refs > 0);
79
2.07M
      f->refs--;
80
2.07M
      if (f->refs <= 0) {
81
847k
        delete f;
82
847k
      }
83
2.07M
    }
84
3.66M
  }
85
523k
}
86
87
int FindFile(const InternalKeyComparator& icmp,
88
250k
             const std::vector<FileMetaData*>& files, const Slice& key) {
89
250k
  uint32_t left = 0;
90
250k
  uint32_t right = files.size();
91
377k
  while (left < right) {
92
126k
    uint32_t mid = (left + right) / 2;
93
126k
    const FileMetaData* f = files[mid];
94
126k
    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
95
      // Key at "mid.largest" is < "target".  Therefore all
96
      // files at or before "mid" are uninteresting.
97
27.1k
      left = mid + 1;
98
99.5k
    } else {
99
      // Key at "mid.largest" is >= "target".  Therefore all files
100
      // after "mid" are uninteresting.
101
99.5k
      right = mid;
102
99.5k
    }
103
126k
  }
104
250k
  return right;
105
250k
}
106
107
static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
108
9.86k
                      const FileMetaData* f) {
109
  // null user_key occurs before all keys and is therefore never after *f
110
9.86k
  return (user_key != nullptr &&
111
9.86k
          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
112
9.86k
}
113
114
static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
115
48.0k
                       const FileMetaData* f) {
116
  // null user_key occurs after all keys and is therefore never before *f
117
48.0k
  return (user_key != nullptr &&
118
48.0k
          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
119
48.0k
}
120
121
bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
122
                           bool disjoint_sorted_files,
123
                           const std::vector<FileMetaData*>& files,
124
                           const Slice* smallest_user_key,
125
230k
                           const Slice* largest_user_key) {
126
230k
  const Comparator* ucmp = icmp.user_comparator();
127
230k
  if (!disjoint_sorted_files) {
128
    // Need to check against all files
129
21.5k
    for (size_t i = 0; i < files.size(); i++) {
130
9.86k
      const FileMetaData* f = files[i];
131
9.86k
      if (AfterFile(ucmp, smallest_user_key, f) ||
132
6.74k
          BeforeFile(ucmp, largest_user_key, f)) {
133
        // No overlap
134
6.74k
      } else {
135
3.12k
        return true;  // Overlap
136
3.12k
      }
137
9.86k
    }
138
11.6k
    return false;
139
14.7k
  }
140
141
  // Binary search over file list
142
215k
  uint32_t index = 0;
143
215k
  if (smallest_user_key != nullptr) {
144
    // Find the earliest possible internal key for smallest_user_key
145
215k
    InternalKey small_key(*smallest_user_key, kMaxSequenceNumber,
146
215k
                          kValueTypeForSeek);
147
215k
    index = FindFile(icmp, files, small_key.Encode());
148
215k
  }
149
150
215k
  if (index >= files.size()) {
151
    // beginning of range is after all files, so no overlap.
152
173k
    return false;
153
173k
  }
154
155
41.8k
  return !BeforeFile(ucmp, largest_user_key, files[index]);
156
215k
}
157
158
// An internal iterator.  For a given version/level pair, yields
159
// information about the files in the level.  For a given entry, key()
160
// is the largest key that occurs in the file, and value() is an
161
// 16-byte value containing the file number and file size, both
162
// encoded using EncodeFixed64.
163
class Version::LevelFileNumIterator : public Iterator {
164
 public:
165
  LevelFileNumIterator(const InternalKeyComparator& icmp,
166
                       const std::vector<FileMetaData*>* flist)
167
153k
      : icmp_(icmp), flist_(flist), index_(flist->size()) {  // Marks as invalid
168
153k
  }
169
580k
  bool Valid() const override { return index_ < flist_->size(); }
170
0
  void Seek(const Slice& target) override {
171
0
    index_ = FindFile(icmp_, *flist_, target);
172
0
  }
173
83.0k
  void SeekToFirst() override { index_ = 0; }
174
0
  void SeekToLast() override {
175
0
    index_ = flist_->empty() ? 0 : flist_->size() - 1;
176
0
  }
177
344k
  void Next() override {
178
344k
    assert(Valid());
179
344k
    index_++;
180
344k
  }
181
0
  void Prev() override {
182
0
    assert(Valid());
183
0
    if (index_ == 0) {
184
0
      index_ = flist_->size();  // Marks as invalid
185
0
    } else {
186
0
      index_--;
187
0
    }
188
0
  }
189
350k
  Slice key() const override {
190
350k
    assert(Valid());
191
350k
    return (*flist_)[index_]->largest.Encode();
192
350k
  }
193
350k
  Slice value() const override {
194
350k
    assert(Valid());
195
350k
    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
196
350k
    EncodeFixed64(value_buf_ + 8, (*flist_)[index_]->file_size);
197
350k
    return Slice(value_buf_, sizeof(value_buf_));
198
350k
  }
199
38.6k
  Status status() const override { return Status::OK(); }
200
201
 private:
202
  const InternalKeyComparator icmp_;
203
  const std::vector<FileMetaData*>* const flist_;
204
  uint32_t index_;
205
206
  // Backing store for value().  Holds the file number and size.
207
  mutable char value_buf_[16];
208
};
209
210
static Iterator* GetFileIterator(void* arg, const ReadOptions& options,
211
351k
                                 const Slice& file_value) {
212
351k
  TableCache* cache = reinterpret_cast<TableCache*>(arg);
213
351k
  if (file_value.size() != 16) {
214
0
    return NewErrorIterator(
215
0
        Status::Corruption("FileReader invoked with unexpected value"));
216
351k
  } else {
217
351k
    return cache->NewIterator(options, DecodeFixed64(file_value.data()),
218
351k
                              DecodeFixed64(file_value.data() + 8));
219
351k
  }
220
351k
}
221
222
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
223
125k
                                            int level) const {
224
125k
  return NewTwoLevelIterator(
225
125k
      new LevelFileNumIterator(vset_->icmp_, &files_[level]), &GetFileIterator,
226
125k
      vset_->table_cache_, options);
227
125k
}
228
229
void Version::AddIterators(const ReadOptions& options,
230
147k
                           std::vector<Iterator*>* iters) {
231
  // Merge all level zero files together since they may overlap
232
452k
  for (size_t i = 0; i < files_[0].size(); i++) {
233
304k
    iters->push_back(vset_->table_cache_->NewIterator(
234
304k
        options, files_[0][i]->number, files_[0][i]->file_size));
235
304k
  }
236
237
  // For levels > 0, we can use a concatenating iterator that sequentially
238
  // walks through the non-overlapping files in the level, opening them
239
  // lazily.
240
1.03M
  for (int level = 1; level < config::kNumLevels; level++) {
241
883k
    if (!files_[level].empty()) {
242
125k
      iters->push_back(NewConcatenatingIterator(options, level));
243
125k
    }
244
883k
  }
245
147k
}
246
247
// Callback from TableCache::Get()
248
namespace {
249
enum SaverState {
250
  kNotFound,
251
  kFound,
252
  kDeleted,
253
  kCorrupt,
254
};
255
struct Saver {
256
  SaverState state;
257
  const Comparator* ucmp;
258
  Slice user_key;
259
  std::string* value;
260
};
261
}  // namespace
262
57.1k
static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
263
57.1k
  Saver* s = reinterpret_cast<Saver*>(arg);
264
57.1k
  ParsedInternalKey parsed_key;
265
57.1k
  if (!ParseInternalKey(ikey, &parsed_key)) {
266
0
    s->state = kCorrupt;
267
57.1k
  } else {
268
57.1k
    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
269
19.9k
      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
270
19.9k
      if (s->state == kFound) {
271
11.8k
        s->value->assign(v.data(), v.size());
272
11.8k
      }
273
19.9k
    }
274
57.1k
  }
275
57.1k
}
276
277
57.3k
static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
278
57.3k
  return a->number > b->number;
279
57.3k
}
280
281
void Version::ForEachOverlapping(Slice user_key, Slice internal_key, void* arg,
282
52.0k
                                 bool (*func)(void*, int, FileMetaData*)) {
283
52.0k
  const Comparator* ucmp = vset_->icmp_.user_comparator();
284
285
  // Search level-0 in order from newest to oldest.
286
52.0k
  std::vector<FileMetaData*> tmp;
287
52.0k
  tmp.reserve(files_[0].size());
288
191k
  for (uint32_t i = 0; i < files_[0].size(); i++) {
289
139k
    FileMetaData* f = files_[0][i];
290
139k
    if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
291
104k
        ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
292
72.8k
      tmp.push_back(f);
293
72.8k
    }
294
139k
  }
295
52.0k
  if (!tmp.empty()) {
296
29.3k
    std::sort(tmp.begin(), tmp.end(), NewestFirst);
297
55.9k
    for (uint32_t i = 0; i < tmp.size(); i++) {
298
42.1k
      if (!(*func)(arg, 0, tmp[i])) {
299
15.5k
        return;
300
15.5k
      }
301
42.1k
    }
302
29.3k
  }
303
304
  // Search other levels.
305
220k
  for (int level = 1; level < config::kNumLevels; level++) {
306
189k
    size_t num_files = files_[level].size();
307
189k
    if (num_files == 0) continue;
308
309
    // Binary search to find earliest index whose largest key >= internal_key.
310
34.6k
    uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
311
34.6k
    if (index < num_files) {
312
31.0k
      FileMetaData* f = files_[level][index];
313
31.0k
      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
314
        // All of "f" is past any data for user_key
315
24.5k
      } else {
316
24.5k
        if (!(*func)(arg, level, f)) {
317
6.34k
          return;
318
6.34k
        }
319
24.5k
      }
320
31.0k
    }
321
34.6k
  }
322
36.5k
}
323
324
Status Version::Get(const ReadOptions& options, const LookupKey& k,
325
44.8k
                    std::string* value, GetStats* stats) {
326
44.8k
  stats->seek_file = nullptr;
327
44.8k
  stats->seek_file_level = -1;
328
329
44.8k
  struct State {
330
44.8k
    Saver saver;
331
44.8k
    GetStats* stats;
332
44.8k
    const ReadOptions* options;
333
44.8k
    Slice ikey;
334
44.8k
    FileMetaData* last_file_read;
335
44.8k
    int last_file_read_level;
336
337
44.8k
    VersionSet* vset;
338
44.8k
    Status s;
339
44.8k
    bool found;
340
341
57.6k
    static bool Match(void* arg, int level, FileMetaData* f) {
342
57.6k
      State* state = reinterpret_cast<State*>(arg);
343
344
57.6k
      if (state->stats->seek_file == nullptr &&
345
50.0k
          state->last_file_read != nullptr) {
346
        // We have had more than one seek for this read.  Charge the 1st file.
347
12.3k
        state->stats->seek_file = state->last_file_read;
348
12.3k
        state->stats->seek_file_level = state->last_file_read_level;
349
12.3k
      }
350
351
57.6k
      state->last_file_read = f;
352
57.6k
      state->last_file_read_level = level;
353
354
57.6k
      state->s = state->vset->table_cache_->Get(*state->options, f->number,
355
57.6k
                                                f->file_size, state->ikey,
356
57.6k
                                                &state->saver, SaveValue);
357
57.6k
      if (!state->s.ok()) {
358
0
        state->found = true;
359
0
        return false;
360
0
      }
361
57.6k
      switch (state->saver.state) {
362
37.6k
        case kNotFound:
363
37.6k
          return true;  // Keep searching in other files
364
11.8k
        case kFound:
365
11.8k
          state->found = true;
366
11.8k
          return false;
367
8.10k
        case kDeleted:
368
8.10k
          return false;
369
0
        case kCorrupt:
370
0
          state->s =
371
0
              Status::Corruption("corrupted key for ", state->saver.user_key);
372
0
          state->found = true;
373
0
          return false;
374
57.6k
      }
375
376
      // Not reached. Added to avoid false compilation warnings of
377
      // "control reaches end of non-void function".
378
0
      return false;
379
57.6k
    }
380
44.8k
  };
381
382
44.8k
  State state;
383
44.8k
  state.found = false;
384
44.8k
  state.stats = stats;
385
44.8k
  state.last_file_read = nullptr;
386
44.8k
  state.last_file_read_level = -1;
387
388
44.8k
  state.options = &options;
389
44.8k
  state.ikey = k.internal_key();
390
44.8k
  state.vset = vset_;
391
392
44.8k
  state.saver.state = kNotFound;
393
44.8k
  state.saver.ucmp = vset_->icmp_.user_comparator();
394
44.8k
  state.saver.user_key = k.user_key();
395
44.8k
  state.saver.value = value;
396
397
44.8k
  ForEachOverlapping(state.saver.user_key, state.ikey, &state, &State::Match);
398
399
44.8k
  return state.found ? state.s : Status::NotFound(Slice());
400
44.8k
}
401
402
46.7k
bool Version::UpdateStats(const GetStats& stats) {
403
46.7k
  FileMetaData* f = stats.seek_file;
404
46.7k
  if (f != nullptr) {
405
14.2k
    f->allowed_seeks--;
406
14.2k
    if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) {
407
47
      file_to_compact_ = f;
408
47
      file_to_compact_level_ = stats.seek_file_level;
409
47
      return true;
410
47
    }
411
14.2k
  }
412
46.6k
  return false;
413
46.7k
}
414
415
7.23k
bool Version::RecordReadSample(Slice internal_key) {
416
7.23k
  ParsedInternalKey ikey;
417
7.23k
  if (!ParseInternalKey(internal_key, &ikey)) {
418
0
    return false;
419
0
  }
420
421
7.23k
  struct State {
422
7.23k
    GetStats stats;  // Holds first matching file
423
7.23k
    int matches;
424
425
9.03k
    static bool Match(void* arg, int level, FileMetaData* f) {
426
9.03k
      State* state = reinterpret_cast<State*>(arg);
427
9.03k
      state->matches++;
428
9.03k
      if (state->matches == 1) {
429
        // Remember first match.
430
7.12k
        state->stats.seek_file = f;
431
7.12k
        state->stats.seek_file_level = level;
432
7.12k
      }
433
      // We can stop iterating once we have a second match.
434
9.03k
      return state->matches < 2;
435
9.03k
    }
436
7.23k
  };
437
438
7.23k
  State state;
439
7.23k
  state.matches = 0;
440
7.23k
  ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
441
442
  // Must have at least two matches since we want to merge across
443
  // files. But what if we have a single file that contains many
444
  // overwrites and deletions?  Should we have another mechanism for
445
  // finding such files?
446
7.23k
  if (state.matches >= 2) {
447
    // 1MB cost is about 1 seek (see comment in Builder::Apply).
448
1.91k
    return UpdateStats(state.stats);
449
1.91k
  }
450
5.31k
  return false;
451
7.23k
}
452
453
969k
void Version::Ref() { ++refs_; }
454
455
969k
void Version::Unref() {
456
969k
  assert(this != &vset_->dummy_versions_);
457
969k
  assert(refs_ >= 1);
458
969k
  --refs_;
459
969k
  if (refs_ == 0) {
460
408k
    delete this;
461
408k
  }
462
969k
}
463
464
bool Version::OverlapInLevel(int level, const Slice* smallest_user_key,
465
230k
                             const Slice* largest_user_key) {
466
230k
  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
467
230k
                               smallest_user_key, largest_user_key);
468
230k
}
469
470
int Version::PickLevelForMemTableOutput(const Slice& smallest_user_key,
471
14.7k
                                        const Slice& largest_user_key) {
472
14.7k
  int level = 0;
473
14.7k
  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
474
    // Push to next level if there is no overlap in next level,
475
    // and the #bytes overlapping in the level after that are limited.
476
11.6k
    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
477
11.6k
    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
478
11.6k
    std::vector<FileMetaData*> overlaps;
479
24.9k
    while (level < config::kMaxMemCompactLevel) {
480
20.6k
      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
481
7.31k
        break;
482
7.31k
      }
483
13.3k
      if (level + 2 < config::kNumLevels) {
484
        // Check that file does not overlap too many grandparent bytes.
485
13.3k
        GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
486
13.3k
        const int64_t sum = TotalFileSize(overlaps);
487
13.3k
        if (sum > MaxGrandParentOverlapBytes(vset_->options_)) {
488
0
          break;
489
0
        }
490
13.3k
      }
491
13.3k
      level++;
492
13.3k
    }
493
11.6k
  }
494
14.7k
  return level;
495
14.7k
}
496
497
// Store in "*inputs" all files in "level" that overlap [begin,end]
498
void Version::GetOverlappingInputs(int level, const InternalKey* begin,
499
                                   const InternalKey* end,
500
192k
                                   std::vector<FileMetaData*>* inputs) {
501
192k
  assert(level >= 0);
502
192k
  assert(level < config::kNumLevels);
503
192k
  inputs->clear();
504
192k
  Slice user_begin, user_end;
505
192k
  if (begin != nullptr) {
506
192k
    user_begin = begin->user_key();
507
192k
  }
508
192k
  if (end != nullptr) {
509
192k
    user_end = end->user_key();
510
192k
  }
511
192k
  const Comparator* user_cmp = vset_->icmp_.user_comparator();
512
811k
  for (size_t i = 0; i < files_[level].size();) {
513
619k
    FileMetaData* f = files_[level][i++];
514
619k
    const Slice file_start = f->smallest.user_key();
515
619k
    const Slice file_limit = f->largest.user_key();
516
619k
    if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
517
      // "f" is completely before specified range; skip it
518
449k
    } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
519
      // "f" is completely after specified range; skip it
520
240k
    } else {
521
209k
      inputs->push_back(f);
522
209k
      if (level == 0) {
523
        // Level-0 files may overlap each other.  So check if the newly
524
        // added file has expanded the range.  If so, restart search.
525
143k
        if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
526
1.50k
          user_begin = file_start;
527
1.50k
          inputs->clear();
528
1.50k
          i = 0;
529
141k
        } else if (end != nullptr &&
530
141k
                   user_cmp->Compare(file_limit, user_end) > 0) {
531
3.78k
          user_end = file_limit;
532
3.78k
          inputs->clear();
533
3.78k
          i = 0;
534
3.78k
        }
535
143k
      }
536
209k
    }
537
619k
  }
538
192k
}
539
540
284
std::string Version::DebugString() const {
541
284
  std::string r;
542
2.27k
  for (int level = 0; level < config::kNumLevels; level++) {
543
    // E.g.,
544
    //   --- level 1 ---
545
    //   17:123['a' .. 'd']
546
    //   20:43['e' .. 'g']
547
1.98k
    r.append("--- level ");
548
1.98k
    AppendNumberTo(&r, level);
549
1.98k
    r.append(" ---\n");
550
1.98k
    const std::vector<FileMetaData*>& files = files_[level];
551
3.84k
    for (size_t i = 0; i < files.size(); i++) {
552
1.86k
      r.push_back(' ');
553
1.86k
      AppendNumberTo(&r, files[i]->number);
554
1.86k
      r.push_back(':');
555
1.86k
      AppendNumberTo(&r, files[i]->file_size);
556
1.86k
      r.append("[");
557
1.86k
      r.append(files[i]->smallest.DebugString());
558
1.86k
      r.append(" .. ");
559
1.86k
      r.append(files[i]->largest.DebugString());
560
1.86k
      r.append("]\n");
561
1.86k
    }
562
1.98k
  }
563
284
  return r;
564
284
}
565
566
// A helper class so we can efficiently apply a whole sequence
567
// of edits to a particular state without creating intermediate
568
// Versions that contain full copies of the intermediate state.
569
class VersionSet::Builder {
570
 private:
571
  // Helper to sort by v->files_[file_number].smallest
572
  struct BySmallestKey {
573
    const InternalKeyComparator* internal_comparator;
574
575
5.64M
    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
576
5.64M
      int r = internal_comparator->Compare(f1->smallest, f2->smallest);
577
5.64M
      if (r != 0) {
578
5.63M
        return (r < 0);
579
5.63M
      } else {
580
        // Break ties by file number
581
18.8k
        return (f1->number < f2->number);
582
18.8k
      }
583
5.64M
    }
584
  };
585
586
  typedef std::set<FileMetaData*, BySmallestKey> FileSet;
587
  struct LevelState {
588
    std::set<uint64_t> deleted_files;
589
    FileSet* added_files;
590
  };
591
592
  VersionSet* vset_;
593
  Version* base_;
594
  LevelState levels_[config::kNumLevels];
595
596
 public:
597
  // Initialize a builder with the files from *base and other info from *vset
598
292k
  Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) {
599
292k
    base_->Ref();
600
292k
    BySmallestKey cmp;
601
292k
    cmp.internal_comparator = &vset_->icmp_;
602
2.34M
    for (int level = 0; level < config::kNumLevels; level++) {
603
2.04M
      levels_[level].added_files = new FileSet(cmp);
604
2.04M
    }
605
292k
  }
606
607
292k
  ~Builder() {
608
2.34M
    for (int level = 0; level < config::kNumLevels; level++) {
609
2.04M
      const FileSet* added = levels_[level].added_files;
610
2.04M
      std::vector<FileMetaData*> to_unref;
611
2.04M
      to_unref.reserve(added->size());
612
2.96M
      for (FileSet::const_iterator it = added->begin(); it != added->end();
613
2.04M
           ++it) {
614
913k
        to_unref.push_back(*it);
615
913k
      }
616
2.04M
      delete added;
617
2.96M
      for (uint32_t i = 0; i < to_unref.size(); i++) {
618
913k
        FileMetaData* f = to_unref[i];
619
913k
        f->refs--;
620
913k
        if (f->refs <= 0) {
621
65.6k
          delete f;
622
65.6k
        }
623
913k
      }
624
2.04M
    }
625
292k
    base_->Unref();
626
292k
  }
627
628
  // Apply all of the edits in *edit to the current state.
629
458k
  void Apply(const VersionEdit* edit) {
630
    // Update compaction pointers
631
606k
    for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
632
148k
      const int level = edit->compact_pointers_[i].first;
633
148k
      vset_->compact_pointer_[level] =
634
148k
          edit->compact_pointers_[i].second.Encode().ToString();
635
148k
    }
636
637
    // Delete files
638
458k
    for (const auto& deleted_file_set_kvp : edit->deleted_files_) {
639
135k
      const int level = deleted_file_set_kvp.first;
640
135k
      const uint64_t number = deleted_file_set_kvp.second;
641
135k
      levels_[level].deleted_files.insert(number);
642
135k
    }
643
644
    // Add new files
645
1.37M
    for (size_t i = 0; i < edit->new_files_.size(); i++) {
646
913k
      const int level = edit->new_files_[i].first;
647
913k
      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
648
913k
      f->refs = 1;
649
650
      // We arrange to automatically compact this file after
651
      // a certain number of seeks.  Let's assume:
652
      //   (1) One seek costs 10ms
653
      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
654
      //   (3) A compaction of 1MB does 25MB of IO:
655
      //         1MB read from this level
656
      //         10-12MB read from next level (boundaries may be misaligned)
657
      //         10-12MB written to next level
658
      // This implies that 25 seeks cost the same as the compaction
659
      // of 1MB of data.  I.e., one seek costs approximately the
660
      // same as the compaction of 40KB of data.  We are a little
661
      // conservative and allow approximately one seek for every 16KB
662
      // of data before triggering a compaction.
663
913k
      f->allowed_seeks = static_cast<int>((f->file_size / 16384U));
664
913k
      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
665
666
913k
      levels_[level].deleted_files.erase(f->number);
667
913k
      levels_[level].added_files->insert(f);
668
913k
    }
669
458k
  }
670
671
  // Save the current state in *v.
672
292k
  void SaveTo(Version* v) {
673
292k
    BySmallestKey cmp;
674
292k
    cmp.internal_comparator = &vset_->icmp_;
675
2.34M
    for (int level = 0; level < config::kNumLevels; level++) {
676
      // Merge the set of added files with the set of pre-existing files.
677
      // Drop any deleted files.  Store the result in *v.
678
2.04M
      const std::vector<FileMetaData*>& base_files = base_->files_[level];
679
2.04M
      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
680
2.04M
      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
681
2.04M
      const FileSet* added_files = levels_[level].added_files;
682
2.04M
      v->files_[level].reserve(base_files.size() + added_files->size());
683
2.04M
      for (const auto& added_file : *added_files) {
684
        // Add all smaller files listed in base_
685
913k
        for (std::vector<FileMetaData*>::const_iterator bpos =
686
913k
                 std::upper_bound(base_iter, base_end, added_file, cmp);
687
1.07M
             base_iter != bpos; ++base_iter) {
688
162k
          MaybeAddFile(v, level, *base_iter);
689
162k
        }
690
691
913k
        MaybeAddFile(v, level, added_file);
692
913k
      }
693
694
      // Add remaining base files
695
3.17M
      for (; base_iter != base_end; ++base_iter) {
696
1.13M
        MaybeAddFile(v, level, *base_iter);
697
1.13M
      }
698
699
#ifndef NDEBUG
700
      // Make sure there is no overlap in levels > 0
701
      if (level > 0) {
702
        for (uint32_t i = 1; i < v->files_[level].size(); i++) {
703
          const InternalKey& prev_end = v->files_[level][i - 1]->largest;
704
          const InternalKey& this_begin = v->files_[level][i]->smallest;
705
          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
706
            std::fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
707
                         prev_end.DebugString().c_str(),
708
                         this_begin.DebugString().c_str());
709
            std::abort();
710
          }
711
        }
712
      }
713
#endif
714
2.04M
    }
715
292k
  }
716
717
2.20M
  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
718
2.20M
    if (levels_[level].deleted_files.count(f->number) > 0) {
719
      // File is deleted: do nothing
720
2.07M
    } else {
721
2.07M
      std::vector<FileMetaData*>* files = &v->files_[level];
722
2.07M
      if (level > 0 && !files->empty()) {
723
        // Must not overlap
724
1.15M
        assert(vset_->icmp_.Compare((*files)[files->size() - 1]->largest,
725
1.15M
                                    f->smallest) < 0);
726
1.15M
      }
727
2.07M
      f->refs++;
728
2.07M
      files->push_back(f);
729
2.07M
    }
730
2.20M
  }
731
};
732
733
VersionSet::VersionSet(const std::string& dbname, const Options* options,
734
                       TableCache* table_cache,
735
                       const InternalKeyComparator* cmp)
736
115k
    : env_(options->env),
737
115k
      dbname_(dbname),
738
115k
      options_(options),
739
115k
      table_cache_(table_cache),
740
115k
      icmp_(*cmp),
741
115k
      next_file_number_(2),
742
115k
      manifest_file_number_(0),  // Filled by Recover()
743
115k
      last_sequence_(0),
744
115k
      log_number_(0),
745
115k
      prev_log_number_(0),
746
115k
      descriptor_file_(nullptr),
747
115k
      descriptor_log_(nullptr),
748
115k
      dummy_versions_(this),
749
115k
      current_(nullptr) {
750
115k
  AppendVersion(new Version(this));
751
115k
}
752
753
115k
VersionSet::~VersionSet() {
754
115k
  current_->Unref();
755
115k
  assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
756
115k
  delete descriptor_log_;
757
115k
  delete descriptor_file_;
758
115k
}
759
760
408k
void VersionSet::AppendVersion(Version* v) {
761
  // Make "v" current
762
408k
  assert(v->refs_ == 0);
763
408k
  assert(v != current_);
764
408k
  if (current_ != nullptr) {
765
292k
    current_->Unref();
766
292k
  }
767
408k
  current_ = v;
768
408k
  v->Ref();
769
770
  // Append to linked list
771
408k
  v->prev_ = dummy_versions_.prev_;
772
408k
  v->next_ = &dummy_versions_;
773
408k
  v->prev_->next_ = v;
774
408k
  v->next_->prev_ = v;
775
408k
}
776
777
176k
Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
778
176k
  if (edit->has_log_number_) {
779
148k
    assert(edit->log_number_ >= log_number_);
780
148k
    assert(edit->log_number_ < next_file_number_);
781
148k
  } else {
782
28.6k
    edit->SetLogNumber(log_number_);
783
28.6k
  }
784
785
176k
  if (!edit->has_prev_log_number_) {
786
28.6k
    edit->SetPrevLogNumber(prev_log_number_);
787
28.6k
  }
788
789
176k
  edit->SetNextFile(next_file_number_);
790
176k
  edit->SetLastSequence(last_sequence_);
791
792
176k
  Version* v = new Version(this);
793
176k
  {
794
176k
    Builder builder(this, current_);
795
176k
    builder.Apply(edit);
796
176k
    builder.SaveTo(v);
797
176k
  }
798
176k
  Finalize(v);
799
800
  // Initialize new descriptor log file if necessary by creating
801
  // a temporary file that contains a snapshot of the current version.
802
176k
  std::string new_manifest_file;
803
176k
  Status s;
804
176k
  if (descriptor_log_ == nullptr) {
805
    // No reason to unlock *mu here since we only hit this path in the
806
    // first call to LogAndApply (when opening the database).
807
115k
    assert(descriptor_file_ == nullptr);
808
115k
    new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
809
115k
    s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
810
115k
    if (s.ok()) {
811
115k
      descriptor_log_ = new log::Writer(descriptor_file_);
812
115k
      s = WriteSnapshot(descriptor_log_);
813
115k
    }
814
115k
  }
815
816
  // Unlock during expensive MANIFEST log write
817
176k
  {
818
176k
    mu->Unlock();
819
820
    // Write new record to MANIFEST log
821
176k
    if (s.ok()) {
822
176k
      std::string record;
823
176k
      edit->EncodeTo(&record);
824
176k
      s = descriptor_log_->AddRecord(record);
825
176k
      if (s.ok()) {
826
176k
        s = descriptor_file_->Sync();
827
176k
      }
828
176k
      if (!s.ok()) {
829
0
        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
830
0
      }
831
176k
    }
832
833
    // If we just created a new descriptor file, install it by writing a
834
    // new CURRENT file that points to it.
835
176k
    if (s.ok() && !new_manifest_file.empty()) {
836
115k
      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
837
115k
    }
838
839
176k
    mu->Lock();
840
176k
  }
841
842
  // Install the new version
843
176k
  if (s.ok()) {
844
176k
    AppendVersion(v);
845
176k
    log_number_ = edit->log_number_;
846
176k
    prev_log_number_ = edit->prev_log_number_;
847
176k
  } else {
848
0
    delete v;
849
0
    if (!new_manifest_file.empty()) {
850
0
      delete descriptor_log_;
851
0
      delete descriptor_file_;
852
0
      descriptor_log_ = nullptr;
853
0
      descriptor_file_ = nullptr;
854
0
      env_->RemoveFile(new_manifest_file);
855
0
    }
856
0
  }
857
858
176k
  return s;
859
176k
}
860
861
115k
Status VersionSet::Recover(bool* save_manifest) {
862
115k
  struct LogReporter : public log::Reader::Reporter {
863
115k
    Status* status;
864
115k
    void Corruption(size_t bytes, const Status& s) override {
865
0
      if (this->status->ok()) *this->status = s;
866
0
    }
867
115k
  };
868
869
  // Read "CURRENT" file, which contains a pointer to the current manifest file
870
115k
  std::string current;
871
115k
  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
872
115k
  if (!s.ok()) {
873
0
    return s;
874
0
  }
875
115k
  if (current.empty() || current[current.size() - 1] != '\n') {
876
0
    return Status::Corruption("CURRENT file does not end with newline");
877
0
  }
878
115k
  current.resize(current.size() - 1);
879
880
115k
  std::string dscname = dbname_ + "/" + current;
881
115k
  SequentialFile* file;
882
115k
  s = env_->NewSequentialFile(dscname, &file);
883
115k
  if (!s.ok()) {
884
0
    if (s.IsNotFound()) {
885
0
      return Status::Corruption("CURRENT points to a non-existent file",
886
0
                                s.ToString());
887
0
    }
888
0
    return s;
889
0
  }
890
891
115k
  bool have_log_number = false;
892
115k
  bool have_prev_log_number = false;
893
115k
  bool have_next_file = false;
894
115k
  bool have_last_sequence = false;
895
115k
  uint64_t next_file = 0;
896
115k
  uint64_t last_sequence = 0;
897
115k
  uint64_t log_number = 0;
898
115k
  uint64_t prev_log_number = 0;
899
115k
  Builder builder(this, current_);
900
115k
  int read_records = 0;
901
902
115k
  {
903
115k
    LogReporter reporter;
904
115k
    reporter.status = &s;
905
115k
    log::Reader reader(file, &reporter, true /*checksum*/,
906
115k
                       0 /*initial_offset*/);
907
115k
    Slice record;
908
115k
    std::string scratch;
909
397k
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
910
281k
      ++read_records;
911
281k
      VersionEdit edit;
912
281k
      s = edit.DecodeFrom(record);
913
281k
      if (s.ok()) {
914
281k
        if (edit.has_comparator_ &&
915
115k
            edit.comparator_ != icmp_.user_comparator()->Name()) {
916
0
          s = Status::InvalidArgument(
917
0
              edit.comparator_ + " does not match existing comparator ",
918
0
              icmp_.user_comparator()->Name());
919
0
        }
920
281k
      }
921
922
281k
      if (s.ok()) {
923
281k
        builder.Apply(&edit);
924
281k
      }
925
926
281k
      if (edit.has_log_number_) {
927
171k
        log_number = edit.log_number_;
928
171k
        have_log_number = true;
929
171k
      }
930
931
281k
      if (edit.has_prev_log_number_) {
932
166k
        prev_log_number = edit.prev_log_number_;
933
166k
        have_prev_log_number = true;
934
166k
      }
935
936
281k
      if (edit.has_next_file_number_) {
937
171k
        next_file = edit.next_file_number_;
938
171k
        have_next_file = true;
939
171k
      }
940
941
281k
      if (edit.has_last_sequence_) {
942
171k
        last_sequence = edit.last_sequence_;
943
171k
        have_last_sequence = true;
944
171k
      }
945
281k
    }
946
115k
  }
947
115k
  delete file;
948
115k
  file = nullptr;
949
950
115k
  if (s.ok()) {
951
115k
    if (!have_next_file) {
952
0
      s = Status::Corruption("no meta-nextfile entry in descriptor");
953
115k
    } else if (!have_log_number) {
954
0
      s = Status::Corruption("no meta-lognumber entry in descriptor");
955
115k
    } else if (!have_last_sequence) {
956
0
      s = Status::Corruption("no last-sequence-number entry in descriptor");
957
0
    }
958
959
115k
    if (!have_prev_log_number) {
960
5.07k
      prev_log_number = 0;
961
5.07k
    }
962
963
115k
    MarkFileNumberUsed(prev_log_number);
964
115k
    MarkFileNumberUsed(log_number);
965
115k
  }
966
967
115k
  if (s.ok()) {
968
115k
    Version* v = new Version(this);
969
115k
    builder.SaveTo(v);
970
    // Install recovered version
971
115k
    Finalize(v);
972
115k
    AppendVersion(v);
973
115k
    manifest_file_number_ = next_file;
974
115k
    next_file_number_ = next_file + 1;
975
115k
    last_sequence_ = last_sequence;
976
115k
    log_number_ = log_number;
977
115k
    prev_log_number_ = prev_log_number;
978
979
    // See if we can reuse the existing MANIFEST file.
980
115k
    if (ReuseManifest(dscname, current)) {
981
      // No need to save new manifest
982
115k
    } else {
983
115k
      *save_manifest = true;
984
115k
    }
985
115k
  } else {
986
0
    std::string error = s.ToString();
987
0
    Log(options_->info_log, "Error recovering version set with %d records: %s",
988
0
        read_records, error.c_str());
989
0
  }
990
991
115k
  return s;
992
115k
}
993
994
bool VersionSet::ReuseManifest(const std::string& dscname,
995
115k
                               const std::string& dscbase) {
996
115k
  if (!options_->reuse_logs) {
997
115k
    return false;
998
115k
  }
999
0
  FileType manifest_type;
1000
0
  uint64_t manifest_number;
1001
0
  uint64_t manifest_size;
1002
0
  if (!ParseFileName(dscbase, &manifest_number, &manifest_type) ||
1003
0
      manifest_type != kDescriptorFile ||
1004
0
      !env_->GetFileSize(dscname, &manifest_size).ok() ||
1005
      // Make new compacted MANIFEST if old one is too big
1006
0
      manifest_size >= TargetFileSize(options_)) {
1007
0
    return false;
1008
0
  }
1009
1010
0
  assert(descriptor_file_ == nullptr);
1011
0
  assert(descriptor_log_ == nullptr);
1012
0
  Status r = env_->NewAppendableFile(dscname, &descriptor_file_);
1013
0
  if (!r.ok()) {
1014
0
    Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str());
1015
0
    assert(descriptor_file_ == nullptr);
1016
0
    return false;
1017
0
  }
1018
1019
0
  Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str());
1020
0
  descriptor_log_ = new log::Writer(descriptor_file_, manifest_size);
1021
0
  manifest_file_number_ = manifest_number;
1022
0
  return true;
1023
0
}
1024
1025
341k
void VersionSet::MarkFileNumberUsed(uint64_t number) {
1026
341k
  if (next_file_number_ <= number) {
1027
110k
    next_file_number_ = number + 1;
1028
110k
  }
1029
341k
}
1030
1031
292k
void VersionSet::Finalize(Version* v) {
1032
  // Precomputed best level for next compaction
1033
292k
  int best_level = -1;
1034
292k
  double best_score = -1;
1035
1036
2.04M
  for (int level = 0; level < config::kNumLevels - 1; level++) {
1037
1.75M
    double score;
1038
1.75M
    if (level == 0) {
1039
      // We treat level-0 specially by bounding the number of files
1040
      // instead of number of bytes for two reasons:
1041
      //
1042
      // (1) With larger write-buffer sizes, it is nice not to do too
1043
      // many level-0 compactions.
1044
      //
1045
      // (2) The files in level-0 are merged on every read and
1046
      // therefore we wish to avoid too many files when the individual
1047
      // file size is small (perhaps because of a small write-buffer
1048
      // setting, or very high compression ratios, or lots of
1049
      // overwrites/deletions).
1050
292k
      score = v->files_[level].size() /
1051
292k
              static_cast<double>(config::kL0_CompactionTrigger);
1052
1.46M
    } else {
1053
      // Compute the ratio of current size to size limit.
1054
1.46M
      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
1055
1.46M
      score =
1056
1.46M
          static_cast<double>(level_bytes) / MaxBytesForLevel(options_, level);
1057
1.46M
    }
1058
1059
1.75M
    if (score > best_score) {
1060
354k
      best_level = level;
1061
354k
      best_score = score;
1062
354k
    }
1063
1.75M
  }
1064
1065
292k
  v->compaction_level_ = best_level;
1066
292k
  v->compaction_score_ = best_score;
1067
292k
}
1068
1069
115k
Status VersionSet::WriteSnapshot(log::Writer* log) {
1070
  // TODO: Break up into multiple records to reduce memory usage on recovery?
1071
1072
  // Save metadata
1073
115k
  VersionEdit edit;
1074
115k
  edit.SetComparatorName(icmp_.user_comparator()->Name());
1075
1076
  // Save compaction pointers
1077
925k
  for (int level = 0; level < config::kNumLevels; level++) {
1078
809k
    if (!compact_pointer_[level].empty()) {
1079
94.4k
      InternalKey key;
1080
94.4k
      key.DecodeFrom(compact_pointer_[level]);
1081
94.4k
      edit.SetCompactPointer(level, key);
1082
94.4k
    }
1083
809k
  }
1084
1085
  // Save files
1086
925k
  for (int level = 0; level < config::kNumLevels; level++) {
1087
809k
    const std::vector<FileMetaData*>& files = current_->files_[level];
1088
1.57M
    for (size_t i = 0; i < files.size(); i++) {
1089
766k
      const FileMetaData* f = files[i];
1090
766k
      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
1091
766k
    }
1092
809k
  }
1093
1094
115k
  std::string record;
1095
115k
  edit.EncodeTo(&record);
1096
115k
  return log->AddRecord(record);
1097
115k
}
1098
1099
1.95M
int VersionSet::NumLevelFiles(int level) const {
1100
1.95M
  assert(level >= 0);
1101
1.95M
  assert(level < config::kNumLevels);
1102
1.95M
  return current_->files_[level].size();
1103
1.95M
}
1104
1105
36.5k
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
1106
  // Update code if kNumLevels changes
1107
36.5k
  static_assert(config::kNumLevels == 7, "");
1108
36.5k
  std::snprintf(
1109
36.5k
      scratch->buffer, sizeof(scratch->buffer), "files[ %d %d %d %d %d %d %d ]",
1110
36.5k
      int(current_->files_[0].size()), int(current_->files_[1].size()),
1111
36.5k
      int(current_->files_[2].size()), int(current_->files_[3].size()),
1112
36.5k
      int(current_->files_[4].size()), int(current_->files_[5].size()),
1113
36.5k
      int(current_->files_[6].size()));
1114
36.5k
  return scratch->buffer;
1115
36.5k
}
1116
1117
0
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
1118
0
  uint64_t result = 0;
1119
0
  for (int level = 0; level < config::kNumLevels; level++) {
1120
0
    const std::vector<FileMetaData*>& files = v->files_[level];
1121
0
    for (size_t i = 0; i < files.size(); i++) {
1122
0
      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
1123
        // Entire file is before "ikey", so just add the file size
1124
0
        result += files[i]->file_size;
1125
0
      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
1126
        // Entire file is after "ikey", so ignore
1127
0
        if (level > 0) {
1128
          // Files other than level 0 are sorted by meta->smallest, so
1129
          // no further files in this level will contain data for
1130
          // "ikey".
1131
0
          break;
1132
0
        }
1133
0
      } else {
1134
        // "ikey" falls in the range for this table.  Add the
1135
        // approximate offset of "ikey" within the table.
1136
0
        Table* tableptr;
1137
0
        Iterator* iter = table_cache_->NewIterator(
1138
0
            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
1139
0
        if (tableptr != nullptr) {
1140
0
          result += tableptr->ApproximateOffsetOf(ikey.Encode());
1141
0
        }
1142
0
        delete iter;
1143
0
      }
1144
0
    }
1145
0
  }
1146
0
  return result;
1147
0
}
1148
1149
284k
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
1150
568k
  for (Version* v = dummy_versions_.next_; v != &dummy_versions_;
1151
284k
       v = v->next_) {
1152
2.27M
    for (int level = 0; level < config::kNumLevels; level++) {
1153
1.99M
      const std::vector<FileMetaData*>& files = v->files_[level];
1154
3.83M
      for (size_t i = 0; i < files.size(); i++) {
1155
1.84M
        live->insert(files[i]->number);
1156
1.84M
      }
1157
1.99M
    }
1158
284k
  }
1159
284k
}
1160
1161
0
int64_t VersionSet::NumLevelBytes(int level) const {
1162
0
  assert(level >= 0);
1163
0
  assert(level < config::kNumLevels);
1164
0
  return TotalFileSize(current_->files_[level]);
1165
0
}
1166
1167
0
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
1168
0
  int64_t result = 0;
1169
0
  std::vector<FileMetaData*> overlaps;
1170
0
  for (int level = 1; level < config::kNumLevels - 1; level++) {
1171
0
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
1172
0
      const FileMetaData* f = current_->files_[level][i];
1173
0
      current_->GetOverlappingInputs(level + 1, &f->smallest, &f->largest,
1174
0
                                     &overlaps);
1175
0
      const int64_t sum = TotalFileSize(overlaps);
1176
0
      if (sum > result) {
1177
0
        result = sum;
1178
0
      }
1179
0
    }
1180
0
  }
1181
0
  return result;
1182
0
}
1183
1184
// Stores the minimal range that covers all entries in inputs in
1185
// *smallest, *largest.
1186
// REQUIRES: inputs is not empty
1187
void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
1188
102k
                          InternalKey* smallest, InternalKey* largest) {
1189
102k
  assert(!inputs.empty());
1190
102k
  smallest->Clear();
1191
102k
  largest->Clear();
1192
355k
  for (size_t i = 0; i < inputs.size(); i++) {
1193
253k
    FileMetaData* f = inputs[i];
1194
253k
    if (i == 0) {
1195
102k
      *smallest = f->smallest;
1196
102k
      *largest = f->largest;
1197
151k
    } else {
1198
151k
      if (icmp_.Compare(f->smallest, *smallest) < 0) {
1199
12.9k
        *smallest = f->smallest;
1200
12.9k
      }
1201
151k
      if (icmp_.Compare(f->largest, *largest) > 0) {
1202
100k
        *largest = f->largest;
1203
100k
      }
1204
151k
    }
1205
253k
  }
1206
102k
}
1207
1208
// Stores the minimal range that covers all entries in inputs1 and inputs2
1209
// in *smallest, *largest.
1210
// REQUIRES: inputs is not empty
1211
void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
1212
                           const std::vector<FileMetaData*>& inputs2,
1213
40.3k
                           InternalKey* smallest, InternalKey* largest) {
1214
40.3k
  std::vector<FileMetaData*> all = inputs1;
1215
40.3k
  all.insert(all.end(), inputs2.begin(), inputs2.end());
1216
40.3k
  GetRange(all, smallest, largest);
1217
40.3k
}
1218
1219
28.3k
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
1220
28.3k
  ReadOptions options;
1221
28.3k
  options.verify_checksums = options_->paranoid_checks;
1222
28.3k
  options.fill_cache = false;
1223
1224
  // Level-0 files have to be merged together.  For other levels,
1225
  // we will make a concatenating iterator per level.
1226
  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
1227
28.3k
  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
1228
28.3k
  Iterator** list = new Iterator*[space];
1229
28.3k
  int num = 0;
1230
85.1k
  for (int which = 0; which < 2; which++) {
1231
56.7k
    if (!c->inputs_[which].empty()) {
1232
49.5k
      if (c->level() + which == 0) {
1233
21.7k
        const std::vector<FileMetaData*>& files = c->inputs_[which];
1234
97.8k
        for (size_t i = 0; i < files.size(); i++) {
1235
76.1k
          list[num++] = table_cache_->NewIterator(options, files[i]->number,
1236
76.1k
                                                  files[i]->file_size);
1237
76.1k
        }
1238
27.8k
      } else {
1239
        // Create concatenating iterator for the files from this level
1240
27.8k
        list[num++] = NewTwoLevelIterator(
1241
27.8k
            new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
1242
27.8k
            &GetFileIterator, table_cache_, options);
1243
27.8k
      }
1244
49.5k
    }
1245
56.7k
  }
1246
28.3k
  assert(num <= space);
1247
28.3k
  Iterator* result = NewMergingIterator(&icmp_, list, num);
1248
28.3k
  delete[] list;
1249
28.3k
  return result;
1250
28.3k
}
1251
1252
21.7k
Compaction* VersionSet::PickCompaction() {
1253
21.7k
  Compaction* c;
1254
21.7k
  int level;
1255
1256
  // We prefer compactions triggered by too much data in a level over
1257
  // the compactions triggered by seeks.
1258
21.7k
  const bool size_compaction = (current_->compaction_score_ >= 1);
1259
21.7k
  const bool seek_compaction = (current_->file_to_compact_ != nullptr);
1260
21.7k
  if (size_compaction) {
1261
21.7k
    level = current_->compaction_level_;
1262
21.7k
    assert(level >= 0);
1263
21.7k
    assert(level + 1 < config::kNumLevels);
1264
21.7k
    c = new Compaction(options_, level);
1265
1266
    // Pick the first file that comes after compact_pointer_[level]
1267
86.7k
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
1268
78.6k
      FileMetaData* f = current_->files_[level][i];
1269
78.6k
      if (compact_pointer_[level].empty() ||
1270
77.2k
          icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
1271
13.5k
        c->inputs_[0].push_back(f);
1272
13.5k
        break;
1273
13.5k
      }
1274
78.6k
    }
1275
21.7k
    if (c->inputs_[0].empty()) {
1276
      // Wrap-around to the beginning of the key space
1277
8.16k
      c->inputs_[0].push_back(current_->files_[level][0]);
1278
8.16k
    }
1279
21.7k
  } else if (seek_compaction) {
1280
28
    level = current_->file_to_compact_level_;
1281
28
    c = new Compaction(options_, level);
1282
28
    c->inputs_[0].push_back(current_->file_to_compact_);
1283
28
  } else {
1284
0
    return nullptr;
1285
0
  }
1286
1287
21.7k
  c->input_version_ = current_;
1288
21.7k
  c->input_version_->Ref();
1289
1290
  // Files in level 0 may overlap each other, so pick up all overlapping ones
1291
21.7k
  if (level == 0) {
1292
21.7k
    InternalKey smallest, largest;
1293
21.7k
    GetRange(c->inputs_[0], &smallest, &largest);
1294
    // Note that the next call will discard the file we placed in
1295
    // c->inputs_[0] earlier and replace it with an overlapping set
1296
    // which will include the picked file.
1297
21.7k
    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
1298
21.7k
    assert(!c->inputs_[0].empty());
1299
21.7k
  }
1300
1301
21.7k
  SetupOtherInputs(c);
1302
1303
21.7k
  return c;
1304
21.7k
}
1305
1306
// Finds the largest key in a vector of files. Returns true if files is not
1307
// empty.
1308
bool FindLargestKey(const InternalKeyComparator& icmp,
1309
                    const std::vector<FileMetaData*>& files,
1310
98.1k
                    InternalKey* largest_key) {
1311
98.1k
  if (files.empty()) {
1312
15.3k
    return false;
1313
15.3k
  }
1314
82.7k
  *largest_key = files[0]->largest;
1315
180k
  for (size_t i = 1; i < files.size(); ++i) {
1316
97.7k
    FileMetaData* f = files[i];
1317
97.7k
    if (icmp.Compare(f->largest, *largest_key) > 0) {
1318
62.7k
      *largest_key = f->largest;
1319
62.7k
    }
1320
97.7k
  }
1321
82.7k
  return true;
1322
98.1k
}
1323
1324
// Finds minimum file b2=(l2, u2) in level file for which l2 > u1 and
1325
// user_key(l2) = user_key(u1)
1326
FileMetaData* FindSmallestBoundaryFile(
1327
    const InternalKeyComparator& icmp,
1328
    const std::vector<FileMetaData*>& level_files,
1329
82.7k
    const InternalKey& largest_key) {
1330
82.7k
  const Comparator* user_cmp = icmp.user_comparator();
1331
82.7k
  FileMetaData* smallest_boundary_file = nullptr;
1332
405k
  for (size_t i = 0; i < level_files.size(); ++i) {
1333
322k
    FileMetaData* f = level_files[i];
1334
322k
    if (icmp.Compare(f->smallest, largest_key) > 0 &&
1335
92.7k
        user_cmp->Compare(f->smallest.user_key(), largest_key.user_key()) ==
1336
92.7k
            0) {
1337
0
      if (smallest_boundary_file == nullptr ||
1338
0
          icmp.Compare(f->smallest, smallest_boundary_file->smallest) < 0) {
1339
0
        smallest_boundary_file = f;
1340
0
      }
1341
0
    }
1342
322k
  }
1343
82.7k
  return smallest_boundary_file;
1344
82.7k
}
1345
1346
// Extracts the largest file b1 from |compaction_files| and then searches for a
1347
// b2 in |level_files| for which user_key(u1) = user_key(l2). If it finds such a
1348
// file b2 (known as a boundary file) it adds it to |compaction_files| and then
1349
// searches again using this new upper bound.
1350
//
1351
// If there are two blocks, b1=(l1, u1) and b2=(l2, u2) and
1352
// user_key(u1) = user_key(l2), and if we compact b1 but not b2 then a
1353
// subsequent get operation will yield an incorrect result because it will
1354
// return the record from b2 in level i rather than from b1 because it searches
1355
// level by level for records matching the supplied user key.
1356
//
1357
// parameters:
1358
//   in     level_files:      List of files to search for boundary files.
1359
//   in/out compaction_files: List of files to extend by adding boundary files.
1360
void AddBoundaryInputs(const InternalKeyComparator& icmp,
1361
                       const std::vector<FileMetaData*>& level_files,
1362
98.1k
                       std::vector<FileMetaData*>* compaction_files) {
1363
98.1k
  InternalKey largest_key;
1364
1365
  // Quick return if compaction_files is empty.
1366
98.1k
  if (!FindLargestKey(icmp, *compaction_files, &largest_key)) {
1367
15.3k
    return;
1368
15.3k
  }
1369
1370
82.7k
  bool continue_searching = true;
1371
165k
  while (continue_searching) {
1372
82.7k
    FileMetaData* smallest_boundary_file =
1373
82.7k
        FindSmallestBoundaryFile(icmp, level_files, largest_key);
1374
1375
    // If a boundary file was found advance largest_key, otherwise we're done.
1376
82.7k
    if (smallest_boundary_file != NULL) {
1377
0
      compaction_files->push_back(smallest_boundary_file);
1378
0
      largest_key = smallest_boundary_file->largest;
1379
82.7k
    } else {
1380
82.7k
      continue_searching = false;
1381
82.7k
    }
1382
82.7k
  }
1383
82.7k
}
1384
1385
36.5k
void VersionSet::SetupOtherInputs(Compaction* c) {
1386
36.5k
  const int level = c->level();
1387
36.5k
  InternalKey smallest, largest;
1388
1389
36.5k
  AddBoundaryInputs(icmp_, current_->files_[level], &c->inputs_[0]);
1390
36.5k
  GetRange(c->inputs_[0], &smallest, &largest);
1391
1392
36.5k
  current_->GetOverlappingInputs(level + 1, &smallest, &largest,
1393
36.5k
                                 &c->inputs_[1]);
1394
36.5k
  AddBoundaryInputs(icmp_, current_->files_[level + 1], &c->inputs_[1]);
1395
1396
  // Get entire range covered by compaction
1397
36.5k
  InternalKey all_start, all_limit;
1398
36.5k
  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
1399
1400
  // See if we can grow the number of inputs in "level" without
1401
  // changing the number of "level+1" files we pick up.
1402
36.5k
  if (!c->inputs_[1].empty()) {
1403
21.1k
    std::vector<FileMetaData*> expanded0;
1404
21.1k
    current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
1405
21.1k
    AddBoundaryInputs(icmp_, current_->files_[level], &expanded0);
1406
21.1k
    const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
1407
21.1k
    const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
1408
21.1k
    const int64_t expanded0_size = TotalFileSize(expanded0);
1409
21.1k
    if (expanded0.size() > c->inputs_[0].size() &&
1410
3.82k
        inputs1_size + expanded0_size <
1411
3.82k
            ExpandedCompactionByteSizeLimit(options_)) {
1412
3.82k
      InternalKey new_start, new_limit;
1413
3.82k
      GetRange(expanded0, &new_start, &new_limit);
1414
3.82k
      std::vector<FileMetaData*> expanded1;
1415
3.82k
      current_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
1416
3.82k
                                     &expanded1);
1417
3.82k
      AddBoundaryInputs(icmp_, current_->files_[level + 1], &expanded1);
1418
3.82k
      if (expanded1.size() == c->inputs_[1].size()) {
1419
3.74k
        Log(options_->info_log,
1420
3.74k
            "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
1421
3.74k
            level, int(c->inputs_[0].size()), int(c->inputs_[1].size()),
1422
3.74k
            long(inputs0_size), long(inputs1_size), int(expanded0.size()),
1423
3.74k
            int(expanded1.size()), long(expanded0_size), long(inputs1_size));
1424
3.74k
        smallest = new_start;
1425
3.74k
        largest = new_limit;
1426
3.74k
        c->inputs_[0] = expanded0;
1427
3.74k
        c->inputs_[1] = expanded1;
1428
3.74k
        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
1429
3.74k
      }
1430
3.82k
    }
1431
21.1k
  }
1432
1433
  // Compute the set of grandparent files that overlap this compaction
1434
  // (parent == level+1; grandparent == level+2)
1435
36.5k
  if (level + 2 < config::kNumLevels) {
1436
36.5k
    current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
1437
36.5k
                                   &c->grandparents_);
1438
36.5k
  }
1439
1440
  // Update the place where we will do the next compaction for this level.
1441
  // We update this immediately instead of waiting for the VersionEdit
1442
  // to be applied so that if the compaction fails, we will try a different
1443
  // key range next time.
1444
36.5k
  compact_pointer_[level] = largest.Encode().ToString();
1445
36.5k
  c->edit_.SetCompactPointer(level, largest);
1446
36.5k
}
1447
1448
Compaction* VersionSet::CompactRange(int level, const InternalKey* begin,
1449
59.5k
                                     const InternalKey* end) {
1450
59.5k
  std::vector<FileMetaData*> inputs;
1451
59.5k
  current_->GetOverlappingInputs(level, begin, end, &inputs);
1452
59.5k
  if (inputs.empty()) {
1453
44.7k
    return nullptr;
1454
44.7k
  }
1455
1456
  // Avoid compacting too much in one shot in case the range is large.
1457
  // But we cannot do this for level-0 since level-0 files can overlap
1458
  // and we must not pick one file and drop another older file if the
1459
  // two files overlap.
1460
14.8k
  if (level > 0) {
1461
6.67k
    const uint64_t limit = MaxFileSizeForLevel(options_, level);
1462
6.67k
    uint64_t total = 0;
1463
14.3k
    for (size_t i = 0; i < inputs.size(); i++) {
1464
7.71k
      uint64_t s = inputs[i]->file_size;
1465
7.71k
      total += s;
1466
7.71k
      if (total >= limit) {
1467
0
        inputs.resize(i + 1);
1468
0
        break;
1469
0
      }
1470
7.71k
    }
1471
6.67k
  }
1472
1473
14.8k
  Compaction* c = new Compaction(options_, level);
1474
14.8k
  c->input_version_ = current_;
1475
14.8k
  c->input_version_->Ref();
1476
14.8k
  c->inputs_[0] = inputs;
1477
14.8k
  SetupOtherInputs(c);
1478
14.8k
  return c;
1479
59.5k
}
1480
1481
Compaction::Compaction(const Options* options, int level)
1482
36.5k
    : level_(level),
1483
36.5k
      max_output_file_size_(MaxFileSizeForLevel(options, level)),
1484
36.5k
      input_version_(nullptr),
1485
36.5k
      grandparent_index_(0),
1486
36.5k
      seen_key_(false),
1487
36.5k
      overlapped_bytes_(0) {
1488
292k
  for (int i = 0; i < config::kNumLevels; i++) {
1489
255k
    level_ptrs_[i] = 0;
1490
255k
  }
1491
36.5k
}
1492
1493
36.5k
Compaction::~Compaction() {
1494
36.5k
  if (input_version_ != nullptr) {
1495
8.17k
    input_version_->Unref();
1496
8.17k
  }
1497
36.5k
}
1498
1499
21.7k
bool Compaction::IsTrivialMove() const {
1500
21.7k
  const VersionSet* vset = input_version_->vset_;
1501
  // Avoid a move if there is lots of overlapping grandparent data.
1502
  // Otherwise, the move could create a parent file that will require
1503
  // a very expensive merge later on.
1504
21.7k
  return (num_input_files(0) == 1 && num_input_files(1) == 0 &&
1505
8.17k
          TotalFileSize(grandparents_) <=
1506
8.17k
              MaxGrandParentOverlapBytes(vset->options_));
1507
21.7k
}
1508
1509
20.4k
void Compaction::AddInputDeletions(VersionEdit* edit) {
1510
61.3k
  for (int which = 0; which < 2; which++) {
1511
102k
    for (size_t i = 0; i < inputs_[which].size(); i++) {
1512
61.5k
      edit->RemoveFile(level_ + which, inputs_[which][i]->number);
1513
61.5k
    }
1514
40.9k
  }
1515
20.4k
}
1516
1517
48.0k
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
1518
  // Maybe use binary search to find right entry instead of linear search?
1519
48.0k
  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
1520
182k
  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
1521
153k
    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
1522
158k
    while (level_ptrs_[lvl] < files.size()) {
1523
27.4k
      FileMetaData* f = files[level_ptrs_[lvl]];
1524
27.4k
      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
1525
        // We've advanced far enough
1526
23.2k
        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
1527
          // Key falls in this file's range, so definitely not base level
1528
19.7k
          return false;
1529
19.7k
        }
1530
3.47k
        break;
1531
23.2k
      }
1532
4.24k
      level_ptrs_[lvl]++;
1533
4.24k
    }
1534
153k
  }
1535
28.2k
  return true;
1536
48.0k
}
1537
1538
1.27M
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
1539
1.27M
  const VersionSet* vset = input_version_->vset_;
1540
  // Scan to find earliest grandparent file that contains key.
1541
1.27M
  const InternalKeyComparator* icmp = &vset->icmp_;
1542
1.27M
  while (grandparent_index_ < grandparents_.size() &&
1543
104k
         icmp->Compare(internal_key,
1544
104k
                       grandparents_[grandparent_index_]->largest.Encode()) >
1545
104k
             0) {
1546
4.15k
    if (seen_key_) {
1547
4.15k
      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
1548
4.15k
    }
1549
4.15k
    grandparent_index_++;
1550
4.15k
  }
1551
1.27M
  seen_key_ = true;
1552
1553
1.27M
  if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) {
1554
    // Too much overlap for current output; start new output
1555
0
    overlapped_bytes_ = 0;
1556
0
    return true;
1557
1.27M
  } else {
1558
1.27M
    return false;
1559
1.27M
  }
1560
1.27M
}
1561
1562
28.3k
void Compaction::ReleaseInputs() {
1563
28.3k
  if (input_version_ != nullptr) {
1564
28.3k
    input_version_->Unref();
1565
28.3k
    input_version_ = nullptr;
1566
28.3k
  }
1567
28.3k
}
1568
1569
}  // namespace leveldb