Coverage Report

Created: 2026-05-30 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/leveldb/db/version_set.cc
Line
Count
Source
1
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5
#include "db/version_set.h"
6
7
#include <algorithm>
8
#include <cstdio>
9
10
#include "db/filename.h"
11
#include "db/log_reader.h"
12
#include "db/log_writer.h"
13
#include "db/memtable.h"
14
#include "db/table_cache.h"
15
#include "leveldb/env.h"
16
#include "leveldb/table_builder.h"
17
#include "table/merger.h"
18
#include "table/two_level_iterator.h"
19
#include "util/coding.h"
20
#include "util/logging.h"
21
22
namespace leveldb {
23
24
332
static size_t TargetFileSize(const Options* options) {
25
332
  return options->max_file_size;
26
332
}
27
28
// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
29
// stop building a single file in a level->level+1 compaction.
30
325
static int64_t MaxGrandParentOverlapBytes(const Options* options) {
31
325
  return 10 * TargetFileSize(options);
32
325
}
33
34
// Maximum number of bytes in all compacted files.  We avoid expanding
35
// the lower level file set of a compaction if it would make the
36
// total compaction cover more than this many bytes.
37
0
static int64_t ExpandedCompactionByteSizeLimit(const Options* options) {
38
0
  return 25 * TargetFileSize(options);
39
0
}
40
41
1.12k
static double MaxBytesForLevel(const Options* options, int level) {
42
  // Note: the result for level zero is not really used since we set
43
  // the level-0 compaction threshold based on number of files.
44
45
  // Result for both level-0 and level-1
46
1.12k
  double result = 10. * 1048576.0;
47
3.37k
  while (level > 1) {
48
2.25k
    result *= 10;
49
2.25k
    level--;
50
2.25k
  }
51
1.12k
  return result;
52
1.12k
}
53
54
7
static uint64_t MaxFileSizeForLevel(const Options* options, int level) {
55
  // We could vary per level to reduce number of files?
56
7
  return TargetFileSize(options);
57
7
}
58
59
1.13k
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
60
1.13k
  int64_t sum = 0;
61
1.22k
  for (size_t i = 0; i < files.size(); i++) {
62
89
    sum += files[i]->file_size;
63
89
  }
64
1.13k
  return sum;
65
1.13k
}
66
67
425
Version::~Version() {
68
425
  assert(refs_ == 0);
69
70
  // Remove from linked list
71
425
  prev_->next_ = next_;
72
425
  next_->prev_ = prev_;
73
74
  // Drop references to files
75
3.40k
  for (int level = 0; level < config::kNumLevels; level++) {
76
3.20k
    for (size_t i = 0; i < files_[level].size(); i++) {
77
233
      FileMetaData* f = files_[level][i];
78
233
      assert(f->refs > 0);
79
233
      f->refs--;
80
233
      if (f->refs <= 0) {
81
118
        delete f;
82
118
      }
83
233
    }
84
2.97k
  }
85
425
}
86
87
int FindFile(const InternalKeyComparator& icmp,
88
126
             const std::vector<FileMetaData*>& files, const Slice& key) {
89
126
  uint32_t left = 0;
90
126
  uint32_t right = files.size();
91
145
  while (left < right) {
92
19
    uint32_t mid = (left + right) / 2;
93
19
    const FileMetaData* f = files[mid];
94
19
    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
95
      // Key at "mid.largest" is < "target".  Therefore all
96
      // files at or before "mid" are uninteresting.
97
9
      left = mid + 1;
98
10
    } else {
99
      // Key at "mid.largest" is >= "target".  Therefore all files
100
      // after "mid" are uninteresting.
101
10
      right = mid;
102
10
    }
103
19
  }
104
126
  return right;
105
126
}
106
107
static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
108
4
                      const FileMetaData* f) {
109
  // null user_key occurs before all keys and is therefore never after *f
110
4
  return (user_key != nullptr &&
111
4
          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
112
4
}
113
114
static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
115
6
                       const FileMetaData* f) {
116
  // null user_key occurs after all keys and is therefore never before *f
117
6
  return (user_key != nullptr &&
118
6
          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
119
6
}
120
121
bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
122
                           bool disjoint_sorted_files,
123
                           const std::vector<FileMetaData*>& files,
124
                           const Slice* smallest_user_key,
125
118
                           const Slice* largest_user_key) {
126
118
  const Comparator* ucmp = icmp.user_comparator();
127
118
  if (!disjoint_sorted_files) {
128
    // Need to check against all files
129
6
    for (size_t i = 0; i < files.size(); i++) {
130
4
      const FileMetaData* f = files[i];
131
4
      if (AfterFile(ucmp, smallest_user_key, f) ||
132
4
          BeforeFile(ucmp, largest_user_key, f)) {
133
        // No overlap
134
4
      } else {
135
4
        return true;  // Overlap
136
4
      }
137
4
    }
138
2
    return false;
139
6
  }
140
141
  // Binary search over file list
142
112
  uint32_t index = 0;
143
112
  if (smallest_user_key != nullptr) {
144
    // Find the earliest possible internal key for smallest_user_key
145
112
    InternalKey small_key(*smallest_user_key, kMaxSequenceNumber,
146
112
                          kValueTypeForSeek);
147
112
    index = FindFile(icmp, files, small_key.Encode());
148
112
  }
149
150
112
  if (index >= files.size()) {
151
    // beginning of range is after all files, so no overlap.
152
110
    return false;
153
110
  }
154
155
2
  return !BeforeFile(ucmp, largest_user_key, files[index]);
156
112
}
157
158
// An internal iterator.  For a given version/level pair, yields
159
// information about the files in the level.  For a given entry, key()
160
// is the largest key that occurs in the file, and value() is an
161
// 16-byte value containing the file number and file size, both
162
// encoded using EncodeFixed64.
163
class Version::LevelFileNumIterator : public Iterator {
164
 public:
165
  LevelFileNumIterator(const InternalKeyComparator& icmp,
166
                       const std::vector<FileMetaData*>* flist)
167
18
      : icmp_(icmp), flist_(flist), index_(flist->size()) {  // Marks as invalid
168
18
  }
169
34
  bool Valid() const override { return index_ < flist_->size(); }
170
0
  void Seek(const Slice& target) override {
171
0
    index_ = FindFile(icmp_, *flist_, target);
172
0
  }
173
8
  void SeekToFirst() override { index_ = 0; }
174
0
  void SeekToLast() override {
175
0
    index_ = flist_->empty() ? 0 : flist_->size() - 1;
176
0
  }
177
8
  void Next() override {
178
8
    assert(Valid());
179
8
    index_++;
180
8
  }
181
0
  void Prev() override {
182
0
    assert(Valid());
183
0
    if (index_ == 0) {
184
0
      index_ = flist_->size();  // Marks as invalid
185
0
    } else {
186
0
      index_--;
187
0
    }
188
0
  }
189
8
  Slice key() const override {
190
8
    assert(Valid());
191
8
    return (*flist_)[index_]->largest.Encode();
192
8
  }
193
8
  Slice value() const override {
194
8
    assert(Valid());
195
8
    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
196
8
    EncodeFixed64(value_buf_ + 8, (*flist_)[index_]->file_size);
197
8
    return Slice(value_buf_, sizeof(value_buf_));
198
8
  }
199
4
  Status status() const override { return Status::OK(); }
200
201
 private:
202
  const InternalKeyComparator icmp_;
203
  const std::vector<FileMetaData*>* const flist_;
204
  uint32_t index_;
205
206
  // Backing store for value().  Holds the file number and size.
207
  mutable char value_buf_[16];
208
};
209
210
static Iterator* GetFileIterator(void* arg, const ReadOptions& options,
211
8
                                 const Slice& file_value) {
212
8
  TableCache* cache = reinterpret_cast<TableCache*>(arg);
213
8
  if (file_value.size() != 16) {
214
0
    return NewErrorIterator(
215
0
        Status::Corruption("FileReader invoked with unexpected value"));
216
8
  } else {
217
8
    return cache->NewIterator(options, DecodeFixed64(file_value.data()),
218
8
                              DecodeFixed64(file_value.data() + 8));
219
8
  }
220
8
}
221
222
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
223
16
                                            int level) const {
224
16
  return NewTwoLevelIterator(
225
16
      new LevelFileNumIterator(vset_->icmp_, &files_[level]), &GetFileIterator,
226
16
      vset_->table_cache_, options);
227
16
}
228
229
void Version::AddIterators(const ReadOptions& options,
230
34
                           std::vector<Iterator*>* iters) {
231
  // Merge all level zero files together since they may overlap
232
66
  for (size_t i = 0; i < files_[0].size(); i++) {
233
32
    iters->push_back(vset_->table_cache_->NewIterator(
234
32
        options, files_[0][i]->number, files_[0][i]->file_size));
235
32
  }
236
237
  // For levels > 0, we can use a concatenating iterator that sequentially
238
  // walks through the non-overlapping files in the level, opening them
239
  // lazily.
240
238
  for (int level = 1; level < config::kNumLevels; level++) {
241
204
    if (!files_[level].empty()) {
242
16
      iters->push_back(NewConcatenatingIterator(options, level));
243
16
    }
244
204
  }
245
34
}
246
247
// Callback from TableCache::Get()
248
namespace {
249
enum SaverState {
250
  kNotFound,
251
  kFound,
252
  kDeleted,
253
  kCorrupt,
254
};
255
struct Saver {
256
  SaverState state;
257
  const Comparator* ucmp;
258
  Slice user_key;
259
  std::string* value;
260
};
261
}  // namespace
262
10
static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
263
10
  Saver* s = reinterpret_cast<Saver*>(arg);
264
10
  ParsedInternalKey parsed_key;
265
10
  if (!ParseInternalKey(ikey, &parsed_key)) {
266
0
    s->state = kCorrupt;
267
10
  } else {
268
10
    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
269
0
      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
270
0
      if (s->state == kFound) {
271
0
        s->value->assign(v.data(), v.size());
272
0
      }
273
0
    }
274
10
  }
275
10
}
276
277
6
static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
278
6
  return a->number > b->number;
279
6
}
280
281
void Version::ForEachOverlapping(Slice user_key, Slice internal_key, void* arg,
282
29
                                 bool (*func)(void*, int, FileMetaData*)) {
283
29
  const Comparator* ucmp = vset_->icmp_.user_comparator();
284
285
  // Search level-0 in order from newest to oldest.
286
29
  std::vector<FileMetaData*> tmp;
287
29
  tmp.reserve(files_[0].size());
288
52
  for (uint32_t i = 0; i < files_[0].size(); i++) {
289
23
    FileMetaData* f = files_[0][i];
290
23
    if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
291
20
        ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
292
16
      tmp.push_back(f);
293
16
    }
294
23
  }
295
29
  if (!tmp.empty()) {
296
12
    std::sort(tmp.begin(), tmp.end(), NewestFirst);
297
28
    for (uint32_t i = 0; i < tmp.size(); i++) {
298
16
      if (!(*func)(arg, 0, tmp[i])) {
299
0
        return;
300
0
      }
301
16
    }
302
12
  }
303
304
  // Search other levels.
305
191
  for (int level = 1; level < config::kNumLevels; level++) {
306
164
    size_t num_files = files_[level].size();
307
164
    if (num_files == 0) continue;
308
309
    // Binary search to find earliest index whose largest key >= internal_key.
310
14
    uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
311
14
    if (index < num_files) {
312
8
      FileMetaData* f = files_[level][index];
313
8
      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
314
        // All of "f" is past any data for user_key
315
4
      } else {
316
4
        if (!(*func)(arg, level, f)) {
317
2
          return;
318
2
        }
319
4
      }
320
8
    }
321
14
  }
322
29
}
323
324
Status Version::Get(const ReadOptions& options, const LookupKey& k,
325
21
                    std::string* value, GetStats* stats) {
326
21
  stats->seek_file = nullptr;
327
21
  stats->seek_file_level = -1;
328
329
21
  struct State {
330
21
    Saver saver;
331
21
    GetStats* stats;
332
21
    const ReadOptions* options;
333
21
    Slice ikey;
334
21
    FileMetaData* last_file_read;
335
21
    int last_file_read_level;
336
337
21
    VersionSet* vset;
338
21
    Status s;
339
21
    bool found;
340
341
21
    static bool Match(void* arg, int level, FileMetaData* f) {
342
10
      State* state = reinterpret_cast<State*>(arg);
343
344
10
      if (state->stats->seek_file == nullptr &&
345
6
          state->last_file_read != nullptr) {
346
        // We have had more than one seek for this read.  Charge the 1st file.
347
2
        state->stats->seek_file = state->last_file_read;
348
2
        state->stats->seek_file_level = state->last_file_read_level;
349
2
      }
350
351
10
      state->last_file_read = f;
352
10
      state->last_file_read_level = level;
353
354
10
      state->s = state->vset->table_cache_->Get(*state->options, f->number,
355
10
                                                f->file_size, state->ikey,
356
10
                                                &state->saver, SaveValue);
357
10
      if (!state->s.ok()) {
358
0
        state->found = true;
359
0
        return false;
360
0
      }
361
10
      switch (state->saver.state) {
362
10
        case kNotFound:
363
10
          return true;  // Keep searching in other files
364
0
        case kFound:
365
0
          state->found = true;
366
0
          return false;
367
0
        case kDeleted:
368
0
          return false;
369
0
        case kCorrupt:
370
0
          state->s =
371
0
              Status::Corruption("corrupted key for ", state->saver.user_key);
372
0
          state->found = true;
373
0
          return false;
374
10
      }
375
376
      // Not reached. Added to avoid false compilation warnings of
377
      // "control reaches end of non-void function".
378
0
      return false;
379
10
    }
380
21
  };
381
382
21
  State state;
383
21
  state.found = false;
384
21
  state.stats = stats;
385
21
  state.last_file_read = nullptr;
386
21
  state.last_file_read_level = -1;
387
388
21
  state.options = &options;
389
21
  state.ikey = k.internal_key();
390
21
  state.vset = vset_;
391
392
21
  state.saver.state = kNotFound;
393
21
  state.saver.ucmp = vset_->icmp_.user_comparator();
394
21
  state.saver.user_key = k.user_key();
395
21
  state.saver.value = value;
396
397
21
  ForEachOverlapping(state.saver.user_key, state.ikey, &state, &State::Match);
398
399
21
  return state.found ? state.s : Status::NotFound(Slice());
400
21
}
401
402
23
bool Version::UpdateStats(const GetStats& stats) {
403
23
  FileMetaData* f = stats.seek_file;
404
23
  if (f != nullptr) {
405
4
    f->allowed_seeks--;
406
4
    if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) {
407
0
      file_to_compact_ = f;
408
0
      file_to_compact_level_ = stats.seek_file_level;
409
0
      return true;
410
0
    }
411
4
  }
412
23
  return false;
413
23
}
414
415
8
bool Version::RecordReadSample(Slice internal_key) {
416
8
  ParsedInternalKey ikey;
417
8
  if (!ParseInternalKey(internal_key, &ikey)) {
418
0
    return false;
419
0
  }
420
421
8
  struct State {
422
8
    GetStats stats;  // Holds first matching file
423
8
    int matches;
424
425
10
    static bool Match(void* arg, int level, FileMetaData* f) {
426
10
      State* state = reinterpret_cast<State*>(arg);
427
10
      state->matches++;
428
10
      if (state->matches == 1) {
429
        // Remember first match.
430
8
        state->stats.seek_file = f;
431
8
        state->stats.seek_file_level = level;
432
8
      }
433
      // We can stop iterating once we have a second match.
434
10
      return state->matches < 2;
435
10
    }
436
8
  };
437
438
8
  State state;
439
8
  state.matches = 0;
440
8
  ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
441
442
  // Must have at least two matches since we want to merge across
443
  // files. But what if we have a single file that contains many
444
  // overwrites and deletions?  Should we have another mechanism for
445
  // finding such files?
446
8
  if (state.matches >= 2) {
447
    // 1MB cost is about 1 seek (see comment in Builder::Apply).
448
2
    return UpdateStats(state.stats);
449
2
  }
450
6
  return false;
451
8
}
452
453
631
void Version::Ref() { ++refs_; }
454
455
631
void Version::Unref() {
456
631
  assert(this != &vset_->dummy_versions_);
457
631
  assert(refs_ >= 1);
458
631
  --refs_;
459
631
  if (refs_ == 0) {
460
325
    delete this;
461
325
  }
462
631
}
463
464
bool Version::OverlapInLevel(int level, const Slice* smallest_user_key,
465
118
                             const Slice* largest_user_key) {
466
118
  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
467
118
                               smallest_user_key, largest_user_key);
468
118
}
469
470
int Version::PickLevelForMemTableOutput(const Slice& smallest_user_key,
471
6
                                        const Slice& largest_user_key) {
472
6
  int level = 0;
473
6
  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
474
    // Push to next level if there is no overlap in next level,
475
    // and the #bytes overlapping in the level after that are limited.
476
2
    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
477
2
    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
478
2
    std::vector<FileMetaData*> overlaps;
479
6
    while (level < config::kMaxMemCompactLevel) {
480
4
      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
481
0
        break;
482
0
      }
483
4
      if (level + 2 < config::kNumLevels) {
484
        // Check that file does not overlap too many grandparent bytes.
485
4
        GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
486
4
        const int64_t sum = TotalFileSize(overlaps);
487
4
        if (sum > MaxGrandParentOverlapBytes(vset_->options_)) {
488
0
          break;
489
0
        }
490
4
      }
491
4
      level++;
492
4
    }
493
2
  }
494
6
  return level;
495
6
}
496
497
// Store in "*inputs" all files in "level" that overlap [begin,end]
498
void Version::GetOverlappingInputs(int level, const InternalKey* begin,
499
                                   const InternalKey* end,
500
45
                                   std::vector<FileMetaData*>* inputs) {
501
45
  assert(level >= 0);
502
45
  assert(level < config::kNumLevels);
503
45
  inputs->clear();
504
45
  Slice user_begin, user_end;
505
45
  if (begin != nullptr) {
506
45
    user_begin = begin->user_key();
507
45
  }
508
45
  if (end != nullptr) {
509
45
    user_end = end->user_key();
510
45
  }
511
45
  const Comparator* user_cmp = vset_->icmp_.user_comparator();
512
112
  for (size_t i = 0; i < files_[level].size();) {
513
67
    FileMetaData* f = files_[level][i++];
514
67
    const Slice file_start = f->smallest.user_key();
515
67
    const Slice file_limit = f->largest.user_key();
516
67
    if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
517
      // "f" is completely before specified range; skip it
518
55
    } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
519
      // "f" is completely after specified range; skip it
520
45
    } else {
521
45
      inputs->push_back(f);
522
45
      if (level == 0) {
523
        // Level-0 files may overlap each other.  So check if the newly
524
        // added file has expanded the range.  If so, restart search.
525
43
        if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
526
2
          user_begin = file_start;
527
2
          inputs->clear();
528
2
          i = 0;
529
41
        } else if (end != nullptr &&
530
41
                   user_cmp->Compare(file_limit, user_end) > 0) {
531
6
          user_end = file_limit;
532
6
          inputs->clear();
533
6
          i = 0;
534
6
        }
535
43
      }
536
45
    }
537
67
  }
538
45
}
539
540
0
std::string Version::DebugString() const {
541
0
  std::string r;
542
0
  for (int level = 0; level < config::kNumLevels; level++) {
543
    // E.g.,
544
    //   --- level 1 ---
545
    //   17:123['a' .. 'd']
546
    //   20:43['e' .. 'g']
547
0
    r.append("--- level ");
548
0
    AppendNumberTo(&r, level);
549
0
    r.append(" ---\n");
550
0
    const std::vector<FileMetaData*>& files = files_[level];
551
0
    for (size_t i = 0; i < files.size(); i++) {
552
0
      r.push_back(' ');
553
0
      AppendNumberTo(&r, files[i]->number);
554
0
      r.push_back(':');
555
0
      AppendNumberTo(&r, files[i]->file_size);
556
0
      r.append("[");
557
0
      r.append(files[i]->smallest.DebugString());
558
0
      r.append(" .. ");
559
0
      r.append(files[i]->largest.DebugString());
560
0
      r.append("]\n");
561
0
    }
562
0
  }
563
0
  return r;
564
0
}
565
566
// A helper class so we can efficiently apply a whole sequence
567
// of edits to a particular state without creating intermediate
568
// Versions that contain full copies of the intermediate state.
569
class VersionSet::Builder {
570
 private:
571
  // Helper to sort by v->files_[file_number].smallest
572
  struct BySmallestKey {
573
    const InternalKeyComparator* internal_comparator;
574
575
133
    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
576
133
      int r = internal_comparator->Compare(f1->smallest, f2->smallest);
577
133
      if (r != 0) {
578
133
        return (r < 0);
579
133
      } else {
580
        // Break ties by file number
581
0
        return (f1->number < f2->number);
582
0
      }
583
133
    }
584
  };
585
586
  typedef std::set<FileMetaData*, BySmallestKey> FileSet;
587
  struct LevelState {
588
    std::set<uint64_t> deleted_files;
589
    FileSet* added_files;
590
  };
591
592
  VersionSet* vset_;
593
  Version* base_;
594
  LevelState levels_[config::kNumLevels];
595
596
 public:
597
  // Initialize a builder with the files from *base and other info from *vset
598
225
  Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) {
599
225
    base_->Ref();
600
225
    BySmallestKey cmp;
601
225
    cmp.internal_comparator = &vset_->icmp_;
602
1.80k
    for (int level = 0; level < config::kNumLevels; level++) {
603
1.57k
      levels_[level].added_files = new FileSet(cmp);
604
1.57k
    }
605
225
  }
606
607
225
  ~Builder() {
608
1.80k
    for (int level = 0; level < config::kNumLevels; level++) {
609
1.57k
      const FileSet* added = levels_[level].added_files;
610
1.57k
      std::vector<FileMetaData*> to_unref;
611
1.57k
      to_unref.reserve(added->size());
612
1.71k
      for (FileSet::const_iterator it = added->begin(); it != added->end();
613
1.57k
           ++it) {
614
139
        to_unref.push_back(*it);
615
139
      }
616
1.57k
      delete added;
617
1.71k
      for (uint32_t i = 0; i < to_unref.size(); i++) {
618
139
        FileMetaData* f = to_unref[i];
619
139
        f->refs--;
620
139
        if (f->refs <= 0) {
621
21
          delete f;
622
21
        }
623
139
      }
624
1.57k
    }
625
225
    base_->Unref();
626
225
  }
627
628
  // Apply all of the edits in *edit to the current state.
629
295
  void Apply(const VersionEdit* edit) {
630
    // Update compaction pointers
631
320
    for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
632
25
      const int level = edit->compact_pointers_[i].first;
633
25
      vset_->compact_pointer_[level] =
634
25
          edit->compact_pointers_[i].second.Encode().ToString();
635
25
    }
636
637
    // Delete files
638
295
    for (const auto& deleted_file_set_kvp : edit->deleted_files_) {
639
42
      const int level = deleted_file_set_kvp.first;
640
42
      const uint64_t number = deleted_file_set_kvp.second;
641
42
      levels_[level].deleted_files.insert(number);
642
42
    }
643
644
    // Add new files
645
434
    for (size_t i = 0; i < edit->new_files_.size(); i++) {
646
139
      const int level = edit->new_files_[i].first;
647
139
      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
648
139
      f->refs = 1;
649
650
      // We arrange to automatically compact this file after
651
      // a certain number of seeks.  Let's assume:
652
      //   (1) One seek costs 10ms
653
      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
654
      //   (3) A compaction of 1MB does 25MB of IO:
655
      //         1MB read from this level
656
      //         10-12MB read from next level (boundaries may be misaligned)
657
      //         10-12MB written to next level
658
      // This implies that 25 seeks cost the same as the compaction
659
      // of 1MB of data.  I.e., one seek costs approximately the
660
      // same as the compaction of 40KB of data.  We are a little
661
      // conservative and allow approximately one seek for every 16KB
662
      // of data before triggering a compaction.
663
139
      f->allowed_seeks = static_cast<int>((f->file_size / 16384U));
664
139
      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
665
666
139
      levels_[level].deleted_files.erase(f->number);
667
139
      levels_[level].added_files->insert(f);
668
139
    }
669
295
  }
670
671
  // Save the current state in *v.
672
225
  void SaveTo(Version* v) {
673
225
    BySmallestKey cmp;
674
225
    cmp.internal_comparator = &vset_->icmp_;
675
1.80k
    for (int level = 0; level < config::kNumLevels; level++) {
676
      // Merge the set of added files with the set of pre-existing files.
677
      // Drop any deleted files.  Store the result in *v.
678
1.57k
      const std::vector<FileMetaData*>& base_files = base_->files_[level];
679
1.57k
      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
680
1.57k
      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
681
1.57k
      const FileSet* added_files = levels_[level].added_files;
682
1.57k
      v->files_[level].reserve(base_files.size() + added_files->size());
683
1.57k
      for (const auto& added_file : *added_files) {
684
        // Add all smaller files listed in base_
685
139
        for (std::vector<FileMetaData*>::const_iterator bpos =
686
139
                 std::upper_bound(base_iter, base_end, added_file, cmp);
687
163
             base_iter != bpos; ++base_iter) {
688
24
          MaybeAddFile(v, level, *base_iter);
689
24
        }
690
691
139
        MaybeAddFile(v, level, added_file);
692
139
      }
693
694
      // Add remaining base files
695
1.68k
      for (; base_iter != base_end; ++base_iter) {
696
112
        MaybeAddFile(v, level, *base_iter);
697
112
      }
698
699
#ifndef NDEBUG
700
      // Make sure there is no overlap in levels > 0
701
      if (level > 0) {
702
        for (uint32_t i = 1; i < v->files_[level].size(); i++) {
703
          const InternalKey& prev_end = v->files_[level][i - 1]->largest;
704
          const InternalKey& this_begin = v->files_[level][i]->smallest;
705
          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
706
            std::fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
707
                         prev_end.DebugString().c_str(),
708
                         this_begin.DebugString().c_str());
709
            std::abort();
710
          }
711
        }
712
      }
713
#endif
714
1.57k
    }
715
225
  }
716
717
275
  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
718
275
    if (levels_[level].deleted_files.count(f->number) > 0) {
719
      // File is deleted: do nothing
720
233
    } else {
721
233
      std::vector<FileMetaData*>* files = &v->files_[level];
722
233
      if (level > 0 && !files->empty()) {
723
        // Must not overlap
724
3
        assert(vset_->icmp_.Compare((*files)[files->size() - 1]->largest,
725
3
                                    f->smallest) < 0);
726
3
      }
727
233
      f->refs++;
728
233
      files->push_back(f);
729
233
    }
730
275
  }
731
};
732
733
VersionSet::VersionSet(const std::string& dbname, const Options* options,
734
                       TableCache* table_cache,
735
                       const InternalKeyComparator* cmp)
736
100
    : env_(options->env),
737
100
      dbname_(dbname),
738
100
      options_(options),
739
100
      table_cache_(table_cache),
740
100
      icmp_(*cmp),
741
100
      next_file_number_(2),
742
100
      manifest_file_number_(0),  // Filled by Recover()
743
100
      last_sequence_(0),
744
100
      log_number_(0),
745
100
      prev_log_number_(0),
746
100
      descriptor_file_(nullptr),
747
100
      descriptor_log_(nullptr),
748
100
      dummy_versions_(this),
749
100
      current_(nullptr) {
750
100
  AppendVersion(new Version(this));
751
100
}
752
753
100
VersionSet::~VersionSet() {
754
100
  current_->Unref();
755
100
  assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
756
100
  delete descriptor_log_;
757
100
  delete descriptor_file_;
758
100
}
759
760
325
void VersionSet::AppendVersion(Version* v) {
761
  // Make "v" current
762
325
  assert(v->refs_ == 0);
763
325
  assert(v != current_);
764
325
  if (current_ != nullptr) {
765
225
    current_->Unref();
766
225
  }
767
325
  current_ = v;
768
325
  v->Ref();
769
770
  // Append to linked list
771
325
  v->prev_ = dummy_versions_.prev_;
772
325
  v->next_ = &dummy_versions_;
773
325
  v->prev_->next_ = v;
774
325
  v->next_->prev_ = v;
775
325
}
776
777
125
Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
778
125
  if (edit->has_log_number_) {
779
118
    assert(edit->log_number_ >= log_number_);
780
118
    assert(edit->log_number_ < next_file_number_);
781
118
  } else {
782
7
    edit->SetLogNumber(log_number_);
783
7
  }
784
785
125
  if (!edit->has_prev_log_number_) {
786
7
    edit->SetPrevLogNumber(prev_log_number_);
787
7
  }
788
789
125
  edit->SetNextFile(next_file_number_);
790
125
  edit->SetLastSequence(last_sequence_);
791
792
125
  Version* v = new Version(this);
793
125
  {
794
125
    Builder builder(this, current_);
795
125
    builder.Apply(edit);
796
125
    builder.SaveTo(v);
797
125
  }
798
125
  Finalize(v);
799
800
  // Initialize new descriptor log file if necessary by creating
801
  // a temporary file that contains a snapshot of the current version.
802
125
  std::string new_manifest_file;
803
125
  Status s;
804
125
  if (descriptor_log_ == nullptr) {
805
    // No reason to unlock *mu here since we only hit this path in the
806
    // first call to LogAndApply (when opening the database).
807
100
    assert(descriptor_file_ == nullptr);
808
100
    new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
809
100
    s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
810
100
    if (s.ok()) {
811
100
      descriptor_log_ = new log::Writer(descriptor_file_);
812
100
      s = WriteSnapshot(descriptor_log_);
813
100
    }
814
100
  }
815
816
  // Unlock during expensive MANIFEST log write
817
125
  {
818
125
    mu->Unlock();
819
820
    // Write new record to MANIFEST log
821
125
    if (s.ok()) {
822
125
      std::string record;
823
125
      edit->EncodeTo(&record);
824
125
      s = descriptor_log_->AddRecord(record);
825
125
      if (s.ok()) {
826
125
        s = descriptor_file_->Sync();
827
125
      }
828
125
      if (!s.ok()) {
829
0
        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
830
0
      }
831
125
    }
832
833
    // If we just created a new descriptor file, install it by writing a
834
    // new CURRENT file that points to it.
835
125
    if (s.ok() && !new_manifest_file.empty()) {
836
100
      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
837
100
    }
838
839
125
    mu->Lock();
840
125
  }
841
842
  // Install the new version
843
125
  if (s.ok()) {
844
125
    AppendVersion(v);
845
125
    log_number_ = edit->log_number_;
846
125
    prev_log_number_ = edit->prev_log_number_;
847
125
  } else {
848
0
    delete v;
849
0
    if (!new_manifest_file.empty()) {
850
0
      delete descriptor_log_;
851
0
      delete descriptor_file_;
852
0
      descriptor_log_ = nullptr;
853
0
      descriptor_file_ = nullptr;
854
0
      env_->RemoveFile(new_manifest_file);
855
0
    }
856
0
  }
857
858
125
  return s;
859
125
}
860
861
100
Status VersionSet::Recover(bool* save_manifest) {
862
100
  struct LogReporter : public log::Reader::Reporter {
863
100
    Status* status;
864
100
    void Corruption(size_t bytes, const Status& s) override {
865
0
      if (this->status->ok()) *this->status = s;
866
0
    }
867
100
  };
868
869
  // Read "CURRENT" file, which contains a pointer to the current manifest file
870
100
  std::string current;
871
100
  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
872
100
  if (!s.ok()) {
873
0
    return s;
874
0
  }
875
100
  if (current.empty() || current[current.size() - 1] != '\n') {
876
0
    return Status::Corruption("CURRENT file does not end with newline");
877
0
  }
878
100
  current.resize(current.size() - 1);
879
880
100
  std::string dscname = dbname_ + "/" + current;
881
100
  SequentialFile* file;
882
100
  s = env_->NewSequentialFile(dscname, &file);
883
100
  if (!s.ok()) {
884
0
    if (s.IsNotFound()) {
885
0
      return Status::Corruption("CURRENT points to a non-existent file",
886
0
                                s.ToString());
887
0
    }
888
0
    return s;
889
0
  }
890
891
100
  bool have_log_number = false;
892
100
  bool have_prev_log_number = false;
893
100
  bool have_next_file = false;
894
100
  bool have_last_sequence = false;
895
100
  uint64_t next_file = 0;
896
100
  uint64_t last_sequence = 0;
897
100
  uint64_t log_number = 0;
898
100
  uint64_t prev_log_number = 0;
899
100
  Builder builder(this, current_);
900
100
  int read_records = 0;
901
902
100
  {
903
100
    LogReporter reporter;
904
100
    reporter.status = &s;
905
100
    log::Reader reader(file, &reporter, true /*checksum*/,
906
100
                       0 /*initial_offset*/);
907
100
    Slice record;
908
100
    std::string scratch;
909
270
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
910
170
      ++read_records;
911
170
      VersionEdit edit;
912
170
      s = edit.DecodeFrom(record);
913
170
      if (s.ok()) {
914
170
        if (edit.has_comparator_ &&
915
100
            edit.comparator_ != icmp_.user_comparator()->Name()) {
916
0
          s = Status::InvalidArgument(
917
0
              edit.comparator_ + " does not match existing comparator ",
918
0
              icmp_.user_comparator()->Name());
919
0
        }
920
170
      }
921
922
170
      if (s.ok()) {
923
170
        builder.Apply(&edit);
924
170
      }
925
926
170
      if (edit.has_log_number_) {
927
117
        log_number = edit.log_number_;
928
117
        have_log_number = true;
929
117
      }
930
931
170
      if (edit.has_prev_log_number_) {
932
70
        prev_log_number = edit.prev_log_number_;
933
70
        have_prev_log_number = true;
934
70
      }
935
936
170
      if (edit.has_next_file_number_) {
937
117
        next_file = edit.next_file_number_;
938
117
        have_next_file = true;
939
117
      }
940
941
170
      if (edit.has_last_sequence_) {
942
117
        last_sequence = edit.last_sequence_;
943
117
        have_last_sequence = true;
944
117
      }
945
170
    }
946
100
  }
947
100
  delete file;
948
100
  file = nullptr;
949
950
100
  if (s.ok()) {
951
100
    if (!have_next_file) {
952
0
      s = Status::Corruption("no meta-nextfile entry in descriptor");
953
100
    } else if (!have_log_number) {
954
0
      s = Status::Corruption("no meta-lognumber entry in descriptor");
955
100
    } else if (!have_last_sequence) {
956
0
      s = Status::Corruption("no last-sequence-number entry in descriptor");
957
0
    }
958
959
100
    if (!have_prev_log_number) {
960
47
      prev_log_number = 0;
961
47
    }
962
963
100
    MarkFileNumberUsed(prev_log_number);
964
100
    MarkFileNumberUsed(log_number);
965
100
  }
966
967
100
  if (s.ok()) {
968
100
    Version* v = new Version(this);
969
100
    builder.SaveTo(v);
970
    // Install recovered version
971
100
    Finalize(v);
972
100
    AppendVersion(v);
973
100
    manifest_file_number_ = next_file;
974
100
    next_file_number_ = next_file + 1;
975
100
    last_sequence_ = last_sequence;
976
100
    log_number_ = log_number;
977
100
    prev_log_number_ = prev_log_number;
978
979
    // See if we can reuse the existing MANIFEST file.
980
100
    if (ReuseManifest(dscname, current)) {
981
      // No need to save new manifest
982
100
    } else {
983
100
      *save_manifest = true;
984
100
    }
985
100
  } else {
986
0
    std::string error = s.ToString();
987
0
    Log(options_->info_log, "Error recovering version set with %d records: %s",
988
0
        read_records, error.c_str());
989
0
  }
990
991
100
  return s;
992
100
}
993
994
bool VersionSet::ReuseManifest(const std::string& dscname,
995
100
                               const std::string& dscbase) {
996
100
  if (!options_->reuse_logs) {
997
100
    return false;
998
100
  }
999
0
  FileType manifest_type;
1000
0
  uint64_t manifest_number;
1001
0
  uint64_t manifest_size;
1002
0
  if (!ParseFileName(dscbase, &manifest_number, &manifest_type) ||
1003
0
      manifest_type != kDescriptorFile ||
1004
0
      !env_->GetFileSize(dscname, &manifest_size).ok() ||
1005
      // Make new compacted MANIFEST if old one is too big
1006
0
      manifest_size >= TargetFileSize(options_)) {
1007
0
    return false;
1008
0
  }
1009
1010
0
  assert(descriptor_file_ == nullptr);
1011
0
  assert(descriptor_log_ == nullptr);
1012
0
  Status r = env_->NewAppendableFile(dscname, &descriptor_file_);
1013
0
  if (!r.ok()) {
1014
0
    Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str());
1015
0
    assert(descriptor_file_ == nullptr);
1016
0
    return false;
1017
0
  }
1018
1019
0
  Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str());
1020
0
  descriptor_log_ = new log::Writer(descriptor_file_, manifest_size);
1021
0
  manifest_file_number_ = manifest_number;
1022
0
  return true;
1023
0
}
1024
1025
254
void VersionSet::MarkFileNumberUsed(uint64_t number) {
1026
254
  if (next_file_number_ <= number) {
1027
54
    next_file_number_ = number + 1;
1028
54
  }
1029
254
}
1030
1031
225
void VersionSet::Finalize(Version* v) {
1032
  // Precomputed best level for next compaction
1033
225
  int best_level = -1;
1034
225
  double best_score = -1;
1035
1036
1.57k
  for (int level = 0; level < config::kNumLevels - 1; level++) {
1037
1.35k
    double score;
1038
1.35k
    if (level == 0) {
1039
      // We treat level-0 specially by bounding the number of files
1040
      // instead of number of bytes for two reasons:
1041
      //
1042
      // (1) With larger write-buffer sizes, it is nice not to do too
1043
      // many level-0 compactions.
1044
      //
1045
      // (2) The files in level-0 are merged on every read and
1046
      // therefore we wish to avoid too many files when the individual
1047
      // file size is small (perhaps because of a small write-buffer
1048
      // setting, or very high compression ratios, or lots of
1049
      // overwrites/deletions).
1050
225
      score = v->files_[level].size() /
1051
225
              static_cast<double>(config::kL0_CompactionTrigger);
1052
1.12k
    } else {
1053
      // Compute the ratio of current size to size limit.
1054
1.12k
      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
1055
1.12k
      score =
1056
1.12k
          static_cast<double>(level_bytes) / MaxBytesForLevel(options_, level);
1057
1.12k
    }
1058
1059
1.35k
    if (score > best_score) {
1060
249
      best_level = level;
1061
249
      best_score = score;
1062
249
    }
1063
1.35k
  }
1064
1065
225
  v->compaction_level_ = best_level;
1066
225
  v->compaction_score_ = best_score;
1067
225
}
1068
1069
100
Status VersionSet::WriteSnapshot(log::Writer* log) {
1070
  // TODO: Break up into multiple records to reduce memory usage on recovery?
1071
1072
  // Save metadata
1073
100
  VersionEdit edit;
1074
100
  edit.SetComparatorName(icmp_.user_comparator()->Name());
1075
1076
  // Save compaction pointers
1077
800
  for (int level = 0; level < config::kNumLevels; level++) {
1078
700
    if (!compact_pointer_[level].empty()) {
1079
15
      InternalKey key;
1080
15
      key.DecodeFrom(compact_pointer_[level]);
1081
15
      edit.SetCompactPointer(level, key);
1082
15
    }
1083
700
  }
1084
1085
  // Save files
1086
800
  for (int level = 0; level < config::kNumLevels; level++) {
1087
700
    const std::vector<FileMetaData*>& files = current_->files_[level];
1088
775
    for (size_t i = 0; i < files.size(); i++) {
1089
75
      const FileMetaData* f = files[i];
1090
75
      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
1091
75
    }
1092
700
  }
1093
1094
100
  std::string record;
1095
100
  edit.EncodeTo(&record);
1096
100
  return log->AddRecord(record);
1097
100
}
1098
1099
452
int VersionSet::NumLevelFiles(int level) const {
1100
452
  assert(level >= 0);
1101
452
  assert(level < config::kNumLevels);
1102
452
  return current_->files_[level].size();
1103
452
}
1104
1105
7
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
1106
  // Update code if kNumLevels changes
1107
7
  static_assert(config::kNumLevels == 7, "");
1108
7
  std::snprintf(
1109
7
      scratch->buffer, sizeof(scratch->buffer), "files[ %d %d %d %d %d %d %d ]",
1110
7
      int(current_->files_[0].size()), int(current_->files_[1].size()),
1111
7
      int(current_->files_[2].size()), int(current_->files_[3].size()),
1112
7
      int(current_->files_[4].size()), int(current_->files_[5].size()),
1113
7
      int(current_->files_[6].size()));
1114
7
  return scratch->buffer;
1115
7
}
1116
1117
0
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
1118
0
  uint64_t result = 0;
1119
0
  for (int level = 0; level < config::kNumLevels; level++) {
1120
0
    const std::vector<FileMetaData*>& files = v->files_[level];
1121
0
    for (size_t i = 0; i < files.size(); i++) {
1122
0
      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
1123
        // Entire file is before "ikey", so just add the file size
1124
0
        result += files[i]->file_size;
1125
0
      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
1126
        // Entire file is after "ikey", so ignore
1127
0
        if (level > 0) {
1128
          // Files other than level 0 are sorted by meta->smallest, so
1129
          // no further files in this level will contain data for
1130
          // "ikey".
1131
0
          break;
1132
0
        }
1133
0
      } else {
1134
        // "ikey" falls in the range for this table.  Add the
1135
        // approximate offset of "ikey" within the table.
1136
0
        Table* tableptr;
1137
0
        Iterator* iter = table_cache_->NewIterator(
1138
0
            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
1139
0
        if (tableptr != nullptr) {
1140
0
          result += tableptr->ApproximateOffsetOf(ikey.Encode());
1141
0
        }
1142
0
        delete iter;
1143
0
      }
1144
0
    }
1145
0
  }
1146
0
  return result;
1147
0
}
1148
1149
224
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
1150
448
  for (Version* v = dummy_versions_.next_; v != &dummy_versions_;
1151
224
       v = v->next_) {
1152
1.79k
    for (int level = 0; level < config::kNumLevels; level++) {
1153
1.56k
      const std::vector<FileMetaData*>& files = v->files_[level];
1154
1.79k
      for (size_t i = 0; i < files.size(); i++) {
1155
227
        live->insert(files[i]->number);
1156
227
      }
1157
1.56k
    }
1158
224
  }
1159
224
}
1160
1161
0
int64_t VersionSet::NumLevelBytes(int level) const {
1162
0
  assert(level >= 0);
1163
0
  assert(level < config::kNumLevels);
1164
0
  return TotalFileSize(current_->files_[level]);
1165
0
}
1166
1167
0
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
1168
0
  int64_t result = 0;
1169
0
  std::vector<FileMetaData*> overlaps;
1170
0
  for (int level = 1; level < config::kNumLevels - 1; level++) {
1171
0
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
1172
0
      const FileMetaData* f = current_->files_[level][i];
1173
0
      current_->GetOverlappingInputs(level + 1, &f->smallest, &f->largest,
1174
0
                                     &overlaps);
1175
0
      const int64_t sum = TotalFileSize(overlaps);
1176
0
      if (sum > result) {
1177
0
        result = sum;
1178
0
      }
1179
0
    }
1180
0
  }
1181
0
  return result;
1182
0
}
1183
1184
// Stores the minimal range that covers all entries in inputs in
1185
// *smallest, *largest.
1186
// REQUIRES: inputs is not empty
1187
void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
1188
18
                          InternalKey* smallest, InternalKey* largest) {
1189
18
  assert(!inputs.empty());
1190
18
  smallest->Clear();
1191
18
  largest->Clear();
1192
62
  for (size_t i = 0; i < inputs.size(); i++) {
1193
44
    FileMetaData* f = inputs[i];
1194
44
    if (i == 0) {
1195
18
      *smallest = f->smallest;
1196
18
      *largest = f->largest;
1197
26
    } else {
1198
26
      if (icmp_.Compare(f->smallest, *smallest) < 0) {
1199
0
        *smallest = f->smallest;
1200
0
      }
1201
26
      if (icmp_.Compare(f->largest, *largest) > 0) {
1202
10
        *largest = f->largest;
1203
10
      }
1204
26
    }
1205
44
  }
1206
18
}
1207
1208
// Stores the minimal range that covers all entries in inputs1 and inputs2
1209
// in *smallest, *largest.
1210
// REQUIRES: inputs is not empty
1211
void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
1212
                           const std::vector<FileMetaData*>& inputs2,
1213
7
                           InternalKey* smallest, InternalKey* largest) {
1214
7
  std::vector<FileMetaData*> all = inputs1;
1215
7
  all.insert(all.end(), inputs2.begin(), inputs2.end());
1216
7
  GetRange(all, smallest, largest);
1217
7
}
1218
1219
6
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
1220
6
  ReadOptions options;
1221
6
  options.verify_checksums = options_->paranoid_checks;
1222
6
  options.fill_cache = false;
1223
1224
  // Level-0 files have to be merged together.  For other levels,
1225
  // we will make a concatenating iterator per level.
1226
  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
1227
6
  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
1228
6
  Iterator** list = new Iterator*[space];
1229
6
  int num = 0;
1230
18
  for (int which = 0; which < 2; which++) {
1231
12
    if (!c->inputs_[which].empty()) {
1232
8
      if (c->level() + which == 0) {
1233
6
        const std::vector<FileMetaData*>& files = c->inputs_[which];
1234
24
        for (size_t i = 0; i < files.size(); i++) {
1235
18
          list[num++] = table_cache_->NewIterator(options, files[i]->number,
1236
18
                                                  files[i]->file_size);
1237
18
        }
1238
6
      } else {
1239
        // Create concatenating iterator for the files from this level
1240
2
        list[num++] = NewTwoLevelIterator(
1241
2
            new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
1242
2
            &GetFileIterator, table_cache_, options);
1243
2
      }
1244
8
    }
1245
12
  }
1246
6
  assert(num <= space);
1247
6
  Iterator* result = NewMergingIterator(&icmp_, list, num);
1248
6
  delete[] list;
1249
6
  return result;
1250
6
}
1251
1252
4
Compaction* VersionSet::PickCompaction() {
1253
4
  Compaction* c;
1254
4
  int level;
1255
1256
  // We prefer compactions triggered by too much data in a level over
1257
  // the compactions triggered by seeks.
1258
4
  const bool size_compaction = (current_->compaction_score_ >= 1);
1259
4
  const bool seek_compaction = (current_->file_to_compact_ != nullptr);
1260
4
  if (size_compaction) {
1261
4
    level = current_->compaction_level_;
1262
4
    assert(level >= 0);
1263
4
    assert(level + 1 < config::kNumLevels);
1264
4
    c = new Compaction(options_, level);
1265
1266
    // Pick the first file that comes after compact_pointer_[level]
1267
12
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
1268
10
      FileMetaData* f = current_->files_[level][i];
1269
10
      if (compact_pointer_[level].empty() ||
1270
9
          icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
1271
2
        c->inputs_[0].push_back(f);
1272
2
        break;
1273
2
      }
1274
10
    }
1275
4
    if (c->inputs_[0].empty()) {
1276
      // Wrap-around to the beginning of the key space
1277
2
      c->inputs_[0].push_back(current_->files_[level][0]);
1278
2
    }
1279
4
  } else if (seek_compaction) {
1280
0
    level = current_->file_to_compact_level_;
1281
0
    c = new Compaction(options_, level);
1282
0
    c->inputs_[0].push_back(current_->file_to_compact_);
1283
0
  } else {
1284
0
    return nullptr;
1285
0
  }
1286
1287
4
  c->input_version_ = current_;
1288
4
  c->input_version_->Ref();
1289
1290
  // Files in level 0 may overlap each other, so pick up all overlapping ones
1291
4
  if (level == 0) {
1292
4
    InternalKey smallest, largest;
1293
4
    GetRange(c->inputs_[0], &smallest, &largest);
1294
    // Note that the next call will discard the file we placed in
1295
    // c->inputs_[0] earlier and replace it with an overlapping set
1296
    // which will include the picked file.
1297
4
    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
1298
4
    assert(!c->inputs_[0].empty());
1299
4
  }
1300
1301
4
  SetupOtherInputs(c);
1302
1303
4
  return c;
1304
4
}
1305
1306
// Finds the largest key in a vector of files. Returns true if files is not
1307
// empty.
1308
bool FindLargestKey(const InternalKeyComparator& icmp,
1309
                    const std::vector<FileMetaData*>& files,
1310
16
                    InternalKey* largest_key) {
1311
16
  if (files.empty()) {
1312
5
    return false;
1313
5
  }
1314
11
  *largest_key = files[0]->largest;
1315
29
  for (size_t i = 1; i < files.size(); ++i) {
1316
18
    FileMetaData* f = files[i];
1317
18
    if (icmp.Compare(f->largest, *largest_key) > 0) {
1318
8
      *largest_key = f->largest;
1319
8
    }
1320
18
  }
1321
11
  return true;
1322
16
}
1323
1324
// Finds minimum file b2=(l2, u2) in level file for which l2 > u1 and
1325
// user_key(l2) = user_key(u1)
1326
FileMetaData* FindSmallestBoundaryFile(
1327
    const InternalKeyComparator& icmp,
1328
    const std::vector<FileMetaData*>& level_files,
1329
11
    const InternalKey& largest_key) {
1330
11
  const Comparator* user_cmp = icmp.user_comparator();
1331
11
  FileMetaData* smallest_boundary_file = nullptr;
1332
49
  for (size_t i = 0; i < level_files.size(); ++i) {
1333
38
    FileMetaData* f = level_files[i];
1334
38
    if (icmp.Compare(f->smallest, largest_key) > 0 &&
1335
8
        user_cmp->Compare(f->smallest.user_key(), largest_key.user_key()) ==
1336
8
            0) {
1337
0
      if (smallest_boundary_file == nullptr ||
1338
0
          icmp.Compare(f->smallest, smallest_boundary_file->smallest) < 0) {
1339
0
        smallest_boundary_file = f;
1340
0
      }
1341
0
    }
1342
38
  }
1343
11
  return smallest_boundary_file;
1344
11
}
1345
1346
// Extracts the largest file b1 from |compaction_files| and then searches for a
1347
// b2 in |level_files| for which user_key(u1) = user_key(l2). If it finds such a
1348
// file b2 (known as a boundary file) it adds it to |compaction_files| and then
1349
// searches again using this new upper bound.
1350
//
1351
// If there are two blocks, b1=(l1, u1) and b2=(l2, u2) and
1352
// user_key(u1) = user_key(l2), and if we compact b1 but not b2 then a
1353
// subsequent get operation will yield an incorrect result because it will
1354
// return the record from b2 in level i rather than from b1 because it searches
1355
// level by level for records matching the supplied user key.
1356
//
1357
// parameters:
1358
//   in     level_files:      List of files to search for boundary files.
1359
//   in/out compaction_files: List of files to extend by adding boundary files.
1360
void AddBoundaryInputs(const InternalKeyComparator& icmp,
1361
                       const std::vector<FileMetaData*>& level_files,
1362
16
                       std::vector<FileMetaData*>* compaction_files) {
1363
16
  InternalKey largest_key;
1364
1365
  // Quick return if compaction_files is empty.
1366
16
  if (!FindLargestKey(icmp, *compaction_files, &largest_key)) {
1367
5
    return;
1368
5
  }
1369
1370
11
  bool continue_searching = true;
1371
22
  while (continue_searching) {
1372
11
    FileMetaData* smallest_boundary_file =
1373
11
        FindSmallestBoundaryFile(icmp, level_files, largest_key);
1374
1375
    // If a boundary file was found advance largest_key, otherwise we're done.
1376
11
    if (smallest_boundary_file != NULL) {
1377
0
      compaction_files->push_back(smallest_boundary_file);
1378
0
      largest_key = smallest_boundary_file->largest;
1379
11
    } else {
1380
11
      continue_searching = false;
1381
11
    }
1382
11
  }
1383
11
}
1384
1385
7
void VersionSet::SetupOtherInputs(Compaction* c) {
1386
7
  const int level = c->level();
1387
7
  InternalKey smallest, largest;
1388
1389
7
  AddBoundaryInputs(icmp_, current_->files_[level], &c->inputs_[0]);
1390
7
  GetRange(c->inputs_[0], &smallest, &largest);
1391
1392
7
  current_->GetOverlappingInputs(level + 1, &smallest, &largest,
1393
7
                                 &c->inputs_[1]);
1394
7
  AddBoundaryInputs(icmp_, current_->files_[level + 1], &c->inputs_[1]);
1395
1396
  // Get entire range covered by compaction
1397
7
  InternalKey all_start, all_limit;
1398
7
  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
1399
1400
  // See if we can grow the number of inputs in "level" without
1401
  // changing the number of "level+1" files we pick up.
1402
7
  if (!c->inputs_[1].empty()) {
1403
2
    std::vector<FileMetaData*> expanded0;
1404
2
    current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
1405
2
    AddBoundaryInputs(icmp_, current_->files_[level], &expanded0);
1406
2
    const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
1407
2
    const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
1408
2
    const int64_t expanded0_size = TotalFileSize(expanded0);
1409
2
    if (expanded0.size() > c->inputs_[0].size() &&
1410
0
        inputs1_size + expanded0_size <
1411
0
            ExpandedCompactionByteSizeLimit(options_)) {
1412
0
      InternalKey new_start, new_limit;
1413
0
      GetRange(expanded0, &new_start, &new_limit);
1414
0
      std::vector<FileMetaData*> expanded1;
1415
0
      current_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
1416
0
                                     &expanded1);
1417
0
      AddBoundaryInputs(icmp_, current_->files_[level + 1], &expanded1);
1418
0
      if (expanded1.size() == c->inputs_[1].size()) {
1419
0
        Log(options_->info_log,
1420
0
            "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
1421
0
            level, int(c->inputs_[0].size()), int(c->inputs_[1].size()),
1422
0
            long(inputs0_size), long(inputs1_size), int(expanded0.size()),
1423
0
            int(expanded1.size()), long(expanded0_size), long(inputs1_size));
1424
0
        smallest = new_start;
1425
0
        largest = new_limit;
1426
0
        c->inputs_[0] = expanded0;
1427
0
        c->inputs_[1] = expanded1;
1428
0
        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
1429
0
      }
1430
0
    }
1431
2
  }
1432
1433
  // Compute the set of grandparent files that overlap this compaction
1434
  // (parent == level+1; grandparent == level+2)
1435
7
  if (level + 2 < config::kNumLevels) {
1436
7
    current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
1437
7
                                   &c->grandparents_);
1438
7
  }
1439
1440
  // Update the place where we will do the next compaction for this level.
1441
  // We update this immediately instead of waiting for the VersionEdit
1442
  // to be applied so that if the compaction fails, we will try a different
1443
  // key range next time.
1444
7
  compact_pointer_[level] = largest.Encode().ToString();
1445
7
  c->edit_.SetCompactPointer(level, largest);
1446
7
}
1447
1448
Compaction* VersionSet::CompactRange(int level, const InternalKey* begin,
1449
21
                                     const InternalKey* end) {
1450
21
  std::vector<FileMetaData*> inputs;
1451
21
  current_->GetOverlappingInputs(level, begin, end, &inputs);
1452
21
  if (inputs.empty()) {
1453
18
    return nullptr;
1454
18
  }
1455
1456
  // Avoid compacting too much in one shot in case the range is large.
1457
  // But we cannot do this for level-0 since level-0 files can overlap
1458
  // and we must not pick one file and drop another older file if the
1459
  // two files overlap.
1460
3
  if (level > 0) {
1461
0
    const uint64_t limit = MaxFileSizeForLevel(options_, level);
1462
0
    uint64_t total = 0;
1463
0
    for (size_t i = 0; i < inputs.size(); i++) {
1464
0
      uint64_t s = inputs[i]->file_size;
1465
0
      total += s;
1466
0
      if (total >= limit) {
1467
0
        inputs.resize(i + 1);
1468
0
        break;
1469
0
      }
1470
0
    }
1471
0
  }
1472
1473
3
  Compaction* c = new Compaction(options_, level);
1474
3
  c->input_version_ = current_;
1475
3
  c->input_version_->Ref();
1476
3
  c->inputs_[0] = inputs;
1477
3
  SetupOtherInputs(c);
1478
3
  return c;
1479
21
}
1480
1481
Compaction::Compaction(const Options* options, int level)
1482
7
    : level_(level),
1483
7
      max_output_file_size_(MaxFileSizeForLevel(options, level)),
1484
7
      input_version_(nullptr),
1485
7
      grandparent_index_(0),
1486
7
      seen_key_(false),
1487
7
      overlapped_bytes_(0) {
1488
56
  for (int i = 0; i < config::kNumLevels; i++) {
1489
49
    level_ptrs_[i] = 0;
1490
49
  }
1491
7
}
1492
1493
7
Compaction::~Compaction() {
1494
7
  if (input_version_ != nullptr) {
1495
1
    input_version_->Unref();
1496
1
  }
1497
7
}
1498
1499
4
bool Compaction::IsTrivialMove() const {
1500
4
  const VersionSet* vset = input_version_->vset_;
1501
  // Avoid a move if there is lots of overlapping grandparent data.
1502
  // Otherwise, the move could create a parent file that will require
1503
  // a very expensive merge later on.
1504
4
  return (num_input_files(0) == 1 && num_input_files(1) == 0 &&
1505
1
          TotalFileSize(grandparents_) <=
1506
1
              MaxGrandParentOverlapBytes(vset->options_));
1507
4
}
1508
1509
6
void Compaction::AddInputDeletions(VersionEdit* edit) {
1510
18
  for (int which = 0; which < 2; which++) {
1511
32
    for (size_t i = 0; i < inputs_[which].size(); i++) {
1512
20
      edit->RemoveFile(level_ + which, inputs_[which][i]->number);
1513
20
    }
1514
12
  }
1515
6
}
1516
1517
6
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
1518
  // Maybe use binary search to find right entry instead of linear search?
1519
6
  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
1520
36
  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
1521
30
    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
1522
30
    while (level_ptrs_[lvl] < files.size()) {
1523
0
      FileMetaData* f = files[level_ptrs_[lvl]];
1524
0
      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
1525
        // We've advanced far enough
1526
0
        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
1527
          // Key falls in this file's range, so definitely not base level
1528
0
          return false;
1529
0
        }
1530
0
        break;
1531
0
      }
1532
0
      level_ptrs_[lvl]++;
1533
0
    }
1534
30
  }
1535
6
  return true;
1536
6
}
1537
1538
320
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
1539
320
  const VersionSet* vset = input_version_->vset_;
1540
  // Scan to find earliest grandparent file that contains key.
1541
320
  const InternalKeyComparator* icmp = &vset->icmp_;
1542
320
  while (grandparent_index_ < grandparents_.size() &&
1543
0
         icmp->Compare(internal_key,
1544
0
                       grandparents_[grandparent_index_]->largest.Encode()) >
1545
0
             0) {
1546
0
    if (seen_key_) {
1547
0
      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
1548
0
    }
1549
0
    grandparent_index_++;
1550
0
  }
1551
320
  seen_key_ = true;
1552
1553
320
  if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) {
1554
    // Too much overlap for current output; start new output
1555
0
    overlapped_bytes_ = 0;
1556
0
    return true;
1557
320
  } else {
1558
320
    return false;
1559
320
  }
1560
320
}
1561
1562
6
void Compaction::ReleaseInputs() {
1563
6
  if (input_version_ != nullptr) {
1564
6
    input_version_->Unref();
1565
6
    input_version_ = nullptr;
1566
6
  }
1567
6
}
1568
1569
}  // namespace leveldb