Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "math"
11 : "sort"
12 : "strings"
13 :
14 : "github.com/cockroachdb/errors"
15 : "github.com/cockroachdb/pebble/internal/base"
16 : "github.com/cockroachdb/pebble/internal/humanize"
17 : "github.com/cockroachdb/pebble/internal/invariants"
18 : "github.com/cockroachdb/pebble/internal/manifest"
19 : )
20 :
21 : // The minimum count for an intra-L0 compaction. This matches the RocksDB
22 : // heuristic.
23 : const minIntraL0Count = 4
24 :
25 : type compactionEnv struct {
26 : // diskAvailBytes holds a statistic on the number of bytes available on
27 : // disk, as reported by the filesystem. It's consulted in order to be more
28 : // restrictive in expanding compactions when available disk space is limited.
29 : //
30 : // The cached value (d.diskAvailBytes) is updated whenever a file is deleted
31 : // and whenever a compaction or flush completes. Since file removal is the
32 : // primary means of reclaiming space, there is a rough bound on the
33 : // statistic's staleness when available bytes is growing. Compactions and
34 : // flushes are longer, slower operations and provide a much looser bound
35 : // when available bytes is decreasing.
36 : diskAvailBytes uint64
37 : earliestUnflushedSeqNum base.SeqNum
38 : earliestSnapshotSeqNum base.SeqNum
39 : inProgressCompactions []compactionInfo
40 : readCompactionEnv readCompactionEnv
41 : }
42 :
43 : type compactionPicker interface {
44 : getScores([]compactionInfo) [numLevels]float64
45 : getBaseLevel() int
46 : estimatedCompactionDebt(l0ExtraSize uint64) uint64
47 : pickAuto(env compactionEnv) (pc *pickedCompaction)
48 : pickElisionOnlyCompaction(env compactionEnv) (pc *pickedCompaction)
49 : pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction)
50 : pickReadTriggeredCompaction(env compactionEnv) (pc *pickedCompaction)
51 : forceBaseLevel1()
52 : }
53 :
54 : // readCompactionEnv is used to hold data required to perform read compactions.
55 : type readCompactionEnv struct {
56 : rescheduleReadCompaction *bool
57 : readCompactions *readCompactionQueue
58 : flushing bool
59 : }
60 :
61 : // Information about in-progress compactions provided to the compaction picker.
62 : // These are used to constrain the new compactions that will be picked.
63 : type compactionInfo struct {
64 : // versionEditApplied is true if this compaction's version edit has already
65 : // been committed. The compaction may still be in-progress deleting newly
66 : // obsolete files.
67 : versionEditApplied bool
68 : inputs []compactionLevel
69 : outputLevel int
70 : smallest InternalKey
71 : largest InternalKey
72 : }
73 :
74 0 : func (info compactionInfo) String() string {
75 0 : var buf bytes.Buffer
76 0 : var largest int
77 0 : for i, in := range info.inputs {
78 0 : if i > 0 {
79 0 : fmt.Fprintf(&buf, " -> ")
80 0 : }
81 0 : fmt.Fprintf(&buf, "L%d", in.level)
82 0 : in.files.Each(func(m *fileMetadata) {
83 0 : fmt.Fprintf(&buf, " %s", m.FileNum)
84 0 : })
85 0 : if largest < in.level {
86 0 : largest = in.level
87 0 : }
88 : }
89 0 : if largest != info.outputLevel || len(info.inputs) == 1 {
90 0 : fmt.Fprintf(&buf, " -> L%d", info.outputLevel)
91 0 : }
92 0 : return buf.String()
93 : }
94 :
95 : type sortCompactionLevelsByPriority []candidateLevelInfo
96 :
97 1 : func (s sortCompactionLevelsByPriority) Len() int {
98 1 : return len(s)
99 1 : }
100 :
101 : // A level should be picked for compaction if the compensatedScoreRatio is >= the
102 : // compactionScoreThreshold.
103 : const compactionScoreThreshold = 1
104 :
105 : // Less should return true if s[i] must be placed earlier than s[j] in the final
106 : // sorted list. The candidateLevelInfo for the level placed earlier is more likely
107 : // to be picked for a compaction.
108 1 : func (s sortCompactionLevelsByPriority) Less(i, j int) bool {
109 1 : iShouldCompact := s[i].compensatedScoreRatio >= compactionScoreThreshold
110 1 : jShouldCompact := s[j].compensatedScoreRatio >= compactionScoreThreshold
111 1 : // Ordering is defined as decreasing on (shouldCompact, uncompensatedScoreRatio)
112 1 : // where shouldCompact is 1 for true and 0 for false.
113 1 : if iShouldCompact && !jShouldCompact {
114 1 : return true
115 1 : }
116 1 : if !iShouldCompact && jShouldCompact {
117 1 : return false
118 1 : }
119 :
120 1 : if s[i].uncompensatedScoreRatio != s[j].uncompensatedScoreRatio {
121 1 : return s[i].uncompensatedScoreRatio > s[j].uncompensatedScoreRatio
122 1 : }
123 1 : return s[i].level < s[j].level
124 : }
125 :
126 1 : func (s sortCompactionLevelsByPriority) Swap(i, j int) {
127 1 : s[i], s[j] = s[j], s[i]
128 1 : }
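// A minimal sketch of the resulting order, using hypothetical score values
// (only the fields relevant to Less are shown):
//
//	s := sortCompactionLevelsByPriority{
//		{level: 4, compensatedScoreRatio: 0.8, uncompensatedScoreRatio: 0.8},
//		{level: 3, compensatedScoreRatio: 1.5, uncompensatedScoreRatio: 1.2},
//		{level: 2, compensatedScoreRatio: 2.0, uncompensatedScoreRatio: 1.2},
//	}
//	sort.Sort(s)
//	// Order: L2, L3, L4. L2 and L3 both meet the threshold and tie on
//	// uncompensatedScoreRatio, so the lower level sorts first; L4 is
//	// below the threshold and sorts last.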
129 :
130 : // sublevelInfo is used to tag a LevelSlice for an L0 sublevel with the
131 : // sublevel.
132 : type sublevelInfo struct {
133 : manifest.LevelSlice
134 : sublevel manifest.Layer
135 : }
136 :
137 1 : func (cl sublevelInfo) Clone() sublevelInfo {
138 1 : return sublevelInfo{
139 1 : sublevel: cl.sublevel,
140 1 : LevelSlice: cl.LevelSlice,
141 1 : }
142 1 : }
143 0 : func (cl sublevelInfo) String() string {
144 0 : return fmt.Sprintf(`Sublevel %s; Levels %s`, cl.sublevel, cl.LevelSlice)
145 0 : }
146 :
147 : // generateSublevelInfo will generate the level slices for each of the sublevels
148 : // from the level slice for all of L0.
149 1 : func generateSublevelInfo(cmp base.Compare, levelFiles manifest.LevelSlice) []sublevelInfo {
150 1 : sublevelMap := make(map[uint64][]*fileMetadata)
151 1 : it := levelFiles.Iter()
152 1 : for f := it.First(); f != nil; f = it.Next() {
153 1 : sublevelMap[uint64(f.SubLevel)] = append(sublevelMap[uint64(f.SubLevel)], f)
154 1 : }
155 :
156 1 : var sublevels []int
157 1 : for level := range sublevelMap {
158 1 : sublevels = append(sublevels, int(level))
159 1 : }
160 1 : sort.Ints(sublevels)
161 1 :
162 1 : var levelSlices []sublevelInfo
163 1 : for _, sublevel := range sublevels {
164 1 : metas := sublevelMap[uint64(sublevel)]
165 1 : levelSlices = append(
166 1 : levelSlices,
167 1 : sublevelInfo{
168 1 : manifest.NewLevelSliceKeySorted(cmp, metas),
169 1 : manifest.L0Sublevel(sublevel),
170 1 : },
171 1 : )
172 1 : }
173 1 : return levelSlices
174 : }
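// A sketch of the bucketing above with hypothetical L0 contents: four files
// whose SubLevel fields are {2, 0, 0, 1} produce three sublevelInfo entries,
// in ascending sublevel order:
//
//	infos := generateSublevelInfo(cmp, l0Files)
//	// infos[0]: sublevel 0, two files
//	// infos[1]: sublevel 1, one file
//	// infos[2]: sublevel 2, one file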
175 :
176 : // compactionPickerMetrics holds metrics related to the compaction picking process.
177 : type compactionPickerMetrics struct {
178 : // scores contains the compensatedScoreRatio from the candidateLevelInfo.
179 : scores []float64
180 : singleLevelOverlappingRatio float64
181 : multiLevelOverlappingRatio float64
182 : }
183 :
184 : // pickedCompaction contains information about a compaction that has already
185 : // been chosen, and is being constructed. Compaction construction info lives in
186 : // this struct, and is copied over into the compaction struct when that's
187 : // created.
188 : type pickedCompaction struct {
189 : cmp Compare
190 : // score of the chosen compaction. This is the same as the
191 : // compensatedScoreRatio in the candidateLevelInfo.
192 : score float64
193 : // kind indicates the kind of compaction.
194 : kind compactionKind
195 : // startLevel is the level that is being compacted. Inputs from startLevel
196 : // and outputLevel will be merged to produce a set of outputLevel files.
197 : startLevel *compactionLevel
198 : // outputLevel is the level that files are being produced in. outputLevel is
199 : // equal to startLevel+1 except when:
200 : // - startLevel is 0, in which case it equals compactionPicker.baseLevel().
201 : // - the compaction is multi-level, in which case it is the lowest level
202 : // involved in the compaction.
203 : outputLevel *compactionLevel
204 : // extraLevels contains any additional levels between the start and output
205 : // levels that are compacted in multi-level compactions.
206 : extraLevels []*compactionLevel
207 : inputs []compactionLevel
208 : // LBase at the time of compaction picking.
209 : baseLevel int
210 : // L0-specific compaction info. Set to a non-nil value for all compactions
211 : // where startLevel == 0 that were generated by L0Sublevels.
212 : lcf *manifest.L0CompactionFiles
213 : // maxOutputFileSize is the maximum size of an individual table created
214 : // during compaction.
215 : maxOutputFileSize uint64
216 : // maxOverlapBytes is the maximum number of bytes of overlap allowed for a
217 : // single output table with the tables in the grandparent level.
218 : maxOverlapBytes uint64
219 : // maxReadCompactionBytes is the maximum number of bytes a read compaction is
220 : // allowed to overlap with in its output level. If the overlap is greater than
221 : // maxReadCompactionBytes, then we don't proceed with the compaction.
222 : maxReadCompactionBytes uint64
223 :
224 : // The boundaries of the input data.
225 : smallest InternalKey
226 : largest InternalKey
227 : version *version
228 : pickerMetrics compactionPickerMetrics
229 : }
230 :
231 1 : func (pc *pickedCompaction) userKeyBounds() base.UserKeyBounds {
232 1 : return base.UserKeyBoundsFromInternal(pc.smallest, pc.largest)
233 1 : }
234 :
235 1 : func defaultOutputLevel(startLevel, baseLevel int) int {
236 1 : outputLevel := startLevel + 1
237 1 : if startLevel == 0 {
238 1 : outputLevel = baseLevel
239 1 : }
240 1 : if outputLevel >= numLevels-1 {
241 1 : outputLevel = numLevels - 1
242 1 : }
243 1 : return outputLevel
244 : }
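// Worked examples, assuming numLevels = 7 and baseLevel = 4:
//
//	defaultOutputLevel(0, 4) // == 4: L0 compacts into Lbase
//	defaultOutputLevel(4, 4) // == 5: Lbase compacts one level down
//	defaultOutputLevel(6, 4) // == 6: clamped to the last level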
245 :
246 : func newPickedCompaction(
247 : opts *Options, cur *version, startLevel, outputLevel, baseLevel int,
248 1 : ) *pickedCompaction {
249 1 : if startLevel > 0 && startLevel < baseLevel {
250 0 : panic(fmt.Sprintf("invalid compaction: start level %d should not be empty (base level %d)",
251 0 : startLevel, baseLevel))
252 : }
253 :
254 1 : adjustedLevel := adjustedOutputLevel(outputLevel, baseLevel)
255 1 : pc := &pickedCompaction{
256 1 : cmp: opts.Comparer.Compare,
257 1 : version: cur,
258 1 : baseLevel: baseLevel,
259 1 : inputs: []compactionLevel{{level: startLevel}, {level: outputLevel}},
260 1 : maxOutputFileSize: uint64(opts.Level(adjustedLevel).TargetFileSize),
261 1 : maxOverlapBytes: maxGrandparentOverlapBytes(opts, adjustedLevel),
262 1 : maxReadCompactionBytes: maxReadCompactionBytes(opts, adjustedLevel),
263 1 : }
264 1 : pc.startLevel = &pc.inputs[0]
265 1 : pc.outputLevel = &pc.inputs[1]
266 1 : return pc
267 : }
268 :
269 : // adjustedOutputLevel is the output level used for the purpose of
270 : // determining the target output file size, overlap bytes, and expanded
271 : // bytes, taking into account the base level.
272 1 : func adjustedOutputLevel(outputLevel int, baseLevel int) int {
273 1 : adjustedOutputLevel := outputLevel
274 1 : if adjustedOutputLevel > 0 {
275 1 : // Output level is in the range [baseLevel, numLevels]. For the purpose of
276 1 : // determining the target output file size, overlap bytes, and expanded
277 1 : // bytes, we want to adjust the range to [1,numLevels].
278 1 : adjustedOutputLevel = 1 + outputLevel - baseLevel
279 1 : }
280 1 : return adjustedOutputLevel
281 : }
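// Worked examples, assuming numLevels = 7 and baseLevel = 4. The adjusted
// level indexes per-level options as if the LSM had no empty levels between
// L0 and Lbase:
//
//	adjustedOutputLevel(4, 4) // == 1: Lbase uses the options for level 1
//	adjustedOutputLevel(6, 4) // == 3: the last level uses the options for level 3
//	adjustedOutputLevel(0, 4) // == 0: intra-L0 output is unchanged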
282 :
283 : func newPickedCompactionFromL0(
284 : lcf *manifest.L0CompactionFiles, opts *Options, vers *version, baseLevel int, isBase bool,
285 1 : ) *pickedCompaction {
286 1 : outputLevel := baseLevel
287 1 : if !isBase {
288 1 : outputLevel = 0 // Intra L0
289 1 : }
290 :
291 1 : pc := newPickedCompaction(opts, vers, 0, outputLevel, baseLevel)
292 1 : pc.lcf = lcf
293 1 : pc.outputLevel.level = outputLevel
294 1 :
295 1 : // Manually build the compaction as opposed to calling
296 1 : // pickAutoHelper. This is because L0Sublevels has already added
297 1 : // any overlapping L0 SSTables that need to be added, and
298 1 : // because compactions built by L0Sublevels do not necessarily
299 1 : // pick contiguous sequences of files in pc.version.Levels[0].
300 1 : files := make([]*manifest.FileMetadata, 0, len(lcf.Files))
301 1 : iter := vers.Levels[0].Iter()
302 1 : for f := iter.First(); f != nil; f = iter.Next() {
303 1 : if lcf.FilesIncluded[f.L0Index] {
304 1 : files = append(files, f)
305 1 : }
306 : }
307 1 : pc.startLevel.files = manifest.NewLevelSliceSeqSorted(files)
308 1 : return pc
309 : }
310 :
311 0 : func (pc *pickedCompaction) String() string {
312 0 : var builder strings.Builder
313 0 : builder.WriteString(fmt.Sprintf(`Score=%f, `, pc.score))
314 0 : builder.WriteString(fmt.Sprintf(`Kind=%s, `, pc.kind))
315 0 : builder.WriteString(fmt.Sprintf(`AdjustedOutputLevel=%d, `, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel)))
316 0 : builder.WriteString(fmt.Sprintf(`maxOutputFileSize=%d, `, pc.maxOutputFileSize))
317 0 : builder.WriteString(fmt.Sprintf(`maxReadCompactionBytes=%d, `, pc.maxReadCompactionBytes))
318 0 : builder.WriteString(fmt.Sprintf(`smallest=%s, `, pc.smallest))
319 0 : builder.WriteString(fmt.Sprintf(`largest=%s, `, pc.largest))
320 0 : builder.WriteString(fmt.Sprintf(`version=%s, `, pc.version))
321 0 : builder.WriteString(fmt.Sprintf(`inputs=%s, `, pc.inputs))
322 0 : builder.WriteString(fmt.Sprintf(`startLevel=%s, `, pc.startLevel))
323 0 : builder.WriteString(fmt.Sprintf(`outputLevel=%s, `, pc.outputLevel))
324 0 : builder.WriteString(fmt.Sprintf(`extraLevels=%s, `, pc.extraLevels))
325 0 : builder.WriteString(fmt.Sprintf(`l0SublevelInfo=%s, `, pc.startLevel.l0SublevelInfo))
326 0 : builder.WriteString(fmt.Sprintf(`lcf=%s`, pc.lcf))
327 0 : return builder.String()
328 0 : }
329 :
330 : // clone creates a deep copy of the pickedCompaction.
331 1 : func (pc *pickedCompaction) clone() *pickedCompaction {
332 1 :
333 1 : // Quickly copy over fields that do not require special deep copy care, and
334 1 : // set all fields that will require a deep copy to nil.
335 1 : newPC := &pickedCompaction{
336 1 : cmp: pc.cmp,
337 1 : score: pc.score,
338 1 : kind: pc.kind,
339 1 : baseLevel: pc.baseLevel,
340 1 : maxOutputFileSize: pc.maxOutputFileSize,
341 1 : maxOverlapBytes: pc.maxOverlapBytes,
342 1 : maxReadCompactionBytes: pc.maxReadCompactionBytes,
343 1 : smallest: pc.smallest.Clone(),
344 1 : largest: pc.largest.Clone(),
345 1 :
346 1 : // TODO(msbutler): properly clone picker metrics
347 1 : pickerMetrics: pc.pickerMetrics,
348 1 :
349 1 : // Both copies see the same manifest, therefore it's ok for them to
350 1 : // share the same pc.version.
351 1 : version: pc.version,
352 1 : }
353 1 :
354 1 : newPC.inputs = make([]compactionLevel, len(pc.inputs))
355 1 : newPC.extraLevels = make([]*compactionLevel, 0, len(pc.extraLevels))
356 1 : for i := range pc.inputs {
357 1 : newPC.inputs[i] = pc.inputs[i].Clone()
358 1 : if i == 0 {
359 1 : newPC.startLevel = &newPC.inputs[i]
360 1 : } else if i == len(pc.inputs)-1 {
361 1 : newPC.outputLevel = &newPC.inputs[i]
362 1 : } else {
363 0 : newPC.extraLevels = append(newPC.extraLevels, &newPC.inputs[i])
364 0 : }
365 : }
366 :
367 1 : if len(pc.startLevel.l0SublevelInfo) > 0 {
368 1 : newPC.startLevel.l0SublevelInfo = make([]sublevelInfo, len(pc.startLevel.l0SublevelInfo))
369 1 : for i := range pc.startLevel.l0SublevelInfo {
370 1 : newPC.startLevel.l0SublevelInfo[i] = pc.startLevel.l0SublevelInfo[i].Clone()
371 1 : }
372 : }
373 1 : if pc.lcf != nil {
374 1 : newPC.lcf = pc.lcf.Clone()
375 1 : }
376 1 : return newPC
377 : }
378 :
379 : // maybeExpandBounds is a helper function for setupInputs which ensures the
380 : // pickedCompaction's smallest and largest internal keys are updated iff
381 : // the candidate keys expand the key span. This avoids a bug for multi-level
382 : // compactions: during the second call to setupInputs, the picked compaction's
383 : // smallest and largest keys should not decrease the key span.
384 1 : func (pc *pickedCompaction) maybeExpandBounds(smallest InternalKey, largest InternalKey) {
385 1 : if len(smallest.UserKey) == 0 && len(largest.UserKey) == 0 {
386 1 : return
387 1 : }
388 1 : if len(pc.smallest.UserKey) == 0 && len(pc.largest.UserKey) == 0 {
389 1 : pc.smallest = smallest
390 1 : pc.largest = largest
391 1 : return
392 1 : }
393 1 : if base.InternalCompare(pc.cmp, pc.smallest, smallest) >= 0 {
394 1 : pc.smallest = smallest
395 1 : }
396 1 : if base.InternalCompare(pc.cmp, pc.largest, largest) <= 0 {
397 1 : pc.largest = largest
398 1 : }
399 : }
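// A sketch with hypothetical user keys: if the compaction currently spans
// [c, f], then:
//
//	// pc.smallest, pc.largest = c, f (hypothetical InternalKeys)
//	pc.maybeExpandBounds(a, d) // span widens to [a, f]
//	pc.maybeExpandBounds(d, e) // no change; [d, e] is already covered
//
// Bounds only ever expand, which is what makes the second setupInputs call
// in multi-level compactions safe.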
400 :
401 : // setupInputs returns true if a compaction has been set up. It returns false if
402 : // a concurrent compaction is occurring on the start or output level files.
403 : func (pc *pickedCompaction) setupInputs(
404 : opts *Options, diskAvailBytes uint64, startLevel *compactionLevel,
405 1 : ) bool {
406 1 : // maxExpandedBytes is the maximum size of an expanded compaction. If
407 1 : // growing a compaction results in a larger size, the original compaction
408 1 : // is used instead.
409 1 : maxExpandedBytes := expandedCompactionByteSizeLimit(
410 1 : opts, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel), diskAvailBytes,
411 1 : )
412 1 :
413 1 : if anyTablesCompacting(startLevel.files) {
414 1 : return false
415 1 : }
416 :
417 1 : pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, startLevel.files.Iter()))
418 1 :
419 1 : // Determine the sstables in the output level which overlap with the input
420 1 : // sstables. No need to do this for intra-L0 compactions; outputLevel.files is
421 1 : // left empty for those.
422 1 : if startLevel.level != pc.outputLevel.level {
423 1 : pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.userKeyBounds())
424 1 : if anyTablesCompacting(pc.outputLevel.files) {
425 1 : return false
426 1 : }
427 :
428 1 : pc.maybeExpandBounds(manifest.KeyRange(pc.cmp,
429 1 : startLevel.files.Iter(), pc.outputLevel.files.Iter()))
430 : }
431 :
432 : // Grow the sstables in startLevel.level as long as it doesn't affect the number
433 : // of sstables included from pc.outputLevel.level.
434 1 : if pc.lcf != nil && startLevel.level == 0 && pc.outputLevel.level != 0 {
435 1 : // Call the L0-specific compaction extension method. Similar logic as
436 1 : // pc.grow. Additional L0 files are optionally added to the compaction at
437 1 : // this step. Note that the bounds passed in are not the bounds of the
438 1 : // compaction, but rather the smallest and largest internal keys that
439 1 : // the compaction cannot include from L0 without pulling in more Lbase
440 1 : // files. Consider this example:
441 1 : //
442 1 : // L0: c-d e+f g-h
443 1 : // Lbase: a-b e+f i-j
444 1 : // a b c d e f g h i j
445 1 : //
446 1 : // The e-f files have already been chosen in the compaction. As pulling
447 1 : // in more LBase files is undesirable, the logic below will pass in
448 1 : // smallest = b and largest = i to ExtendL0ForBaseCompactionTo, which
449 1 : // will expand the compaction to include c-d and g-h from L0. The
450 1 : // bounds passed in are exclusive; the compaction cannot be expanded
451 1 : // to include files that "touch" it.
452 1 : smallestBaseKey := base.InvalidInternalKey
453 1 : largestBaseKey := base.InvalidInternalKey
454 1 : if pc.outputLevel.files.Empty() {
455 1 : baseIter := pc.version.Levels[pc.outputLevel.level].Iter()
456 1 : if sm := baseIter.SeekLT(pc.cmp, pc.smallest.UserKey); sm != nil {
457 1 : smallestBaseKey = sm.Largest
458 1 : }
459 1 : if la := baseIter.SeekGE(pc.cmp, pc.largest.UserKey); la != nil {
460 1 : largestBaseKey = la.Smallest
461 1 : }
462 1 : } else {
463 1 : // NB: We use Reslice to access the underlying level's files, but
464 1 : // we discard the returned slice. The pc.outputLevel.files slice
465 1 : // is not modified.
466 1 : _ = pc.outputLevel.files.Reslice(func(start, end *manifest.LevelIterator) {
467 1 : if sm := start.Prev(); sm != nil {
468 1 : smallestBaseKey = sm.Largest
469 1 : }
470 1 : if la := end.Next(); la != nil {
471 1 : largestBaseKey = la.Smallest
472 1 : }
473 : })
474 : }
475 1 : oldLcf := pc.lcf.Clone()
476 1 : if pc.version.L0Sublevels.ExtendL0ForBaseCompactionTo(smallestBaseKey, largestBaseKey, pc.lcf) {
477 1 : var newStartLevelFiles []*fileMetadata
478 1 : iter := pc.version.Levels[0].Iter()
479 1 : var sizeSum uint64
480 1 : for j, f := 0, iter.First(); f != nil; j, f = j+1, iter.Next() {
481 1 : if pc.lcf.FilesIncluded[f.L0Index] {
482 1 : newStartLevelFiles = append(newStartLevelFiles, f)
483 1 : sizeSum += f.Size
484 1 : }
485 : }
486 1 : if sizeSum+pc.outputLevel.files.SizeSum() < maxExpandedBytes {
487 1 : startLevel.files = manifest.NewLevelSliceSeqSorted(newStartLevelFiles)
488 1 : pc.smallest, pc.largest = manifest.KeyRange(pc.cmp,
489 1 : startLevel.files.Iter(), pc.outputLevel.files.Iter())
490 1 : } else {
491 1 : *pc.lcf = *oldLcf
492 1 : }
493 : }
494 1 : } else if pc.grow(pc.smallest, pc.largest, maxExpandedBytes, startLevel) {
495 1 : pc.maybeExpandBounds(manifest.KeyRange(pc.cmp,
496 1 : startLevel.files.Iter(), pc.outputLevel.files.Iter()))
497 1 : }
498 :
499 1 : if pc.startLevel.level == 0 {
500 1 : // We don't change the input files for the compaction beyond this point.
501 1 : pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
502 1 : }
503 :
504 1 : return true
505 : }
506 :
507 : // grow grows the number of inputs at startLevel.level without changing the
508 : // number of output-level files in the compaction, and returns whether the inputs
509 : // grew. sm and la are the smallest and largest InternalKeys in all of the inputs.
510 : func (pc *pickedCompaction) grow(
511 : sm, la InternalKey, maxExpandedBytes uint64, startLevel *compactionLevel,
512 1 : ) bool {
513 1 : if pc.outputLevel.files.Empty() {
514 1 : return false
515 1 : }
516 1 : grow0 := pc.version.Overlaps(startLevel.level, base.UserKeyBoundsFromInternal(sm, la))
517 1 : if anyTablesCompacting(grow0) {
518 1 : return false
519 1 : }
520 1 : if grow0.Len() <= startLevel.files.Len() {
521 1 : return false
522 1 : }
523 1 : if grow0.SizeSum()+pc.outputLevel.files.SizeSum() >= maxExpandedBytes {
524 1 : return false
525 1 : }
526 : // We need to include the outputLevel iter because without it, in a multiLevel scenario,
527 : // sm1 and la1 could shift the output level keyspace when pc.outputLevel.files is set to grow1.
528 1 : sm1, la1 := manifest.KeyRange(pc.cmp, grow0.Iter(), pc.outputLevel.files.Iter())
529 1 : grow1 := pc.version.Overlaps(pc.outputLevel.level, base.UserKeyBoundsFromInternal(sm1, la1))
530 1 : if anyTablesCompacting(grow1) {
531 1 : return false
532 1 : }
533 1 : if grow1.Len() != pc.outputLevel.files.Len() {
534 1 : return false
535 1 : }
536 1 : startLevel.files = grow0
537 1 : pc.outputLevel.files = grow1
538 1 : return true
539 : }
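// A worked example of the expansion check, with hypothetical sizes: suppose
// startLevel holds one 10 MB file and pc.outputLevel holds two overlapping
// files totalling 30 MB. grow0 recomputes the start-level overlap of the
// joint key range and picks up a second 10 MB start-level file. The widened
// range (grow1) still overlaps exactly the same two output files, and
// 20 MB + 30 MB is below maxExpandedBytes, so the compaction grows. Had
// grow1 pulled in a third output file, grow would have returned false.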
540 :
541 1 : func (pc *pickedCompaction) compactionSize() uint64 {
542 1 : var bytesToCompact uint64
543 1 : for i := range pc.inputs {
544 1 : bytesToCompact += pc.inputs[i].files.SizeSum()
545 1 : }
546 1 : return bytesToCompact
547 : }
548 :
549 : // setupMultiLevelCandidate returns true if it successfully added another level
550 : // to the compaction.
551 1 : func (pc *pickedCompaction) setupMultiLevelCandidate(opts *Options, diskAvailBytes uint64) bool {
552 1 : pc.inputs = append(pc.inputs, compactionLevel{level: pc.outputLevel.level + 1})
553 1 :
554 1 : // Recalibrate startLevel and outputLevel:
555 1 : // - startLevel and outputLevel pointers may be obsolete after appending to pc.inputs.
556 1 : // - push outputLevel to extraLevels and move the new level to outputLevel
557 1 : pc.startLevel = &pc.inputs[0]
558 1 : pc.extraLevels = []*compactionLevel{&pc.inputs[1]}
559 1 : pc.outputLevel = &pc.inputs[2]
560 1 : return pc.setupInputs(opts, diskAvailBytes, pc.extraLevels[len(pc.extraLevels)-1])
561 1 : }
562 :
563 : // anyTablesCompacting returns true if any tables in the level slice are
564 : // compacting.
565 1 : func anyTablesCompacting(inputs manifest.LevelSlice) bool {
566 1 : it := inputs.Iter()
567 1 : for f := it.First(); f != nil; f = it.Next() {
568 1 : if f.IsCompacting() {
569 1 : return true
570 1 : }
571 : }
572 1 : return false
573 : }
574 :
575 : // newCompactionPickerByScore creates a compactionPickerByScore associated with
576 : // the newest version. The picker is used under logLock (until a new version is
577 : // installed).
578 : func newCompactionPickerByScore(
579 : v *version,
580 : virtualBackings *manifest.VirtualBackings,
581 : opts *Options,
582 : inProgressCompactions []compactionInfo,
583 1 : ) *compactionPickerByScore {
584 1 : p := &compactionPickerByScore{
585 1 : opts: opts,
586 1 : vers: v,
587 1 : virtualBackings: virtualBackings,
588 1 : }
589 1 : p.initLevelMaxBytes(inProgressCompactions)
590 1 : return p
591 1 : }
592 :
593 : // Information about a candidate compaction level that has been identified by
594 : // the compaction picker.
595 : type candidateLevelInfo struct {
596 : // The compensatedScore of the level after adjusting according to the other
597 : // levels' sizes. For L0, the compensatedScoreRatio is equivalent to the
598 : // uncompensatedScoreRatio as we don't account for level size compensation in
599 : // L0.
600 : compensatedScoreRatio float64
601 : // The score of the level after accounting for level size compensation before
602 : // adjusting according to other levels' sizes. For L0, the compensatedScore
603 : // is equivalent to the uncompensatedScore as we don't account for level
604 : // size compensation in L0.
605 : compensatedScore float64
606 : // The score of the level to be compacted, calculated using uncompensated file
607 : // sizes and without any adjustments.
608 : uncompensatedScore float64
609 : // uncompensatedScoreRatio is the uncompensatedScore adjusted according to
610 : // the other levels' sizes.
611 : uncompensatedScoreRatio float64
612 : level int
613 : // The level to compact to.
614 : outputLevel int
615 : // The file in the level that will be compacted. Additional files may be
616 : // picked by the compaction, and a pickedCompaction is created for the
617 : // compaction.
618 : file manifest.LevelFile
619 : }
620 :
621 1 : func (c *candidateLevelInfo) shouldCompact() bool {
622 1 : return c.compensatedScoreRatio >= compactionScoreThreshold
623 1 : }
624 :
625 1 : func fileCompensation(f *fileMetadata) uint64 {
626 1 : return uint64(f.Stats.PointDeletionsBytesEstimate) + f.Stats.RangeDeletionsBytesEstimate
627 1 : }
628 :
629 : // compensatedSize returns f's file size, inflated according to compaction
630 : // priorities.
631 1 : func compensatedSize(f *fileMetadata) uint64 {
632 1 : // Add in the estimate of disk space that may be reclaimed by compacting the
633 1 : // file's tombstones.
634 1 : return f.Size + fileCompensation(f)
635 1 : }
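// A worked example with hypothetical stats: a 64 MB file whose tombstones
// are estimated to reclaim 16 MB (point deletions) plus 8 MB (range
// deletions) has a compensated size of 88 MB, making it a more attractive
// compaction input than its raw size alone suggests:
//
//	// f.Size = 64 << 20; PointDeletionsBytesEstimate = 16 << 20;
//	// RangeDeletionsBytesEstimate = 8 << 20
//	compensatedSize(f) // == 88 << 20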
636 :
637 : // compensatedSizeAnnotator is a manifest.Annotator that annotates B-Tree
638 : // nodes with the sum of the files' compensated sizes. Compensated sizes may
639 : // change once a table's stats are loaded asynchronously, so its values are
640 : // marked as cacheable only if a file's stats have been loaded.
641 1 : var compensatedSizeAnnotator = manifest.SumAnnotator(func(f *fileMetadata) (uint64, bool) {
642 1 : return compensatedSize(f), f.StatsValid()
643 1 : })
644 :
645 : // totalCompensatedSize computes the compensated size over a file metadata
646 : // iterator. Note that this function is linear in the files available to the
647 : // iterator. Use the compensatedSizeAnnotator if querying the total
648 : // compensated size of a level.
649 1 : func totalCompensatedSize(iter manifest.LevelIterator) uint64 {
650 1 : var sz uint64
651 1 : for f := iter.First(); f != nil; f = iter.Next() {
652 1 : sz += compensatedSize(f)
653 1 : }
654 1 : return sz
655 : }
656 :
657 : // compactionPickerByScore holds the state and logic for picking a compaction. A
658 : // compaction picker is associated with a single version. A new compaction
659 : // picker is created and initialized every time a new version is installed.
660 : type compactionPickerByScore struct {
661 : opts *Options
662 : vers *version
663 : virtualBackings *manifest.VirtualBackings
664 : // The level to target for L0 compactions. Levels L1 to baseLevel must be
665 : // empty.
666 : baseLevel int
667 : // levelMaxBytes holds the dynamically adjusted max bytes setting for each
668 : // level.
669 : levelMaxBytes [numLevels]int64
670 : }
671 :
672 : var _ compactionPicker = &compactionPickerByScore{}
673 :
674 1 : func (p *compactionPickerByScore) getScores(inProgress []compactionInfo) [numLevels]float64 {
675 1 : var scores [numLevels]float64
676 1 : for _, info := range p.calculateLevelScores(inProgress) {
677 1 : scores[info.level] = info.compensatedScoreRatio
678 1 : }
679 1 : return scores
680 : }
681 :
682 1 : func (p *compactionPickerByScore) getBaseLevel() int {
683 1 : if p == nil {
684 0 : return 1
685 0 : }
686 1 : return p.baseLevel
687 : }
688 :
689 : // estimatedCompactionDebt estimates the number of bytes which need to be
690 : // compacted before the LSM tree becomes stable.
691 1 : func (p *compactionPickerByScore) estimatedCompactionDebt(l0ExtraSize uint64) uint64 {
692 1 : if p == nil {
693 0 : return 0
694 0 : }
695 :
696 : // We assume that all the bytes in L0 need to be compacted to Lbase. This is
697 : // unlike the RocksDB logic that figures out whether L0 needs compaction.
698 1 : bytesAddedToNextLevel := l0ExtraSize + p.vers.Levels[0].Size()
699 1 : lbaseSize := p.vers.Levels[p.baseLevel].Size()
700 1 :
701 1 : var compactionDebt uint64
702 1 : if bytesAddedToNextLevel > 0 && lbaseSize > 0 {
703 1 : // We only incur compaction debt if both L0 and Lbase contain data. If L0
704 1 : // is empty, no compaction is necessary. If Lbase is empty, a move-based
705 1 : // compaction from L0 would occur.
706 1 : compactionDebt += bytesAddedToNextLevel + lbaseSize
707 1 : }
708 :
709 : // loop invariant: At the beginning of the loop, bytesAddedToNextLevel is the
710 : // bytes added to `level` in the loop.
711 1 : for level := p.baseLevel; level < numLevels-1; level++ {
712 1 : levelSize := p.vers.Levels[level].Size() + bytesAddedToNextLevel
713 1 : nextLevelSize := p.vers.Levels[level+1].Size()
714 1 : if levelSize > uint64(p.levelMaxBytes[level]) {
715 1 : bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level])
716 1 : if nextLevelSize > 0 {
717 1 : // We only incur compaction debt if the next level contains data. If the
718 1 : // next level is empty, a move-based compaction would be used.
719 1 : levelRatio := float64(nextLevelSize) / float64(levelSize)
720 1 : // The current level contributes bytesAddedToNextLevel to compactions.
721 1 : // The next level contributes levelRatio * bytesAddedToNextLevel.
722 1 : compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1))
723 1 : }
724 1 : } else {
725 1 : // We're not moving any bytes to the next level.
726 1 : bytesAddedToNextLevel = 0
727 1 : }
728 : }
729 1 : return compactionDebt
730 : }
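// A worked example with hypothetical sizes (baseLevel = 5,
// levelMaxBytes[5] = 25 MB): L0 = 10 MB, L5 = 30 MB, L6 = 120 MB. All L0
// bytes are assumed to compact into Lbase, so debt starts at 10 + 30 = 40 MB.
// L5 then notionally holds 40 MB against a 25 MB max, spilling 15 MB to L6;
// with levelRatio = 120/40 = 3, that compaction costs 15 * (3 + 1) = 60 MB.
// Total estimated debt: 100 MB.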
731 :
732 1 : func (p *compactionPickerByScore) initLevelMaxBytes(inProgressCompactions []compactionInfo) {
733 1 : // The levelMaxBytes calculations here differ from RocksDB in two ways:
734 1 : //
735 1 : // 1. The use of dbSize vs maxLevelSize. RocksDB uses the size of the maximum
736 1 : // level in L1-L6, rather than determining the size of the bottom level
737 1 : // based on the total amount of data in the dB. The RocksDB calculation is
738 1 : // problematic if L0 contains a significant fraction of data, or if the
739 1 : // level sizes are roughly equal and thus there is a significant fraction
740 1 : // of data outside of the largest level.
741 1 : //
742 1 : // 2. Not adjusting the size of Lbase based on L0. RocksDB computes
743 1 : // baseBytesMax as the maximum of the configured LBaseMaxBytes and the
744 1 : // size of L0. This is problematic because baseBytesMax is used to compute
745 1 : // the max size of lower levels. A very large baseBytesMax will result in
746 1 : // an overly large value for the size of lower levels which will cause
747 1 : // those levels not to be compacted even when they should be
748 1 : // compacted. This often results in "inverted" LSM shapes where Ln is
749 1 : // larger than Ln+1.
750 1 :
751 1 : // Determine the first non-empty level and the total DB size.
752 1 : firstNonEmptyLevel := -1
753 1 : var dbSize uint64
754 1 : for level := 1; level < numLevels; level++ {
755 1 : if p.vers.Levels[level].Size() > 0 {
756 1 : if firstNonEmptyLevel == -1 {
757 1 : firstNonEmptyLevel = level
758 1 : }
759 1 : dbSize += p.vers.Levels[level].Size()
760 : }
761 : }
762 1 : for _, c := range inProgressCompactions {
763 1 : if c.outputLevel == 0 || c.outputLevel == -1 {
764 1 : continue
765 : }
766 1 : if c.inputs[0].level == 0 && (firstNonEmptyLevel == -1 || c.outputLevel < firstNonEmptyLevel) {
767 1 : firstNonEmptyLevel = c.outputLevel
768 1 : }
769 : }
770 :
771 : // Initialize the max-bytes setting for each level to "infinity" which will
772 : // disallow compaction for that level. We'll fill in the actual value below
773 : // for levels we want to allow compactions from.
774 1 : for level := 0; level < numLevels; level++ {
775 1 : p.levelMaxBytes[level] = math.MaxInt64
776 1 : }
777 :
778 1 : if dbSize == 0 {
779 1 : // No levels from L1 and up contain any data. Target L0 compactions at the
780 1 : // last level, or at the level to which there is an ongoing L0 compaction.
781 1 : p.baseLevel = numLevels - 1
782 1 : if firstNonEmptyLevel >= 0 {
783 1 : p.baseLevel = firstNonEmptyLevel
784 1 : }
785 1 : return
786 : }
787 :
788 1 : dbSize += p.vers.Levels[0].Size()
789 1 : bottomLevelSize := dbSize - dbSize/uint64(p.opts.Experimental.LevelMultiplier)
790 1 :
791 1 : curLevelSize := bottomLevelSize
792 1 : for level := numLevels - 2; level >= firstNonEmptyLevel; level-- {
793 1 : curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier))
794 1 : }
795 :
796 : // Compute base level (where L0 data is compacted to).
797 1 : baseBytesMax := uint64(p.opts.LBaseMaxBytes)
798 1 : p.baseLevel = firstNonEmptyLevel
799 1 : for p.baseLevel > 1 && curLevelSize > baseBytesMax {
800 1 : p.baseLevel--
801 1 : curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier))
802 1 : }
803 :
804 1 : smoothedLevelMultiplier := 1.0
805 1 : if p.baseLevel < numLevels-1 {
806 1 : smoothedLevelMultiplier = math.Pow(
807 1 : float64(bottomLevelSize)/float64(baseBytesMax),
808 1 : 1.0/float64(numLevels-p.baseLevel-1))
809 1 : }
810 :
811 1 : levelSize := float64(baseBytesMax)
812 1 : for level := p.baseLevel; level < numLevels; level++ {
813 1 : if level > p.baseLevel && levelSize > 0 {
814 1 : levelSize *= smoothedLevelMultiplier
815 1 : }
816 : // Round the result since test cases use small target level sizes, which
817 : // can be impacted by floating-point imprecision + integer truncation.
818 1 : roundedLevelSize := math.Round(levelSize)
819 1 : if roundedLevelSize > float64(math.MaxInt64) {
820 0 : p.levelMaxBytes[level] = math.MaxInt64
821 1 : } else {
822 1 : p.levelMaxBytes[level] = int64(roundedLevelSize)
823 1 : }
824 : }
825 : }
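// A worked example with hypothetical values: numLevels = 7,
// LBaseMaxBytes = 64 MB, LevelMultiplier = 10, and dbSize = 80 GB including
// L0. Then bottomLevelSize = 80 - 8 = 72 GB. If the base level settles at
// L3, smoothedLevelMultiplier = (72 GB / 64 MB)^(1/3) ≈ 10.48, yielding
// targets of roughly L3 = 64 MB, L4 ≈ 671 MB, L5 ≈ 6.9 GB, L6 = 72 GB.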
826 :
827 : type levelSizeAdjust struct {
828 : incomingActualBytes uint64
829 : outgoingActualBytes uint64
830 : outgoingCompensatedBytes uint64
831 : }
832 :
833 1 : func (a levelSizeAdjust) compensated() uint64 {
834 1 : return a.incomingActualBytes - a.outgoingCompensatedBytes
835 1 : }
836 :
837 1 : func (a levelSizeAdjust) actual() uint64 {
838 1 : return a.incomingActualBytes - a.outgoingActualBytes
839 1 : }
840 :
841 1 : func calculateSizeAdjust(inProgressCompactions []compactionInfo) [numLevels]levelSizeAdjust {
842 1 : // Compute size adjustments for each level based on the in-progress
843 1 : // compactions. We sum the file sizes of all files leaving and entering each
844 1 : // level in in-progress compactions. For outgoing files, we also sum a
845 1 : // separate sum of 'compensated file sizes', which are inflated according
846 1 : // to deletion estimates.
847 1 : //
848 1 : // When we adjust a level's size according to these values during score
849 1 : // calculation, we subtract the compensated size of start level inputs to
850 1 : // account for the fact that score calculation uses compensated sizes.
851 1 : //
852 1 : // Since compensated file sizes may be compensated because they reclaim
853 1 : // space from the output level's files, we only add the real file size to
854 1 : // the output level.
855 1 : //
856 1 : // This is slightly different from RocksDB's behavior, which simply elides
857 1 : // compacting files from the level size calculation.
858 1 : var sizeAdjust [numLevels]levelSizeAdjust
859 1 : for i := range inProgressCompactions {
860 1 : c := &inProgressCompactions[i]
861 1 : // If this compaction's version edit has already been applied, there's
862 1 : // no need to adjust: The LSM we'll examine will already reflect the
863 1 : // new LSM state.
864 1 : if c.versionEditApplied {
865 1 : continue
866 : }
867 :
868 1 : for _, input := range c.inputs {
869 1 : actualSize := input.files.SizeSum()
870 1 : compensatedSize := totalCompensatedSize(input.files.Iter())
871 1 :
872 1 : if input.level != c.outputLevel {
873 1 : sizeAdjust[input.level].outgoingCompensatedBytes += compensatedSize
874 1 : sizeAdjust[input.level].outgoingActualBytes += actualSize
875 1 : if c.outputLevel != -1 {
876 1 : sizeAdjust[c.outputLevel].incomingActualBytes += actualSize
877 1 : }
878 : }
879 : }
880 : }
881 1 : return sizeAdjust
882 : }
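// A sketch with hypothetical numbers: an in-progress L3->L4 compaction whose
// input file is 10 MB on disk but 14 MB compensated (due to deletion
// estimates) produces:
//
//	sizeAdjust[3] // {outgoingActualBytes: 10 MB, outgoingCompensatedBytes: 14 MB}
//	sizeAdjust[4] // {incomingActualBytes: 10 MB}
//
// During scoring, L3 is treated as 14 MB smaller (compensated) while L4
// grows by only the 10 MB of real bytes.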
883 :
884 : func (p *compactionPickerByScore) calculateLevelScores(
885 : inProgressCompactions []compactionInfo,
886 1 : ) [numLevels]candidateLevelInfo {
887 1 : var scores [numLevels]candidateLevelInfo
888 1 : for i := range scores {
889 1 : scores[i].level = i
890 1 : scores[i].outputLevel = i + 1
891 1 : }
892 1 : l0UncompensatedScore := calculateL0UncompensatedScore(p.vers, p.opts, inProgressCompactions)
893 1 : scores[0] = candidateLevelInfo{
894 1 : outputLevel: p.baseLevel,
895 1 : uncompensatedScore: l0UncompensatedScore,
896 1 : compensatedScore: l0UncompensatedScore, /* No level size compensation for L0 */
897 1 : }
898 1 : sizeAdjust := calculateSizeAdjust(inProgressCompactions)
899 1 : for level := 1; level < numLevels; level++ {
900 1 : compensatedLevelSize := *compensatedSizeAnnotator.LevelAnnotation(p.vers.Levels[level]) + sizeAdjust[level].compensated()
901 1 : scores[level].compensatedScore = float64(compensatedLevelSize) / float64(p.levelMaxBytes[level])
902 1 : scores[level].uncompensatedScore = float64(p.vers.Levels[level].Size()+sizeAdjust[level].actual()) / float64(p.levelMaxBytes[level])
903 1 : }
904 :
905 : // Adjust each level's {compensated, uncompensated}Score by the uncompensatedScore
906 : // of the next level to get a {compensated, uncompensated}ScoreRatio. If the
907 : // next level has a high uncompensatedScore, and is thus a priority for compaction,
908 : // this reduces the priority for compacting the current level. If the next level
909 : // has a low uncompensatedScore (i.e. it is below its target size), this increases
910 : // the priority for compacting the current level.
911 : //
912 : // The effect of this adjustment is to help prioritize compactions in lower
913 : // levels. The following example shows the compensatedScoreRatio and the
914 : // compensatedScore. In this scenario, L0 has 68 sublevels. L3 (a.k.a. Lbase)
915 : // is significantly above its target size. The original score prioritizes
916 : // compactions from those two levels, but doing so ends up causing a future
917 : // problem: data piles up in the higher levels, starving L5->L6 compactions,
918 : // and to a lesser degree starving L4->L5 compactions.
919 : //
920 : // Note that in the example shown there is no level size compensation so the
921 : // compensatedScore and the uncompensatedScore are the same for each level.
922 : //
923 : // compensatedScoreRatio compensatedScore uncompensatedScore size max-size
924 : // L0 3.2 68.0 68.0 2.2 G -
925 : // L3 3.2 21.1 21.1 1.3 G 64 M
926 : // L4 3.4 6.7 6.7 3.1 G 467 M
927 : // L5 3.4 2.0 2.0 6.6 G 3.3 G
928 : // L6 0.6 0.6 0.6 14 G 24 G
929 1 : var prevLevel int
930 1 : for level := p.baseLevel; level < numLevels; level++ {
931 1 : // The compensated scores, and uncompensated scores will be turned into
932 1 : // ratios as they're adjusted according to other levels' sizes.
933 1 : scores[prevLevel].compensatedScoreRatio = scores[prevLevel].compensatedScore
934 1 : scores[prevLevel].uncompensatedScoreRatio = scores[prevLevel].uncompensatedScore
935 1 :
936 1 : // Avoid absurdly large scores by placing a floor on the score that we'll
937 1 : // adjust a level by. The value of 0.01 was chosen somewhat arbitrarily.
938 1 : const minScore = 0.01
939 1 : if scores[prevLevel].compensatedScoreRatio >= compactionScoreThreshold {
940 1 : if scores[level].uncompensatedScore >= minScore {
941 1 : scores[prevLevel].compensatedScoreRatio /= scores[level].uncompensatedScore
942 1 : } else {
943 1 : scores[prevLevel].compensatedScoreRatio /= minScore
944 1 : }
945 : }
946 1 : if scores[prevLevel].uncompensatedScoreRatio >= compactionScoreThreshold {
947 1 : if scores[level].uncompensatedScore >= minScore {
948 1 : scores[prevLevel].uncompensatedScoreRatio /= scores[level].uncompensatedScore
949 1 : } else {
950 1 : scores[prevLevel].uncompensatedScoreRatio /= minScore
951 1 : }
952 : }
953 1 : prevLevel = level
954 : }
955 : // Set the score ratios for the lowest level.
956 : // INVARIANT: prevLevel == numLevels-1
957 1 : scores[prevLevel].compensatedScoreRatio = scores[prevLevel].compensatedScore
958 1 : scores[prevLevel].uncompensatedScoreRatio = scores[prevLevel].uncompensatedScore
959 1 :
960 1 : sort.Sort(sortCompactionLevelsByPriority(scores[:]))
961 1 : return scores
962 : }
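// A worked example of the ratio adjustment, reusing the table above: L4's
// compensatedScore is 6.7 and the next level's (L5's) uncompensatedScore is
// 2.0, so L4's compensatedScoreRatio becomes 6.7 / 2.0 ≈ 3.4. L6 has no next
// level, so its ratios equal its scores (0.6).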
963 :
964 : // calculateL0UncompensatedScore calculates a float score representing the
965 : // relative priority of compacting L0. Level L0 is special in that files within
966 : // L0 may overlap one another, so a different set of heuristics that take into
967 : // account read amplification apply.
968 : func calculateL0UncompensatedScore(
969 : vers *version, opts *Options, inProgressCompactions []compactionInfo,
970 1 : ) float64 {
971 1 : // Use the sublevel count to calculate the score. The base vs intra-L0
972 1 : // compaction determination happens in pickAuto, not here.
973 1 : score := float64(2*vers.L0Sublevels.MaxDepthAfterOngoingCompactions()) /
974 1 : float64(opts.L0CompactionThreshold)
975 1 :
976 1 : // Also calculate a score based on the file count but use it only if it
977 1 : // produces a higher score than the sublevel-based one. This heuristic is
978 1 : // designed to accommodate cases where L0 is accumulating non-overlapping
979 1 : // files. Letting too many non-overlapping files accumulate in few
980 1 : // sublevels is undesirable, because:
981 1 : // 1) we can produce a massive backlog to compact once files do overlap.
982 1 : // 2) constructing L0 sublevels has a runtime that grows superlinearly with
983 1 : // the number of files in L0 and must be done while holding D.mu.
984 1 : noncompactingFiles := vers.Levels[0].Len()
985 1 : for _, c := range inProgressCompactions {
986 1 : for _, cl := range c.inputs {
987 1 : if cl.level == 0 {
988 1 : noncompactingFiles -= cl.files.Len()
989 1 : }
990 : }
991 : }
992 1 : fileScore := float64(noncompactingFiles) / float64(opts.L0CompactionFileThreshold)
993 1 : if score < fileScore {
994 1 : score = fileScore
995 1 : }
996 1 : return score
997 : }
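// A worked example with hypothetical settings (L0CompactionThreshold = 4,
// L0CompactionFileThreshold = 500): with 10 sublevels after ongoing
// compactions, the sublevel score is 2*10/4 = 5.0, which beats a file score
// of 400/500 = 0.8 from 400 non-compacting files. With only 2 sublevels but
// 1500 such files, the file score 1500/500 = 3.0 dominates 2*2/4 = 1.0.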
998 :
999 : // pickCompactionSeedFile picks a file from `level` in the `vers` to build a
1000 : // compaction around. Currently, this function implements a heuristic similar to
1001 : // RocksDB's kMinOverlappingRatio, seeking to minimize write amplification. This
1002 : // function is linear with respect to the number of files in `level` and
1003 : // `outputLevel`.
1004 : func pickCompactionSeedFile(
1005 : vers *version,
1006 : virtualBackings *manifest.VirtualBackings,
1007 : opts *Options,
1008 : level, outputLevel int,
1009 : earliestSnapshotSeqNum base.SeqNum,
1010 1 : ) (manifest.LevelFile, bool) {
1011 1 : // Select the file within the level to compact. We want to minimize write
1012 1 : // amplification, but also ensure that (a) deletes are propagated to the
1013 1 : // bottom level in a timely fashion, and (b) virtual sstables that are
1014 1 : // pinning backing sstables where most of the data is garbage are compacted
1015 1 : // away. Doing (a) and (b) reclaims disk space. A table's smallest sequence
1016 1 : // number provides a measure of its age. The ratio of overlapping-bytes /
1017 1 : // table-size gives an indication of write amplification (a smaller ratio is
1018 1 : // preferable).
1019 1 : //
1020 1 : // The current heuristic is based off the RocksDB kMinOverlappingRatio
1021 1 : // heuristic. It chooses the file with the minimum overlapping ratio with
1022 1 : // the target level, which minimizes write amplification.
1023 1 : //
1024 1 : // The heuristic uses a "compensated size" for the denominator, which is the
1025 1 : // file size inflated by (a) an estimate of the space that may be reclaimed
1026 1 : // through compaction, and (b) a fraction of the amount of garbage in the
1027 1 : // backing sstable pinned by this (virtual) sstable.
1028 1 : //
1029 1 : // TODO(peter): For concurrent compactions, we may want to try harder to
1030 1 : // pick a seed file whose resulting compaction bounds do not overlap with
1031 1 : // an in-progress compaction.
1032 1 :
1033 1 : cmp := opts.Comparer.Compare
1034 1 : startIter := vers.Levels[level].Iter()
1035 1 : outputIter := vers.Levels[outputLevel].Iter()
1036 1 :
1037 1 : var file manifest.LevelFile
1038 1 : smallestRatio := uint64(math.MaxUint64)
1039 1 :
1040 1 : outputFile := outputIter.First()
1041 1 :
1042 1 : for f := startIter.First(); f != nil; f = startIter.Next() {
1043 1 : var overlappingBytes uint64
1044 1 : compacting := f.IsCompacting()
1045 1 : if compacting {
1046 1 : // Move on if this file is already being compacted. We'll likely
1047 1 : // still need to move past the overlapping output files regardless,
1048 1 : // but in cases where all start-level files are compacting we won't.
1049 1 : continue
1050 : }
1051 :
1052 : // Trim any output-level files smaller than f.
1053 1 : for outputFile != nil && sstableKeyCompare(cmp, outputFile.Largest, f.Smallest) < 0 {
1054 1 : outputFile = outputIter.Next()
1055 1 : }
1056 :
1057 1 : for outputFile != nil && sstableKeyCompare(cmp, outputFile.Smallest, f.Largest) <= 0 && !compacting {
1058 1 : overlappingBytes += outputFile.Size
1059 1 : compacting = compacting || outputFile.IsCompacting()
1060 1 :
1061 1 : // For files in the bottommost level of the LSM, the
1062 1 : // Stats.RangeDeletionsBytesEstimate field is set to the estimate
1063 1 : // of bytes /within/ the file itself that may be dropped by
1064 1 : // recompacting the file. These bytes from obsolete keys would not
1065 1 : // need to be rewritten if we compacted `f` into `outputFile`, so
1066 1 : // they don't contribute to write amplification. Subtracting them
1067 1 : // out of the overlapping bytes helps prioritize these compactions
1068 1 : // that are cheaper than their file sizes suggest.
1069 1 : if outputLevel == numLevels-1 && outputFile.LargestSeqNum < earliestSnapshotSeqNum {
1070 1 : overlappingBytes -= outputFile.Stats.RangeDeletionsBytesEstimate
1071 1 : }
1072 :
1073 : // If the file in the next level extends beyond f's largest key,
1074 : // break out and don't advance outputIter because f's successor
1075 : // might also overlap.
1076 : //
1077 : // Note, we stop as soon as we encounter an output-level file with a
1078 : // largest key beyond the input-level file's largest bound. We
1079 : // perform a simple user key comparison here using sstableKeyCompare
1080 : // which handles the potential for exclusive largest key bounds.
1081 : // There's some subtlety when the bounds are equal (eg, equal and
1082 : // inclusive, or equal and exclusive). Current Pebble doesn't split
1083 : // user keys across sstables within a level (and in format versions
1084 : // FormatSplitUserKeysMarkedCompacted and later we guarantee no
1085 : // split user keys exist within the entire LSM). In that case, we're
1086 : // assured that neither the input level nor the output level's next
1087 : // file shares the same user key, so compaction expansion will not
1088 : // include them in any compaction compacting `f`.
1089 : //
1090 : // NB: If we /did/ allow split user keys, or we're running on an
1091 : // old database with an earlier format major version where there are
1092 : // existing split user keys, this logic would be incorrect. Consider
1093 : // L1: [a#120,a#100] [a#80,a#60]
1094 : // L2: [a#55,a#45] [a#35,a#25] [a#15,a#5]
1095 : // While considering the first file in L1, [a#120,a#100], we'd skip
1096 : // past all of the files in L2. When considering the second file in
1097 : // L1, we'd improperly conclude that the second file overlaps
1098 : // nothing in the second level and is cheap to compact, when in
1099 : // reality we'd need to expand the compaction to include all 5
1100 : // files.
1101 1 : if sstableKeyCompare(cmp, outputFile.Largest, f.Largest) > 0 {
1102 1 : break
1103 : }
1104 1 : outputFile = outputIter.Next()
1105 : }
1106 :
1107 : // If the input level file or one of the overlapping files is
1108 : // compacting, we're not going to be able to compact this file
1109 : // anyways, so skip it.
1110 1 : if compacting {
1111 1 : continue
1112 : }
1113 :
1114 1 : compSz := compensatedSize(f) + responsibleForGarbageBytes(virtualBackings, f)
1115 1 : scaledRatio := overlappingBytes * 1024 / compSz
1116 1 : if scaledRatio < smallestRatio {
1117 1 : smallestRatio = scaledRatio
1118 1 : file = startIter.Take()
1119 1 : }
1120 : }
1121 1 : return file, file.FileMetadata != nil
1122 : }
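// A worked example of the ratio with two hypothetical candidates: file A has
// a compensated size of 10 MB and overlaps 40 MB in the output level, so its
// scaledRatio is 40 MB * 1024 / 10 MB = 4096. File B has a compensated size
// of 20 MB but overlaps only 10 MB, giving 512. B is the better seed: it
// rewrites far fewer output-level bytes per input byte.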
1123 :
1124 : // responsibleForGarbageBytes returns the amount of garbage in the backing
1125 : // sstable that we consider the responsibility of this virtual sstable. For
1126 : // non-virtual sstables, this is of course 0. For virtual sstables, we equally
1127 : // distribute the responsibility of the garbage across all the virtual
1128 : // sstables that are referencing the same backing sstable. One could
1129 : // alternatively distribute this in proportion to the virtual sst sizes, but
1130 : // it isn't clear that more sophisticated heuristics are worth it, given that
1131 : // the garbage cannot be reclaimed until all the referencing virtual sstables
1132 : // are compacted.
1133 1 : func responsibleForGarbageBytes(virtualBackings *manifest.VirtualBackings, m *fileMetadata) uint64 {
1134 1 : if !m.Virtual {
1135 1 : return 0
1136 1 : }
1137 1 : useCount, virtualizedSize := virtualBackings.Usage(m.FileBacking.DiskFileNum)
1138 1 : // Since virtualizedSize is the sum of the estimated size of all virtual
1139 1 : // ssts, we allow for the possibility that virtualizedSize could exceed
1140 1 : // m.FileBacking.Size.
1141 1 : totalGarbage := int64(m.FileBacking.Size) - int64(virtualizedSize)
1142 1 : if totalGarbage <= 0 {
1143 1 : return 0
1144 1 : }
1145 1 : if useCount == 0 {
1146 0 : // This cannot happen if m exists in the latest version. The call to
1147 0 : // responsibleForGarbageBytes during compaction picking ensures that m
1148 0 : // exists in the latest version by holding versionSet.logLock.
1149 0 : panic(errors.AssertionFailedf("%s has zero useCount", m.String()))
1150 : }
1151 1 : return uint64(totalGarbage) / uint64(useCount)
1152 : }
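// A worked example with hypothetical numbers: a 100 MB backing sstable
// referenced by 4 virtual sstables whose estimated sizes sum to 60 MB
// carries 40 MB of garbage, so each virtual sstable is charged
// 40 / 4 = 10 MB. If the virtualized sum were >= 100 MB, the charge would
// be zero.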
1153 :
1154 : // pickAuto picks the best compaction, if any.
1155 : //
1156 : // On each call, pickAuto computes per-level size adjustments based on
1157 : // in-progress compactions, and computes a per-level score. The levels are
1158 : // iterated over in decreasing score order trying to find a valid compaction
1159 : // anchored at that level.
1160 : //
1161 : // If a score-based compaction cannot be found, pickAuto falls back to looking
1162 : // for an elision-only compaction to remove obsolete keys.
1163 1 : func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompaction) {
1164 1 : // Compaction concurrency is controlled by L0 read-amp. We allow one
1165 1 : // additional compaction per L0CompactionConcurrency sublevels, as well as
1166 1 : // one additional compaction per CompactionDebtConcurrency bytes of
1167 1 : // compaction debt. Compaction concurrency is tied to L0 sublevels as that
1168 1 : // signal is independent of the database size. We tack on the compaction
1169 1 : // debt as a second signal to prevent compaction concurrency from dropping
1170 1 : // significantly right after a base compaction finishes, and before those
1171 1 : // bytes have been compacted further down the LSM.
1172 1 : if n := len(env.inProgressCompactions); n > 0 {
1173 1 : l0ReadAmp := p.vers.L0Sublevels.MaxDepthAfterOngoingCompactions()
1174 1 : compactionDebt := p.estimatedCompactionDebt(0)
1175 1 : ccSignal1 := n * p.opts.Experimental.L0CompactionConcurrency
1176 1 : ccSignal2 := uint64(n) * p.opts.Experimental.CompactionDebtConcurrency
1177 1 : if l0ReadAmp < ccSignal1 && compactionDebt < ccSignal2 {
1178 1 : return nil
1179 1 : }
1180 : }
1181 :
1182 1 : scores := p.calculateLevelScores(env.inProgressCompactions)
1183 1 :
1184 1 : // TODO(bananabrick): Either remove, or change this into an event sent to the
1185 1 : // EventListener.
1186 1 : logCompaction := func(pc *pickedCompaction) {
1187 0 : var buf bytes.Buffer
1188 0 : for i := 0; i < numLevels; i++ {
1189 0 : if i != 0 && i < p.baseLevel {
1190 0 : continue
1191 : }
1192 :
1193 0 : var info *candidateLevelInfo
1194 0 : for j := range scores {
1195 0 : if scores[j].level == i {
1196 0 : info = &scores[j]
1197 0 : break
1198 : }
1199 : }
1200 :
1201 0 : marker := " "
1202 0 : if pc.startLevel.level == info.level {
1203 0 : marker = "*"
1204 0 : }
1205 0 : fmt.Fprintf(&buf, " %sL%d: %5.1f %5.1f %5.1f %5.1f %8s %8s",
1206 0 : marker, info.level, info.compensatedScoreRatio, info.compensatedScore,
1207 0 : info.uncompensatedScoreRatio, info.uncompensatedScore,
1208 0 : humanize.Bytes.Int64(int64(totalCompensatedSize(
1209 0 : p.vers.Levels[info.level].Iter(),
1210 0 : ))),
1211 0 : humanize.Bytes.Int64(p.levelMaxBytes[info.level]),
1212 0 : )
1213 0 :
1214 0 : count := 0
1215 0 : for i := range env.inProgressCompactions {
1216 0 : c := &env.inProgressCompactions[i]
1217 0 : if c.inputs[0].level != info.level {
1218 0 : continue
1219 : }
1220 0 : count++
1221 0 : if count == 1 {
1222 0 : fmt.Fprintf(&buf, " [")
1223 0 : } else {
1224 0 : fmt.Fprintf(&buf, " ")
1225 0 : }
1226 0 : fmt.Fprintf(&buf, "L%d->L%d", c.inputs[0].level, c.outputLevel)
1227 : }
1228 0 : if count > 0 {
1229 0 : fmt.Fprintf(&buf, "]")
1230 0 : }
1231 0 : fmt.Fprintf(&buf, "\n")
1232 : }
1233 0 : p.opts.Logger.Infof("pickAuto: L%d->L%d\n%s",
1234 0 : pc.startLevel.level, pc.outputLevel.level, buf.String())
1235 : }
1236 :
1237 : // Check for a score-based compaction. candidateLevelInfos are first sorted
1238 : // by whether they should be compacted, so if we find a level which shouldn't
1239 : // be compacted, we can break early.
1240 1 : for i := range scores {
1241 1 : info := &scores[i]
1242 1 : if !info.shouldCompact() {
1243 1 : break
1244 : }
1245 1 : if info.level == numLevels-1 {
1246 1 : continue
1247 : }
1248 :
1249 1 : if info.level == 0 {
1250 1 : pc = pickL0(env, p.opts, p.vers, p.baseLevel)
1251 1 : // Fail-safe to protect against compacting the same sstable
1252 1 : // concurrently.
1253 1 : if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
1254 1 : p.addScoresToPickedCompactionMetrics(pc, scores)
1255 1 : pc.score = info.compensatedScoreRatio
1256 1 : // TODO(bananabrick): Create an EventListener for logCompaction.
1257 1 : if false {
1258 0 : logCompaction(pc)
1259 0 : }
1260 1 : return pc
1261 : }
1262 1 : continue
1263 : }
1264 :
1265 : // info.level > 0
1266 1 : var ok bool
1267 1 : info.file, ok = pickCompactionSeedFile(p.vers, p.virtualBackings, p.opts, info.level, info.outputLevel, env.earliestSnapshotSeqNum)
1268 1 : if !ok {
1269 1 : continue
1270 : }
1271 :
1272 1 : pc := pickAutoLPositive(env, p.opts, p.vers, *info, p.baseLevel)
1273 1 : // Fail-safe to protect against compacting the same sstable concurrently.
1274 1 : if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
1275 1 : p.addScoresToPickedCompactionMetrics(pc, scores)
1276 1 : pc.score = info.compensatedScoreRatio
1277 1 : // TODO(bananabrick): Create an EventListener for logCompaction.
1278 1 : if false {
1279 0 : logCompaction(pc)
1280 0 : }
1281 1 : return pc
1282 : }
1283 : }
1284 :
1285 : // Check for files which contain excessive point tombstones that could slow
1286 : // down reads. Unlike elision-only compactions, these compactions may select
1287 : // a file at any level rather than only the lowest level.
1288 1 : if pc := p.pickTombstoneDensityCompaction(env); pc != nil {
1289 1 : return pc
1290 1 : }
1291 :
1292 : // Check for L6 files with tombstones that may be elided. These files may
1293 : // exist because a snapshot prevented the elision of a tombstone or because
1294 : // of a move compaction. These are low-priority compactions because they
1295 : // don't help us keep up with writes; they just reclaim disk space.
1296 1 : if pc := p.pickElisionOnlyCompaction(env); pc != nil {
1297 1 : return pc
1298 1 : }
1299 :
1300 1 : if pc := p.pickReadTriggeredCompaction(env); pc != nil {
1301 0 : return pc
1302 0 : }
1303 :
1304 : // NB: This should only be run if a read compaction wasn't
1305 : // scheduled.
1306 : //
1307 : // We won't be scheduling a read compaction right now, and in
1308 : // read-heavy workloads, compactions won't be scheduled frequently
1309 : // because flushes aren't frequent. So we need to signal to the
1310 : // iterator to schedule a compaction when it adds compactions to
1311 : // the read compaction queue.
1312 : //
1313 : // The nil check is necessary because some tests do not set this
1314 : // variable, and a few of those tests may not want extra
1315 : // compactions to be scheduled. Performing the check here avoids
1316 : // having to set rescheduleReadCompaction in each of those
1317 : // tests.
1318 1 : if env.readCompactionEnv.rescheduleReadCompaction != nil {
1319 1 : *env.readCompactionEnv.rescheduleReadCompaction = true
1320 1 : }
1321 :
1322 : // At the lowest possible compaction-picking priority, look for files marked
1323 : // for compaction. Pebble will mark files for compaction if they have atomic
1324 : // compaction units that span multiple files. While current Pebble code does
1325 : // not construct such sstables, RocksDB and earlier versions of Pebble may
1326 : // have created them. These split user keys form sets of files that must be
1327 : // compacted together for correctness (referred to as "atomic compaction
1328 : // units" within the code). Rewrite them in-place.
1329 : //
1330 : // It's also possible that a file may have been marked for compaction by
1331 : // even earlier versions of Pebble code, since FileMetadata's
1332 : // MarkedForCompaction field is persisted in the manifest. That's okay. We
1333 : // previously would've ignored the designation, whereas now we'll re-compact
1334 : // the file in place.
1335 1 : if p.vers.Stats.MarkedForCompaction > 0 {
1336 0 : if pc := p.pickRewriteCompaction(env); pc != nil {
1337 0 : return pc
1338 0 : }
1339 : }
1340 :
1341 1 : return nil
1342 : }
1343 :
1344 : func (p *compactionPickerByScore) addScoresToPickedCompactionMetrics(
1345 : pc *pickedCompaction, candInfo [numLevels]candidateLevelInfo,
1346 1 : ) {
1347 1 :
1348 1 : // candInfo is sorted by score, not by compaction level.
1349 1 : infoByLevel := [numLevels]candidateLevelInfo{}
1350 1 : for i := range candInfo {
1351 1 : level := candInfo[i].level
1352 1 : infoByLevel[level] = candInfo[i]
1353 1 : }
1354 : // Gather the compaction scores for the levels participating in the compaction.
1355 1 : pc.pickerMetrics.scores = make([]float64, len(pc.inputs))
1356 1 : inputIdx := 0
1357 1 : for i := range infoByLevel {
1358 1 : if pc.inputs[inputIdx].level == infoByLevel[i].level {
1359 1 : pc.pickerMetrics.scores[inputIdx] = infoByLevel[i].compensatedScoreRatio
1360 1 : inputIdx++
1361 1 : }
1362 1 : if inputIdx == len(pc.inputs) {
1363 1 : break
1364 : }
1365 : }
1366 : }
1367 :
1368 : // elisionOnlyAnnotator is a manifest.Annotator that annotates B-Tree
1369 : // nodes with the *fileMetadata of a file meeting the obsolete keys criteria
1370 : // for an elision-only compaction within the subtree. If multiple files meet
1371 : // the criteria, it chooses whichever file has the lowest LargestSeqNum. The
1372 : // lowest LargestSeqNum file will be the first eligible for an elision-only
1373 : // compaction once snapshots less than or equal to its LargestSeqNum are closed.
1374 : var elisionOnlyAnnotator = &manifest.Annotator[fileMetadata]{
1375 : Aggregator: manifest.PickFileAggregator{
1376 1 : Filter: func(f *fileMetadata) (eligible bool, cacheOK bool) {
1377 1 : if f.IsCompacting() {
1378 1 : return false, true
1379 1 : }
1380 1 : if !f.StatsValid() {
1381 1 : return false, false
1382 1 : }
1383 : // Bottommost files are large and not worthwhile to compact just
1384 : // to remove a few tombstones. Consider a file eligible only if
1385 : // either its own range deletions delete at least 10% of its data or
1386 : // its deletion tombstones make up at least 10% of its entries.
1387 : //
1388 : // TODO(jackson): This does not account for duplicate user keys
1389 : // which may be collapsed. Ideally, we would have 'obsolete keys'
1390 : // statistics that would include tombstones, the keys that are
1391 : // dropped by tombstones and duplicated user keys. See #847.
1392 : //
1393 : // Note that tables containing exclusively range keys (i.e. no point keys,
1394 : // so `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are
1395 : // excluded from elision-only compactions.
1396 : // TODO(travers): Consider an alternative heuristic for elision of range-keys.
1397 1 : return f.Stats.RangeDeletionsBytesEstimate*10 >= f.Size || f.Stats.NumDeletions*10 > f.Stats.NumEntries, true
1398 : },
1399 1 : Compare: func(f1 *fileMetadata, f2 *fileMetadata) bool {
1400 1 : return f1.LargestSeqNum < f2.LargestSeqNum
1401 1 : },
1402 : },
1403 : }
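     : // A worked example of the Filter rule above (editorial sketch; sizes and
     : // counts are hypothetical): a 100MB bottommost file with
     : // RangeDeletionsBytesEstimate=12MB is eligible, since 12MB*10 >= 100MB. A
     : // file with NumEntries=1000 and NumDeletions=50 is not eligible on the
     : // tombstone criterion, since 50*10 = 500 is not greater than 1000.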
1404 :
1405 : // markedForCompactionAnnotator is a manifest.Annotator that annotates B-Tree
1406 : // nodes with the *fileMetadata of a file that is marked for compaction
1407 : // within the subtree. If multiple files meet the criteria, it chooses
1408 : // whichever file has the lowest LargestSeqNum.
1409 : var markedForCompactionAnnotator = &manifest.Annotator[fileMetadata]{
1410 : Aggregator: manifest.PickFileAggregator{
1411 0 : Filter: func(f *fileMetadata) (eligible bool, cacheOK bool) {
1412 0 : return f.MarkedForCompaction, true
1413 0 : },
1414 0 : Compare: func(f1 *fileMetadata, f2 *fileMetadata) bool {
1415 0 : return f1.LargestSeqNum < f2.LargestSeqNum
1416 0 : },
1417 : },
1418 : }
1419 :
1420 : // pickedCompactionFromCandidateFile creates a pickedCompaction from a *fileMetadata
1421 : // with various checks to ensure that the file still exists in the expected level
1422 : // and isn't already being compacted.
1423 : func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
1424 : candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind,
1425 1 : ) *pickedCompaction {
1426 1 : if candidate == nil || candidate.IsCompacting() {
1427 1 : return nil
1428 1 : }
1429 :
1430 1 : var inputs manifest.LevelSlice
1431 1 : if startLevel == 0 {
1432 1 : // Overlapping L0 files must also be compacted alongside the candidate.
1433 1 : inputs = p.vers.Overlaps(0, candidate.UserKeyBounds())
1434 1 : } else {
1435 1 : inputs = p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate)
1436 1 : }
1437 1 : if invariants.Enabled {
1438 1 : found := false
1439 1 : inputs.Each(func(f *fileMetadata) {
1440 1 : if f.FileNum == candidate.FileNum {
1441 1 : found = true
1442 1 : }
1443 : })
1444 1 : if !found {
1445 0 : panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, startLevel))
1446 : }
1447 : }
1448 :
1449 1 : pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel)
1450 1 : pc.kind = kind
1451 1 : pc.startLevel.files = inputs
1452 1 : pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
1453 1 :
1454 1 : // Fail-safe to protect against compacting the same sstable concurrently.
1455 1 : if inputRangeAlreadyCompacting(env, pc) {
1456 1 : return nil
1457 1 : }
1458 :
1459 1 : if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) {
1460 1 : return nil
1461 1 : }
1462 :
1463 1 : return pc
1464 : }
1465 :
1466 : // pickElisionOnlyCompaction looks for compactions of sstables in the
1467 : // bottommost level containing obsolete records that may now be dropped.
1468 : func (p *compactionPickerByScore) pickElisionOnlyCompaction(
1469 : env compactionEnv,
1470 1 : ) (pc *pickedCompaction) {
1471 1 : if p.opts.private.disableElisionOnlyCompactions {
1472 1 : return nil
1473 1 : }
1474 1 : candidate := elisionOnlyAnnotator.LevelAnnotation(p.vers.Levels[numLevels-1])
1475 1 : if candidate == nil {
1476 1 : return nil
1477 1 : }
1478 1 : if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
1479 1 : return nil
1480 1 : }
1481 1 : return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly)
1482 : }
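     : // A worked example of the snapshot gate above (hypothetical seqnums): a
     : // candidate with LargestSeqNum=100 is skipped while earliestSnapshotSeqNum
     : // <= 100, i.e. while an open snapshot could still observe the deleted keys;
     : // the candidate becomes eligible once those snapshots close.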
1483 :
1484 : // pickRewriteCompaction attempts to construct a compaction that
1485 : // rewrites a file marked for compaction. pickRewriteCompaction will
1486 : // pull in adjacent files in the file's atomic compaction unit if
1487 : // necessary. A rewrite compaction outputs files to the same level as
1488 : // the input level.
1489 0 : func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) {
1490 0 : for l := numLevels - 1; l >= 0; l-- {
1491 0 : candidate := markedForCompactionAnnotator.LevelAnnotation(p.vers.Levels[l])
1492 0 : if candidate == nil {
1493 0 : // Try the next level.
1494 0 : continue
1495 : }
1496 0 : pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite)
1497 0 : if pc != nil {
1498 0 : return pc
1499 0 : }
1500 : }
1501 0 : return nil
1502 : }
1503 :
1504 : // pickTombstoneDensityCompaction looks for a compaction that eliminates
1505 : // regions of extremely high point tombstone density. For each level, it picks
1506 : // a file where the ratio of tombstone-dense blocks is at least
1507 : // options.Experimental.TombstoneDenseCompactionThreshold, prioritizing compaction of
1508 : // files with higher ratios of tombstone-dense blocks.
1509 : func (p *compactionPickerByScore) pickTombstoneDensityCompaction(
1510 : env compactionEnv,
1511 1 : ) (pc *pickedCompaction) {
1512 1 : if p.opts.Experimental.TombstoneDenseCompactionThreshold <= 0 {
1513 0 : // Tombstone density compactions are disabled.
1514 0 : return nil
1515 0 : }
1516 :
1517 1 : var candidate *fileMetadata
1518 1 : var level int
1519 1 : // If a candidate file has a very high overlapping ratio, point tombstones
1520 1 : // in it are likely sparse in keyspace even if the sstable itself is tombstone
1521 1 : // dense. These tombstones likely wouldn't be slow to iterate over, so we exclude
1522 1 : // these files from tombstone density compactions. The threshold of 40.0 is
1523 1 : // chosen somewhat arbitrarily, after some observations around excessively large
1524 1 : // tombstone density compactions.
1525 1 : const maxOverlappingRatio = 40.0
1526 1 : // NB: we don't consider the lowest level because elision-only compactions
1527 1 : // handle that case.
1528 1 : lastNonEmptyLevel := numLevels - 1
1529 1 : for l := numLevels - 2; l >= 0; l-- {
1530 1 : iter := p.vers.Levels[l].Iter()
1531 1 : for f := iter.First(); f != nil; f = iter.Next() {
1532 1 : if f.IsCompacting() || !f.StatsValid() || f.Size == 0 {
1533 1 : continue
1534 : }
1535 1 : if f.Stats.TombstoneDenseBlocksRatio < p.opts.Experimental.TombstoneDenseCompactionThreshold {
1536 1 : continue
1537 : }
1538 1 : overlaps := p.vers.Overlaps(lastNonEmptyLevel, f.UserKeyBounds())
1539 1 : if float64(overlaps.SizeSum())/float64(f.Size) > maxOverlappingRatio {
1540 0 : continue
1541 : }
1542 1 : if candidate == nil || candidate.Stats.TombstoneDenseBlocksRatio < f.Stats.TombstoneDenseBlocksRatio {
1543 1 : candidate = f
1544 1 : level = l
1545 1 : }
1546 : }
1547 : // We prefer lower-level (i.e. L5) candidates over higher-level (i.e. L4) ones.
1548 1 : if candidate != nil {
1549 1 : break
1550 : }
1551 1 : if !p.vers.Levels[l].Empty() {
1552 1 : lastNonEmptyLevel = l
1553 1 : }
1554 : }
1555 :
1556 1 : return p.pickedCompactionFromCandidateFile(candidate, env, level, defaultOutputLevel(level, p.baseLevel), compactionKindTombstoneDensity)
1557 : }
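     : // A worked example of the overlap filter above (hypothetical sizes): a
     : // 10MB L4 candidate whose key range overlaps 500MB in the last non-empty
     : // level is skipped, since 500/10 = 50 exceeds maxOverlappingRatio (40); the
     : // same candidate overlapping 300MB stays eligible, since 300/10 = 30.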
1558 :
1559 : // pickAutoLPositive picks an automatic compaction for the candidate
1560 : // file in a positive-numbered level. This function must not be used for
1561 : // L0.
1562 : func pickAutoLPositive(
1563 : env compactionEnv, opts *Options, vers *version, cInfo candidateLevelInfo, baseLevel int,
1564 1 : ) (pc *pickedCompaction) {
1565 1 : if cInfo.level == 0 {
1566 0 : panic("pebble: pickAutoLPositive called for L0")
1567 : }
1568 :
1569 1 : pc = newPickedCompaction(opts, vers, cInfo.level, defaultOutputLevel(cInfo.level, baseLevel), baseLevel)
1570 1 : if pc.outputLevel.level != cInfo.outputLevel {
1571 0 : panic("pebble: compaction picked unexpected output level")
1572 : }
1573 1 : pc.startLevel.files = cInfo.file.Slice()
1574 1 : // Files in level 0 may overlap each other, so pick up all overlapping ones.
1575 1 : if pc.startLevel.level == 0 {
1576 0 : cmp := opts.Comparer.Compare
1577 0 : smallest, largest := manifest.KeyRange(cmp, pc.startLevel.files.Iter())
1578 0 : pc.startLevel.files = vers.Overlaps(0, base.UserKeyBoundsFromInternal(smallest, largest))
1579 0 : if pc.startLevel.files.Empty() {
1580 0 : panic("pebble: empty compaction")
1581 : }
1582 : }
1583 :
1584 1 : if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) {
1585 0 : return nil
1586 0 : }
1587 1 : return pc.maybeAddLevel(opts, env.diskAvailBytes)
1588 : }
1589 :
1590 : // maybeAddLevel adds another input level to the picked compaction if the configured multilevel heuristic permits it.
1591 1 : func (pc *pickedCompaction) maybeAddLevel(opts *Options, diskAvailBytes uint64) *pickedCompaction {
1592 1 : pc.pickerMetrics.singleLevelOverlappingRatio = pc.overlappingRatio()
1593 1 : if pc.outputLevel.level == numLevels-1 {
1594 1 : // Don't add a level if the current output level is already L6.
1595 1 : return pc
1596 1 : }
1597 1 : if !opts.Experimental.MultiLevelCompactionHeuristic.allowL0() && pc.startLevel.level == 0 {
1598 1 : return pc
1599 1 : }
1600 1 : if pc.compactionSize() > expandedCompactionByteSizeLimit(
1601 1 : opts, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel), diskAvailBytes) {
1602 1 : // Don't add a level if the current compaction exceeds the compaction size limit
1603 1 : return pc
1604 1 : }
1605 1 : return opts.Experimental.MultiLevelCompactionHeuristic.pick(pc, opts, diskAvailBytes)
1606 : }
1607 :
1608 : // MultiLevelHeuristic evaluates whether to add files from the next level into the compaction.
1609 : type MultiLevelHeuristic interface {
1610 : // pick returns the preferred compaction.
1611 : pick(pc *pickedCompaction, opts *Options, diskAvailBytes uint64) *pickedCompaction
1612 :
1613 : // allowL0 returns whether the heuristic allows L0 to be involved in a multilevel compaction.
1614 : allowL0() bool
1615 :
1616 : // String implements fmt.Stringer.
1617 : String() string
1618 : }
1619 :
1620 : // NoMultiLevel will never add an additional level to the compaction.
1621 : type NoMultiLevel struct{}
1622 :
1623 : var _ MultiLevelHeuristic = (*NoMultiLevel)(nil)
1624 :
1625 : func (nml NoMultiLevel) pick(
1626 : pc *pickedCompaction, opts *Options, diskAvailBytes uint64,
1627 1 : ) *pickedCompaction {
1628 1 : return pc
1629 1 : }
1630 :
1631 1 : func (nml NoMultiLevel) allowL0() bool { return false }
1632 1 : func (nml NoMultiLevel) String() string { return "none" }
1633 :
1634 1 : func (pc *pickedCompaction) predictedWriteAmp() float64 {
1635 1 : var bytesToCompact uint64
1636 1 : var higherLevelBytes uint64
1637 1 : for i := range pc.inputs {
1638 1 : levelSize := pc.inputs[i].files.SizeSum()
1639 1 : bytesToCompact += levelSize
1640 1 : if i != len(pc.inputs)-1 {
1641 1 : higherLevelBytes += levelSize
1642 1 : }
1643 : }
1644 1 : return float64(bytesToCompact) / float64(higherLevelBytes)
1645 : }
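     : // A worked example (hypothetical sizes): for a two-level compaction reading
     : // 10MB from L4 and 90MB from L5, bytesToCompact = 100MB and
     : // higherLevelBytes = 10MB, so the predicted write amp is 100/10 = 10:
     : // clearing 10MB out of L4 costs rewriting 100MB in total.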
1646 :
1647 1 : func (pc *pickedCompaction) overlappingRatio() float64 {
1648 1 : var higherLevelBytes uint64
1649 1 : var lowestLevelBytes uint64
1650 1 : for i := range pc.inputs {
1651 1 : levelSize := pc.inputs[i].files.SizeSum()
1652 1 : if i == len(pc.inputs)-1 {
1653 1 : lowestLevelBytes += levelSize
1654 1 : continue
1655 : }
1656 1 : higherLevelBytes += levelSize
1657 : }
1658 1 : return float64(lowestLevelBytes) / float64(higherLevelBytes)
1659 : }
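     : // Continuing the example above: with 10MB of input from the higher levels
     : // and 90MB in the lowest input level, the overlapping ratio is 90/10 = 9.
     : // A large ratio means a small amount of higher-level data overlaps a
     : // disproportionately large region of the output level.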
1660 :
1661 : // WriteAmpHeuristic defines a multilevel compaction heuristic that adds an
1662 : // additional level to the picked compaction if the multilevel version's
1663 : // predicted write amp is at most the single-level write amp plus AddPropensity.
1664 : type WriteAmpHeuristic struct {
1665 : // AddPropensity is a constant that affects the propensity to conduct multilevel
1666 : // compactions. If positive, a multilevel compaction may get picked even if
1667 : // the single-level compaction has lower write amp, and vice versa.
1668 : AddPropensity float64
1669 :
1670 : // AllowL0, if true, allows L0 to be involved in a multilevel compaction.
1671 : AllowL0 bool
1672 : }
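     : // A minimal configuration sketch (editorial addition, not part of the
     : // original source; values are hypothetical). The heuristic is installed via
     : // the Options.Experimental.MultiLevelCompactionHeuristic field consulted by
     : // maybeAddLevel above:
     : //
     : //	opts := &Options{}
     : //	opts.Experimental.MultiLevelCompactionHeuristic = WriteAmpHeuristic{
     : //		// With zero propensity, the multilevel candidate is chosen only
     : //		// when it does not predict more write amp than the single-level
     : //		// compaction.
     : //		AddPropensity: 0.0,
     : //		AllowL0:       false,
     : //	}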
1673 :
1674 : var _ MultiLevelHeuristic = (*WriteAmpHeuristic)(nil)
1675 :
1676 : // TODO(msbutler): microbenchmark the extent to which multilevel compaction
1677 : // picking slows down the compaction picking process. This should be as fast as
1678 : // possible since compaction picking holds d.mu, which prevents WAL rotations,
1679 : // in-progress flushes and compactions from completing, etc. Consider ways to
1680 : // deduplicate work, given that setupInputs has already been called.
1681 : func (wa WriteAmpHeuristic) pick(
1682 : pcOrig *pickedCompaction, opts *Options, diskAvailBytes uint64,
1683 1 : ) *pickedCompaction {
1684 1 : pcMulti := pcOrig.clone()
1685 1 : if !pcMulti.setupMultiLevelCandidate(opts, diskAvailBytes) {
1686 1 : return pcOrig
1687 1 : }
1688 : // We consider the addition of a level as an "expansion" of the compaction.
1689 : // If pcMulti is past the expanded compaction byte size limit already,
1690 : // we don't consider it.
1691 1 : if pcMulti.compactionSize() >= expandedCompactionByteSizeLimit(
1692 1 : opts, adjustedOutputLevel(pcMulti.outputLevel.level, pcMulti.baseLevel), diskAvailBytes) {
1693 1 : return pcOrig
1694 1 : }
1695 1 : picked := pcOrig
1696 1 : if pcMulti.predictedWriteAmp() <= pcOrig.predictedWriteAmp()+wa.AddPropensity {
1697 1 : picked = pcMulti
1698 1 : }
1699 : // Regardless of what compaction was picked, log the multilevelOverlapping ratio.
1700 1 : picked.pickerMetrics.multiLevelOverlappingRatio = pcMulti.overlappingRatio()
1701 1 : return picked
1702 : }
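     : // A worked example of the decision rule above (hypothetical numbers): with
     : // AddPropensity=0.3, a single-level compaction with predicted write amp 3.0
     : // is replaced by a multilevel candidate predicting 3.2, since 3.2 <= 3.0+0.3;
     : // a candidate predicting 3.4 would be rejected.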
1703 :
1704 1 : func (wa WriteAmpHeuristic) allowL0() bool {
1705 1 : return wa.AllowL0
1706 1 : }
1707 :
1708 : // String implements fmt.Stringer.
1709 1 : func (wa WriteAmpHeuristic) String() string {
1710 1 : return fmt.Sprintf("wamp(%.2f, %t)", wa.AddPropensity, wa.AllowL0)
1711 1 : }
1712 :
1713 : // pickL0 is a helper to pick compactions originating from L0, using
1714 : // information about sublevels to generate the compaction.
1715 1 : func pickL0(env compactionEnv, opts *Options, vers *version, baseLevel int) (pc *pickedCompaction) {
1716 1 : // It is important to pass information about Lbase files to L0Sublevels
1717 1 : // so it can pick a compaction that does not conflict with an Lbase => Lbase+1
1718 1 : // compaction. Without this, we observed reduced concurrency of L0=>Lbase
1719 1 : // compactions, and increased read amplification in L0.
1720 1 : //
1721 1 : // TODO(bilal) Remove the minCompactionDepth parameter once fixing it at 1
1722 1 : // has been shown to not cause a performance regression.
1723 1 : lcf, err := vers.L0Sublevels.PickBaseCompaction(1, vers.Levels[baseLevel].Slice())
1724 1 : if err != nil {
1725 0 : opts.Logger.Errorf("error when picking base compaction: %s", err)
1726 0 : return
1727 0 : }
1728 1 : if lcf != nil {
1729 1 : pc = newPickedCompactionFromL0(lcf, opts, vers, baseLevel, true)
1730 1 : pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel)
1731 1 : if pc.startLevel.files.Empty() {
1732 0 : opts.Logger.Fatalf("empty compaction chosen")
1733 0 : }
1734 1 : return pc.maybeAddLevel(opts, env.diskAvailBytes)
1735 : }
1736 :
1737 : // Couldn't choose a base compaction. Try choosing an intra-L0
1738 : // compaction. Note that we pass in minIntraL0Count here as opposed to
1739 : // 1, since choosing a single-sublevel intra-L0 compaction is
1740 : // counterproductive.
1741 1 : lcf, err = vers.L0Sublevels.PickIntraL0Compaction(env.earliestUnflushedSeqNum, minIntraL0Count)
1742 1 : if err != nil {
1743 0 : opts.Logger.Errorf("error when picking intra-L0 compaction: %s", err)
1744 0 : return
1745 0 : }
1746 1 : if lcf != nil {
1747 1 : pc = newPickedCompactionFromL0(lcf, opts, vers, 0, false)
1748 1 : if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) {
1749 0 : return nil
1750 0 : }
1751 1 : if pc.startLevel.files.Empty() {
1752 0 : opts.Logger.Fatalf("empty compaction chosen")
1753 0 : }
1754 1 : {
1755 1 : iter := pc.startLevel.files.Iter()
1756 1 : if iter.First() == nil || iter.Next() == nil {
1757 0 : // A single-file intra-L0 compaction is unproductive.
1758 0 : return nil
1759 0 : }
1760 : }
1761 :
1762 1 : pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
1763 : }
1764 1 : return pc
1765 : }
1766 :
1767 : func pickManualCompaction(
1768 : vers *version, opts *Options, env compactionEnv, baseLevel int, manual *manualCompaction,
1769 1 : ) (pc *pickedCompaction, retryLater bool) {
1770 1 : outputLevel := manual.level + 1
1771 1 : if manual.level == 0 {
1772 1 : outputLevel = baseLevel
1773 1 : } else if manual.level < baseLevel {
1774 1 : // The start level for a compaction must be >= Lbase. A manual
1775 1 : // compaction could have been created adhering to that condition, and
1776 1 : // then an automatic compaction came in and compacted all of the
1777 1 : // sstables in Lbase to Lbase+1 which caused Lbase to change. Simply
1778 1 : // ignore this manual compaction as there is nothing to do (manual.level
1779 1 : // points to an empty level).
1780 1 : return nil, false
1781 1 : }
1782 : // This conflictsWithInProgress call is necessary for the manual compaction to
1783 : // be retried when it conflicts with an ongoing automatic compaction. Without
1784 : // it, the compaction is dropped due to pc.setupInputs returning false since
1785 : // the input/output range is already being compacted, and the manual
1786 : // compaction ends with a non-compacted LSM.
1787 1 : if conflictsWithInProgress(manual, outputLevel, env.inProgressCompactions, opts.Comparer.Compare) {
1788 1 : return nil, true
1789 1 : }
1790 1 : pc = newPickedCompaction(opts, vers, manual.level, defaultOutputLevel(manual.level, baseLevel), baseLevel)
1791 1 : manual.outputLevel = pc.outputLevel.level
1792 1 : pc.startLevel.files = vers.Overlaps(manual.level, base.UserKeyBoundsInclusive(manual.start, manual.end))
1793 1 : if pc.startLevel.files.Empty() {
1794 1 : // Nothing to do
1795 1 : return nil, false
1796 1 : }
1797 1 : if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) {
1798 1 : // setupInputs returned false indicating there's a conflicting
1799 1 : // concurrent compaction.
1800 1 : return nil, true
1801 1 : }
1802 1 : if pc = pc.maybeAddLevel(opts, env.diskAvailBytes); pc == nil {
1803 0 : return nil, false
1804 0 : }
1805 1 : if pc.outputLevel.level != outputLevel {
1806 1 : if len(pc.extraLevels) > 0 {
1807 1 : // multilevel compactions relax this invariant
1808 1 : } else {
1809 0 : panic("pebble: compaction picked unexpected output level")
1810 : }
1811 : }
1812 : // Fail-safe to protect against compacting the same sstable concurrently.
1813 1 : if inputRangeAlreadyCompacting(env, pc) {
1814 0 : return nil, true
1815 0 : }
1816 1 : return pc, false
1817 : }
1818 :
1819 : // pickDownloadCompaction picks a download compaction for the given file,
1820 : // which can be performed either as a copy compaction of the backing file
1821 : // or as a rewrite compaction.
1822 : func pickDownloadCompaction(
1823 : vers *version,
1824 : opts *Options,
1825 : env compactionEnv,
1826 : baseLevel int,
1827 : kind compactionKind,
1828 : level int,
1829 : file *fileMetadata,
1830 1 : ) (pc *pickedCompaction) {
1831 1 : // Check if the file is compacting already.
1832 1 : if file.CompactionState == manifest.CompactionStateCompacting {
1833 0 : return nil
1834 0 : }
1835 1 : if kind != compactionKindCopy && kind != compactionKindRewrite {
1836 0 : panic("invalid download/rewrite compaction kind")
1837 : }
1838 1 : pc = newPickedCompaction(opts, vers, level, level, baseLevel)
1839 1 : pc.kind = kind
1840 1 : pc.startLevel.files = manifest.NewLevelSliceKeySorted(opts.Comparer.Compare, []*fileMetadata{file})
1841 1 : if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) {
1842 0 : // setupInputs returned false indicating there's a conflicting
1843 0 : // concurrent compaction.
1844 0 : return nil
1845 0 : }
1846 1 : if pc.outputLevel.level != level {
1847 0 : panic("pebble: download compaction picked unexpected output level")
1848 : }
1849 : // Fail-safe to protect against compacting the same sstable concurrently.
1850 1 : if inputRangeAlreadyCompacting(env, pc) {
1851 0 : return nil
1852 0 : }
1853 1 : return pc
1854 : }
1855 :
1856 : func (p *compactionPickerByScore) pickReadTriggeredCompaction(
1857 : env compactionEnv,
1858 1 : ) (pc *pickedCompaction) {
1859 1 : // If a flush is in progress or expected to happen soon, more writes are taking place, and we
1860 1 : // would soon be scheduling more write-focused compactions. In this case, skip read compactions,
1861 1 : // as they are lower priority.
1862 1 : if env.readCompactionEnv.flushing || env.readCompactionEnv.readCompactions == nil {
1863 1 : return nil
1864 1 : }
1865 1 : for env.readCompactionEnv.readCompactions.size > 0 {
1866 0 : rc := env.readCompactionEnv.readCompactions.remove()
1867 0 : if pc = pickReadTriggeredCompactionHelper(p, rc, env); pc != nil {
1868 0 : break
1869 : }
1870 : }
1871 1 : return pc
1872 : }
1873 :
1874 : func pickReadTriggeredCompactionHelper(
1875 : p *compactionPickerByScore, rc *readCompaction, env compactionEnv,
1876 0 : ) (pc *pickedCompaction) {
1877 0 : overlapSlice := p.vers.Overlaps(rc.level, base.UserKeyBoundsInclusive(rc.start, rc.end))
1878 0 : if overlapSlice.Empty() {
1879 0 : // If there is no overlap, then the file with the key range
1880 0 : // must have been compacted away. So, we don't proceed to
1881 0 : // compact the same key range again.
1882 0 : return nil
1883 0 : }
1884 :
1885 0 : iter := overlapSlice.Iter()
1886 0 : var fileMatches bool
1887 0 : for f := iter.First(); f != nil; f = iter.Next() {
1888 0 : if f.FileNum == rc.fileNum {
1889 0 : fileMatches = true
1890 0 : break
1891 : }
1892 : }
1893 0 : if !fileMatches {
1894 0 : return nil
1895 0 : }
1896 :
1897 0 : pc = newPickedCompaction(p.opts, p.vers, rc.level, defaultOutputLevel(rc.level, p.baseLevel), p.baseLevel)
1898 0 :
1899 0 : pc.startLevel.files = overlapSlice
1900 0 : if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) {
1901 0 : return nil
1902 0 : }
1903 0 : if inputRangeAlreadyCompacting(env, pc) {
1904 0 : return nil
1905 0 : }
1906 0 : pc.kind = compactionKindRead
1907 0 :
1908 0 : // Prevent read compactions which are too wide.
1909 0 : outputOverlaps := pc.version.Overlaps(pc.outputLevel.level, pc.userKeyBounds())
1910 0 : if outputOverlaps.SizeSum() > pc.maxReadCompactionBytes {
1911 0 : return nil
1912 0 : }
1913 :
1914 : // Prevent compactions that start with a small seed file X but overlap
1915 : // with more than allowedCompactionWidth times X's size in the output level.
1916 0 : const allowedCompactionWidth = 35
1917 0 : if outputOverlaps.SizeSum() > overlapSlice.SizeSum()*allowedCompactionWidth {
1918 0 : return nil
1919 0 : }
1920 :
1921 0 : return pc
1922 : }
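     : // A worked example of the width check above (hypothetical sizes): a read
     : // compaction whose overlap slice totals 2MB is rejected if the output-level
     : // overlap exceeds 2MB*35 = 70MB, even when that overlap is still below
     : // maxReadCompactionBytes.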
1923 :
1924 0 : func (p *compactionPickerByScore) forceBaseLevel1() {
1925 0 : p.baseLevel = 1
1926 0 : }
1927 :
1928 1 : func inputRangeAlreadyCompacting(env compactionEnv, pc *pickedCompaction) bool {
1929 1 : for _, cl := range pc.inputs {
1930 1 : iter := cl.files.Iter()
1931 1 : for f := iter.First(); f != nil; f = iter.Next() {
1932 1 : if f.IsCompacting() {
1933 1 : return true
1934 1 : }
1935 : }
1936 : }
1937 :
1938 : // Look for active compactions outputting to the same region of the key
1939 : // space in the same output level. Two potential compactions may conflict
1940 : // without sharing input files if there are no files in the output level
1941 : // that overlap with the intersection of the compactions' key spaces.
1942 : //
1943 : // Consider an active L0->Lbase compaction compacting two L0 files one
1944 : // [a-f] and the other [t-z] into Lbase.
1945 : //
1946 : // L0
1947 : // ↦ 000100 ↤ ↦ 000101 ↤
1948 : // L1
1949 : // ↦ 000004 ↤
1950 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
1951 : //
1952 : // If a new file 000102 [j-p] is flushed while the existing compaction is
1953 : // still ongoing, the new file would not be in any compacting sublevel
1954 : // intervals and would not overlap with any Lbase files that are also
1955 : // compacting. However, this compaction cannot be picked because the
1956 : // compaction's output key space [j-p] would overlap the existing
1957 : // compaction's output key space [a-z].
1958 : //
1959 : // L0
1960 : // ↦ 000100* ↤ ↦ 000102 ↤ ↦ 000101* ↤
1961 : // L1
1962 : // ↦ 000004* ↤
1963 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
1964 : //
1965 : // * - currently compacting
1966 1 : if pc.outputLevel != nil && pc.outputLevel.level != 0 {
1967 1 : for _, c := range env.inProgressCompactions {
1968 1 : if pc.outputLevel.level != c.outputLevel {
1969 1 : continue
1970 : }
1971 1 : if base.InternalCompare(pc.cmp, c.largest, pc.smallest) < 0 ||
1972 1 : base.InternalCompare(pc.cmp, c.smallest, pc.largest) > 0 {
1973 1 : continue
1974 : }
1975 :
1976 : // The picked compaction and the in-progress compaction c are
1977 : // outputting to the same region of the key space of the same
1978 : // level.
1979 1 : return true
1980 : }
1981 : }
1982 1 : return false
1983 : }
1984 :
1985 : // conflictsWithInProgress checks whether any in-progress compaction overlaps the manual compaction's keyspace at the relevant levels.
1986 : func conflictsWithInProgress(
1987 : manual *manualCompaction, outputLevel int, inProgressCompactions []compactionInfo, cmp Compare,
1988 1 : ) bool {
1989 1 : for _, c := range inProgressCompactions {
1990 1 : if (c.outputLevel == manual.level || c.outputLevel == outputLevel) &&
1991 1 : isUserKeysOverlapping(manual.start, manual.end, c.smallest.UserKey, c.largest.UserKey, cmp) {
1992 1 : return true
1993 1 : }
1994 1 : for _, in := range c.inputs {
1995 1 : if in.files.Empty() {
1996 1 : continue
1997 : }
1998 1 : iter := in.files.Iter()
1999 1 : smallest := iter.First().Smallest.UserKey
2000 1 : largest := iter.Last().Largest.UserKey
2001 1 : if (in.level == manual.level || in.level == outputLevel) &&
2002 1 : isUserKeysOverlapping(manual.start, manual.end, smallest, largest, cmp) {
2003 1 : return true
2004 1 : }
2005 : }
2006 : }
2007 1 : return false
2008 : }
2009 :
2010 1 : func isUserKeysOverlapping(x1, x2, y1, y2 []byte, cmp Compare) bool {
2011 1 : return cmp(x1, y2) <= 0 && cmp(y1, x2) <= 0
2012 1 : }
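     : // For example, with a bytewise comparator the inclusive ranges [a,f] and
     : // [d,k] overlap, since cmp(a,k) <= 0 and cmp(d,f) <= 0, while [a,f] and
     : // [g,k] do not, since cmp(g,f) > 0.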