1 : // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "fmt"
9 : "math"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/invariants"
14 : "github.com/cockroachdb/pebble/internal/keyspan"
15 : "github.com/cockroachdb/pebble/internal/manifest"
16 : "github.com/cockroachdb/pebble/sstable"
17 : )
18 :
19 : // In-memory statistics about tables help inform compaction picking, but may
20 : // be expensive to calculate or load from disk. Every time a database is
21 : // opened, these statistics must be reloaded or recalculated. To minimize
22 : // impact on user activity and compactions, we load these statistics
23 : // asynchronously in the background and store loaded statistics in each
24 : // table's *FileMetadata.
25 : //
26 : // This file implements the asynchronous loading of statistics by maintaining
27 : // a list of files that require statistics, alongside their LSM levels.
28 : // Whenever new files are added to the LSM, the files are appended to
29 : // d.mu.tableStats.pending. If a stats collection job is not currently
30 : // running, one is started in a separate goroutine.
31 : //
32 : // The stats collection job grabs and clears the pending list, computes table
33 : // statistics relative to the current readState and updates the tables' file
34 : // metadata. New pending files may accumulate during a stats collection job,
35 : // so a completing job triggers a new job if necessary. Only one job runs at a
36 : // time.
37 : //
38 : // When an existing database is opened, all files lack in-memory statistics.
39 : // These files' stats are loaded incrementally whenever the pending list is
40 : // empty by scanning a current readState for files missing statistics. Once a
41 : // job completes a scan and finds no files still missing statistics, it flips
42 : // a `loadedInitial` flag. From then on, the stats
43 : // collection job only needs to load statistics for new files appended to the
44 : // pending list.
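
// To make the scheme described above concrete, the following stand-alone
// sketch (not part of this package; every name in it is hypothetical) shows
// the same "pending list + at most one background collector" pattern in
// isolation:

package main

import (
	"fmt"
	"sync"
)

// statsQueue is a simplified stand-in for the d.mu.tableStats bookkeeping: a
// pending list plus a flag ensuring at most one collection job runs at once.
type statsQueue struct {
	mu      sync.Mutex
	pending []string       // stand-in for []manifest.NewFileEntry
	loading bool           // true while a collector goroutine is running
	wg      sync.WaitGroup // example-only: lets main wait for the collector
}

// add appends work and starts a collector goroutine if none is running,
// mirroring updateTableStatsLocked + maybeCollectTableStatsLocked.
func (q *statsQueue) add(files ...string) {
	q.mu.Lock()
	defer q.mu.Unlock()
	q.pending = append(q.pending, files...)
	if !q.loading {
		q.loading = true
		q.wg.Add(1)
		go q.collect()
	}
}

// collect repeatedly grabs and clears the pending list, performs the (mock)
// expensive work without holding the mutex, and exits once nothing remains.
func (q *statsQueue) collect() {
	defer q.wg.Done()
	for {
		q.mu.Lock()
		work := q.pending
		q.pending = nil
		if len(work) == 0 {
			q.loading = false
			q.mu.Unlock()
			return
		}
		q.mu.Unlock()
		for _, f := range work {
			fmt.Println("computing stats for", f) // stands in for sstable IO
		}
	}
}

func main() {
	var q statsQueue
	q.add("000001.sst", "000002.sst")
	q.add("000003.sst")
	q.wg.Wait()
}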
45 :
46 1 : func (d *DB) maybeCollectTableStatsLocked() {
47 1 : if d.shouldCollectTableStatsLocked() {
48 1 : go d.collectTableStats()
49 1 : }
50 : }
51 :
52 : // updateTableStatsLocked is called when new files are introduced, after the
53 : // read state has been updated. It may trigger a new stat collection.
54 : // DB.mu must be locked when calling.
55 1 : func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
56 1 : var needStats bool
57 1 : for _, nf := range newFiles {
58 1 : if !nf.Meta.StatsValid() {
59 1 : needStats = true
60 1 : break
61 : }
62 : }
63 1 : if !needStats {
64 1 : return
65 1 : }
66 :
67 1 : d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
68 1 : d.maybeCollectTableStatsLocked()
69 : }
70 :
71 1 : func (d *DB) shouldCollectTableStatsLocked() bool {
72 1 : return !d.mu.tableStats.loading &&
73 1 : d.closed.Load() == nil &&
74 1 : !d.opts.DisableTableStats &&
75 1 : (len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
76 1 : }
77 :
78 : // collectTableStats runs a table stats collection job, returning true if the
79 : // invocation did the collection work, false otherwise (e.g. if another job was
80 : // already running).
81 1 : func (d *DB) collectTableStats() bool {
82 1 : const maxTableStatsPerScan = 50
83 1 :
84 1 : d.mu.Lock()
85 1 : if !d.shouldCollectTableStatsLocked() {
86 1 : d.mu.Unlock()
87 1 : return false
88 1 : }
89 :
90 1 : pending := d.mu.tableStats.pending
91 1 : d.mu.tableStats.pending = nil
92 1 : d.mu.tableStats.loading = true
93 1 : jobID := d.mu.nextJobID
94 1 : d.mu.nextJobID++
95 1 : loadedInitial := d.mu.tableStats.loadedInitial
96 1 : // Drop DB.mu before performing IO.
97 1 : d.mu.Unlock()
98 1 :
99 1 : // Every run of collectTableStats either collects stats from the pending
100 1 : // list (if non-empty) or from scanning the version (loadedInitial is
101 1 : // false). This job only runs if at least one of those conditions holds.
102 1 :
103 1 : // Grab a read state to scan for tables.
104 1 : rs := d.loadReadState()
105 1 : var collected []collectedStats
106 1 : var hints []deleteCompactionHint
107 1 : if len(pending) > 0 {
108 1 : collected, hints = d.loadNewFileStats(rs, pending)
109 1 : } else {
110 1 : var moreRemain bool
111 1 : var buf [maxTableStatsPerScan]collectedStats
112 1 : collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
113 1 : loadedInitial = !moreRemain
114 1 : }
115 1 : rs.unref()
116 1 :
117 1 : // Update the FileMetadata with the loaded stats while holding d.mu.
118 1 : d.mu.Lock()
119 1 : defer d.mu.Unlock()
120 1 : d.mu.tableStats.loading = false
121 1 : if loadedInitial && !d.mu.tableStats.loadedInitial {
122 1 : d.mu.tableStats.loadedInitial = loadedInitial
123 1 : d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
124 1 : JobID: jobID,
125 1 : })
126 1 : }
127 :
128 1 : maybeCompact := false
129 1 : for _, c := range collected {
130 1 : c.fileMetadata.Stats = c.TableStats
131 1 : maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0
132 1 : c.fileMetadata.StatsMarkValid()
133 1 : }
134 1 : d.mu.tableStats.cond.Broadcast()
135 1 : d.maybeCollectTableStatsLocked()
136 1 : if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions {
137 1 : // Verify that all of the hint tombstones' files still exist in the
138 1 : // current version. Otherwise, the tombstone itself may have been
139 1 : // compacted into L6 and more recent keys may have had their sequence
140 1 : // numbers zeroed.
141 1 : //
142 1 : // Note that it's possible that the tombstone file is being compacted
143 1 : // presently. In that case, the file will be present in v. When the
144 1 : // compaction finishes compacting the tombstone file, it will detect
145 1 : // and clear the hint.
146 1 : //
147 1 : // See DB.maybeUpdateDeleteCompactionHints.
148 1 : v := d.mu.versions.currentVersion()
149 1 : keepHints := hints[:0]
150 1 : for _, h := range hints {
151 1 : if v.Contains(h.tombstoneLevel, d.cmp, h.tombstoneFile) {
152 1 : keepHints = append(keepHints, h)
153 1 : }
154 : }
155 1 : d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
156 : }
157 1 : if maybeCompact {
158 1 : d.maybeScheduleCompaction()
159 1 : }
160 1 : return true
161 : }
162 :
163 : type collectedStats struct {
164 : *fileMetadata
165 : manifest.TableStats
166 : }
167 :
168 : func (d *DB) loadNewFileStats(
169 : rs *readState, pending []manifest.NewFileEntry,
170 1 : ) ([]collectedStats, []deleteCompactionHint) {
171 1 : var hints []deleteCompactionHint
172 1 : collected := make([]collectedStats, 0, len(pending))
173 1 : for _, nf := range pending {
174 1 : // A file's stats might have been populated by an earlier call to
175 1 : // loadNewFileStats if the file was moved.
176 1 : // NB: We're not holding d.mu which protects f.Stats, but only
177 1 : // collectTableStats updates f.Stats for active files, and we
178 1 : // ensure only one goroutine runs it at a time through
179 1 : // d.mu.tableStats.loading.
180 1 : if nf.Meta.StatsValid() {
181 1 : continue
182 : }
183 :
184 : // The file isn't guaranteed to still be live in the readState's
185 : // version. It may have been deleted or moved. Skip it if it's not in
186 : // the expected level.
187 1 : if !rs.current.Contains(nf.Level, d.cmp, nf.Meta) {
188 1 : continue
189 : }
190 :
191 1 : stats, newHints, err := d.loadTableStats(
192 1 : rs.current, nf.Level,
193 1 : nf.Meta,
194 1 : )
195 1 : if err != nil {
196 0 : d.opts.EventListener.BackgroundError(err)
197 0 : continue
198 : }
199 : // NB: We don't update the FileMetadata yet, because we aren't
200 : // holding DB.mu. We'll copy it to the FileMetadata after we're
201 : // finished with IO.
202 1 : collected = append(collected, collectedStats{
203 1 : fileMetadata: nf.Meta,
204 1 : TableStats: stats,
205 1 : })
206 1 : hints = append(hints, newHints...)
207 : }
208 1 : return collected, hints
209 : }
210 :
211 : // scanReadStateTableStats is run by an active stat collection job when there
212 : // are no pending new files, but there might be files that existed at Open for
213 : // which we haven't loaded table stats.
214 : func (d *DB) scanReadStateTableStats(
215 : rs *readState, fill []collectedStats,
216 1 : ) ([]collectedStats, []deleteCompactionHint, bool) {
217 1 : moreRemain := false
218 1 : var hints []deleteCompactionHint
219 1 : sizesChecked := make(map[base.DiskFileNum]struct{})
220 1 : for l, levelMetadata := range rs.current.Levels {
221 1 : iter := levelMetadata.Iter()
222 1 : for f := iter.First(); f != nil; f = iter.Next() {
223 1 : // NB: We're not holding d.mu which protects f.Stats, but only the
224 1 : // active stats collection job updates f.Stats for active files,
225 1 : // and we ensure only one goroutine runs it at a time through
226 1 : // d.mu.tableStats.loading. This makes it safe to read validity
227 1 : // through f.Stats.ValidLocked despite not holding d.mu.
228 1 : if f.StatsValid() {
229 1 : continue
230 : }
231 :
232 : // Limit how much work we do per read state. The older the read
233 : // state is, the higher the likelihood files are no longer being
234 : // used in the current version. If we've exhausted our allowance,
235 : // return true for the last return value to signal there's more
236 : // work to do.
237 1 : if len(fill) == cap(fill) {
238 1 : moreRemain = true
239 1 : return fill, hints, moreRemain
240 1 : }
241 :
242 : // If the file is remote and not SharedForeign, we should check if its size
243 : // matches. This is because checkConsistency skips over remote files.
244 : //
245 : // SharedForeign and External files are skipped as their sizes are allowed
246 : // to have a mismatch; the size stored in the FileBacking is just the part
247 : // of the file that is referenced by this Pebble instance, not the size of
248 : // the whole object.
249 1 : objMeta, err := d.objProvider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
250 1 : if err != nil {
251 0 : // Set `moreRemain` so we'll try again.
252 0 : moreRemain = true
253 0 : d.opts.EventListener.BackgroundError(err)
254 0 : continue
255 : }
256 :
257 1 : shouldCheckSize := objMeta.IsRemote() &&
258 1 : !d.objProvider.IsSharedForeign(objMeta) &&
259 1 : !objMeta.IsExternal()
260 1 : if _, ok := sizesChecked[f.FileBacking.DiskFileNum]; !ok && shouldCheckSize {
261 1 : size, err := d.objProvider.Size(objMeta)
262 1 : fileSize := f.FileBacking.Size
263 1 : if err != nil {
264 0 : moreRemain = true
265 0 : d.opts.EventListener.BackgroundError(err)
266 0 : continue
267 : }
268 1 : if size != int64(fileSize) {
269 0 : err := errors.Errorf(
270 0 : "during consistency check in loadTableStats: L%d: %s: object size mismatch (%s): %d (provider) != %d (MANIFEST)",
271 0 : errors.Safe(l), f.FileNum, d.objProvider.Path(objMeta),
272 0 : errors.Safe(size), errors.Safe(fileSize))
273 0 : d.opts.EventListener.BackgroundError(err)
274 0 : d.opts.Logger.Fatalf("%s", err)
275 0 : }
276 :
277 1 : sizesChecked[f.FileBacking.DiskFileNum] = struct{}{}
278 : }
279 :
280 1 : stats, newHints, err := d.loadTableStats(
281 1 : rs.current, l, f,
282 1 : )
283 1 : if err != nil {
284 0 : // Set `moreRemain` so we'll try again.
285 0 : moreRemain = true
286 0 : d.opts.EventListener.BackgroundError(err)
287 0 : continue
288 : }
289 1 : fill = append(fill, collectedStats{
290 1 : fileMetadata: f,
291 1 : TableStats: stats,
292 1 : })
293 1 : hints = append(hints, newHints...)
294 : }
295 : }
296 1 : return fill, hints, moreRemain
297 : }
298 :
299 : func (d *DB) loadTableStats(
300 : v *version, level int, meta *fileMetadata,
301 1 : ) (manifest.TableStats, []deleteCompactionHint, error) {
302 1 : var stats manifest.TableStats
303 1 : var compactionHints []deleteCompactionHint
304 1 : err := d.tableCache.withCommonReader(
305 1 : meta, func(r sstable.CommonReader) (err error) {
306 1 : props := r.CommonProperties()
307 1 : stats.NumEntries = props.NumEntries
308 1 : stats.NumDeletions = props.NumDeletions
309 1 : if props.NumPointDeletions() > 0 {
310 1 : if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil {
311 0 : return
312 0 : }
313 : }
314 1 : if props.NumRangeDeletions > 0 || props.NumRangeKeyDels > 0 {
315 1 : if compactionHints, err = d.loadTableRangeDelStats(
316 1 : r, v, level, meta, &stats,
317 1 : ); err != nil {
318 0 : return
319 0 : }
320 : }
321 : // TODO(travers): Once we have real-world data, consider collecting
322 : // additional stats that may provide improved heuristics for compaction
323 : // picking.
324 1 : stats.NumRangeKeySets = props.NumRangeKeySets
325 1 : stats.ValueBlocksSize = props.ValueBlocksSize
326 1 : return
327 : })
328 1 : if err != nil {
329 0 : return stats, nil, err
330 0 : }
331 1 : return stats, compactionHints, nil
332 : }
333 :
334 : // loadTablePointKeyStats calculates the point key statistics for the given
335 : // table. The provided manifest.TableStats are updated.
336 : func (d *DB) loadTablePointKeyStats(
337 : props *sstable.CommonProperties,
338 : v *version,
339 : level int,
340 : meta *fileMetadata,
341 : stats *manifest.TableStats,
342 1 : ) error {
343 1 : // TODO(jackson): If the file has a wide keyspace, the average
344 1 : // value size beneath the entire file might not be representative
345 1 : // of the size of the keys beneath the point tombstones.
346 1 : // We could write the ranges of 'clusters' of point tombstones to
347 1 : // a sstable property and call averageValueSizeBeneath for each of
348 1 : // these narrower ranges to improve the estimate.
349 1 : avgValLogicalSize, compressionRatio, err := d.estimateSizesBeneath(v, level, meta, props)
350 1 : if err != nil {
351 0 : return err
352 0 : }
353 1 : stats.PointDeletionsBytesEstimate =
354 1 : pointDeletionsBytesEstimate(meta.Size, props, avgValLogicalSize, compressionRatio)
355 1 : return nil
356 : }
357 :
358 : // loadTableRangeDelStats calculates the range deletion and range key deletion
359 : // statistics for the given table.
360 : func (d *DB) loadTableRangeDelStats(
361 : r sstable.CommonReader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
362 1 : ) ([]deleteCompactionHint, error) {
363 1 : iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
364 1 : if err != nil {
365 0 : return nil, err
366 0 : }
367 1 : defer iter.Close()
368 1 : var compactionHints []deleteCompactionHint
369 1 : // We iterate over the defragmented range tombstones and range key deletions,
370 1 : // which ensures we don't double count ranges deleted at different sequence
371 1 : // numbers. Also, merging abutting tombstones reduces the number of calls to
372 1 : // estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
373 1 : // our overall estimate.
374 1 : s, err := iter.First()
375 1 : for ; s != nil; s, err = iter.Next() {
376 1 : start, end := s.Start, s.End
377 1 : // We only need to consider deletion size estimates for tables that contain
378 1 : // RANGEDELs.
379 1 : var maxRangeDeleteSeqNum uint64
380 1 : for _, k := range s.Keys {
381 1 : if k.Kind() == base.InternalKeyKindRangeDelete && maxRangeDeleteSeqNum < k.SeqNum() {
382 1 : maxRangeDeleteSeqNum = k.SeqNum()
383 1 : break
384 : }
385 : }
386 :
387 : // If the file is in the last level of the LSM, there is no data beneath
388 : // it. The fact that there is still a range tombstone in a bottommost file
389 : // indicates two possibilities:
390 : // 1. an open snapshot kept the tombstone around, and the data the
391 : // tombstone deletes is contained within the file itself.
392 : // 2. the file was ingested.
393 : // In the first case, we'd like to estimate disk usage within the file
394 : // itself since compacting the file will drop that covered data. In the
395 : // second case, we expect that compacting the file will NOT drop any
396 : // data and rewriting the file is a waste of write bandwidth. We can
397 : // distinguish these cases by looking at the file metadata's sequence
398 : // numbers. A file's range deletions can only delete data within the
399 : // file at lower sequence numbers. All keys in an ingested sstable adopt
400 : // the same sequence number, preventing tombstones from deleting keys
401 : // within the same file. We check here if the largest RANGEDEL sequence
402 : // number is greater than the file's smallest sequence number. If it is,
403 : // the RANGEDEL could conceivably (although inconclusively) delete data
404 : // within the same file.
405 : //
406 : // Note that this heuristic is imperfect. If a table containing a range
407 : // deletion is ingested into L5 and subsequently compacted into L6 but
408 : // an open snapshot prevents elision of covered keys in L6, the
409 : // resulting RangeDeletionsBytesEstimate will incorrectly include all
410 : // covered keys.
411 : //
412 : // TODO(jackson): We could prevent the above error in the heuristic by
413 : // computing the file's RangeDeletionsBytesEstimate during the
414 : // compaction itself. It's unclear how common this is.
415 : //
416 : // NOTE: If the span `s` wholly contains a table containing range keys,
417 : // the returned size estimate will be slightly inflated by the range key
418 : // block. However, in practice, range keys are expected to be rare, and
419 : // the size of the range key block relative to the overall size of the
420 : // table is expected to be small.
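// As a concrete (hypothetical) illustration of the sequence number check
// below: a flushed table compacted into L6 holding keys at seqnums 90..110
// with a RANGEDEL at seqnum 105 has SmallestSeqNum = 90 < 105, so the
// tombstone may cover data within the file and we estimate the in-file
// reclaimable bytes. An ingested table assigns every key the same seqnum,
// say 200, so SmallestSeqNum is never less than its largest RANGEDEL seqnum
// and no in-file estimate is accumulated.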
421 1 : if level == numLevels-1 && meta.SmallestSeqNum < maxRangeDeleteSeqNum {
422 1 : size, err := r.EstimateDiskUsage(start, end)
423 1 : if err != nil {
424 0 : return nil, err
425 0 : }
426 1 : stats.RangeDeletionsBytesEstimate += size
427 1 :
428 1 : // As the file is in the bottommost level, there is no need to collect a
429 1 : // deletion hint.
430 1 : continue
431 : }
432 :
433 : // While the size estimates for point keys should only be updated if this
434 : // span contains a range del, the sequence numbers are required for the
435 : // hint. Unconditionally descend, but conditionally update the estimates.
436 1 : hintType := compactionHintFromKeys(s.Keys)
437 1 : estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
438 1 : if err != nil {
439 0 : return nil, err
440 0 : }
441 1 : stats.RangeDeletionsBytesEstimate += estimate
442 1 :
443 1 : // If any files were completely contained within the range,
444 1 : // hintSeqNum is the smallest sequence number contained in any
445 1 : // such file.
446 1 : if hintSeqNum == math.MaxUint64 {
447 1 : continue
448 : }
449 1 : hint := deleteCompactionHint{
450 1 : hintType: hintType,
451 1 : start: make([]byte, len(start)),
452 1 : end: make([]byte, len(end)),
453 1 : tombstoneFile: meta,
454 1 : tombstoneLevel: level,
455 1 : tombstoneLargestSeqNum: s.LargestSeqNum(),
456 1 : tombstoneSmallestSeqNum: s.SmallestSeqNum(),
457 1 : fileSmallestSeqNum: hintSeqNum,
458 1 : }
459 1 : copy(hint.start, start)
460 1 : copy(hint.end, end)
461 1 : compactionHints = append(compactionHints, hint)
462 : }
463 1 : if err != nil {
464 0 : return nil, err
465 0 : }
466 1 : return compactionHints, nil
467 : }
468 :
469 : func (d *DB) estimateSizesBeneath(
470 : v *version, level int, meta *fileMetadata, fileProps *sstable.CommonProperties,
471 1 : ) (avgValueLogicalSize, compressionRatio float64, err error) {
472 1 : // Find all files in lower levels that overlap with meta,
473 1 : // summing their value sizes and entry counts.
474 1 : file := meta
475 1 : var fileSum, keySum, valSum, entryCount uint64
476 1 : // Include the file itself. This is important because in some instances, the
477 1 : // computed compression ratio is applied to the tombstones contained within
478 1 : // `meta` itself. If there are no files beneath `meta` in the LSM, we would
479 1 : // calculate a compression ratio of 0 which is not accurate for the file's
480 1 : // own tombstones.
481 1 : fileSum += file.Size
482 1 : entryCount += fileProps.NumEntries
483 1 : keySum += fileProps.RawKeySize
484 1 : valSum += fileProps.RawValueSize
485 1 :
486 1 : addPhysicalTableStats := func(r *sstable.Reader) (err error) {
487 1 : fileSum += file.Size
488 1 : entryCount += r.Properties.NumEntries
489 1 : keySum += r.Properties.RawKeySize
490 1 : valSum += r.Properties.RawValueSize
491 1 : return nil
492 1 : }
493 1 : addVirtualTableStats := func(v sstable.VirtualReader) (err error) {
494 0 : fileSum += file.Size
495 0 : entryCount += file.Stats.NumEntries
496 0 : keySum += v.Properties.RawKeySize
497 0 : valSum += v.Properties.RawValueSize
498 0 : return nil
499 0 : }
500 :
501 1 : for l := level + 1; l < numLevels; l++ {
502 1 : overlaps := v.Overlaps(l, d.cmp, meta.Smallest.UserKey,
503 1 : meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
504 1 : iter := overlaps.Iter()
505 1 : for file = iter.First(); file != nil; file = iter.Next() {
506 1 : var err error
507 1 : if file.Virtual {
508 0 : err = d.tableCache.withVirtualReader(file.VirtualMeta(), addVirtualTableStats)
509 1 : } else {
510 1 : err = d.tableCache.withReader(file.PhysicalMeta(), addPhysicalTableStats)
511 1 : }
512 1 : if err != nil {
513 0 : return 0, 0, err
514 0 : }
515 : }
516 : }
517 1 : if entryCount == 0 {
518 0 : return 0, 0, nil
519 0 : }
520 : // RawKeySize and RawValueSize are uncompressed totals. We'll need to scale
521 : // the value sum according to the data size to account for compression,
522 : // index blocks and metadata overhead. Eg:
523 : //
524 : // Compression rate × Average uncompressed value size
525 : //
526 : // ↓
527 : //
528 : //          FileSize             RawValueSize
529 : //    -----------------------  ×  ------------
530 : //    RawKeySize+RawValueSize      NumEntries
531 : //
532 : // We return the average logical value size plus the compression ratio,
533 : // leaving the scaling to the caller. This allows the caller to perform
534 : // additional compression ratio scaling if necessary.
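// For example (hypothetical numbers): with fileSum = 64 MiB of on-disk data,
// keySum+valSum = 128 MiB of uncompressed key and value bytes, valSum =
// 96 MiB and entryCount = 1e6, the compression ratio is 64/128 = 0.5 and the
// average logical value size is 96 MiB / 1e6 ≈ 100 bytes.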
535 1 : uncompressedSum := float64(keySum + valSum)
536 1 : compressionRatio = float64(fileSum) / uncompressedSum
537 1 : avgValueLogicalSize = (float64(valSum) / float64(entryCount))
538 1 : return avgValueLogicalSize, compressionRatio, nil
539 : }
540 :
541 : func (d *DB) estimateReclaimedSizeBeneath(
542 : v *version, level int, start, end []byte, hintType deleteCompactionHintType,
543 1 : ) (estimate uint64, hintSeqNum uint64, err error) {
544 1 : // Find all files in lower levels that overlap with the deleted range
545 1 : // [start, end).
546 1 : //
547 1 : // An overlapping file might be completely contained by the range
548 1 : // tombstone, in which case we can count the entire file size in
549 1 : // our estimate without doing any additional I/O.
550 1 : //
551 1 : // Otherwise, estimating the range for the file requires
552 1 : // additional I/O to read the file's index blocks.
553 1 : hintSeqNum = math.MaxUint64
554 1 : for l := level + 1; l < numLevels; l++ {
555 1 : overlaps := v.Overlaps(l, d.cmp, start, end, true /* exclusiveEnd */)
556 1 : iter := overlaps.Iter()
557 1 : for file := iter.First(); file != nil; file = iter.Next() {
558 1 : startCmp := d.cmp(start, file.Smallest.UserKey)
559 1 : endCmp := d.cmp(file.Largest.UserKey, end)
560 1 : if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
561 1 : // The range fully contains the file, so skip looking it up in table
562 1 : // cache/looking at its indexes and add the full file size. Whether the
563 1 : // disk estimate and hint seqnums are updated depends on a) the type of
564 1 : // hint that requested the estimate and b) the keys contained in this
565 1 : // current file.
566 1 : var updateEstimates, updateHints bool
567 1 : switch hintType {
568 1 : case deleteCompactionHintTypePointKeyOnly:
569 1 : // The range deletion byte estimates should only be updated if this
570 1 : // table contains point keys. This ends up being an overestimate in
571 1 : // the case that table also has range keys, but such keys are expected
572 1 : // to contribute a negligible amount of the table's overall size,
573 1 : // relative to point keys.
574 1 : if file.HasPointKeys {
575 1 : updateEstimates = true
576 1 : }
577 : // As the initiating span contained only range dels, hints can only be
578 : // updated if this table does _not_ contain range keys.
579 1 : if !file.HasRangeKeys {
580 1 : updateHints = true
581 1 : }
582 1 : case deleteCompactionHintTypeRangeKeyOnly:
583 1 : // The initiating span contained only range key dels. The estimates
584 1 : // apply only to point keys, and are therefore not updated.
585 1 : updateEstimates = false
586 1 : // As the initiating span contained only range key dels, hints can
587 1 : // only be updated if this table does _not_ contain point keys.
588 1 : if !file.HasPointKeys {
589 1 : updateHints = true
590 1 : }
591 1 : case deleteCompactionHintTypePointAndRangeKey:
592 1 : // Always update the estimates and hints, as this hint type can drop a
593 1 : // file, irrespective of the mixture of keys. Similar to above, the
594 1 : // range del bytes estimates is an overestimate.
595 1 : updateEstimates, updateHints = true, true
596 0 : default:
597 0 : panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
598 : }
599 1 : if updateEstimates {
600 1 : estimate += file.Size
601 1 : }
602 1 : if updateHints && hintSeqNum > file.SmallestSeqNum {
603 1 : hintSeqNum = file.SmallestSeqNum
604 1 : }
605 1 : } else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
606 1 : // Partial overlap.
607 1 : if hintType == deleteCompactionHintTypeRangeKeyOnly {
608 1 : // If the hint that generated this overlap contains only range keys,
609 1 : // there is no need to calculate disk usage, as the reclaimable space
610 1 : // is expected to be minimal relative to point keys.
611 1 : continue
612 : }
613 1 : var size uint64
614 1 : var err error
615 1 : if file.Virtual {
616 1 : err = d.tableCache.withVirtualReader(
617 1 : file.VirtualMeta(), func(r sstable.VirtualReader) (err error) {
618 1 : size, err = r.EstimateDiskUsage(start, end)
619 1 : return err
620 1 : })
621 1 : } else {
622 1 : err = d.tableCache.withReader(
623 1 : file.PhysicalMeta(), func(r *sstable.Reader) (err error) {
624 1 : size, err = r.EstimateDiskUsage(start, end)
625 1 : return err
626 1 : })
627 : }
628 :
629 1 : if err != nil {
630 0 : return 0, hintSeqNum, err
631 0 : }
632 1 : estimate += size
633 : }
634 : }
635 : }
636 1 : return estimate, hintSeqNum, nil
637 : }
638 :
639 1 : func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) bool {
640 1 : // If a table contains range deletions or range key deletions, we defer the
641 1 : // stats collection. There are two main reasons for this:
642 1 : //
643 1 : // 1. Estimating the potential for reclaimed space due to a range deletion
644 1 : // tombstone requires scanning the LSM - a potentially expensive operation
645 1 : // that should be deferred.
646 1 : // 2. Range deletions and / or range key deletions present an opportunity to
647 1 : // compute "deletion hints", which also requires a scan of the LSM to
648 1 : // compute tables that would be eligible for deletion.
649 1 : //
650 1 : // These two tasks are deferred to the table stats collector goroutine.
651 1 : if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
652 1 : return false
653 1 : }
654 :
655 : // If a table is more than 10% point deletions without user-provided size
656 : // estimates, don't calculate the PointDeletionsBytesEstimate statistic
657 : // using our limited knowledge. The table stats collector can populate the
658 : // stats and calculate an average value size across all the tables beneath
659 : // this table in the LSM, which will be more accurate.
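// For example (hypothetical numbers): a table with NumEntries = 1000,
// NumDeletions = 150 and NumSizedDeletions = 30 has 120 unsized point
// deletions, which exceeds NumEntries/10 = 100, so we return false and leave
// the estimate to the table stats collector.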
660 1 : if unsizedDels := (props.NumDeletions - props.NumSizedDeletions); unsizedDels > props.NumEntries/10 {
661 1 : return false
662 1 : }
663 :
664 1 : var pointEstimate uint64
665 1 : if props.NumEntries > 0 {
666 1 : // Use the file's own average key and value sizes as an estimate. This
667 1 : // doesn't require any additional IO and since the number of point
668 1 : // deletions in the file is low, the error introduced by this crude
669 1 : // estimate is expected to be small.
670 1 : commonProps := &props.CommonProperties
671 1 : avgValSize, compressionRatio := estimatePhysicalSizes(meta.Size, commonProps)
672 1 : pointEstimate = pointDeletionsBytesEstimate(meta.Size, commonProps, avgValSize, compressionRatio)
673 1 : }
674 :
675 1 : meta.Stats.NumEntries = props.NumEntries
676 1 : meta.Stats.NumDeletions = props.NumDeletions
677 1 : meta.Stats.NumRangeKeySets = props.NumRangeKeySets
678 1 : meta.Stats.PointDeletionsBytesEstimate = pointEstimate
679 1 : meta.Stats.RangeDeletionsBytesEstimate = 0
680 1 : meta.Stats.ValueBlocksSize = props.ValueBlocksSize
681 1 : meta.StatsMarkValid()
682 1 : return true
683 : }
684 :
685 : func pointDeletionsBytesEstimate(
686 : fileSize uint64, props *sstable.CommonProperties, avgValLogicalSize, compressionRatio float64,
687 1 : ) (estimate uint64) {
688 1 : if props.NumEntries == 0 {
689 0 : return 0
690 0 : }
691 1 : numPointDels := props.NumPointDeletions()
692 1 : if numPointDels == 0 {
693 1 : return 0
694 1 : }
695 : // Estimate the potential space to reclaim using the table's own properties.
696 : // There may or may not be keys covered by any individual point tombstone.
697 : // If not, compacting the point tombstone into L6 will at least allow us to
698 : // drop the point deletion key and will reclaim the tombstone's key bytes.
699 : // If there are covered key(s), we also get to drop key and value bytes for
700 : // each covered key.
701 : //
702 : // Some point tombstones (DELSIZEDs) carry a user-provided estimate of the
703 : // uncompressed size of entries that will be elided by fully compacting the
704 : // tombstone. For these tombstones, there's no guesswork—we use the
705 : // RawPointTombstoneValueSizeHint property which is the sum of all these
706 : // tombstones' encoded values.
707 : //
708 : // For un-sized point tombstones (DELs), we estimate assuming that each
709 : // point tombstone on average covers 1 key and using average value sizes.
710 : // This is almost certainly an overestimate, but that's probably okay
711 : // because point tombstones can slow range iterations even when they don't
712 : // cover a key.
713 : //
714 : // TODO(jackson): This logic doesn't directly incorporate fixed per-key
715 : // overhead (8-byte trailer, plus at least 1 byte encoding the length of the
716 : // key and 1 byte encoding the length of the value). This overhead is
717 : // indirectly incorporated through the compression ratios, but that results
718 : // in the overhead being smeared per key-byte and value-byte, rather than
719 : // per-entry. This per-key fixed overhead can be nontrivial, especially for
720 : // dense swaths of point tombstones. Give some thought as to whether we
721 : // should directly include fixed per-key overhead in the calculations.
722 :
723 : // Below, we calculate the tombstone contributions and the shadowed keys'
724 : // contributions separately.
725 1 : var tombstonesLogicalSize float64
726 1 : var shadowedLogicalSize float64
727 1 :
728 1 : // 1. Calculate the contribution of the tombstone keys themselves.
729 1 : if props.RawPointTombstoneKeySize > 0 {
730 1 : tombstonesLogicalSize += float64(props.RawPointTombstoneKeySize)
731 1 : } else {
732 0 : // This sstable predates the existence of the RawPointTombstoneKeySize
733 0 : // property. We can use the average key size within the file itself and
734 0 : // the count of point deletions to estimate the size.
735 0 : tombstonesLogicalSize += float64(numPointDels * props.RawKeySize / props.NumEntries)
736 0 : }
737 :
738 : // 2. Calculate the contribution of the keys shadowed by tombstones.
739 : //
740 : // 2a. First account for keys shadowed by DELSIZED tombstones. The DELSIZED
741 : // tombstones encode the size of both the key and value of the shadowed KV
742 : // entries. These sizes are aggregated into a sstable property.
743 1 : shadowedLogicalSize += float64(props.RawPointTombstoneValueSize)
744 1 :
745 1 : // 2b. Calculate the contribution of the KV entries shadowed by ordinary DEL
746 1 : // keys.
747 1 : numUnsizedDels := numPointDels - props.NumSizedDeletions
748 1 : {
749 1 : // The shadowed keys have the same exact user keys as the tombstones
750 1 : // themselves, so we can use the `tombstonesLogicalSize` we computed
751 1 : // earlier as an estimate. There's a complication that
752 1 : // `tombstonesLogicalSize` may include DELSIZED keys we already
753 1 : // accounted for.
754 1 : shadowedLogicalSize += float64(tombstonesLogicalSize) / float64(numPointDels) * float64(numUnsizedDels)
755 1 :
756 1 : // Calculate the contribution of the deleted values. The caller has
757 1 : // already computed an average logical size (possibly computed across
758 1 : // many sstables).
759 1 : shadowedLogicalSize += float64(numUnsizedDels) * avgValLogicalSize
760 1 : }
761 :
762 : // Scale both tombstone and shadowed totals by logical:physical ratios to
763 : // account for compression, metadata overhead, etc.
764 : //
765 : //     Physical              FileSize
766 : //    ----------  =  -----------------------
767 : //     Logical       RawKeySize+RawValueSize
768 : //
769 1 : return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
770 : }
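
// examplePointDeletionsBytesEstimate is an illustrative sketch (the function
// and all of its numbers are hypothetical and unused elsewhere; it needs only
// fmt). It mirrors the arithmetic above so the shape of the estimate is easy
// to follow: tombstone key bytes, plus shadowed key and value bytes, scaled
// by the compression ratio.
func examplePointDeletionsBytesEstimate() {
	const (
		numPointDels      = 100.0   // NumPointDeletions: 80 DELs + 20 DELSIZEDs
		numSizedDels      = 20.0    // NumSizedDeletions
		rawTombstoneKeys  = 2000.0  // RawPointTombstoneKeySize
		sizedValueHint    = 10000.0 // RawPointTombstoneValueSize
		avgValLogicalSize = 100.0   // as returned by estimateSizesBeneath
		compressionRatio  = 0.5     // FileSize / (RawKeySize+RawValueSize)
	)

	// 1. Contribution of the tombstone keys themselves.
	tombstones := rawTombstoneKeys

	// 2. Contribution of the shadowed keys: the DELSIZED hint, plus the
	// unsized DELs' key bytes, plus the unsized DELs' value bytes.
	unsized := numPointDels - numSizedDels
	shadowed := sizedValueHint +
		tombstones/numPointDels*unsized +
		unsized*avgValLogicalSize

	// 3. Scale the logical total by the compression ratio.
	estimate := uint64((tombstones + shadowed) * compressionRatio)
	fmt.Println(estimate) // (2000 + 10000 + 1600 + 8000) * 0.5 = 10800
}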
771 :
772 : func estimatePhysicalSizes(
773 : fileSize uint64, props *sstable.CommonProperties,
774 1 : ) (avgValLogicalSize, compressionRatio float64) {
775 1 : // RawKeySize and RawValueSize are uncompressed totals. Scale according to
776 1 : // the data size to account for compression, index blocks and metadata
777 1 : // overhead. Eg:
778 1 : //
779 1 : // Compression rate × Average uncompressed value size
780 1 : //
781 1 : // ↓
782 1 : //
783 1 : //          FileSize             RawValueSize
784 1 : //    -----------------------  ×  ------------
785 1 : //    RawKeySize+RawValueSize      NumEntries
786 1 : //
787 1 : uncompressedSum := props.RawKeySize + props.RawValueSize
788 1 : compressionRatio = float64(fileSize) / float64(uncompressedSum)
789 1 : avgValLogicalSize = (float64(props.RawValueSize) / float64(props.NumEntries))
790 1 : return avgValLogicalSize, compressionRatio
791 1 : }
792 :
793 : // newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
794 : // returns "ranged deletion" spans for a single table, providing a combined view
795 : // of both range deletion and range key deletion spans. The
796 : // tableRangedDeletionIter is intended for use in the specific case of computing
797 : // the statistics and deleteCompactionHints for a single table.
798 : //
799 : // As an example, consider the following set of spans from the range deletion
800 : // and range key blocks of a table:
801 : //
802 : //       |---------|     |---------|         |-------|   RANGEKEYDELs
803 : // |-----------|-------------|           |-----|         RANGEDELs
804 : // __________________________________________________________
805 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
806 : //
807 : // The tableRangedDeletionIter produces the following set of output spans, where
808 : // '1' indicates a span containing only range deletions, '2' is a span
809 : // containing only range key deletions, and '3' is a span containing a mixture
810 : // of both range deletions and range key deletions.
811 : //
812 : //    1       3       1    3    2          1  3   2
813 : // |-----|---------|-----|---|-----|     |---|-|-----|
814 : // __________________________________________________________
815 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
816 : //
817 : // Algorithm.
818 : //
819 : // The iterator first defragments the range deletion and range key blocks
820 : // separately. During this defragmentation, the range key block is also filtered
821 : // so that keys other than range key deletes are ignored. The range delete and
822 : // range key delete keyspaces are then merged.
823 : //
824 : // Note that the only fragmentation introduced by merging is from where a range
825 : // del span overlaps with a range key del span. Within the bounds of any overlap
826 : // there is guaranteed to be no further fragmentation, as the constituent spans
827 : // have already been defragmented. To the left and right of any overlap, the
828 : // same reasoning applies. For example,
829 : //
830 : //          |--------|         |-------|  RANGEKEYDEL
831 : // |---------------------------|          RANGEDEL
832 : // |----1---|----3---|----1----|---2---|  Merged, fragmented spans.
833 : // __________________________________________________________
834 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
835 : //
836 : // Any fragmented abutting spans produced by the merging iter will be of
837 : // differing types (i.e. a transition from a span with homogeneous key kinds to a
838 : // heterogeneous span, or a transition from a span with exclusively range dels
839 : // to a span with exclusively range key dels). Therefore, further
840 : // defragmentation is not required.
841 : //
842 : // Each span returned by the tableRangedDeletionIter will have at most four keys,
843 : // corresponding to the largest and smallest sequence numbers encountered across
844 : // the range deletes and range key deletes that comprised the merged spans.
845 : func newCombinedDeletionKeyspanIter(
846 : comparer *base.Comparer, cr sstable.CommonReader, m *fileMetadata,
847 1 : ) (keyspan.FragmentIterator, error) {
848 1 : // The range del iter and range key iter are each wrapped in their own
849 1 : // defragmenting iter. For each iter, abutting spans can always be merged.
850 1 : var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true })
851 : // Reduce keys by maintaining a slice of at most length two, corresponding to
852 : // the largest and smallest keys in the defragmented span. This maintains the
853 : // contract that the emitted slice is sorted by (SeqNum, Kind) descending.
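// For example (hypothetical sequence numbers): reducing
// current = [RANGEDEL#12, RANGEDEL#7] with incoming = [RANGEKEYDEL#9] keeps
// RANGEDEL#12 as the largest key and RANGEDEL#7 as the smallest, yielding
// [RANGEDEL#12, RANGEDEL#7].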
854 1 : reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
855 1 : if len(current) == 0 && len(incoming) == 0 {
856 0 : // While this should never occur in practice, a defensive return is used
857 0 : // here to preserve correctness.
858 0 : return current
859 0 : }
860 1 : var largest, smallest keyspan.Key
861 1 : var set bool
862 1 : for _, keys := range [2][]keyspan.Key{current, incoming} {
863 1 : if len(keys) == 0 {
864 0 : continue
865 : }
866 1 : first, last := keys[0], keys[len(keys)-1]
867 1 : if !set {
868 1 : largest, smallest = first, last
869 1 : set = true
870 1 : continue
871 : }
872 1 : if first.Trailer > largest.Trailer {
873 1 : largest = first
874 1 : }
875 1 : if last.Trailer < smallest.Trailer {
876 1 : smallest = last
877 1 : }
878 : }
879 1 : if largest.Equal(comparer.Equal, smallest) {
880 1 : current = append(current[:0], largest)
881 1 : } else {
882 1 : current = append(current[:0], largest, smallest)
883 1 : }
884 1 : return current
885 : }
886 :
887 : // The separate iters for the range dels and range keys are wrapped in a
888 : // merging iter to join the keyspaces into a single keyspace. The separate
889 : // iters are only added if the particular key kind is present.
890 1 : mIter := &keyspan.MergingIter{}
891 1 : var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
892 1 : if in.KeysOrder != keyspan.ByTrailerDesc {
893 0 : panic("pebble: combined deletion iter encountered keys in non-trailer descending order")
894 : }
895 1 : out.Start, out.End = in.Start, in.End
896 1 : out.Keys = append(out.Keys[:0], in.Keys...)
897 1 : out.KeysOrder = keyspan.ByTrailerDesc
898 1 : // NB: The order of by-trailer descending may have been violated,
899 1 : // because we've layered rangekey and rangedel iterators from the same
900 1 : // sstable into the same keyspan.MergingIter. The MergingIter will
901 1 : // return the keys in the order that the child iterators were provided.
902 1 : // Sort the keys to ensure they're sorted by trailer descending.
903 1 : keyspan.SortKeysByTrailer(&out.Keys)
904 1 : return nil
905 : })
906 1 : mIter.Init(comparer.Compare, transform, new(keyspan.MergingBuffers))
907 1 :
908 1 : iter, err := cr.NewRawRangeDelIter()
909 1 : if err != nil {
910 0 : return nil, err
911 0 : }
912 1 : if iter != nil {
913 1 : // Assert expected bounds. In previous versions of Pebble, range
914 1 : // deletions persisted to sstables could exceed the bounds of the
915 1 : // containing files due to "split user keys." This required readers to
916 1 : // constrain the tombstones' bounds to the containing file at read time.
917 1 : // See docs/range_deletions.md for an extended discussion of the design
918 1 : // and invariants at that time.
919 1 : //
920 1 : // We've since compacted away all 'split user-keys' and in the process
921 1 : // eliminated all "untruncated range tombstones" for physical sstables.
922 1 : // We no longer need to perform truncation at read time for these
923 1 : // sstables.
924 1 : //
925 1 : // At the same time, we've also introduced the concept of "virtual
926 1 : // SSTables" where the file metadata's effective bounds can again be
927 1 : // reduced to be narrower than the contained tombstones. These virtual
928 1 : // SSTables handle truncation differently, performing it using
929 1 : // keyspan.Truncate when the sstable's range deletion iterator is
930 1 : // opened.
931 1 : //
932 1 : // Together, these mean that we should never see untruncated range
933 1 : // tombstones any more—and the merging iterator no longer accounts for
934 1 : // their existence. Since there's abundant subtlety that we're relying
935 1 : // on, we choose to be conservative and assert that these invariants
936 1 : // hold. We could (and previously did) choose to only validate these
937 1 : // bounds in invariants builds, but the most likely avenue for these
938 1 : // tombstones' existence is through a bug in a migration and old data
939 1 : // sitting around in an old store from long ago.
940 1 : //
941 1 : // The table stats collector will read all files' range deletions
942 1 : // asynchronously after Open, and provides a perfect opportunity to
943 1 : // validate our invariants without harming user latency. We also
944 1 : // previously performed truncation here which similarly required key
945 1 : // comparisons, so replacing those key comparisons with assertions
946 1 : // should be roughly similar in performance.
947 1 : //
948 1 : // TODO(jackson): Only use AssertBounds in invariants builds in the
949 1 : // following release.
950 1 : iter = keyspan.AssertBounds(
951 1 : iter, m.SmallestPointKey, m.LargestPointKey.UserKey, comparer.Compare,
952 1 : )
953 1 : dIter := &keyspan.DefragmentingIter{}
954 1 : dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
955 1 : iter = dIter
956 1 : mIter.AddLevel(iter)
957 1 : }
958 :
959 1 : iter, err = cr.NewRawRangeKeyIter()
960 1 : if err != nil {
961 0 : return nil, err
962 0 : }
963 1 : if iter != nil {
964 1 : // Assert expected bounds in tests.
965 1 : if invariants.Enabled {
966 1 : iter = keyspan.AssertBounds(
967 1 : iter, m.SmallestRangeKey, m.LargestRangeKey.UserKey, comparer.Compare,
968 1 : )
969 1 : }
970 : // Wrap the range key iterator in a filter that elides keys other than range
971 : // key deletions.
972 1 : iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
973 1 : out.Start, out.End = in.Start, in.End
974 1 : out.Keys = out.Keys[:0]
975 1 : for _, k := range in.Keys {
976 1 : if k.Kind() != base.InternalKeyKindRangeKeyDelete {
977 1 : continue
978 : }
979 1 : out.Keys = append(out.Keys, k)
980 : }
981 1 : return len(out.Keys) > 0
982 : }, comparer.Compare)
983 1 : dIter := &keyspan.DefragmentingIter{}
984 1 : dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
985 1 : iter = dIter
986 1 : mIter.AddLevel(iter)
987 : }
988 :
989 1 : return mIter, nil
990 : }
991 :
992 : // rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
993 : // with the sum of the files' counts of range key fragments. Its annotation type
994 : // is a *uint64. The count of range key sets may change once a table's stats are
995 : // loaded asynchronously, so its values are marked as cacheable only if a file's
996 : // stats have been loaded.
997 : type rangeKeySetsAnnotator struct{}
998 :
999 : var _ manifest.Annotator = rangeKeySetsAnnotator{}
1000 :
1001 1 : func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
1002 1 : if dst == nil {
1003 1 : return new(uint64)
1004 1 : }
1005 0 : v := dst.(*uint64)
1006 0 : *v = 0
1007 0 : return v
1008 : }
1009 :
1010 : func (a rangeKeySetsAnnotator) Accumulate(
1011 : f *fileMetadata, dst interface{},
1012 1 : ) (v interface{}, cacheOK bool) {
1013 1 : vptr := dst.(*uint64)
1014 1 : *vptr = *vptr + f.Stats.NumRangeKeySets
1015 1 : return vptr, f.StatsValid()
1016 1 : }
1017 :
1018 1 : func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
1019 1 : srcV := src.(*uint64)
1020 1 : dstV := dst.(*uint64)
1021 1 : *dstV = *dstV + *srcV
1022 1 : return dstV
1023 1 : }
1024 :
1025 : // countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
1026 : // files of the LSM. It only counts keys in files for which table stats have
1027 : // been loaded. It uses a b-tree annotator to cache intermediate values between
1028 : // calculations when possible.
1029 1 : func countRangeKeySetFragments(v *version) (count uint64) {
1030 1 : for l := 0; l < numLevels; l++ {
1031 1 : if v.RangeKeyLevels[l].Empty() {
1032 1 : continue
1033 : }
1034 1 : count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
1035 : }
1036 1 : return count
1037 : }
1038 :
1039 : // tombstonesAnnotator implements manifest.Annotator, annotating B-Tree nodes
1040 : // with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDEL
1041 : // keys). Its annotation type is a *uint64. The count of tombstones may change
1042 : // once a table's stats are loaded asynchronously, so its values are marked as
1043 : // cacheable only if a file's stats have been loaded.
1044 : type tombstonesAnnotator struct{}
1045 :
1046 : var _ manifest.Annotator = tombstonesAnnotator{}
1047 :
1048 1 : func (a tombstonesAnnotator) Zero(dst interface{}) interface{} {
1049 1 : if dst == nil {
1050 1 : return new(uint64)
1051 1 : }
1052 1 : v := dst.(*uint64)
1053 1 : *v = 0
1054 1 : return v
1055 : }
1056 :
1057 : func (a tombstonesAnnotator) Accumulate(
1058 : f *fileMetadata, dst interface{},
1059 1 : ) (v interface{}, cacheOK bool) {
1060 1 : vptr := dst.(*uint64)
1061 1 : *vptr = *vptr + f.Stats.NumDeletions
1062 1 : return vptr, f.StatsValid()
1063 1 : }
1064 :
1065 1 : func (a tombstonesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
1066 1 : srcV := src.(*uint64)
1067 1 : dstV := dst.(*uint64)
1068 1 : *dstV = *dstV + *srcV
1069 1 : return dstV
1070 1 : }
1071 :
1072 : // countTombstones counts the number of tombstone (DEL, SINGLEDEL and RANGEDEL)
1073 : // internal keys across all files of the LSM. It only counts keys in files for
1074 : // which table stats have been loaded. It uses a b-tree annotator to cache
1075 : // intermediate values between calculations when possible.
1076 1 : func countTombstones(v *version) (count uint64) {
1077 1 : for l := 0; l < numLevels; l++ {
1078 1 : if v.Levels[l].Empty() {
1079 1 : continue
1080 : }
1081 1 : count += *v.Levels[l].Annotation(tombstonesAnnotator{}).(*uint64)
1082 : }
1083 1 : return count
1084 : }
1085 :
1086 : // valueBlocksSizeAnnotator implements manifest.Annotator, annotating B-Tree
1087 : // nodes with the sum of the files' Properties.ValueBlocksSize. Its annotation
1088 : // type is a *uint64. The value block size may change once a table's stats are
1089 : // loaded asynchronously, so its values are marked as cacheable only if a
1090 : // file's stats have been loaded.
1091 : type valueBlocksSizeAnnotator struct{}
1092 :
1093 : var _ manifest.Annotator = valueBlocksSizeAnnotator{}
1094 :
1095 1 : func (a valueBlocksSizeAnnotator) Zero(dst interface{}) interface{} {
1096 1 : if dst == nil {
1097 1 : return new(uint64)
1098 1 : }
1099 1 : v := dst.(*uint64)
1100 1 : *v = 0
1101 1 : return v
1102 : }
1103 :
1104 : func (a valueBlocksSizeAnnotator) Accumulate(
1105 : f *fileMetadata, dst interface{},
1106 1 : ) (v interface{}, cacheOK bool) {
1107 1 : vptr := dst.(*uint64)
1108 1 : *vptr = *vptr + f.Stats.ValueBlocksSize
1109 1 : return vptr, f.StatsValid()
1110 1 : }
1111 :
1112 1 : func (a valueBlocksSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
1113 1 : srcV := src.(*uint64)
1114 1 : dstV := dst.(*uint64)
1115 1 : *dstV = *dstV + *srcV
1116 1 : return dstV
1117 1 : }
1118 :
1119 : // valueBlocksSizeForLevel returns the Properties.ValueBlocksSize across all
1120 : // files for a level of the LSM. It only includes the size for files for which
1121 : // table stats have been loaded. It uses a b-tree annotator to cache
1122 : // intermediate values between calculations when possible. It must not be
1123 : // called concurrently.
1124 : //
1125 : // REQUIRES: 0 <= level < numLevels.
1126 1 : func valueBlocksSizeForLevel(v *version, level int) (count uint64) {
1127 1 : if v.Levels[level].Empty() {
1128 1 : return 0
1129 1 : }
1130 1 : return *v.Levels[level].Annotation(valueBlocksSizeAnnotator{}).(*uint64)
1131 : }