Line data Source code
1 : // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "fmt"
9 : "math"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/invariants"
14 : "github.com/cockroachdb/pebble/internal/keyspan"
15 : "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl"
16 : "github.com/cockroachdb/pebble/internal/manifest"
17 : "github.com/cockroachdb/pebble/sstable"
18 : )
19 :
20 : // In-memory statistics about tables help inform compaction picking, but may
21 : // be expensive to calculate or load from disk. Every time a database is
22 : // opened, these statistics must be reloaded or recalculated. To minimize
23 : // impact on user activity and compactions, we load these statistics
24 : // asynchronously in the background and store loaded statistics in each
25 : // table's *FileMetadata.
26 : //
27 : // This file implements the asynchronous loading of statistics by maintaining
28 : // a list of files that require statistics, alongside their LSM levels.
29 : // Whenever new files are added to the LSM, the files are appended to
30 : // d.mu.tableStats.pending. If a stats collection job is not currently
31 : // running, one is started in a separate goroutine.
32 : //
33 : // The stats collection job grabs and clears the pending list, computes table
34 : // statistics relative to the current readState and updates the tables' file
35 : // metadata. New pending files may accumulate during a stats collection job,
36 : // so a completing job triggers a new job if necessary. Only one job runs at a
37 : // time.
38 : //
39 : // When an existing database is opened, all files lack in-memory statistics.
40 : // These files' stats are loaded incrementally whenever the pending list is
41 : // empty by scanning a current readState for files missing statistics. Once a
42 : // job completes a scan without finding any remaining files without
43 : // statistics, it flips a `loadedInitial` flag. From then on, the stats
44 : // collection job only needs to load statistics for new files appended to the
45 : // pending list.
46 :
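// The block below is an illustrative sketch only (not Pebble code; all names
// are hypothetical). It shows the general shape of the pattern described
// above: work is appended to a pending list under a mutex, and a background
// goroutine is started only if one is not already running. The job re-checks
// the pending list before clearing its "loading" flag, mirroring how
// collectTableStats re-invokes maybeCollectTableStatsLocked before returning.
//
//    type collector struct {
//        mu      sync.Mutex
//        pending []string // hypothetical work items
//        loading bool
//    }
//
//    func (c *collector) enqueue(items ...string) {
//        c.mu.Lock()
//        defer c.mu.Unlock()
//        c.pending = append(c.pending, items...)
//        if !c.loading {
//            c.loading = true
//            go c.run()
//        }
//    }
//
//    func (c *collector) run() {
//        for {
//            c.mu.Lock()
//            work := c.pending
//            c.pending = nil
//            if len(work) == 0 {
//                c.loading = false
//                c.mu.Unlock()
//                return
//            }
//            c.mu.Unlock()
//            for range work {
//                // Perform the expensive per-item work (IO) without
//                // holding the mutex.
//            }
//        }
//    }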
47 2 : func (d *DB) maybeCollectTableStatsLocked() {
48 2 : if d.shouldCollectTableStatsLocked() {
49 2 : go d.collectTableStats()
50 2 : }
51 : }
52 :
53 : // updateTableStatsLocked is called when new files are introduced, after the
54 : // read state has been updated. It may trigger a new stat collection.
55 : // DB.mu must be locked when calling.
56 2 : func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
57 2 : var needStats bool
58 2 : for _, nf := range newFiles {
59 2 : if !nf.Meta.StatsValid() {
60 2 : needStats = true
61 2 : break
62 : }
63 : }
64 2 : if !needStats {
65 2 : return
66 2 : }
67 :
68 2 : d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
69 2 : d.maybeCollectTableStatsLocked()
70 : }
71 :
72 2 : func (d *DB) shouldCollectTableStatsLocked() bool {
73 2 : return !d.mu.tableStats.loading &&
74 2 : d.closed.Load() == nil &&
75 2 : !d.opts.DisableTableStats &&
76 2 : (len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
77 2 : }
78 :
79 : // collectTableStats runs a table stats collection job, returning true if the
80 : // invocation did the collection work, false otherwise (e.g. if another job was
81 : // already running).
82 2 : func (d *DB) collectTableStats() bool {
83 2 : const maxTableStatsPerScan = 50
84 2 :
85 2 : d.mu.Lock()
86 2 : if !d.shouldCollectTableStatsLocked() {
87 2 : d.mu.Unlock()
88 2 : return false
89 2 : }
90 :
91 2 : pending := d.mu.tableStats.pending
92 2 : d.mu.tableStats.pending = nil
93 2 : d.mu.tableStats.loading = true
94 2 : jobID := d.mu.nextJobID
95 2 : d.mu.nextJobID++
96 2 : loadedInitial := d.mu.tableStats.loadedInitial
97 2 : // Drop DB.mu before performing IO.
98 2 : d.mu.Unlock()
99 2 :
100 2 : // Every run of collectTableStats either collects stats from the pending
101 2 : // list (if non-empty) or from scanning the version (loadedInitial is
102 2 : // false). This job only runs if at least one of those conditions holds.
103 2 :
104 2 : // Grab a read state to scan for tables.
105 2 : rs := d.loadReadState()
106 2 : var collected []collectedStats
107 2 : var hints []deleteCompactionHint
108 2 : if len(pending) > 0 {
109 2 : collected, hints = d.loadNewFileStats(rs, pending)
110 2 : } else {
111 2 : var moreRemain bool
112 2 : var buf [maxTableStatsPerScan]collectedStats
113 2 : collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
114 2 : loadedInitial = !moreRemain
115 2 : }
116 2 : rs.unref()
117 2 :
118 2 : // Update the FileMetadata with the loaded stats while holding d.mu.
119 2 : d.mu.Lock()
120 2 : defer d.mu.Unlock()
121 2 : d.mu.tableStats.loading = false
122 2 : if loadedInitial && !d.mu.tableStats.loadedInitial {
123 2 : d.mu.tableStats.loadedInitial = loadedInitial
124 2 : d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
125 2 : JobID: jobID,
126 2 : })
127 2 : }
128 :
129 2 : maybeCompact := false
130 2 : for _, c := range collected {
131 2 : c.fileMetadata.Stats = c.TableStats
132 2 : maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0
133 2 : c.fileMetadata.StatsMarkValid()
134 2 : }
135 2 : d.mu.tableStats.cond.Broadcast()
136 2 : d.maybeCollectTableStatsLocked()
137 2 : if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions {
138 2 : // Verify that all of the hint tombstones' files still exist in the
139 2 : // current version. Otherwise, the tombstone itself may have been
140 2 : // compacted into L6 and more recent keys may have had their sequence
141 2 : // numbers zeroed.
142 2 : //
143 2 : // Note that it's possible that the tombstone file is being compacted
144 2 : // presently. In that case, the file will be present in v. When the
145 2 : // compaction finishes compacting the tombstone file, it will detect
146 2 : // and clear the hint.
147 2 : //
148 2 : // See DB.maybeUpdateDeleteCompactionHints.
149 2 : v := d.mu.versions.currentVersion()
150 2 : keepHints := hints[:0]
151 2 : for _, h := range hints {
152 2 : if v.Contains(h.tombstoneLevel, h.tombstoneFile) {
153 2 : keepHints = append(keepHints, h)
154 2 : }
155 : }
156 2 : d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
157 : }
158 2 : if maybeCompact {
159 2 : d.maybeScheduleCompaction()
160 2 : }
161 2 : return true
162 : }
163 :
164 : type collectedStats struct {
165 : *fileMetadata
166 : manifest.TableStats
167 : }
168 :
169 : func (d *DB) loadNewFileStats(
170 : rs *readState, pending []manifest.NewFileEntry,
171 2 : ) ([]collectedStats, []deleteCompactionHint) {
172 2 : var hints []deleteCompactionHint
173 2 : collected := make([]collectedStats, 0, len(pending))
174 2 : for _, nf := range pending {
175 2 : // A file's stats might have been populated by an earlier call to
176 2 : // loadNewFileStats if the file was moved.
177 2 : // NB: We're not holding d.mu which protects f.Stats, but only
178 2 : // collectTableStats updates f.Stats for active files, and we
179 2 : // ensure only one goroutine runs it at a time through
180 2 : // d.mu.tableStats.loading.
181 2 : if nf.Meta.StatsValid() {
182 2 : continue
183 : }
184 :
185 : // The file isn't guaranteed to still be live in the readState's
186 : // version. It may have been deleted or moved. Skip it if it's not in
187 : // the expected level.
188 2 : if !rs.current.Contains(nf.Level, nf.Meta) {
189 2 : continue
190 : }
191 :
192 2 : stats, newHints, err := d.loadTableStats(
193 2 : rs.current, nf.Level,
194 2 : nf.Meta,
195 2 : )
196 2 : if err != nil {
197 0 : d.opts.EventListener.BackgroundError(err)
198 0 : continue
199 : }
200 : // NB: We don't update the FileMetadata yet, because we aren't
201 : // holding DB.mu. We'll copy it to the FileMetadata after we're
202 : // finished with IO.
203 2 : collected = append(collected, collectedStats{
204 2 : fileMetadata: nf.Meta,
205 2 : TableStats: stats,
206 2 : })
207 2 : hints = append(hints, newHints...)
208 : }
209 2 : return collected, hints
210 : }
211 :
212 : // scanReadStateTableStats is run by an active stat collection job when there
213 : // are no pending new files, but there might be files that existed at Open for
214 : // which we haven't loaded table stats.
215 : func (d *DB) scanReadStateTableStats(
216 : rs *readState, fill []collectedStats,
217 2 : ) ([]collectedStats, []deleteCompactionHint, bool) {
218 2 : moreRemain := false
219 2 : var hints []deleteCompactionHint
220 2 : sizesChecked := make(map[base.DiskFileNum]struct{})
221 2 : for l, levelMetadata := range rs.current.Levels {
222 2 : iter := levelMetadata.Iter()
223 2 : for f := iter.First(); f != nil; f = iter.Next() {
224 2 : // NB: We're not holding d.mu which protects f.Stats, but only the
225 2 : // active stats collection job updates f.Stats for active files,
226 2 : // and we ensure only one goroutine runs it at a time through
227 2 : // d.mu.tableStats.loading. This makes it safe to read validity
228 2 : // through f.Stats.ValidLocked despite not holding d.mu.
229 2 : if f.StatsValid() {
230 2 : continue
231 : }
232 :
233 : // Limit how much work we do per read state. The older the read
234 : // state is, the higher the likelihood files are no longer being
235 : // used in the current version. If we've exhausted our allowance,
236 : // return true for the last return value to signal there's more
237 : // work to do.
238 2 : if len(fill) == cap(fill) {
239 2 : moreRemain = true
240 2 : return fill, hints, moreRemain
241 2 : }
242 :
243 : // If the file is remote and not SharedForeign, we should check if its size
244 : // matches. This is because checkConsistency skips over remote files.
245 : //
246 : // SharedForeign and External files are skipped as their sizes are allowed
247 : // to have a mismatch; the size stored in the FileBacking is just the part
248 : // of the file that is referenced by this Pebble instance, not the size of
249 : // the whole object.
250 2 : objMeta, err := d.objProvider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
251 2 : if err != nil {
252 0 : // Set `moreRemain` so we'll try again.
253 0 : moreRemain = true
254 0 : d.opts.EventListener.BackgroundError(err)
255 0 : continue
256 : }
257 :
258 2 : shouldCheckSize := objMeta.IsRemote() &&
259 2 : !d.objProvider.IsSharedForeign(objMeta) &&
260 2 : !objMeta.IsExternal()
261 2 : if _, ok := sizesChecked[f.FileBacking.DiskFileNum]; !ok && shouldCheckSize {
262 2 : size, err := d.objProvider.Size(objMeta)
263 2 : fileSize := f.FileBacking.Size
264 2 : if err != nil {
265 0 : moreRemain = true
266 0 : d.opts.EventListener.BackgroundError(err)
267 0 : continue
268 : }
269 2 : if size != int64(fileSize) {
270 0 : err := errors.Errorf(
271 0 : "during consistency check in loadTableStats: L%d: %s: object size mismatch (%s): %d (provider) != %d (MANIFEST)",
272 0 : errors.Safe(l), f.FileNum, d.objProvider.Path(objMeta),
273 0 : errors.Safe(size), errors.Safe(fileSize))
274 0 : d.opts.EventListener.BackgroundError(err)
275 0 : d.opts.Logger.Fatalf("%s", err)
276 0 : }
277 :
278 2 : sizesChecked[f.FileBacking.DiskFileNum] = struct{}{}
279 : }
280 :
281 2 : stats, newHints, err := d.loadTableStats(
282 2 : rs.current, l, f,
283 2 : )
284 2 : if err != nil {
285 0 : // Set `moreRemain` so we'll try again.
286 0 : moreRemain = true
287 0 : d.opts.EventListener.BackgroundError(err)
288 0 : continue
289 : }
290 2 : fill = append(fill, collectedStats{
291 2 : fileMetadata: f,
292 2 : TableStats: stats,
293 2 : })
294 2 : hints = append(hints, newHints...)
295 : }
296 : }
297 2 : return fill, hints, moreRemain
298 : }
299 :
300 : func (d *DB) loadTableStats(
301 : v *version, level int, meta *fileMetadata,
302 2 : ) (manifest.TableStats, []deleteCompactionHint, error) {
303 2 : var stats manifest.TableStats
304 2 : var compactionHints []deleteCompactionHint
305 2 : err := d.tableCache.withCommonReader(
306 2 : meta, func(r sstable.CommonReader) (err error) {
307 2 : props := r.CommonProperties()
308 2 : stats.NumEntries = props.NumEntries
309 2 : stats.NumDeletions = props.NumDeletions
310 2 : if props.NumPointDeletions() > 0 {
311 2 : if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil {
312 0 : return
313 0 : }
314 : }
315 2 : if props.NumRangeDeletions > 0 || props.NumRangeKeyDels > 0 {
316 2 : if compactionHints, err = d.loadTableRangeDelStats(
317 2 : r, v, level, meta, &stats,
318 2 : ); err != nil {
319 0 : return
320 0 : }
321 : }
322 : // TODO(travers): Once we have real-world data, consider collecting
323 : // additional stats that may provide improved heuristics for compaction
324 : // picking.
325 2 : stats.NumRangeKeySets = props.NumRangeKeySets
326 2 : stats.ValueBlocksSize = props.ValueBlocksSize
327 2 : return
328 : })
329 2 : if err != nil {
330 0 : return stats, nil, err
331 0 : }
332 2 : return stats, compactionHints, nil
333 : }
334 :
335 : // loadTablePointKeyStats calculates the point key statistics for the given
336 : // table. The provided manifest.TableStats are updated.
337 : func (d *DB) loadTablePointKeyStats(
338 : props *sstable.CommonProperties,
339 : v *version,
340 : level int,
341 : meta *fileMetadata,
342 : stats *manifest.TableStats,
343 2 : ) error {
344 2 : // TODO(jackson): If the file has a wide keyspace, the average
345 2 : // value size beneath the entire file might not be representative
346 2 : // of the size of the keys beneath the point tombstones.
347 2 : // We could write the ranges of 'clusters' of point tombstones to
348 2 : // a sstable property and call averageValueSizeBeneath for each of
349 2 : // these narrower ranges to improve the estimate.
350 2 : avgValLogicalSize, compressionRatio, err := d.estimateSizesBeneath(v, level, meta, props)
351 2 : if err != nil {
352 0 : return err
353 0 : }
354 2 : stats.PointDeletionsBytesEstimate =
355 2 : pointDeletionsBytesEstimate(meta.Size, props, avgValLogicalSize, compressionRatio)
356 2 : return nil
357 : }
358 :
359 : // loadTableRangeDelStats calculates the range deletion and range key deletion
360 : // statistics for the given table.
361 : func (d *DB) loadTableRangeDelStats(
362 : r sstable.CommonReader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
363 2 : ) ([]deleteCompactionHint, error) {
364 2 : iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
365 2 : if err != nil {
366 0 : return nil, err
367 0 : }
368 2 : defer iter.Close()
369 2 : var compactionHints []deleteCompactionHint
370 2 : // We iterate over the defragmented range tombstones and range key deletions,
371 2 : // which ensures we don't double count ranges deleted at different sequence
372 2 : // numbers. Also, merging abutting tombstones reduces the number of calls to
373 2 : // estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
374 2 : // our overall estimate.
375 2 : s, err := iter.First()
376 2 : for ; s != nil; s, err = iter.Next() {
377 2 : start, end := s.Start, s.End
378 2 : // We only need to consider deletion size estimates for tables that contain
379 2 : // RANGEDELs.
380 2 : var maxRangeDeleteSeqNum uint64
381 2 : for _, k := range s.Keys {
382 2 : if k.Kind() == base.InternalKeyKindRangeDelete && maxRangeDeleteSeqNum < k.SeqNum() {
383 2 : maxRangeDeleteSeqNum = k.SeqNum()
384 2 : break
385 : }
386 : }
387 :
388 : // If the file is in the last level of the LSM, there is no data beneath
389 : // it. The fact that there is still a range tombstone in a bottommost file
390 : // indicates two possibilities:
391 : // 1. an open snapshot kept the tombstone around, and the data the
392 : // tombstone deletes is contained within the file itself.
393 : // 2. the file was ingested.
394 : // In the first case, we'd like to estimate disk usage within the file
395 : // itself since compacting the file will drop that covered data. In the
396 : // second case, we expect that compacting the file will NOT drop any
397 : // data and rewriting the file is a waste of write bandwidth. We can
398 : // distinguish these cases by looking at the file metadata's sequence
399 : // numbers. A file's range deletions can only delete data within the
400 : // file at lower sequence numbers. All keys in an ingested sstable adopt
401 : // the same sequence number, preventing tombstones from deleting keys
402 : // within the same file. We check here if the largest RANGEDEL sequence
403 : // number is greater than the file's smallest sequence number. If it is,
404 : // the RANGEDEL could conceivably (although inconclusively) delete data
405 : // within the same file.
406 : //
407 : // Note that this heuristic is imperfect. If a table containing a range
408 : // deletion is ingested into L5 and subsequently compacted into L6 but
409 : // an open snapshot prevents elision of covered keys in L6, the
410 : // resulting RangeDeletionsBytesEstimate will incorrectly include all
411 : // covered keys.
412 : //
413 : // TODO(jackson): We could prevent the above error in the heuristic by
414 : // computing the file's RangeDeletionsBytesEstimate during the
415 : // compaction itself. It's unclear how common this is.
416 : //
417 : // NOTE: If the span `s` wholly contains a table containing range keys,
418 : // the returned size estimate will be slightly inflated by the range key
419 : // block. However, in practice, range keys are expected to be rare, and
420 : // the size of the range key block relative to the overall size of the
421 : // table is expected to be small.
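// Hedged worked example (sequence numbers are made up): if this table was
// ingested, every key in it carries the same sequence number, say 72, so
// meta.SmallestSeqNum == maxRangeDeleteSeqNum == 72 and the in-file estimate
// below is skipped. If instead the table was produced by a flush or
// compaction and holds keys at seqnums 10..50 with a RANGEDEL at seqnum 50,
// then SmallestSeqNum (10) < 50, the tombstone may cover data within the
// file, and EstimateDiskUsage over [start, end) is added to the estimate.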
422 2 : if level == numLevels-1 && meta.SmallestSeqNum < maxRangeDeleteSeqNum {
423 2 : size, err := r.EstimateDiskUsage(start, end)
424 2 : if err != nil {
425 0 : return nil, err
426 0 : }
427 2 : stats.RangeDeletionsBytesEstimate += size
428 2 :
429 2 : // As the file is in the bottommost level, there is no need to collect a
430 2 : // deletion hint.
431 2 : continue
432 : }
433 :
434 : // While the size estimates for point keys should only be updated if this
435 : // span contains a range del, the sequence numbers are required for the
436 : // hint. Unconditionally descend, but conditionally update the estimates.
437 2 : hintType := compactionHintFromKeys(s.Keys)
438 2 : estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
439 2 : if err != nil {
440 0 : return nil, err
441 0 : }
442 2 : stats.RangeDeletionsBytesEstimate += estimate
443 2 :
444 2 : // If any files were completely contained with the range,
445 2 : // hintSeqNum is the smallest sequence number contained in any
446 2 : // such file.
447 2 : if hintSeqNum == math.MaxUint64 {
448 2 : continue
449 : }
450 2 : hint := deleteCompactionHint{
451 2 : hintType: hintType,
452 2 : start: make([]byte, len(start)),
453 2 : end: make([]byte, len(end)),
454 2 : tombstoneFile: meta,
455 2 : tombstoneLevel: level,
456 2 : tombstoneLargestSeqNum: s.LargestSeqNum(),
457 2 : tombstoneSmallestSeqNum: s.SmallestSeqNum(),
458 2 : fileSmallestSeqNum: hintSeqNum,
459 2 : }
460 2 : copy(hint.start, start)
461 2 : copy(hint.end, end)
462 2 : compactionHints = append(compactionHints, hint)
463 : }
464 2 : if err != nil {
465 0 : return nil, err
466 0 : }
467 2 : return compactionHints, nil
468 : }
469 :
470 : func (d *DB) estimateSizesBeneath(
471 : v *version, level int, meta *fileMetadata, fileProps *sstable.CommonProperties,
472 2 : ) (avgValueLogicalSize, compressionRatio float64, err error) {
473 2 : // Find all files in lower levels that overlap with meta,
474 2 : // summing their value sizes and entry counts.
475 2 : file := meta
476 2 : var fileSum, keySum, valSum, entryCount uint64
477 2 : // Include the file itself. This is important because in some instances, the
478 2 : // computed compression ratio is applied to the tombstones contained within
479 2 : // `meta` itself. If there are no files beneath `meta` in the LSM, we would
480 2 : // calculate a compression ratio of 0 which is not accurate for the file's
481 2 : // own tombstones.
482 2 : fileSum += file.Size
483 2 : entryCount += fileProps.NumEntries
484 2 : keySum += fileProps.RawKeySize
485 2 : valSum += fileProps.RawValueSize
486 2 :
487 2 : addPhysicalTableStats := func(r *sstable.Reader) (err error) {
488 2 : fileSum += file.Size
489 2 : entryCount += r.Properties.NumEntries
490 2 : keySum += r.Properties.RawKeySize
491 2 : valSum += r.Properties.RawValueSize
492 2 : return nil
493 2 : }
494 2 : addVirtualTableStats := func(v sstable.VirtualReader) (err error) {
495 1 : fileSum += file.Size
496 1 : entryCount += file.Stats.NumEntries
497 1 : keySum += v.Properties.RawKeySize
498 1 : valSum += v.Properties.RawValueSize
499 1 : return nil
500 1 : }
501 :
502 2 : for l := level + 1; l < numLevels; l++ {
503 2 : overlaps := v.Overlaps(l, meta.Smallest.UserKey,
504 2 : meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
505 2 : iter := overlaps.Iter()
506 2 : for file = iter.First(); file != nil; file = iter.Next() {
507 2 : var err error
508 2 : if file.Virtual {
509 1 : err = d.tableCache.withVirtualReader(file.VirtualMeta(), addVirtualTableStats)
510 2 : } else {
511 2 : err = d.tableCache.withReader(file.PhysicalMeta(), addPhysicalTableStats)
512 2 : }
513 2 : if err != nil {
514 0 : return 0, 0, err
515 0 : }
516 : }
517 : }
518 2 : if entryCount == 0 {
519 0 : return 0, 0, nil
520 0 : }
521 : // RawKeySize and RawValueSize are uncompressed totals. We'll need to scale
522 : // the value sum according to the data size to account for compression,
523 : // index blocks and metadata overhead. Eg:
524 : //
525 : // Compression rate × Average uncompressed value size
526 : //
527 : // ↓
528 : //
529 : //        FileSize           RawValueSize
530 : // ----------------------- × ------------
531 : // RawKeySize+RawValueSize    NumEntries
532 : //
533 : // We return the average logical value size plus the compression ratio,
534 : // leaving the scaling to the caller. This allows the caller to perform
535 : // additional compression ratio scaling if necessary.
536 2 : uncompressedSum := float64(keySum + valSum)
537 2 : compressionRatio = float64(fileSum) / uncompressedSum
538 2 : avgValueLogicalSize = (float64(valSum) / float64(entryCount))
539 2 : return avgValueLogicalSize, compressionRatio, nil
540 : }
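// Hedged worked example for estimateSizesBeneath (all numbers are assumed):
// if the file plus the overlapping files beneath it total fileSum = 50 MiB
// on disk, keySum+valSum = 100 MiB of uncompressed key/value bytes, with
// valSum = 80 MiB and entryCount = 1,000,000 entries, then
//
//    compressionRatio    = 50 MiB / 100 MiB   = 0.5
//    avgValueLogicalSize = 80 MiB / 1,000,000 ≈ 84 bytes
//
// A caller such as pointDeletionsBytesEstimate then treats each un-sized
// point tombstone as shadowing roughly one value of that average logical
// size and scales the logical total by the 0.5 compression ratio to arrive
// at an estimate of reclaimable physical bytes.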
541 :
542 : func (d *DB) estimateReclaimedSizeBeneath(
543 : v *version, level int, start, end []byte, hintType deleteCompactionHintType,
544 2 : ) (estimate uint64, hintSeqNum uint64, err error) {
545 2 : // Find all files in lower levels that overlap with the deleted range
546 2 : // [start, end).
547 2 : //
548 2 : // An overlapping file might be completely contained by the range
549 2 : // tombstone, in which case we can count the entire file size in
550 2 : // our estimate without doing any additional I/O.
551 2 : //
552 2 : // Otherwise, estimating the range for the file requires
553 2 : // additional I/O to read the file's index blocks.
554 2 : hintSeqNum = math.MaxUint64
555 2 : for l := level + 1; l < numLevels; l++ {
556 2 : overlaps := v.Overlaps(l, start, end, true /* exclusiveEnd */)
557 2 : iter := overlaps.Iter()
558 2 : for file := iter.First(); file != nil; file = iter.Next() {
559 2 : startCmp := d.cmp(start, file.Smallest.UserKey)
560 2 : endCmp := d.cmp(file.Largest.UserKey, end)
561 2 : if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
562 2 : // The range fully contains the file, so skip looking it up in the table
563 2 : // cache or reading its index blocks, and add the full file size. Whether the
564 2 : // disk estimate and hint seqnums are updated depends on a) the type of
565 2 : // hint that requested the estimate and b) the keys contained in this
566 2 : // current file.
567 2 : var updateEstimates, updateHints bool
568 2 : switch hintType {
569 2 : case deleteCompactionHintTypePointKeyOnly:
570 2 : // The range deletion byte estimates should only be updated if this
571 2 : // table contains point keys. This ends up being an overestimate in
572 2 : // the case that the table also has range keys, but such keys are expected
573 2 : // to contribute a negligible amount of the table's overall size,
574 2 : // relative to point keys.
575 2 : if file.HasPointKeys {
576 2 : updateEstimates = true
577 2 : }
578 : // As the initiating span contained only range dels, hints can only be
579 : // updated if this table does _not_ contain range keys.
580 2 : if !file.HasRangeKeys {
581 2 : updateHints = true
582 2 : }
583 2 : case deleteCompactionHintTypeRangeKeyOnly:
584 2 : // The initiating span contained only range key dels. The estimates
585 2 : // apply only to point keys, and are therefore not updated.
586 2 : updateEstimates = false
587 2 : // As the initiating span contained only range key dels, hints can
588 2 : // only be updated if this table does _not_ contain point keys.
589 2 : if !file.HasPointKeys {
590 2 : updateHints = true
591 2 : }
592 2 : case deleteCompactionHintTypePointAndRangeKey:
593 2 : // Always update the estimates and hints, as this hint type can drop a
594 2 : // file, irrespective of the mixture of keys. Similar to above, the
595 2 : // range del bytes estimates is an overestimate.
596 2 : updateEstimates, updateHints = true, true
597 0 : default:
598 0 : panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
599 : }
600 2 : if updateEstimates {
601 2 : estimate += file.Size
602 2 : }
603 2 : if updateHints && hintSeqNum > file.SmallestSeqNum {
604 2 : hintSeqNum = file.SmallestSeqNum
605 2 : }
606 2 : } else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
607 2 : // Partial overlap.
608 2 : if hintType == deleteCompactionHintTypeRangeKeyOnly {
609 2 : // If the hint that generated this overlap contains only range keys,
610 2 : // there is no need to calculate disk usage, as the reclaimable space
611 2 : // is expected to be minimal relative to point keys.
612 2 : continue
613 : }
614 2 : var size uint64
615 2 : var err error
616 2 : if file.Virtual {
617 2 : err = d.tableCache.withVirtualReader(
618 2 : file.VirtualMeta(), func(r sstable.VirtualReader) (err error) {
619 2 : size, err = r.EstimateDiskUsage(start, end)
620 2 : return err
621 2 : })
622 2 : } else {
623 2 : err = d.tableCache.withReader(
624 2 : file.PhysicalMeta(), func(r *sstable.Reader) (err error) {
625 2 : size, err = r.EstimateDiskUsage(start, end)
626 2 : return err
627 2 : })
628 : }
629 :
630 2 : if err != nil {
631 0 : return 0, hintSeqNum, err
632 0 : }
633 2 : estimate += size
634 : }
635 : }
636 : }
637 2 : return estimate, hintSeqNum, nil
638 : }
639 :
640 2 : func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) bool {
641 2 : // If a table contains range deletions or range key deletions, we defer the
642 2 : // stats collection. There are two main reasons for this:
643 2 : //
644 2 : // 1. Estimating the potential for reclaimed space due to a range deletion
645 2 : // tombstone requires scanning the LSM - a potentially expensive operation
646 2 : // that should be deferred.
647 2 : // 2. Range deletions and / or range key deletions present an opportunity to
648 2 : // compute "deletion hints", which also requires a scan of the LSM to
649 2 : // compute tables that would be eligible for deletion.
650 2 : //
651 2 : // These two tasks are deferred to the table stats collector goroutine.
652 2 : if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
653 2 : return false
654 2 : }
655 :
656 : // If a table is more than 10% point deletions without user-provided size
657 : // estimates, don't calculate the PointDeletionsBytesEstimate statistic
658 : // using our limited knowledge. The table stats collector can populate the
659 : // stats and calculate an average of value size of all the tables beneath
660 : // the table in the LSM, which will be more accurate.
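// Hedged worked example (assumed counts): with NumEntries = 1,000,
// NumDeletions = 150 and NumSizedDeletions = 20, there are 130 un-sized
// point deletions, which exceeds NumEntries/10 = 100, so stats collection is
// deferred to the background collector. With NumSizedDeletions = 120 only 30
// deletions are un-sized, and the cheap in-place estimate below is used.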
661 2 : if unsizedDels := (props.NumDeletions - props.NumSizedDeletions); unsizedDels > props.NumEntries/10 {
662 2 : return false
663 2 : }
664 :
665 2 : var pointEstimate uint64
666 2 : if props.NumEntries > 0 {
667 2 : // Use the file's own average key and value sizes as an estimate. This
668 2 : // doesn't require any additional IO and since the number of point
669 2 : // deletions in the file is low, the error introduced by this crude
670 2 : // estimate is expected to be small.
671 2 : commonProps := &props.CommonProperties
672 2 : avgValSize, compressionRatio := estimatePhysicalSizes(meta.Size, commonProps)
673 2 : pointEstimate = pointDeletionsBytesEstimate(meta.Size, commonProps, avgValSize, compressionRatio)
674 2 : }
675 :
676 2 : meta.Stats.NumEntries = props.NumEntries
677 2 : meta.Stats.NumDeletions = props.NumDeletions
678 2 : meta.Stats.NumRangeKeySets = props.NumRangeKeySets
679 2 : meta.Stats.PointDeletionsBytesEstimate = pointEstimate
680 2 : meta.Stats.RangeDeletionsBytesEstimate = 0
681 2 : meta.Stats.ValueBlocksSize = props.ValueBlocksSize
682 2 : meta.StatsMarkValid()
683 2 : return true
684 : }
685 :
686 : func pointDeletionsBytesEstimate(
687 : fileSize uint64, props *sstable.CommonProperties, avgValLogicalSize, compressionRatio float64,
688 2 : ) (estimate uint64) {
689 2 : if props.NumEntries == 0 {
690 0 : return 0
691 0 : }
692 2 : numPointDels := props.NumPointDeletions()
693 2 : if numPointDels == 0 {
694 2 : return 0
695 2 : }
696 : // Estimate the potential space to reclaim using the table's own properties.
697 : // There may or may not be keys covered by any individual point tombstone.
698 : // If not, compacting the point tombstone into L6 will at least allow us to
699 : // drop the point deletion key and will reclaim the tombstone's key bytes.
700 : // If there are covered key(s), we also get to drop key and value bytes for
701 : // each covered key.
702 : //
703 : // Some point tombstones (DELSIZEDs) carry a user-provided estimate of the
704 : // uncompressed size of entries that will be elided by fully compacting the
705 : // tombstone. For these tombstones, there's no guesswork—we use the
706 : // RawPointTombstoneValueSizeHint property which is the sum of all these
707 : // tombstones' encoded values.
708 : //
709 : // For un-sized point tombstones (DELs), we estimate assuming that each
710 : // point tombstone on average covers 1 key and using average value sizes.
711 : // This is almost certainly an overestimate, but that's probably okay
712 : // because point tombstones can slow range iterations even when they don't
713 : // cover a key.
714 : //
715 : // TODO(jackson): This logic doesn't directly incorporate fixed per-key
716 : // overhead (8-byte trailer, plus at least 1 byte encoding the length of the
717 : // key and 1 byte encoding the length of the value). This overhead is
718 : // indirectly incorporated through the compression ratios, but that results
719 : // in the overhead being smeared per key-byte and value-byte, rather than
720 : // per-entry. This per-key fixed overhead can be nontrivial, especially for
721 : // dense swaths of point tombstones. Give some thought as to whether we
722 : // should directly include fixed per-key overhead in the calculations.
723 :
724 : // Below, we calculate the tombstone contributions and the shadowed keys'
725 : // contributions separately.
726 2 : var tombstonesLogicalSize float64
727 2 : var shadowedLogicalSize float64
728 2 :
729 2 : // 1. Calculate the contribution of the tombstone keys themselves.
730 2 : if props.RawPointTombstoneKeySize > 0 {
731 2 : tombstonesLogicalSize += float64(props.RawPointTombstoneKeySize)
732 2 : } else {
733 0 : // This sstable predates the existence of the RawPointTombstoneKeySize
734 0 : // property. We can use the average key size within the file itself and
735 0 : // the count of point deletions to estimate the size.
736 0 : tombstonesLogicalSize += float64(numPointDels * props.RawKeySize / props.NumEntries)
737 0 : }
738 :
739 : // 2. Calculate the contribution of the keys shadowed by tombstones.
740 : //
741 : // 2a. First account for keys shadowed by DELSIZED tombstones. The DELSIZED
742 : // tombstones encode the size of both the key and value of the shadowed KV
743 : // entries. These sizes are aggregated into a sstable property.
744 2 : shadowedLogicalSize += float64(props.RawPointTombstoneValueSize)
745 2 :
746 2 : // 2b. Calculate the contribution of the KV entries shadowed by ordinary DEL
747 2 : // keys.
748 2 : numUnsizedDels := numPointDels - props.NumSizedDeletions
749 2 : {
750 2 : // The shadowed keys have the same exact user keys as the tombstones
751 2 : // themselves, so we can use the `tombstonesLogicalSize` we computed
752 2 : // earlier as an estimate. There's a complication that
753 2 : // `tombstonesLogicalSize` may include DELSIZED keys we already
754 2 : // accounted for.
755 2 : shadowedLogicalSize += float64(tombstonesLogicalSize) / float64(numPointDels) * float64(numUnsizedDels)
756 2 :
757 2 : // Calculate the contribution of the deleted values. The caller has
758 2 : // already computed an average logical size (possibly computed across
759 2 : // many sstables).
760 2 : shadowedLogicalSize += float64(numUnsizedDels) * avgValLogicalSize
761 2 : }
762 :
763 : // Scale both tombstone and shadowed totals by logical:physical ratios to
764 : // account for compression, metadata overhead, etc.
765 : //
766 : // Physical FileSize
767 : // ----------- = -----------------------
768 : // Logical RawKeySize+RawValueSize
769 : //
770 2 : return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
771 : }
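// Hedged worked example for pointDeletionsBytesEstimate (all inputs are
// assumed): a file with 1,000 un-sized DELs, no DELSIZEDs,
// RawPointTombstoneKeySize = 24,000 bytes, avgValLogicalSize = 100 and
// compressionRatio = 0.5 yields
//
//    tombstonesLogicalSize = 24,000                   (tombstone keys)
//    shadowedLogicalSize   = 0                        (no DELSIZED values)
//                          + 24,000/1,000 * 1,000     (shadowed keys)
//                          + 1,000 * 100              (shadowed values)
//                          = 124,000
//    estimate              = (24,000 + 124,000) * 0.5 = 74,000 bytes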
772 :
773 : func estimatePhysicalSizes(
774 : fileSize uint64, props *sstable.CommonProperties,
775 2 : ) (avgValLogicalSize, compressionRatio float64) {
776 2 : // RawKeySize and RawValueSize are uncompressed totals. Scale according to
777 2 : // the data size to account for compression, index blocks and metadata
778 2 : // overhead. Eg:
779 2 : //
780 2 : // Compression rate × Average uncompressed value size
781 2 : //
782 2 : // ↓
783 2 : //
784 2 : //        FileSize          RawValSize
785 2 : // ----------------------- × ----------
786 2 : // RawKeySize+RawValueSize   NumEntries
787 2 : //
788 2 : uncompressedSum := props.RawKeySize + props.RawValueSize
789 2 : compressionRatio = float64(fileSize) / float64(uncompressedSum)
790 2 : avgValLogicalSize = (float64(props.RawValueSize) / float64(props.NumEntries))
791 2 : return avgValLogicalSize, compressionRatio
792 2 : }
793 :
794 : // newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
795 : // returns "ranged deletion" spans for a single table, providing a combined view
796 : // of both range deletion and range key deletion spans. The
797 : // tableRangedDeletionIter is intended for use in the specific case of computing
798 : // the statistics and deleteCompactionHints for a single table.
799 : //
800 : // As an example, consider the following set of spans from the range deletion
801 : // and range key blocks of a table:
802 : //
803 : // |---------| |---------| |-------| RANGEKEYDELs
804 : // |-----------|-------------| |-----| RANGEDELs
805 : // __________________________________________________________
806 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
807 : //
808 : // The tableRangedDeletionIter produces the following set of output spans, where
809 : // '1' indicates a span containing only range deletions, '2' is a span
810 : // containing only range key deletions, and '3' is a span containing a mixture
811 : // of both range deletions and range key deletions.
812 : //
813 : // 1 3 1 3 2 1 3 2
814 : // |-----|---------|-----|---|-----| |---|-|-----|
815 : // __________________________________________________________
816 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
817 : //
818 : // Algorithm.
819 : //
820 : // The iterator first defragments the range deletion and range key blocks
821 : // separately. During this defragmentation, the range key block is also filtered
822 : // so that keys other than range key deletes are ignored. The range delete and
823 : // range key delete keyspaces are then merged.
824 : //
825 : // Note that the only fragmentation introduced by merging is from where a range
826 : // del span overlaps with a range key del span. Within the bounds of any overlap
827 : // there is guaranteed to be no further fragmentation, as the constituent spans
828 : // have already been defragmented. To the left and right of any overlap, the
829 : // same reasoning applies. For example,
830 : //
831 : // |--------| |-------| RANGEKEYDEL
832 : // |---------------------------| RANGEDEL
833 : // |----1---|----3---|----1----|---2---| Merged, fragmented spans.
834 : // __________________________________________________________
835 : // a b c d e f g h i j k l m n o p q r s t u v w x y z
836 : //
837 : // Any fragmented abutting spans produced by the merging iter will be of
838 : // differing types (i.e. a transition from a span with homogeneous key kinds to a
839 : // heterogeneous span, or a transition from a span with exclusively range dels
840 : // to a span with exclusively range key dels). Therefore, further
841 : // defragmentation is not required.
842 : //
843 : // Each span returned by the tableRangeDeletionIter will have at most four keys,
844 : // corresponding to the largest and smallest sequence numbers encountered across
845 : // the range deletes and range keys deletes that comprised the merged spans.
846 : func newCombinedDeletionKeyspanIter(
847 : comparer *base.Comparer, cr sstable.CommonReader, m *fileMetadata,
848 2 : ) (keyspan.FragmentIterator, error) {
849 2 : // The range del iter and range key iter are each wrapped in their own
850 2 : // defragmenting iter. For each iter, abutting spans can always be merged.
851 2 : var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true })
852 : // Reduce keys by maintaining a slice of at most length two, corresponding to
853 : // the largest and smallest keys in the defragmented span. This maintains the
854 : // contract that the emitted slice is sorted by (SeqNum, Kind) descending.
855 2 : reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
856 2 : if len(current) == 0 && len(incoming) == 0 {
857 0 : // While this should never occur in practice, a defensive return is used
858 0 : // here to preserve correctness.
859 0 : return current
860 0 : }
861 2 : var largest, smallest keyspan.Key
862 2 : var set bool
863 2 : for _, keys := range [2][]keyspan.Key{current, incoming} {
864 2 : if len(keys) == 0 {
865 0 : continue
866 : }
867 2 : first, last := keys[0], keys[len(keys)-1]
868 2 : if !set {
869 2 : largest, smallest = first, last
870 2 : set = true
871 2 : continue
872 : }
873 2 : if first.Trailer > largest.Trailer {
874 2 : largest = first
875 2 : }
876 2 : if last.Trailer < smallest.Trailer {
877 2 : smallest = last
878 2 : }
879 : }
880 2 : if largest.Equal(comparer.Equal, smallest) {
881 2 : current = append(current[:0], largest)
882 2 : } else {
883 2 : current = append(current[:0], largest, smallest)
884 2 : }
885 2 : return current
886 : }
887 :
888 : // The separate iters for the range dels and range keys are wrapped in a
889 : // merging iter to join the keyspaces into a single keyspace. The separate
890 : // iters are only added if the particular key kind is present.
891 2 : mIter := &keyspanimpl.MergingIter{}
892 2 : var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
893 2 : if in.KeysOrder != keyspan.ByTrailerDesc {
894 0 : panic("pebble: combined deletion iter encountered keys in non-trailer descending order")
895 : }
896 2 : out.Start, out.End = in.Start, in.End
897 2 : out.Keys = append(out.Keys[:0], in.Keys...)
898 2 : out.KeysOrder = keyspan.ByTrailerDesc
899 2 : // NB: The order of by-trailer descending may have been violated,
900 2 : // because we've layered rangekey and rangedel iterators from the same
901 2 : // sstable into the same keyspanimpl.MergingIter. The MergingIter will
902 2 : // return the keys in the order that the child iterators were provided.
903 2 : // Sort the keys to ensure they're sorted by trailer descending.
904 2 : keyspan.SortKeysByTrailer(&out.Keys)
905 2 : return nil
906 : })
907 2 : mIter.Init(comparer.Compare, transform, new(keyspanimpl.MergingBuffers))
908 2 :
909 2 : iter, err := cr.NewRawRangeDelIter(m.IterTransforms())
910 2 : if err != nil {
911 0 : return nil, err
912 0 : }
913 2 : if iter != nil {
914 2 : // Assert expected bounds. In previous versions of Pebble, range
915 2 : // deletions persisted to sstables could exceed the bounds of the
916 2 : // containing files due to "split user keys." This required readers to
917 2 : // constrain the tombstones' bounds to the containing file at read time.
918 2 : // See docs/range_deletions.md for an extended discussion of the design
919 2 : // and invariants at that time.
920 2 : //
921 2 : // We've since compacted away all 'split user-keys' and in the process
922 2 : // eliminated all "untruncated range tombstones" for physical sstables.
923 2 : // We no longer need to perform truncation at read time for these
924 2 : // sstables.
925 2 : //
926 2 : // At the same time, we've also introduced the concept of "virtual
927 2 : // SSTables" where the file metadata's effective bounds can again be
928 2 : // reduced to be narrower than the contained tombstones. These virtual
929 2 : // SSTables handle truncation differently, performing it using
930 2 : // keyspan.Truncate when the sstable's range deletion iterator is
931 2 : // opened.
932 2 : //
933 2 : // Together, these mean that we should never see untruncated range
934 2 : // tombstones any more—and the merging iterator no longer accounts for
935 2 : // their existence. Since there's abundant subtlety that we're relying
936 2 : // on, we choose to be conservative and assert that these invariants
937 2 : // hold. We could (and previously did) choose to only validate these
938 2 : // bounds in invariants builds, but the most likely avenue for these
939 2 : // tombstones' existence is through a bug in a migration and old data
940 2 : // sitting around in an old store from long ago.
941 2 : //
942 2 : // The table stats collector will read all files' range deletions
943 2 : // asynchronously after Open, and provides a perfect opportunity to
944 2 : // validate our invariants without harming user latency. We also
945 2 : // previously performed truncation here which similarly required key
946 2 : // comparisons, so replacing those key comparisons with assertions
947 2 : // should be roughly similar in performance.
948 2 : //
949 2 : // TODO(jackson): Only use AssertBounds in invariants builds in the
950 2 : // following release.
951 2 : iter = keyspan.AssertBounds(
952 2 : iter, m.SmallestPointKey, m.LargestPointKey.UserKey, comparer.Compare,
953 2 : )
954 2 : dIter := &keyspan.DefragmentingIter{}
955 2 : dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
956 2 : iter = dIter
957 2 : mIter.AddLevel(iter)
958 2 : }
959 :
960 2 : iter, err = cr.NewRawRangeKeyIter(m.IterTransforms())
961 2 : if err != nil {
962 0 : return nil, err
963 0 : }
964 2 : if iter != nil {
965 2 : // Assert expected bounds in tests.
966 2 : if invariants.Enabled {
967 2 : iter = keyspan.AssertBounds(
968 2 : iter, m.SmallestRangeKey, m.LargestRangeKey.UserKey, comparer.Compare,
969 2 : )
970 2 : }
971 : // Wrap the range key iterator in a filter that elides keys other than range
972 : // key deletions.
973 2 : iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
974 2 : out.Start, out.End = in.Start, in.End
975 2 : out.Keys = out.Keys[:0]
976 2 : for _, k := range in.Keys {
977 2 : if k.Kind() != base.InternalKeyKindRangeKeyDelete {
978 2 : continue
979 : }
980 2 : out.Keys = append(out.Keys, k)
981 : }
982 2 : return len(out.Keys) > 0
983 : }, comparer.Compare)
984 2 : dIter := &keyspan.DefragmentingIter{}
985 2 : dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
986 2 : iter = dIter
987 2 : mIter.AddLevel(iter)
988 : }
989 :
990 2 : return mIter, nil
991 : }
992 :
993 : // rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
994 : // with the sum of the files' counts of range key fragments. Its annotation type
995 : // is a *uint64. The count of range key sets may change once a table's stats are
996 : // loaded asynchronously, so its values are marked as cacheable only if a file's
997 : // stats have been loaded.
998 : type rangeKeySetsAnnotator struct{}
999 :
1000 : var _ manifest.Annotator = rangeKeySetsAnnotator{}
1001 :
1002 2 : func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
1003 2 : if dst == nil {
1004 2 : return new(uint64)
1005 2 : }
1006 0 : v := dst.(*uint64)
1007 0 : *v = 0
1008 0 : return v
1009 : }
1010 :
1011 : func (a rangeKeySetsAnnotator) Accumulate(
1012 : f *fileMetadata, dst interface{},
1013 2 : ) (v interface{}, cacheOK bool) {
1014 2 : vptr := dst.(*uint64)
1015 2 : *vptr = *vptr + f.Stats.NumRangeKeySets
1016 2 : return vptr, f.StatsValid()
1017 2 : }
1018 :
1019 1 : func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
1020 1 : srcV := src.(*uint64)
1021 1 : dstV := dst.(*uint64)
1022 1 : *dstV = *dstV + *srcV
1023 1 : return dstV
1024 1 : }
1025 :
1026 : // countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
1027 : // files of the LSM. It only counts keys in files for which table stats have
1028 : // been loaded. It uses a b-tree annotator to cache intermediate values between
1029 : // calculations when possible.
1030 2 : func countRangeKeySetFragments(v *version) (count uint64) {
1031 2 : for l := 0; l < numLevels; l++ {
1032 2 : if v.RangeKeyLevels[l].Empty() {
1033 2 : continue
1034 : }
1035 2 : count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
1036 : }
1037 2 : return count
1038 : }
1039 :
1040 : // tombstonesAnnotator implements manifest.Annotator, annotating B-Tree nodes
1041 : // with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDEL
1042 : // keys). Its annotation type is a *uint64. The count of tombstones may change
1043 : // once a table's stats are loaded asynchronously, so its values are marked as
1044 : // cacheable only if a file's stats have been loaded.
1045 : type tombstonesAnnotator struct{}
1046 :
1047 : var _ manifest.Annotator = tombstonesAnnotator{}
1048 :
1049 2 : func (a tombstonesAnnotator) Zero(dst interface{}) interface{} {
1050 2 : if dst == nil {
1051 2 : return new(uint64)
1052 2 : }
1053 1 : v := dst.(*uint64)
1054 1 : *v = 0
1055 1 : return v
1056 : }
1057 :
1058 : func (a tombstonesAnnotator) Accumulate(
1059 : f *fileMetadata, dst interface{},
1060 2 : ) (v interface{}, cacheOK bool) {
1061 2 : vptr := dst.(*uint64)
1062 2 : *vptr = *vptr + f.Stats.NumDeletions
1063 2 : return vptr, f.StatsValid()
1064 2 : }
1065 :
1066 2 : func (a tombstonesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
1067 2 : srcV := src.(*uint64)
1068 2 : dstV := dst.(*uint64)
1069 2 : *dstV = *dstV + *srcV
1070 2 : return dstV
1071 2 : }
1072 :
1073 : // countTombstones counts the number of tombstone (DEL, SINGLEDEL and RANGEDEL)
1074 : // internal keys across all files of the LSM. It only counts keys in files for
1075 : // which table stats have been loaded. It uses a b-tree annotator to cache
1076 : // intermediate values between calculations when possible.
1077 2 : func countTombstones(v *version) (count uint64) {
1078 2 : for l := 0; l < numLevels; l++ {
1079 2 : if v.Levels[l].Empty() {
1080 2 : continue
1081 : }
1082 2 : count += *v.Levels[l].Annotation(tombstonesAnnotator{}).(*uint64)
1083 : }
1084 2 : return count
1085 : }
1086 :
1087 : // valueBlocksSizeAnnotator implements manifest.Annotator, annotating B-Tree
1088 : // nodes with the sum of the files' Properties.ValueBlocksSize. Its annotation
1089 : // type is a *uint64. The value block size may change once a table's stats are
1090 : // loaded asynchronously, so its values are marked as cacheable only if a
1091 : // file's stats have been loaded.
1092 : type valueBlocksSizeAnnotator struct{}
1093 :
1094 : var _ manifest.Annotator = valueBlocksSizeAnnotator{}
1095 :
1096 2 : func (a valueBlocksSizeAnnotator) Zero(dst interface{}) interface{} {
1097 2 : if dst == nil {
1098 2 : return new(uint64)
1099 2 : }
1100 1 : v := dst.(*uint64)
1101 1 : *v = 0
1102 1 : return v
1103 : }
1104 :
1105 : func (a valueBlocksSizeAnnotator) Accumulate(
1106 : f *fileMetadata, dst interface{},
1107 2 : ) (v interface{}, cacheOK bool) {
1108 2 : vptr := dst.(*uint64)
1109 2 : *vptr = *vptr + f.Stats.ValueBlocksSize
1110 2 : return vptr, f.StatsValid()
1111 2 : }
1112 :
1113 2 : func (a valueBlocksSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
1114 2 : srcV := src.(*uint64)
1115 2 : dstV := dst.(*uint64)
1116 2 : *dstV = *dstV + *srcV
1117 2 : return dstV
1118 2 : }
1119 :
1120 : // valueBlocksSizeForLevel returns the Properties.ValueBlocksSize across all
1121 : // files for a level of the LSM. It only includes the size for files for which
1122 : // table stats have been loaded. It uses a b-tree annotator to cache
1123 : // intermediate values between calculations when possible. It must not be
1124 : // called concurrently.
1125 : //
1126 : // REQUIRES: 0 <= level < numLevels.
1127 2 : func valueBlocksSizeForLevel(v *version, level int) (count uint64) {
1128 2 : if v.Levels[level].Empty() {
1129 2 : return 0
1130 2 : }
1131 2 : return *v.Levels[level].Annotation(valueBlocksSizeAnnotator{}).(*uint64)
1132 : }
|