LCOV - 2024-07-28 08:16Z ed42fb43 - meta test only.lcov

LCOV - code coverage report

Current view:	top level - pebble/sstable - writer.go (source / functions)		Hit	Total	Coverage
Test:	2024-07-28 08:16Z ed42fb43 - meta test only.lcov	Lines:	925	1298	71.3 %
Date:	2024-07-28 08:17:03	Functions:	0	0	-

          Line data    Source code

       1             : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package sstable
       6             : 
       7             : import (
       8             :         "bytes"
       9             :         "encoding/binary"
      10             :         "fmt"
      11             :         "math"
      12             :         "runtime"
      13             :         "slices"
      14             :         "sort"
      15             :         "sync"
      16             : 
      17             :         "github.com/cockroachdb/errors"
      18             :         "github.com/cockroachdb/pebble/internal/base"
      19             :         "github.com/cockroachdb/pebble/internal/bytealloc"
      20             :         "github.com/cockroachdb/pebble/internal/cache"
      21             :         "github.com/cockroachdb/pebble/internal/invariants"
      22             :         "github.com/cockroachdb/pebble/internal/keyspan"
      23             :         "github.com/cockroachdb/pebble/internal/rangedel"
      24             :         "github.com/cockroachdb/pebble/internal/rangekey"
      25             :         "github.com/cockroachdb/pebble/objstorage"
      26             :         "github.com/cockroachdb/pebble/sstable/block"
      27             :         "github.com/cockroachdb/pebble/sstable/rowblk"
      28             : )
      29             : 
      30             : // encodedBHPEstimatedSize estimates the size of the encoded BlockHandleWithProperties.
      31             : // It would also be nice to account for the length of the data block properties here,
      32             : // but isn't necessary since this is an estimate.
      33             : const encodedBHPEstimatedSize = binary.MaxVarintLen64 * 2
      34             : 
      35             : var errWriterClosed = errors.New("pebble: writer is closed")
      36             : 
      37             : // WriterMetadata holds info about a finished sstable.
      38             : type WriterMetadata struct {
      39             :         Size          uint64
      40             :         SmallestPoint InternalKey
      41             :         // LargestPoint, LargestRangeKey, LargestRangeDel should not be accessed
      42             :         // before Writer.Close is called, because they may only be set on
      43             :         // Writer.Close.
      44             :         LargestPoint     InternalKey
      45             :         SmallestRangeDel InternalKey
      46             :         LargestRangeDel  InternalKey
      47             :         SmallestRangeKey InternalKey
      48             :         LargestRangeKey  InternalKey
      49             :         HasPointKeys     bool
      50             :         HasRangeDelKeys  bool
      51             :         HasRangeKeys     bool
      52             :         SmallestSeqNum   base.SeqNum
      53             :         LargestSeqNum    base.SeqNum
      54             :         Properties       Properties
      55             : }
      56             : 
      57             : // SetSmallestPointKey sets the smallest point key to the given key.
      58             : // NB: this method sets the "absolute" smallest point key. Any existing key is
      59             : // overridden.
      60           1 : func (m *WriterMetadata) SetSmallestPointKey(k InternalKey) {
      61           1 :         m.SmallestPoint = k
      62           1 :         m.HasPointKeys = true
      63           1 : }
      64             : 
      65             : // SetSmallestRangeDelKey sets the smallest rangedel key to the given key.
      66             : // NB: this method sets the "absolute" smallest rangedel key. Any existing key is
      67             : // overridden.
      68           1 : func (m *WriterMetadata) SetSmallestRangeDelKey(k InternalKey) {
      69           1 :         m.SmallestRangeDel = k
      70           1 :         m.HasRangeDelKeys = true
      71           1 : }
      72             : 
      73             : // SetSmallestRangeKey sets the smallest range key to the given key.
      74             : // NB: this method sets the "absolute" smallest range key. Any existing key is
      75             : // overridden.
      76           1 : func (m *WriterMetadata) SetSmallestRangeKey(k InternalKey) {
      77           1 :         m.SmallestRangeKey = k
      78           1 :         m.HasRangeKeys = true
      79           1 : }
      80             : 
      81             : // SetLargestPointKey sets the largest point key to the given key.
      82             : // NB: this method sets the "absolute" largest point key. Any existing key is
      83             : // overridden.
      84           1 : func (m *WriterMetadata) SetLargestPointKey(k InternalKey) {
      85           1 :         m.LargestPoint = k
      86           1 :         m.HasPointKeys = true
      87           1 : }
      88             : 
      89             : // SetLargestRangeDelKey sets the largest rangedel key to the given key.
      90             : // NB: this method sets the "absolute" largest rangedel key. Any existing key is
      91             : // overridden.
      92           1 : func (m *WriterMetadata) SetLargestRangeDelKey(k InternalKey) {
      93           1 :         m.LargestRangeDel = k
      94           1 :         m.HasRangeDelKeys = true
      95           1 : }
      96             : 
      97             : // SetLargestRangeKey sets the largest range key to the given key.
      98             : // NB: this method sets the "absolute" largest range key. Any existing key is
      99             : // overridden.
     100           1 : func (m *WriterMetadata) SetLargestRangeKey(k InternalKey) {
     101           1 :         m.LargestRangeKey = k
     102           1 :         m.HasRangeKeys = true
     103           1 : }
     104             : 
     105           1 : func (m *WriterMetadata) updateSeqNum(seqNum base.SeqNum) {
     106           1 :         if m.SmallestSeqNum > seqNum {
     107           1 :                 m.SmallestSeqNum = seqNum
     108           1 :         }
     109           1 :         if m.LargestSeqNum < seqNum {
     110           1 :                 m.LargestSeqNum = seqNum
     111           1 :         }
     112             : }
     113             : 
     114             : // flushDecisionOptions holds parameters to inform the sstable block flushing
     115             : // heuristics.
     116             : type flushDecisionOptions struct {
     117             :         blockSize          int
     118             :         blockSizeThreshold int
     119             :         // sizeClassAwareThreshold takes precedence over blockSizeThreshold when the
     120             :         // Writer is aware of the allocator's size classes.
     121             :         sizeClassAwareThreshold int
     122             : }
     123             : 
     124             : // Writer is a table writer.
     125             : type Writer struct {
     126             :         layout layoutWriter
     127             :         meta   WriterMetadata
     128             :         err    error
     129             :         // dataBlockOptions and indexBlockOptions are used to configure the sstable
     130             :         // block flush heuristics.
     131             :         dataBlockOptions  flushDecisionOptions
     132             :         indexBlockOptions flushDecisionOptions
     133             :         // The following fields are copied from Options.
     134             :         compare              Compare
     135             :         split                Split
     136             :         formatKey            base.FormatKey
     137             :         compression          Compression
     138             :         separator            Separator
     139             :         successor            Successor
     140             :         tableFormat          TableFormat
     141             :         isStrictObsolete     bool
     142             :         writingToLowestLevel bool
     143             :         restartInterval      int
     144             :         checksumType         block.ChecksumType
     145             :         // disableKeyOrderChecks disables the checks that keys are added to an
     146             :         // sstable in order. It is intended for internal use only in the construction
     147             :         // of invalid sstables for testing. See tool/make_test_sstables.go.
     148             :         disableKeyOrderChecks bool
     149             :         // With two level indexes, the index/filter of a SST file is partitioned into
     150             :         // smaller blocks with an additional top-level index on them. When reading an
     151             :         // index/filter, only the top-level index is loaded into memory. The two level
     152             :         // index/filter then uses the top-level index to load on demand into the block
     153             :         // cache the partitions that are required to perform the index/filter query.
     154             :         //
     155             :         // Two level indexes are enabled automatically when there is more than one
     156             :         // index block.
     157             :         //
     158             :         // This is useful when there are very large index blocks, which generally occurs
     159             :         // with the usage of large keys. With large index blocks, the index blocks fight
     160             :         // the data blocks for block cache space and the index blocks are likely to be
     161             :         // re-read many times from the disk. The top level index, which has a much
     162             :         // smaller memory footprint, can be used to prevent the entire index block from
     163             :         // being loaded into the block cache.
     164             :         twoLevelIndex       bool
     165             :         indexBlock          *indexBlockBuf
     166             :         rangeDelBlock       rowblk.Writer
     167             :         rangeKeyBlock       rowblk.Writer
     168             :         topLevelIndexBlock  rowblk.Writer
     169             :         props               Properties
     170             :         blockPropCollectors []BlockPropertyCollector
     171             :         obsoleteCollector   obsoleteKeyBlockPropertyCollector
     172             :         blockPropsEncoder   blockPropertiesEncoder
     173             :         // filter accumulates the filter block. If populated, the filter ingests
     174             :         // either the output of w.split (i.e. a prefix extractor) if w.split is not
     175             :         // nil, or the full keys otherwise.
     176             :         filter          filterWriter
     177             :         indexPartitions []indexBlockAndBlockProperties
     178             : 
     179             :         // indexBlockAlloc is used to bulk-allocate byte slices used to store index
     180             :         // blocks in indexPartitions. These live until the index finishes.
     181             :         indexBlockAlloc []byte
     182             :         // indexSepAlloc is used to bulk-allocate index block separator slices stored
     183             :         // in indexPartitions. These live until the index finishes.
     184             :         indexSepAlloc bytealloc.A
     185             : 
     186             :         // To allow potentially overlapping (i.e. un-fragmented) range key spans to
     187             :         // be added to the Writer, a keyspan.Fragmenter is used to retain the keys
     188             :         // and values, emitting fragmented, coalesced spans as appropriate. Range
     189             :         // keys must be added in order of their start user-key.
     190             :         fragmenter        keyspan.Fragmenter
     191             :         rangeKeyEncoder   rangekey.Encoder
     192             :         rangeKeysBySuffix keyspan.KeysBySuffix
     193             :         rangeKeySpan      keyspan.Span
     194             :         rkBuf             []byte
     195             :         // dataBlockBuf consists of the state which is currently owned by and used by
     196             :         // the Writer client goroutine. This state can be handed off to other goroutines.
     197             :         dataBlockBuf *dataBlockBuf
     198             :         // blockBuf consists of the state which is owned by and used by the Writer client
     199             :         // goroutine.
     200             :         blockBuf blockBuf
     201             : 
     202             :         coordination coordinationState
     203             : 
     204             :         // Information (other than the byte slice) about the last point key, to
     205             :         // avoid extracting it again.
     206             :         lastPointKeyInfo pointKeyInfo
     207             : 
     208             :         // For value blocks.
     209             :         shortAttributeExtractor   base.ShortAttributeExtractor
     210             :         requiredInPlaceValueBound UserKeyPrefixBound
     211             :         // When w.tableFormat >= TableFormatPebblev3, valueBlockWriter is nil iff
     212             :         // WriterOptions.DisableValueBlocks was true.
     213             :         valueBlockWriter *valueBlockWriter
     214             : 
     215             :         allocatorSizeClasses []int
     216             : }
     217             : 
     218             : type pointKeyInfo struct {
     219             :         trailer base.InternalKeyTrailer
     220             :         // Only computed when w.valueBlockWriter is not nil.
     221             :         userKeyLen int
     222             :         // prefixLen uses w.split, if not nil. Only computed when w.valueBlockWriter
     223             :         // is not nil.
     224             :         prefixLen int
     225             :         // True iff the point was marked obsolete.
     226             :         isObsolete bool
     227             : }
     228             : 
     229             : type coordinationState struct {
     230             :         parallelismEnabled bool
     231             : 
     232             :         // writeQueue is used to write data blocks to disk. The writeQueue is primarily
     233             :         // used to maintain the order in which data blocks must be written to disk. For
     234             :         // this reason, every single data block write must be done through the writeQueue.
     235             :         writeQueue *writeQueue
     236             : 
     237             :         sizeEstimate dataBlockEstimates
     238             : }
     239             : 
     240           1 : func (c *coordinationState) init(parallelismEnabled bool, writer *Writer) {
     241           1 :         c.parallelismEnabled = parallelismEnabled
     242           1 :         // useMutex is false regardless of parallelismEnabled, because we do not do
     243           1 :         // parallel compression yet.
     244           1 :         c.sizeEstimate.useMutex = false
     245           1 : 
     246           1 :         // writeQueueSize determines the size of the write queue, or the number
     247           1 :         // of items which can be added to the queue without blocking. By default, we
     248           1 :         // use a writeQueue size of 0, since we won't be doing any block writes in
     249           1 :         // parallel.
     250           1 :         writeQueueSize := 0
     251           1 :         if parallelismEnabled {
     252           1 :                 writeQueueSize = runtime.GOMAXPROCS(0)
     253           1 :         }
     254           1 :         c.writeQueue = newWriteQueue(writeQueueSize, writer)
     255             : }
     256             : 
     257             : // sizeEstimate is a general purpose helper for estimating two kinds of sizes:
     258             : //
     259             : //   - A. The compressed sstable size, which is useful for deciding when to
     260             : //     start a new sstable during flushes or compactions. In practice, we use
     261             : //     this in estimating the data size (excluding the index).
     262             : //   - B. The size of index blocks to decide when to start a new index
     263             : //     block.
     264             : //
     265             : // There are some terminology peculiarities which are due to the origin of
     266             : // sizeEstimate for use case A with parallel compression enabled (for which
     267             : // the code has not been merged). Specifically this relates to the terms
     268             : // "written" and "compressed".
     269             : //   - The notion of "written" for case A is sufficiently defined by saying that
     270             : //     the data block is compressed. Waiting for the actual data block write to
     271             : //     happen can result in unnecessary estimation, when we already know how big
     272             : //     it will be in compressed form. Additionally, with the forthcoming value
     273             : //     blocks containing older MVCC values, these compressed blocks will be held
     274             : //     in-memory until late in the sstable writing, and we do want to accurately
     275             : //     account for them without waiting for the actual write.
     276             : //     For case B, "written" means that the index entry has been fully
     277             : //     generated, and has been added to the uncompressed block buffer for that
     278             : //     index block. It does not include actually writing a potentially
     279             : //     compressed index block.
     280             : //   - The notion of "compressed" is to differentiate between an "inflight" size
     281             : //     and the actual size, and is handled via computing a compression ratio
     282             : //     observed so far (defaults to 1).
     283             : //     For case A, this is actual data block compression, so the "inflight" size
     284             : //     is uncompressed blocks (that are no longer being written to) and the
     285             : //     "compressed" size is after they have been compressed.
     286             : //     For case B the inflight size is for a key-value pair in the index for
     287             : //     which the value size (the encoded size of the BlockHandleWithProperties)
     288             : //     is not accurately known, while the compressed size is the size of that
     289             : //     entry when it has been added to the (in-progress) index block.
     290             : //
     291             : // Usage: To update state, one can optionally provide an inflight write value
     292             : // using addInflight (used for case B). When something is "written" the state
     293             : // can be updated using either writtenWithDelta or writtenWithTotal, which
     294             : // provide the actual delta size or the total size (latter must be
     295             : // monotonically non-decreasing). If there were no calls to addInflight, there
     296             : // isn't any real estimation happening here. So case A does not do any real
     297             : // estimation. However, when we introduce parallel compression, there will be
     298             : // estimation in that the client goroutine will call addInflight and the
     299             : // compression goroutines will call writtenWithDelta.
     300             : type sizeEstimate struct {
     301             :         // emptySize is the size when there is no inflight data, and numEntries is 0.
     302             :         // emptySize is constant once set.
     303             :         emptySize uint64
     304             : 
     305             :         // inflightSize is the estimated size of some inflight data which hasn't
     306             :         // been written yet.
     307             :         inflightSize uint64
     308             : 
     309             :         // totalSize is the total size of the data which has already been written.
     310             :         totalSize uint64
     311             : 
     312             :         // numWrittenEntries is the total number of entries which have already been
     313             :         // written.
     314             :         numWrittenEntries uint64
     315             :         // numInflightEntries is the total number of entries which are inflight, and
     316             :         // haven't been written.
     317             :         numInflightEntries uint64
     318             : 
     319             :         // maxEstimatedSize stores the maximum result returned from sizeEstimate.size.
     320             :         // It ensures that values returned from subsequent calls to Writer.EstimatedSize
     321             :         // never decrease.
     322             :         maxEstimatedSize uint64
     323             : 
     324             :         // We assume that the entries added to the sizeEstimate can be compressed.
     325             :         // For this reason, we keep track of a compressedSize and an uncompressedSize
     326             :         // to compute a compression ratio for the inflight entries. If the entries
     327             :         // aren't being compressed, then compressedSize and uncompressedSize must be
     328             :         // equal.
     329             :         compressedSize   uint64
     330             :         uncompressedSize uint64
     331             : }
     332             : 
     333           1 : func (s *sizeEstimate) init(emptySize uint64) {
     334           1 :         s.emptySize = emptySize
     335           1 : }
     336             : 
     337           1 : func (s *sizeEstimate) size() uint64 {
     338           1 :         ratio := float64(1)
     339           1 :         if s.uncompressedSize > 0 {
     340           1 :                 ratio = float64(s.compressedSize) / float64(s.uncompressedSize)
     341           1 :         }
     342           1 :         estimatedInflightSize := uint64(float64(s.inflightSize) * ratio)
     343           1 :         total := s.totalSize + estimatedInflightSize
     344           1 :         if total > s.maxEstimatedSize {
     345           1 :                 s.maxEstimatedSize = total
     346           1 :         } else {
     347           1 :                 total = s.maxEstimatedSize
     348           1 :         }
     349             : 
     350           1 :         if total == 0 {
     351           1 :                 return s.emptySize
     352           1 :         }
     353             : 
     354           1 :         return total
     355             : }
     356             : 
     357           1 : func (s *sizeEstimate) numTotalEntries() uint64 {
     358           1 :         return s.numWrittenEntries + s.numInflightEntries
     359           1 : }
     360             : 
     361           1 : func (s *sizeEstimate) addInflight(size int) {
     362           1 :         s.numInflightEntries++
     363           1 :         s.inflightSize += uint64(size)
     364           1 : }
     365             : 
     366           1 : func (s *sizeEstimate) writtenWithTotal(newTotalSize uint64, inflightSize int) {
     367           1 :         finalEntrySize := int(newTotalSize - s.totalSize)
     368           1 :         s.writtenWithDelta(finalEntrySize, inflightSize)
     369           1 : }
     370             : 
     371           1 : func (s *sizeEstimate) writtenWithDelta(finalEntrySize int, inflightSize int) {
     372           1 :         if inflightSize > 0 {
     373           1 :                 // This entry was previously inflight, so we should decrement inflight
     374           1 :                 // entries and update the "compression" stats for future estimation.
     375           1 :                 s.numInflightEntries--
     376           1 :                 s.inflightSize -= uint64(inflightSize)
     377           1 :                 s.uncompressedSize += uint64(inflightSize)
     378           1 :                 s.compressedSize += uint64(finalEntrySize)
     379           1 :         }
     380           1 :         s.numWrittenEntries++
     381           1 :         s.totalSize += uint64(finalEntrySize)
     382             : }
     383             : 
     384           1 : func (s *sizeEstimate) clear() {
     385           1 :         *s = sizeEstimate{emptySize: s.emptySize}
     386           1 : }
     387             : 
     388             : type indexBlockBuf struct {
     389             :         // block will only be accessed from the writeQueue.
     390             :         block rowblk.Writer
     391             : 
     392             :         size struct {
     393             :                 useMutex bool
     394             :                 mu       sync.Mutex
     395             :                 estimate sizeEstimate
     396             :         }
     397             : 
     398             :         // restartInterval matches indexBlockBuf.block.RestartInterval. We store it twice, because the `block`
     399             :         // must only be accessed from the writeQueue goroutine.
     400             :         restartInterval int
     401             : }
     402             : 
     403           1 : func (i *indexBlockBuf) clear() {
     404           1 :         i.block.Reset()
     405           1 :         if i.size.useMutex {
     406           1 :                 i.size.mu.Lock()
     407           1 :                 defer i.size.mu.Unlock()
     408           1 :         }
     409           1 :         i.size.estimate.clear()
     410           1 :         i.restartInterval = 0
     411             : }
     412             : 
     413             : var indexBlockBufPool = sync.Pool{
     414           1 :         New: func() interface{} {
     415           1 :                 return &indexBlockBuf{}
     416           1 :         },
     417             : }
     418             : 
     419             : const indexBlockRestartInterval = 1
     420             : 
     421           1 : func newIndexBlockBuf(useMutex bool) *indexBlockBuf {
     422           1 :         i := indexBlockBufPool.Get().(*indexBlockBuf)
     423           1 :         i.size.useMutex = useMutex
     424           1 :         i.restartInterval = indexBlockRestartInterval
     425           1 :         i.block.RestartInterval = indexBlockRestartInterval
     426           1 :         i.size.estimate.init(rowblk.EmptySize)
     427           1 :         return i
     428           1 : }
     429             : 
     430             : func (i *indexBlockBuf) shouldFlush(
     431             :         sep InternalKey, valueLen int, flushOptions flushDecisionOptions, sizeClassHints []int,
     432           1 : ) bool {
     433           1 :         if i.size.useMutex {
     434           1 :                 i.size.mu.Lock()
     435           1 :                 defer i.size.mu.Unlock()
     436           1 :         }
     437             : 
     438           1 :         nEntries := i.size.estimate.numTotalEntries()
     439           1 :         return shouldFlushWithHints(
     440           1 :                 sep.Size(), valueLen, i.restartInterval, int(i.size.estimate.size()),
     441           1 :                 int(nEntries), flushOptions, sizeClassHints)
     442             : }
     443             : 
     444           1 : func (i *indexBlockBuf) add(key InternalKey, value []byte, inflightSize int) {
     445           1 :         i.block.Add(key, value)
     446           1 :         size := i.block.EstimatedSize()
     447           1 :         if i.size.useMutex {
     448           1 :                 i.size.mu.Lock()
     449           1 :                 defer i.size.mu.Unlock()
     450           1 :         }
     451           1 :         i.size.estimate.writtenWithTotal(uint64(size), inflightSize)
     452             : }
     453             : 
     454           1 : func (i *indexBlockBuf) finish() []byte {
     455           1 :         b := i.block.Finish()
     456           1 :         return b
     457           1 : }
     458             : 
     459           1 : func (i *indexBlockBuf) addInflight(inflightSize int) {
     460           1 :         if i.size.useMutex {
     461           1 :                 i.size.mu.Lock()
     462           1 :                 defer i.size.mu.Unlock()
     463           1 :         }
     464           1 :         i.size.estimate.addInflight(inflightSize)
     465             : }
     466             : 
     467           1 : func (i *indexBlockBuf) estimatedSize() uint64 {
     468           1 :         if i.size.useMutex {
     469           1 :                 i.size.mu.Lock()
     470           1 :                 defer i.size.mu.Unlock()
     471           1 :         }
     472             : 
     473             :         // Make sure that the size estimation works as expected when parallelism
     474             :         // is disabled.
     475           1 :         if invariants.Enabled && !i.size.useMutex {
     476           1 :                 if i.size.estimate.inflightSize != 0 {
     477           0 :                         panic("unexpected inflight entry in index block size estimation")
     478             :                 }
     479             : 
     480             :                 // NB: The i.block should only be accessed from the writeQueue goroutine,
     481             :                 // when parallelism is enabled. We break that invariant here, but that's
     482             :                 // okay since parallelism is disabled.
     483           1 :                 if i.size.estimate.size() != uint64(i.block.EstimatedSize()) {
     484           0 :                         panic("index block size estimation sans parallelism is incorrect")
     485             :                 }
     486             :         }
     487           1 :         return i.size.estimate.size()
     488             : }
     489             : 
     490             : // dataBlockEstimates is used for sstable size estimation. It can be accessed
     491             : // by the Writer client and compressionQueue goroutines. Fields should only be
     492             : // read/updated through the functions defined on the *dataBlockEstimates
     493             : // type.
     494             : type dataBlockEstimates struct {
     495             :         // If we don't do block compression in parallel, then we don't need to take
     496             :         // the performance hit of synchronizing using this mutex.
     497             :         useMutex bool
     498             :         mu       sync.Mutex
     499             : 
     500             :         estimate sizeEstimate
     501             : }
     502             : 
     503             : // dataBlockCompressed reports a compressed data block to the size estimate.
     504             : // compressedSize is the compressed size of the block; block.TrailerLen is
     505             : // added to it before it is handed to the estimate. inflightSize is the
     506             : // uncompressed block size estimate which has been previously provided to
                       // addInflightDataBlock(). If addInflightDataBlock() has not been called,
                       // inflightSize must be set to 0.
     507           1 : func (d *dataBlockEstimates) dataBlockCompressed(compressedSize int, inflightSize int) {
     508           1 :         if d.useMutex {
     509           0 :                 d.mu.Lock()
     510           0 :                 defer d.mu.Unlock()
     511           0 :         }
     512           1 :         d.estimate.writtenWithDelta(compressedSize+block.TrailerLen, inflightSize)
     513             : }
     514             : 
     515             : // size is an estimated size of datablock data which has been written to disk.
                       // When invariants are enabled and the mutex is not in use, size panics if the
                       // estimate still reports inflight bytes.
     516           1 : func (d *dataBlockEstimates) size() uint64 {
     517           1 :         if d.useMutex {
     518           0 :                 d.mu.Lock()
     519           0 :                 defer d.mu.Unlock()
     520           0 :         }
     521             :         // If there is no parallel compression, there should not be any inflight bytes.
     522           1 :         if invariants.Enabled && !d.useMutex {
     523           1 :                 if d.estimate.inflightSize != 0 {
     524           0 :                         panic("unexpected inflight entry in data block size estimation")
     525             :                 }
     526             :         }
     527           1 :         return d.estimate.size()
     528             : }
     529             : 
     530             : // Reference addInflightDataBlock so that linters do not report it as unused.
     531             : var _ = (&dataBlockEstimates{}).addInflightDataBlock
     532             : 
     533             : // addInflightDataBlock passes size to the estimate's addInflight, holding mu
                       // when useMutex is set.
                       //
                       // NB: unused since no parallel compression.
     534           0 : func (d *dataBlockEstimates) addInflightDataBlock(size int) {
     535           0 :         if d.useMutex {
     536           0 :                 d.mu.Lock()
     537           0 :                 defer d.mu.Unlock()
     538           0 :         }
     539             : 
     540           0 :         d.estimate.addInflight(size)
     541             : }
     542             : 
                       // writeTaskPool pools *writeTask values. Each newly allocated task gets a
                       // compressionDone channel with a buffer of one element.
     543             : var writeTaskPool = sync.Pool{
     544           1 :         New: func() interface{} {
     545           1 :                 t := &writeTask{}
     546           1 :                 t.compressionDone = make(chan bool, 1)
     547           1 :                 return t
     548           1 :         },
     549             : }
     550             : 
                       // blockBuf bundles the scratch space, compression destination buffer and
                       // checksummer that are handed to compressAndChecksum for a block.
     551             : type blockBuf struct {
     552             :         // tmp is a scratch buffer, large enough to hold any of footerLen bytes,
     553             :         // blockTrailerLen bytes, or (5 * binary.MaxVarintLen64) bytes, and most
     554             :         // likely large enough for a block handle with properties.
     555             :         tmp [blockHandleLikelyMaxLen]byte
     556             :         // compressedBuf is the destination buffer for compression. It is re-used over the
     557             :         // lifetime of the blockBuf, avoiding the allocation of a temporary buffer for each block.
     558             :         compressedBuf []byte
     559             :         checksummer   block.Checksummer
     560             : }
     561             : 
                       // clear resets the blockBuf to its zero value (including tmp), except that
                       // compressedBuf and checksummer are carried over unchanged.
     562           1 : func (b *blockBuf) clear() {
     563           1 :         // We can't assign b.compressedBuf[:0] to compressedBuf because snappy relies
     564           1 :         // on the length of the buffer, and not the capacity, to determine if it needs
     565           1 :         // to make an allocation.
     566           1 :         *b = blockBuf{
     567           1 :                 compressedBuf: b.compressedBuf, checksummer: b.checksummer,
     568           1 :         }
     569           1 : }
     570             : 
     571             : // A dataBlockBuf holds all the state required to compress and write a data block to disk.
     572             : // A dataBlockBuf begins its lifecycle owned by the Writer client goroutine. The Writer
     573             : // client goroutine adds keys to the sstable, writing directly into a dataBlockBuf's dataBlock
     574             : // writer until the block is full. Once a dataBlockBuf's block is full, the dataBlockBuf may be
     575             : // passed to other goroutines for compression and file I/O.
     576             : type dataBlockBuf struct {
     577             :         blockBuf
                               // dataBlock is the row-block writer that accumulates the entries of the
                               // data block currently being built.
     578             :         dataBlock rowblk.Writer
     579             : 
     580             :         // uncompressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the
     581             :         // next byte slice to be compressed. The uncompressed byte slice will be backed by the
     582             :         // dataBlock.buf.
     583             :         uncompressed []byte
     584             :         // compressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the
     585             :         // compressed byte slice which must be written to disk. The compressed byte slice may be
     586             :         // backed by the dataBlock.buf, or the dataBlockBuf.compressedBuf, depending on whether
     587             :         // we use the result of the compression.
     588             :         compressed []byte
     589             :         // trailer is the block trailer encoding the compression type and checksum.
     590             :         trailer block.Trailer
     591             : 
     592             :         // We're making calls to BlockPropertyCollectors from the Writer client goroutine. We need to
     593             :         // pass the encoded block properties over to the write queue. To prevent copies and
     594             :         // allocations, we give each dataBlockBuf its own blockPropertiesEncoder.
     595             :         blockPropsEncoder blockPropertiesEncoder
     596             :         // dataBlockProps is set when Writer.finishDataBlockProps is called. The dataBlockProps slice is
     597             :         // a shallow copy of the internal buffer of the dataBlockBuf.blockPropsEncoder.
     598             :         dataBlockProps []byte
     599             : 
     600             :         // sepScratch is reusable scratch space for computing separator keys.
     601             :         sepScratch []byte
     602             : }
     603             : 
                       // clear prepares the dataBlockBuf for reuse: it clears the embedded blockBuf,
                       // resets the dataBlock writer, drops the references to the uncompressed,
                       // compressed and block-property slices, and truncates sepScratch to length
                       // zero while keeping its backing array. blockPropsEncoder and trailer are not
                       // touched here.
     604           1 : func (d *dataBlockBuf) clear() {
     605           1 :         d.blockBuf.clear()
     606           1 :         d.dataBlock.Reset()
     607           1 : 
     608           1 :         d.uncompressed = nil
     609           1 :         d.compressed = nil
     610           1 :         d.dataBlockProps = nil
     611           1 :         d.sepScratch = d.sepScratch[:0]
     612           1 : }
     613             : 
                       // dataBlockBufPool pools *dataBlockBuf values; new entries start out as the
                       // zero value. newDataBlockBuf draws from this pool.
     614             : var dataBlockBufPool = sync.Pool{
     615           1 :         New: func() interface{} {
     616           1 :                 return &dataBlockBuf{}
     617           1 :         },
     618             : }
     619             : 
                       // newDataBlockBuf takes a dataBlockBuf from dataBlockBufPool and configures
                       // its dataBlock restart interval and its checksummer type. No other state is
                       // reset here.
                       // NOTE(review): presumably buffers are clear()ed before being returned to
                       // the pool -- confirm at the Put call sites.
     620           1 : func newDataBlockBuf(restartInterval int, checksumType block.ChecksumType) *dataBlockBuf {
     621           1 :         d := dataBlockBufPool.Get().(*dataBlockBuf)
     622           1 :         d.dataBlock.RestartInterval = restartInterval
     623           1 :         d.checksummer.Type = checksumType
     624           1 :         return d
     625           1 : }
     626             : 
                       // finish calls Finish on the dataBlock writer and stores the returned bytes in
                       // d.uncompressed, the next slice to be compressed.
     627           1 : func (d *dataBlockBuf) finish() {
     628           1 :         d.uncompressed = d.dataBlock.Finish()
     629           1 : }
     630             : 
                       // compressAndChecksum runs the package-level compressAndChecksum on
                       // d.uncompressed with compression c, using the embedded blockBuf, and stores
                       // the results in d.compressed and d.trailer.
     631           1 : func (d *dataBlockBuf) compressAndChecksum(c Compression) {
     632           1 :         d.compressed, d.trailer = compressAndChecksum(d.uncompressed, c, &d.blockBuf)
     633           1 : }
     634             : 
                       // shouldFlush reports the result of shouldFlushWithHints for adding an entry
                       // with the given key and valueLen to the current data block. The decision is
                       // based on the key size, valueLen, the block's restart interval, its estimated
                       // size and entry count, plus the caller-supplied flushOptions and
                       // sizeClassHints.
     635             : func (d *dataBlockBuf) shouldFlush(
     636             :         key InternalKey, valueLen int, flushOptions flushDecisionOptions, sizeClassHints []int,
     637           1 : ) bool {
     638           1 :         return shouldFlushWithHints(
     639           1 :                 key.Size(), valueLen, d.dataBlock.RestartInterval, d.dataBlock.EstimatedSize(),
     640           1 :                 d.dataBlock.EntryCount(), flushOptions, sizeClassHints)
     641           1 : }
     642             : 
                       // indexBlockAndBlockProperties groups an encoded index block with its
                       // properties bytes, entry count and last key.
     643             : type indexBlockAndBlockProperties struct {
                               // nEntries is presumably the number of entries in block -- TODO confirm
                               // against the code that populates it.
     644             :         nEntries int
     645             :         // sep is the last key added to this block, for computing a separator later.
     646             :         sep        InternalKey
     647             :         properties []byte
     648             :         // block is the encoded block produced by blockWriter.finish.
     649             :         block []byte
     650             : }
     651             : 
     652             : // Set sets the value for the given key. The sequence number is set to 0.
     653             : // Intended for use to externally construct an sstable before ingestion into a
     654             : // DB. For a given Writer, the keys passed to Set must be in strictly increasing
     655             : // order.
     656             : //
                       // Set returns w.err if it is already set, and returns an error directing the
                       // caller to AddWithForceObsolete if the Writer is strict-obsolete.
                       //
     657             : // TODO(peter): untested
     658           1 : func (w *Writer) Set(key, value []byte) error {
     659           1 :         if w.err != nil {
     660           0 :                 return w.err
     661           0 :         }
     662           1 :         if w.isStrictObsolete {
     663           0 :                 return errors.Errorf("use AddWithForceObsolete")
     664           0 :         }
     665             :         // forceObsolete is false based on the assumption that no RANGEDELs in the
     666             :         // sstable delete the added points.
     667           1 :         return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value, false)
     668             : }
     669             : 
     670             : // Delete deletes the value for the given key. The sequence number is set to
     671             : // 0. Intended for use to externally construct an sstable before ingestion into
     672             : // a DB.
     673             : //
                       // Delete returns w.err if it is already set, and returns an error directing
                       // the caller to AddWithForceObsolete if the Writer is strict-obsolete.
                       //
     674             : // TODO(peter): untested
     675           0 : func (w *Writer) Delete(key []byte) error {
     676           0 :         if w.err != nil {
     677           0 :                 return w.err
     678           0 :         }
     679           0 :         if w.isStrictObsolete {
     680           0 :                 return errors.Errorf("use AddWithForceObsolete")
     681           0 :         }
     682             :         // forceObsolete is false based on the assumption that no RANGEDELs in the
     683             :         // sstable delete the added points.
     684           0 :         return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil, false)
     685             : }
     686             : 
     687             : // DeleteRange deletes all of the keys (and values) in the range [start,end)
     688             : // (inclusive on start, exclusive on end). The sequence number is set to
     689             : // 0. Intended for use to externally construct an sstable before ingestion into
     690             : // a DB.
     691             : //
                       // DeleteRange returns w.err if it is already set; otherwise it returns the
                       // result of adding a RANGEDEL tombstone via addTombstone. Unlike Set, Delete
                       // and Merge, it does not check whether the Writer is strict-obsolete.
                       //
     692             : // TODO(peter): untested
     693           1 : func (w *Writer) DeleteRange(start, end []byte) error {
     694           1 :         if w.err != nil {
     695           0 :                 return w.err
     696           0 :         }
     697           1 :         return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end)
     698             : }
     699             : 
     700             : // Merge adds an action to the DB that merges the value at key with the new
     701             : // value. The details of the merge are dependent upon the configured merge
     702             : // operator. The sequence number is set to 0. Intended for use to externally
     703             : // construct an sstable before ingestion into a DB.
     704             : //
                       // Merge returns w.err if it is already set, and returns an error directing the
                       // caller to AddWithForceObsolete if the Writer is strict-obsolete.
                       //
     705             : // TODO(peter): untested
     706           0 : func (w *Writer) Merge(key, value []byte) error {
     707           0 :         if w.err != nil {
     708           0 :                 return w.err
     709           0 :         }
     710           0 :         if w.isStrictObsolete {
     711           0 :                 return errors.Errorf("use AddWithForceObsolete")
     712           0 :         }
     713             :         // forceObsolete is false based on the assumption that no RANGEDELs in the
     714             :         // sstable delete the added points. A strict-obsolete writer never gets
     715             :         // here: it has already returned the error above.
     716           0 :         return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value, false)
     717             : }
     718             : 
     719             : // Add adds a key/value pair to the table being written. For a given Writer,
     720             : // the keys passed to Add must be in increasing order. The exception to this
     721             : // rule is range deletion tombstones. Range deletion tombstones need to be
     722             : // added ordered by their start key, but they can be added out of order from
     723             : // point entries. Additionally, range deletion tombstones must be fragmented
     724             : // (i.e. by keyspan.Fragmenter).
                       //
                       // Add returns an error if the Writer is strict-obsolete; such writers must
                       // use AddWithForceObsolete. Otherwise it is AddWithForceObsolete with
                       // forceObsolete set to false.
     725           1 : func (w *Writer) Add(key InternalKey, value []byte) error {
     726           1 :         if w.isStrictObsolete {
     727           0 :                 return errors.Errorf("use AddWithForceObsolete")
     728           0 :         }
     729           1 :         return w.AddWithForceObsolete(key, value, false)
     730             : }
     731             : 
     732             : // AddWithForceObsolete must be used when writing a strict-obsolete sstable.
     733             : //
     734             : // forceObsolete indicates whether the caller has determined that this key is
     735             : // obsolete even though it may be the latest point key for this userkey. This
     736             : // should be set to true for keys obsoleted by RANGEDELs, and is required for
     737             : // strict-obsolete sstables.
     738             : //
     739             : // Note that there are two properties, S1 and S2 (see comment in format.go)
     740             : // that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the
     741             : // responsibility of the caller. S1 is solely the responsibility of the
     742             : // callee.
                       //
                       // The key kind selects the path: RANGEDEL keys go to addTombstone (value is
                       // passed as the tombstone's end key and forceObsolete is not consulted), range
                       // key kinds are rejected with an error that is also stored in w.err, and
                       // every other kind goes to addPoint. If w.err is already set it is returned
                       // without doing anything.
     743           1 : func (w *Writer) AddWithForceObsolete(key InternalKey, value []byte, forceObsolete bool) error {
     744           1 :         if w.err != nil {
     745           0 :                 return w.err
     746           0 :         }
     747             : 
     748           1 :         switch key.Kind() {
     749           1 :         case InternalKeyKindRangeDelete:
     750           1 :                 return w.addTombstone(key, value)
     751             :         case base.InternalKeyKindRangeKeyDelete,
     752             :                 base.InternalKeyKindRangeKeySet,
     753           0 :                 base.InternalKeyKindRangeKeyUnset:
     754           0 :                 w.err = errors.Errorf(
     755           0 :                         "pebble: range keys must be added via one of the RangeKey* functions")
     756           0 :                 return w.err
     757             :         }
     758           1 :         return w.addPoint(key, value, forceObsolete)
     759             : }
     760             : 
                       // makeAddPointDecisionV2 records key's trailer as the last point key trailer
                       // and, unless key order checks are disabled, verifies that key sorts strictly
                       // after the last key in the current data block. The check is skipped when the
                       // current data block is empty, so it only compares against a key in the same
                       // block. It returns an error if the previous user key is greater than key's,
                       // or if the user keys are equal and the previous trailer is <= key's trailer.
     761           1 : func (w *Writer) makeAddPointDecisionV2(key InternalKey) error {
     762           1 :         prevTrailer := w.lastPointKeyInfo.trailer
     763           1 :         w.lastPointKeyInfo.trailer = key.Trailer
     764           1 :         if w.dataBlockBuf.dataBlock.EntryCount() == 0 {
     765           1 :                 return nil
     766           1 :         }
     767           1 :         if !w.disableKeyOrderChecks {
     768           1 :                 prevPointUserKey := w.dataBlockBuf.dataBlock.CurUserKey()
     769           1 :                 cmpUser := w.compare(prevPointUserKey, key.UserKey)
     770           1 :                 if cmpUser > 0 || (cmpUser == 0 && prevTrailer <= key.Trailer) {
     771           0 :                         return errors.Errorf(
     772           0 :                                 "pebble: keys must be added in strictly increasing order: %s, %s",
     773           0 :                                 InternalKey{UserKey: prevPointUserKey, Trailer: prevTrailer}.Pretty(w.formatKey),
     774           0 :                                 key.Pretty(w.formatKey))
     775           0 :                 }
     776             :         }
     777           1 :         return nil
     778             : }
     779             : 
     780             : // getLastPointUserKey returns the current user key of the data block writer.
                       // It panics with an assertion failure if the current data block has no
                       // entries.
                       //
                       // REQUIRES: at least one point has been written to the Writer.
                       // NOTE(review): the check below looks at the current data block's entry
                       // count rather than at the Writer as a whole -- confirm the two always agree
                       // at the call sites.
     781           1 : func (w *Writer) getLastPointUserKey() []byte {
     782           1 :         if w.dataBlockBuf.dataBlock.EntryCount() == 0 {
     783           0 :                 panic(errors.AssertionFailedf("no point keys added to writer"))
     784             :         }
     785           1 :         return w.dataBlockBuf.dataBlock.CurUserKey()
     786             : }
     787             : 
     788             : // makeAddPointDecisionV3 records key in w.lastPointKeyInfo (user key length,
                       // prefix length as computed by w.split, trailer, and isObsolete reset to
                       // false) and decides how the point should be written relative to the
                       // previous point key.
                       //
                       // If no point key has been added yet (w.meta.HasPointKeys is false), all
                       // results are false and err is nil. Otherwise:
                       //   - setHasSamePrefix is true only when both the previous and the current
                       //     key are SETs that remain candidates for value blocks and their
                       //     prefixes compare equal.
                       //   - writeToValueBlock additionally requires valueLen > 0 and a non-nil
                       //     w.valueBlockWriter.
                       //   - isObsolete is computed from conditions C1-C3 described in the body.
                       //   - err is non-nil if key order checks are enabled and key does not sort
                       //     strictly after the previous point key.
                       //
                       // REQUIRES: w.tableFormat >= TableFormatPebblev3
     789             : func (w *Writer) makeAddPointDecisionV3(
     790             :         key InternalKey, valueLen int,
     791           1 : ) (setHasSamePrefix bool, writeToValueBlock bool, isObsolete bool, err error) {
     792           1 :         prevPointKeyInfo := w.lastPointKeyInfo
     793           1 :         w.lastPointKeyInfo.userKeyLen = len(key.UserKey)
     794           1 :         w.lastPointKeyInfo.prefixLen = w.split(key.UserKey)
     795           1 :         w.lastPointKeyInfo.trailer = key.Trailer
     796           1 :         w.lastPointKeyInfo.isObsolete = false
     797           1 :         if !w.meta.HasPointKeys {
     798           1 :                 return false, false, false, nil
     799           1 :         }
     800           1 :         keyKind := key.Trailer.Kind()
     801           1 :         prevPointUserKey := w.getLastPointUserKey()
     802           1 :         prevPointKey := InternalKey{UserKey: prevPointUserKey, Trailer: prevPointKeyInfo.trailer}
     803           1 :         prevKeyKind := prevPointKeyInfo.trailer.Kind()
     804           1 :         considerWriteToValueBlock := prevKeyKind == InternalKeyKindSet &&
     805           1 :                 keyKind == InternalKeyKindSet
                               // A key whose prefix lies in [Lower, Upper) of requiredInPlaceValueBound
                               // must keep its value in place, so it is not a value block candidate.
     806           1 :         if considerWriteToValueBlock && !w.requiredInPlaceValueBound.IsEmpty() {
     807           0 :                 keyPrefix := key.UserKey[:w.lastPointKeyInfo.prefixLen]
     808           0 :                 cmpUpper := w.compare(
     809           0 :                         w.requiredInPlaceValueBound.Upper, keyPrefix)
     810           0 :                 if cmpUpper <= 0 {
     811           0 :                         // Common case for CockroachDB. Make it empty since all future keys in
     812           0 :                         // this sstable will also have cmpUpper <= 0.
     813           0 :                         w.requiredInPlaceValueBound = UserKeyPrefixBound{}
     814           0 :                 } else if w.compare(keyPrefix, w.requiredInPlaceValueBound.Lower) >= 0 {
     815           0 :                         considerWriteToValueBlock = false
     816           0 :                 }
     817             :         }
     818             :         // cmpPrefix is initialized iff considerWriteToValueBlock.
     819           1 :         var cmpPrefix int
     820           1 :         var cmpUser int
     821           1 :         if considerWriteToValueBlock {
     822           1 :                 // Compare the prefixes.
     823           1 :                 cmpPrefix = w.compare(prevPointUserKey[:prevPointKeyInfo.prefixLen],
     824           1 :                         key.UserKey[:w.lastPointKeyInfo.prefixLen])
     825           1 :                 cmpUser = cmpPrefix
     826           1 :                 if cmpPrefix == 0 {
     827           1 :                         // Need to compare suffixes to compute cmpUser.
     828           1 :                         cmpUser = w.compare(prevPointUserKey[prevPointKeyInfo.prefixLen:],
     829           1 :                                 key.UserKey[w.lastPointKeyInfo.prefixLen:])
     830           1 :                 }
     831           1 :         } else {
     832           1 :                 cmpUser = w.compare(prevPointUserKey, key.UserKey)
     833           1 :         }
     834             :         // Ensure that no one adds a point key kind without considering the obsolete
     835             :         // handling for that kind.
     836           1 :         switch keyKind {
     837             :         case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge,
     838           1 :                 InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
     839           0 :         default:
     840           0 :                 panic(errors.AssertionFailedf("unexpected key kind %s", keyKind.String()))
     841             :         }
     842             :         // If same user key, then the current key is obsolete if any of the
     843             :         // following is true:
     844             :         // C1 The prev key was obsolete.
     845             :         // C2 The prev key was not a MERGE. When the previous key is a MERGE we must
     846             :         //    preserve SET* and MERGE since their values will be merged into the
     847             :         //    previous key. We also must preserve DEL* since there may be an older
     848             :         //    SET*/MERGE in a lower level that must not be merged with the MERGE --
     849             :         //    if we omit the DEL* that lower SET*/MERGE will become visible.
     850             :         //
     851             :         // Regardless of whether it is the same user key or not
     852             :         // C3 The current key is some kind of point delete, and we are writing to
     853             :         //    the lowest level, then it is also obsolete. The correctness of this
     854             :         //    relies on the same user key not spanning multiple sstables in a level.
     855             :         //
     856             :         // C1 ensures that for a user key there is at most one transition from
     857             :         // !obsolete to obsolete. Consider a user key k, for which the first n keys
     858             :         // are not obsolete. We consider the various values of n:
     859             :         //
     860             :         // n = 0: This happens due to forceObsolete being set by the caller, or due
     861             :         // to C3. forceObsolete must only be set due to a RANGEDEL, and that RANGEDEL
     862             :         // must also delete all the lower seqnums for the same user key. C3 triggers
     863             :         // due to a point delete and that deletes all the lower seqnums for the same
     864             :         // user key.
     865             :         //
     866             :         // n = 1: This is the common case. It happens when the first key is not a
     867             :         // MERGE, or the current key is some kind of point delete.
     868             :         //
     869             :         // n > 1: This is due to a sequence of MERGE keys, potentially followed by a
     870             :         // single non-MERGE key.
     871           1 :         isObsoleteC1AndC2 := cmpUser == 0 &&
     872           1 :                 (prevPointKeyInfo.isObsolete || prevKeyKind != InternalKeyKindMerge)
     873           1 :         isObsoleteC3 := w.writingToLowestLevel &&
     874           1 :                 (keyKind == InternalKeyKindDelete || keyKind == InternalKeyKindSingleDelete ||
     875           1 :                         keyKind == InternalKeyKindDeleteSized)
     876           1 :         isObsolete = isObsoleteC1AndC2 || isObsoleteC3
     877           1 :         // TODO(sumeer): storing isObsolete SET and SETWITHDEL in value blocks is
     878           1 :         // possible, but requires some care in documenting and checking invariants.
     879           1 :         // There is code that assumes nothing in value blocks because of single MVCC
     880           1 :         // version (those should be ok). We have to ensure setHasSamePrefix is
     881           1 :         // correctly initialized here etc.
     882           1 : 
     883           1 :         if !w.disableKeyOrderChecks &&
     884           1 :                 (cmpUser > 0 || (cmpUser == 0 && prevPointKeyInfo.trailer <= key.Trailer)) {
     885           0 :                 return false, false, false, errors.Errorf(
     886           0 :                         "pebble: keys must be added in strictly increasing order: %s, %s",
     887           0 :                         prevPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
     888           0 :         }
     889           1 :         if !considerWriteToValueBlock {
     890           1 :                 return false, false, isObsolete, nil
     891           1 :         }
     892             :         // NB: it is possible that cmpUser == 0, i.e., these two SETs have identical
     893             :         // user keys (because of an open snapshot). This should be the rare case.
     894           1 :         setHasSamePrefix = cmpPrefix == 0
     895           1 :         // Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of
     896           1 :         // valueHandle, this should be > 3. But tiny values are common in test and
     897           1 :         // unlikely in production, so we use 0 here for better test coverage.
     898           1 :         const tinyValueThreshold = 0
     899           1 :         // NB: setting WriterOptions.DisableValueBlocks does not disable the
     900           1 :         // setHasSamePrefix optimization.
     901           1 :         considerWriteToValueBlock = setHasSamePrefix && valueLen > tinyValueThreshold && w.valueBlockWriter != nil
     902           1 :         return setHasSamePrefix, considerWriteToValueBlock, isObsolete, nil
     903             : }
     904             : 
     905           1 : func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) error {
     906           1 :         if w.isStrictObsolete && key.Kind() == InternalKeyKindMerge {
     907           0 :                 return errors.Errorf("MERGE not supported in a strict-obsolete sstable")
     908           0 :         }
     909           1 :         var err error
     910           1 :         var setHasSameKeyPrefix, writeToValueBlock, addPrefixToValueStoredWithKey bool
     911           1 :         var isObsolete bool
     912           1 :         maxSharedKeyLen := len(key.UserKey)
     913           1 :         if w.tableFormat >= TableFormatPebblev3 {
     914           1 :                 // maxSharedKeyLen is limited to the prefix of the preceding key. If the
     915           1 :                 // preceding key was in a different block, then the blockWriter will
     916           1 :                 // ignore this maxSharedKeyLen.
     917           1 :                 maxSharedKeyLen = w.lastPointKeyInfo.prefixLen
     918           1 :                 setHasSameKeyPrefix, writeToValueBlock, isObsolete, err =
     919           1 :                         w.makeAddPointDecisionV3(key, len(value))
     920           1 :                 addPrefixToValueStoredWithKey = key.Kind() == InternalKeyKindSet
     921           1 :         } else {
     922           1 :                 err = w.makeAddPointDecisionV2(key)
     923           1 :         }
     924           1 :         if err != nil {
     925           0 :                 return err
     926           0 :         }
     927           1 :         isObsolete = w.tableFormat >= TableFormatPebblev4 && (isObsolete || forceObsolete)
     928           1 :         w.lastPointKeyInfo.isObsolete = isObsolete
     929           1 :         var valueStoredWithKey []byte
     930           1 :         var prefix block.ValuePrefix
     931           1 :         var valueStoredWithKeyLen int
     932           1 :         if writeToValueBlock {
     933           1 :                 vh, err := w.valueBlockWriter.addValue(value)
     934           1 :                 if err != nil {
     935           0 :                         return err
     936           0 :                 }
     937           1 :                 n := encodeValueHandle(w.blockBuf.tmp[:], vh)
     938           1 :                 valueStoredWithKey = w.blockBuf.tmp[:n]
     939           1 :                 valueStoredWithKeyLen = len(valueStoredWithKey) + 1
     940           1 :                 var attribute base.ShortAttribute
     941           1 :                 if w.shortAttributeExtractor != nil {
     942           0 :                         // TODO(sumeer): for compactions, it is possible that the input sstable
     943           0 :                         // already has this value in the value section and so we have already
     944           0 :                         // extracted the ShortAttribute. Avoid extracting it again. This will
     945           0 :                         // require changing the Writer.Add interface.
     946           0 :                         if attribute, err = w.shortAttributeExtractor(
     947           0 :                                 key.UserKey, w.lastPointKeyInfo.prefixLen, value); err != nil {
     948           0 :                                 return err
     949           0 :                         }
     950             :                 }
     951           1 :                 prefix = block.ValueHandlePrefix(setHasSameKeyPrefix, attribute)
     952           1 :         } else {
     953           1 :                 valueStoredWithKey = value
     954           1 :                 valueStoredWithKeyLen = len(value)
     955           1 :                 if addPrefixToValueStoredWithKey {
     956           1 :                         valueStoredWithKeyLen++
     957           1 :                 }
     958           1 :                 prefix = block.InPlaceValuePrefix(setHasSameKeyPrefix)
     959             :         }
     960             : 
     961           1 :         if err := w.maybeFlush(key, valueStoredWithKeyLen); err != nil {
     962           0 :                 return err
     963           0 :         }
     964             : 
     965           1 :         for i := range w.blockPropCollectors {
     966           1 :                 v := value
     967           1 :                 if addPrefixToValueStoredWithKey {
     968           1 :                         // Values for SET are not required to be in-place, and in the future may
     969           1 :                         // not even be read by the compaction, so pass nil values. Block
     970           1 :                         // property collectors in such Pebble DB's must not look at the value.
     971           1 :                         v = nil
     972           1 :                 }
     973           1 :                 if err := w.blockPropCollectors[i].AddPointKey(key, v); err != nil {
     974           0 :                         w.err = err
     975           0 :                         return err
     976           0 :                 }
     977             :         }
     978           1 :         if w.tableFormat >= TableFormatPebblev4 {
     979           1 :                 w.obsoleteCollector.AddPoint(isObsolete)
     980           1 :         }
     981             : 
     982           1 :         w.maybeAddToFilter(key.UserKey)
     983           1 :         w.dataBlockBuf.dataBlock.AddWithOptionalValuePrefix(
     984           1 :                 key, isObsolete, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix,
     985           1 :                 setHasSameKeyPrefix)
     986           1 : 
     987           1 :         w.meta.updateSeqNum(key.SeqNum())
     988           1 : 
     989           1 :         if !w.meta.HasPointKeys {
     990           1 :                 k := w.dataBlockBuf.dataBlock.CurKey()
     991           1 :                 // NB: We need to ensure that SmallestPoint.UserKey is set, so we create
     992           1 :                 // an InternalKey which is semantically identical to the key, but won't
     993           1 :                 // have a nil UserKey. We do this, because key.UserKey could be nil, and
     994           1 :                 // we don't want SmallestPoint.UserKey to be nil.
     995           1 :                 //
     996           1 :                 // todo(bananabrick): Determine if it's okay to have a nil SmallestPoint
     997           1 :                 // .UserKey now that we don't rely on a nil UserKey to determine if the
     998           1 :                 // key has been set or not.
     999           1 :                 w.meta.SetSmallestPointKey(k.Clone())
    1000           1 :         }
    1001             : 
    1002           1 :         w.props.NumEntries++
    1003           1 :         switch key.Kind() {
    1004           1 :         case InternalKeyKindDelete, InternalKeyKindSingleDelete:
    1005           1 :                 w.props.NumDeletions++
    1006           1 :                 w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey))
    1007           1 :         case InternalKeyKindDeleteSized:
    1008           1 :                 var size uint64
    1009           1 :                 if len(value) > 0 {
    1010           1 :                         var n int
    1011           1 :                         size, n = binary.Uvarint(value)
    1012           1 :                         if n <= 0 {
    1013           0 :                                 w.err = errors.Newf("%s key's value (%x) does not parse as uvarint",
    1014           0 :                                         errors.Safe(key.Kind().String()), value)
    1015           0 :                                 return w.err
    1016           0 :                         }
    1017             :                 }
    1018           1 :                 w.props.NumDeletions++
    1019           1 :                 w.props.NumSizedDeletions++
    1020           1 :                 w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey))
    1021           1 :                 w.props.RawPointTombstoneValueSize += size
    1022           1 :         case InternalKeyKindMerge:
    1023           1 :                 w.props.NumMergeOperands++
    1024             :         }
    1025           1 :         w.props.RawKeySize += uint64(key.Size())
    1026           1 :         w.props.RawValueSize += uint64(len(value))
    1027           1 :         return nil
    1028             : }
    1029             : // prettyTombstone returns the w.formatKey-based Pretty formatter of a keyspan.Span with Start k.UserKey, End value and one key carrying k.Trailer.
    1030           0 : func (w *Writer) prettyTombstone(k InternalKey, value []byte) fmt.Formatter {
    1031           0 :         return keyspan.Span{
    1032           0 :                 Start: k.UserKey,
    1033           0 :                 End:   value,
    1034           0 :                 Keys:  []keyspan.Key{{Trailer: k.Trailer}},
    1035           0 :         }.Pretty(w.formatKey)
    1036           0 : }
    1037             : // addTombstone checks ordering/fragmentation against the previous tombstone (unless disabled), then adds key/value to w.rangeDelBlock.
    1038           1 : func (w *Writer) addTombstone(key InternalKey, value []byte) error {
    1039           1 :         if !w.disableKeyOrderChecks && w.rangeDelBlock.EntryCount() > 0 {
    1040           1 :                 // Check that tombstones are being added in fragmented order. If the two
    1041           1 :                 // tombstones overlap, their start and end keys must be identical.
    1042           1 :                 prevKey := w.rangeDelBlock.CurKey()
    1043           1 :                 switch c := w.compare(prevKey.UserKey, key.UserKey); {
    1044           0 :                 case c > 0: // previous start key sorts after the new start key
    1045           0 :                         w.err = errors.Errorf("pebble: keys must be added in order: %s, %s",
    1046           0 :                                 prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
    1047           0 :                         return w.err
    1048           1 :                 case c == 0: // same start key: values must compare equal and the seqnum must strictly decrease
    1049           1 :                         prevValue := w.rangeDelBlock.CurValue()
    1050           1 :                         if w.compare(prevValue, value) != 0 {
    1051           0 :                                 w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
    1052           0 :                                         w.prettyTombstone(prevKey, prevValue),
    1053           0 :                                         w.prettyTombstone(key, value))
    1054           0 :                                 return w.err
    1055           0 :                         }
    1056           1 :                         if prevKey.SeqNum() <= key.SeqNum() {
    1057           0 :                                 w.err = errors.Errorf("pebble: keys must be added in strictly increasing order: %s, %s",
    1058           0 :                                         prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
    1059           0 :                                 return w.err
    1060           0 :                         }
    1061           1 :                 default: // previous start key sorts before the new one: previous value must not sort after the new start key
    1062           1 :                         prevValue := w.rangeDelBlock.CurValue()
    1063           1 :                         if w.compare(prevValue, key.UserKey) > 0 {
    1064           0 :                                 w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
    1065           0 :                                         w.prettyTombstone(prevKey, prevValue),
    1066           0 :                                         w.prettyTombstone(key, value))
    1067           0 :                                 return w.err
    1068           0 :                         }
    1069             :                 }
    1070             :         }
    1071             : 
    1072           1 :         if key.Trailer == base.InternalKeyRangeDeleteSentinel {
    1073           0 :                 w.err = errors.Errorf("pebble: cannot add range delete sentinel: %s", key.Pretty(w.formatKey))
    1074           0 :                 return w.err
    1075           0 :         }
    1076             : 
    1077           1 :         w.meta.updateSeqNum(key.SeqNum())
    1078           1 : 
    1079           1 :         // Range tombstones are fragmented in the v2 range deletion block format,
    1080           1 :         // so the start key of the first range tombstone added will be the smallest
    1081           1 :         // range tombstone key. The largest range tombstone key will be determined
    1082           1 :         // in Writer.Close() as the end key of the last range tombstone added.
    1083           1 :         if w.props.NumRangeDeletions == 0 {
    1084           1 :                 w.meta.SetSmallestRangeDelKey(key.Clone())
    1085           1 :         }
    1086             : 
    1087           1 :         w.props.NumEntries++
    1088           1 :         w.props.NumDeletions++
    1089           1 :         w.props.NumRangeDeletions++
    1090           1 :         w.props.RawKeySize += uint64(key.Size())
    1091           1 :         w.props.RawValueSize += uint64(len(value))
    1092           1 :         w.rangeDelBlock.Add(key, value) // every validation failure above has already been recorded in w.err
    1093           1 :         return nil
    1094             : }
    1095             : 
    1096             : // RangeKeySet sets a range between start (inclusive) and end (exclusive) with
    1097             : // the given suffix to the given value. The resulting range key is given the
    1098             : // sequence number zero, with the expectation that the resulting sstable will be
    1099             : // ingested.
    1100             : //
    1101             : // Keys must be added to the table in increasing order of start key. Spans are
    1102             : // not required to be fragmented. The same suffix may not be set or unset twice
    1103             : // over the same keyspan, because it would result in inconsistent state. Both
    1104             : // the Set and Unset would share the zero sequence number, and a key cannot be
    1105             : // both simultaneously set and unset.
    1106           0 : func (w *Writer) RangeKeySet(start, end, suffix, value []byte) error {
    1107           0 :         return w.addRangeKeySpanToFragmenter(keyspan.Span{
    1108           0 :                 Start: w.tempRangeKeyCopy(start), // all arguments are copied into the Writer's range key buffer
    1109           0 :                 End:   w.tempRangeKeyCopy(end),
    1110           0 :                 Keys: []keyspan.Key{
    1111           0 :                         {
    1112           0 :                                 Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeySet), // sequence number zero
    1113           0 :                                 Suffix:  w.tempRangeKeyCopy(suffix),
    1114           0 :                                 Value:   w.tempRangeKeyCopy(value),
    1115           0 :                         },
    1116           0 :                 },
    1117           0 :         })
    1118           0 : }
    1119             : 
    1120             : // RangeKeyUnset un-sets a range between start (inclusive) and end (exclusive)
    1121             : // with the given suffix. The resulting range key is given the
    1122             : // sequence number zero, with the expectation that the resulting sstable will be
    1123             : // ingested.
    1124             : //
    1125             : // Keys must be added to the table in increasing order of start key. Spans are
    1126             : // not required to be fragmented. The same suffix may not be set or unset twice
    1127             : // over the same keyspan, because it would result in inconsistent state. Both
    1128             : // the Set and Unset would share the zero sequence number, and a key cannot be
    1129             : // both simultaneously set and unset.
    1130           0 : func (w *Writer) RangeKeyUnset(start, end, suffix []byte) error {
    1131           0 :         return w.addRangeKeySpanToFragmenter(keyspan.Span{
    1132           0 :                 Start: w.tempRangeKeyCopy(start), // all arguments are copied into the Writer's range key buffer
    1133           0 :                 End:   w.tempRangeKeyCopy(end),
    1134           0 :                 Keys: []keyspan.Key{
    1135           0 :                         {
    1136           0 :                                 Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyUnset), // sequence number zero
    1137           0 :                                 Suffix:  w.tempRangeKeyCopy(suffix),
    1138           0 :                         },
    1139           0 :                 },
    1140           0 :         })
    1141           0 : }
    1142             : 
    1143             : // RangeKeyDelete deletes a range between start (inclusive) and end (exclusive).
    1144             : //
    1145             : // Keys must be added to the table in increasing order of start key. Spans are
    1146             : // not required to be fragmented.
    1147           0 : func (w *Writer) RangeKeyDelete(start, end []byte) error {
    1148           0 :         return w.addRangeKeySpanToFragmenter(keyspan.Span{
    1149           0 :                 Start: w.tempRangeKeyCopy(start), // bounds are copied into the Writer's range key buffer
    1150           0 :                 End:   w.tempRangeKeyCopy(end),
    1151           0 :                 Keys: []keyspan.Key{
    1152           0 :                         {Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyDelete)}, // sequence number zero, no suffix or value
    1153           0 :                 },
    1154           0 :         })
    1155           0 : }
    1156             : // addRangeKeySpanToFragmenter rejects empty/inverted or out-of-order spans, then adds span to w.fragmenter and returns w.err.
    1157           0 : func (w *Writer) addRangeKeySpanToFragmenter(span keyspan.Span) error {
    1158           0 :         if w.compare(span.Start, span.End) >= 0 {
    1159           0 :                 return errors.Errorf( // NB: returned directly; this error is not stored in w.err
    1160           0 :                         "pebble: start key must be strictly less than end key",
    1161           0 :                 )
    1162           0 :         }
    1163           0 :         if w.fragmenter.Start() != nil && w.compare(w.fragmenter.Start(), span.Start) > 0 {
    1164           0 :                 return errors.Errorf("pebble: spans must be added in order: %s > %s", // NB: also not stored in w.err
    1165           0 :                         w.formatKey(w.fragmenter.Start()), w.formatKey(span.Start))
    1166           0 :         }
    1167             :         // Add this span to the fragmenter.
    1168           0 :         w.fragmenter.Add(span)
    1169           0 :         return w.err // presumably set by the fragmenter's emit callback (encodeFragmentedRangeKeySpan) if encoding failed -- TODO confirm
    1170             : }
    1171             : // encodeFragmentedRangeKeySpan sorts span's keys by suffix and encodes the span via w.EncodeSpan, recording any error in w.err.
    1172           0 : func (w *Writer) encodeFragmentedRangeKeySpan(span keyspan.Span) {
    1173           0 :         // This method is the emit function of the Fragmenter.
    1174           0 :         //
    1175           0 :         // NB: The span should only contain range keys and be internally consistent
    1176           0 :         // (eg, no duplicate suffixes, no additional keys after a RANGEKEYDEL).
    1177           0 :         //
    1178           0 :         // We use w.rangeKeysBySuffix and w.rangeKeySpan to avoid allocations.
    1179           0 : 
    1180           0 :         // Sort the keys by suffix. Iteration doesn't *currently* depend on it, but
    1181           0 :         // we may want to in the future.
    1182           0 :         w.rangeKeysBySuffix.Cmp = w.compare
    1183           0 :         w.rangeKeysBySuffix.Keys = span.Keys // shares span.Keys' backing array, so the sort below reorders it in place
    1184           0 :         sort.Sort(&w.rangeKeysBySuffix)
    1185           0 : 
    1186           0 :         w.rangeKeySpan = span
    1187           0 :         w.rangeKeySpan.Keys = w.rangeKeysBySuffix.Keys
    1188           0 :         if w.err == nil { // once an error has been recorded, later spans are not encoded
    1189           0 :                 w.err = w.EncodeSpan(&w.rangeKeySpan)
    1190           0 :         }
    1191             : }
    1192             : 
    1193             : // addRangeKey adds a range key set, unset, or delete key/value pair to the
    1194             : // table being written.
    1195             : //
    1196             : // Range keys must be supplied in strictly ascending order of start key (i.e.
    1197             : // user key ascending, sequence number descending, and key type descending).
    1198             : // Ranges added must also be supplied in fragmented span order - i.e. other than
    1199             : // spans that are perfectly aligned (same start and end keys), spans may not
    1200             : // overlap. Range keys may be added out of order relative to point keys and
    1201             : // range deletions.
    1202           1 : func (w *Writer) addRangeKey(key InternalKey, value []byte) error {
    1203           1 :         if !w.disableKeyOrderChecks && w.rangeKeyBlock.EntryCount() > 0 {
    1204           1 :                 prevStartKey := w.rangeKeyBlock.CurKey()
    1205           1 :                 prevEndKey, _, err := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.CurValue())
    1206           1 :                 if err != nil {
    1207           0 :                         // We panic here as we should have previously decoded and validated this
    1208           0 :                         // key and value when it was first added to the range key block.
    1209           0 :                         panic(err)
    1210             :                 }
    1211             :                 // NOTE(review): value is only decoded inside this guarded branch, so the first entry is not validated here.
    1212           1 :                 curStartKey := key
    1213           1 :                 curEndKey, _, err := rangekey.DecodeEndKey(curStartKey.Kind(), value)
    1214           1 :                 if err != nil {
    1215           0 :                         w.err = err
    1216           0 :                         return w.err
    1217           0 :                 }
    1218             : 
    1219             :                 // Start keys must be strictly increasing.
    1220           1 :                 if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 {
    1221           0 :                         w.err = errors.Errorf(
    1222           0 :                                 "pebble: range keys starts must be added in increasing order: %s, %s",
    1223           0 :                                 prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
    1224           0 :                         return w.err
    1225           0 :                 }
    1226             : 
    1227             :                 // Start keys are increasing. If the start user keys are equal, the
    1228             :                 // end keys must be equal (i.e. aligned spans).
    1229           1 :                 if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 {
    1230           1 :                         if w.compare(prevEndKey, curEndKey) != 0 {
    1231           0 :                                 w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s",
    1232           0 :                                         prevStartKey.Pretty(w.formatKey),
    1233           0 :                                         curStartKey.Pretty(w.formatKey))
    1234           0 :                                 return w.err
    1235           0 :                         }
    1236           1 :                 } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 {
    1237           0 :                         // If the start user keys are NOT equal, the spans must be disjoint (i.e.
    1238           0 :                         // no overlap).
    1239           0 :                         // NOTE: the inequality excludes zero, as we allow the end key of the
    1240           0 :                         // lower span to be the same as the start key of the upper span, because
    1241           0 :                         // the range end key is considered an exclusive bound.
    1242           0 :                         w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s",
    1243           0 :                                 prevStartKey.Pretty(w.formatKey),
    1244           0 :                                 curStartKey.Pretty(w.formatKey))
    1245           0 :                         return w.err
    1246           0 :                 }
    1247             :         }
    1248             : 
    1249             :         // TODO(travers): Add an invariant-gated check to ensure that suffix-values
    1250             :         // are sorted within coalesced spans.
    1251             : 
    1252             :         // Range-keys and point-keys are intended to live in "parallel" keyspaces.
    1253             :         // However, we track a single seqnum in the table metadata that spans both of
    1254             :         // these keyspaces.
    1255             :         // TODO(travers): Consider tracking range key seqnums separately.
    1256           1 :         w.meta.updateSeqNum(key.SeqNum())
    1257           1 : 
    1258           1 :         // Range keys are fragmented, so the start key of the first range key
    1259           1 :         // added will be the smallest. The largest range key is determined in
    1260           1 :         // Writer.Close() as the end key of the last range key added to the block.
    1261           1 :         if w.props.NumRangeKeys() == 0 {
    1262           1 :                 w.meta.SetSmallestRangeKey(key.Clone())
    1263           1 :         }
    1264             : 
    1265             :         // Update table properties.
    1266           1 :         w.props.RawRangeKeyKeySize += uint64(key.Size())
    1267           1 :         w.props.RawRangeKeyValueSize += uint64(len(value))
    1268           1 :         switch key.Kind() {
    1269           1 :         case base.InternalKeyKindRangeKeyDelete:
    1270           1 :                 w.props.NumRangeKeyDels++
    1271           1 :         case base.InternalKeyKindRangeKeySet:
    1272           1 :                 w.props.NumRangeKeySets++
    1273           1 :         case base.InternalKeyKindRangeKeyUnset:
    1274           1 :                 w.props.NumRangeKeyUnsets++
    1275           0 :         default: // any other kind is treated as a programming error
    1276           0 :                 panic(errors.Errorf("pebble: invalid range key type: %s", key.Kind()))
    1277             :         }
    1278             : 
    1279             :         // Add the key to the block.
    1280           1 :         w.rangeKeyBlock.Add(key, value)
    1281           1 :         return nil
    1282             : }
    1283             : 
    1284             : // tempRangeKeyBuf returns a slice of length n from the Writer's rkBuf byte
    1285             : // slice. Any byte written to the returned slice is retained for the lifetime of
    1286             : // the Writer.
    1287           0 : func (w *Writer) tempRangeKeyBuf(n int) []byte {
    1288           0 :         if cap(w.rkBuf)-len(w.rkBuf) < n { // not enough spare capacity for n more bytes
    1289           0 :                 size := len(w.rkBuf) + 2*n
    1290           0 :                 if size < 2*cap(w.rkBuf) { // grow to at least double the current capacity
    1291           0 :                         size = 2 * cap(w.rkBuf)
    1292           0 :                 }
    1293           0 :                 buf := make([]byte, len(w.rkBuf), size)
    1294           0 :                 copy(buf, w.rkBuf)
    1295           0 :                 w.rkBuf = buf // slices handed out earlier keep referring to the previous backing array
    1296             :         }
    1297           0 :         b := w.rkBuf[len(w.rkBuf) : len(w.rkBuf)+n] // the n bytes just past the current length
    1298           0 :         w.rkBuf = w.rkBuf[:len(w.rkBuf)+n] // extend rkBuf so the next call does not hand out the same bytes
    1299           0 :         return b
    1300             : }
    1301             : 
    1302             : // tempRangeKeyCopy returns a copy of the provided slice, stored in the Writer's
    1303             : // range key buffer.
    1304           0 : func (w *Writer) tempRangeKeyCopy(k []byte) []byte {
    1305           0 :         if len(k) == 0 { // nil and empty inputs both yield nil; no buffer space is consumed
    1306           0 :                 return nil
    1307           0 :         }
    1308           0 :         buf := w.tempRangeKeyBuf(len(k))
    1309           0 :         copy(buf, k)
    1310           0 :         return buf
    1311             : }
    1312             : // maybeAddToFilter adds the prefix key[:w.split(key)] to w.filter; it does nothing when w.filter is nil.
    1313           1 : func (w *Writer) maybeAddToFilter(key []byte) {
    1314           1 :         if w.filter != nil {
    1315           1 :                 prefix := key[:w.split(key)]
    1316           1 :                 w.filter.addKey(prefix)
    1317           1 :         }
    1318             : }
    1319             : // flush finishes and compresses the current data block, queues a writeTask for it, and installs a fresh dataBlockBuf; key feeds the index separator.
    1320           1 : func (w *Writer) flush(key InternalKey) error {
    1321           1 :         // We're finishing a data block.
    1322           1 :         err := w.finishDataBlockProps(w.dataBlockBuf)
    1323           1 :         if err != nil {
    1324           0 :                 return err
    1325           0 :         }
    1326           1 :         w.dataBlockBuf.finish()
    1327           1 :         w.dataBlockBuf.compressAndChecksum(w.compression)
    1328           1 :         // Since dataBlockEstimates.addInflightDataBlock was never called, the
    1329           1 :         // inflightSize is set to 0.
    1330           1 :         w.coordination.sizeEstimate.dataBlockCompressed(len(w.dataBlockBuf.compressed), 0)
    1331           1 : 
    1332           1 :         // Determine if the index block should be flushed. Since we're accessing the
    1333           1 :         // dataBlockBuf.dataBlock.curKey here, we have to make sure that once we start
    1334           1 :         // to pool the dataBlockBufs, the curKey isn't used by the Writer once the
    1335           1 :         // dataBlockBuf is added back to a sync.Pool. In this particular case, the
    1336           1 :         // byte slice which supports "sep" will eventually be copied when "sep" is
    1337           1 :         // added to the index block.
    1338           1 :         prevKey := w.dataBlockBuf.dataBlock.CurKey()
    1339           1 :         sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf)
    1340           1 :         // We determine that we should flush an index block from the Writer client
    1341           1 :         // goroutine, but we actually finish the index block from the writeQueue.
    1342           1 :         // When we determine that an index block should be flushed, we need to call
    1343           1 :         // BlockPropertyCollector.FinishIndexBlock. But block property collector
    1344           1 :         // calls must happen sequentially from the Writer client. Therefore, we need
    1345           1 :         // to determine that we are going to flush the index block from the Writer
    1346           1 :         // client.
    1347           1 :         shouldFlushIndexBlock := supportsTwoLevelIndex(w.tableFormat) && w.indexBlock.shouldFlush(
    1348           1 :                 sep, encodedBHPEstimatedSize, w.indexBlockOptions, w.allocatorSizeClasses,
    1349           1 :         )
    1350           1 : 
    1351           1 :         var indexProps []byte
    1352           1 :         var flushableIndexBlock *indexBlockBuf
    1353           1 :         if shouldFlushIndexBlock {
    1354           1 :                 flushableIndexBlock = w.indexBlock
    1355           1 :                 w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
    1356           1 :                 // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
    1357           1 :                 // flush the index block.
    1358           1 :                 indexProps, err = w.finishIndexBlockProps()
    1359           1 :                 if err != nil {
    1360           0 :                         return err
    1361           0 :                 }
    1362             :         }
    1363             : 
    1364             :         // We've called BlockPropertyCollector.FinishDataBlock, and, if necessary,
    1365             :         // BlockPropertyCollector.FinishIndexBlock. Since we've decided to finish
    1366             :         // the data block, we can call
    1367             :         // BlockPropertyCollector.AddPrevDataBlockToIndexBlock.
    1368           1 :         w.addPrevDataBlockToIndexBlockProps()
    1369           1 : 
    1370           1 :         // Schedule a write.
    1371           1 :         writeTask := writeTaskPool.Get().(*writeTask)
    1372           1 :         // We're setting compressionDone to indicate that compression of this block
    1373           1 :         // has already been completed.
    1374           1 :         writeTask.compressionDone <- true
    1375           1 :         writeTask.buf = w.dataBlockBuf
    1376           1 :         writeTask.indexEntrySep = sep
    1377           1 :         writeTask.currIndexBlock = w.indexBlock
    1378           1 :         writeTask.indexInflightSize = sep.Size() + encodedBHPEstimatedSize
    1379           1 :         writeTask.finishedIndexProps = indexProps
    1380           1 :         writeTask.flushableIndexBlock = flushableIndexBlock
    1381           1 : 
    1382           1 :         // The writeTask corresponds to an unwritten index entry.
    1383           1 :         w.indexBlock.addInflight(writeTask.indexInflightSize)
    1384           1 : 
    1385           1 :         w.dataBlockBuf = nil // drop the Writer's reference; the buffer was handed to writeTask.buf above
    1386           1 :         if w.coordination.parallelismEnabled {
    1387           1 :                 w.coordination.writeQueue.add(writeTask)
    1388           1 :         } else {
    1389           1 :                 err = w.coordination.writeQueue.addSync(writeTask)
    1390           1 :         }
    1391           1 :         w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType) // installed even when addSync returned an error
    1392           1 : 
    1393           1 :         return err
    1394             : }
    1395             : // maybeFlush flushes the current data block when dataBlockBuf.shouldFlush says so for key/valueLen; a flush error is stored in w.err and returned.
    1396           1 : func (w *Writer) maybeFlush(key InternalKey, valueLen int) error {
    1397           1 :         if !w.dataBlockBuf.shouldFlush(key, valueLen, w.dataBlockOptions, w.allocatorSizeClasses) {
    1398           1 :                 return nil
    1399           1 :         }
    1400             : 
    1401           1 :         err := w.flush(key)
    1402           1 : 
    1403           1 :         if err != nil {
    1404           0 :                 w.err = err
    1405           0 :                 return err
    1406           0 :         }
    1407             : 
    1408           1 :         return nil
    1409             : }
    1410             : 
    1411             : // finishDataBlockProps calls FinishDataBlock on every block property collector and stores the
    1412             : // encoded result in buf.dataBlockProps. That slice must be encoded before any future use of
    1413             : // buf.blockPropsEncoder, since the properties slice will get reused by the blockPropsEncoder.
    1414           1 : func (w *Writer) finishDataBlockProps(buf *dataBlockBuf) error {
    1415           1 :         if len(w.blockPropCollectors) == 0 { // nothing to collect; buf.dataBlockProps is left untouched
    1416           1 :                 return nil
    1417           1 :         }
    1418           1 :         var err error
    1419           1 :         buf.blockPropsEncoder.resetProps()
    1420           1 :         for i := range w.blockPropCollectors {
    1421           1 :                 scratch := buf.blockPropsEncoder.getScratchForProp()
    1422           1 :                 if scratch, err = w.blockPropCollectors[i].FinishDataBlock(scratch); err != nil {
    1423           0 :                         return err
    1424           0 :                 }
    1425           1 :                 buf.blockPropsEncoder.addProp(shortID(i), scratch) // the collector's index doubles as its short ID
    1426             :         }
    1427             : 
    1428           1 :         buf.dataBlockProps = buf.blockPropsEncoder.unsafeProps()
    1429           1 :         return nil
    1430             : }
    1431             : 
    1432             : // maybeAddBlockPropertiesToBlockHandle finishes the current data block's properties and returns
    1433             : // bh paired with them. The returned BlockHandleWithProperties must be encoded before any future
    1434             : // use of the dataBlockBuf's blockPropsEncoder, since the properties slice will get reused by it.
    1435             : // It should only be called if the block is being written synchronously with the Writer client.
    1436             : func (w *Writer) maybeAddBlockPropertiesToBlockHandle(
    1437             :         bh block.Handle,
    1438           1 : ) (BlockHandleWithProperties, error) {
    1439           1 :         err := w.finishDataBlockProps(w.dataBlockBuf)
    1440           1 :         if err != nil {
    1441           0 :                 return BlockHandleWithProperties{}, err
    1442           0 :         }
    1443           1 :         return BlockHandleWithProperties{Handle: bh, Props: w.dataBlockBuf.dataBlockProps}, nil
    1444             : }
    1445             : 
    1446           1 : func (w *Writer) indexEntrySep(prevKey, key InternalKey, dataBlockBuf *dataBlockBuf) InternalKey {
    1447           1 :         // Make a rough guess that we want key-sized scratch to compute the separator.
    1448           1 :         if cap(dataBlockBuf.sepScratch) < key.Size() {
    1449           1 :                 dataBlockBuf.sepScratch = make([]byte, 0, key.Size()*2)
    1450           1 :         }
    1451             : 
    1452           1 :         var sep InternalKey
    1453           1 :         if key.UserKey == nil && key.Trailer == 0 {
    1454           1 :                 sep = prevKey.Successor(w.compare, w.successor, dataBlockBuf.sepScratch[:0])
    1455           1 :         } else {
    1456           1 :                 sep = prevKey.Separator(w.compare, w.separator, dataBlockBuf.sepScratch[:0], key)
    1457           1 :         }
    1458           1 :         return sep
    1459             : }
    1460             : 
    1461             : // addIndexEntry adds an index entry for the specified key and block handle.
    1462             : // addIndexEntry can be called from both the Writer client goroutine, and the
    1463             : // writeQueue goroutine. If the flushIndexBuf != nil, then the indexProps, as
    1464             : // they're used when the index block is finished.
    1465             : //
    1466             : // Invariant:
    1467             : //  1. addIndexEntry must not store references to the sep InternalKey, the tmp
    1468             : //     byte slice, bhp.Props. That is, these must be either deep copied or
    1469             : //     encoded.
    1470             : //  2. addIndexEntry must not hold references to the flushIndexBuf, and the writeTo
    1471             : //     indexBlockBufs.
    1472             : func (w *Writer) addIndexEntry(
    1473             :         sep InternalKey,
    1474             :         bhp BlockHandleWithProperties,
    1475             :         tmp []byte,
    1476             :         flushIndexBuf *indexBlockBuf,
    1477             :         writeTo *indexBlockBuf,
    1478             :         inflightSize int,
    1479             :         indexProps []byte,
    1480           1 : ) error {
    1481           1 :         if bhp.Length == 0 {
    1482           0 :                 // A valid blockHandle must be non-zero.
    1483           0 :                 // In particular, it must have a non-zero length.
    1484           0 :                 return nil
    1485           0 :         }
    1486             : 
    1487           1 :         encoded := encodeBlockHandleWithProperties(tmp, bhp)
    1488           1 : 
    1489           1 :         if flushIndexBuf != nil {
    1490           1 :                 if cap(w.indexPartitions) == 0 {
    1491           1 :                         w.indexPartitions = make([]indexBlockAndBlockProperties, 0, 32)
    1492           1 :                 }
    1493             :                 // Enable two level indexes if there is more than one index block.
    1494           1 :                 w.twoLevelIndex = true
    1495           1 :                 if err := w.finishIndexBlock(flushIndexBuf, indexProps); err != nil {
    1496           0 :                         return err
    1497           0 :                 }
    1498             :         }
    1499             : 
    1500           1 :         writeTo.add(sep, encoded, inflightSize)
    1501           1 :         return nil
    1502             : }
    1503             : 
    1504           1 : func (w *Writer) addPrevDataBlockToIndexBlockProps() {
    1505           1 :         for i := range w.blockPropCollectors {
    1506           1 :                 w.blockPropCollectors[i].AddPrevDataBlockToIndexBlock()
    1507           1 :         }
    1508             : }
    1509             : 
    1510             : // addIndexEntrySync adds an index entry for the specified key and block handle.
    1511             : // Writer.addIndexEntry is only called synchronously once Writer.Close is called.
    1512             : // addIndexEntrySync should only be called if we're sure that index entries
    1513             : // aren't being written asynchronously.
    1514             : //
    1515             : // Invariant:
    1516             : //  1. addIndexEntrySync must not store references to the prevKey, key InternalKey's,
    1517             : //     the tmp byte slice. That is, these must be either deep copied or encoded.
    1518             : //
    1519             : // TODO: Improve coverage of this method. e.g. tests passed without the line
    1520             : // `w.twoLevelIndex = true` previously.
    1521             : func (w *Writer) addIndexEntrySync(
    1522             :         prevKey, key InternalKey, bhp BlockHandleWithProperties, tmp []byte,
    1523           1 : ) error {
    1524           1 :         return w.addIndexEntrySep(w.indexEntrySep(prevKey, key, w.dataBlockBuf), bhp, tmp)
    1525           1 : }
    1526             : 
    1527             : func (w *Writer) addIndexEntrySep(
    1528             :         sep InternalKey, bhp BlockHandleWithProperties, tmp []byte,
    1529           1 : ) error {
    1530           1 :         shouldFlush := supportsTwoLevelIndex(
    1531           1 :                 w.tableFormat) && w.indexBlock.shouldFlush(
    1532           1 :                 sep, encodedBHPEstimatedSize, w.indexBlockOptions, w.allocatorSizeClasses,
    1533           1 :         )
    1534           1 :         var flushableIndexBlock *indexBlockBuf
    1535           1 :         var props []byte
    1536           1 :         var err error
    1537           1 :         if shouldFlush {
    1538           1 :                 flushableIndexBlock = w.indexBlock
    1539           1 :                 w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
    1540           1 :                 w.twoLevelIndex = true
    1541           1 :                 // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
    1542           1 :                 // flush the index block.
    1543           1 :                 props, err = w.finishIndexBlockProps()
    1544           1 :                 if err != nil {
    1545           0 :                         return err
    1546           0 :                 }
    1547             :         }
    1548             : 
    1549           1 :         err = w.addIndexEntry(sep, bhp, tmp, flushableIndexBlock, w.indexBlock, 0, props)
    1550           1 :         if flushableIndexBlock != nil {
    1551           1 :                 flushableIndexBlock.clear()
    1552           1 :                 indexBlockBufPool.Put(flushableIndexBlock)
    1553           1 :         }
    1554           1 :         w.addPrevDataBlockToIndexBlockProps()
    1555           1 :         return err
    1556             : }
    1557             : 
    1558             : func shouldFlushWithHints(
    1559             :         keyLen, valueLen int,
    1560             :         restartInterval, estimatedBlockSize, numEntries int,
    1561             :         flushOptions flushDecisionOptions,
    1562             :         sizeClassHints []int,
    1563           1 : ) bool {
    1564           1 :         if numEntries == 0 {
    1565           1 :                 return false
    1566           1 :         }
    1567             : 
    1568             :         // If we are not informed about the memory allocator's size classes we fall
    1569             :         // back to a simple set of flush heuristics that are unaware of internal
    1570             :         // fragmentation in block cache allocations.
    1571           1 :         if len(sizeClassHints) == 0 {
    1572           1 :                 return shouldFlushWithoutHints(
    1573           1 :                         keyLen, valueLen, restartInterval, estimatedBlockSize, numEntries, flushOptions)
    1574           1 :         }
    1575             : 
    1576             :         // For size-class aware flushing we need to account for the metadata that is
    1577             :         // allocated when this block is loaded into the block cache. For instance, if
    1578             :         // a block has size 1020B it may fit within a 1024B class. However, when
    1579             :         // loaded into the block cache we also allocate space for the cache entry
    1580             :         // metadata. The new allocation of size ~1052B may now only fit within a
    1581             :         // 2048B class, which increases internal fragmentation.
    1582           0 :         blockSizeWithMetadata := estimatedBlockSize + cache.ValueMetadataSize
    1583           0 : 
    1584           0 :         // For the fast path we can avoid computing the exact varint encoded
    1585           0 :         // key-value pair size. Instead, we combine the key-value pair size with an
    1586           0 :         // upper-bound estimate of the associated metadata (4B restart point, 4B
    1587           0 :         // shared prefix length, 5B varint unshared key size, 5B varint value size).
    1588           0 :         newEstimatedSize := blockSizeWithMetadata + keyLen + valueLen + 18
    1589           0 :         // Our new block size estimate disregards key prefix compression. This puts
    1590           0 :         // us at risk of overestimating the size and flushing small blocks. We
    1591           0 :         // mitigate this by imposing a minimum size restriction.
    1592           0 :         if blockSizeWithMetadata <= flushOptions.sizeClassAwareThreshold || newEstimatedSize <= flushOptions.blockSize {
    1593           0 :                 return false
    1594           0 :         }
    1595             : 
    1596           0 :         sizeClass, ok := blockSizeClass(blockSizeWithMetadata, sizeClassHints)
    1597           0 :         // If the block size could not be mapped to a size class we fall back to
    1598           0 :         // using a simpler set of flush heuristics.
    1599           0 :         if !ok {
    1600           0 :                 return shouldFlushWithoutHints(
    1601           0 :                         keyLen, valueLen, restartInterval, estimatedBlockSize, numEntries, flushOptions)
    1602           0 :         }
    1603             : 
    1604             :         // Tighter upper-bound estimate of the metadata stored with the next
    1605             :         // key-value pair.
    1606           0 :         newSize := blockSizeWithMetadata + keyLen + valueLen
    1607           0 :         if numEntries%restartInterval == 0 {
    1608           0 :                 newSize += 4
    1609           0 :         }
    1610           0 :         newSize += 4                            // varint for shared prefix length
    1611           0 :         newSize += uvarintLen(uint32(keyLen))   // varint for unshared key bytes
    1612           0 :         newSize += uvarintLen(uint32(valueLen)) // varint for value size
    1613           0 : 
    1614           0 :         if blockSizeWithMetadata < flushOptions.blockSize {
    1615           0 :                 newSizeClass, ok := blockSizeClass(newSize, sizeClassHints)
    1616           0 :                 if ok && newSizeClass-newSize >= sizeClass-blockSizeWithMetadata {
    1617           0 :                         // Although the block hasn't reached the target size, waiting to insert the
    1618           0 :                         // next entry would exceed the target and increase memory fragmentation.
    1619           0 :                         return true
    1620           0 :                 }
    1621           0 :                 return false
    1622             :         }
    1623             : 
    1624             :         // Flush if inserting the next entry bumps the block size to the memory
    1625             :         // allocator's next size class.
    1626           0 :         return newSize > sizeClass
    1627             : }
    1628             : 
    1629             : func shouldFlushWithoutHints(
    1630             :         keyLen, valueLen int,
    1631             :         restartInterval, estimatedBlockSize, numEntries int,
    1632             :         flushOptions flushDecisionOptions,
    1633           1 : ) bool {
    1634           1 :         if estimatedBlockSize >= flushOptions.blockSize {
    1635           1 :                 return true
    1636           1 :         }
    1637             : 
    1638             :         // The block is currently smaller than the target size.
    1639           1 :         if estimatedBlockSize <= flushOptions.blockSizeThreshold {
    1640           1 :                 // The block is smaller than the threshold size at which we'll consider
    1641           1 :                 // flushing it.
    1642           1 :                 return false
    1643           1 :         }
    1644             : 
    1645           1 :         newSize := estimatedBlockSize + keyLen + valueLen
    1646           1 :         if numEntries%restartInterval == 0 {
    1647           1 :                 newSize += 4
    1648           1 :         }
    1649           1 :         newSize += 4                            // varint for shared prefix length
    1650           1 :         newSize += uvarintLen(uint32(keyLen))   // varint for unshared key bytes
    1651           1 :         newSize += uvarintLen(uint32(valueLen)) // varint for value size
    1652           1 :         // Flush if the block plus the new entry is larger than the target size.
    1653           1 :         return newSize > flushOptions.blockSize
    1654             : }
    1655             : 
    1656             : // blockSizeClass returns the smallest memory allocator size class that could
    1657             : // hold a block of a given size and returns a boolean indicating whether an
    1658             : // appropriate size class was found. It is useful for computing the potential
    1659             : // space wasted by an allocation.
    1660           0 : func blockSizeClass(blockSize int, sizeClassHints []int) (int, bool) {
    1661           0 :         sizeClassIdx, _ := slices.BinarySearch(sizeClassHints, blockSize)
    1662           0 :         if sizeClassIdx == len(sizeClassHints) {
    1663           0 :                 return -1, false
    1664           0 :         }
    1665           0 :         return sizeClassHints[sizeClassIdx], true
    1666             : }
    1667             : 
    1668           1 : func cloneKeyWithBuf(k InternalKey, a bytealloc.A) (bytealloc.A, InternalKey) {
    1669           1 :         if len(k.UserKey) == 0 {
    1670           0 :                 return a, k
    1671           0 :         }
    1672           1 :         a, keyCopy := a.Copy(k.UserKey)
    1673           1 :         return a, InternalKey{UserKey: keyCopy, Trailer: k.Trailer}
    1674             : }
    1675             : 
    1676             : // Invariants: The byte slice returned by finishIndexBlockProps is heap-allocated
    1677             : //
    1678             : //      and has its own lifetime, independent of the Writer and the blockPropsEncoder,
    1679             : //
    1680             : // and it is safe to:
    1681             : //  1. Reuse w.blockPropsEncoder without first encoding the byte slice returned.
    1682             : //  2. Store the byte slice in the Writer since it is a copy and not supported by
    1683             : //     an underlying buffer.
    1684           1 : func (w *Writer) finishIndexBlockProps() ([]byte, error) {
    1685           1 :         w.blockPropsEncoder.resetProps()
    1686           1 :         for i := range w.blockPropCollectors {
    1687           1 :                 scratch := w.blockPropsEncoder.getScratchForProp()
    1688           1 :                 var err error
    1689           1 :                 if scratch, err = w.blockPropCollectors[i].FinishIndexBlock(scratch); err != nil {
    1690           0 :                         return nil, err
    1691           0 :                 }
    1692           1 :                 w.blockPropsEncoder.addProp(shortID(i), scratch)
    1693             :         }
    1694           1 :         return w.blockPropsEncoder.props(), nil
    1695             : }
    1696             : 
    1697             : // finishIndexBlock finishes the current index block and adds it to the top
    1698             : // level index block. This is only used when two level indexes are enabled.
    1699             : //
    1700             : // Invariants:
    1701             : //  1. The props slice passed into finishedIndexBlock must not be a
    1702             : //     owned by any other struct, since it will be stored in the Writer.indexPartitions
    1703             : //     slice.
    1704             : //  2. None of the buffers owned by indexBuf will be shallow copied and stored elsewhere.
    1705             : //     That is, it must be safe to reuse indexBuf after finishIndexBlock has been called.
    1706           1 : func (w *Writer) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error {
    1707           1 :         part := indexBlockAndBlockProperties{
    1708           1 :                 nEntries: indexBuf.block.EntryCount(), properties: props,
    1709           1 :         }
    1710           1 :         w.indexSepAlloc, part.sep = cloneKeyWithBuf(
    1711           1 :                 indexBuf.block.CurKey(), w.indexSepAlloc,
    1712           1 :         )
    1713           1 :         bk := indexBuf.finish()
    1714           1 :         if len(w.indexBlockAlloc) < len(bk) {
    1715           1 :                 // Allocate enough bytes for approximately 16 index blocks.
    1716           1 :                 w.indexBlockAlloc = make([]byte, len(bk)*16)
    1717           1 :         }
    1718           1 :         n := copy(w.indexBlockAlloc, bk)
    1719           1 :         part.block = w.indexBlockAlloc[:n:n]
    1720           1 :         w.indexBlockAlloc = w.indexBlockAlloc[n:]
    1721           1 :         w.indexPartitions = append(w.indexPartitions, part)
    1722           1 :         return nil
    1723             : }
    1724             : 
    1725           1 : func (w *Writer) writeTwoLevelIndex() (block.Handle, error) {
    1726           1 :         props, err := w.finishIndexBlockProps()
    1727           1 :         if err != nil {
    1728           0 :                 return block.Handle{}, err
    1729           0 :         }
    1730             :         // Add the final unfinished index.
    1731           1 :         if err = w.finishIndexBlock(w.indexBlock, props); err != nil {
    1732           0 :                 return block.Handle{}, err
    1733           0 :         }
    1734             : 
    1735           1 :         for i := range w.indexPartitions {
    1736           1 :                 b := &w.indexPartitions[i]
    1737           1 :                 w.props.NumDataBlocks += uint64(b.nEntries)
    1738           1 : 
    1739           1 :                 data := b.block
    1740           1 :                 w.props.IndexSize += uint64(len(data))
    1741           1 :                 bh, err := w.layout.WriteIndexBlock(data)
    1742           1 :                 if err != nil {
    1743           0 :                         return block.Handle{}, err
    1744           0 :                 }
    1745           1 :                 bhp := BlockHandleWithProperties{
    1746           1 :                         Handle: bh,
    1747           1 :                         Props:  b.properties,
    1748           1 :                 }
    1749           1 :                 encoded := encodeBlockHandleWithProperties(w.blockBuf.tmp[:], bhp)
    1750           1 :                 w.topLevelIndexBlock.Add(b.sep, encoded)
    1751             :         }
    1752             : 
    1753             :         // NB: RocksDB includes the block trailer length in the index size
    1754             :         // property, though it doesn't include the trailer in the top level
    1755             :         // index size property.
    1756           1 :         w.props.IndexPartitions = uint64(len(w.indexPartitions))
    1757           1 :         w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.EstimatedSize())
    1758           1 :         w.props.IndexSize += w.props.TopLevelIndexSize + block.TrailerLen
    1759           1 :         return w.layout.WriteIndexBlock(w.topLevelIndexBlock.Finish())
    1760             : }
    1761             : 
    1762             : func compressAndChecksum(
    1763             :         b []byte, compression Compression, blockBuf *blockBuf,
    1764           1 : ) (compressed []byte, trailer block.Trailer) {
    1765           1 :         // Compress the buffer, discarding the result if the improvement isn't at
    1766           1 :         // least 12.5%.
    1767           1 :         blockType, compressed := compressBlock(compression, b, blockBuf.compressedBuf)
    1768           1 :         if blockType != noCompressionBlockType && cap(compressed) > cap(blockBuf.compressedBuf) {
    1769           1 :                 blockBuf.compressedBuf = compressed[:cap(compressed)]
    1770           1 :         }
    1771           1 :         if len(compressed) < len(b)-len(b)/8 {
    1772           1 :                 b = compressed
    1773           1 :         } else {
    1774           1 :                 blockType = noCompressionBlockType
    1775           1 :         }
    1776             : 
    1777             :         // Calculate the checksum.
    1778           1 :         trailer[0] = byte(blockType)
    1779           1 :         checksum := blockBuf.checksummer.Checksum(b, trailer[:1])
    1780           1 :         return b, block.MakeTrailer(byte(blockType), checksum)
    1781             : }
    1782             : 
    1783             : // assertFormatCompatibility ensures that the features present on the table are
    1784             : // compatible with the table format version.
    1785           1 : func (w *Writer) assertFormatCompatibility() error {
    1786           1 :         // PebbleDBv1: block properties.
    1787           1 :         if len(w.blockPropCollectors) > 0 && w.tableFormat < TableFormatPebblev1 {
    1788           0 :                 return errors.Newf(
    1789           0 :                         "table format version %s is less than the minimum required version %s for block properties",
    1790           0 :                         w.tableFormat, TableFormatPebblev1,
    1791           0 :                 )
    1792           0 :         }
    1793             : 
    1794             :         // PebbleDBv2: range keys.
    1795           1 :         if w.props.NumRangeKeys() > 0 && w.tableFormat < TableFormatPebblev2 {
    1796           0 :                 return errors.Newf(
    1797           0 :                         "table format version %s is less than the minimum required version %s for range keys",
    1798           0 :                         w.tableFormat, TableFormatPebblev2,
    1799           0 :                 )
    1800           0 :         }
    1801             : 
    1802             :         // PebbleDBv3: value blocks.
    1803           1 :         if (w.props.NumValueBlocks > 0 || w.props.NumValuesInValueBlocks > 0 ||
    1804           1 :                 w.props.ValueBlocksSize > 0) && w.tableFormat < TableFormatPebblev3 {
    1805           0 :                 return errors.Newf(
    1806           0 :                         "table format version %s is less than the minimum required version %s for value blocks",
    1807           0 :                         w.tableFormat, TableFormatPebblev3)
    1808           0 :         }
    1809             : 
    1810             :         // PebbleDBv4: DELSIZED tombstones.
    1811           1 :         if w.props.NumSizedDeletions > 0 && w.tableFormat < TableFormatPebblev4 {
    1812           0 :                 return errors.Newf(
    1813           0 :                         "table format version %s is less than the minimum required version %s for sized deletion tombstones",
    1814           0 :                         w.tableFormat, TableFormatPebblev4)
    1815           0 :         }
    1816           1 :         return nil
    1817             : }
    1818             : 
    1819             : // UnsafeLastPointUserKey returns the last point key written to the writer to
    1820             : // which this option was passed during creation. The returned key points
    1821             : // directly into a buffer belonging to the Writer. The value's lifetime ends the
    1822             : // next time a point key is added to the Writer.
    1823             : //
    1824             : // Must not be called after Writer is closed.
    1825           1 : func (w *Writer) UnsafeLastPointUserKey() []byte {
    1826           1 :         if w != nil && w.dataBlockBuf.dataBlock.EntryCount() >= 1 {
    1827           1 :                 // w.dataBlockBuf.dataBlock.curKey is guaranteed to point to the last point key
    1828           1 :                 // which was added to the Writer.
    1829           1 :                 return w.dataBlockBuf.dataBlock.CurUserKey()
    1830           1 :         }
    1831           0 :         return nil
    1832             : }
    1833             : 
    1834             : // EncodeSpan encodes the keys in the given span. The span can contain either
    1835             : // only RANGEDEL keys or only range keys.
    1836             : //
    1837             : // This is a low-level API that bypasses the fragmenter. The spans passed to
    1838             : // this function must be fragmented and ordered.
    1839           1 : func (w *Writer) EncodeSpan(span *keyspan.Span) error {
    1840           1 :         if span.Empty() {
    1841           1 :                 return nil
    1842           1 :         }
    1843           1 :         if span.Keys[0].Kind() == base.InternalKeyKindRangeDelete {
    1844           1 :                 return rangedel.Encode(span, w.Add)
    1845           1 :         }
    1846           1 :         for i := range w.blockPropCollectors {
    1847           1 :                 if err := w.blockPropCollectors[i].AddRangeKeys(*span); err != nil {
    1848           0 :                         return err
    1849           0 :                 }
    1850             :         }
    1851           1 :         return w.rangeKeyEncoder.Encode(span)
    1852             : }
    1853             : 
    1854             : // Close finishes writing the table and closes the underlying file that the
    1855             : // table was written to.
    1856           1 : func (w *Writer) Close() (err error) {
    1857           1 :         defer func() {
    1858           1 :                 if w.valueBlockWriter != nil {
    1859           1 :                         releaseValueBlockWriter(w.valueBlockWriter)
    1860           1 :                         // Defensive code in case Close gets called again. We don't want to put
    1861           1 :                         // the same object to a sync.Pool.
    1862           1 :                         w.valueBlockWriter = nil
    1863           1 :                 }
    1864           1 :                 w.layout.Abort()
    1865           1 :                 // Record any error in the writer (so we can exit early if Close is called
    1866           1 :                 // again).
    1867           1 :                 if err != nil {
    1868           0 :                         w.err = err
    1869           0 :                 }
    1870             :         }()
    1871             : 
    1872             :         // finish must be called before we check for an error, because finish will
    1873             :         // block until every single task added to the writeQueue has been processed,
    1874             :         // and an error could be encountered while any of those tasks are processed.
    1875           1 :         if err := w.coordination.writeQueue.finish(); err != nil {
    1876           0 :                 return err
    1877           0 :         }
    1878             : 
    1879           1 :         if w.err != nil {
    1880           0 :                 return w.err
    1881           0 :         }
    1882             : 
    1883             :         // The w.meta.LargestPointKey is only used once the Writer is closed, so it is safe to set it
    1884             :         // when the Writer is closed.
    1885             :         //
    1886             :         // The following invariants ensure that setting the largest key at this point of a Writer close
    1887             :         // is correct:
    1888             :         // 1. Keys must only be added to the Writer in an increasing order.
    1889             :         // 2. The current w.dataBlockBuf is guaranteed to have the latest key added to the Writer. This
    1890             :         //    must be true, because a w.dataBlockBuf is only switched out when a dataBlock is flushed,
    1891             :         //    however, if a dataBlock is flushed, then we add a key to the new w.dataBlockBuf in the
    1892             :         //    addPoint function after the flush occurs.
    1893           1 :         if w.dataBlockBuf.dataBlock.EntryCount() >= 1 {
    1894           1 :                 w.meta.SetLargestPointKey(w.dataBlockBuf.dataBlock.CurKey().Clone())
    1895           1 :         }
    1896             : 
    1897             :         // Finish the last data block, or force an empty data block if there
    1898             :         // aren't any data blocks at all.
    1899           1 :         if w.dataBlockBuf.dataBlock.EntryCount() > 0 || w.indexBlock.block.EntryCount() == 0 {
    1900           1 :                 bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.dataBlock.Finish(), &w.dataBlockBuf.blockBuf)
    1901           1 :                 if err != nil {
    1902           0 :                         return err
    1903           0 :                 }
    1904           1 :                 bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh)
    1905           1 :                 if err != nil {
    1906           0 :                         return err
    1907           0 :                 }
    1908           1 :                 prevKey := w.dataBlockBuf.dataBlock.CurKey()
    1909           1 :                 if err := w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil {
    1910           0 :                         return err
    1911           0 :                 }
    1912             :         }
    1913           1 :         w.props.DataSize = w.layout.offset
    1914           1 : 
    1915           1 :         // Write the filter block.
    1916           1 :         if w.filter != nil {
    1917           1 :                 bh, err := w.layout.WriteFilterBlock(w.filter)
    1918           1 :                 if err != nil {
    1919           0 :                         return err
    1920           0 :                 }
    1921           1 :                 w.props.FilterPolicyName = w.filter.policyName()
    1922           1 :                 w.props.FilterSize = bh.Length
    1923             :         }
    1924             : 
    1925           1 :         if w.twoLevelIndex {
    1926           1 :                 w.props.IndexType = twoLevelIndex
    1927           1 :                 // Write the two level index block.
    1928           1 :                 if _, err = w.writeTwoLevelIndex(); err != nil {
    1929           0 :                         return err
    1930           0 :                 }
    1931           1 :         } else {
    1932           1 :                 w.props.IndexType = binarySearchIndex
    1933           1 :                 // NB: RocksDB includes the block trailer length in the index size
    1934           1 :                 // property, though it doesn't include the trailer in the filter size
    1935           1 :                 // property.
    1936           1 :                 w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + block.TrailerLen
    1937           1 :                 w.props.NumDataBlocks = uint64(w.indexBlock.block.EntryCount())
    1938           1 :                 // Write the single level index block.
    1939           1 :                 if _, err = w.layout.WriteIndexBlock(w.indexBlock.finish()); err != nil {
    1940           0 :                         return err
    1941           0 :                 }
    1942             :         }
    1943             : 
    1944             :         // Write the range-del block.
    1945           1 :         if w.props.NumRangeDeletions > 0 {
    1946           1 :                 // Because the range tombstones are fragmented, the end key of the last
    1947           1 :                 // added range tombstone will be the largest range tombstone key. Note
    1948           1 :                 // that we need to make this into a range deletion sentinel because
    1949           1 :                 // sstable boundaries are inclusive while the end key of a range
    1950           1 :                 // deletion tombstone is exclusive. A Clone() is necessary as
    1951           1 :                 // rangeDelBlock.curValue is the same slice that will get passed into
    1952           1 :                 // w.writer, and some implementations of vfs.File mutate the slice
    1953           1 :                 // passed into Write(). Also, w.meta will often outlive the blockWriter,
    1954           1 :                 // and so cloning curValue allows the rangeDelBlock's internal buffer to
    1955           1 :                 // get gc'd.
    1956           1 :                 k := base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.CurValue()).Clone()
    1957           1 :                 w.meta.SetLargestRangeDelKey(k)
    1958           1 :                 if _, err := w.layout.WriteRangeDeletionBlock(w.rangeDelBlock.Finish()); err != nil {
    1959           0 :                         return err
    1960           0 :                 }
    1961             :         }
    1962             : 
    1963             :         // Write the range-key block, flushing any remaining spans from the
    1964             :         // fragmenter first.
    1965           1 :         w.fragmenter.Finish()
    1966           1 : 
    1967           1 :         if w.props.NumRangeKeys() > 0 {
    1968           1 :                 key := w.rangeKeyBlock.CurKey()
    1969           1 :                 kind := key.Kind()
    1970           1 :                 endKey, _, err := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.CurValue())
    1971           1 :                 if err != nil {
    1972           0 :                         return err
    1973           0 :                 }
    1974           1 :                 k := base.MakeExclusiveSentinelKey(kind, endKey).Clone()
    1975           1 :                 w.meta.SetLargestRangeKey(k)
    1976           1 :                 if _, err := w.layout.WriteRangeKeyBlock(w.rangeKeyBlock.Finish()); err != nil {
    1977           0 :                         return err
    1978           0 :                 }
    1979             :         }
    1980             : 
    1981           1 :         if w.valueBlockWriter != nil {
    1982           1 :                 _, vbStats, err := w.valueBlockWriter.finish(&w.layout, w.layout.offset)
    1983           1 :                 if err != nil {
    1984           0 :                         return err
    1985           0 :                 }
    1986           1 :                 w.props.NumValueBlocks = vbStats.numValueBlocks
    1987           1 :                 w.props.NumValuesInValueBlocks = vbStats.numValuesInValueBlocks
    1988           1 :                 w.props.ValueBlocksSize = vbStats.valueBlocksAndIndexSize
    1989             :         }
    1990             : 
    1991           1 :         {
    1992           1 :                 // Finish and record the prop collectors if props are not yet recorded.
    1993           1 :                 // Pre-computed props might have been copied by specialized sst creators
    1994           1 :                 // like suffix replacer.
    1995           1 :                 if len(w.props.UserProperties) == 0 {
    1996           1 :                         userProps := make(map[string]string)
    1997           1 :                         for i := range w.blockPropCollectors {
    1998           1 :                                 scratch := w.blockPropsEncoder.getScratchForProp()
    1999           1 :                                 // Place the shortID in the first byte.
    2000           1 :                                 scratch = append(scratch, byte(i))
    2001           1 :                                 buf, err := w.blockPropCollectors[i].FinishTable(scratch)
    2002           1 :                                 if err != nil {
    2003           0 :                                         return err
    2004           0 :                                 }
    2005           1 :                                 var prop string
    2006           1 :                                 if len(buf) > 0 {
    2007           1 :                                         prop = string(buf)
    2008           1 :                                 }
    2009             :                                 // NB: The property is populated in the map even if it is the
    2010             :                                 // empty string, since the presence in the map is what indicates
    2011             :                                 // that the block property collector was used when writing.
    2012           1 :                                 userProps[w.blockPropCollectors[i].Name()] = prop
    2013             :                         }
    2014           1 :                         if len(userProps) > 0 {
    2015           1 :                                 w.props.UserProperties = userProps
    2016           1 :                         }
    2017             :                 }
    2018             : 
    2019             :                 // Write the properties block.
    2020           1 :                 var raw rowblk.Writer
    2021           1 :                 // The restart interval is set to infinity because the properties block
    2022           1 :                 // is always read sequentially and cached in a heap located object. This
    2023           1 :                 // reduces table size without a significant impact on performance.
    2024           1 :                 raw.RestartInterval = propertiesBlockRestartInterval
    2025           1 :                 w.props.CompressionOptions = rocksDBCompressionOptions
    2026           1 :                 w.props.save(w.tableFormat, &raw)
    2027           1 :                 w.layout.WritePropertiesBlock(raw.Finish())
    2028             :         }
    2029             : 
    2030             :         // Write the table footer.
    2031           1 :         w.meta.Size, err = w.layout.Finish()
    2032           1 :         if err != nil {
    2033           0 :                 return err
    2034           0 :         }
    2035           1 :         w.meta.Properties = w.props
    2036           1 : 
    2037           1 :         // Check that the features present in the table are compatible with the format
    2038           1 :         // configured for the table.
    2039           1 :         if err = w.assertFormatCompatibility(); err != nil {
    2040           0 :                 return err
    2041           0 :         }
    2042             : 
    2043           1 :         w.dataBlockBuf.clear()
    2044           1 :         dataBlockBufPool.Put(w.dataBlockBuf)
    2045           1 :         w.dataBlockBuf = nil
    2046           1 :         w.indexBlock.clear()
    2047           1 :         indexBlockBufPool.Put(w.indexBlock)
    2048           1 :         w.indexBlock = nil
    2049           1 : 
    2050           1 :         // Make any future calls to Set or Close return an error.
    2051           1 :         w.err = errWriterClosed
    2052           1 :         return nil
    2053             : }
    2054             : 
    2055             : // EstimatedSize returns the estimated size of the sstable being written if a
    2056             : // call to Finish() was made without adding additional keys.
    2057           1 : func (w *Writer) EstimatedSize() uint64 {
    2058           1 :         if w == nil {
    2059           0 :                 return 0
    2060           0 :         }
    2061           1 :         return w.coordination.sizeEstimate.size() +
    2062           1 :                 uint64(w.dataBlockBuf.dataBlock.EstimatedSize()) +
    2063           1 :                 w.indexBlock.estimatedSize()
    2064             : }
    2065             : 
    2066             : // Metadata returns the metadata for the finished sstable. Only valid to call
    2067             : // after the sstable has been finished.
    2068           1 : func (w *Writer) Metadata() (*WriterMetadata, error) {
    2069           1 :         if !w.layout.IsFinished() {
    2070           0 :                 return nil, errors.New("pebble: writer is not closed")
    2071           0 :         }
    2072           1 :         return &w.meta, nil
    2073             : }
    2074             : 
    2075             : // WriterOption provide an interface to do work on Writer while it is being
    2076             : // opened.
    2077             : type WriterOption interface {
    2078             :         // writerApply is called on the writer during opening in order to set
    2079             :         // internal parameters.
    2080             :         writerApply(*Writer)
    2081             : }
    2082             : 
    2083             : // NewWriter returns a new table writer for the file. Closing the writer will
    2084             : // close the file.
    2085           1 : func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...WriterOption) *Writer {
    2086           1 :         o = o.ensureDefaults()
    2087           1 :         w := &Writer{
    2088           1 :                 layout: makeLayoutWriter(writable, o),
    2089           1 :                 meta: WriterMetadata{
    2090           1 :                         SmallestSeqNum: math.MaxUint64,
    2091           1 :                 },
    2092           1 :                 dataBlockOptions: flushDecisionOptions{
    2093           1 :                         blockSize:               o.BlockSize,
    2094           1 :                         blockSizeThreshold:      (o.BlockSize*o.BlockSizeThreshold + 99) / 100,
    2095           1 :                         sizeClassAwareThreshold: (o.BlockSize*o.SizeClassAwareThreshold + 99) / 100,
    2096           1 :                 },
    2097           1 :                 indexBlockOptions: flushDecisionOptions{
    2098           1 :                         blockSize:               o.IndexBlockSize,
    2099           1 :                         blockSizeThreshold:      (o.IndexBlockSize*o.BlockSizeThreshold + 99) / 100,
    2100           1 :                         sizeClassAwareThreshold: (o.IndexBlockSize*o.SizeClassAwareThreshold + 99) / 100,
    2101           1 :                 },
    2102           1 :                 compare:               o.Comparer.Compare,
    2103           1 :                 split:                 o.Comparer.Split,
    2104           1 :                 formatKey:             o.Comparer.FormatKey,
    2105           1 :                 compression:           o.Compression,
    2106           1 :                 separator:             o.Comparer.Separator,
    2107           1 :                 successor:             o.Comparer.Successor,
    2108           1 :                 tableFormat:           o.TableFormat,
    2109           1 :                 isStrictObsolete:      o.IsStrictObsolete,
    2110           1 :                 writingToLowestLevel:  o.WritingToLowestLevel,
    2111           1 :                 restartInterval:       o.BlockRestartInterval,
    2112           1 :                 checksumType:          o.Checksum,
    2113           1 :                 disableKeyOrderChecks: o.internal.DisableKeyOrderChecks,
    2114           1 :                 indexBlock:            newIndexBlockBuf(o.Parallelism),
    2115           1 :                 rangeDelBlock:         rowblk.Writer{RestartInterval: 1},
    2116           1 :                 rangeKeyBlock:         rowblk.Writer{RestartInterval: 1},
    2117           1 :                 topLevelIndexBlock:    rowblk.Writer{RestartInterval: 1},
    2118           1 :                 fragmenter: keyspan.Fragmenter{
    2119           1 :                         Cmp:    o.Comparer.Compare,
    2120           1 :                         Format: o.Comparer.FormatKey,
    2121           1 :                 },
    2122           1 :                 allocatorSizeClasses: o.AllocatorSizeClasses,
    2123           1 :         }
    2124           1 :         if w.tableFormat >= TableFormatPebblev3 {
    2125           1 :                 w.shortAttributeExtractor = o.ShortAttributeExtractor
    2126           1 :                 w.requiredInPlaceValueBound = o.RequiredInPlaceValueBound
    2127           1 :                 if !o.DisableValueBlocks {
    2128           1 :                         w.valueBlockWriter = newValueBlockWriter(
    2129           1 :                                 w.dataBlockOptions.blockSize, w.dataBlockOptions.blockSizeThreshold, w.compression, w.checksumType, func(compressedSize int) {
    2130           1 :                                         w.coordination.sizeEstimate.dataBlockCompressed(compressedSize, 0)
    2131           1 :                                 })
    2132             :                 }
    2133             :         }
    2134             : 
    2135           1 :         w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType)
    2136           1 : 
    2137           1 :         w.blockBuf = blockBuf{
    2138           1 :                 checksummer: block.Checksummer{Type: o.Checksum},
    2139           1 :         }
    2140           1 : 
    2141           1 :         w.coordination.init(o.Parallelism, w)
    2142           1 : 
    2143           1 :         if writable == nil {
    2144           0 :                 w.err = errors.New("pebble: nil writable")
    2145           0 :                 return w
    2146           0 :         }
    2147             : 
    2148             :         // Note that WriterOptions are applied in two places; the ones with a
    2149             :         // preApply() method are applied here. The rest are applied down below after
    2150             :         // default properties are set.
    2151           1 :         type preApply interface{ preApply() }
    2152           1 :         for _, opt := range extraOpts {
    2153           0 :                 if _, ok := opt.(preApply); ok {
    2154           0 :                         opt.writerApply(w)
    2155           0 :                 }
    2156             :         }
    2157             : 
    2158           1 :         if o.FilterPolicy != nil {
    2159           1 :                 switch o.FilterType {
    2160           1 :                 case TableFilter:
    2161           1 :                         w.filter = newTableFilterWriter(o.FilterPolicy)
    2162           0 :                 default:
    2163           0 :                         panic(fmt.Sprintf("unknown filter type: %v", o.FilterType))
    2164             :                 }
    2165             :         }
    2166             : 
    2167           1 :         w.props.ComparerName = o.Comparer.Name
    2168           1 :         w.props.CompressionName = o.Compression.String()
    2169           1 :         w.props.MergerName = o.MergerName
    2170           1 :         w.props.PropertyCollectorNames = "[]"
    2171           1 : 
    2172           1 :         numBlockPropertyCollectors := len(o.BlockPropertyCollectors)
    2173           1 :         if w.tableFormat >= TableFormatPebblev4 {
    2174           1 :                 numBlockPropertyCollectors++
    2175           1 :         }
    2176             : 
    2177           1 :         if numBlockPropertyCollectors > 0 {
    2178           1 :                 if numBlockPropertyCollectors > maxPropertyCollectors {
    2179           0 :                         w.err = errors.New("pebble: too many block property collectors")
    2180           0 :                         return w
    2181           0 :                 }
    2182           1 :                 w.blockPropCollectors = make([]BlockPropertyCollector, 0, numBlockPropertyCollectors)
    2183           1 :                 for _, constructFn := range o.BlockPropertyCollectors {
    2184           1 :                         w.blockPropCollectors = append(w.blockPropCollectors, constructFn())
    2185           1 :                 }
    2186           1 :                 if w.tableFormat >= TableFormatPebblev4 {
    2187           1 :                         w.blockPropCollectors = append(w.blockPropCollectors, &w.obsoleteCollector)
    2188           1 :                 }
    2189             : 
    2190           1 :                 var buf bytes.Buffer
    2191           1 :                 buf.WriteString("[")
    2192           1 :                 for i := range w.blockPropCollectors {
    2193           1 :                         if i > 0 {
    2194           1 :                                 buf.WriteString(",")
    2195           1 :                         }
    2196           1 :                         buf.WriteString(w.blockPropCollectors[i].Name())
    2197             :                 }
    2198           1 :                 buf.WriteString("]")
    2199           1 :                 w.props.PropertyCollectorNames = buf.String()
    2200             :         }
    2201             : 
    2202             :         // Apply the remaining WriterOptions that do not have a preApply() method.
    2203           1 :         for _, opt := range extraOpts {
    2204           0 :                 if _, ok := opt.(preApply); ok {
    2205           0 :                         continue
    2206             :                 }
    2207           0 :                 opt.writerApply(w)
    2208             :         }
    2209             : 
    2210             :         // Initialize the range key fragmenter and encoder.
    2211           1 :         w.fragmenter.Emit = w.encodeFragmentedRangeKeySpan
    2212           1 :         w.rangeKeyEncoder.Emit = w.addRangeKey
    2213           1 :         return w
    2214             : }
    2215             : 
    2216             : // SetSnapshotPinnedProperties sets the properties for pinned keys. Should only
    2217             : // be used internally by Pebble.
    2218             : func (w *Writer) SetSnapshotPinnedProperties(
    2219             :         pinnedKeyCount, pinnedKeySize, pinnedValueSize uint64,
    2220           1 : ) {
    2221           1 :         w.props.SnapshotPinnedKeys = pinnedKeyCount
    2222           1 :         w.props.SnapshotPinnedKeySize = pinnedKeySize
    2223           1 :         w.props.SnapshotPinnedValueSize = pinnedValueSize
    2224           1 : }

Generated by: LCOV version 1.14