Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "encoding/binary"
11 : "fmt"
12 : "math"
13 : "runtime"
14 : "sync"
15 :
16 : "github.com/cockroachdb/errors"
17 : "github.com/cockroachdb/pebble/internal/base"
18 : "github.com/cockroachdb/pebble/internal/bytealloc"
19 : "github.com/cockroachdb/pebble/internal/invariants"
20 : "github.com/cockroachdb/pebble/internal/keyspan"
21 : "github.com/cockroachdb/pebble/internal/rangedel"
22 : "github.com/cockroachdb/pebble/internal/rangekey"
23 : "github.com/cockroachdb/pebble/objstorage"
24 : "github.com/cockroachdb/pebble/sstable/block"
25 : "github.com/cockroachdb/pebble/sstable/rowblk"
26 : "github.com/cockroachdb/pebble/sstable/valblk"
27 : )
28 :
29 : // encodedBHPEstimatedSize estimates the size of the encoded BlockHandleWithProperties.
30 : // It would also be nice to account for the length of the data block properties here,
31 : // but isn't necessary since this is an estimate.
32 : const encodedBHPEstimatedSize = binary.MaxVarintLen64 * 2
33 :
34 : var errWriterClosed = errors.New("pebble: writer is closed")
35 :
36 : // RawRowWriter is a sstable RawWriter that writes sstables with row-oriented
37 : // blocks. All table formats TableFormatPebblev4 and earlier write row-oriented
38 : // blocks and use RawRowWriter.
39 : type RawRowWriter struct {
40 : layout layoutWriter
41 : meta WriterMetadata
42 : err error
43 : dataFlush block.FlushGovernor
44 : indexFlush block.FlushGovernor
45 : // The following fields are copied from Options.
46 : compare Compare
47 : pointSuffixCmp base.ComparePointSuffixes
48 : split Split
49 : formatKey base.FormatKey
50 : compression block.Compression
51 : separator Separator
52 : successor Successor
53 : tableFormat TableFormat
54 : isStrictObsolete bool
55 : writingToLowestLevel bool
56 : restartInterval int
57 : checksumType block.ChecksumType
58 : // disableKeyOrderChecks disables the checks that keys are added to an
59 : // sstable in order. It is intended for internal use only in the construction
60 : // of invalid sstables for testing. See tool/make_test_sstables.go.
61 : disableKeyOrderChecks bool
62 : // With two level indexes, the index/filter of a SST file is partitioned into
63 : // smaller blocks with an additional top-level index on them. When reading an
64 : // index/filter, only the top-level index is loaded into memory. The two level
65 : // index/filter then uses the top-level index to load on demand into the block
66 : // cache the partitions that are required to perform the index/filter query.
67 : //
68 : // Two level indexes are enabled automatically when there is more than one
69 : // index block.
70 : //
71 : // This is useful when there are very large index blocks, which generally occurs
72 : // with the usage of large keys. With large index blocks, the index blocks fight
73 : // the data blocks for block cache space and the index blocks are likely to be
74 : // re-read many times from the disk. The top level index, which has a much
75 : // smaller memory footprint, can be used to prevent the entire index block from
76 : // being loaded into the block cache.
77 : twoLevelIndex bool
78 : indexBlock *indexBlockBuf
79 : rangeDelBlock rowblk.Writer
80 : rangeKeyBlock rowblk.Writer
81 : topLevelIndexBlock rowblk.Writer
82 : props Properties
83 : blockPropCollectors []BlockPropertyCollector
84 : obsoleteCollector obsoleteKeyBlockPropertyCollector
85 : blockPropsEncoder blockPropertiesEncoder
86 : // filter accumulates the filter block. If populated, the filter ingests
87 : // either the output of w.split (i.e. a prefix extractor) if w.split is not
88 : // nil, or the full keys otherwise.
89 : filter filterWriter
90 : indexPartitions []bufferedIndexBlock
91 :
92 : // indexBlockAlloc is used to bulk-allocate byte slices used to store index
93 : // blocks in indexPartitions. These live until the index finishes.
94 : indexBlockAlloc []byte
95 : // indexSepAlloc is used to bulk-allocate index block separator slices stored
96 : // in indexPartitions. These live until the index finishes.
97 : indexSepAlloc bytealloc.A
98 :
99 : rangeKeyEncoder rangekey.Encoder
100 : // dataBlockBuf consists of the state which is currently owned by and used by
101 : // the Writer client goroutine. This state can be handed off to other goroutines.
102 : dataBlockBuf *dataBlockBuf
103 : // blockBuf consists of the state which is owned by and used by the Writer client
104 : // goroutine.
105 : blockBuf blockBuf
106 :
107 : coordination coordinationState
108 :
109 : // Information (other than the byte slice) about the last point key, to
110 : // avoid extracting it again.
111 : lastPointKeyInfo pointKeyInfo
112 :
113 : // For value blocks.
114 : shortAttributeExtractor base.ShortAttributeExtractor
115 : requiredInPlaceValueBound UserKeyPrefixBound
116 : // When w.tableFormat >= TableFormatPebblev3, valueBlockWriter is nil iff
117 : // WriterOptions.DisableValueBlocks was true.
118 : valueBlockWriter *valblk.Writer
119 :
120 : allocatorSizeClasses []int
121 :
122 : numDeletionsThreshold int
123 : deletionSizeRatioThreshold float32
124 : }
125 :
126 : type pointKeyInfo struct {
127 : trailer base.InternalKeyTrailer
128 : // Only computed when w.valueBlockWriter is not nil.
129 : userKeyLen int
130 : // prefixLen uses w.split, if not nil. Only computed when w.valueBlockWriter
131 : // is not nil.
132 : prefixLen int
133 : // True iff the point was marked obsolete.
134 : isObsolete bool
135 : }
136 :
137 : type coordinationState struct {
138 : parallelismEnabled bool
139 :
140 : // writeQueue is used to write data blocks to disk. The writeQueue is primarily
141 : // used to maintain the order in which data blocks must be written to disk. For
142 : // this reason, every single data block write must be done through the writeQueue.
143 : writeQueue *writeQueue
144 :
145 : sizeEstimate dataBlockEstimates
146 : }
147 :
148 1 : func (c *coordinationState) init(parallelismEnabled bool, writer *RawRowWriter) {
149 1 : c.parallelismEnabled = parallelismEnabled
150 1 : // useMutex is false regardless of parallelismEnabled, because we do not do
151 1 : // parallel compression yet.
152 1 : c.sizeEstimate.useMutex = false
153 1 :
154 1 : // writeQueueSize determines the size of the write queue, or the number
155 1 : // of items which can be added to the queue without blocking. By default, we
156 1 : // use a writeQueue size of 0, since we won't be doing any block writes in
157 1 : // parallel.
158 1 : writeQueueSize := 0
159 1 : if parallelismEnabled {
160 1 : writeQueueSize = runtime.GOMAXPROCS(0)
161 1 : }
162 1 : c.writeQueue = newWriteQueue(writeQueueSize, writer)
163 : }
164 :
165 : // sizeEstimate is a general purpose helper for estimating two kinds of sizes:
166 : // A. The compressed sstable size, which is useful for deciding when to start
167 : //
168 : // a new sstable during flushes or compactions. In practice, we use this in
169 : // estimating the data size (excluding the index).
170 : //
171 : // B. The size of index blocks to decide when to start a new index block.
172 : //
173 : // There are some terminology peculiarities which are due to the origin of
174 : // sizeEstimate for use case A with parallel compression enabled (for which
175 : // the code has not been merged). Specifically this relates to the terms
176 : // "written" and "compressed".
177 : // - The notion of "written" for case A is sufficiently defined by saying that
178 : // the data block is compressed. Waiting for the actual data block write to
179 : // happen can result in unnecessary estimation, when we already know how big
180 : // it will be in compressed form. Additionally, with the forthcoming value
181 : // blocks containing older MVCC values, these compressed block will be held
182 : // in-memory until late in the sstable writing, and we do want to accurately
183 : // account for them without waiting for the actual write.
184 : // For case B, "written" means that the index entry has been fully
185 : // generated, and has been added to the uncompressed block buffer for that
186 : // index block. It does not include actually writing a potentially
187 : // compressed index block.
188 : // - The notion of "compressed" is to differentiate between a "inflight" size
189 : // and the actual size, and is handled via computing a compression ratio
190 : // observed so far (defaults to 1).
191 : // For case A, this is actual data block compression, so the "inflight" size
192 : // is uncompressed blocks (that are no longer being written to) and the
193 : // "compressed" size is after they have been compressed.
194 : // For case B the inflight size is for a key-value pair in the index for
195 : // which the value size (the encoded size of the BlockHandleWithProperties)
196 : // is not accurately known, while the compressed size is the size of that
197 : // entry when it has been added to the (in-progress) index ssblock.
198 : //
199 : // Usage: To update state, one can optionally provide an inflight write value
200 : // using addInflight (used for case B). When something is "written" the state
201 : // can be updated using either writtenWithDelta or writtenWithTotal, which
202 : // provide the actual delta size or the total size (latter must be
203 : // monotonically non-decreasing). If there were no calls to addInflight, there
204 : // isn't any real estimation happening here. So case A does not do any real
205 : // estimation. However, when we introduce parallel compression, there will be
206 : // estimation in that the client goroutine will call addInFlight and the
207 : // compression goroutines will call writtenWithDelta.
208 : type sizeEstimate struct {
209 : // emptySize is the size when there is no inflight data, and numEntries is 0.
210 : // emptySize is constant once set.
211 : emptySize uint64
212 :
213 : // inflightSize is the estimated size of some inflight data which hasn't
214 : // been written yet.
215 : inflightSize uint64
216 :
217 : // totalSize is the total size of the data which has already been written.
218 : totalSize uint64
219 :
220 : // numWrittenEntries is the total number of entries which have already been
221 : // written.
222 : numWrittenEntries uint64
223 : // numInflightEntries is the total number of entries which are inflight, and
224 : // haven't been written.
225 : numInflightEntries uint64
226 :
227 : // maxEstimatedSize stores the maximum result returned from sizeEstimate.size.
228 : // It ensures that values returned from subsequent calls to Writer.EstimatedSize
229 : // never decrease.
230 : maxEstimatedSize uint64
231 :
232 : // We assume that the entries added to the sizeEstimate can be compressed.
233 : // For this reason, we keep track of a compressedSize and an uncompressedSize
234 : // to compute a compression ratio for the inflight entries. If the entries
235 : // aren't being compressed, then compressedSize and uncompressedSize must be
236 : // equal.
237 : compressedSize uint64
238 : uncompressedSize uint64
239 : }
240 :
241 1 : func (s *sizeEstimate) init(emptySize uint64) {
242 1 : s.emptySize = emptySize
243 1 : }
244 :
245 1 : func (s *sizeEstimate) size() uint64 {
246 1 : ratio := float64(1)
247 1 : if s.uncompressedSize > 0 {
248 1 : ratio = float64(s.compressedSize) / float64(s.uncompressedSize)
249 1 : }
250 1 : estimatedInflightSize := uint64(float64(s.inflightSize) * ratio)
251 1 : total := s.totalSize + estimatedInflightSize
252 1 : if total > s.maxEstimatedSize {
253 1 : s.maxEstimatedSize = total
254 1 : } else {
255 1 : total = s.maxEstimatedSize
256 1 : }
257 :
258 1 : if total == 0 {
259 1 : return s.emptySize
260 1 : }
261 :
262 1 : return total
263 : }
264 :
265 1 : func (s *sizeEstimate) numTotalEntries() uint64 {
266 1 : return s.numWrittenEntries + s.numInflightEntries
267 1 : }
268 :
269 1 : func (s *sizeEstimate) addInflight(size int) {
270 1 : s.numInflightEntries++
271 1 : s.inflightSize += uint64(size)
272 1 : }
273 :
274 1 : func (s *sizeEstimate) writtenWithTotal(newTotalSize uint64, inflightSize int) {
275 1 : finalEntrySize := int(newTotalSize - s.totalSize)
276 1 : s.writtenWithDelta(finalEntrySize, inflightSize)
277 1 : }
278 :
279 1 : func (s *sizeEstimate) writtenWithDelta(finalEntrySize int, inflightSize int) {
280 1 : if inflightSize > 0 {
281 1 : // This entry was previously inflight, so we should decrement inflight
282 1 : // entries and update the "compression" stats for future estimation.
283 1 : s.numInflightEntries--
284 1 : s.inflightSize -= uint64(inflightSize)
285 1 : s.uncompressedSize += uint64(inflightSize)
286 1 : s.compressedSize += uint64(finalEntrySize)
287 1 : }
288 1 : s.numWrittenEntries++
289 1 : s.totalSize += uint64(finalEntrySize)
290 : }
291 :
292 1 : func (s *sizeEstimate) clear() {
293 1 : *s = sizeEstimate{emptySize: s.emptySize}
294 1 : }
295 :
296 : type indexBlockBuf struct {
297 : // block will only be accessed from the writeQueue.
298 : block rowblk.Writer
299 :
300 : size struct {
301 : useMutex bool
302 : mu sync.Mutex
303 : estimate sizeEstimate
304 : }
305 :
306 : // restartInterval matches indexBlockBuf.block.restartInterval. We store it twice, because the `block`
307 : // must only be accessed from the writeQueue goroutine.
308 : restartInterval int
309 : }
310 :
311 1 : func (i *indexBlockBuf) clear() {
312 1 : i.block.Reset()
313 1 : if i.size.useMutex {
314 1 : i.size.mu.Lock()
315 1 : defer i.size.mu.Unlock()
316 1 : }
317 1 : i.size.estimate.clear()
318 1 : i.restartInterval = 0
319 : }
320 :
321 : var indexBlockBufPool = sync.Pool{
322 1 : New: func() interface{} {
323 1 : return &indexBlockBuf{}
324 1 : },
325 : }
326 :
327 : const indexBlockRestartInterval = 1
328 :
329 1 : func newIndexBlockBuf(useMutex bool) *indexBlockBuf {
330 1 : i := indexBlockBufPool.Get().(*indexBlockBuf)
331 1 : i.size.useMutex = useMutex
332 1 : i.restartInterval = indexBlockRestartInterval
333 1 : i.block.RestartInterval = indexBlockRestartInterval
334 1 : i.size.estimate.init(rowblk.EmptySize)
335 1 : return i
336 1 : }
337 :
338 : func (i *indexBlockBuf) shouldFlush(
339 : sep InternalKey, valueLen int, flushGovernor *block.FlushGovernor,
340 1 : ) bool {
341 1 : if i.size.useMutex {
342 1 : i.size.mu.Lock()
343 1 : defer i.size.mu.Unlock()
344 1 : }
345 :
346 1 : nEntries := i.size.estimate.numTotalEntries()
347 1 : return shouldFlush(
348 1 : sep.Size(), valueLen, i.restartInterval, int(i.size.estimate.size()),
349 1 : int(nEntries), flushGovernor)
350 : }
351 :
352 1 : func (i *indexBlockBuf) add(key InternalKey, value []byte, inflightSize int) {
353 1 : i.block.Add(key, value)
354 1 : size := i.block.EstimatedSize()
355 1 : if i.size.useMutex {
356 1 : i.size.mu.Lock()
357 1 : defer i.size.mu.Unlock()
358 1 : }
359 1 : i.size.estimate.writtenWithTotal(uint64(size), inflightSize)
360 : }
361 :
362 1 : func (i *indexBlockBuf) finish() []byte {
363 1 : b := i.block.Finish()
364 1 : return b
365 1 : }
366 :
367 1 : func (i *indexBlockBuf) addInflight(inflightSize int) {
368 1 : if i.size.useMutex {
369 1 : i.size.mu.Lock()
370 1 : defer i.size.mu.Unlock()
371 1 : }
372 1 : i.size.estimate.addInflight(inflightSize)
373 : }
374 :
375 1 : func (i *indexBlockBuf) estimatedSize() uint64 {
376 1 : if i.size.useMutex {
377 1 : i.size.mu.Lock()
378 1 : defer i.size.mu.Unlock()
379 1 : }
380 :
381 : // Make sure that the size estimation works as expected when parallelism
382 : // is disabled.
383 1 : if invariants.Enabled && !i.size.useMutex {
384 1 : if i.size.estimate.inflightSize != 0 {
385 0 : panic("unexpected inflight entry in index block size estimation")
386 : }
387 :
388 : // NB: The i.block should only be accessed from the writeQueue goroutine,
389 : // when parallelism is enabled. We break that invariant here, but that's
390 : // okay since parallelism is disabled.
391 1 : if i.size.estimate.size() != uint64(i.block.EstimatedSize()) {
392 0 : panic("index block size estimation sans parallelism is incorrect")
393 : }
394 : }
395 1 : return i.size.estimate.size()
396 : }
397 :
398 : // sizeEstimate is used for sstable size estimation. sizeEstimate can be
399 : // accessed by the Writer client and compressionQueue goroutines. Fields
400 : // should only be read/updated through the functions defined on the
401 : // *sizeEstimate type.
402 : type dataBlockEstimates struct {
403 : // If we don't do block compression in parallel, then we don't need to take
404 : // the performance hit of synchronizing using this mutex.
405 : useMutex bool
406 : mu sync.Mutex
407 :
408 : estimate sizeEstimate
409 : }
410 :
411 : // inflightSize is the uncompressed block size estimate which has been
412 : // previously provided to addInflightDataBlock(). If addInflightDataBlock()
413 : // has not been called, this must be set to 0. compressedSize is the
414 : // compressed size of the block.
415 1 : func (d *dataBlockEstimates) dataBlockCompressed(compressedSize int, inflightSize int) {
416 1 : if d.useMutex {
417 0 : d.mu.Lock()
418 0 : defer d.mu.Unlock()
419 0 : }
420 1 : d.estimate.writtenWithDelta(compressedSize+block.TrailerLen, inflightSize)
421 : }
422 :
423 : // size is an estimated size of datablock data which has been written to disk.
424 1 : func (d *dataBlockEstimates) size() uint64 {
425 1 : if d.useMutex {
426 0 : d.mu.Lock()
427 0 : defer d.mu.Unlock()
428 0 : }
429 : // If there is no parallel compression, there should not be any inflight bytes.
430 1 : if invariants.Enabled && !d.useMutex {
431 1 : if d.estimate.inflightSize != 0 {
432 0 : panic("unexpected inflight entry in data block size estimation")
433 : }
434 : }
435 1 : return d.estimate.size()
436 : }
437 :
438 : // Avoid linter unused error.
439 : var _ = (&dataBlockEstimates{}).addInflightDataBlock
440 :
441 : // NB: unused since no parallel compression.
442 0 : func (d *dataBlockEstimates) addInflightDataBlock(size int) {
443 0 : if d.useMutex {
444 0 : d.mu.Lock()
445 0 : defer d.mu.Unlock()
446 0 : }
447 :
448 0 : d.estimate.addInflight(size)
449 : }
450 :
451 : var writeTaskPool = sync.Pool{
452 1 : New: func() interface{} {
453 1 : t := &writeTask{}
454 1 : t.compressionDone = make(chan bool, 1)
455 1 : return t
456 1 : },
457 : }
458 :
459 : type blockBuf struct {
460 : // tmp is a scratch buffer, large enough to hold either footerLen bytes,
461 : // blockTrailerLen bytes, (5 * binary.MaxVarintLen64) bytes, and most
462 : // likely large enough for a block handle with properties.
463 : tmp [blockHandleLikelyMaxLen]byte
464 : // compressedBuf is the destination buffer for compression. It is re-used over the
465 : // lifetime of the blockBuf, avoiding the allocation of a temporary buffer for each block.
466 : compressedBuf []byte
467 : checksummer block.Checksummer
468 : }
469 :
470 1 : func (b *blockBuf) clear() {
471 1 : // We can't assign b.compressedBuf[:0] to compressedBuf because snappy relies
472 1 : // on the length of the buffer, and not the capacity to determine if it needs
473 1 : // to make an allocation.
474 1 : *b = blockBuf{
475 1 : compressedBuf: b.compressedBuf, checksummer: b.checksummer,
476 1 : }
477 1 : }
478 :
479 : // A dataBlockBuf holds all the state required to compress and write a data block to disk.
480 : // A dataBlockBuf begins its lifecycle owned by the Writer client goroutine. The Writer
481 : // client goroutine adds keys to the sstable, writing directly into a dataBlockBuf's blockWriter
482 : // until the block is full. Once a dataBlockBuf's block is full, the dataBlockBuf may be passed
483 : // to other goroutines for compression and file I/O.
484 : type dataBlockBuf struct {
485 : blockBuf
486 : dataBlock rowblk.Writer
487 :
488 : // uncompressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the
489 : // next byte slice to be compressed. The uncompressed byte slice will be backed by the
490 : // dataBlock.buf.
491 : uncompressed []byte
492 :
493 : // physical holds the (possibly) compressed block and its trailer. The
494 : // underlying block data's byte slice is owned by the dataBlockBuf. It may
495 : // be backed by the dataBlock.buf, or the dataBlockBuf.compressedBuf,
496 : // depending on whether we use the result of the compression.
497 : physical block.PhysicalBlock
498 :
499 : // We're making calls to BlockPropertyCollectors from the Writer client goroutine. We need to
500 : // pass the encoded block properties over to the write queue. To prevent copies, and allocations,
501 : // we give each dataBlockBuf, a blockPropertiesEncoder.
502 : blockPropsEncoder blockPropertiesEncoder
503 : // dataBlockProps is set when Writer.finishDataBlockProps is called. The dataBlockProps slice is
504 : // a shallow copy of the internal buffer of the dataBlockBuf.blockPropsEncoder.
505 : dataBlockProps []byte
506 :
507 : // sepScratch is reusable scratch space for computing separator keys.
508 : sepScratch []byte
509 :
510 : // numDeletions stores the count of point tombstones in this data block.
511 : // It's used to determine if this data block is considered tombstone-dense
512 : // for the purposes of compaction.
513 : numDeletions int
514 : // deletionSize stores the raw size of point tombstones in this data block.
515 : // It's used to determine if this data block is considered tombstone-dense
516 : // for the purposes of compaction.
517 : deletionSize int
518 : }
519 :
520 1 : func (d *dataBlockBuf) clear() {
521 1 : d.blockBuf.clear()
522 1 : d.dataBlock.Reset()
523 1 :
524 1 : d.uncompressed = nil
525 1 : d.physical = block.PhysicalBlock{}
526 1 : d.dataBlockProps = nil
527 1 : d.sepScratch = d.sepScratch[:0]
528 1 : }
529 :
530 : var dataBlockBufPool = sync.Pool{
531 1 : New: func() interface{} {
532 1 : return &dataBlockBuf{}
533 1 : },
534 : }
535 :
536 1 : func newDataBlockBuf(restartInterval int, checksumType block.ChecksumType) *dataBlockBuf {
537 1 : d := dataBlockBufPool.Get().(*dataBlockBuf)
538 1 : d.dataBlock.RestartInterval = restartInterval
539 1 : d.checksummer.Type = checksumType
540 1 : return d
541 1 : }
542 :
543 1 : func (d *dataBlockBuf) finish() {
544 1 : d.uncompressed = d.dataBlock.Finish()
545 1 : }
546 :
547 1 : func (d *dataBlockBuf) compressAndChecksum(c block.Compression) {
548 1 : d.physical = block.CompressAndChecksum(&d.compressedBuf, d.uncompressed, c, &d.checksummer)
549 1 : }
550 :
551 : func (d *dataBlockBuf) shouldFlush(
552 : key InternalKey, valueLen int, flushGovernor *block.FlushGovernor,
553 1 : ) bool {
554 1 : return shouldFlush(
555 1 : key.Size(), valueLen, d.dataBlock.RestartInterval, d.dataBlock.EstimatedSize(),
556 1 : d.dataBlock.EntryCount(), flushGovernor)
557 1 : }
558 :
559 : type bufferedIndexBlock struct {
560 : nEntries int
561 : // sep is the last key added to this block, for computing a separator later.
562 : sep InternalKey
563 : properties []byte
564 : // block is the encoded block produced by blockWriter.finish.
565 : block []byte
566 : }
567 :
568 : // AddWithForceObsolete must be used when writing a strict-obsolete sstable.
569 : //
570 : // forceObsolete indicates whether the caller has determined that this key is
571 : // obsolete even though it may be the latest point key for this userkey. This
572 : // should be set to true for keys obsoleted by RANGEDELs, and is required for
573 : // strict-obsolete sstables.
574 : //
575 : // Note that there are two properties, S1 and S2 (see comment in format.go)
576 : // that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the
577 : // responsibility of the caller. S1 is solely the responsibility of the
578 : // callee.
579 : func (w *RawRowWriter) AddWithForceObsolete(
580 : key InternalKey, value []byte, forceObsolete bool,
581 1 : ) error {
582 1 : if w.err != nil {
583 0 : return w.err
584 0 : }
585 :
586 1 : switch key.Kind() {
587 0 : case InternalKeyKindRangeDelete:
588 0 : return w.addTombstone(key, value)
589 : case base.InternalKeyKindRangeKeyDelete,
590 : base.InternalKeyKindRangeKeySet,
591 0 : base.InternalKeyKindRangeKeyUnset:
592 0 : w.err = errors.Errorf(
593 0 : "pebble: range keys must be added via one of the RangeKey* functions")
594 0 : return w.err
595 : }
596 1 : return w.addPoint(key, value, forceObsolete)
597 : }
598 :
599 1 : func (w *RawRowWriter) makeAddPointDecisionV2(key InternalKey) error {
600 1 : prevTrailer := w.lastPointKeyInfo.trailer
601 1 : w.lastPointKeyInfo.trailer = key.Trailer
602 1 : if w.dataBlockBuf.dataBlock.EntryCount() == 0 {
603 1 : return nil
604 1 : }
605 1 : if !w.disableKeyOrderChecks {
606 1 : prevPointUserKey := w.dataBlockBuf.dataBlock.CurUserKey()
607 1 : cmpUser := w.compare(prevPointUserKey, key.UserKey)
608 1 : if cmpUser > 0 || (cmpUser == 0 && prevTrailer <= key.Trailer) {
609 0 : return errors.Errorf(
610 0 : "pebble: keys must be added in strictly increasing order: %s, %s",
611 0 : InternalKey{UserKey: prevPointUserKey, Trailer: prevTrailer}.Pretty(w.formatKey),
612 0 : key.Pretty(w.formatKey))
613 0 : }
614 : }
615 1 : return nil
616 : }
617 :
618 : // REQUIRES: at least one point has been written to the Writer.
619 1 : func (w *RawRowWriter) getLastPointUserKey() []byte {
620 1 : if w.dataBlockBuf.dataBlock.EntryCount() == 0 {
621 0 : panic(errors.AssertionFailedf("no point keys added to writer"))
622 : }
623 1 : return w.dataBlockBuf.dataBlock.CurUserKey()
624 : }
625 :
626 : // REQUIRES: w.tableFormat >= TableFormatPebblev3
627 : func (w *RawRowWriter) makeAddPointDecisionV3(
628 : key InternalKey, valueLen int,
629 1 : ) (setHasSamePrefix bool, writeToValueBlock bool, isObsolete bool, err error) {
630 1 : prevPointKeyInfo := w.lastPointKeyInfo
631 1 : w.lastPointKeyInfo.userKeyLen = len(key.UserKey)
632 1 : w.lastPointKeyInfo.prefixLen = w.split(key.UserKey)
633 1 : w.lastPointKeyInfo.trailer = key.Trailer
634 1 : w.lastPointKeyInfo.isObsolete = false
635 1 : if !w.meta.HasPointKeys {
636 1 : return false, false, false, nil
637 1 : }
638 1 : keyKind := key.Trailer.Kind()
639 1 : prevPointUserKey := w.getLastPointUserKey()
640 1 : prevPointKey := InternalKey{UserKey: prevPointUserKey, Trailer: prevPointKeyInfo.trailer}
641 1 : prevKeyKind := prevPointKeyInfo.trailer.Kind()
642 1 : considerWriteToValueBlock := prevKeyKind == InternalKeyKindSet &&
643 1 : keyKind == InternalKeyKindSet
644 1 : if considerWriteToValueBlock && !w.requiredInPlaceValueBound.IsEmpty() {
645 0 : keyPrefix := key.UserKey[:w.lastPointKeyInfo.prefixLen]
646 0 : cmpUpper := w.compare(
647 0 : w.requiredInPlaceValueBound.Upper, keyPrefix)
648 0 : if cmpUpper <= 0 {
649 0 : // Common case for CockroachDB. Make it empty since all future keys in
650 0 : // this sstable will also have cmpUpper <= 0.
651 0 : w.requiredInPlaceValueBound = UserKeyPrefixBound{}
652 0 : } else if w.compare(keyPrefix, w.requiredInPlaceValueBound.Lower) >= 0 {
653 0 : considerWriteToValueBlock = false
654 0 : }
655 : }
656 : // cmpPrefix is initialized iff considerWriteToValueBlock.
657 1 : var cmpPrefix int
658 1 : var cmpUser int
659 1 : if considerWriteToValueBlock {
660 1 : // Compare the prefixes.
661 1 : cmpPrefix = w.compare(prevPointUserKey[:prevPointKeyInfo.prefixLen],
662 1 : key.UserKey[:w.lastPointKeyInfo.prefixLen])
663 1 : cmpUser = cmpPrefix
664 1 : if cmpPrefix == 0 {
665 1 : // Need to compare suffixes to compute cmpUser.
666 1 : cmpUser = w.pointSuffixCmp(prevPointUserKey[prevPointKeyInfo.prefixLen:],
667 1 : key.UserKey[w.lastPointKeyInfo.prefixLen:])
668 1 : }
669 1 : } else {
670 1 : cmpUser = w.compare(prevPointUserKey, key.UserKey)
671 1 : }
672 : // Ensure that no one adds a point key kind without considering the obsolete
673 : // handling for that kind.
674 1 : switch keyKind {
675 : case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge,
676 1 : InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
677 0 : default:
678 0 : panic(errors.AssertionFailedf("unexpected key kind %s", keyKind.String()))
679 : }
680 : // If same user key, then the current key is obsolete if any of the
681 : // following is true:
682 : // C1 The prev key was obsolete.
683 : // C2 The prev key was not a MERGE. When the previous key is a MERGE we must
684 : // preserve SET* and MERGE since their values will be merged into the
685 : // previous key. We also must preserve DEL* since there may be an older
686 : // SET*/MERGE in a lower level that must not be merged with the MERGE --
687 : // if we omit the DEL* that lower SET*/MERGE will become visible.
688 : //
689 : // Regardless of whether it is the same user key or not
690 : // C3 The current key is some kind of point delete, and we are writing to
691 : // the lowest level, then it is also obsolete. The correctness of this
692 : // relies on the same user key not spanning multiple sstables in a level.
693 : //
694 : // C1 ensures that for a user key there is at most one transition from
695 : // !obsolete to obsolete. Consider a user key k, for which the first n keys
696 : // are not obsolete. We consider the various value of n:
697 : //
698 : // n = 0: This happens due to forceObsolete being set by the caller, or due
699 : // to C3. forceObsolete must only be set due a RANGEDEL, and that RANGEDEL
700 : // must also delete all the lower seqnums for the same user key. C3 triggers
701 : // due to a point delete and that deletes all the lower seqnums for the same
702 : // user key.
703 : //
704 : // n = 1: This is the common case. It happens when the first key is not a
705 : // MERGE, or the current key is some kind of point delete.
706 : //
707 : // n > 1: This is due to a sequence of MERGE keys, potentially followed by a
708 : // single non-MERGE key.
709 1 : isObsoleteC1AndC2 := cmpUser == 0 &&
710 1 : (prevPointKeyInfo.isObsolete || prevKeyKind != InternalKeyKindMerge)
711 1 : isObsoleteC3 := w.writingToLowestLevel &&
712 1 : (keyKind == InternalKeyKindDelete || keyKind == InternalKeyKindSingleDelete ||
713 1 : keyKind == InternalKeyKindDeleteSized)
714 1 : isObsolete = isObsoleteC1AndC2 || isObsoleteC3
715 1 : // TODO(sumeer): storing isObsolete SET and SETWITHDEL in value blocks is
716 1 : // possible, but requires some care in documenting and checking invariants.
717 1 : // There is code that assumes nothing in value blocks because of single MVCC
718 1 : // version (those should be ok). We have to ensure setHasSamePrefix is
719 1 : // correctly initialized here etc.
720 1 :
721 1 : if !w.disableKeyOrderChecks &&
722 1 : (cmpUser > 0 || (cmpUser == 0 && prevPointKeyInfo.trailer <= key.Trailer)) {
723 0 : return false, false, false, errors.Errorf(
724 0 : "pebble: keys must be added in strictly increasing order: %s, %s",
725 0 : prevPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
726 0 : }
727 1 : if !considerWriteToValueBlock {
728 1 : return false, false, isObsolete, nil
729 1 : }
730 : // NB: it is possible that cmpUser == 0, i.e., these two SETs have identical
731 : // user keys (because of an open snapshot). This should be the rare case.
732 1 : setHasSamePrefix = cmpPrefix == 0
733 1 : // Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of
734 1 : // valueHandle, this should be > 3. But tiny values are common in test and
735 1 : // unlikely in production, so we use 0 here for better test coverage.
736 1 : const tinyValueThreshold = 0
737 1 : // NB: setting WriterOptions.DisableValueBlocks does not disable the
738 1 : // setHasSamePrefix optimization.
739 1 : considerWriteToValueBlock = setHasSamePrefix && valueLen > tinyValueThreshold && w.valueBlockWriter != nil
740 1 : return setHasSamePrefix, considerWriteToValueBlock, isObsolete, nil
741 : }
742 :
743 1 : func (w *RawRowWriter) addPoint(key InternalKey, value []byte, forceObsolete bool) error {
744 1 : if w.isStrictObsolete && key.Kind() == InternalKeyKindMerge {
745 0 : return errors.Errorf("MERGE not supported in a strict-obsolete sstable")
746 0 : }
747 1 : var err error
748 1 : var setHasSameKeyPrefix, writeToValueBlock, addPrefixToValueStoredWithKey bool
749 1 : var isObsolete bool
750 1 : maxSharedKeyLen := len(key.UserKey)
751 1 : if w.tableFormat >= TableFormatPebblev3 {
752 1 : // maxSharedKeyLen is limited to the prefix of the preceding key. If the
753 1 : // preceding key was in a different block, then the blockWriter will
754 1 : // ignore this maxSharedKeyLen.
755 1 : maxSharedKeyLen = w.lastPointKeyInfo.prefixLen
756 1 : setHasSameKeyPrefix, writeToValueBlock, isObsolete, err =
757 1 : w.makeAddPointDecisionV3(key, len(value))
758 1 : addPrefixToValueStoredWithKey = key.Kind() == InternalKeyKindSet
759 1 : } else {
760 1 : err = w.makeAddPointDecisionV2(key)
761 1 : }
762 1 : if err != nil {
763 0 : return err
764 0 : }
765 1 : isObsolete = w.tableFormat >= TableFormatPebblev4 && (isObsolete || forceObsolete)
766 1 : w.lastPointKeyInfo.isObsolete = isObsolete
767 1 : var valueStoredWithKey []byte
768 1 : var prefix block.ValuePrefix
769 1 : var valueStoredWithKeyLen int
770 1 : if writeToValueBlock {
771 1 : vh, err := w.valueBlockWriter.AddValue(value)
772 1 : if err != nil {
773 0 : return err
774 0 : }
775 1 : n := valblk.EncodeHandle(w.blockBuf.tmp[:], vh)
776 1 : valueStoredWithKey = w.blockBuf.tmp[:n]
777 1 : valueStoredWithKeyLen = len(valueStoredWithKey) + 1
778 1 : var attribute base.ShortAttribute
779 1 : if w.shortAttributeExtractor != nil {
780 0 : // TODO(sumeer): for compactions, it is possible that the input sstable
781 0 : // already has this value in the value section and so we have already
782 0 : // extracted the ShortAttribute. Avoid extracting it again. This will
783 0 : // require changing the Writer.Add interface.
784 0 : if attribute, err = w.shortAttributeExtractor(
785 0 : key.UserKey, w.lastPointKeyInfo.prefixLen, value); err != nil {
786 0 : return err
787 0 : }
788 : }
789 1 : prefix = block.ValueHandlePrefix(setHasSameKeyPrefix, attribute)
790 1 : } else {
791 1 : valueStoredWithKey = value
792 1 : valueStoredWithKeyLen = len(value)
793 1 : if addPrefixToValueStoredWithKey {
794 1 : valueStoredWithKeyLen++
795 1 : }
796 1 : prefix = block.InPlaceValuePrefix(setHasSameKeyPrefix)
797 : }
798 :
799 1 : if err := w.maybeFlush(key, valueStoredWithKeyLen); err != nil {
800 0 : return err
801 0 : }
802 :
803 1 : for i := range w.blockPropCollectors {
804 1 : v := value
805 1 : if addPrefixToValueStoredWithKey {
806 1 : // Values for SET are not required to be in-place, and in the future may
807 1 : // not even be read by the compaction, so pass nil values. Block
808 1 : // property collectors in such Pebble DB's must not look at the value.
809 1 : v = nil
810 1 : }
811 1 : if err := w.blockPropCollectors[i].AddPointKey(key, v); err != nil {
812 0 : w.err = err
813 0 : return err
814 0 : }
815 : }
816 1 : if w.tableFormat >= TableFormatPebblev4 {
817 1 : w.obsoleteCollector.AddPoint(isObsolete)
818 1 : }
819 :
820 1 : w.maybeAddToFilter(key.UserKey)
821 1 : w.dataBlockBuf.dataBlock.AddWithOptionalValuePrefix(
822 1 : key, isObsolete, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix,
823 1 : setHasSameKeyPrefix)
824 1 :
825 1 : w.meta.updateSeqNum(key.SeqNum())
826 1 :
827 1 : if !w.meta.HasPointKeys {
828 1 : k := w.dataBlockBuf.dataBlock.CurKey()
829 1 : // NB: We need to ensure that SmallestPoint.UserKey is set, so we create
830 1 : // an InternalKey which is semantically identical to the key, but won't
831 1 : // have a nil UserKey. We do this, because key.UserKey could be nil, and
832 1 : // we don't want SmallestPoint.UserKey to be nil.
833 1 : //
834 1 : // todo(bananabrick): Determine if it's okay to have a nil SmallestPoint
835 1 : // .UserKey now that we don't rely on a nil UserKey to determine if the
836 1 : // key has been set or not.
837 1 : w.meta.SetSmallestPointKey(k.Clone())
838 1 : }
839 :
840 1 : w.props.NumEntries++
841 1 : switch key.Kind() {
842 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete:
843 1 : w.props.NumDeletions++
844 1 : w.dataBlockBuf.numDeletions++
845 1 : w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey))
846 1 : w.dataBlockBuf.deletionSize += len(key.UserKey)
847 1 : case InternalKeyKindDeleteSized:
848 1 : var size uint64
849 1 : if len(value) > 0 {
850 1 : var n int
851 1 : size, n = binary.Uvarint(value)
852 1 : if n <= 0 {
853 0 : w.err = errors.Newf("%s key's value (%x) does not parse as uvarint",
854 0 : errors.Safe(key.Kind().String()), value)
855 0 : return w.err
856 0 : }
857 : }
858 1 : w.props.NumDeletions++
859 1 : w.props.NumSizedDeletions++
860 1 : w.dataBlockBuf.numDeletions++
861 1 : w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey))
862 1 : w.dataBlockBuf.deletionSize += len(key.UserKey)
863 1 : w.props.RawPointTombstoneValueSize += size
864 1 : case InternalKeyKindMerge:
865 1 : w.props.NumMergeOperands++
866 : }
867 1 : w.props.RawKeySize += uint64(key.Size())
868 1 : w.props.RawValueSize += uint64(len(value))
869 1 : return nil
870 : }
871 :
872 0 : func (w *RawRowWriter) prettyTombstone(k InternalKey, value []byte) fmt.Formatter {
873 0 : return keyspan.Span{
874 0 : Start: k.UserKey,
875 0 : End: value,
876 0 : Keys: []keyspan.Key{{Trailer: k.Trailer}},
877 0 : }.Pretty(w.formatKey)
878 0 : }
879 :
880 1 : func (w *RawRowWriter) addTombstone(key InternalKey, value []byte) error {
881 1 : if !w.disableKeyOrderChecks && w.rangeDelBlock.EntryCount() > 0 {
882 1 : // Check that tombstones are being added in fragmented order. If the two
883 1 : // tombstones overlap, their start and end keys must be identical.
884 1 : prevKey := w.rangeDelBlock.CurKey()
885 1 : switch c := w.compare(prevKey.UserKey, key.UserKey); {
886 0 : case c > 0:
887 0 : w.err = errors.Errorf("pebble: keys must be added in order: %s, %s",
888 0 : prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
889 0 : return w.err
890 1 : case c == 0:
891 1 : prevValue := w.rangeDelBlock.CurValue()
892 1 : if w.compare(prevValue, value) != 0 {
893 0 : w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
894 0 : w.prettyTombstone(prevKey, prevValue),
895 0 : w.prettyTombstone(key, value))
896 0 : return w.err
897 0 : }
898 1 : if prevKey.SeqNum() <= key.SeqNum() {
899 0 : w.err = errors.Errorf("pebble: keys must be added in strictly increasing order: %s, %s",
900 0 : prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
901 0 : return w.err
902 0 : }
903 1 : default:
904 1 : prevValue := w.rangeDelBlock.CurValue()
905 1 : if w.compare(prevValue, key.UserKey) > 0 {
906 0 : w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
907 0 : w.prettyTombstone(prevKey, prevValue),
908 0 : w.prettyTombstone(key, value))
909 0 : return w.err
910 0 : }
911 : }
912 : }
913 :
914 1 : if key.Trailer == base.InternalKeyRangeDeleteSentinel {
915 0 : w.err = errors.Errorf("pebble: cannot add range delete sentinel: %s", key.Pretty(w.formatKey))
916 0 : return w.err
917 0 : }
918 :
919 1 : w.meta.updateSeqNum(key.SeqNum())
920 1 :
921 1 : // Range tombstones are fragmented in the v2 range deletion block format,
922 1 : // so the start key of the first range tombstone added will be the smallest
923 1 : // range tombstone key. The largest range tombstone key will be determined
924 1 : // in Writer.Close() as the end key of the last range tombstone added.
925 1 : if w.props.NumRangeDeletions == 0 {
926 1 : w.meta.SetSmallestRangeDelKey(key.Clone())
927 1 : }
928 :
929 1 : w.props.NumEntries++
930 1 : w.props.NumDeletions++
931 1 : w.props.NumRangeDeletions++
932 1 : w.props.RawKeySize += uint64(key.Size())
933 1 : w.props.RawValueSize += uint64(len(value))
934 1 : w.rangeDelBlock.Add(key, value)
935 1 : return nil
936 : }
937 :
938 : // addRangeKey adds a range key set, unset, or delete key/value pair to the
939 : // table being written.
940 : //
941 : // Range keys must be supplied in strictly ascending order of start key (i.e.
942 : // user key ascending, sequence number descending, and key type descending).
943 : // Ranges added must also be supplied in fragmented span order - i.e. other than
944 : // spans that are perfectly aligned (same start and end keys), spans may not
945 : // overlap. Range keys may be added out of order relative to point keys and
946 : // range deletions.
947 1 : func (w *RawRowWriter) addRangeKey(key InternalKey, value []byte) error {
948 1 : if !w.disableKeyOrderChecks && w.rangeKeyBlock.EntryCount() > 0 {
949 1 : prevStartKey := w.rangeKeyBlock.CurKey()
950 1 : prevEndKey, _, err := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.CurValue())
951 1 : if err != nil {
952 0 : // We panic here as we should have previously decoded and validated this
953 0 : // key and value when it was first added to the range key block.
954 0 : panic(err)
955 : }
956 :
957 1 : curStartKey := key
958 1 : curEndKey, _, err := rangekey.DecodeEndKey(curStartKey.Kind(), value)
959 1 : if err != nil {
960 0 : w.err = err
961 0 : return w.err
962 0 : }
963 :
964 : // Start keys must be strictly increasing.
965 1 : if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 {
966 0 : w.err = errors.Errorf(
967 0 : "pebble: range keys starts must be added in increasing order: %s, %s",
968 0 : prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
969 0 : return w.err
970 0 : }
971 :
972 : // Start keys are increasing. If the start user keys are equal, the
973 : // end keys must be equal (i.e. aligned spans).
974 1 : if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 {
975 1 : if w.compare(prevEndKey, curEndKey) != 0 {
976 0 : w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s",
977 0 : prevStartKey.Pretty(w.formatKey),
978 0 : curStartKey.Pretty(w.formatKey))
979 0 : return w.err
980 0 : }
981 1 : } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 {
982 0 : // If the start user keys are NOT equal, the spans must be disjoint (i.e.
983 0 : // no overlap).
984 0 : // NOTE: the inequality excludes zero, as we allow the end key of the
985 0 : // lower span be the same as the start key of the upper span, because
986 0 : // the range end key is considered an exclusive bound.
987 0 : w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s",
988 0 : prevStartKey.Pretty(w.formatKey),
989 0 : curStartKey.Pretty(w.formatKey))
990 0 : return w.err
991 0 : }
992 : }
993 :
994 : // TODO(travers): Add an invariant-gated check to ensure that suffix-values
995 : // are sorted within coalesced spans.
996 :
997 : // Range-keys and point-keys are intended to live in "parallel" keyspaces.
998 : // However, we track a single seqnum in the table metadata that spans both of
999 : // these keyspaces.
1000 : // TODO(travers): Consider tracking range key seqnums separately.
1001 1 : w.meta.updateSeqNum(key.SeqNum())
1002 1 :
1003 1 : // Range tombstones are fragmented, so the start key of the first range key
1004 1 : // added will be the smallest. The largest range key is determined in
1005 1 : // Writer.Close() as the end key of the last range key added to the block.
1006 1 : if w.props.NumRangeKeys() == 0 {
1007 1 : w.meta.SetSmallestRangeKey(key.Clone())
1008 1 : }
1009 :
1010 : // Update table properties.
1011 1 : w.props.RawRangeKeyKeySize += uint64(key.Size())
1012 1 : w.props.RawRangeKeyValueSize += uint64(len(value))
1013 1 : switch key.Kind() {
1014 1 : case base.InternalKeyKindRangeKeyDelete:
1015 1 : w.props.NumRangeKeyDels++
1016 1 : case base.InternalKeyKindRangeKeySet:
1017 1 : w.props.NumRangeKeySets++
1018 1 : case base.InternalKeyKindRangeKeyUnset:
1019 1 : w.props.NumRangeKeyUnsets++
1020 0 : default:
1021 0 : panic(errors.Errorf("pebble: invalid range key type: %s", key.Kind()))
1022 : }
1023 :
1024 : // Add the key to the block.
1025 1 : w.rangeKeyBlock.Add(key, value)
1026 1 : return nil
1027 : }
1028 :
1029 1 : func (w *RawRowWriter) maybeAddToFilter(key []byte) {
1030 1 : if w.filter != nil {
1031 1 : prefix := key[:w.split(key)]
1032 1 : w.filter.addKey(prefix)
1033 1 : }
1034 : }
1035 :
1036 : // maybeIncrementTombstoneDenseBlocks increments the number of tombstone dense
1037 : // blocks if the number of deletions in the data block exceeds a threshold or
1038 : // the deletion size exceeds a threshold. It should be called after the
1039 : // data block has been finished.
1040 : // Invariant: w.dataBlockBuf.uncompressed must already be populated.
1041 1 : func (w *RawRowWriter) maybeIncrementTombstoneDenseBlocks() {
1042 1 : minSize := w.deletionSizeRatioThreshold * float32(len(w.dataBlockBuf.uncompressed))
1043 1 : if w.dataBlockBuf.numDeletions > w.numDeletionsThreshold || float32(w.dataBlockBuf.deletionSize) > minSize {
1044 1 : w.props.NumTombstoneDenseBlocks++
1045 1 : }
1046 1 : w.dataBlockBuf.numDeletions = 0
1047 1 : w.dataBlockBuf.deletionSize = 0
1048 : }
1049 :
1050 1 : func (w *RawRowWriter) flush(key InternalKey) error {
1051 1 : // We're finishing a data block.
1052 1 : err := w.finishDataBlockProps(w.dataBlockBuf)
1053 1 : if err != nil {
1054 0 : return err
1055 0 : }
1056 1 : w.dataBlockBuf.finish()
1057 1 : w.maybeIncrementTombstoneDenseBlocks()
1058 1 : w.dataBlockBuf.compressAndChecksum(w.compression)
1059 1 : // Since dataBlockEstimates.addInflightDataBlock was never called, the
1060 1 : // inflightSize is set to 0.
1061 1 : w.coordination.sizeEstimate.dataBlockCompressed(w.dataBlockBuf.physical.LengthWithoutTrailer(), 0)
1062 1 :
1063 1 : // Determine if the index block should be flushed. Since we're accessing the
1064 1 : // dataBlockBuf.dataBlock.curKey here, we have to make sure that once we start
1065 1 : // to pool the dataBlockBufs, the curKey isn't used by the Writer once the
1066 1 : // dataBlockBuf is added back to a sync.Pool. In this particular case, the
1067 1 : // byte slice which supports "sep" will eventually be copied when "sep" is
1068 1 : // added to the index block.
1069 1 : prevKey := w.dataBlockBuf.dataBlock.CurKey()
1070 1 : sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf)
1071 1 : // We determine that we should flush an index block from the Writer client
1072 1 : // goroutine, but we actually finish the index block from the writeQueue.
1073 1 : // When we determine that an index block should be flushed, we need to call
1074 1 : // BlockPropertyCollector.FinishIndexBlock. But block property collector
1075 1 : // calls must happen sequentially from the Writer client. Therefore, we need
1076 1 : // to determine that we are going to flush the index block from the Writer
1077 1 : // client.
1078 1 : shouldFlushIndexBlock := supportsTwoLevelIndex(w.tableFormat) &&
1079 1 : w.indexBlock.shouldFlush(sep, encodedBHPEstimatedSize, &w.indexFlush)
1080 1 :
1081 1 : var indexProps []byte
1082 1 : var flushableIndexBlock *indexBlockBuf
1083 1 : if shouldFlushIndexBlock {
1084 1 : flushableIndexBlock = w.indexBlock
1085 1 : w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
1086 1 : // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
1087 1 : // flush the index block.
1088 1 : indexProps, err = w.finishIndexBlockProps()
1089 1 : if err != nil {
1090 0 : return err
1091 0 : }
1092 : }
1093 :
1094 : // We've called BlockPropertyCollector.FinishDataBlock, and, if necessary,
1095 : // BlockPropertyCollector.FinishIndexBlock. Since we've decided to finish
1096 : // the data block, we can call
1097 : // BlockPropertyCollector.AddPrevDataBlockToIndexBlock.
1098 1 : w.addPrevDataBlockToIndexBlockProps()
1099 1 :
1100 1 : // Schedule a write.
1101 1 : writeTask := writeTaskPool.Get().(*writeTask)
1102 1 : // We're setting compressionDone to indicate that compression of this block
1103 1 : // has already been completed.
1104 1 : writeTask.compressionDone <- true
1105 1 : writeTask.buf = w.dataBlockBuf
1106 1 : writeTask.indexEntrySep = sep
1107 1 : writeTask.currIndexBlock = w.indexBlock
1108 1 : writeTask.indexInflightSize = sep.Size() + encodedBHPEstimatedSize
1109 1 : writeTask.finishedIndexProps = indexProps
1110 1 : writeTask.flushableIndexBlock = flushableIndexBlock
1111 1 :
1112 1 : // The writeTask corresponds to an unwritten index entry.
1113 1 : w.indexBlock.addInflight(writeTask.indexInflightSize)
1114 1 :
1115 1 : w.dataBlockBuf = nil
1116 1 : if w.coordination.parallelismEnabled {
1117 1 : w.coordination.writeQueue.add(writeTask)
1118 1 : } else {
1119 1 : err = w.coordination.writeQueue.addSync(writeTask)
1120 1 : }
1121 1 : w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType)
1122 1 :
1123 1 : return err
1124 : }
1125 :
1126 1 : func (w *RawRowWriter) maybeFlush(key InternalKey, valueLen int) error {
1127 1 : if !w.dataBlockBuf.shouldFlush(key, valueLen, &w.dataFlush) {
1128 1 : return nil
1129 1 : }
1130 :
1131 1 : err := w.flush(key)
1132 1 :
1133 1 : if err != nil {
1134 0 : w.err = err
1135 0 : return err
1136 0 : }
1137 :
1138 1 : return nil
1139 : }
1140 :
1141 : // dataBlockBuf.dataBlockProps set by this method must be encoded before any future use of the
1142 : // dataBlockBuf.blockPropsEncoder, since the properties slice will get reused by the
1143 : // blockPropsEncoder.
1144 1 : func (w *RawRowWriter) finishDataBlockProps(buf *dataBlockBuf) error {
1145 1 : if len(w.blockPropCollectors) == 0 {
1146 1 : return nil
1147 1 : }
1148 1 : var err error
1149 1 : buf.blockPropsEncoder.resetProps()
1150 1 : for i := range w.blockPropCollectors {
1151 1 : scratch := buf.blockPropsEncoder.getScratchForProp()
1152 1 : if scratch, err = w.blockPropCollectors[i].FinishDataBlock(scratch); err != nil {
1153 0 : return err
1154 0 : }
1155 1 : buf.blockPropsEncoder.addProp(shortID(i), scratch)
1156 : }
1157 :
1158 1 : buf.dataBlockProps = buf.blockPropsEncoder.unsafeProps()
1159 1 : return nil
1160 : }
1161 :
1162 : // The BlockHandleWithProperties returned by this method must be encoded before any future use of
1163 : // the Writer.blockPropsEncoder, since the properties slice will get reused by the blockPropsEncoder.
1164 : // maybeAddBlockPropertiesToBlockHandle should only be called if block is being written synchronously
1165 : // with the Writer client.
1166 : func (w *RawRowWriter) maybeAddBlockPropertiesToBlockHandle(
1167 : bh block.Handle,
1168 1 : ) (block.HandleWithProperties, error) {
1169 1 : err := w.finishDataBlockProps(w.dataBlockBuf)
1170 1 : if err != nil {
1171 0 : return block.HandleWithProperties{}, err
1172 0 : }
1173 1 : return block.HandleWithProperties{Handle: bh, Props: w.dataBlockBuf.dataBlockProps}, nil
1174 : }
1175 :
1176 : func (w *RawRowWriter) indexEntrySep(
1177 : prevKey, key InternalKey, dataBlockBuf *dataBlockBuf,
1178 1 : ) InternalKey {
1179 1 : // Make a rough guess that we want key-sized scratch to compute the separator.
1180 1 : if cap(dataBlockBuf.sepScratch) < key.Size() {
1181 1 : dataBlockBuf.sepScratch = make([]byte, 0, key.Size()*2)
1182 1 : }
1183 :
1184 1 : var sep InternalKey
1185 1 : if key.UserKey == nil && key.Trailer == 0 {
1186 1 : sep = prevKey.Successor(w.compare, w.successor, dataBlockBuf.sepScratch[:0])
1187 1 : } else {
1188 1 : sep = prevKey.Separator(w.compare, w.separator, dataBlockBuf.sepScratch[:0], key)
1189 1 : }
1190 1 : return sep
1191 : }
1192 :
1193 : // addIndexEntry adds an index entry for the specified key and block handle.
1194 : // addIndexEntry can be called from both the Writer client goroutine, and the
1195 : // writeQueue goroutine. If the flushIndexBuf != nil, then the indexProps, as
1196 : // they're used when the index block is finished.
1197 : //
1198 : // Invariant:
1199 : // 1. addIndexEntry must not store references to the sep InternalKey, the tmp
1200 : // byte slice, bhp.Props. That is, these must be either deep copied or
1201 : // encoded.
1202 : // 2. addIndexEntry must not hold references to the flushIndexBuf, and the writeTo
1203 : // indexBlockBufs.
1204 : func (w *RawRowWriter) addIndexEntry(
1205 : sep InternalKey,
1206 : bhp block.HandleWithProperties,
1207 : tmp []byte,
1208 : flushIndexBuf *indexBlockBuf,
1209 : writeTo *indexBlockBuf,
1210 : inflightSize int,
1211 : indexProps []byte,
1212 1 : ) error {
1213 1 : if bhp.Length == 0 {
1214 0 : // A valid blockHandle must be non-zero.
1215 0 : // In particular, it must have a non-zero length.
1216 0 : return nil
1217 0 : }
1218 :
1219 1 : encoded := bhp.EncodeVarints(tmp)
1220 1 : if flushIndexBuf != nil {
1221 1 : if cap(w.indexPartitions) == 0 {
1222 1 : w.indexPartitions = make([]bufferedIndexBlock, 0, 32)
1223 1 : }
1224 : // Enable two level indexes if there is more than one index block.
1225 1 : w.twoLevelIndex = true
1226 1 : if err := w.finishIndexBlock(flushIndexBuf, indexProps); err != nil {
1227 0 : return err
1228 0 : }
1229 : }
1230 :
1231 1 : writeTo.add(sep, encoded, inflightSize)
1232 1 : return nil
1233 : }
1234 :
1235 1 : func (w *RawRowWriter) addPrevDataBlockToIndexBlockProps() {
1236 1 : for i := range w.blockPropCollectors {
1237 1 : w.blockPropCollectors[i].AddPrevDataBlockToIndexBlock()
1238 1 : }
1239 : }
1240 :
1241 : // addIndexEntrySync adds an index entry for the specified key and block handle.
1242 : // Writer.addIndexEntry is only called synchronously once Writer.Close is called.
1243 : // addIndexEntrySync should only be called if we're sure that index entries
1244 : // aren't being written asynchronously.
1245 : //
1246 : // Invariant:
1247 : // 1. addIndexEntrySync must not store references to the prevKey, key InternalKey's,
1248 : // the tmp byte slice. That is, these must be either deep copied or encoded.
1249 : //
1250 : // TODO: Improve coverage of this method. e.g. tests passed without the line
1251 : // `w.twoLevelIndex = true` previously.
1252 : func (w *RawRowWriter) addIndexEntrySync(
1253 : prevKey, key InternalKey, bhp block.HandleWithProperties, tmp []byte,
1254 1 : ) error {
1255 1 : return w.addIndexEntrySep(w.indexEntrySep(prevKey, key, w.dataBlockBuf), bhp, tmp)
1256 1 : }
1257 :
1258 : func (w *RawRowWriter) addIndexEntrySep(
1259 : sep InternalKey, bhp block.HandleWithProperties, tmp []byte,
1260 1 : ) error {
1261 1 : shouldFlush := supportsTwoLevelIndex(w.tableFormat) &&
1262 1 : w.indexBlock.shouldFlush(sep, encodedBHPEstimatedSize, &w.indexFlush)
1263 1 : var flushableIndexBlock *indexBlockBuf
1264 1 : var props []byte
1265 1 : var err error
1266 1 : if shouldFlush {
1267 1 : flushableIndexBlock = w.indexBlock
1268 1 : w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
1269 1 : w.twoLevelIndex = true
1270 1 : // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
1271 1 : // flush the index block.
1272 1 : props, err = w.finishIndexBlockProps()
1273 1 : if err != nil {
1274 0 : return err
1275 0 : }
1276 : }
1277 :
1278 1 : err = w.addIndexEntry(sep, bhp, tmp, flushableIndexBlock, w.indexBlock, 0, props)
1279 1 : if flushableIndexBlock != nil {
1280 1 : flushableIndexBlock.clear()
1281 1 : indexBlockBufPool.Put(flushableIndexBlock)
1282 1 : }
1283 1 : w.addPrevDataBlockToIndexBlockProps()
1284 1 : return err
1285 : }
1286 :
1287 : func shouldFlush(
1288 : keyLen, valueLen int,
1289 : restartInterval, estimatedBlockSize, numEntries int,
1290 : flushGovernor *block.FlushGovernor,
1291 1 : ) bool {
1292 1 : if numEntries == 0 {
1293 1 : return false
1294 1 : }
1295 1 : if estimatedBlockSize < flushGovernor.LowWatermark() {
1296 1 : // Fast path when the block is too small to flush.
1297 1 : return false
1298 1 : }
1299 :
1300 : // Estimate the new size. This could be an overestimation because we don't
1301 : // know how much of the key will be shared.
1302 1 : newSize := estimatedBlockSize + keyLen + valueLen
1303 1 : if numEntries%restartInterval == 0 {
1304 1 : newSize += 4
1305 1 : }
1306 1 : newSize += 4 // varint for shared prefix length
1307 1 : newSize += uvarintLen(uint32(keyLen)) // varint for unshared key bytes
1308 1 : newSize += uvarintLen(uint32(valueLen)) // varint for value size
1309 1 :
1310 1 : return flushGovernor.ShouldFlush(estimatedBlockSize, newSize)
1311 : }
1312 :
1313 1 : func cloneKeyWithBuf(k InternalKey, a bytealloc.A) (bytealloc.A, InternalKey) {
1314 1 : if len(k.UserKey) == 0 {
1315 0 : return a, k
1316 0 : }
1317 1 : a, keyCopy := a.Copy(k.UserKey)
1318 1 : return a, InternalKey{UserKey: keyCopy, Trailer: k.Trailer}
1319 : }
1320 :
1321 : // Invariants: The byte slice returned by finishIndexBlockProps is heap-allocated
1322 : //
1323 : // and has its own lifetime, independent of the Writer and the blockPropsEncoder,
1324 : //
1325 : // and it is safe to:
1326 : // 1. Reuse w.blockPropsEncoder without first encoding the byte slice returned.
1327 : // 2. Store the byte slice in the Writer since it is a copy and not supported by
1328 : // an underlying buffer.
1329 1 : func (w *RawRowWriter) finishIndexBlockProps() ([]byte, error) {
1330 1 : w.blockPropsEncoder.resetProps()
1331 1 : for i := range w.blockPropCollectors {
1332 1 : scratch := w.blockPropsEncoder.getScratchForProp()
1333 1 : var err error
1334 1 : if scratch, err = w.blockPropCollectors[i].FinishIndexBlock(scratch); err != nil {
1335 0 : return nil, err
1336 0 : }
1337 1 : w.blockPropsEncoder.addProp(shortID(i), scratch)
1338 : }
1339 1 : return w.blockPropsEncoder.props(), nil
1340 : }
1341 :
1342 : // finishIndexBlock finishes the current index block and adds it to the top
1343 : // level index block. This is only used when two level indexes are enabled.
1344 : //
1345 : // Invariants:
1346 : // 1. The props slice passed into finishedIndexBlock must not be a
1347 : // owned by any other struct, since it will be stored in the Writer.indexPartitions
1348 : // slice.
1349 : // 2. None of the buffers owned by indexBuf will be shallow copied and stored elsewhere.
1350 : // That is, it must be safe to reuse indexBuf after finishIndexBlock has been called.
1351 1 : func (w *RawRowWriter) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error {
1352 1 : part := bufferedIndexBlock{
1353 1 : nEntries: indexBuf.block.EntryCount(), properties: props,
1354 1 : }
1355 1 : w.indexSepAlloc, part.sep = cloneKeyWithBuf(
1356 1 : indexBuf.block.CurKey(), w.indexSepAlloc,
1357 1 : )
1358 1 : bk := indexBuf.finish()
1359 1 : if len(w.indexBlockAlloc) < len(bk) {
1360 1 : // Allocate enough bytes for approximately 16 index blocks.
1361 1 : w.indexBlockAlloc = make([]byte, len(bk)*16)
1362 1 : }
1363 1 : n := copy(w.indexBlockAlloc, bk)
1364 1 : part.block = w.indexBlockAlloc[:n:n]
1365 1 : w.indexBlockAlloc = w.indexBlockAlloc[n:]
1366 1 : w.indexPartitions = append(w.indexPartitions, part)
1367 1 : return nil
1368 : }
1369 :
1370 1 : func (w *RawRowWriter) writeTwoLevelIndex() (block.Handle, error) {
1371 1 : props, err := w.finishIndexBlockProps()
1372 1 : if err != nil {
1373 0 : return block.Handle{}, err
1374 0 : }
1375 : // Add the final unfinished index.
1376 1 : if err = w.finishIndexBlock(w.indexBlock, props); err != nil {
1377 0 : return block.Handle{}, err
1378 0 : }
1379 :
1380 1 : for i := range w.indexPartitions {
1381 1 : b := &w.indexPartitions[i]
1382 1 : w.props.NumDataBlocks += uint64(b.nEntries)
1383 1 :
1384 1 : data := b.block
1385 1 : w.props.IndexSize += uint64(len(data))
1386 1 : bh, err := w.layout.WriteIndexBlock(data)
1387 1 : if err != nil {
1388 0 : return block.Handle{}, err
1389 0 : }
1390 1 : w.topLevelIndexBlock.Add(b.sep, block.HandleWithProperties{
1391 1 : Handle: bh,
1392 1 : Props: b.properties,
1393 1 : }.EncodeVarints(w.blockBuf.tmp[:]))
1394 : }
1395 :
1396 : // NB: RocksDB includes the block trailer length in the index size
1397 : // property, though it doesn't include the trailer in the top level
1398 : // index size property.
1399 1 : w.props.IndexPartitions = uint64(len(w.indexPartitions))
1400 1 : w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.EstimatedSize())
1401 1 : w.props.IndexSize += w.props.TopLevelIndexSize + block.TrailerLen
1402 1 : return w.layout.WriteIndexBlock(w.topLevelIndexBlock.Finish())
1403 : }
1404 :
1405 : // assertFormatCompatibility ensures that the features present on the table are
1406 : // compatible with the table format version.
1407 1 : func (w *RawRowWriter) assertFormatCompatibility() error {
1408 1 : // PebbleDBv1: block properties.
1409 1 : if len(w.blockPropCollectors) > 0 && w.tableFormat < TableFormatPebblev1 {
1410 0 : return errors.Newf(
1411 0 : "table format version %s is less than the minimum required version %s for block properties",
1412 0 : w.tableFormat, TableFormatPebblev1,
1413 0 : )
1414 0 : }
1415 :
1416 : // PebbleDBv2: range keys.
1417 1 : if w.props.NumRangeKeys() > 0 && w.tableFormat < TableFormatPebblev2 {
1418 0 : return errors.Newf(
1419 0 : "table format version %s is less than the minimum required version %s for range keys",
1420 0 : w.tableFormat, TableFormatPebblev2,
1421 0 : )
1422 0 : }
1423 :
1424 : // PebbleDBv3: value blocks.
1425 1 : if (w.props.NumValueBlocks > 0 || w.props.NumValuesInValueBlocks > 0 ||
1426 1 : w.props.ValueBlocksSize > 0) && w.tableFormat < TableFormatPebblev3 {
1427 0 : return errors.Newf(
1428 0 : "table format version %s is less than the minimum required version %s for value blocks",
1429 0 : w.tableFormat, TableFormatPebblev3)
1430 0 : }
1431 :
1432 : // PebbleDBv4: DELSIZED tombstones.
1433 1 : if w.props.NumSizedDeletions > 0 && w.tableFormat < TableFormatPebblev4 {
1434 0 : return errors.Newf(
1435 0 : "table format version %s is less than the minimum required version %s for sized deletion tombstones",
1436 0 : w.tableFormat, TableFormatPebblev4)
1437 0 : }
1438 1 : return nil
1439 : }
1440 :
1441 : // ComparePrev compares the provided user to the last point key written to the
1442 : // writer. The returned value is equivalent to Compare(key, prevKey) where
1443 : // prevKey is the last point key written to the writer.
1444 : //
1445 : // If no key has been written yet, ComparePrev returns +1.
1446 : //
1447 : // Must not be called after Writer is closed.
1448 1 : func (w *RawRowWriter) ComparePrev(k []byte) int {
1449 1 : if w == nil || w.dataBlockBuf.dataBlock.EntryCount() == 0 {
1450 0 : return +1
1451 0 : }
1452 1 : return w.compare(k, w.dataBlockBuf.dataBlock.CurUserKey())
1453 : }
1454 :
1455 : // EncodeSpan encodes the keys in the given span. The span can contain either
1456 : // only RANGEDEL keys or only range keys.
1457 : //
1458 : // This is a low-level API that bypasses the fragmenter. The spans passed to
1459 : // this function must be fragmented and ordered.
1460 1 : func (w *RawRowWriter) EncodeSpan(span keyspan.Span) error {
1461 1 : if span.Empty() {
1462 1 : return nil
1463 1 : }
1464 1 : if span.Keys[0].Kind() == base.InternalKeyKindRangeDelete {
1465 1 : return rangedel.Encode(span, w.addTombstone)
1466 1 : }
1467 1 : for i := range w.blockPropCollectors {
1468 1 : if err := w.blockPropCollectors[i].AddRangeKeys(span); err != nil {
1469 0 : return err
1470 0 : }
1471 : }
1472 1 : return w.rangeKeyEncoder.Encode(span)
1473 : }
1474 :
1475 : // Error returns the current accumulated error, if any.
1476 1 : func (w *RawRowWriter) Error() error {
1477 1 : return w.err
1478 1 : }
1479 :
1480 : // Close finishes writing the table and closes the underlying file that the
1481 : // table was written to.
1482 1 : func (w *RawRowWriter) Close() (err error) {
1483 1 : defer func() {
1484 1 : if w.valueBlockWriter != nil {
1485 1 : w.valueBlockWriter.Release()
1486 1 : // Defensive code in case Close gets called again. We don't want to put
1487 1 : // the same object to a sync.Pool.
1488 1 : w.valueBlockWriter = nil
1489 1 : }
1490 1 : w.layout.Abort()
1491 1 : // Record any error in the writer (so we can exit early if Close is called
1492 1 : // again).
1493 1 : if err != nil {
1494 0 : w.err = err
1495 0 : }
1496 : }()
1497 :
1498 : // finish must be called before we check for an error, because finish will
1499 : // block until every single task added to the writeQueue has been processed,
1500 : // and an error could be encountered while any of those tasks are processed.
1501 1 : if err := w.coordination.writeQueue.finish(); err != nil {
1502 0 : return err
1503 0 : }
1504 1 : if w.err != nil {
1505 0 : return w.err
1506 0 : }
1507 :
1508 : // The w.meta.LargestPointKey is only used once the Writer is closed, so it is safe to set it
1509 : // when the Writer is closed.
1510 : //
1511 : // The following invariants ensure that setting the largest key at this point of a Writer close
1512 : // is correct:
1513 : // 1. Keys must only be added to the Writer in an increasing order.
1514 : // 2. The current w.dataBlockBuf is guaranteed to have the latest key added to the Writer. This
1515 : // must be true, because a w.dataBlockBuf is only switched out when a dataBlock is flushed,
1516 : // however, if a dataBlock is flushed, then we add a key to the new w.dataBlockBuf in the
1517 : // addPoint function after the flush occurs.
1518 1 : if w.dataBlockBuf.dataBlock.EntryCount() >= 1 {
1519 1 : w.meta.SetLargestPointKey(w.dataBlockBuf.dataBlock.CurKey().Clone())
1520 1 : }
1521 :
1522 : // Finish the last data block, or force an empty data block if there
1523 : // aren't any data blocks at all.
1524 1 : if w.dataBlockBuf.dataBlock.EntryCount() > 0 || w.indexBlock.block.EntryCount() == 0 {
1525 1 : w.dataBlockBuf.finish()
1526 1 : w.maybeIncrementTombstoneDenseBlocks()
1527 1 : bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.uncompressed, &w.dataBlockBuf.blockBuf)
1528 1 : if err != nil {
1529 0 : return err
1530 0 : }
1531 1 : bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh)
1532 1 : if err != nil {
1533 0 : return err
1534 0 : }
1535 1 : prevKey := w.dataBlockBuf.dataBlock.CurKey()
1536 1 : if err := w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil {
1537 0 : return err
1538 0 : }
1539 : }
1540 1 : w.props.DataSize = w.layout.offset
1541 1 :
1542 1 : // Write the filter block.
1543 1 : if w.filter != nil {
1544 1 : bh, err := w.layout.WriteFilterBlock(w.filter)
1545 1 : if err != nil {
1546 0 : return err
1547 0 : }
1548 1 : w.props.FilterPolicyName = w.filter.policyName()
1549 1 : w.props.FilterSize = bh.Length
1550 : }
1551 :
1552 1 : if w.twoLevelIndex {
1553 1 : w.props.IndexType = twoLevelIndex
1554 1 : // Write the two level index block.
1555 1 : if _, err = w.writeTwoLevelIndex(); err != nil {
1556 0 : return err
1557 0 : }
1558 1 : } else {
1559 1 : w.props.IndexType = binarySearchIndex
1560 1 : // NB: RocksDB includes the block trailer length in the index size
1561 1 : // property, though it doesn't include the trailer in the filter size
1562 1 : // property.
1563 1 : w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + block.TrailerLen
1564 1 : w.props.NumDataBlocks = uint64(w.indexBlock.block.EntryCount())
1565 1 : // Write the single level index block.
1566 1 : if _, err = w.layout.WriteIndexBlock(w.indexBlock.finish()); err != nil {
1567 0 : return err
1568 0 : }
1569 : }
1570 :
1571 : // Write the range-del block.
1572 1 : if w.props.NumRangeDeletions > 0 {
1573 1 : // Because the range tombstones are fragmented, the end key of the last
1574 1 : // added range tombstone will be the largest range tombstone key. Note
1575 1 : // that we need to make this into a range deletion sentinel because
1576 1 : // sstable boundaries are inclusive while the end key of a range
1577 1 : // deletion tombstone is exclusive. A Clone() is necessary as
1578 1 : // rangeDelBlock.curValue is the same slice that will get passed into
1579 1 : // w.writer, and some implementations of vfs.File mutate the slice
1580 1 : // passed into Write(). Also, w.meta will often outlive the blockWriter,
1581 1 : // and so cloning curValue allows the rangeDelBlock's internal buffer to
1582 1 : // get gc'd.
1583 1 : k := base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.CurValue()).Clone()
1584 1 : w.meta.SetLargestRangeDelKey(k)
1585 1 : if _, err := w.layout.WriteRangeDeletionBlock(w.rangeDelBlock.Finish()); err != nil {
1586 0 : return err
1587 0 : }
1588 : }
1589 :
1590 1 : if w.props.NumRangeKeys() > 0 {
1591 1 : key := w.rangeKeyBlock.CurKey()
1592 1 : kind := key.Kind()
1593 1 : endKey, _, err := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.CurValue())
1594 1 : if err != nil {
1595 0 : return err
1596 0 : }
1597 1 : k := base.MakeExclusiveSentinelKey(kind, endKey).Clone()
1598 1 : w.meta.SetLargestRangeKey(k)
1599 1 : if _, err := w.layout.WriteRangeKeyBlock(w.rangeKeyBlock.Finish()); err != nil {
1600 0 : return err
1601 0 : }
1602 : }
1603 :
1604 1 : if w.valueBlockWriter != nil {
1605 1 : _, vbStats, err := w.valueBlockWriter.Finish(&w.layout, w.layout.offset)
1606 1 : if err != nil {
1607 0 : return err
1608 0 : }
1609 1 : w.props.NumValueBlocks = vbStats.NumValueBlocks
1610 1 : w.props.NumValuesInValueBlocks = vbStats.NumValuesInValueBlocks
1611 1 : w.props.ValueBlocksSize = vbStats.ValueBlocksAndIndexSize
1612 : }
1613 :
1614 1 : {
1615 1 : // Finish and record the prop collectors if props are not yet recorded.
1616 1 : // Pre-computed props might have been copied by specialized sst creators
1617 1 : // like suffix replacer.
1618 1 : if len(w.props.UserProperties) == 0 {
1619 1 : userProps := make(map[string]string)
1620 1 : for i := range w.blockPropCollectors {
1621 1 : scratch := w.blockPropsEncoder.getScratchForProp()
1622 1 : // Place the shortID in the first byte.
1623 1 : scratch = append(scratch, byte(i))
1624 1 : buf, err := w.blockPropCollectors[i].FinishTable(scratch)
1625 1 : if err != nil {
1626 0 : return err
1627 0 : }
1628 1 : var prop string
1629 1 : if len(buf) > 0 {
1630 1 : prop = string(buf)
1631 1 : }
1632 : // NB: The property is populated in the map even if it is the
1633 : // empty string, since the presence in the map is what indicates
1634 : // that the block property collector was used when writing.
1635 1 : userProps[w.blockPropCollectors[i].Name()] = prop
1636 : }
1637 1 : if len(userProps) > 0 {
1638 1 : w.props.UserProperties = userProps
1639 1 : }
1640 : }
1641 :
1642 : // Write the properties block.
1643 1 : var raw rowblk.Writer
1644 1 : // The restart interval is set to infinity because the properties block
1645 1 : // is always read sequentially and cached in a heap located object. This
1646 1 : // reduces table size without a significant impact on performance.
1647 1 : raw.RestartInterval = propertiesBlockRestartInterval
1648 1 : w.props.CompressionOptions = rocksDBCompressionOptions
1649 1 : w.props.save(w.tableFormat, &raw)
1650 1 : if _, err := w.layout.WritePropertiesBlock(raw.Finish()); err != nil {
1651 0 : return err
1652 0 : }
1653 : }
1654 :
1655 : // Write the table footer.
1656 1 : w.meta.Size, err = w.layout.Finish()
1657 1 : if err != nil {
1658 0 : return err
1659 0 : }
1660 1 : w.meta.Properties = w.props
1661 1 :
1662 1 : // Check that the features present in the table are compatible with the format
1663 1 : // configured for the table.
1664 1 : if err = w.assertFormatCompatibility(); err != nil {
1665 0 : return err
1666 0 : }
1667 :
1668 1 : w.dataBlockBuf.clear()
1669 1 : dataBlockBufPool.Put(w.dataBlockBuf)
1670 1 : w.dataBlockBuf = nil
1671 1 : w.indexBlock.clear()
1672 1 : indexBlockBufPool.Put(w.indexBlock)
1673 1 : w.indexBlock = nil
1674 1 :
1675 1 : // Make any future calls to Set or Close return an error.
1676 1 : w.err = errWriterClosed
1677 1 : return nil
1678 : }
1679 :
1680 : // EstimatedSize returns the estimated size of the sstable being written if a
1681 : // call to Finish() was made without adding additional keys.
1682 1 : func (w *RawRowWriter) EstimatedSize() uint64 {
1683 1 : if w == nil {
1684 0 : return 0
1685 0 : }
1686 1 : return w.coordination.sizeEstimate.size() +
1687 1 : uint64(w.dataBlockBuf.dataBlock.EstimatedSize()) +
1688 1 : w.indexBlock.estimatedSize()
1689 : }
1690 :
1691 : // Metadata returns the metadata for the finished sstable. Only valid to call
1692 : // after the sstable has been finished.
1693 1 : func (w *RawRowWriter) Metadata() (*WriterMetadata, error) {
1694 1 : if !w.layout.IsFinished() {
1695 0 : return nil, errors.New("pebble: writer is not closed")
1696 0 : }
1697 1 : return &w.meta, nil
1698 : }
1699 :
1700 1 : func newRowWriter(writable objstorage.Writable, o WriterOptions) *RawRowWriter {
1701 1 : if o.TableFormat.BlockColumnar() {
1702 0 : panic(errors.AssertionFailedf("newRowWriter cannot create sstables with %s format", o.TableFormat))
1703 : }
1704 1 : o = o.ensureDefaults()
1705 1 : w := &RawRowWriter{
1706 1 : layout: makeLayoutWriter(writable, o),
1707 1 : meta: WriterMetadata{
1708 1 : SmallestSeqNum: math.MaxUint64,
1709 1 : },
1710 1 : compare: o.Comparer.Compare,
1711 1 : pointSuffixCmp: o.Comparer.ComparePointSuffixes,
1712 1 : split: o.Comparer.Split,
1713 1 : formatKey: o.Comparer.FormatKey,
1714 1 : compression: o.Compression,
1715 1 : separator: o.Comparer.Separator,
1716 1 : successor: o.Comparer.Successor,
1717 1 : tableFormat: o.TableFormat,
1718 1 : isStrictObsolete: o.IsStrictObsolete,
1719 1 : writingToLowestLevel: o.WritingToLowestLevel,
1720 1 : restartInterval: o.BlockRestartInterval,
1721 1 : checksumType: o.Checksum,
1722 1 : disableKeyOrderChecks: o.internal.DisableKeyOrderChecks,
1723 1 : indexBlock: newIndexBlockBuf(o.Parallelism),
1724 1 : rangeDelBlock: rowblk.Writer{RestartInterval: 1},
1725 1 : rangeKeyBlock: rowblk.Writer{RestartInterval: 1},
1726 1 : topLevelIndexBlock: rowblk.Writer{RestartInterval: 1},
1727 1 : allocatorSizeClasses: o.AllocatorSizeClasses,
1728 1 : numDeletionsThreshold: o.NumDeletionsThreshold,
1729 1 : deletionSizeRatioThreshold: o.DeletionSizeRatioThreshold,
1730 1 : }
1731 1 : w.dataFlush = block.MakeFlushGovernor(o.BlockSize, o.BlockSizeThreshold, o.SizeClassAwareThreshold, o.AllocatorSizeClasses)
1732 1 : w.indexFlush = block.MakeFlushGovernor(o.IndexBlockSize, o.BlockSizeThreshold, o.SizeClassAwareThreshold, o.AllocatorSizeClasses)
1733 1 : if w.tableFormat >= TableFormatPebblev3 {
1734 1 : w.shortAttributeExtractor = o.ShortAttributeExtractor
1735 1 : w.requiredInPlaceValueBound = o.RequiredInPlaceValueBound
1736 1 : if !o.DisableValueBlocks {
1737 1 : w.valueBlockWriter = valblk.NewWriter(
1738 1 : block.MakeFlushGovernor(o.BlockSize, o.BlockSizeThreshold, o.SizeClassAwareThreshold, o.AllocatorSizeClasses),
1739 1 : w.compression, w.checksumType, func(compressedSize int) {
1740 1 : w.coordination.sizeEstimate.dataBlockCompressed(compressedSize, 0)
1741 1 : },
1742 : )
1743 : }
1744 : }
1745 :
1746 1 : w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType)
1747 1 :
1748 1 : w.blockBuf = blockBuf{
1749 1 : checksummer: block.Checksummer{Type: o.Checksum},
1750 1 : }
1751 1 :
1752 1 : w.coordination.init(o.Parallelism, w)
1753 1 : defer func() {
1754 1 : if r := recover(); r != nil {
1755 0 : // Don't leak a goroutine if we hit a panic.
1756 0 : _ = w.coordination.writeQueue.finish()
1757 0 : panic(r)
1758 : }
1759 : }()
1760 :
1761 1 : if writable == nil {
1762 0 : w.err = errors.New("pebble: nil writable")
1763 0 : return w
1764 0 : }
1765 :
1766 1 : if o.FilterPolicy != nil {
1767 1 : switch o.FilterType {
1768 1 : case TableFilter:
1769 1 : w.filter = newTableFilterWriter(o.FilterPolicy)
1770 0 : default:
1771 0 : panic(fmt.Sprintf("unknown filter type: %v", o.FilterType))
1772 : }
1773 : }
1774 :
1775 1 : w.props.ComparerName = o.Comparer.Name
1776 1 : w.props.CompressionName = o.Compression.String()
1777 1 : w.props.MergerName = o.MergerName
1778 1 : w.props.PropertyCollectorNames = "[]"
1779 1 :
1780 1 : numBlockPropertyCollectors := len(o.BlockPropertyCollectors)
1781 1 : shouldAddObsoleteCollector := w.tableFormat >= TableFormatPebblev4 && !o.disableObsoleteCollector
1782 1 : if shouldAddObsoleteCollector {
1783 1 : numBlockPropertyCollectors++
1784 1 : }
1785 :
1786 1 : if numBlockPropertyCollectors > 0 {
1787 1 : if numBlockPropertyCollectors > maxPropertyCollectors {
1788 0 : w.err = errors.New("pebble: too many block property collectors")
1789 0 : return w
1790 0 : }
1791 1 : w.blockPropCollectors = make([]BlockPropertyCollector, 0, numBlockPropertyCollectors)
1792 1 : for _, constructFn := range o.BlockPropertyCollectors {
1793 1 : w.blockPropCollectors = append(w.blockPropCollectors, constructFn())
1794 1 : }
1795 1 : if shouldAddObsoleteCollector {
1796 1 : w.blockPropCollectors = append(w.blockPropCollectors, &w.obsoleteCollector)
1797 1 : }
1798 :
1799 1 : var buf bytes.Buffer
1800 1 : buf.WriteString("[")
1801 1 : for i := range w.blockPropCollectors {
1802 1 : if i > 0 {
1803 1 : buf.WriteString(",")
1804 1 : }
1805 1 : buf.WriteString(w.blockPropCollectors[i].Name())
1806 : }
1807 1 : buf.WriteString("]")
1808 1 : w.props.PropertyCollectorNames = buf.String()
1809 : }
1810 :
1811 : // Initialize the range key fragmenter and encoder.
1812 1 : w.rangeKeyEncoder.Emit = w.addRangeKey
1813 1 : return w
1814 : }
1815 :
1816 : // rewriteSuffixes implements RawWriter.
1817 : func (w *RawRowWriter) rewriteSuffixes(
1818 : r *Reader, wo WriterOptions, from, to []byte, concurrency int,
1819 0 : ) error {
1820 0 : for _, c := range w.blockPropCollectors {
1821 0 : if !c.SupportsSuffixReplacement() {
1822 0 : return errors.Errorf("block property collector %s does not support suffix replacement", c.Name())
1823 0 : }
1824 : }
1825 0 : l, err := r.Layout()
1826 0 : if err != nil {
1827 0 : return errors.Wrap(err, "reading layout")
1828 0 : }
1829 :
1830 : // Copy data blocks in parallel, rewriting suffixes as we go.
1831 0 : blocks, err := rewriteDataBlocksInParallel(r, wo, l.Data, from, to, concurrency, func() blockRewriter {
1832 0 : return rowblk.NewRewriter(r.Comparer, wo.BlockRestartInterval)
1833 0 : })
1834 0 : if err != nil {
1835 0 : return errors.Wrap(err, "rewriting data blocks")
1836 0 : }
1837 :
1838 : // oldShortIDs maps the shortID for the block property collector in the old
1839 : // blocks to the shortID in the new blocks. Initialized once for the sstable.
1840 0 : oldShortIDs, n, err := getShortIDs(r, w.blockPropCollectors)
1841 0 : if err != nil {
1842 0 : return errors.Wrap(err, "getting short IDs")
1843 0 : }
1844 0 : oldProps := make([][]byte, len(w.blockPropCollectors))
1845 0 :
1846 0 : for i := range blocks {
1847 0 : // Write the rewritten block to the file.
1848 0 : bh, err := w.layout.WritePrecompressedDataBlock(blocks[i].physical)
1849 0 : if err != nil {
1850 0 : return err
1851 0 : }
1852 :
1853 : // Load any previous values for our prop collectors into oldProps.
1854 0 : for i := range oldProps {
1855 0 : oldProps[i] = nil
1856 0 : }
1857 0 : decoder := makeBlockPropertiesDecoder(n, l.Data[i].Props)
1858 0 : for !decoder.Done() {
1859 0 : id, val, err := decoder.Next()
1860 0 : if err != nil {
1861 0 : return err
1862 0 : }
1863 0 : if oldShortIDs[id].IsValid() {
1864 0 : oldProps[oldShortIDs[id]] = val
1865 0 : }
1866 : }
1867 0 : for i, p := range w.blockPropCollectors {
1868 0 : if err := p.AddCollectedWithSuffixReplacement(oldProps[i], from, to); err != nil {
1869 0 : return err
1870 0 : }
1871 : }
1872 :
1873 0 : bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh)
1874 0 : if err != nil {
1875 0 : return err
1876 0 : }
1877 0 : var nextKey InternalKey
1878 0 : if i+1 < len(blocks) {
1879 0 : nextKey = blocks[i+1].start
1880 0 : }
1881 0 : if err = w.addIndexEntrySync(blocks[i].end, nextKey, bhp, w.dataBlockBuf.tmp[:]); err != nil {
1882 0 : return err
1883 0 : }
1884 : }
1885 0 : if len(blocks) > 0 {
1886 0 : w.meta.Size = w.layout.offset
1887 0 : w.meta.updateSeqNum(blocks[0].start.SeqNum())
1888 0 : w.props.NumEntries = r.Properties.NumEntries
1889 0 : w.props.RawKeySize = r.Properties.RawKeySize
1890 0 : w.props.RawValueSize = r.Properties.RawValueSize
1891 0 : w.meta.SetSmallestPointKey(blocks[0].start)
1892 0 : w.meta.SetLargestPointKey(blocks[len(blocks)-1].end)
1893 0 : }
1894 :
1895 : // Copy range key block, replacing suffixes if it exists.
1896 0 : if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil {
1897 0 : return errors.Wrap(err, "rewriting range key blocks")
1898 0 : }
1899 : // Copy over the filter block if it exists (rewriteDataBlocksToWriter will
1900 : // already have ensured this is valid if it exists).
1901 0 : if w.filter != nil {
1902 0 : if filterBlockBH, ok := l.FilterByName(w.filter.metaName()); ok {
1903 0 : filterBlock, _, err := readBlockBuf(r, filterBlockBH, nil)
1904 0 : if err != nil {
1905 0 : return errors.Wrap(err, "reading filter")
1906 0 : }
1907 0 : w.filter = copyFilterWriter{
1908 0 : origPolicyName: w.filter.policyName(), origMetaName: w.filter.metaName(), data: filterBlock,
1909 0 : }
1910 : }
1911 : }
1912 0 : return nil
1913 : }
1914 :
1915 : // copyDataBlocks implements RawWriter.
1916 : func (w *RawRowWriter) copyDataBlocks(
1917 : ctx context.Context, blocks []indexEntry, rh objstorage.ReadHandle,
1918 1 : ) error {
1919 1 : blockOffset := blocks[0].bh.Offset
1920 1 : // The block lengths don't include their trailers, which just sit after the
1921 1 : // block length, before the next offset; We get the ones between the blocks
1922 1 : // we copy implicitly but need to explicitly add the last trailer to length.
1923 1 : length := blocks[len(blocks)-1].bh.Offset + blocks[len(blocks)-1].bh.Length + block.TrailerLen - blockOffset
1924 1 : if spanEnd := length + blockOffset; spanEnd < blockOffset {
1925 0 : return base.AssertionFailedf("invalid intersecting span for CopySpan [%d, %d)", blockOffset, spanEnd)
1926 0 : }
1927 1 : if err := objstorage.Copy(ctx, rh, w.layout.writable, blockOffset, length); err != nil {
1928 0 : return err
1929 0 : }
1930 : // Update w.meta.Size so subsequently flushed metadata has correct offsets.
1931 1 : w.meta.Size += length
1932 1 : for i := range blocks {
1933 1 : blocks[i].bh.Offset = w.layout.offset
1934 1 : // blocks[i].bh.Length remains unmodified.
1935 1 : sepKey := base.MakeInternalKey(blocks[i].sep, base.SeqNumMax, base.InternalKeyKindSeparator)
1936 1 : if err := w.addIndexEntrySep(sepKey, blocks[i].bh, w.dataBlockBuf.tmp[:]); err != nil {
1937 0 : return err
1938 0 : }
1939 1 : w.layout.offset += uint64(blocks[i].bh.Length) + block.TrailerLen
1940 : }
1941 1 : return nil
1942 : }
1943 :
1944 : // addDataBlock implements RawWriter.
1945 1 : func (w *RawRowWriter) addDataBlock(b, sep []byte, bhp block.HandleWithProperties) error {
1946 1 : // layout.WriteDataBlock keeps layout.offset up-to-date for us.
1947 1 : bh, err := w.layout.WriteDataBlock(b, &w.dataBlockBuf.blockBuf)
1948 1 : if err != nil {
1949 0 : return err
1950 0 : }
1951 1 : bhp.Handle = bh
1952 1 :
1953 1 : sepKey := base.MakeInternalKey(sep, base.SeqNumMax, base.InternalKeyKindSeparator)
1954 1 : if err := w.addIndexEntrySep(sepKey, bhp, w.dataBlockBuf.tmp[:]); err != nil {
1955 0 : return err
1956 0 : }
1957 1 : w.meta.Size += uint64(bh.Length) + block.TrailerLen
1958 1 : return nil
1959 : }
1960 :
1961 : // copyProperties implements RawWriter.
1962 1 : func (w *RawRowWriter) copyProperties(props Properties) {
1963 1 : w.props = props
1964 1 : // Remove all user properties to disable block properties, which we do not
1965 1 : // calculate.
1966 1 : w.props.UserProperties = nil
1967 1 : // Reset props that we'll re-derive as we build our own index.
1968 1 : w.props.IndexPartitions = 0
1969 1 : w.props.TopLevelIndexSize = 0
1970 1 : w.props.IndexSize = 0
1971 1 : w.props.IndexType = 0
1972 1 : }
1973 :
1974 : // copyFilter implements RawWriter.
1975 1 : func (w *RawRowWriter) copyFilter(filter []byte, filterName string) error {
1976 1 : if w.filter != nil && filterName != w.filter.policyName() {
1977 0 : return errors.New("mismatched filters")
1978 0 : }
1979 1 : w.filter = copyFilterWriter{
1980 1 : origPolicyName: w.filter.policyName(), origMetaName: w.filter.metaName(), data: filter,
1981 1 : }
1982 1 : return nil
1983 : }
1984 :
1985 : // SetSnapshotPinnedProperties sets the properties for pinned keys. Should only
1986 : // be used internally by Pebble.
1987 : func (w *RawRowWriter) SetSnapshotPinnedProperties(
1988 : pinnedKeyCount, pinnedKeySize, pinnedValueSize uint64,
1989 1 : ) {
1990 1 : w.props.SnapshotPinnedKeys = pinnedKeyCount
1991 1 : w.props.SnapshotPinnedKeySize = pinnedKeySize
1992 1 : w.props.SnapshotPinnedValueSize = pinnedValueSize
1993 1 : }
|