Line data Source code
1 : // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "context"
9 : "encoding/binary"
10 : "io"
11 : "sync"
12 : "unsafe"
13 :
14 : "github.com/cockroachdb/errors"
15 : "github.com/cockroachdb/pebble/internal/base"
16 : "github.com/cockroachdb/pebble/internal/invariants"
17 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
18 : "golang.org/x/exp/rand"
19 : )
20 :
21 : // Value blocks are supported in TableFormatPebblev3.
22 : //
23 : // 1. Motivation and overview
24 : //
25 : // Value blocks are a mechanism designed for sstables storing MVCC data, where
26 : // there can be many versions of a key that need to be kept, but only the
27 : // latest value is typically read (see the documentation for Comparer.Split
28 : // regarding MVCC keys). The goal is faster reads. Unlike Pebble versions,
29 : // which can be eagerly thrown away (except when there are snapshots), MVCC
30 : // versions are long-lived (e.g. default CockroachDB garbage collection
31 : // threshold for older versions is 24 hours) and can significantly slow down
32 : // reads. We have seen CockroachDB production workloads with very slow reads
33 : // due to:
34 : // - 100s of versions for each key in a table.
35 : //
36 : // - Tables with mostly MVCC garbage consisting of 2 versions per key -- a
37 : // real key-value pair, followed by a key-value pair whose value (usually
38 : // with zero byte length) indicates it is an MVCC tombstone.
39 : //
40 : // The value blocks mechanism attempts to improve read throughput in these
41 : // cases when the key size is smaller than the value sizes of older versions.
42 : // This is done by moving the value of an older version to a value block in a
43 : // different part of the sstable. This improves spatial locality of the data
44 : // being read by the workload, which increases caching effectiveness.
45 : //
46 : // Additionally, even when the key size is not smaller than the value of older
47 : // versions (e.g. secondary indexes in CockroachDB), TableFormatPebblev3
48 : // stores the result of key comparisons done at write time inside the sstable,
49 : // which makes stepping from one key prefix to the next prefix (i.e., skipping
50 : // over older versions of a MVCC key) more efficient by avoiding key
51 : // comparisons and key decoding. See the results in
52 : // https://github.com/cockroachdb/pebble/pull/2149 and more details in the
53 : // comment inside BenchmarkIteratorScanNextPrefix. These improvements are also
54 : // visible in end-to-end CockroachDB tests, as outlined in
55 : // https://github.com/cockroachdb/cockroach/pull/96652.
56 : //
57 : // In TableFormatPebblev3, each SET has a one byte value prefix that tells us
58 : // whether the value is in-place or in a value block. This 1 byte prefix
59 : // encodes additional information:
60 : //
61 : // - ShortAttribute: This is an attribute of the value. Currently, CockroachDB
62 : // uses it to represent whether the value is a tombstone or not. This avoids
63 : // the need to fetch a value from the value block if the caller only wants
64 : // to figure out whether it is an MVCC tombstone. The length of the value is
65 : // another attribute that the caller can be interested in, and it is also
66 : // accessible without reading the value in the value block (see the value
67 : // handle in the details section).
68 : //
69 : // - SET-same-prefix: this enables the aforementioned optimization when
70 : // stepping from one key prefix to the next key prefix.
71 : //
72 : // We further optimize this iteration over prefixes by using the restart
73 : // points in a block to encode whether the SET at a restart point has the same
74 : // prefix since the last restart point. This allows us to skip over restart
75 : // points within the same block. See the comment in blockWriter, and how both
76 : // SET-same-prefix and the restart point information is used in
77 : // blockIter.nextPrefixV3.
78 : //
79 : // This flexibility of values that are in-place or in value blocks requires
80 : // flexibility in the iterator interface. The InternalIterator interface
81 : // returns a LazyValue instead of a byte slice. Additionally, pebble.Iterator
82 : // allows the caller to ask for a LazyValue. See lazy_value.go for details,
83 : // including the memory lifetime management.
84 : //
85 : // For historical discussions about this feature, see the issue
86 : // https://github.com/cockroachdb/pebble/issues/1170 and the prototype in
87 : // https://github.com/cockroachdb/pebble/pull/1443.
88 : //
89 : // The code in this file mainly covers value block and related encodings. We
90 : // discuss these in the next section.
91 : //
92 : // 2. Details
93 : //
94 : // Note that the notion of the latest value is local to the sstable. It is
95 : // possible that that latest value has been deleted by a sstable in a higher
96 : // level, and what is the latest value from the perspective of the whole LSM
97 : // is an older MVCC version. This only affects performance and not
98 : // correctness. This local knowledge is also why we continue to store these
99 : // older versions in the same sstable -- we need to be able to conveniently
100 : // read them. The code in this file is agnostic to the policy regarding what
101 : // should be stored in value blocks -- it allows even the latest MVCC version
102 : // to be stored in a value block. The policy decision in made in the
103 : // sstable.Writer. See Writer.makeAddPointDecisionV3.
104 : //
105 : // Data blocks contain two kinds of SET keys: those with in-place values and
106 : // those with a value handle. To distinguish these two cases we use a single
107 : // byte prefix (valuePrefix). This single byte prefix is split into multiple
108 : // parts, where nb represents information that is encoded in n bits.
109 : //
110 : // +---------------+--------------------+-----------+--------------------+
111 : // | value-kind 2b | SET-same-prefix 1b | unused 2b | short-attribute 3b |
112 : // +---------------+--------------------+-----------+--------------------+
113 : //
114 : // The 2 bit value-kind specifies whether this is an in-place value or a value
115 : // handle pointing to a value block. We use 2 bits here for future
116 : // representation of values that are in separate files. The 1 bit
117 : // SET-same-prefix is true if this key is a SET and is immediately preceded by
118 : // a SET that shares the same prefix. The 3 bit short-attribute is described
119 : // in base.ShortAttribute -- it stores user-defined attributes about the
120 : // value. It is unused for in-place values.
121 : //
122 : // Value Handle and Value Blocks:
123 : // valueHandles refer to values in value blocks. Value blocks are simpler than
124 : // normal data blocks (that contain key-value pairs, and allow for binary
125 : // search), which makes them cheap for value retrieval purposes. A valueHandle
126 : // is a tuple (valueLen, blockNum, offsetInBlock), where blockNum is the 0
127 : // indexed value block number and offsetInBlock is the byte offset in that
128 : // block containing the value. The valueHandle.valueLen is included since
129 : // there are multiple use cases in CockroachDB that need the value length but
130 : // not the value, for which we can avoid reading the value in the value block
131 : // (see
132 : // https://github.com/cockroachdb/pebble/issues/1170#issuecomment-958203245).
133 : //
134 : // A value block has a checksum like other blocks, and is optionally
135 : // compressed. An uncompressed value block is a sequence of values with no
136 : // separator or length (we rely on the valueHandle to demarcate). The
137 : // valueHandle.offsetInBlock points to the value, of length
138 : // valueHandle.valueLen. While writing a sstable, all the (possibly
139 : // compressed) value blocks need to be held in-memory until they can be
140 : // written. Value blocks are placed after the "meta rangedel" and "meta range
141 : // key" blocks since value blocks are considered less likely to be read.
142 : //
143 : // Meta Value Index Block:
144 : // Since the (key, valueHandle) pair are written before there is any knowledge
145 : // of the byte offset of the value block in the file, or its compressed
146 : // length, we need another lookup to map the valueHandle.blockNum to the
147 : // information needed to read it from the file. This information is provided
148 : // by the "value index block". The "value index block" is referred to by the
149 : // metaindex block. The design intentionally avoids making the "value index
150 : // block" a general purpose key-value block, since each caller wants to lookup
151 : // the information for a particular blockNum (there is no need for SeekGE
152 : // etc.). Instead, this index block stores a sequence of (blockNum,
153 : // blockOffset, blockLength) tuples, where the blockNums are consecutive
154 : // integers, and the tuples are encoded with a fixed width encoding. This
155 : // allows a reader to find the tuple for block K by looking at the offset
156 : // K*fixed-width. The fixed width for each field is decided by looking at the
157 : // maximum value of each of these fields. As a concrete example of a large
158 : // sstable with many value blocks, we constructed a 100MB sstable with many
159 : // versions and had 2475 value blocks (~32KB each). This sstable had this
160 : // tuple encoded using 2+4+2=8 bytes, which means the uncompressed value index
161 : // block was 2475*8=~19KB, which is modest. Therefore, we don't support more
162 : // than one value index block. Consider the example of 2 byte blockNum, 4 byte
163 : // blockOffset and 2 byte blockLen. The value index block will look like:
164 : //
165 : // +---------------+------------------+---------------+
166 : // | blockNum (2B) | blockOffset (4B) | blockLen (2B) |
167 : // +---------------+------------------+---------------+
168 : // | 0 | 7,123,456 | 30,000 |
169 : // +---------------+------------------+---------------+
170 : // | 1 | 7,153,456 | 20,000 |
171 : // +---------------+------------------+---------------+
172 : // | 2 | 7,173,456 | 25,567 |
173 : // +---------------+------------------+---------------+
174 : // | .... | ... | ... |
175 : //
176 : //
177 : // The metaindex block contains the valueBlocksIndexHandle which in addition
178 : // to the BlockHandle also specifies the widths of these tuple fields. In the
179 : // above example, the
180 : // valueBlockIndexHandle.{blockNumByteLength,blockOffsetByteLength,blockLengthByteLength}
181 : // will be (2,4,2).
182 :
183 : // valueHandle is stored with a key when the value is in a value block. This
184 : // handle is the pointer to that value.
185 : type valueHandle struct {
186 : valueLen uint32
187 : blockNum uint32
188 : offsetInBlock uint32
189 : }
190 :
191 : // valuePrefix is the single byte prefix for either the in-place value or the
192 : // encoded valueHandle. It encoded multiple kinds of information.
193 : type valuePrefix byte
194 :
195 : const (
196 : // 2 most-significant bits of valuePrefix encodes the value-kind.
197 : valueKindMask valuePrefix = '\xC0'
198 : valueKindIsValueHandle valuePrefix = '\x80'
199 : valueKindIsInPlaceValue valuePrefix = '\x00'
200 :
201 : // 1 bit indicates SET has same key prefix as immediately preceding key that
202 : // is also a SET. If the immediately preceding key in the same block is a
203 : // SET, AND this bit is 0, the prefix must have changed.
204 : //
205 : // Note that the current policy of only storing older MVCC versions in value
206 : // blocks means that valueKindIsValueHandle => SET has same prefix. But no
207 : // code should rely on this behavior. Also, SET has same prefix does *not*
208 : // imply valueKindIsValueHandle.
209 : setHasSameKeyPrefixMask valuePrefix = '\x20'
210 :
211 : // 3 least-significant bits for the user-defined base.ShortAttribute.
212 : // Undefined for valueKindIsInPlaceValue.
213 : userDefinedShortAttributeMask valuePrefix = '\x07'
214 : )
215 :
216 : // valueHandle fields are varint encoded, so maximum 5 bytes each, plus 1 byte
217 : // for the valuePrefix. This could alternatively be group varint encoded, but
218 : // experiments were inconclusive
219 : // (https://github.com/cockroachdb/pebble/pull/1443#issuecomment-1270298802).
220 : const valueHandleMaxLen = 5*3 + 1
221 :
222 : // Assert blockHandleLikelyMaxLen >= valueHandleMaxLen.
223 : const _ = uint(blockHandleLikelyMaxLen - valueHandleMaxLen)
224 :
225 2 : func encodeValueHandle(dst []byte, v valueHandle) int {
226 2 : n := 0
227 2 : n += binary.PutUvarint(dst[n:], uint64(v.valueLen))
228 2 : n += binary.PutUvarint(dst[n:], uint64(v.blockNum))
229 2 : n += binary.PutUvarint(dst[n:], uint64(v.offsetInBlock))
230 2 : return n
231 2 : }
232 :
233 2 : func makePrefixForValueHandle(setHasSameKeyPrefix bool, attribute base.ShortAttribute) valuePrefix {
234 2 : prefix := valueKindIsValueHandle | valuePrefix(attribute)
235 2 : if setHasSameKeyPrefix {
236 2 : prefix = prefix | setHasSameKeyPrefixMask
237 2 : }
238 2 : return prefix
239 : }
240 :
241 2 : func makePrefixForInPlaceValue(setHasSameKeyPrefix bool) valuePrefix {
242 2 : prefix := valueKindIsInPlaceValue
243 2 : if setHasSameKeyPrefix {
244 2 : prefix = prefix | setHasSameKeyPrefixMask
245 2 : }
246 2 : return prefix
247 : }
248 :
249 2 : func isValueHandle(b valuePrefix) bool {
250 2 : return b&valueKindMask == valueKindIsValueHandle
251 2 : }
252 :
253 : // REQUIRES: isValueHandle(b)
254 2 : func getShortAttribute(b valuePrefix) base.ShortAttribute {
255 2 : return base.ShortAttribute(b & userDefinedShortAttributeMask)
256 2 : }
257 :
258 2 : func setHasSamePrefix(b valuePrefix) bool {
259 2 : return b&setHasSameKeyPrefixMask == setHasSameKeyPrefixMask
260 2 : }
261 :
262 2 : func decodeLenFromValueHandle(src []byte) (uint32, []byte) {
263 2 : ptr := unsafe.Pointer(&src[0])
264 2 : var v uint32
265 2 : if a := *((*uint8)(ptr)); a < 128 {
266 2 : v = uint32(a)
267 2 : src = src[1:]
268 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
269 0 : v = uint32(b)<<7 | uint32(a)
270 0 : src = src[2:]
271 1 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
272 0 : v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
273 0 : src = src[3:]
274 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
275 0 : v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
276 0 : src = src[4:]
277 1 : } else {
278 1 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
279 1 : v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
280 1 : src = src[5:]
281 1 : }
282 2 : return v, src
283 : }
284 :
285 2 : func decodeRemainingValueHandle(src []byte) valueHandle {
286 2 : var vh valueHandle
287 2 : ptr := unsafe.Pointer(&src[0])
288 2 : // Manually inlined uvarint decoding. Saves ~25% in benchmarks. Unrolling
289 2 : // a loop for i:=0; i<2; i++, saves ~6%.
290 2 : var v uint32
291 2 : if a := *((*uint8)(ptr)); a < 128 {
292 2 : v = uint32(a)
293 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
294 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
295 1 : v = uint32(b)<<7 | uint32(a)
296 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
297 2 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
298 1 : v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
299 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
300 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
301 0 : v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
302 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
303 1 : } else {
304 1 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
305 1 : v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
306 1 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
307 1 : }
308 2 : vh.blockNum = v
309 2 :
310 2 : if a := *((*uint8)(ptr)); a < 128 {
311 2 : v = uint32(a)
312 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
313 2 : v = uint32(b)<<7 | uint32(a)
314 2 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
315 0 : v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
316 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
317 0 : v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
318 1 : } else {
319 1 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
320 1 : v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
321 1 : }
322 2 : vh.offsetInBlock = v
323 2 :
324 2 : return vh
325 : }
326 :
327 1 : func decodeValueHandle(src []byte) valueHandle {
328 1 : valLen, src := decodeLenFromValueHandle(src)
329 1 : vh := decodeRemainingValueHandle(src)
330 1 : vh.valueLen = valLen
331 1 : return vh
332 1 : }
333 :
334 : // valueBlocksIndexHandle is placed in the metaindex if there are any value
335 : // blocks. If there are no value blocks, there is no value blocks index, and
336 : // no entry in the metaindex. Note that the lack of entry in the metaindex
337 : // should not be used to ascertain whether the values are prefixed, since the
338 : // former is an emergent property of the data that was written and not known
339 : // until all the key-value pairs in the sstable are written.
340 : type valueBlocksIndexHandle struct {
341 : h BlockHandle
342 : blockNumByteLength uint8
343 : blockOffsetByteLength uint8
344 : blockLengthByteLength uint8
345 : }
346 :
347 : const valueBlocksIndexHandleMaxLen = blockHandleMaxLenWithoutProperties + 3
348 :
349 : // Assert blockHandleLikelyMaxLen >= valueBlocksIndexHandleMaxLen.
350 : const _ = uint(blockHandleLikelyMaxLen - valueBlocksIndexHandleMaxLen)
351 :
352 2 : func encodeValueBlocksIndexHandle(dst []byte, v valueBlocksIndexHandle) int {
353 2 : n := encodeBlockHandle(dst, v.h)
354 2 : dst[n] = v.blockNumByteLength
355 2 : n++
356 2 : dst[n] = v.blockOffsetByteLength
357 2 : n++
358 2 : dst[n] = v.blockLengthByteLength
359 2 : n++
360 2 : return n
361 2 : }
362 :
363 2 : func decodeValueBlocksIndexHandle(src []byte) (valueBlocksIndexHandle, int, error) {
364 2 : var vbih valueBlocksIndexHandle
365 2 : var n int
366 2 : vbih.h, n = decodeBlockHandle(src)
367 2 : if n <= 0 {
368 0 : return vbih, 0, errors.Errorf("bad BlockHandle %x", src)
369 0 : }
370 2 : if len(src) != n+3 {
371 0 : return vbih, 0, errors.Errorf("bad BlockHandle %x", src)
372 0 : }
373 2 : vbih.blockNumByteLength = src[n]
374 2 : vbih.blockOffsetByteLength = src[n+1]
375 2 : vbih.blockLengthByteLength = src[n+2]
376 2 : return vbih, n + 3, nil
377 : }
378 :
379 : type valueBlocksAndIndexStats struct {
380 : numValueBlocks uint64
381 : numValuesInValueBlocks uint64
382 : // Includes both value blocks and value index block.
383 : valueBlocksAndIndexSize uint64
384 : }
385 :
386 : // valueBlockWriter writes a sequence of value blocks, and the value blocks
387 : // index, for a sstable.
388 : type valueBlockWriter struct {
389 : // The configured uncompressed block size and size threshold
390 : blockSize, blockSizeThreshold int
391 : // Configured compression.
392 : compression Compression
393 : // checksummer with configured checksum type.
394 : checksummer checksummer
395 : // Block finished callback.
396 : blockFinishedFunc func(compressedSize int)
397 :
398 : // buf is the current block being written to (uncompressed).
399 : buf *blockBuffer
400 : // compressedBuf is used for compressing the block.
401 : compressedBuf *blockBuffer
402 : // Sequence of blocks that are finished.
403 : blocks []blockAndHandle
404 : // Cumulative value block bytes written so far.
405 : totalBlockBytes uint64
406 : numValues uint64
407 : }
408 :
409 : type blockAndHandle struct {
410 : block *blockBuffer
411 : handle BlockHandle
412 : compressed bool
413 : }
414 :
415 : type blockBuffer struct {
416 : b []byte
417 : }
418 :
419 : // Pool of block buffers that should be roughly the blockSize.
420 : var uncompressedValueBlockBufPool = sync.Pool{
421 2 : New: func() interface{} {
422 2 : return &blockBuffer{}
423 2 : },
424 : }
425 :
426 : // Pool of block buffers for compressed value blocks. These may widely vary in
427 : // size based on compression ratios.
428 : var compressedValueBlockBufPool = sync.Pool{
429 2 : New: func() interface{} {
430 2 : return &blockBuffer{}
431 2 : },
432 : }
433 :
434 2 : func releaseToValueBlockBufPool(pool *sync.Pool, b *blockBuffer) {
435 2 : // Don't pool buffers larger than 128KB, in case we had some rare large
436 2 : // values.
437 2 : if len(b.b) > 128*1024 {
438 0 : return
439 0 : }
440 2 : if invariants.Enabled {
441 2 : // Set the bytes to a random value. Cap the number of bytes being
442 2 : // randomized to prevent test timeouts.
443 2 : length := cap(b.b)
444 2 : if length > 1000 {
445 2 : length = 1000
446 2 : }
447 2 : b.b = b.b[:length:length]
448 2 : rand.Read(b.b)
449 : }
450 2 : pool.Put(b)
451 : }
452 :
453 : var valueBlockWriterPool = sync.Pool{
454 2 : New: func() interface{} {
455 2 : return &valueBlockWriter{}
456 2 : },
457 : }
458 :
459 : func newValueBlockWriter(
460 : blockSize int,
461 : blockSizeThreshold int,
462 : compression Compression,
463 : checksumType ChecksumType,
464 : // compressedSize should exclude the block trailer.
465 : blockFinishedFunc func(compressedSize int),
466 2 : ) *valueBlockWriter {
467 2 : w := valueBlockWriterPool.Get().(*valueBlockWriter)
468 2 : *w = valueBlockWriter{
469 2 : blockSize: blockSize,
470 2 : blockSizeThreshold: blockSizeThreshold,
471 2 : compression: compression,
472 2 : checksummer: checksummer{
473 2 : checksumType: checksumType,
474 2 : },
475 2 : blockFinishedFunc: blockFinishedFunc,
476 2 : buf: uncompressedValueBlockBufPool.Get().(*blockBuffer),
477 2 : compressedBuf: compressedValueBlockBufPool.Get().(*blockBuffer),
478 2 : blocks: w.blocks[:0],
479 2 : }
480 2 : w.buf.b = w.buf.b[:0]
481 2 : w.compressedBuf.b = w.compressedBuf.b[:0]
482 2 : return w
483 2 : }
484 :
485 2 : func releaseValueBlockWriter(w *valueBlockWriter) {
486 2 : for i := range w.blocks {
487 2 : if w.blocks[i].compressed {
488 2 : releaseToValueBlockBufPool(&compressedValueBlockBufPool, w.blocks[i].block)
489 2 : } else {
490 2 : releaseToValueBlockBufPool(&uncompressedValueBlockBufPool, w.blocks[i].block)
491 2 : }
492 2 : w.blocks[i].block = nil
493 : }
494 2 : if w.buf != nil {
495 2 : releaseToValueBlockBufPool(&uncompressedValueBlockBufPool, w.buf)
496 2 : }
497 2 : if w.compressedBuf != nil {
498 2 : releaseToValueBlockBufPool(&compressedValueBlockBufPool, w.compressedBuf)
499 2 : }
500 2 : *w = valueBlockWriter{
501 2 : blocks: w.blocks[:0],
502 2 : }
503 2 : valueBlockWriterPool.Put(w)
504 : }
505 :
506 2 : func (w *valueBlockWriter) addValue(v []byte) (valueHandle, error) {
507 2 : if invariants.Enabled && len(v) == 0 {
508 0 : return valueHandle{}, errors.Errorf("cannot write empty value to value block")
509 0 : }
510 2 : w.numValues++
511 2 : blockLen := len(w.buf.b)
512 2 : valueLen := len(v)
513 2 : if blockLen >= w.blockSize ||
514 2 : (blockLen > w.blockSizeThreshold && blockLen+valueLen > w.blockSize) {
515 2 : // Block is not currently empty and adding this value will become too big,
516 2 : // so finish this block.
517 2 : w.compressAndFlush()
518 2 : blockLen = len(w.buf.b)
519 2 : if invariants.Enabled && blockLen != 0 {
520 0 : panic("blockLen of new block should be 0")
521 : }
522 : }
523 2 : vh := valueHandle{
524 2 : valueLen: uint32(valueLen),
525 2 : blockNum: uint32(len(w.blocks)),
526 2 : offsetInBlock: uint32(blockLen),
527 2 : }
528 2 : blockLen = int(vh.offsetInBlock + vh.valueLen)
529 2 : if cap(w.buf.b) < blockLen {
530 2 : size := 2 * cap(w.buf.b)
531 2 : if size < 1024 {
532 2 : size = 1024
533 2 : }
534 2 : for size < blockLen {
535 0 : size *= 2
536 0 : }
537 2 : buf := make([]byte, blockLen, size)
538 2 : _ = copy(buf, w.buf.b)
539 2 : w.buf.b = buf
540 2 : } else {
541 2 : w.buf.b = w.buf.b[:blockLen]
542 2 : }
543 2 : buf := w.buf.b[vh.offsetInBlock:]
544 2 : n := copy(buf, v)
545 2 : if n != len(buf) {
546 0 : panic("incorrect length computation")
547 : }
548 2 : return vh, nil
549 : }
550 :
551 2 : func (w *valueBlockWriter) compressAndFlush() {
552 2 : // Compress the buffer, discarding the result if the improvement isn't at
553 2 : // least 12.5%.
554 2 : blockType := noCompressionBlockType
555 2 : b := w.buf
556 2 : if w.compression != NoCompression {
557 2 : blockType, w.compressedBuf.b =
558 2 : compressBlock(w.compression, w.buf.b, w.compressedBuf.b[:cap(w.compressedBuf.b)])
559 2 : if len(w.compressedBuf.b) < len(w.buf.b)-len(w.buf.b)/8 {
560 2 : b = w.compressedBuf
561 2 : } else {
562 2 : blockType = noCompressionBlockType
563 2 : }
564 : }
565 2 : n := len(b.b)
566 2 : if n+blockTrailerLen > cap(b.b) {
567 2 : block := make([]byte, n+blockTrailerLen)
568 2 : copy(block, b.b)
569 2 : b.b = block
570 2 : } else {
571 2 : b.b = b.b[:n+blockTrailerLen]
572 2 : }
573 2 : b.b[n] = byte(blockType)
574 2 : w.computeChecksum(b.b)
575 2 : bh := BlockHandle{Offset: w.totalBlockBytes, Length: uint64(n)}
576 2 : w.totalBlockBytes += uint64(len(b.b))
577 2 : // blockFinishedFunc length excludes the block trailer.
578 2 : w.blockFinishedFunc(n)
579 2 : compressed := blockType != noCompressionBlockType
580 2 : w.blocks = append(w.blocks, blockAndHandle{
581 2 : block: b,
582 2 : handle: bh,
583 2 : compressed: compressed,
584 2 : })
585 2 : // Handed off a buffer to w.blocks, so need get a new one.
586 2 : if compressed {
587 2 : w.compressedBuf = compressedValueBlockBufPool.Get().(*blockBuffer)
588 2 : } else {
589 2 : w.buf = uncompressedValueBlockBufPool.Get().(*blockBuffer)
590 2 : }
591 2 : w.buf.b = w.buf.b[:0]
592 : }
593 :
594 2 : func (w *valueBlockWriter) computeChecksum(block []byte) {
595 2 : n := len(block) - blockTrailerLen
596 2 : checksum := w.checksummer.checksum(block[:n], block[n:n+1])
597 2 : binary.LittleEndian.PutUint32(block[n+1:], checksum)
598 2 : }
599 :
600 : func (w *valueBlockWriter) finish(
601 : writer io.Writer, fileOffset uint64,
602 2 : ) (valueBlocksIndexHandle, valueBlocksAndIndexStats, error) {
603 2 : if len(w.buf.b) > 0 {
604 2 : w.compressAndFlush()
605 2 : }
606 2 : n := len(w.blocks)
607 2 : if n == 0 {
608 2 : return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, nil
609 2 : }
610 2 : largestOffset := uint64(0)
611 2 : largestLength := uint64(0)
612 2 : for i := range w.blocks {
613 2 : _, err := writer.Write(w.blocks[i].block.b)
614 2 : if err != nil {
615 0 : return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, err
616 0 : }
617 2 : w.blocks[i].handle.Offset += fileOffset
618 2 : largestOffset = w.blocks[i].handle.Offset
619 2 : if largestLength < w.blocks[i].handle.Length {
620 2 : largestLength = w.blocks[i].handle.Length
621 2 : }
622 : }
623 2 : vbihOffset := fileOffset + w.totalBlockBytes
624 2 :
625 2 : vbih := valueBlocksIndexHandle{
626 2 : h: BlockHandle{
627 2 : Offset: vbihOffset,
628 2 : },
629 2 : blockNumByteLength: uint8(lenLittleEndian(uint64(n - 1))),
630 2 : blockOffsetByteLength: uint8(lenLittleEndian(largestOffset)),
631 2 : blockLengthByteLength: uint8(lenLittleEndian(largestLength)),
632 2 : }
633 2 : var err error
634 2 : if vbih, err = w.writeValueBlocksIndex(writer, vbih); err != nil {
635 0 : return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, err
636 0 : }
637 2 : stats := valueBlocksAndIndexStats{
638 2 : numValueBlocks: uint64(n),
639 2 : numValuesInValueBlocks: w.numValues,
640 2 : valueBlocksAndIndexSize: w.totalBlockBytes + vbih.h.Length + blockTrailerLen,
641 2 : }
642 2 : return vbih, stats, err
643 : }
644 :
645 : func (w *valueBlockWriter) writeValueBlocksIndex(
646 : writer io.Writer, h valueBlocksIndexHandle,
647 2 : ) (valueBlocksIndexHandle, error) {
648 2 : blockLen :=
649 2 : int(h.blockNumByteLength+h.blockOffsetByteLength+h.blockLengthByteLength) * len(w.blocks)
650 2 : h.h.Length = uint64(blockLen)
651 2 : blockLen += blockTrailerLen
652 2 : var buf []byte
653 2 : if cap(w.buf.b) < blockLen {
654 2 : buf = make([]byte, blockLen)
655 2 : w.buf.b = buf
656 2 : } else {
657 2 : buf = w.buf.b[:blockLen]
658 2 : }
659 2 : b := buf
660 2 : for i := range w.blocks {
661 2 : littleEndianPut(uint64(i), b, int(h.blockNumByteLength))
662 2 : b = b[int(h.blockNumByteLength):]
663 2 : littleEndianPut(w.blocks[i].handle.Offset, b, int(h.blockOffsetByteLength))
664 2 : b = b[int(h.blockOffsetByteLength):]
665 2 : littleEndianPut(w.blocks[i].handle.Length, b, int(h.blockLengthByteLength))
666 2 : b = b[int(h.blockLengthByteLength):]
667 2 : }
668 2 : if len(b) != blockTrailerLen {
669 0 : panic("incorrect length calculation")
670 : }
671 2 : b[0] = byte(noCompressionBlockType)
672 2 : w.computeChecksum(buf)
673 2 : if _, err := writer.Write(buf); err != nil {
674 0 : return valueBlocksIndexHandle{}, err
675 0 : }
676 2 : return h, nil
677 : }
678 :
679 : // littleEndianPut writes v to b using little endian encoding, under the
680 : // assumption that v can be represented using n bytes.
681 2 : func littleEndianPut(v uint64, b []byte, n int) {
682 2 : _ = b[n-1] // bounds check
683 2 : for i := 0; i < n; i++ {
684 2 : b[i] = byte(v)
685 2 : v = v >> 8
686 2 : }
687 : }
688 :
689 : // lenLittleEndian returns the minimum number of bytes needed to encode v
690 : // using little endian encoding.
691 2 : func lenLittleEndian(v uint64) int {
692 2 : n := 0
693 2 : for i := 0; i < 8; i++ {
694 2 : n++
695 2 : v = v >> 8
696 2 : if v == 0 {
697 2 : break
698 : }
699 : }
700 2 : return n
701 : }
702 :
703 2 : func littleEndianGet(b []byte, n int) uint64 {
704 2 : _ = b[n-1] // bounds check
705 2 : v := uint64(b[0])
706 2 : for i := 1; i < n; i++ {
707 2 : v |= uint64(b[i]) << (8 * i)
708 2 : }
709 2 : return v
710 : }
711 :
712 : // UserKeyPrefixBound represents a [Lower,Upper) bound of user key prefixes.
713 : // If both are nil, there is no bound specified. Else, Compare(Lower,Upper)
714 : // must be < 0.
715 : type UserKeyPrefixBound struct {
716 : // Lower is a lower bound user key prefix.
717 : Lower []byte
718 : // Upper is an upper bound user key prefix.
719 : Upper []byte
720 : }
721 :
722 : // IsEmpty returns true iff the bound is empty.
723 2 : func (ukb *UserKeyPrefixBound) IsEmpty() bool {
724 2 : return len(ukb.Lower) == 0 && len(ukb.Upper) == 0
725 2 : }
726 :
727 : type blockProviderWhenOpen interface {
728 : readBlockForVBR(
729 : h BlockHandle, stats *base.InternalIteratorStats,
730 : ) (bufferHandle, error)
731 : }
732 :
733 : type blockProviderWhenClosed struct {
734 : rp ReaderProvider
735 : r *Reader
736 : }
737 :
738 2 : func (bpwc *blockProviderWhenClosed) open() error {
739 2 : var err error
740 2 : bpwc.r, err = bpwc.rp.GetReader()
741 2 : return err
742 2 : }
743 :
744 2 : func (bpwc *blockProviderWhenClosed) close() {
745 2 : bpwc.rp.Close()
746 2 : bpwc.r = nil
747 2 : }
748 :
749 : func (bpwc blockProviderWhenClosed) readBlockForVBR(
750 : h BlockHandle, stats *base.InternalIteratorStats,
751 2 : ) (bufferHandle, error) {
752 2 : // This is rare, since most block reads happen when the corresponding
753 2 : // sstable iterator is open. So we are willing to sacrifice a proper context
754 2 : // for tracing.
755 2 : //
756 2 : // TODO(sumeer): consider fixing this. See
757 2 : // https://github.com/cockroachdb/pebble/pull/3065#issue-1991175365 for an
758 2 : // alternative.
759 2 : ctx := objiotracing.WithBlockType(context.Background(), objiotracing.ValueBlock)
760 2 : // TODO(jackson,sumeer): Consider whether to use a buffer pool in this case.
761 2 : // The bpwc is not allowed to outlive the iterator tree, so it cannot
762 2 : // outlive the buffer pool.
763 2 : return bpwc.r.readBlock(
764 2 : ctx, h, nil, nil, stats, nil /* iterStats */, nil /* buffer pool */)
765 2 : }
766 :
767 : // ReaderProvider supports the implementation of blockProviderWhenClosed.
768 : // GetReader and Close can be called multiple times in pairs.
769 : type ReaderProvider interface {
770 : GetReader() (r *Reader, err error)
771 : Close()
772 : }
773 :
774 : // TrivialReaderProvider implements ReaderProvider for a Reader that will
775 : // outlive the top-level iterator in the iterator tree.
776 : type TrivialReaderProvider struct {
777 : *Reader
778 : }
779 :
780 : var _ ReaderProvider = TrivialReaderProvider{}
781 :
782 : // GetReader implements ReaderProvider.
783 1 : func (trp TrivialReaderProvider) GetReader() (*Reader, error) {
784 1 : return trp.Reader, nil
785 1 : }
786 :
787 : // Close implements ReaderProvider.
788 1 : func (trp TrivialReaderProvider) Close() {}
789 :
790 : // valueBlockReader is used to retrieve values in value
791 : // blocks. It is used when the sstable was written with
792 : // Properties.ValueBlocksAreEnabled.
793 : type valueBlockReader struct {
794 : bpOpen blockProviderWhenOpen
795 : rp ReaderProvider
796 : vbih valueBlocksIndexHandle
797 : stats *base.InternalIteratorStats
798 :
799 : // The value blocks index is lazily retrieved the first time the reader
800 : // needs to read a value that resides in a value block.
801 : vbiBlock []byte
802 : vbiCache bufferHandle
803 : // When sequentially iterating through all key-value pairs, the cost of
804 : // repeatedly getting a block that is already in the cache and releasing the
805 : // bufferHandle can be ~40% of the cpu overhead. So the reader remembers the
806 : // last value block it retrieved, in case there is locality of access, and
807 : // this value block can be used for the next value retrieval.
808 : valueBlockNum uint32
809 : valueBlock []byte
810 : valueBlockPtr unsafe.Pointer
811 : valueCache bufferHandle
812 : lazyFetcher base.LazyFetcher
813 : closed bool
814 : bufToMangle []byte
815 : }
816 :
817 2 : func (r *valueBlockReader) getLazyValueForPrefixAndValueHandle(handle []byte) base.LazyValue {
818 2 : fetcher := &r.lazyFetcher
819 2 : valLen, h := decodeLenFromValueHandle(handle[1:])
820 2 : *fetcher = base.LazyFetcher{
821 2 : Fetcher: r,
822 2 : Attribute: base.AttributeAndLen{
823 2 : ValueLen: int32(valLen),
824 2 : ShortAttribute: getShortAttribute(valuePrefix(handle[0])),
825 2 : },
826 2 : }
827 2 : if r.stats != nil {
828 2 : r.stats.SeparatedPointValue.Count++
829 2 : r.stats.SeparatedPointValue.ValueBytes += uint64(valLen)
830 2 : }
831 2 : return base.LazyValue{
832 2 : ValueOrHandle: h,
833 2 : Fetcher: fetcher,
834 2 : }
835 : }
836 :
837 2 : func (r *valueBlockReader) close() {
838 2 : r.bpOpen = nil
839 2 : r.vbiBlock = nil
840 2 : r.vbiCache.Release()
841 2 : // Set the handle to empty since Release does not nil the Handle.value. If
842 2 : // we were to reopen this valueBlockReader and retrieve the same
843 2 : // Handle.value from the cache, we don't want to accidentally unref it when
844 2 : // attempting to unref the old handle.
845 2 : r.vbiCache = bufferHandle{}
846 2 : r.valueBlock = nil
847 2 : r.valueBlockPtr = nil
848 2 : r.valueCache.Release()
849 2 : // See comment above.
850 2 : r.valueCache = bufferHandle{}
851 2 : r.closed = true
852 2 : // rp, vbih, stats remain valid, so that LazyFetcher.ValueFetcher can be
853 2 : // implemented.
854 2 : }
855 :
856 : // Fetch implements base.ValueFetcher.
857 : func (r *valueBlockReader) Fetch(
858 : handle []byte, valLen int32, buf []byte,
859 2 : ) (val []byte, callerOwned bool, err error) {
860 2 : if !r.closed {
861 2 : val, err := r.getValueInternal(handle, valLen)
862 2 : if invariants.Enabled {
863 2 : val = r.doValueMangling(val)
864 2 : }
865 2 : return val, false, err
866 : }
867 :
868 2 : bp := blockProviderWhenClosed{rp: r.rp}
869 2 : err = bp.open()
870 2 : if err != nil {
871 0 : return nil, false, err
872 0 : }
873 2 : defer bp.close()
874 2 : defer r.close()
875 2 : r.bpOpen = bp
876 2 : var v []byte
877 2 : v, err = r.getValueInternal(handle, valLen)
878 2 : if err != nil {
879 0 : return nil, false, err
880 0 : }
881 2 : buf = append(buf[:0], v...)
882 2 : return buf, true, nil
883 : }
884 :
885 : // doValueMangling attempts to uncover violations of the contract listed in
886 : // the declaration comment of LazyValue. It is expensive, hence only called
887 : // when invariants.Enabled.
888 2 : func (r *valueBlockReader) doValueMangling(v []byte) []byte {
889 2 : // Randomly set the bytes in the previous retrieved value to 0, since
890 2 : // property P1 only requires the valueBlockReader to maintain the memory of
891 2 : // one fetched value.
892 2 : if rand.Intn(2) == 0 {
893 2 : clear(r.bufToMangle)
894 2 : }
895 : // Store the current value in a new buffer for future mangling.
896 2 : r.bufToMangle = append([]byte(nil), v...)
897 2 : return r.bufToMangle
898 : }
899 :
900 2 : func (r *valueBlockReader) getValueInternal(handle []byte, valLen int32) (val []byte, err error) {
901 2 : vh := decodeRemainingValueHandle(handle)
902 2 : vh.valueLen = uint32(valLen)
903 2 : if r.vbiBlock == nil {
904 2 : ch, err := r.bpOpen.readBlockForVBR(r.vbih.h, r.stats)
905 2 : if err != nil {
906 0 : return nil, err
907 0 : }
908 2 : r.vbiCache = ch
909 2 : r.vbiBlock = ch.Get()
910 : }
911 2 : if r.valueBlock == nil || r.valueBlockNum != vh.blockNum {
912 2 : vbh, err := r.getBlockHandle(vh.blockNum)
913 2 : if err != nil {
914 0 : return nil, err
915 0 : }
916 2 : vbCacheHandle, err := r.bpOpen.readBlockForVBR(vbh, r.stats)
917 2 : if err != nil {
918 0 : return nil, err
919 0 : }
920 2 : r.valueBlockNum = vh.blockNum
921 2 : r.valueCache.Release()
922 2 : r.valueCache = vbCacheHandle
923 2 : r.valueBlock = vbCacheHandle.Get()
924 2 : r.valueBlockPtr = unsafe.Pointer(&r.valueBlock[0])
925 : }
926 2 : if r.stats != nil {
927 2 : r.stats.SeparatedPointValue.ValueBytesFetched += uint64(valLen)
928 2 : }
929 2 : return r.valueBlock[vh.offsetInBlock : vh.offsetInBlock+vh.valueLen], nil
930 : }
931 :
932 2 : func (r *valueBlockReader) getBlockHandle(blockNum uint32) (BlockHandle, error) {
933 2 : indexEntryLen :=
934 2 : int(r.vbih.blockNumByteLength + r.vbih.blockOffsetByteLength + r.vbih.blockLengthByteLength)
935 2 : offsetInIndex := indexEntryLen * int(blockNum)
936 2 : if len(r.vbiBlock) < offsetInIndex+indexEntryLen {
937 0 : return BlockHandle{}, errors.Errorf(
938 0 : "cannot read at offset %d and length %d from block of length %d",
939 0 : offsetInIndex, indexEntryLen, len(r.vbiBlock))
940 0 : }
941 2 : b := r.vbiBlock[offsetInIndex : offsetInIndex+indexEntryLen]
942 2 : n := int(r.vbih.blockNumByteLength)
943 2 : bn := littleEndianGet(b, n)
944 2 : if uint32(bn) != blockNum {
945 0 : return BlockHandle{},
946 0 : errors.Errorf("expected block num %d but found %d", blockNum, bn)
947 0 : }
948 2 : b = b[n:]
949 2 : n = int(r.vbih.blockOffsetByteLength)
950 2 : blockOffset := littleEndianGet(b, n)
951 2 : b = b[n:]
952 2 : n = int(r.vbih.blockLengthByteLength)
953 2 : blockLen := littleEndianGet(b, n)
954 2 : return BlockHandle{Offset: blockOffset, Length: blockLen}, nil
955 : }
|