Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package block
6 :
7 : import (
8 : "bytes"
9 : "encoding/binary"
10 : "fmt"
11 :
12 : "github.com/cespare/xxhash/v2"
13 : "github.com/cockroachdb/errors"
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/crc"
16 : )
17 :
18 : // Handle is the file offset and length of a block.
19 : type Handle struct {
20 : Offset, Length uint64
21 : }
22 :
23 : // EncodeVarints encodes the block handle into dst using a variable-width
24 : // encoding and returns the number of bytes written.
25 1 : func (h Handle) EncodeVarints(dst []byte) int {
26 1 : n := binary.PutUvarint(dst, h.Offset)
27 1 : m := binary.PutUvarint(dst[n:], h.Length)
28 1 : return n + m
29 1 : }
30 :
31 : // HandleWithProperties is used for data blocks and first/lower level index
32 : // blocks, since they can be annotated using BlockPropertyCollectors.
33 : type HandleWithProperties struct {
34 : Handle
35 : Props []byte
36 : }
37 :
38 : // EncodeVarints encodes the block handle and properties into dst using a
39 : // variable-width encoding and returns the number of bytes written.
40 1 : func (h HandleWithProperties) EncodeVarints(dst []byte) []byte {
41 1 : n := h.Handle.EncodeVarints(dst)
42 1 : dst = append(dst[:n], h.Props...)
43 1 : return dst
44 1 : }
45 :
46 : // DecodeHandle returns the block handle encoded in a variable-width encoding at
47 : // the start of src, as well as the number of bytes it occupies. It returns zero
48 : // if given invalid input. A block handle for a data block or a first/lower
49 : // level index block should not be decoded using DecodeHandle since the caller
50 : // may validate that the number of bytes decoded is equal to the length of src,
51 : // which will be false if the properties are not decoded. In those cases the
52 : // caller should use DecodeHandleWithProperties.
53 1 : func DecodeHandle(src []byte) (Handle, int) {
54 1 : offset, n := binary.Uvarint(src)
55 1 : length, m := binary.Uvarint(src[n:])
56 1 : if n == 0 || m == 0 {
57 0 : return Handle{}, 0
58 0 : }
59 1 : return Handle{Offset: offset, Length: length}, n + m
60 : }
61 :
62 : // DecodeHandleWithProperties returns the block handle and properties encoded in
63 : // a variable-width encoding at the start of src. src needs to be exactly the
64 : // length that was encoded. This method must be used for data block and
65 : // first/lower level index blocks. The properties in the block handle point to
66 : // the bytes in src.
67 1 : func DecodeHandleWithProperties(src []byte) (HandleWithProperties, error) {
68 1 : bh, n := DecodeHandle(src)
69 1 : if n == 0 {
70 0 : return HandleWithProperties{}, errors.Errorf("invalid block.Handle")
71 0 : }
72 1 : return HandleWithProperties{
73 1 : Handle: bh,
74 1 : Props: src[n:],
75 1 : }, nil
76 : }
77 :
78 : // TrailerLen is the length of the trailer at the end of a block.
79 : const TrailerLen = 5
80 :
81 : // Trailer is the trailer at the end of a block, encoding the block type
82 : // (compression) and a checksum.
83 : type Trailer = [TrailerLen]byte
84 :
85 : // MakeTrailer constructs a trailer from a block type and a checksum.
86 1 : func MakeTrailer(blockType byte, checksum uint32) (t Trailer) {
87 1 : t[0] = blockType
88 1 : binary.LittleEndian.PutUint32(t[1:5], checksum)
89 1 : return t
90 1 : }
91 :
92 : // ChecksumType specifies the checksum used for blocks.
93 : type ChecksumType byte
94 :
95 : // The available checksum types. These values are part of the durable format and
96 : // should not be changed.
97 : const (
98 : ChecksumTypeNone ChecksumType = 0
99 : ChecksumTypeCRC32c ChecksumType = 1
100 : ChecksumTypeXXHash ChecksumType = 2
101 : ChecksumTypeXXHash64 ChecksumType = 3
102 : )
103 :
104 : // String implements fmt.Stringer.
105 0 : func (t ChecksumType) String() string {
106 0 : switch t {
107 0 : case ChecksumTypeCRC32c:
108 0 : return "crc32c"
109 0 : case ChecksumTypeNone:
110 0 : return "none"
111 0 : case ChecksumTypeXXHash:
112 0 : return "xxhash"
113 0 : case ChecksumTypeXXHash64:
114 0 : return "xxhash64"
115 0 : default:
116 0 : panic(errors.Newf("sstable: unknown checksum type: %d", t))
117 : }
118 : }
119 :
120 : // A Checksummer calculates checksums for blocks.
121 : type Checksummer struct {
122 : Type ChecksumType
123 : xxHasher *xxhash.Digest
124 : }
125 :
126 : // Checksum computes a checksum over the provided block and block type.
127 1 : func (c *Checksummer) Checksum(block []byte, blockType []byte) (checksum uint32) {
128 1 : // Calculate the checksum.
129 1 : switch c.Type {
130 1 : case ChecksumTypeCRC32c:
131 1 : checksum = crc.New(block).Update(blockType).Value()
132 0 : case ChecksumTypeXXHash64:
133 0 : if c.xxHasher == nil {
134 0 : c.xxHasher = xxhash.New()
135 0 : } else {
136 0 : c.xxHasher.Reset()
137 0 : }
138 0 : c.xxHasher.Write(block)
139 0 : c.xxHasher.Write(blockType)
140 0 : checksum = uint32(c.xxHasher.Sum64())
141 0 : default:
142 0 : panic(errors.Newf("unsupported checksum type: %d", c.Type))
143 : }
144 1 : return checksum
145 : }
146 :
147 : // DataBlockIterator is a type constraint for implementations of block iterators
148 : // over data blocks. It's currently satisifed by the *rowblk.Iter type.
149 : type DataBlockIterator interface {
150 : base.InternalIterator
151 :
152 : // Handle returns the handle to the block.
153 : Handle() BufferHandle
154 : // InitHandle initializes the block from the provided buffer handle.
155 : InitHandle(base.Compare, base.Split, BufferHandle, IterTransforms) error
156 : // Valid returns true if the iterator is currently positioned at a valid KV.
157 : Valid() bool
158 : // KV returns the key-value pair at the current iterator position. The
159 : // iterator must be Valid().
160 : KV() *base.InternalKV
161 : // IsLowerBound returns true if all keys produced by this iterator are >= the
162 : // given key. The function is best effort; false negatives are allowed.
163 : //
164 : // If IsLowerBound is true then Compare(First().UserKey, k) >= 0.
165 : //
166 : // If the iterator produces no keys (i.e. First() is nil), IsLowerBound can
167 : // return true for any key.
168 : IsLowerBound(k []byte) bool
169 : // Invalidate invalidates the block iterator, removing references to the
170 : // block it was initialized with. The iterator may continue to be used after
171 : // a call to Invalidate, but all positioning methods should return false.
172 : // Valid() must also return false.
173 : Invalidate()
174 : // IsDataInvalidated returns true when the iterator has been invalidated
175 : // using an Invalidate call.
176 : //
177 : // NB: this is different from Valid which indicates whether the current *KV*
178 : // is valid.
179 : IsDataInvalidated() bool
180 : }
181 :
182 : // IndexBlockIterator is an interface for implementations of block
183 : // iterators over index blocks. It's currently satisifed by the
184 : // *rowblk.IndexIter type.
185 : type IndexBlockIterator interface {
186 : // Init initializes the block iterator from the provided block.
187 : Init(base.Compare, base.Split, []byte, IterTransforms) error
188 : // InitHandle initializes an iterator from the provided block handle.
189 : InitHandle(base.Compare, base.Split, BufferHandle, IterTransforms) error
190 : // Valid returns true if the iterator is currently positioned at a valid
191 : // block handle.
192 : Valid() bool
193 : // IsDataInvalidated returns true when the iterator has been invalidated
194 : // using an Invalidate call.
195 : //
196 : // NB: this is different from Valid which indicates whether the iterator is
197 : // currently positioned over a valid block entry.
198 : IsDataInvalidated() bool
199 : // Invalidate invalidates the block iterator, removing references to the
200 : // block it was initialized with. The iterator may continue to be used after
201 : // a call to Invalidate, but all positioning methods should return false.
202 : // Valid() must also return false.
203 : Invalidate()
204 : // Handle returns the underlying block buffer handle, if the iterator was
205 : // initialized with one.
206 : Handle() BufferHandle
207 : // Separator returns the separator at the iterator's current position. The
208 : // iterator must be positioned at a valid row. A Separator is a user key
209 : // guaranteed to be greater than or equal to every key contained within the
210 : // referenced block(s).
211 : Separator() []byte
212 : // BlockHandleWithProperties decodes the block handle with any encoded
213 : // properties at the iterator's current position.
214 : BlockHandleWithProperties() (HandleWithProperties, error)
215 : // SeekGE seeks the index iterator to the first block entry with a separator
216 : // key greater or equal to the given key. If it returns true, the iterator
217 : // is positioned over the first block that might contain the key [key], and
218 : // following blocks have keys ≥ Separator(). It returns false if the seek
219 : // key is greater than all index block separators.
220 : SeekGE(key []byte) bool
221 : // First seeks index iterator to the first block entry. It returns false if
222 : // the index block is empty.
223 : First() bool
224 : // Last seeks index iterator to the last block entry. It returns false if
225 : // the index block is empty.
226 : Last() bool
227 : // Next steps the index iterator to the next block entry. It returns false
228 : // if the index block is exhausted in the forward direction. A call to Next
229 : // while already exhausted in the forward direction is a no-op.
230 : Next() bool
231 : // Prev steps the index iterator to the previous block entry. It returns
232 : // false if the index block is exhausted in the reverse direction. A call to
233 : // Prev while already exhausted in the reverse direction is a no-op.
234 : Prev() bool
235 : // Close closes the iterator, releasing any resources it holds.
236 : Close() error
237 : }
238 :
239 : // IterTransforms allow on-the-fly transformation of data at iteration time.
240 : //
241 : // These transformations could in principle be implemented as block transforms
242 : // (at least for non-virtual sstables), but applying them during iteration is
243 : // preferable.
244 : type IterTransforms struct {
245 : // SyntheticSeqNum, if set, overrides the sequence number in all keys. It is
246 : // set if the sstable was ingested or it is foreign.
247 : SyntheticSeqNum SyntheticSeqNum
248 : // HideObsoletePoints, if true, skips over obsolete points during iteration.
249 : // This is the norm when the sstable is foreign or the largest sequence number
250 : // of the sstable is below the one we are reading.
251 : HideObsoletePoints bool
252 : SyntheticPrefix SyntheticPrefix
253 : SyntheticSuffix SyntheticSuffix
254 : }
255 :
256 : // NoTransforms is the default value for IterTransforms.
257 : var NoTransforms = IterTransforms{}
258 :
259 : // NoTransforms returns true if there are no transforms enabled.
260 1 : func (t *IterTransforms) NoTransforms() bool {
261 1 : return t.SyntheticSeqNum == 0 &&
262 1 : !t.HideObsoletePoints &&
263 1 : !t.SyntheticPrefix.IsSet() &&
264 1 : !t.SyntheticSuffix.IsSet()
265 1 : }
266 :
267 : // FragmentIterTransforms allow on-the-fly transformation of range deletion or
268 : // range key data at iteration time.
269 : type FragmentIterTransforms struct {
270 : SyntheticSeqNum SyntheticSeqNum
271 : SyntheticPrefix SyntheticPrefix
272 : SyntheticSuffix SyntheticSuffix
273 : }
274 :
275 : // NoTransforms returns true if there are no transforms enabled.
276 1 : func (t *FragmentIterTransforms) NoTransforms() bool {
277 1 : // NoTransforms returns true if there are no transforms enabled.
278 1 : return t.SyntheticSeqNum == 0 &&
279 1 : !t.SyntheticPrefix.IsSet() &&
280 1 : !t.SyntheticSuffix.IsSet()
281 1 : }
282 :
283 : // NoFragmentTransforms is the default value for IterTransforms.
284 : var NoFragmentTransforms = FragmentIterTransforms{}
285 :
286 : // SyntheticSeqNum is used to override all sequence numbers in a table. It is
287 : // set to a non-zero value when the table was created externally and ingested
288 : // whole.
289 : type SyntheticSeqNum base.SeqNum
290 :
291 : // NoSyntheticSeqNum is the default zero value for SyntheticSeqNum, which
292 : // disables overriding the sequence number.
293 : const NoSyntheticSeqNum SyntheticSeqNum = 0
294 :
295 : // SyntheticSuffix will replace every suffix of every point key surfaced during
296 : // block iteration. A synthetic suffix can be used if:
297 : // 1. no two keys in the sst share the same prefix; and
298 : // 2. pebble.Compare(prefix + replacementSuffix, prefix + originalSuffix) < 0,
299 : // for all keys in the backing sst which have a suffix (i.e. originalSuffix
300 : // is not empty).
301 : //
302 : // Range dels are not supported when synthetic suffix is used.
303 : //
304 : // For range keys, the synthetic suffix applies to the suffix that is part of
305 : // RangeKeySet - if it is non-empty, it is replaced with the SyntheticSuffix.
306 : // RangeKeyUnset keys are not supported when a synthetic suffix is used.
307 : type SyntheticSuffix []byte
308 :
309 : // IsSet returns true if the synthetic suffix is not empty.
310 1 : func (ss SyntheticSuffix) IsSet() bool {
311 1 : return len(ss) > 0
312 1 : }
313 :
314 : // SyntheticPrefix represents a byte slice that is implicitly prepended to every
315 : // key in a file being read or accessed by a reader. Note that since the byte
316 : // slice is prepended to every KV rather than replacing a byte prefix, the
317 : // result of prepending the synthetic prefix must be a full, valid key while the
318 : // partial key physically stored within the sstable need not be a valid key
319 : // according to user key semantics.
320 : //
321 : // Note that elsewhere we use the language of 'prefix' to describe the user key
322 : // portion of a MVCC key, as defined by the Comparer's base.Split method. The
323 : // SyntheticPrefix is related only in that it's a byte prefix that is
324 : // incorporated into the logical MVCC prefix.
325 : //
326 : // The table's bloom filters are constructed only on the partial keys physically
327 : // stored in the table, but interactions with the file including seeks and
328 : // reads will all behave as if the file had been constructed from keys that
329 : // include the synthetic prefix. Note that all Compare operations will act on a
330 : // partial key (before any prepending), so the Comparer must support comparing
331 : // these partial keys.
332 : //
333 : // The synthetic prefix will never modify key metadata stored in the key suffix.
334 : //
335 : // NB: Since this transformation currently only applies to point keys, a block
336 : // with range keys cannot be iterated over with a synthetic prefix.
337 : type SyntheticPrefix []byte
338 :
339 : // IsSet returns true if the synthetic prefix is not enpty.
340 1 : func (sp SyntheticPrefix) IsSet() bool {
341 1 : return len(sp) > 0
342 1 : }
343 :
344 : // Apply prepends the synthetic prefix to a key.
345 1 : func (sp SyntheticPrefix) Apply(key []byte) []byte {
346 1 : res := make([]byte, 0, len(sp)+len(key))
347 1 : res = append(res, sp...)
348 1 : res = append(res, key...)
349 1 : return res
350 1 : }
351 :
352 : // Invert removes the synthetic prefix from a key.
353 1 : func (sp SyntheticPrefix) Invert(key []byte) []byte {
354 1 : res, ok := bytes.CutPrefix(key, sp)
355 1 : if !ok {
356 0 : panic(fmt.Sprintf("unexpected prefix: %s", key))
357 : }
358 1 : return res
359 : }
|