Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 : // of this source code is governed by a BSD-style license that can be found in 3 : // the LICENSE file. 4 : 5 : package block 6 : 7 : import ( 8 : "encoding/binary" 9 : 10 : "github.com/cespare/xxhash/v2" 11 : "github.com/cockroachdb/errors" 12 : "github.com/cockroachdb/pebble/internal/base" 13 : "github.com/cockroachdb/pebble/internal/crc" 14 : ) 15 : 16 : // Handle is the file offset and length of a block. 17 : type Handle struct { 18 : Offset, Length uint64 19 : } 20 : 21 : // EncodeVarints encodes the block handle into dst using a variable-width 22 : // encoding and returns the number of bytes written. 23 1 : func (h Handle) EncodeVarints(dst []byte) int { 24 1 : n := binary.PutUvarint(dst, h.Offset) 25 1 : m := binary.PutUvarint(dst[n:], h.Length) 26 1 : return n + m 27 1 : } 28 : 29 : // HandleWithProperties is used for data blocks and first/lower level index 30 : // blocks, since they can be annotated using BlockPropertyCollectors. 31 : type HandleWithProperties struct { 32 : Handle 33 : Props []byte 34 : } 35 : 36 : // EncodeVarints encodes the block handle and properties into dst using a 37 : // variable-width encoding and returns the number of bytes written. 38 1 : func (h HandleWithProperties) EncodeVarints(dst []byte) []byte { 39 1 : n := h.Handle.EncodeVarints(dst) 40 1 : dst = append(dst[:n], h.Props...) 41 1 : return dst 42 1 : } 43 : 44 : // DecodeHandle returns the block handle encoded in a variable-width encoding at 45 : // the start of src, as well as the number of bytes it occupies. It returns zero 46 : // if given invalid input. A block handle for a data block or a first/lower 47 : // level index block should not be decoded using DecodeHandle since the caller 48 : // may validate that the number of bytes decoded is equal to the length of src, 49 : // which will be false if the properties are not decoded. In those cases the 50 : // caller should use DecodeHandleWithProperties. 51 1 : func DecodeHandle(src []byte) (Handle, int) { 52 1 : offset, n := binary.Uvarint(src) 53 1 : length, m := binary.Uvarint(src[n:]) 54 1 : if n == 0 || m == 0 { 55 0 : return Handle{}, 0 56 0 : } 57 1 : return Handle{Offset: offset, Length: length}, n + m 58 : } 59 : 60 : // DecodeHandleWithProperties returns the block handle and properties encoded in 61 : // a variable-width encoding at the start of src. src needs to be exactly the 62 : // length that was encoded. This method must be used for data block and 63 : // first/lower level index blocks. The properties in the block handle point to 64 : // the bytes in src. 65 1 : func DecodeHandleWithProperties(src []byte) (HandleWithProperties, error) { 66 1 : bh, n := DecodeHandle(src) 67 1 : if n == 0 { 68 0 : return HandleWithProperties{}, errors.Errorf("invalid block.Handle") 69 0 : } 70 1 : return HandleWithProperties{ 71 1 : Handle: bh, 72 1 : Props: src[n:], 73 1 : }, nil 74 : } 75 : 76 : // TrailerLen is the length of the trailer at the end of a block. 77 : const TrailerLen = 5 78 : 79 : // Trailer is the trailer at the end of a block, encoding the block type 80 : // (compression) and a checksum. 81 : type Trailer = [TrailerLen]byte 82 : 83 : // MakeTrailer constructs a trailer from a block type and a checksum. 84 1 : func MakeTrailer(blockType byte, checksum uint32) (t Trailer) { 85 1 : t[0] = blockType 86 1 : binary.LittleEndian.PutUint32(t[1:5], checksum) 87 1 : return t 88 1 : } 89 : 90 : // ChecksumType specifies the checksum used for blocks. 91 : type ChecksumType byte 92 : 93 : // The available checksum types. These values are part of the durable format and 94 : // should not be changed. 95 : const ( 96 : ChecksumTypeNone ChecksumType = 0 97 : ChecksumTypeCRC32c ChecksumType = 1 98 : ChecksumTypeXXHash ChecksumType = 2 99 : ChecksumTypeXXHash64 ChecksumType = 3 100 : ) 101 : 102 : // String implements fmt.Stringer. 103 0 : func (t ChecksumType) String() string { 104 0 : switch t { 105 0 : case ChecksumTypeCRC32c: 106 0 : return "crc32c" 107 0 : case ChecksumTypeNone: 108 0 : return "none" 109 0 : case ChecksumTypeXXHash: 110 0 : return "xxhash" 111 0 : case ChecksumTypeXXHash64: 112 0 : return "xxhash64" 113 0 : default: 114 0 : panic(errors.Newf("sstable: unknown checksum type: %d", t)) 115 : } 116 : } 117 : 118 : // A Checksummer calculates checksums for blocks. 119 : type Checksummer struct { 120 : Type ChecksumType 121 : xxHasher *xxhash.Digest 122 : blockTypeBuf [1]byte 123 : } 124 : 125 : // Checksum computes a checksum over the provided block and block type. 126 1 : func (c *Checksummer) Checksum(block []byte, blockType byte) (checksum uint32) { 127 1 : // Calculate the checksum. 128 1 : c.blockTypeBuf[0] = blockType 129 1 : switch c.Type { 130 1 : case ChecksumTypeCRC32c: 131 1 : checksum = crc.New(block).Update(c.blockTypeBuf[:]).Value() 132 0 : case ChecksumTypeXXHash64: 133 0 : if c.xxHasher == nil { 134 0 : c.xxHasher = xxhash.New() 135 0 : } else { 136 0 : c.xxHasher.Reset() 137 0 : } 138 0 : c.xxHasher.Write(block) 139 0 : c.xxHasher.Write(c.blockTypeBuf[:]) 140 0 : checksum = uint32(c.xxHasher.Sum64()) 141 0 : default: 142 0 : panic(errors.Newf("unsupported checksum type: %d", c.Type)) 143 : } 144 1 : return checksum 145 : } 146 : 147 : // Metadata is an in-memory buffer that stores metadata for a block. It is 148 : // allocated together with the buffer storing the block and is initialized once 149 : // when the block is read from disk. 150 : // 151 : // Portions of this buffer can be cast to the structures we need (through 152 : // unsafe.Pointer), but note that any pointers in these structures will be 153 : // invisible to the GC. Pointers to the block's data buffer are ok, since the 154 : // metadata and the data have the same lifetime (sharing the underlying 155 : // allocation). 156 : type Metadata [MetadataSize]byte 157 : 158 : // MetadataSize is the size of the metadata. The value is chosen to fit a 159 : // colblk.DataBlockDecoder and a CockroachDB colblk.KeySeeker. 160 : const MetadataSize = 336 161 : 162 : // Assert that MetadataSize is a multiple of 8. This is necessary to keep the 163 : // block data buffer aligned. 164 : const _ uint = -(MetadataSize % 8) 165 : 166 : // DataBlockIterator is a type constraint for implementations of block iterators 167 : // over data blocks. It's implemented by *rowblk.Iter and *colblk.DataBlockIter. 168 : type DataBlockIterator interface { 169 : base.InternalIterator 170 : 171 : // Handle returns the handle to the block. 172 : Handle() BufferHandle 173 : // InitHandle initializes the block from the provided buffer handle. 174 : // 175 : // The iterator takes ownership of the BufferHandle and releases it when it is 176 : // closed (or re-initialized with another handle). This happens even in error 177 : // cases. 178 : InitHandle(*base.Comparer, BufferHandle, IterTransforms) error 179 : // Valid returns true if the iterator is currently positioned at a valid KV. 180 : Valid() bool 181 : // KV returns the key-value pair at the current iterator position. The 182 : // iterator must be Valid(). 183 : KV() *base.InternalKV 184 : // IsLowerBound returns true if all keys produced by this iterator are >= the 185 : // given key. The function is best effort; false negatives are allowed. 186 : // 187 : // If IsLowerBound is true then Compare(First().UserKey, k) >= 0. 188 : // 189 : // If the iterator produces no keys (i.e. First() is nil), IsLowerBound can 190 : // return true for any key. 191 : IsLowerBound(k []byte) bool 192 : // Invalidate invalidates the block iterator, removing references to the 193 : // block it was initialized with. The iterator may continue to be used after 194 : // a call to Invalidate, but all positioning methods should return false. 195 : // Valid() must also return false. 196 : Invalidate() 197 : // IsDataInvalidated returns true when the iterator has been invalidated 198 : // using an Invalidate call. 199 : // 200 : // NB: this is different from Valid which indicates whether the current *KV* 201 : // is valid. 202 : IsDataInvalidated() bool 203 : } 204 : 205 : // IndexBlockIterator is an interface for implementations of block iterators 206 : // over index blocks. It's implemented by *rowblk.IndexIter and 207 : // *colblk.IndexBlockIter. 208 : type IndexBlockIterator interface { 209 : // Init initializes the block iterator from the provided block. 210 : Init(*base.Comparer, []byte, IterTransforms) error 211 : // InitHandle initializes an iterator from the provided block handle. 212 : // 213 : // The iterator takes ownership of the BufferHandle and releases it when it is 214 : // closed (or re-initialized with another handle). This happens even in error 215 : // cases. 216 : InitHandle(*base.Comparer, BufferHandle, IterTransforms) error 217 : // Valid returns true if the iterator is currently positioned at a valid 218 : // block handle. 219 : Valid() bool 220 : // IsDataInvalidated returns true when the iterator has been invalidated 221 : // using an Invalidate call. 222 : // 223 : // NB: this is different from Valid which indicates whether the iterator is 224 : // currently positioned over a valid block entry. 225 : IsDataInvalidated() bool 226 : // Invalidate invalidates the block iterator, removing references to the 227 : // block it was initialized with. The iterator may continue to be used after 228 : // a call to Invalidate, but all positioning methods should return false. 229 : // Valid() must also return false. 230 : Invalidate() 231 : // Handle returns the underlying block buffer handle, if the iterator was 232 : // initialized with one. 233 : Handle() BufferHandle 234 : // Separator returns the separator at the iterator's current position. The 235 : // iterator must be positioned at a valid row. A Separator is a user key 236 : // guaranteed to be greater than or equal to every key contained within the 237 : // referenced block(s). 238 : Separator() []byte 239 : // SeparatorLT returns true if the separator at the iterator's current 240 : // position is strictly less than the provided key. For some 241 : // implementations, it may be more performant to call SeparatorLT rather 242 : // than explicitly performing Compare(Separator(), key) < 0. 243 : SeparatorLT(key []byte) bool 244 : // SeparatorGT returns true if the separator at the iterator's current 245 : // position is strictly greater than (or equal, if orEqual=true) the 246 : // provided key. For some implementations, it may be more performant to call 247 : // SeparatorGT rather than explicitly performing a comparison using the key 248 : // returned by Separator. 249 : SeparatorGT(key []byte, orEqual bool) bool 250 : // BlockHandleWithProperties decodes the block handle with any encoded 251 : // properties at the iterator's current position. 252 : BlockHandleWithProperties() (HandleWithProperties, error) 253 : // SeekGE seeks the index iterator to the first block entry with a separator 254 : // key greater or equal to the given key. If it returns true, the iterator 255 : // is positioned over the first block that might contain the key [key], and 256 : // following blocks have keys ≥ Separator(). It returns false if the seek 257 : // key is greater than all index block separators. 258 : SeekGE(key []byte) bool 259 : // First seeks index iterator to the first block entry. It returns false if 260 : // the index block is empty. 261 : First() bool 262 : // Last seeks index iterator to the last block entry. It returns false if 263 : // the index block is empty. 264 : Last() bool 265 : // Next steps the index iterator to the next block entry. It returns false 266 : // if the index block is exhausted in the forward direction. A call to Next 267 : // while already exhausted in the forward direction is a no-op. 268 : Next() bool 269 : // Prev steps the index iterator to the previous block entry. It returns 270 : // false if the index block is exhausted in the reverse direction. A call to 271 : // Prev while already exhausted in the reverse direction is a no-op. 272 : Prev() bool 273 : // Close closes the iterator, releasing any resources it holds. After Close, 274 : // the iterator must be reset such that it could be reused after a call to 275 : // Init or InitHandle. 276 : Close() error 277 : }