Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "slices"
9 : "unsafe"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/binfmt"
14 : "github.com/cockroachdb/pebble/internal/invariants"
15 : "github.com/cockroachdb/pebble/internal/treeprinter"
16 : "github.com/cockroachdb/pebble/sstable/block"
17 : )
18 :
19 : const indexBlockCustomHeaderSize = 0
20 :
21 : // IndexBlockWriter writes columnar index blocks. The writer is used for both
22 : // first-level and second-level index blocks. The index block schema consists of
23 : // three primary columns:
24 : // - Separators: a user key that is ≥ the largest user key in the
25 : // corresponding entry, and ≤ the smallest user key in the next entry.
26 : // Note that this allows consecutive separators to be equal. This is
27 : // possible when snapshots required we preserve duplicate user keys at
28 : // different sequence numbers.
29 : // - Offsets: the offset of the end of the corresponding block.
30 : // - Lengths: the length in bytes of the corresponding block.
31 : // - Block properties: a slice encoding arbitrary user-defined block
32 : // properties.
33 : //
34 : // TODO(jackson): Consider splitting separators into prefixes and suffixes (even
35 : // without user-defined columns). This would allow us to use prefix compression
36 : // for the prefix. Separators should typically be suffixless unless two KVs with
37 : // the same prefix straddle a block boundary. We would need to use a buffer to
38 : // materialize the separator key when we need to use it outside the context of
39 : // seeking within the block.
40 : type IndexBlockWriter struct {
41 : separators RawBytesBuilder
42 : offsets UintBuilder
43 : lengths UintBuilder
44 : blockProperties RawBytesBuilder
45 : rows int
46 : enc blockEncoder
47 : }
48 :
49 : const (
50 : indexBlockColumnSeparator = iota
51 : indexBlockColumnOffsets
52 : indexBlockColumnLengths
53 : indexBlockColumnBlockProperties
54 : indexBlockColumnCount
55 : )
56 :
57 : // Init initializes the index block writer.
58 2 : func (w *IndexBlockWriter) Init() {
59 2 : w.separators.Init()
60 2 : w.offsets.Init()
61 2 : w.lengths.Init()
62 2 : w.blockProperties.Init()
63 2 : w.rows = 0
64 2 : }
65 :
66 : // Reset resets the index block writer to its initial state, retaining buffers.
67 2 : func (w *IndexBlockWriter) Reset() {
68 2 : w.separators.Reset()
69 2 : w.offsets.Reset()
70 2 : w.lengths.Reset()
71 2 : w.blockProperties.Reset()
72 2 : w.rows = 0
73 2 : w.enc.reset()
74 2 : }
75 :
76 : // Rows returns the number of entries in the index block so far.
77 2 : func (w *IndexBlockWriter) Rows() int {
78 2 : return w.rows
79 2 : }
80 :
81 : // AddBlockHandle adds a new separator and end offset of a data block to the
82 : // index block. Add returns the index of the row.
83 : //
84 : // AddBlockHandle should only be used for first-level index blocks.
85 : func (w *IndexBlockWriter) AddBlockHandle(
86 : separator []byte, handle block.Handle, blockProperties []byte,
87 2 : ) int {
88 2 : idx := w.rows
89 2 : w.separators.Put(separator)
90 2 : w.offsets.Set(w.rows, handle.Offset)
91 2 : w.lengths.Set(w.rows, handle.Length)
92 2 : w.blockProperties.Put(blockProperties)
93 2 : w.rows++
94 2 : return idx
95 2 : }
96 :
97 : // UnsafeSeparator returns the separator of the i'th entry.
98 2 : func (w *IndexBlockWriter) UnsafeSeparator(i int) []byte {
99 2 : return w.separators.UnsafeGet(i)
100 2 : }
101 :
102 : // Size returns the size of the pending index block.
103 2 : func (w *IndexBlockWriter) Size() int {
104 2 : return w.size(w.rows)
105 2 : }
106 :
107 2 : func (w *IndexBlockWriter) size(rows int) int {
108 2 : off := blockHeaderSize(indexBlockColumnCount, indexBlockCustomHeaderSize)
109 2 : off = w.separators.Size(rows, off)
110 2 : off = w.offsets.Size(rows, off)
111 2 : off = w.lengths.Size(rows, off)
112 2 : off = w.blockProperties.Size(rows, off)
113 2 : off++
114 2 : return int(off)
115 2 : }
116 :
117 : // Finish serializes the pending index block, including the first [rows] rows.
118 : // The value of [rows] must be Rows() or Rows()-1.
119 2 : func (w *IndexBlockWriter) Finish(rows int) []byte {
120 2 : if invariants.Enabled && rows != w.rows && rows != w.rows-1 {
121 0 : panic(errors.AssertionFailedf("index block has %d rows; asked to finish %d", w.rows, rows))
122 : }
123 :
124 2 : w.enc.init(w.size(rows), Header{
125 2 : Version: Version1,
126 2 : Columns: indexBlockColumnCount,
127 2 : Rows: uint32(rows),
128 2 : }, indexBlockCustomHeaderSize)
129 2 : w.enc.encode(rows, &w.separators)
130 2 : w.enc.encode(rows, &w.offsets)
131 2 : w.enc.encode(rows, &w.lengths)
132 2 : w.enc.encode(rows, &w.blockProperties)
133 2 : return w.enc.finish()
134 : }
135 :
136 : // An IndexBlockDecoder reads columnar index blocks.
137 : type IndexBlockDecoder struct {
138 : separators RawBytes
139 : offsets UnsafeUints
140 : lengths UnsafeUints // only used for second-level index blocks
141 : blockProps RawBytes
142 : bd BlockDecoder
143 : }
144 :
145 : // Init initializes the index block decoder with the given serialized index
146 : // block.
147 2 : func (r *IndexBlockDecoder) Init(data []byte) {
148 2 : r.bd.Init(data, indexBlockCustomHeaderSize)
149 2 : r.separators = r.bd.RawBytes(indexBlockColumnSeparator)
150 2 : r.offsets = r.bd.Uints(indexBlockColumnOffsets)
151 2 : r.lengths = r.bd.Uints(indexBlockColumnLengths)
152 2 : r.blockProps = r.bd.RawBytes(indexBlockColumnBlockProperties)
153 2 : }
154 :
155 : // DebugString prints a human-readable explanation of the keyspan block's binary
156 : // representation.
157 1 : func (r *IndexBlockDecoder) DebugString() string {
158 1 : f := binfmt.New(r.bd.data).LineWidth(20)
159 1 : tp := treeprinter.New()
160 1 : r.Describe(f, tp.Child("index-block-decoder"))
161 1 : return tp.String()
162 1 : }
163 :
164 : // Describe describes the binary format of the index block, assuming f.Offset()
165 : // is positioned at the beginning of the same index block described by r.
166 1 : func (r *IndexBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) {
167 1 : // Set the relative offset. When loaded into memory, the beginning of blocks
168 1 : // are aligned. Padding that ensures alignment is done relative to the
169 1 : // current offset. Setting the relative offset ensures that if we're
170 1 : // describing this block within a larger structure (eg, f.Offset()>0), we
171 1 : // compute padding appropriately assuming the current byte f.Offset() is
172 1 : // aligned.
173 1 : f.SetAnchorOffset()
174 1 :
175 1 : n := tp.Child("index block header")
176 1 : r.bd.headerToBinFormatter(f, n)
177 1 : for i := 0; i < indexBlockColumnCount; i++ {
178 1 : r.bd.columnToBinFormatter(f, n, i, int(r.bd.header.Rows))
179 1 : }
180 1 : f.HexBytesln(1, "block padding byte")
181 1 : f.ToTreePrinter(n)
182 : }
183 :
184 : // IndexIter is an iterator over the block entries in an index block.
185 : type IndexIter struct {
186 : compare base.Compare
187 : split base.Split
188 : d *IndexBlockDecoder
189 : n int
190 : row int
191 :
192 : syntheticPrefixAndSuffix block.SyntheticPrefixAndSuffix
193 :
194 : h block.BufferHandle
195 : // TODO(radu): remove allocDecoder and require any Init callers to provide the
196 : // decoder.
197 : allocDecoder IndexBlockDecoder
198 : keyBuf []byte
199 : }
200 :
201 : // Assert that IndexIter satisfies the block.IndexBlockIterator interface.
202 : var _ block.IndexBlockIterator = (*IndexIter)(nil)
203 :
204 : // InitWithDecoder initializes an index iterator from the provided decoder.
205 : func (i *IndexIter) InitWithDecoder(
206 : compare base.Compare, split base.Split, d *IndexBlockDecoder, transforms block.IterTransforms,
207 2 : ) {
208 2 : i.compare = compare
209 2 : i.split = split
210 2 : i.d = d
211 2 : i.n = int(d.bd.header.Rows)
212 2 : i.row = -1
213 2 : i.syntheticPrefixAndSuffix = transforms.SyntheticPrefixAndSuffix
214 2 : // Leave h, allocDecoder, keyBuf unchanged.
215 2 : }
216 :
217 : // Init initializes an iterator from the provided block data slice.
218 : func (i *IndexIter) Init(
219 : cmp base.Compare, split base.Split, blk []byte, transforms block.IterTransforms,
220 2 : ) error {
221 2 : i.h.Release()
222 2 : i.h = block.BufferHandle{}
223 2 : i.allocDecoder.Init(blk)
224 2 : i.InitWithDecoder(cmp, split, &i.allocDecoder, transforms)
225 2 : return nil
226 2 : }
227 :
228 : // InitHandle initializes an iterator from the provided block handle.
229 : func (i *IndexIter) InitHandle(
230 : cmp base.Compare, split base.Split, blk block.BufferHandle, transforms block.IterTransforms,
231 2 : ) error {
232 2 : i.h.Release()
233 2 : i.h = blk
234 2 : d := (*IndexBlockDecoder)(unsafe.Pointer(blk.BlockMetadata()))
235 2 : i.InitWithDecoder(cmp, split, d, transforms)
236 2 : return nil
237 2 : }
238 :
239 : // RowIndex returns the index of the block entry at the iterator's current
240 : // position.
241 0 : func (i *IndexIter) RowIndex() int {
242 0 : return i.row
243 0 : }
244 :
245 : // Valid returns true if the iterator is currently positioned at a valid block
246 : // handle.
247 2 : func (i *IndexIter) Valid() bool {
248 2 : return 0 <= i.row && i.row < i.n
249 2 : }
250 :
251 : // Invalidate invalidates the block iterator, removing references to the block
252 : // it was initialized with.
253 2 : func (i *IndexIter) Invalidate() {
254 2 : i.d = nil
255 2 : i.n = 0
256 2 : }
257 :
258 : // IsDataInvalidated returns true when the iterator has been invalidated
259 : // using an Invalidate call. NB: this is different from Valid.
260 2 : func (i *IndexIter) IsDataInvalidated() bool {
261 2 : return i.d == nil
262 2 : }
263 :
264 : // Handle returns the underlying block buffer handle, if the iterator was
265 : // initialized with one.
266 2 : func (i *IndexIter) Handle() block.BufferHandle {
267 2 : return i.h
268 2 : }
269 :
270 : // Separator returns the separator at the iterator's current position. The
271 : // iterator must be positioned at a valid row.
272 2 : func (i *IndexIter) Separator() []byte {
273 2 : key := i.d.separators.At(i.row)
274 2 : if i.syntheticPrefixAndSuffix.IsUnset() {
275 2 : return key
276 2 : }
277 2 : return i.applyTransforms(key)
278 : }
279 :
280 : // SeparatorLT returns true if the separator at the iterator's current
281 : // position is strictly less than the provided key.
282 2 : func (i *IndexIter) SeparatorLT(key []byte) bool {
283 2 : return i.compare(i.Separator(), key) < 0
284 2 : }
285 :
286 : // SeparatorGT returns true if the separator at the iterator's current position
287 : // is strictly greater than (or equal, if orEqual=true) the provided key.
288 2 : func (i *IndexIter) SeparatorGT(key []byte, inclusively bool) bool {
289 2 : cmp := i.compare(i.Separator(), key)
290 2 : return cmp > 0 || (cmp == 0 && inclusively)
291 2 : }
292 :
293 2 : func (i *IndexIter) applyTransforms(key []byte) []byte {
294 2 : syntheticPrefix := i.syntheticPrefixAndSuffix.Prefix()
295 2 : syntheticSuffix := i.syntheticPrefixAndSuffix.Suffix()
296 2 : if syntheticSuffix.IsSet() {
297 2 : key = key[:i.split(key)]
298 2 : }
299 2 : i.keyBuf = slices.Grow(i.keyBuf[:0], len(syntheticPrefix)+len(key)+len(syntheticSuffix))
300 2 : i.keyBuf = append(i.keyBuf, syntheticPrefix...)
301 2 : i.keyBuf = append(i.keyBuf, key...)
302 2 : i.keyBuf = append(i.keyBuf, syntheticSuffix...)
303 2 : return i.keyBuf
304 : }
305 :
306 : // BlockHandleWithProperties decodes the block handle with any encoded
307 : // properties at the iterator's current position.
308 2 : func (i *IndexIter) BlockHandleWithProperties() (block.HandleWithProperties, error) {
309 2 : if invariants.Enabled && !i.Valid() {
310 0 : panic(errors.AssertionFailedf("invalid row %d (n=%d)", i.row, i.n))
311 : }
312 2 : return block.HandleWithProperties{
313 2 : Handle: block.Handle{
314 2 : Offset: i.d.offsets.At(i.row),
315 2 : Length: i.d.lengths.At(i.row),
316 2 : },
317 2 : Props: i.d.blockProps.At(i.row),
318 2 : }, nil
319 : }
320 :
321 : // SeekGE seeks the index iterator to the first block entry with a separator key
322 : // greater or equal to the given key. It returns false if the seek key is
323 : // greater than all index block separators.
324 2 : func (i *IndexIter) SeekGE(key []byte) bool {
325 2 : // Define f(-1) == false and f(upper) == true.
326 2 : // Invariant: f(index-1) == false, f(upper) == true.
327 2 : index, upper := 0, i.n
328 2 : for index < upper {
329 2 : h := int(uint(index+upper) >> 1) // avoid overflow when computing h
330 2 : // index ≤ h < upper
331 2 :
332 2 : // TODO(jackson): Is Bytes.At or Bytes.Slice(Bytes.Offset(h),
333 2 : // Bytes.Offset(h+1)) faster in this code?
334 2 : separator := i.d.separators.At(h)
335 2 : if !i.syntheticPrefixAndSuffix.IsUnset() {
336 2 : // TODO(radu): compare without materializing the transformed key.
337 2 : separator = i.applyTransforms(separator)
338 2 : }
339 : // TODO(radu): experiment with splitting the separator prefix and suffix in
340 : // separate columns and using bytes.Compare() on the prefix in the hot path.
341 2 : c := i.compare(key, separator)
342 2 : if c > 0 {
343 2 : index = h + 1 // preserves f(index-1) == false
344 2 : } else {
345 2 : upper = h // preserves f(upper) == true
346 2 : }
347 : }
348 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true => answer is index.
349 2 : i.row = index
350 2 : return index < i.n
351 : }
352 :
353 : // First seeks index iterator to the first block entry. It returns false if the
354 : // index block is empty.
355 2 : func (i *IndexIter) First() bool {
356 2 : i.row = 0
357 2 : return i.n > 0
358 2 : }
359 :
360 : // Last seeks index iterator to the last block entry. It returns false if the
361 : // index block is empty.
362 2 : func (i *IndexIter) Last() bool {
363 2 : i.row = i.n - 1
364 2 : return i.n > 0
365 2 : }
366 :
367 : // Next steps the index iterator to the next block entry. It returns false if
368 : // the index block is exhausted in the forward direction. A call to Next while
369 : // already exhausted in the forward direction is a no-op.
370 2 : func (i *IndexIter) Next() bool {
371 2 : i.row = min(i.n, i.row+1)
372 2 : return i.row < i.n
373 2 : }
374 :
375 : // Prev steps the index iterator to the previous block entry. It returns false
376 : // if the index block is exhausted in the reverse direction. A call to Prev
377 : // while already exhausted in the reverse direction is a no-op.
378 2 : func (i *IndexIter) Prev() bool {
379 2 : i.row = max(-1, i.row-1)
380 2 : return i.row >= 0 && i.row < i.n
381 2 : }
382 :
383 : // Close closes the iterator, releasing any resources it holds.
384 2 : func (i *IndexIter) Close() error {
385 2 : i.h.Release()
386 2 : i.h = block.BufferHandle{}
387 2 : i.d = nil
388 2 : i.n = 0
389 2 : i.syntheticPrefixAndSuffix = block.SyntheticPrefixAndSuffix{}
390 2 : return nil
391 2 : }
|