Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "io"
11 : "math"
12 : "math/bits"
13 : "unsafe"
14 :
15 : "github.com/cockroachdb/errors"
16 : "github.com/cockroachdb/pebble/internal/binfmt"
17 : "github.com/cockroachdb/pebble/internal/invariants"
18 : "github.com/cockroachdb/pebble/internal/treeprinter"
19 : "golang.org/x/exp/constraints"
20 : )
21 :
22 : // Uint is a constraint that permits any unsigned integer type with an
23 : // explicit size.
24 : type Uint interface {
25 : ~uint8 | ~uint16 | ~uint32 | ~uint64
26 : }
27 :
28 : // UintEncoding indicates how unsigned integers (of at most 64 bits) are
29 : // encoded. It has two components:
30 : // - the low bits indicate how many bytes per integer are used, with
31 : // allowed values 0, 1, 2, 4, or 8.
32 : // - whether we are using a delta encoding, meaning that a base (64-bit) value
33 : // is encoded separately and each encoded value is a delta from that base.
34 : // Delta encoding is never necessary when we use 8 bytes per integer.
35 : //
36 : // Note that 0-byte encodings imply that all values are equal (either to the
37 : // base value if we are using a delta encoding, otherwise to 0).
38 : //
39 : // The UintEncoding byte is serialized to the uint column before the column
40 : // data.
41 : type UintEncoding uint8
42 :
43 : const uintEncodingDeltaBit UintEncoding = 1 << 7
44 : const uintEncodingAllZero UintEncoding = 0
45 :
46 : // IsDelta returns true if it is a delta encoding.
47 1 : func (e UintEncoding) IsDelta() bool {
48 1 : return e&uintEncodingDeltaBit != 0
49 1 : }
50 :
51 : // Width returns the number of bytes used per integer. It can be 0, 1, 2, 4, or 8.
52 1 : func (e UintEncoding) Width() int {
53 1 : return int(e &^ uintEncodingDeltaBit)
54 1 : }
55 :
56 : // IsValid returns true if the encoding is valid.
57 1 : func (e UintEncoding) IsValid() bool {
58 1 : switch e.Width() {
59 1 : case 0, 1, 2, 4:
60 1 : return true
61 1 : case 8:
62 1 : // We should never need to do delta encoding if we store all 64 bits.
63 1 : return !e.IsDelta()
64 0 : default:
65 0 : return false
66 : }
67 : }
68 :
69 : // String implements fmt.Stringer.
70 1 : func (e UintEncoding) String() string {
71 1 : if e.Width() == 0 {
72 1 : if e.IsDelta() {
73 1 : return "const"
74 1 : }
75 1 : return "zero"
76 : }
77 1 : deltaString := ""
78 1 : if e.IsDelta() {
79 1 : deltaString = ",delta"
80 1 : }
81 1 : return fmt.Sprintf("%db%s", e.Width(), deltaString)
82 : }
83 :
84 : // UintEncodingRowThreshold is the threshold under which the number of rows can
85 : // affect the best encoding. This happens when the constant 8 bytes for the
86 : // delta base doesn't make up for saving a byte or two in the per-row encoding.
87 : const UintEncodingRowThreshold = 8
88 :
89 : // DetermineUintEncoding returns the best valid encoding that can be used to
90 : // represent numRows integers in the range [minValue, maxValue].
91 : //
92 : // DetermineUintEncoding returns the same result for any value of rows >=
93 : // UintEncodingRowThreshold.
94 1 : func DetermineUintEncoding(minValue, maxValue uint64, numRows int) UintEncoding {
95 1 : b := byteWidth(maxValue - minValue)
96 1 : if b == 8 {
97 1 : return UintEncoding(8)
98 1 : }
99 : // Check if we can use the same number of bytes without a delta encoding.
100 1 : isDelta := maxValue >= (1 << (b << 3))
101 1 : if isDelta && numRows < UintEncodingRowThreshold {
102 1 : bNoDelta := byteWidth(maxValue)
103 1 : // Check if saving (bNoDelta-b) bytes per row makes up for the 8 bytes
104 1 : // required by the delta base.
105 1 : if numRows*int(bNoDelta-b) < 8 {
106 1 : b = bNoDelta
107 1 : isDelta = false
108 1 : }
109 : }
110 1 : return makeUintEncoding(b, isDelta)
111 : }
112 :
113 : // byteWidth returns the number of bytes necessary to represent the given value,
114 : // either 0, 1, 2, 4, or 8.
115 1 : func byteWidth(maxValue uint64) uint8 {
116 1 : // b the number of bytes necessary to represent maxValue.
117 1 : b := (uint8(bits.Len64(maxValue)) + 7) >> 3
118 1 : // Round up to the nearest power of 2 by subtracting one and filling out all bits.
119 1 : b--
120 1 : b |= b >> 1
121 1 : b |= b >> 2
122 1 : return b + 1
123 1 : }
124 :
125 1 : func makeUintEncoding(width uint8, isDelta bool) UintEncoding {
126 1 : e := UintEncoding(width)
127 1 : if isDelta {
128 1 : e |= uintEncodingDeltaBit
129 1 : }
130 1 : if invariants.Enabled && !e.IsValid() {
131 0 : panic(e)
132 : }
133 1 : return e
134 : }
135 :
136 : // UintBuilder builds a column of unsigned integers. It uses the smallest
137 : // possible UintEncoding, depending on the values.
138 : type UintBuilder struct {
139 : // configuration fixed on Init; preserved across Reset
140 : useDefault bool
141 :
142 : // array holds the underlying heap-allocated array in which values are
143 : // stored.
144 : array struct {
145 : // n is the size of the array (in count of T elements; not bytes). n is
146 : // NOT the number of elements that have been populated by the user.
147 : n int
148 : // elems provides access to elements without bounds checking. elems is
149 : // grown automatically in Set.
150 : elems UnsafeRawSlice[uint64]
151 : }
152 : // stats holds state for the purpose of tracking which UintEncoding would
153 : // be used if the caller Finished the column including all elements Set so
154 : // far. The stats state is used by Size (and Finish) to cheaply determine
155 : // which encoding may most concisely encode the array.
156 : //
157 : // Every Set(i, v) call updates minimum and maximum if necessary. If a call
158 : // updates minimum, maximum or both, it recalculates the encoding and if it
159 : // changed sets sets encodingRow=i, indicating which row last updated the
160 : // width.
161 : //
162 : // Any call to Size or Finish that supplies [rows] that's inclusive of the
163 : // index stored in widthRow may use the stored width. Calls with fewer
164 : // [rows] must recompute the min/max. In expected usage, only Finish will be
165 : // called with fewer rows and only with one less row than has been set,
166 : // meaning that only if the last row updated the width is a recomputation
167 : // necessary.
168 : //
169 : // TODO(jackson): There is a small discrete set of possible encodings, so we
170 : // could instead track the index of the first row that makes each encoding
171 : // impossible. This would allow us to avoid recomputing the min/max in all
172 : // cases. Or, if we limit the API to only allow Finish to be called with one
173 : // less than the last set row, we could maintain the width of only the last
174 : // two rows.
175 : stats struct {
176 : minimum uint64
177 : maximum uint64
178 : encoding UintEncoding
179 : encodingRow int // index of last update to encoding
180 : }
181 : }
182 :
183 : // Init initializes the UintBuilder.
184 1 : func (b *UintBuilder) Init() {
185 1 : b.init(false)
186 1 : }
187 :
188 : // InitWithDefault initializes the UintBuilder. Any rows that are not explicitly
189 : // set are assumed to be zero. For the purpose of determining whether a delta
190 : // encoding is possible, the column is assumed to contain at least 1 default
191 : // value.
192 : //
193 : // InitWithDefault may be preferrable when a nonzero value is uncommon, and the
194 : // caller can avoid explicitly Set-ing every zero value.
195 1 : func (b *UintBuilder) InitWithDefault() {
196 1 : b.init(true)
197 1 : }
198 :
199 1 : func (b *UintBuilder) init(useDefault bool) {
200 1 : b.useDefault = useDefault
201 1 : b.Reset()
202 1 : }
203 :
204 : // NumColumns implements ColumnWriter.
205 1 : func (b *UintBuilder) NumColumns() int { return 1 }
206 :
207 : // DataType implements ColumnWriter.
208 1 : func (b *UintBuilder) DataType(int) DataType { return DataTypeUint }
209 :
210 : // Reset implements ColumnWriter and resets the builder, reusing existing
211 : // allocated memory.
212 1 : func (b *UintBuilder) Reset() {
213 1 : if b.useDefault {
214 1 : // If the caller configured a default zero, we assume that the array
215 1 : // will include at least one default value.
216 1 : b.stats.minimum = 0
217 1 : b.stats.maximum = 0
218 1 : clear(b.array.elems.Slice(b.array.n))
219 1 : } else {
220 1 : b.stats.minimum = math.MaxUint64
221 1 : b.stats.maximum = 0
222 1 : // We could reset all values as a precaution, but it has a visible cost
223 1 : // in benchmarks.
224 1 : if invariants.Sometimes(50) {
225 1 : for i := 0; i < b.array.n; i++ {
226 1 : b.array.elems.set(i, math.MaxUint64)
227 1 : }
228 : }
229 : }
230 1 : b.stats.encoding = uintEncodingAllZero
231 1 : b.stats.encodingRow = 0
232 : }
233 :
234 : // Get gets the value of the provided row index. The provided row must have been
235 : // Set or the builder must have been initialized with InitWithDefault.
236 1 : func (b *UintBuilder) Get(row int) uint64 {
237 1 : // If the UintBuilder is configured to use a zero value for unset rows, it's
238 1 : // possible that the array has not been grown to a size that includes [row].
239 1 : if b.array.n <= row {
240 1 : if invariants.Enabled && !b.useDefault {
241 0 : panic(errors.AssertionFailedf("Get(%d) on UintBuilder with array of size %d", row, b.array.n))
242 : }
243 1 : return 0
244 : }
245 1 : return b.array.elems.At(row)
246 : }
247 :
248 : // Set sets the value of the provided row index to v.
249 1 : func (b *UintBuilder) Set(row int, v uint64) {
250 1 : if b.array.n <= row {
251 1 : // Double the size of the allocated array, or initialize it to at least 32
252 1 : // values (256 bytes) if this is the first allocation. Then double until
253 1 : // there's sufficient space.
254 1 : n2 := max(b.array.n<<1, 32)
255 1 : for n2 <= row {
256 0 : n2 <<= 1 /* double the size */
257 0 : }
258 : // NB: Go guarantees the allocated array will be 64-bit aligned.
259 1 : newDataTyped := make([]uint64, n2)
260 1 : copy(newDataTyped, b.array.elems.Slice(b.array.n))
261 1 : newElems := makeUnsafeRawSlice[uint64](unsafe.Pointer(&newDataTyped[0]))
262 1 : b.array.n = n2
263 1 : b.array.elems = newElems
264 :
265 : }
266 : // Maintain the running minimum and maximum for the purpose of maintaining
267 : // knowledge of the delta encoding that would be used.
268 1 : if b.stats.minimum > v || b.stats.maximum < v || row < UintEncodingRowThreshold {
269 1 : b.stats.minimum = min(v, b.stats.minimum)
270 1 : b.stats.maximum = max(v, b.stats.maximum)
271 1 : // If updating the minimum and maximum means that we now much use a wider
272 1 : // width integer, update the encoding and the index of the update to it.
273 1 : if e := DetermineUintEncoding(b.stats.minimum, b.stats.maximum, row+1); e != b.stats.encoding {
274 1 : b.stats.encoding = e
275 1 : b.stats.encodingRow = row
276 1 : }
277 : }
278 1 : b.array.elems.set(row, v)
279 : }
280 :
281 : // Size implements ColumnWriter and returns the size of the column if its first
282 : // [rows] rows were serialized, serializing the column into offset [offset].
283 1 : func (b *UintBuilder) Size(rows int, offset uint32) uint32 {
284 1 : if rows == 0 {
285 1 : return offset
286 1 : }
287 1 : e, _ := b.determineEncoding(rows)
288 1 : return uintColumnSize(uint32(rows), offset, e)
289 : }
290 :
291 : // determineEncoding determines the best encoding for a column containing the
292 : // first [rows], along with a lower bound on all the values which can be used as
293 : // a "base" if the encoding is a delta encoding.
294 1 : func (b *UintBuilder) determineEncoding(rows int) (_ UintEncoding, deltaBase uint64) {
295 1 : if b.stats.encodingRow < rows {
296 1 : // b.delta.encoding became the current value within the first [rows], so we
297 1 : // can use it.
298 1 : //
299 1 : // Note that if useDefault is set, this encoding assumes there is at least
300 1 : // one element with the default (zero) value, which might be pessimistic.
301 1 : //
302 1 : // Note that b.stats.minimum includes all rows set so far so it might be
303 1 : // strictly smaller than all values up to [rows]; but it is still a suitable
304 1 : // base for b.stats.encoding.
305 1 : if invariants.Sometimes(1) && rows > 0 {
306 1 : if enc, _ := b.recalculateEncoding(rows); enc != b.stats.encoding {
307 0 : panic(fmt.Sprintf("fast and slow paths don't agree: %s vs %s", b.stats.encoding, enc))
308 : }
309 : }
310 1 : return b.stats.encoding, b.stats.minimum
311 : }
312 1 : return b.recalculateEncoding(rows)
313 : }
314 :
315 1 : func (b *UintBuilder) recalculateEncoding(rows int) (_ UintEncoding, deltaBase uint64) {
316 1 : // We have to recalculate the minimum and maximum.
317 1 : minimum, maximum := computeMinMax(b.array.elems.Slice(min(rows, b.array.n)))
318 1 : if b.useDefault {
319 1 : // Mirror the pessimism of the fast path so that the result is consistent.
320 1 : // Otherwise, adding a row can result in a different encoding even when not
321 1 : // including that row.
322 1 : minimum = 0
323 1 : }
324 1 : return DetermineUintEncoding(minimum, maximum, rows), minimum
325 : }
326 :
327 : // uintColumnSize returns the size of a column of unsigned integers, encoded at
328 : // the provided offset using the provided width. If width < sizeof(T), then a
329 : // delta encoding is assumed.
330 1 : func uintColumnSize(rows, offset uint32, e UintEncoding) uint32 {
331 1 : offset++ // DeltaEncoding byte
332 1 : if e.IsDelta() {
333 1 : // A delta encoding will be used. We need to first account for the constant
334 1 : // that encodes the base value.
335 1 : offset += 8
336 1 : }
337 1 : width := uint32(e.Width())
338 1 : // Include alignment bytes necessary to align offset appropriately for
339 1 : // elements of the delta width.
340 1 : if width > 0 {
341 1 : offset = align(offset, width)
342 1 : }
343 : // Now account for the array of [rows] x w elements encoding the deltas.
344 1 : return offset + rows*width
345 : }
346 :
347 : // Finish implements ColumnWriter, serializing the column into offset [offset] of
348 : // [buf].
349 1 : func (b *UintBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
350 1 : if rows == 0 {
351 1 : return offset
352 1 : }
353 :
354 1 : e, minimum := b.determineEncoding(rows)
355 1 :
356 1 : // NB: In some circumstances, it's possible for b.array.elems.ptr to be nil.
357 1 : // Specifically, if the builder is initialized using InitWithDefault and no
358 1 : // non-default values exist, no array will have been allocated (we lazily
359 1 : // allocate b.array.elems.ptr). It's illegal to try to construct an unsafe
360 1 : // slice from a nil ptr with non-zero rows. Only attempt to construct the
361 1 : // values slice if there's actually a non-nil ptr.
362 1 : var valuesSlice []uint64
363 1 : if b.array.elems.ptr != nil {
364 1 : valuesSlice = b.array.elems.Slice(rows)
365 1 : }
366 1 : return uintColumnFinish(minimum, valuesSlice, e, offset, buf)
367 : }
368 :
369 : // uintColumnFinish finishes the column of unsigned integers of type T, applying
370 : // the given encoding.
371 : func uintColumnFinish(
372 : minimum uint64, values []uint64, e UintEncoding, offset uint32, buf []byte,
373 1 : ) uint32 {
374 1 : buf[offset] = byte(e)
375 1 : offset++
376 1 :
377 1 : deltaBase := uint64(0)
378 1 : if e.IsDelta() {
379 1 : deltaBase = minimum
380 1 : binary.LittleEndian.PutUint64(buf[offset:], minimum)
381 1 : offset += 8
382 1 : }
383 1 : width := uint32(e.Width())
384 1 : if width == 0 {
385 1 : // All the column values are the same.
386 1 : return offset
387 1 : }
388 : // Align the offset appropriately.
389 1 : offset = alignWithZeroes(buf, offset, width)
390 1 :
391 1 : switch e.Width() {
392 1 : case 1:
393 1 : dest := makeUnsafeRawSlice[uint8](unsafe.Pointer(&buf[offset])).Slice(len(values))
394 1 : reduceUints(deltaBase, values, dest)
395 :
396 1 : case 2:
397 1 : dest := makeUnsafeRawSlice[uint16](unsafe.Pointer(&buf[offset])).Slice(len(values))
398 1 : reduceUints(deltaBase, values, dest)
399 :
400 1 : case 4:
401 1 : dest := makeUnsafeRawSlice[uint32](unsafe.Pointer(&buf[offset])).Slice(len(values))
402 1 : reduceUints(deltaBase, values, dest)
403 :
404 1 : case 8:
405 1 : if deltaBase != 0 {
406 0 : panic("unreachable")
407 : }
408 1 : dest := makeUnsafeRawSlice[uint64](unsafe.Pointer(&buf[offset])).Slice(len(values))
409 1 : copy(dest, values)
410 :
411 0 : default:
412 0 : panic("unreachable")
413 : }
414 1 : return offset + uint32(len(values))*width
415 : }
416 :
417 : // WriteDebug implements Encoder.
418 1 : func (b *UintBuilder) WriteDebug(w io.Writer, rows int) {
419 1 : fmt.Fprintf(w, "%s: %d rows", DataTypeUint, rows)
420 1 : }
421 :
422 : // reduceUints reduces the bit-width of a slice of unsigned by subtracting a
423 : // minimum value from each element and writing it to dst. For example,
424 : //
425 : // reduceUints[uint8](10, []uint64{10, 11, 12}, dst)
426 : //
427 : // could be used to reduce a slice of uint64 values to uint8 values {0, 1, 2}.
428 1 : func reduceUints[N constraints.Integer](minimum uint64, values []uint64, dst []N) {
429 1 : for i := 0; i < len(values); i++ {
430 1 : if invariants.Enabled {
431 1 : if values[i] < minimum {
432 0 : panic("incorrect minimum value")
433 : }
434 1 : if values[i]-minimum > uint64(N(0)-1) {
435 0 : panic("incorrect target width")
436 : }
437 : }
438 1 : dst[i] = N(values[i] - minimum)
439 : }
440 : }
441 :
442 : // computeMinMax computes the minimum and the maximum of the provided slice of
443 : // unsigned integers.
444 1 : func computeMinMax[I constraints.Unsigned](values []I) (I, I) {
445 1 : minimum := I(0) - 1
446 1 : maximum := I(0)
447 1 : for _, v := range values {
448 1 : minimum = min(minimum, v)
449 1 : maximum = max(maximum, v)
450 1 : }
451 1 : return minimum, maximum
452 : }
453 :
454 : func uintsToBinFormatter(
455 : f *binfmt.Formatter, tp treeprinter.Node, rows int, uintFormatter func(el, base uint64) string,
456 1 : ) {
457 1 : if rows == 0 {
458 1 : return
459 1 : }
460 1 : if uintFormatter == nil {
461 1 : uintFormatter = func(v, base uint64) string {
462 1 : if base == 0 {
463 1 : return fmt.Sprint(v)
464 1 : }
465 1 : return fmt.Sprintf("%d + %d = %d", v, base, base+v)
466 : }
467 : }
468 :
469 1 : e := UintEncoding(f.PeekUint(1)) // UintEncoding byte
470 1 : if !e.IsValid() {
471 0 : panic(fmt.Sprintf("%d", e))
472 : }
473 1 : f.HexBytesln(1, "encoding: %s", e)
474 1 :
475 1 : var base uint64
476 1 : if e.IsDelta() {
477 1 : base = f.PeekUint(8)
478 1 : f.HexBytesln(8, "64-bit constant: %d", base)
479 1 : }
480 1 : width := e.Width()
481 1 : if width == 0 {
482 1 : // The column is zero or constant.
483 1 : f.ToTreePrinter(tp)
484 1 : return
485 1 : }
486 :
487 1 : if off := align(f.RelativeOffset(), width); off != f.RelativeOffset() {
488 1 : f.HexBytesln(off-f.RelativeOffset(), "padding (aligning to %d-bit boundary)", width*8)
489 1 : }
490 1 : for i := 0; i < rows; i++ {
491 1 : f.HexBytesln(width, "data[%d] = %s", i, uintFormatter(f.PeekUint(width), base))
492 1 : }
493 1 : f.ToTreePrinter(tp)
494 : }
|