Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "io"
11 : "math"
12 : "math/bits"
13 : "unsafe"
14 :
15 : "github.com/cockroachdb/errors"
16 : "github.com/cockroachdb/pebble/internal/binfmt"
17 : "github.com/cockroachdb/pebble/internal/invariants"
18 : "golang.org/x/exp/constraints"
19 : )
20 :
21 : // Uint is a constraint that permits any unsigned integer type with an
22 : // explicit size.
23 : type Uint interface {
24 : ~uint8 | ~uint16 | ~uint32 | ~uint64
25 : }
26 :
27 : // UintEncoding indicates how unsigned integers (of at most 64 bits) are
28 : // encoded. It has two components:
29 : // - the low bits indicate how many bytes per integer are used, with
30 : // allowed values 0, 1, 2, 4, or 8.
31 : // - whether we are using a delta encoding, meaning that a base (64-bit) value
32 : // is encoded separately and each encoded value is a delta from that base.
33 : // Delta encoding is never necessary when we use 8 bytes per integer.
34 : //
35 : // Note that 0-byte encodings imply that all values are equal (either to the
36 : // base value if we are using a delta encoding, otherwise to 0).
37 : //
38 : // The UintEncoding byte is serialized to the uint column before the column
39 : // data.
40 : type UintEncoding uint8
41 :
42 : const uintEncodingDeltaBit UintEncoding = 1 << 7
43 : const uintEncodingAllZero UintEncoding = 0
44 :
45 : // IsDelta returns true if it is a delta encoding.
46 0 : func (e UintEncoding) IsDelta() bool {
47 0 : return e&uintEncodingDeltaBit != 0
48 0 : }
49 :
50 : // Width returns the number of bytes used per integer. It can be 0, 1, 2, 4, or 8.
51 0 : func (e UintEncoding) Width() int {
52 0 : return int(e &^ uintEncodingDeltaBit)
53 0 : }
54 :
55 : // IsValid returns true if the encoding is valid.
56 0 : func (e UintEncoding) IsValid() bool {
57 0 : switch e.Width() {
58 0 : case 0, 1, 2, 4:
59 0 : return true
60 0 : case 8:
61 0 : // We should never need to do delta encoding if we store all 64 bits.
62 0 : return !e.IsDelta()
63 0 : default:
64 0 : return false
65 : }
66 : }
67 :
68 : // String implements fmt.Stringer.
69 0 : func (e UintEncoding) String() string {
70 0 : if e.Width() == 0 {
71 0 : if e.IsDelta() {
72 0 : return "const"
73 0 : }
74 0 : return "zero"
75 : }
76 0 : deltaString := ""
77 0 : if e.IsDelta() {
78 0 : deltaString = ",delta"
79 0 : }
80 0 : return fmt.Sprintf("%db%s", e.Width(), deltaString)
81 : }
82 :
83 : // DetermineUintEncoding returns the best valid encoding that can be used to
84 : // represent integers in the range [minValue, maxValue].
85 0 : func DetermineUintEncoding(minValue, maxValue uint64) UintEncoding {
86 0 : // Find the number of bytes-per-value necessary for a delta encoding.
87 0 : b := (bits.Len64(maxValue-minValue) + 7) >> 3
88 0 : // Round up to the nearest allowed value (0, 1, 2, 4, or 8).
89 0 : if b > 4 {
90 0 : return UintEncoding(8)
91 0 : }
92 0 : if b == 3 {
93 0 : b = 4
94 0 : }
95 : // Check if we can use the same number of bytes without a delta encoding.
96 0 : isDelta := maxValue >= (1 << (b << 3))
97 0 : return makeUintEncoding(b, isDelta)
98 : }
99 :
100 0 : func makeUintEncoding(width int, isDelta bool) UintEncoding {
101 0 : e := UintEncoding(width)
102 0 : if isDelta {
103 0 : e |= uintEncodingDeltaBit
104 0 : }
105 0 : if invariants.Enabled && !e.IsValid() {
106 0 : panic(e)
107 : }
108 0 : return e
109 : }
110 :
111 : // UintBuilder builds a column of unsigned integers. It uses the smallest
112 : // possible UintEncoding, depending on the values.
113 : type UintBuilder struct {
114 : // configuration fixed on Init; preserved across Reset
115 : useDefault bool
116 :
117 : // array holds the underlying heap-allocated array in which values are
118 : // stored.
119 : array struct {
120 : // n is the size of the array (in count of T elements; not bytes). n is
121 : // NOT the number of elements that have been populated by the user.
122 : n int
123 : // elems provides access to elements without bounds checking. elems is
124 : // grown automatically in Set.
125 : elems UnsafeRawSlice[uint64]
126 : }
127 : // stats holds state for the purpose of tracking which UintEncoding would
128 : // be used if the caller Finished the column including all elements Set so
129 : // far. The stats state is used by Size (and Finish) to cheaply determine
130 : // which encoding may most concisely encode the array.
131 : //
132 : // Every Set(i, v) call updates minimum and maximum if necessary. If a call
133 : // updates minimum, maximum or both, it recalculates the encoding and if it
134 : // changed sets sets encodingRow=i, indicating which row last updated the
135 : // width.
136 : //
137 : // Any call to Size or Finish that supplies [rows] that's inclusive of the
138 : // index stored in widthRow may use the stored width. Calls with fewer
139 : // [rows] must recompute the min/max. In expected usage, only Finish will be
140 : // called with fewer rows and only with one less row than has been set,
141 : // meaning that only if the last row updated the width is a recomputation
142 : // necessary.
143 : //
144 : // TODO(jackson): There is a small discrete set of possible encodings, so we
145 : // could instead track the index of the first row that makes each encoding
146 : // impossible. This would allow us to avoid recomputing the min/max in all
147 : // cases. Or, if we limit the API to only allow Finish to be called with one
148 : // less than the last set row, we could maintain the width of only the last
149 : // two rows.
150 : stats struct {
151 : minimum uint64
152 : maximum uint64
153 : encoding UintEncoding
154 : encodingRow int // index of last update to encoding
155 : }
156 : }
157 :
158 : // Init initializes the UintBuilder.
159 0 : func (b *UintBuilder) Init() {
160 0 : b.init(false)
161 0 : }
162 :
163 : // InitWithDefault initializes the UintBuilder. Any rows that are not explicitly
164 : // set are assumed to be zero. For the purpose of determining whether a delta
165 : // encoding is possible, the column is assumed to contain at least 1 default
166 : // value.
167 : //
168 : // InitWithDefault may be preferrable when a nonzero value is uncommon, and the
169 : // caller can avoid explicitly Set-ing every zero value.
170 0 : func (b *UintBuilder) InitWithDefault() {
171 0 : b.init(true)
172 0 : }
173 :
174 0 : func (b *UintBuilder) init(useDefault bool) {
175 0 : b.useDefault = useDefault
176 0 : b.Reset()
177 0 : }
178 :
179 : // NumColumns implements ColumnWriter.
180 0 : func (b *UintBuilder) NumColumns() int { return 1 }
181 :
182 : // DataType implements ColumnWriter.
183 0 : func (b *UintBuilder) DataType(int) DataType { return DataTypeUint }
184 :
185 : // Reset implements ColumnWriter and resets the builder, reusing existing
186 : // allocated memory.
187 0 : func (b *UintBuilder) Reset() {
188 0 : if b.useDefault {
189 0 : // If the caller configured a default zero, we assume that the array
190 0 : // will include at least one default value.
191 0 : b.stats.minimum = 0
192 0 : b.stats.maximum = 0
193 0 : clear(b.array.elems.Slice(b.array.n))
194 0 : } else {
195 0 : b.stats.minimum = math.MaxUint64
196 0 : b.stats.maximum = 0
197 0 : // We could reset all values as a precaution, but it has a visible cost
198 0 : // in benchmarks.
199 0 : if invariants.Sometimes(50) {
200 0 : for i := 0; i < b.array.n; i++ {
201 0 : b.array.elems.set(i, math.MaxUint64)
202 0 : }
203 : }
204 : }
205 0 : b.stats.encoding = uintEncodingAllZero
206 0 : b.stats.encodingRow = 0
207 : }
208 :
209 : // Get gets the value of the provided row index. The provided row must have been
210 : // Set or the builder must have been initialized with InitWithDefault.
211 0 : func (b *UintBuilder) Get(row int) uint64 {
212 0 : // If the UintBuilder is configured to use a zero value for unset rows, it's
213 0 : // possible that the array has not been grown to a size that includes [row].
214 0 : if b.array.n <= row {
215 0 : if invariants.Enabled && !b.useDefault {
216 0 : panic(errors.AssertionFailedf("Get(%d) on UintBuilder with array of size %d", row, b.array.n))
217 : }
218 0 : return 0
219 : }
220 0 : return b.array.elems.At(row)
221 : }
222 :
223 : // Set sets the value of the provided row index to v.
224 0 : func (b *UintBuilder) Set(row int, v uint64) {
225 0 : if b.array.n <= row {
226 0 : // Double the size of the allocated array, or initialize it to at least 32
227 0 : // values (256 bytes) if this is the first allocation. Then double until
228 0 : // there's sufficient space.
229 0 : n2 := max(b.array.n<<1, 32)
230 0 : for n2 <= row {
231 0 : n2 <<= 1 /* double the size */
232 0 : }
233 : // NB: Go guarantees the allocated array will be 64-bit aligned.
234 0 : newDataTyped := make([]uint64, n2)
235 0 : copy(newDataTyped, b.array.elems.Slice(b.array.n))
236 0 : newElems := makeUnsafeRawSlice[uint64](unsafe.Pointer(&newDataTyped[0]))
237 0 : b.array.n = n2
238 0 : b.array.elems = newElems
239 :
240 : }
241 : // Maintain the running minimum and maximum for the purpose of maintaining
242 : // knowledge of the delta encoding that would be used.
243 0 : if b.stats.minimum > v || b.stats.maximum < v {
244 0 : b.stats.minimum = min(v, b.stats.minimum)
245 0 : b.stats.maximum = max(v, b.stats.maximum)
246 0 : // If updating the minimum and maximum means that we now much use a wider
247 0 : // width integer, update the encoding and the index of the update to it.
248 0 : if e := DetermineUintEncoding(b.stats.minimum, b.stats.maximum); e != b.stats.encoding {
249 0 : b.stats.encoding = e
250 0 : b.stats.encodingRow = row
251 0 : }
252 : }
253 0 : b.array.elems.set(row, v)
254 : }
255 :
256 : // Size implements ColumnWriter and returns the size of the column if its first
257 : // [rows] rows were serialized, serializing the column into offset [offset].
258 0 : func (b *UintBuilder) Size(rows int, offset uint32) uint32 {
259 0 : if rows == 0 {
260 0 : return offset
261 0 : }
262 0 : e, _ := b.determineEncoding(rows)
263 0 : return uintColumnSize(uint32(rows), offset, e)
264 : }
265 :
266 : // determineEncoding determines the best encoding for a column containing the
267 : // first [rows], along with the minimum value (used as the "base" value when we
268 : // use a stats encoding).
269 0 : func (b *UintBuilder) determineEncoding(rows int) (_ UintEncoding, minimum uint64) {
270 0 : if b.stats.encodingRow < rows {
271 0 : // b.delta.encoding became the current value within the first [rows], so we
272 0 : // can use it.
273 0 : return b.stats.encoding, b.stats.minimum
274 0 : }
275 :
276 : // We have to recalculate the minimum and maximum.
277 0 : minimum, maximum := computeMinMax(b.array.elems.Slice(rows))
278 0 : return DetermineUintEncoding(minimum, maximum), minimum
279 : }
280 :
281 : // uintColumnSize returns the size of a column of unsigned integers, encoded at
282 : // the provided offset using the provided width. If width < sizeof(T), then a
283 : // delta encoding is assumed.
284 0 : func uintColumnSize(rows, offset uint32, e UintEncoding) uint32 {
285 0 : offset++ // DeltaEncoding byte
286 0 : if e.IsDelta() {
287 0 : // A delta encoding will be used. We need to first account for the constant
288 0 : // that encodes the base value.
289 0 : offset += 8
290 0 : }
291 0 : width := uint32(e.Width())
292 0 : // Include alignment bytes necessary to align offset appropriately for
293 0 : // elements of the delta width.
294 0 : if width > 0 {
295 0 : offset = align(offset, width)
296 0 : }
297 : // Now account for the array of [rows] x w elements encoding the deltas.
298 0 : return offset + rows*width
299 : }
300 :
301 : // Finish implements ColumnWriter, serializing the column into offset [offset] of
302 : // [buf].
303 0 : func (b *UintBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
304 0 : if rows == 0 {
305 0 : return offset
306 0 : }
307 :
308 0 : e, minimum := b.determineEncoding(rows)
309 0 :
310 0 : // NB: In some circumstances, it's possible for b.array.elems.ptr to be nil.
311 0 : // Specifically, if the builder is initialized using InitWithDefault and no
312 0 : // non-default values exist, no array will have been allocated (we lazily
313 0 : // allocate b.array.elems.ptr). It's illegal to try to construct an unsafe
314 0 : // slice from a nil ptr with non-zero rows. Only attempt to construct the
315 0 : // values slice if there's actually a non-nil ptr.
316 0 : var valuesSlice []uint64
317 0 : if b.array.elems.ptr != nil {
318 0 : valuesSlice = b.array.elems.Slice(rows)
319 0 : }
320 0 : return uintColumnFinish(minimum, valuesSlice, e, offset, buf)
321 : }
322 :
323 : // uintColumnFinish finishes the column of unsigned integers of type T, applying
324 : // the given encoding.
325 : func uintColumnFinish(
326 : minimum uint64, values []uint64, e UintEncoding, offset uint32, buf []byte,
327 0 : ) uint32 {
328 0 : buf[offset] = byte(e)
329 0 : offset++
330 0 :
331 0 : deltaBase := uint64(0)
332 0 : if e.IsDelta() {
333 0 : deltaBase = minimum
334 0 : binary.LittleEndian.PutUint64(buf[offset:], minimum)
335 0 : offset += 8
336 0 : }
337 0 : width := uint32(e.Width())
338 0 : if width == 0 {
339 0 : // All the column values are the same.
340 0 : return offset
341 0 : }
342 : // Align the offset appropriately.
343 0 : offset = alignWithZeroes(buf, offset, width)
344 0 :
345 0 : switch e.Width() {
346 0 : case 1:
347 0 : dest := makeUnsafeRawSlice[uint8](unsafe.Pointer(&buf[offset])).Slice(len(values))
348 0 : reduceUints(deltaBase, values, dest)
349 :
350 0 : case 2:
351 0 : dest := makeUnsafeRawSlice[uint16](unsafe.Pointer(&buf[offset])).Slice(len(values))
352 0 : reduceUints(deltaBase, values, dest)
353 :
354 0 : case 4:
355 0 : dest := makeUnsafeRawSlice[uint32](unsafe.Pointer(&buf[offset])).Slice(len(values))
356 0 : reduceUints(deltaBase, values, dest)
357 :
358 0 : case 8:
359 0 : if deltaBase != 0 {
360 0 : panic("unreachable")
361 : }
362 0 : dest := makeUnsafeRawSlice[uint64](unsafe.Pointer(&buf[offset])).Slice(len(values))
363 0 : copy(dest, values)
364 :
365 0 : default:
366 0 : panic("unreachable")
367 : }
368 0 : return offset + uint32(len(values))*width
369 : }
370 :
371 : // WriteDebug implements Encoder.
372 0 : func (b *UintBuilder) WriteDebug(w io.Writer, rows int) {
373 0 : fmt.Fprintf(w, "%s: %d rows", DataTypeUint, rows)
374 0 : }
375 :
376 : // reduceUints reduces the bit-width of a slice of unsigned by subtracting a
377 : // minimum value from each element and writing it to dst. For example,
378 : //
379 : // reduceUints[uint64, uint8](10, []uint64{10, 11, 12}, dst)
380 : //
381 : // could be used to reduce a slice of uint64 values to uint8 values {0, 1, 2}.
382 0 : func reduceUints[O constraints.Integer, N constraints.Integer](minimum O, values []O, dst []N) {
383 0 : for i := 0; i < len(values); i++ {
384 0 : dst[i] = N(values[i] - minimum)
385 0 : }
386 : }
387 :
388 : // computeMinMax computes the minimum and the maximum of the provided slice of
389 : // unsigned integers.
390 0 : func computeMinMax[I constraints.Unsigned](values []I) (I, I) {
391 0 : minimum := I(0) - 1
392 0 : maximum := I(0)
393 0 : for _, v := range values {
394 0 : minimum = min(minimum, v)
395 0 : maximum = max(maximum, v)
396 0 : }
397 0 : return minimum, maximum
398 : }
399 :
400 : func uintsToBinFormatter(
401 : f *binfmt.Formatter, rows int, uintFormatter func(el, base uint64) string,
402 0 : ) {
403 0 : if rows == 0 {
404 0 : return
405 0 : }
406 0 : if uintFormatter == nil {
407 0 : uintFormatter = func(v, base uint64) string {
408 0 : if base == 0 {
409 0 : return fmt.Sprint(v)
410 0 : }
411 0 : return fmt.Sprintf("%d + %d = %d", v, base, base+v)
412 : }
413 : }
414 :
415 0 : e := UintEncoding(f.PeekUint(1)) // UintEncoding byte
416 0 : if !e.IsValid() {
417 0 : panic(fmt.Sprintf("%d", e))
418 : }
419 0 : f.HexBytesln(1, "encoding: %s", e)
420 0 :
421 0 : var base uint64
422 0 : if e.IsDelta() {
423 0 : base = f.PeekUint(8)
424 0 : f.HexBytesln(8, "64-bit constant: %d", base)
425 0 : }
426 0 : width := e.Width()
427 0 : if width == 0 {
428 0 : // The column is zero or constant.
429 0 : return
430 0 : }
431 :
432 0 : if off := align(f.RelativeOffset(), width); off != f.RelativeOffset() {
433 0 : f.HexBytesln(off-f.RelativeOffset(), "padding (aligning to %d-bit boundary)", width*8)
434 0 : }
435 0 : for i := 0; i < rows; i++ {
436 0 : f.HexBytesln(width, "data[%d] = %s", i, uintFormatter(f.PeekUint(width), base))
437 0 : }
438 : }
|