LCOV - code coverage report
Current view: top level - pebble/sstable/colblk - uints.go (source / functions) Hit Total Coverage
Test: 2024-09-15 08:16Z 6c9ad29b - meta test only.lcov Lines: 0 234 0.0 %
Date: 2024-09-15 08:17:01 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package colblk
       6             : 
       7             : import (
       8             :         "encoding/binary"
       9             :         "fmt"
      10             :         "io"
      11             :         "math"
      12             :         "math/bits"
      13             :         "unsafe"
      14             : 
      15             :         "github.com/cockroachdb/errors"
      16             :         "github.com/cockroachdb/pebble/internal/binfmt"
      17             :         "github.com/cockroachdb/pebble/internal/invariants"
      18             :         "golang.org/x/exp/constraints"
      19             : )
      20             : 
      21             : // Uint is a constraint that permits any unsigned integer type with an
      22             : // explicit size.
      23             : type Uint interface {
      24             :         ~uint8 | ~uint16 | ~uint32 | ~uint64
      25             : }
      26             : 
      27             : // UintEncoding indicates how unsigned integers (of at most 64 bits) are
      28             : // encoded. It has two components:
      29             : //   - the low bits indicate how many bytes per integer are used, with
      30             : //     allowed values 0, 1, 2, 4, or 8.
      31             : //   - whether we are using a delta encoding, meaning that a base (64-bit) value
      32             : //     is encoded separately and each encoded value is a delta from that base.
      33             : //     Delta encoding is never necessary when we use 8 bytes per integer.
      34             : //
      35             : // Note that 0-byte encodings imply that all values are equal (either to the
      36             : // base value if we are using a delta encoding, otherwise to 0).
      37             : //
      38             : // The UintEncoding byte is serialized to the uint column before the column
      39             : // data.
      40             : type UintEncoding uint8
      41             : 
      42             : const uintEncodingDeltaBit UintEncoding = 1 << 7
      43             : const uintEncodingAllZero UintEncoding = 0
      44             : 
      45             : // IsDelta returns true if it is a delta encoding.
      46           0 : func (e UintEncoding) IsDelta() bool {
      47           0 :         return e&uintEncodingDeltaBit != 0
      48           0 : }
      49             : 
      50             : // Width returns the number of bytes used per integer. It can be 0, 1, 2, 4, or 8.
      51           0 : func (e UintEncoding) Width() int {
      52           0 :         return int(e &^ uintEncodingDeltaBit)
      53           0 : }
      54             : 
      55             : // IsValid returns true if the encoding is valid.
      56           0 : func (e UintEncoding) IsValid() bool {
      57           0 :         switch e.Width() {
      58           0 :         case 0, 1, 2, 4:
      59           0 :                 return true
      60           0 :         case 8:
      61           0 :                 // We should never need to do delta encoding if we store all 64 bits.
      62           0 :                 return !e.IsDelta()
      63           0 :         default:
      64           0 :                 return false
      65             :         }
      66             : }
      67             : 
      68             : // String implements fmt.Stringer.
      69           0 : func (e UintEncoding) String() string {
      70           0 :         if e.Width() == 0 {
      71           0 :                 if e.IsDelta() {
      72           0 :                         return "const"
      73           0 :                 }
      74           0 :                 return "zero"
      75             :         }
      76           0 :         deltaString := ""
      77           0 :         if e.IsDelta() {
      78           0 :                 deltaString = ",delta"
      79           0 :         }
      80           0 :         return fmt.Sprintf("%db%s", e.Width(), deltaString)
      81             : }
      82             : 
      83             : // DetermineUintEncoding returns the best valid encoding that can be used to
      84             : // represent integers in the range [minValue, maxValue].
      85           0 : func DetermineUintEncoding(minValue, maxValue uint64) UintEncoding {
      86           0 :         // Find the number of bytes-per-value necessary for a delta encoding.
      87           0 :         b := (bits.Len64(maxValue-minValue) + 7) >> 3
      88           0 :         // Round up to the nearest allowed value (0, 1, 2, 4, or 8).
      89           0 :         if b > 4 {
      90           0 :                 return UintEncoding(8)
      91           0 :         }
      92           0 :         if b == 3 {
      93           0 :                 b = 4
      94           0 :         }
      95             :         // Check if we can use the same number of bytes without a delta encoding.
      96           0 :         isDelta := maxValue >= (1 << (b << 3))
      97           0 :         return makeUintEncoding(b, isDelta)
      98             : }
      99             : 
     100           0 : func makeUintEncoding(width int, isDelta bool) UintEncoding {
     101           0 :         e := UintEncoding(width)
     102           0 :         if isDelta {
     103           0 :                 e |= uintEncodingDeltaBit
     104           0 :         }
     105           0 :         if invariants.Enabled && !e.IsValid() {
     106           0 :                 panic(e)
     107             :         }
     108           0 :         return e
     109             : }
     110             : 
     111             : // UintBuilder builds a column of unsigned integers. It uses the smallest
     112             : // possible UintEncoding, depending on the values.
     113             : type UintBuilder struct {
     114             :         // configuration fixed on Init; preserved across Reset
     115             :         useDefault bool
     116             : 
     117             :         // array holds the underlying heap-allocated array in which values are
     118             :         // stored.
     119             :         array struct {
     120             :                 // n is the size of the array (in count of T elements; not bytes). n is
     121             :                 // NOT the number of elements that have been populated by the user.
     122             :                 n int
     123             :                 // elems provides access to elements without bounds checking. elems is
     124             :                 // grown automatically in Set.
     125             :                 elems UnsafeRawSlice[uint64]
     126             :         }
     127             :         // stats holds state for the purpose of tracking which UintEncoding would
     128             :         // be used if the caller Finished the column including all elements Set so
     129             :         // far. The stats state is used by Size (and Finish) to cheaply determine
     130             :         // which encoding may most concisely encode the array.
     131             :         //
     132             :         // Every Set(i, v) call updates minimum and maximum if necessary. If a call
     133             :         // updates minimum, maximum or both, it recalculates the encoding and if it
     134             :         // changed sets sets encodingRow=i, indicating which row last updated the
     135             :         // width.
     136             :         //
     137             :         // Any call to Size or Finish that supplies [rows] that's inclusive of the
     138             :         // index stored in widthRow may use the stored width. Calls with fewer
     139             :         // [rows] must recompute the min/max. In expected usage, only Finish will be
     140             :         // called with fewer rows and only with one less row than has been set,
     141             :         // meaning that only if the last row updated the width is a recomputation
     142             :         // necessary.
     143             :         //
     144             :         // TODO(jackson): There is a small discrete set of possible encodings, so we
     145             :         // could instead track the index of the first row that makes each encoding
     146             :         // impossible. This would allow us to avoid recomputing the min/max in all
     147             :         // cases. Or, if we limit the API to only allow Finish to be called with one
     148             :         // less than the last set row, we could maintain the width of only the last
     149             :         // two rows.
     150             :         stats struct {
     151             :                 minimum     uint64
     152             :                 maximum     uint64
     153             :                 encoding    UintEncoding
     154             :                 encodingRow int // index of last update to encoding
     155             :         }
     156             : }
     157             : 
     158             : // Init initializes the UintBuilder.
     159           0 : func (b *UintBuilder) Init() {
     160           0 :         b.init(false)
     161           0 : }
     162             : 
     163             : // InitWithDefault initializes the UintBuilder. Any rows that are not explicitly
     164             : // set are assumed to be zero. For the purpose of determining whether a delta
     165             : // encoding is possible, the column is assumed to contain at least 1 default
     166             : // value.
     167             : //
     168             : // InitWithDefault may be preferrable when a nonzero value is uncommon, and the
     169             : // caller can avoid explicitly Set-ing every zero value.
     170           0 : func (b *UintBuilder) InitWithDefault() {
     171           0 :         b.init(true)
     172           0 : }
     173             : 
     174           0 : func (b *UintBuilder) init(useDefault bool) {
     175           0 :         b.useDefault = useDefault
     176           0 :         b.Reset()
     177           0 : }
     178             : 
     179             : // NumColumns implements ColumnWriter.
     180           0 : func (b *UintBuilder) NumColumns() int { return 1 }
     181             : 
     182             : // DataType implements ColumnWriter.
     183           0 : func (b *UintBuilder) DataType(int) DataType { return DataTypeUint }
     184             : 
     185             : // Reset implements ColumnWriter and resets the builder, reusing existing
     186             : // allocated memory.
     187           0 : func (b *UintBuilder) Reset() {
     188           0 :         if b.useDefault {
     189           0 :                 // If the caller configured a default zero, we assume that the array
     190           0 :                 // will include at least one default value.
     191           0 :                 b.stats.minimum = 0
     192           0 :                 b.stats.maximum = 0
     193           0 :                 clear(b.array.elems.Slice(b.array.n))
     194           0 :         } else {
     195           0 :                 b.stats.minimum = math.MaxUint64
     196           0 :                 b.stats.maximum = 0
     197           0 :                 // We could reset all values as a precaution, but it has a visible cost
     198           0 :                 // in benchmarks.
     199           0 :                 if invariants.Sometimes(50) {
     200           0 :                         for i := 0; i < b.array.n; i++ {
     201           0 :                                 b.array.elems.set(i, math.MaxUint64)
     202           0 :                         }
     203             :                 }
     204             :         }
     205           0 :         b.stats.encoding = uintEncodingAllZero
     206           0 :         b.stats.encodingRow = 0
     207             : }
     208             : 
     209             : // Get gets the value of the provided row index. The provided row must have been
     210             : // Set or the builder must have been initialized with InitWithDefault.
     211           0 : func (b *UintBuilder) Get(row int) uint64 {
     212           0 :         // If the UintBuilder is configured to use a zero value for unset rows, it's
     213           0 :         // possible that the array has not been grown to a size that includes [row].
     214           0 :         if b.array.n <= row {
     215           0 :                 if invariants.Enabled && !b.useDefault {
     216           0 :                         panic(errors.AssertionFailedf("Get(%d) on UintBuilder with array of size %d", row, b.array.n))
     217             :                 }
     218           0 :                 return 0
     219             :         }
     220           0 :         return b.array.elems.At(row)
     221             : }
     222             : 
     223             : // Set sets the value of the provided row index to v.
     224           0 : func (b *UintBuilder) Set(row int, v uint64) {
     225           0 :         if b.array.n <= row {
     226           0 :                 // Double the size of the allocated array, or initialize it to at least 32
     227           0 :                 // values (256 bytes) if this is the first allocation. Then double until
     228           0 :                 // there's sufficient space.
     229           0 :                 n2 := max(b.array.n<<1, 32)
     230           0 :                 for n2 <= row {
     231           0 :                         n2 <<= 1 /* double the size */
     232           0 :                 }
     233             :                 // NB: Go guarantees the allocated array will be 64-bit aligned.
     234           0 :                 newDataTyped := make([]uint64, n2)
     235           0 :                 copy(newDataTyped, b.array.elems.Slice(b.array.n))
     236           0 :                 newElems := makeUnsafeRawSlice[uint64](unsafe.Pointer(&newDataTyped[0]))
     237           0 :                 b.array.n = n2
     238           0 :                 b.array.elems = newElems
     239             : 
     240             :         }
     241             :         // Maintain the running minimum and maximum for the purpose of maintaining
     242             :         // knowledge of the delta encoding that would be used.
     243           0 :         if b.stats.minimum > v || b.stats.maximum < v {
     244           0 :                 b.stats.minimum = min(v, b.stats.minimum)
     245           0 :                 b.stats.maximum = max(v, b.stats.maximum)
     246           0 :                 // If updating the minimum and maximum means that we now much use a wider
     247           0 :                 // width integer, update the encoding and the index of the update to it.
     248           0 :                 if e := DetermineUintEncoding(b.stats.minimum, b.stats.maximum); e != b.stats.encoding {
     249           0 :                         b.stats.encoding = e
     250           0 :                         b.stats.encodingRow = row
     251           0 :                 }
     252             :         }
     253           0 :         b.array.elems.set(row, v)
     254             : }
     255             : 
     256             : // Size implements ColumnWriter and returns the size of the column if its first
     257             : // [rows] rows were serialized, serializing the column into offset [offset].
     258           0 : func (b *UintBuilder) Size(rows int, offset uint32) uint32 {
     259           0 :         if rows == 0 {
     260           0 :                 return offset
     261           0 :         }
     262           0 :         e, _ := b.determineEncoding(rows)
     263           0 :         return uintColumnSize(uint32(rows), offset, e)
     264             : }
     265             : 
     266             : // determineEncoding determines the best encoding for a column containing the
     267             : // first [rows], along with the minimum value (used as the "base" value when we
     268             : // use a stats encoding).
     269           0 : func (b *UintBuilder) determineEncoding(rows int) (_ UintEncoding, minimum uint64) {
     270           0 :         if b.stats.encodingRow < rows {
     271           0 :                 // b.delta.encoding became the current value within the first [rows], so we
     272           0 :                 // can use it.
     273           0 :                 return b.stats.encoding, b.stats.minimum
     274           0 :         }
     275             : 
     276             :         // We have to recalculate the minimum and maximum.
     277           0 :         minimum, maximum := computeMinMax(b.array.elems.Slice(rows))
     278           0 :         return DetermineUintEncoding(minimum, maximum), minimum
     279             : }
     280             : 
     281             : // uintColumnSize returns the size of a column of unsigned integers, encoded at
     282             : // the provided offset using the provided width. If width < sizeof(T), then a
     283             : // delta encoding is assumed.
     284           0 : func uintColumnSize(rows, offset uint32, e UintEncoding) uint32 {
     285           0 :         offset++ // DeltaEncoding byte
     286           0 :         if e.IsDelta() {
     287           0 :                 // A delta encoding will be used. We need to first account for the constant
     288           0 :                 // that encodes the base value.
     289           0 :                 offset += 8
     290           0 :         }
     291           0 :         width := uint32(e.Width())
     292           0 :         // Include alignment bytes necessary to align offset appropriately for
     293           0 :         // elements of the delta width.
     294           0 :         if width > 0 {
     295           0 :                 offset = align(offset, width)
     296           0 :         }
     297             :         // Now account for the array of [rows] x w elements encoding the deltas.
     298           0 :         return offset + rows*width
     299             : }
     300             : 
     301             : // Finish implements ColumnWriter, serializing the column into offset [offset] of
     302             : // [buf].
     303           0 : func (b *UintBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
     304           0 :         if rows == 0 {
     305           0 :                 return offset
     306           0 :         }
     307             : 
     308           0 :         e, minimum := b.determineEncoding(rows)
     309           0 : 
     310           0 :         // NB: In some circumstances, it's possible for b.array.elems.ptr to be nil.
     311           0 :         // Specifically, if the builder is initialized using InitWithDefault and no
     312           0 :         // non-default values exist, no array will have been allocated (we lazily
     313           0 :         // allocate b.array.elems.ptr). It's illegal to try to construct an unsafe
     314           0 :         // slice from a nil ptr with non-zero rows. Only attempt to construct the
     315           0 :         // values slice if there's actually a non-nil ptr.
     316           0 :         var valuesSlice []uint64
     317           0 :         if b.array.elems.ptr != nil {
     318           0 :                 valuesSlice = b.array.elems.Slice(rows)
     319           0 :         }
     320           0 :         return uintColumnFinish(minimum, valuesSlice, e, offset, buf)
     321             : }
     322             : 
     323             : // uintColumnFinish finishes the column of unsigned integers of type T, applying
     324             : // the given encoding.
     325             : func uintColumnFinish(
     326             :         minimum uint64, values []uint64, e UintEncoding, offset uint32, buf []byte,
     327           0 : ) uint32 {
     328           0 :         buf[offset] = byte(e)
     329           0 :         offset++
     330           0 : 
     331           0 :         deltaBase := uint64(0)
     332           0 :         if e.IsDelta() {
     333           0 :                 deltaBase = minimum
     334           0 :                 binary.LittleEndian.PutUint64(buf[offset:], minimum)
     335           0 :                 offset += 8
     336           0 :         }
     337           0 :         width := uint32(e.Width())
     338           0 :         if width == 0 {
     339           0 :                 // All the column values are the same.
     340           0 :                 return offset
     341           0 :         }
     342             :         // Align the offset appropriately.
     343           0 :         offset = alignWithZeroes(buf, offset, width)
     344           0 : 
     345           0 :         switch e.Width() {
     346           0 :         case 1:
     347           0 :                 dest := makeUnsafeRawSlice[uint8](unsafe.Pointer(&buf[offset])).Slice(len(values))
     348           0 :                 reduceUints(deltaBase, values, dest)
     349             : 
     350           0 :         case 2:
     351           0 :                 dest := makeUnsafeRawSlice[uint16](unsafe.Pointer(&buf[offset])).Slice(len(values))
     352           0 :                 reduceUints(deltaBase, values, dest)
     353             : 
     354           0 :         case 4:
     355           0 :                 dest := makeUnsafeRawSlice[uint32](unsafe.Pointer(&buf[offset])).Slice(len(values))
     356           0 :                 reduceUints(deltaBase, values, dest)
     357             : 
     358           0 :         case 8:
     359           0 :                 if deltaBase != 0 {
     360           0 :                         panic("unreachable")
     361             :                 }
     362           0 :                 dest := makeUnsafeRawSlice[uint64](unsafe.Pointer(&buf[offset])).Slice(len(values))
     363           0 :                 copy(dest, values)
     364             : 
     365           0 :         default:
     366           0 :                 panic("unreachable")
     367             :         }
     368           0 :         return offset + uint32(len(values))*width
     369             : }
     370             : 
     371             : // WriteDebug implements Encoder.
     372           0 : func (b *UintBuilder) WriteDebug(w io.Writer, rows int) {
     373           0 :         fmt.Fprintf(w, "%s: %d rows", DataTypeUint, rows)
     374           0 : }
     375             : 
     376             : // reduceUints reduces the bit-width of a slice of unsigned by subtracting a
     377             : // minimum value from each element and writing it to dst. For example,
     378             : //
     379             : //      reduceUints[uint64, uint8](10, []uint64{10, 11, 12}, dst)
     380             : //
     381             : // could be used to reduce a slice of uint64 values to uint8 values {0, 1, 2}.
     382           0 : func reduceUints[O constraints.Integer, N constraints.Integer](minimum O, values []O, dst []N) {
     383           0 :         for i := 0; i < len(values); i++ {
     384           0 :                 dst[i] = N(values[i] - minimum)
     385           0 :         }
     386             : }
     387             : 
     388             : // computeMinMax computes the minimum and the maximum of the provided slice of
     389             : // unsigned integers.
     390           0 : func computeMinMax[I constraints.Unsigned](values []I) (I, I) {
     391           0 :         minimum := I(0) - 1
     392           0 :         maximum := I(0)
     393           0 :         for _, v := range values {
     394           0 :                 minimum = min(minimum, v)
     395           0 :                 maximum = max(maximum, v)
     396           0 :         }
     397           0 :         return minimum, maximum
     398             : }
     399             : 
     400             : func uintsToBinFormatter(
     401             :         f *binfmt.Formatter, rows int, uintFormatter func(el, base uint64) string,
     402           0 : ) {
     403           0 :         if rows == 0 {
     404           0 :                 return
     405           0 :         }
     406           0 :         if uintFormatter == nil {
     407           0 :                 uintFormatter = func(v, base uint64) string {
     408           0 :                         if base == 0 {
     409           0 :                                 return fmt.Sprint(v)
     410           0 :                         }
     411           0 :                         return fmt.Sprintf("%d + %d = %d", v, base, base+v)
     412             :                 }
     413             :         }
     414             : 
     415           0 :         e := UintEncoding(f.PeekUint(1)) // UintEncoding byte
     416           0 :         if !e.IsValid() {
     417           0 :                 panic(fmt.Sprintf("%d", e))
     418             :         }
     419           0 :         f.HexBytesln(1, "encoding: %s", e)
     420           0 : 
     421           0 :         var base uint64
     422           0 :         if e.IsDelta() {
     423           0 :                 base = f.PeekUint(8)
     424           0 :                 f.HexBytesln(8, "64-bit constant: %d", base)
     425           0 :         }
     426           0 :         width := e.Width()
     427           0 :         if width == 0 {
     428           0 :                 // The column is zero or constant.
     429           0 :                 return
     430           0 :         }
     431             : 
     432           0 :         if off := align(f.RelativeOffset(), width); off != f.RelativeOffset() {
     433           0 :                 f.HexBytesln(off-f.RelativeOffset(), "padding (aligning to %d-bit boundary)", width*8)
     434           0 :         }
     435           0 :         for i := 0; i < rows; i++ {
     436           0 :                 f.HexBytesln(width, "data[%d] = %s", i, uintFormatter(f.PeekUint(width), base))
     437           0 :         }
     438             : }

Generated by: LCOV version 1.14