LCOV - code coverage report
Current view: top level - pebble/sstable/colblk - uints.go (source / functions) Hit Total Coverage
Test: 2024-10-17 08:16Z dd26bf0e - tests only.lcov Lines: 257 270 95.2 %
Date: 2024-10-17 08:18:17 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package colblk
       6             : 
       7             : import (
       8             :         "encoding/binary"
       9             :         "fmt"
      10             :         "io"
      11             :         "math"
      12             :         "math/bits"
      13             :         "unsafe"
      14             : 
      15             :         "github.com/cockroachdb/errors"
      16             :         "github.com/cockroachdb/pebble/internal/binfmt"
      17             :         "github.com/cockroachdb/pebble/internal/invariants"
      18             :         "github.com/cockroachdb/pebble/internal/treeprinter"
      19             :         "golang.org/x/exp/constraints"
      20             : )
      21             : 
      22             : // Uint is a constraint that permits any unsigned integer type with an
      23             : // explicit size.
      24             : type Uint interface {
      25             :         ~uint8 | ~uint16 | ~uint32 | ~uint64
      26             : }
      27             : 
      28             : // UintEncoding indicates how unsigned integers (of at most 64 bits) are
      29             : // encoded. It has two components:
      30             : //   - the low bits indicate how many bytes per integer are used, with
      31             : //     allowed values 0, 1, 2, 4, or 8.
      32             : //   - whether we are using a delta encoding, meaning that a base (64-bit) value
      33             : //     is encoded separately and each encoded value is a delta from that base.
      34             : //     Delta encoding is never necessary when we use 8 bytes per integer.
      35             : //
      36             : // Note that 0-byte encodings imply that all values are equal (either to the
      37             : // base value if we are using a delta encoding, otherwise to 0).
      38             : //
      39             : // The UintEncoding byte is serialized to the uint column before the column
      40             : // data.
      41             : type UintEncoding uint8
      42             : 
      43             : const uintEncodingDeltaBit UintEncoding = 1 << 7
      44             : const uintEncodingAllZero UintEncoding = 0
      45             : 
      46             : // IsDelta returns true if it is a delta encoding.
      47           1 : func (e UintEncoding) IsDelta() bool {
      48           1 :         return e&uintEncodingDeltaBit != 0
      49           1 : }
      50             : 
      51             : // Width returns the number of bytes used per integer. It can be 0, 1, 2, 4, or 8.
      52           1 : func (e UintEncoding) Width() int {
      53           1 :         return int(e &^ uintEncodingDeltaBit)
      54           1 : }
      55             : 
      56             : // IsValid returns true if the encoding is valid.
      57           1 : func (e UintEncoding) IsValid() bool {
      58           1 :         switch e.Width() {
      59           1 :         case 0, 1, 2, 4:
      60           1 :                 return true
      61           1 :         case 8:
      62           1 :                 // We should never need to do delta encoding if we store all 64 bits.
      63           1 :                 return !e.IsDelta()
      64           0 :         default:
      65           0 :                 return false
      66             :         }
      67             : }
      68             : 
      69             : // String implements fmt.Stringer.
      70           1 : func (e UintEncoding) String() string {
      71           1 :         if e.Width() == 0 {
      72           1 :                 if e.IsDelta() {
      73           1 :                         return "const"
      74           1 :                 }
      75           1 :                 return "zero"
      76             :         }
      77           1 :         deltaString := ""
      78           1 :         if e.IsDelta() {
      79           1 :                 deltaString = ",delta"
      80           1 :         }
      81           1 :         return fmt.Sprintf("%db%s", e.Width(), deltaString)
      82             : }
      83             : 
      84             : // UintEncodingRowThreshold is the threshold under which the number of rows can
      85             : // affect the best encoding. This happens when the constant 8 bytes for the
      86             : // delta base doesn't make up for saving a byte or two in the per-row encoding.
      87             : const UintEncodingRowThreshold = 8
      88             : 
      89             : // DetermineUintEncoding returns the best valid encoding that can be used to
      90             : // represent numRows integers in the range [minValue, maxValue].
      91             : //
      92             : // DetermineUintEncoding returns the same result for any value of rows >=
      93             : // UintEncodingRowThreshold.
      94           1 : func DetermineUintEncoding(minValue, maxValue uint64, numRows int) UintEncoding {
      95           1 :         b := byteWidth(maxValue - minValue)
      96           1 :         if b == 8 {
      97           1 :                 return UintEncoding(8)
      98           1 :         }
      99             :         // Check if we can use the same number of bytes without a delta encoding.
     100           1 :         isDelta := maxValue >= (1 << (b << 3))
     101           1 :         if isDelta && numRows < UintEncodingRowThreshold {
     102           1 :                 bNoDelta := byteWidth(maxValue)
     103           1 :                 // Check if saving (bNoDelta-b) bytes per row makes up for the 8 bytes
     104           1 :                 // required by the delta base.
     105           1 :                 if numRows*int(bNoDelta-b) < 8 {
     106           1 :                         b = bNoDelta
     107           1 :                         isDelta = false
     108           1 :                 }
     109             :         }
     110           1 :         return makeUintEncoding(b, isDelta)
     111             : }
     112             : 
     113             : // byteWidth returns the number of bytes necessary to represent the given value,
     114             : // either 0, 1, 2, 4, or 8.
     115           1 : func byteWidth(maxValue uint64) uint8 {
     116           1 :         // b the number of bytes necessary to represent maxValue.
     117           1 :         b := (uint8(bits.Len64(maxValue)) + 7) >> 3
     118           1 :         // Round up to the nearest power of 2 by subtracting one and filling out all bits.
     119           1 :         b--
     120           1 :         b |= b >> 1
     121           1 :         b |= b >> 2
     122           1 :         return b + 1
     123           1 : }
     124             : 
     125           1 : func makeUintEncoding(width uint8, isDelta bool) UintEncoding {
     126           1 :         e := UintEncoding(width)
     127           1 :         if isDelta {
     128           1 :                 e |= uintEncodingDeltaBit
     129           1 :         }
     130           1 :         if invariants.Enabled && !e.IsValid() {
     131           0 :                 panic(e)
     132             :         }
     133           1 :         return e
     134             : }
     135             : 
     136             : // UintBuilder builds a column of unsigned integers. It uses the smallest
     137             : // possible UintEncoding, depending on the values.
     138             : type UintBuilder struct {
     139             :         // configuration fixed on Init; preserved across Reset
     140             :         useDefault bool
     141             : 
     142             :         // array holds the underlying heap-allocated array in which values are
     143             :         // stored.
     144             :         array struct {
     145             :                 // n is the size of the array (in count of T elements; not bytes). n is
     146             :                 // NOT the number of elements that have been populated by the user.
     147             :                 n int
     148             :                 // elems provides access to elements without bounds checking. elems is
     149             :                 // grown automatically in Set.
     150             :                 elems UnsafeRawSlice[uint64]
     151             :         }
     152             :         // stats holds state for the purpose of tracking which UintEncoding would
     153             :         // be used if the caller Finished the column including all elements Set so
     154             :         // far. The stats state is used by Size (and Finish) to cheaply determine
     155             :         // which encoding may most concisely encode the array.
     156             :         //
     157             :         // Every Set(i, v) call updates minimum and maximum if necessary. If a call
     158             :         // updates minimum, maximum or both, it recalculates the encoding and if it
     159             :         // changed sets sets encodingRow=i, indicating which row last updated the
     160             :         // width.
     161             :         //
     162             :         // Any call to Size or Finish that supplies [rows] that's inclusive of the
     163             :         // index stored in widthRow may use the stored width. Calls with fewer
     164             :         // [rows] must recompute the min/max. In expected usage, only Finish will be
     165             :         // called with fewer rows and only with one less row than has been set,
     166             :         // meaning that only if the last row updated the width is a recomputation
     167             :         // necessary.
     168             :         //
     169             :         // TODO(jackson): There is a small discrete set of possible encodings, so we
     170             :         // could instead track the index of the first row that makes each encoding
     171             :         // impossible. This would allow us to avoid recomputing the min/max in all
     172             :         // cases. Or, if we limit the API to only allow Finish to be called with one
     173             :         // less than the last set row, we could maintain the width of only the last
     174             :         // two rows.
     175             :         stats struct {
     176             :                 minimum     uint64
     177             :                 maximum     uint64
     178             :                 encoding    UintEncoding
     179             :                 encodingRow int // index of last update to encoding
     180             :         }
     181             : }
     182             : 
     183             : // Init initializes the UintBuilder.
     184           1 : func (b *UintBuilder) Init() {
     185           1 :         b.init(false)
     186           1 : }
     187             : 
     188             : // InitWithDefault initializes the UintBuilder. Any rows that are not explicitly
     189             : // set are assumed to be zero. For the purpose of determining whether a delta
     190             : // encoding is possible, the column is assumed to contain at least 1 default
     191             : // value.
     192             : //
     193             : // InitWithDefault may be preferrable when a nonzero value is uncommon, and the
     194             : // caller can avoid explicitly Set-ing every zero value.
     195           1 : func (b *UintBuilder) InitWithDefault() {
     196           1 :         b.init(true)
     197           1 : }
     198             : 
     199           1 : func (b *UintBuilder) init(useDefault bool) {
     200           1 :         b.useDefault = useDefault
     201           1 :         b.Reset()
     202           1 : }
     203             : 
     204             : // NumColumns implements ColumnWriter.
     205           1 : func (b *UintBuilder) NumColumns() int { return 1 }
     206             : 
     207             : // DataType implements ColumnWriter.
     208           1 : func (b *UintBuilder) DataType(int) DataType { return DataTypeUint }
     209             : 
     210             : // Reset implements ColumnWriter and resets the builder, reusing existing
     211             : // allocated memory.
     212           1 : func (b *UintBuilder) Reset() {
     213           1 :         if b.useDefault {
     214           1 :                 // If the caller configured a default zero, we assume that the array
     215           1 :                 // will include at least one default value.
     216           1 :                 b.stats.minimum = 0
     217           1 :                 b.stats.maximum = 0
     218           1 :                 clear(b.array.elems.Slice(b.array.n))
     219           1 :         } else {
     220           1 :                 b.stats.minimum = math.MaxUint64
     221           1 :                 b.stats.maximum = 0
     222           1 :                 // We could reset all values as a precaution, but it has a visible cost
     223           1 :                 // in benchmarks.
     224           1 :                 if invariants.Sometimes(50) {
     225           1 :                         for i := 0; i < b.array.n; i++ {
     226           1 :                                 b.array.elems.set(i, math.MaxUint64)
     227           1 :                         }
     228             :                 }
     229             :         }
     230           1 :         b.stats.encoding = uintEncodingAllZero
     231           1 :         b.stats.encodingRow = 0
     232             : }
     233             : 
     234             : // Get gets the value of the provided row index. The provided row must have been
     235             : // Set or the builder must have been initialized with InitWithDefault.
     236           1 : func (b *UintBuilder) Get(row int) uint64 {
     237           1 :         // If the UintBuilder is configured to use a zero value for unset rows, it's
     238           1 :         // possible that the array has not been grown to a size that includes [row].
     239           1 :         if b.array.n <= row {
     240           1 :                 if invariants.Enabled && !b.useDefault {
     241           0 :                         panic(errors.AssertionFailedf("Get(%d) on UintBuilder with array of size %d", row, b.array.n))
     242             :                 }
     243           1 :                 return 0
     244             :         }
     245           1 :         return b.array.elems.At(row)
     246             : }
     247             : 
     248             : // Set sets the value of the provided row index to v.
     249           1 : func (b *UintBuilder) Set(row int, v uint64) {
     250           1 :         if b.array.n <= row {
     251           1 :                 // Double the size of the allocated array, or initialize it to at least 32
     252           1 :                 // values (256 bytes) if this is the first allocation. Then double until
     253           1 :                 // there's sufficient space.
     254           1 :                 n2 := max(b.array.n<<1, 32)
     255           1 :                 for n2 <= row {
     256           0 :                         n2 <<= 1 /* double the size */
     257           0 :                 }
     258             :                 // NB: Go guarantees the allocated array will be 64-bit aligned.
     259           1 :                 newDataTyped := make([]uint64, n2)
     260           1 :                 copy(newDataTyped, b.array.elems.Slice(b.array.n))
     261           1 :                 newElems := makeUnsafeRawSlice[uint64](unsafe.Pointer(&newDataTyped[0]))
     262           1 :                 b.array.n = n2
     263           1 :                 b.array.elems = newElems
     264             : 
     265             :         }
     266             :         // Maintain the running minimum and maximum for the purpose of maintaining
     267             :         // knowledge of the delta encoding that would be used.
     268           1 :         if b.stats.minimum > v || b.stats.maximum < v || row < UintEncodingRowThreshold {
     269           1 :                 b.stats.minimum = min(v, b.stats.minimum)
     270           1 :                 b.stats.maximum = max(v, b.stats.maximum)
     271           1 :                 // If updating the minimum and maximum means that we now much use a wider
     272           1 :                 // width integer, update the encoding and the index of the update to it.
     273           1 :                 if e := DetermineUintEncoding(b.stats.minimum, b.stats.maximum, row+1); e != b.stats.encoding {
     274           1 :                         b.stats.encoding = e
     275           1 :                         b.stats.encodingRow = row
     276           1 :                 }
     277             :         }
     278           1 :         b.array.elems.set(row, v)
     279             : }
     280             : 
     281             : // Size implements ColumnWriter and returns the size of the column if its first
     282             : // [rows] rows were serialized, serializing the column into offset [offset].
     283           1 : func (b *UintBuilder) Size(rows int, offset uint32) uint32 {
     284           1 :         if rows == 0 {
     285           1 :                 return offset
     286           1 :         }
     287           1 :         e, _ := b.determineEncoding(rows)
     288           1 :         return uintColumnSize(uint32(rows), offset, e)
     289             : }
     290             : 
     291             : // determineEncoding determines the best encoding for a column containing the
     292             : // first [rows], along with a lower bound on all the values which can be used as
     293             : // a "base" if the encoding is a delta encoding.
     294           1 : func (b *UintBuilder) determineEncoding(rows int) (_ UintEncoding, deltaBase uint64) {
     295           1 :         if b.stats.encodingRow < rows {
     296           1 :                 // b.delta.encoding became the current value within the first [rows], so we
     297           1 :                 // can use it.
     298           1 :                 //
     299           1 :                 // Note that if useDefault is set, this encoding assumes there is at least
     300           1 :                 // one element with the default (zero) value, which might be pessimistic.
     301           1 :                 //
     302           1 :                 // Note that b.stats.minimum includes all rows set so far so it might be
     303           1 :                 // strictly smaller than all values up to [rows]; but it is still a suitable
     304           1 :                 // base for b.stats.encoding.
     305           1 :                 if invariants.Sometimes(1) && rows > 0 {
     306           1 :                         if enc, _ := b.recalculateEncoding(rows); enc != b.stats.encoding {
     307           0 :                                 panic(fmt.Sprintf("fast and slow paths don't agree: %s vs %s", b.stats.encoding, enc))
     308             :                         }
     309             :                 }
     310           1 :                 return b.stats.encoding, b.stats.minimum
     311             :         }
     312           1 :         return b.recalculateEncoding(rows)
     313             : }
     314             : 
     315           1 : func (b *UintBuilder) recalculateEncoding(rows int) (_ UintEncoding, deltaBase uint64) {
     316           1 :         // We have to recalculate the minimum and maximum.
     317           1 :         minimum, maximum := computeMinMax(b.array.elems.Slice(min(rows, b.array.n)))
     318           1 :         if b.useDefault {
     319           1 :                 // Mirror the pessimism of the fast path so that the result is consistent.
     320           1 :                 // Otherwise, adding a row can result in a different encoding even when not
     321           1 :                 // including that row.
     322           1 :                 minimum = 0
     323           1 :         }
     324           1 :         return DetermineUintEncoding(minimum, maximum, rows), minimum
     325             : }
     326             : 
     327             : // uintColumnSize returns the size of a column of unsigned integers, encoded at
     328             : // the provided offset using the provided width. If width < sizeof(T), then a
     329             : // delta encoding is assumed.
     330           1 : func uintColumnSize(rows, offset uint32, e UintEncoding) uint32 {
     331           1 :         offset++ // DeltaEncoding byte
     332           1 :         if e.IsDelta() {
     333           1 :                 // A delta encoding will be used. We need to first account for the constant
     334           1 :                 // that encodes the base value.
     335           1 :                 offset += 8
     336           1 :         }
     337           1 :         width := uint32(e.Width())
     338           1 :         // Include alignment bytes necessary to align offset appropriately for
     339           1 :         // elements of the delta width.
     340           1 :         if width > 0 {
     341           1 :                 offset = align(offset, width)
     342           1 :         }
     343             :         // Now account for the array of [rows] x w elements encoding the deltas.
     344           1 :         return offset + rows*width
     345             : }
     346             : 
     347             : // Finish implements ColumnWriter, serializing the column into offset [offset] of
     348             : // [buf].
     349           1 : func (b *UintBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
     350           1 :         if rows == 0 {
     351           1 :                 return offset
     352           1 :         }
     353             : 
     354           1 :         e, minimum := b.determineEncoding(rows)
     355           1 : 
     356           1 :         // NB: In some circumstances, it's possible for b.array.elems.ptr to be nil.
     357           1 :         // Specifically, if the builder is initialized using InitWithDefault and no
     358           1 :         // non-default values exist, no array will have been allocated (we lazily
     359           1 :         // allocate b.array.elems.ptr). It's illegal to try to construct an unsafe
     360           1 :         // slice from a nil ptr with non-zero rows. Only attempt to construct the
     361           1 :         // values slice if there's actually a non-nil ptr.
     362           1 :         var valuesSlice []uint64
     363           1 :         if b.array.elems.ptr != nil {
     364           1 :                 valuesSlice = b.array.elems.Slice(rows)
     365           1 :         }
     366           1 :         return uintColumnFinish(minimum, valuesSlice, e, offset, buf)
     367             : }
     368             : 
     369             : // uintColumnFinish finishes the column of unsigned integers of type T, applying
     370             : // the given encoding.
     371             : func uintColumnFinish(
     372             :         minimum uint64, values []uint64, e UintEncoding, offset uint32, buf []byte,
     373           1 : ) uint32 {
     374           1 :         buf[offset] = byte(e)
     375           1 :         offset++
     376           1 : 
     377           1 :         deltaBase := uint64(0)
     378           1 :         if e.IsDelta() {
     379           1 :                 deltaBase = minimum
     380           1 :                 binary.LittleEndian.PutUint64(buf[offset:], minimum)
     381           1 :                 offset += 8
     382           1 :         }
     383           1 :         width := uint32(e.Width())
     384           1 :         if width == 0 {
     385           1 :                 // All the column values are the same.
     386           1 :                 return offset
     387           1 :         }
     388             :         // Align the offset appropriately.
     389           1 :         offset = alignWithZeroes(buf, offset, width)
     390           1 : 
     391           1 :         switch e.Width() {
     392           1 :         case 1:
     393           1 :                 dest := makeUnsafeRawSlice[uint8](unsafe.Pointer(&buf[offset])).Slice(len(values))
     394           1 :                 reduceUints(deltaBase, values, dest)
     395             : 
     396           1 :         case 2:
     397           1 :                 dest := makeUnsafeRawSlice[uint16](unsafe.Pointer(&buf[offset])).Slice(len(values))
     398           1 :                 reduceUints(deltaBase, values, dest)
     399             : 
     400           1 :         case 4:
     401           1 :                 dest := makeUnsafeRawSlice[uint32](unsafe.Pointer(&buf[offset])).Slice(len(values))
     402           1 :                 reduceUints(deltaBase, values, dest)
     403             : 
     404           1 :         case 8:
     405           1 :                 if deltaBase != 0 {
     406           0 :                         panic("unreachable")
     407             :                 }
     408           1 :                 dest := makeUnsafeRawSlice[uint64](unsafe.Pointer(&buf[offset])).Slice(len(values))
     409           1 :                 copy(dest, values)
     410             : 
     411           0 :         default:
     412           0 :                 panic("unreachable")
     413             :         }
     414           1 :         return offset + uint32(len(values))*width
     415             : }
     416             : 
     417             : // WriteDebug implements Encoder.
     418           1 : func (b *UintBuilder) WriteDebug(w io.Writer, rows int) {
     419           1 :         fmt.Fprintf(w, "%s: %d rows", DataTypeUint, rows)
     420           1 : }
     421             : 
     422             : // reduceUints reduces the bit-width of a slice of unsigned by subtracting a
     423             : // minimum value from each element and writing it to dst. For example,
     424             : //
     425             : //      reduceUints[uint8](10, []uint64{10, 11, 12}, dst)
     426             : //
     427             : // could be used to reduce a slice of uint64 values to uint8 values {0, 1, 2}.
     428           1 : func reduceUints[N constraints.Integer](minimum uint64, values []uint64, dst []N) {
     429           1 :         for i := 0; i < len(values); i++ {
     430           1 :                 if invariants.Enabled {
     431           1 :                         if values[i] < minimum {
     432           0 :                                 panic("incorrect minimum value")
     433             :                         }
     434           1 :                         if values[i]-minimum > uint64(N(0)-1) {
     435           0 :                                 panic("incorrect target width")
     436             :                         }
     437             :                 }
     438           1 :                 dst[i] = N(values[i] - minimum)
     439             :         }
     440             : }
     441             : 
     442             : // computeMinMax computes the minimum and the maximum of the provided slice of
     443             : // unsigned integers.
     444           1 : func computeMinMax[I constraints.Unsigned](values []I) (I, I) {
     445           1 :         minimum := I(0) - 1
     446           1 :         maximum := I(0)
     447           1 :         for _, v := range values {
     448           1 :                 minimum = min(minimum, v)
     449           1 :                 maximum = max(maximum, v)
     450           1 :         }
     451           1 :         return minimum, maximum
     452             : }
     453             : 
     454             : func uintsToBinFormatter(
     455             :         f *binfmt.Formatter, tp treeprinter.Node, rows int, uintFormatter func(el, base uint64) string,
     456           1 : ) {
     457           1 :         if rows == 0 {
     458           1 :                 return
     459           1 :         }
     460           1 :         if uintFormatter == nil {
     461           1 :                 uintFormatter = func(v, base uint64) string {
     462           1 :                         if base == 0 {
     463           1 :                                 return fmt.Sprint(v)
     464           1 :                         }
     465           1 :                         return fmt.Sprintf("%d + %d = %d", v, base, base+v)
     466             :                 }
     467             :         }
     468             : 
     469           1 :         e := UintEncoding(f.PeekUint(1)) // UintEncoding byte
     470           1 :         if !e.IsValid() {
     471           0 :                 panic(fmt.Sprintf("%d", e))
     472             :         }
     473           1 :         f.HexBytesln(1, "encoding: %s", e)
     474           1 : 
     475           1 :         var base uint64
     476           1 :         if e.IsDelta() {
     477           1 :                 base = f.PeekUint(8)
     478           1 :                 f.HexBytesln(8, "64-bit constant: %d", base)
     479           1 :         }
     480           1 :         width := e.Width()
     481           1 :         if width == 0 {
     482           1 :                 // The column is zero or constant.
     483           1 :                 f.ToTreePrinter(tp)
     484           1 :                 return
     485           1 :         }
     486             : 
     487           1 :         if off := align(f.RelativeOffset(), width); off != f.RelativeOffset() {
     488           1 :                 f.HexBytesln(off-f.RelativeOffset(), "padding (aligning to %d-bit boundary)", width*8)
     489           1 :         }
     490           1 :         for i := 0; i < rows; i++ {
     491           1 :                 f.HexBytesln(width, "data[%d] = %s", i, uintFormatter(f.PeekUint(width), base))
     492           1 :         }
     493           1 :         f.ToTreePrinter(tp)
     494             : }

Generated by: LCOV version 1.14