Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "io"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/pebble/internal/binfmt"
14 : "github.com/cockroachdb/pebble/internal/treeprinter"
15 : )
16 :
17 : // RawBytes holds an array of byte slices, stored as a concatenated data section
18 : // and a series of offsets for each slice. Byte slices within RawBytes are
19 : // stored in their entirety without any compression, ensuring stability without
20 : // copying.
21 : //
22 : // # Representation
23 : //
24 : // An array of N byte slices encodes N+1 offsets. The beginning of the data
25 : // representation holds an offsets table, in the same encoding as a
26 : // DataTypeUint32 column. The integer offsets may be encoded using smaller width
27 : // integers to save space if all offsets fit within an 8-bit or 16-bit uint.
28 : // Each offset is relative to the beginning of the string data section (after
29 : // the offset table).
30 : //
31 : // The use of UintEncoding conserves space in the common case. In the context of
32 : // CockroachDB, the vast majority of offsets will fit in 16-bits when using 32
33 : // KiB blocks (the size in use by CockroachDB). However, a single value larger
34 : // than 65535 bytes requires an offset too large to fit within 16 bits, in which
35 : // case offsets will be encoded as 32-bit integers.
36 : //
37 : // +-------------------------------------------------------------------+
38 : // | a uint offsets table, usually encoded with 16-bits, |
39 : // | possibly padded for alignment |
40 : // | (see UintEncoding) |
41 : // +-------------------------------------------------------------------+
42 : // | String Data |
43 : // | abcabcada.... |
44 : // +-------------------------------------------------------------------+
45 : //
46 : // The UintEncoding bits of the ColumnEncoding for a RawBytes column describes
47 : // the encoding of the offset table.
48 : type RawBytes struct {
49 : slices int
50 : offsets UnsafeOffsets
51 : start unsafe.Pointer
52 : data unsafe.Pointer
53 : }
54 :
55 : // Assert that RawBytes implements Array[[]byte].
56 : var _ Array[[]byte] = RawBytes{}
57 :
58 : // DecodeRawBytes decodes the structure of a RawBytes, constructing an accessor
59 : // for an array of byte slices constructed by RawBytesBuilder. Count must be the
60 : // number of byte slices within the array.
61 1 : func DecodeRawBytes(b []byte, offset uint32, count int) (rawBytes RawBytes, endOffset uint32) {
62 1 : if count == 0 {
63 1 : return RawBytes{}, offset
64 1 : }
65 1 : offsets, dataOff := DecodeUnsafeOffsets(b, offset, count+1 /* +1 offset */)
66 1 : return RawBytes{
67 1 : slices: count,
68 1 : offsets: offsets,
69 1 : start: unsafe.Pointer(&b[offset]),
70 1 : data: unsafe.Pointer(&b[dataOff]),
71 1 : }, dataOff + offsets.At(count)
72 : }
73 :
74 : // Assert that DecodeRawBytes implements DecodeFunc.
75 : var _ DecodeFunc[RawBytes] = DecodeRawBytes
76 :
// defaultSliceFormatter renders x for debug output. If every rune decoded from
// x lies in the range [32, 126], x is returned verbatim as a string; otherwise
// x is returned as a Go-quoted string (%q) so that unprintable bytes are
// escaped.
func defaultSliceFormatter(x []byte) string {
	needsQuoting := func(r rune) bool { return r < 32 || r > 126 }
	if !bytes.ContainsFunc(x, needsQuoting) {
		return string(x)
	}
	return fmt.Sprintf("%q", x)
}
83 :
84 : func rawBytesToBinFormatter(
85 : f *binfmt.Formatter, tp treeprinter.Node, count int, sliceFormatter func([]byte) string,
86 0 : ) {
87 0 : if count == 0 {
88 0 : return
89 0 : }
90 0 : if sliceFormatter == nil {
91 0 : sliceFormatter = defaultSliceFormatter
92 0 : }
93 :
94 0 : rb, _ := DecodeRawBytes(f.RelativeData(), uint32(f.RelativeOffset()), count)
95 0 : dataOffset := uint64(f.RelativeOffset()) + uint64(uintptr(rb.data)-uintptr(rb.start))
96 0 : n := tp.Child("offsets table")
97 0 : uintsToBinFormatter(f, n, count+1, func(offset, base uint64) string {
98 0 : // NB: base is always zero for RawBytes columns.
99 0 : return fmt.Sprintf("%d [%d overall]", offset+base, offset+base+dataOffset)
100 0 : })
101 0 : n = tp.Child("data")
102 0 : for i := 0; i < rb.slices; i++ {
103 0 : s := rb.At(i)
104 0 : f.HexBytesln(len(s), "data[%d]: %s", i, sliceFormatter(s))
105 0 : }
106 0 : f.ToTreePrinter(n)
107 : }
108 :
109 1 : func (b *RawBytes) ptr(offset uint32) unsafe.Pointer {
110 1 : return unsafe.Pointer(uintptr(b.data) + uintptr(offset))
111 1 : }
112 :
113 : //gcassert:inline
114 1 : func (b *RawBytes) slice(start, end uint32) []byte {
115 1 : return unsafe.Slice((*byte)(b.ptr(start)), end-start)
116 1 : }
117 :
118 : // At returns the []byte at index i. The returned slice should not be mutated.
119 1 : func (b RawBytes) At(i int) []byte {
120 1 : return b.slice(b.offsets.At2(i))
121 1 : }
122 :
123 : // Slices returns the number of []byte slices encoded within the RawBytes.
124 0 : func (b *RawBytes) Slices() int {
125 0 : return b.slices
126 0 : }
127 :
128 : // RawBytesBuilder encodes a column of byte slices.
129 : type RawBytesBuilder struct {
130 : rows int
131 : data []byte
132 : offsets UintBuilder
133 : }
134 :
135 : // Assert that *RawBytesBuilder implements ColumnWriter.
136 : var _ ColumnWriter = (*RawBytesBuilder)(nil)
137 :
138 : // Init initializes the builder for first-time use.
139 1 : func (b *RawBytesBuilder) Init() {
140 1 : b.offsets.Init()
141 1 : b.Reset()
142 1 : }
143 :
144 : // Reset resets the builder to an empty state.
145 1 : func (b *RawBytesBuilder) Reset() {
146 1 : b.rows = 0
147 1 : b.data = b.data[:0]
148 1 : b.offsets.Reset()
149 1 : // Add an initial offset of zero to streamline the logic in RawBytes.At() to
150 1 : // avoid needing a special case for row 0.
151 1 : b.offsets.Set(0, 0)
152 1 : }
153 :
154 : // NumColumns implements ColumnWriter.
155 1 : func (b *RawBytesBuilder) NumColumns() int { return 1 }
156 :
157 : // DataType implements ColumnWriter.
158 1 : func (b *RawBytesBuilder) DataType(int) DataType { return DataTypeBytes }
159 :
160 : // Put appends the provided byte slice to the builder.
161 1 : func (b *RawBytesBuilder) Put(s []byte) {
162 1 : b.data = append(b.data, s...)
163 1 : b.rows++
164 1 : b.offsets.Set(b.rows, uint64(len(b.data)))
165 1 : }
166 :
167 : // PutConcat appends a single byte slice formed by the concatenation of the two
168 : // byte slice arguments.
169 1 : func (b *RawBytesBuilder) PutConcat(s1, s2 []byte) {
170 1 : b.data = append(append(b.data, s1...), s2...)
171 1 : b.rows++
172 1 : b.offsets.Set(b.rows, uint64(len(b.data)))
173 1 : }
174 :
175 : // Rows returns the count of slices that have been added to the builder.
176 0 : func (b *RawBytesBuilder) Rows() int {
177 0 : return b.rows
178 0 : }
179 :
180 : // UnsafeGet returns the i'th slice added to the builder. The returned slice is
181 : // owned by the builder and must not be mutated.
182 1 : func (b *RawBytesBuilder) UnsafeGet(i int) []byte {
183 1 : if b.rows == 0 {
184 1 : return nil
185 1 : }
186 1 : return b.data[b.offsets.array.elems.At(i):b.offsets.array.elems.At(i+1)]
187 : }
188 :
189 : // Finish writes the serialized byte slices to buf starting at offset. The buf
190 : // slice must be sufficiently large to store the serialized output. The caller
191 : // should use [Size] to size buf appropriately before calling Finish.
192 1 : func (b *RawBytesBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
193 1 : if rows == 0 {
194 1 : return offset
195 1 : }
196 1 : dataLen := b.offsets.Get(rows)
197 1 : offset = b.offsets.Finish(0, rows+1, offset, buf)
198 1 : // Copy the data section.
199 1 : return offset + uint32(copy(buf[offset:], b.data[:dataLen]))
200 : }
201 :
202 : // Size computes the size required to encode the byte slices beginning in a
203 : // buffer at the provided offset. The offset is required to ensure proper
204 : // alignment. The returned uint32 is the offset of the first byte after the end
205 : // of the encoded data. To compute the size in bytes, subtract the [offset]
206 : // passed into Size from the returned offset.
207 1 : func (b *RawBytesBuilder) Size(rows int, offset uint32) uint32 {
208 1 : if rows == 0 {
209 1 : return offset
210 1 : }
211 : // Get the size needed to encode the rows+1 offsets.
212 1 : offset = b.offsets.Size(rows+1, offset)
213 1 : // Add the value of offset[rows] since that is the accumulated size of the
214 1 : // first [rows] slices.
215 1 : return offset + uint32(b.offsets.Get(rows))
216 : }
217 :
218 : // WriteDebug implements Encoder.
219 0 : func (b *RawBytesBuilder) WriteDebug(w io.Writer, rows int) {
220 0 : fmt.Fprintf(w, "bytes: %d rows set; %d bytes in data", b.rows, len(b.data))
221 0 : }
|