Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "io"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/pebble/internal/binfmt"
14 : "github.com/cockroachdb/pebble/internal/treeprinter"
15 : )
16 :
17 : // RawBytes holds an array of byte slices, stored as a concatenated data section
18 : // and a series of offsets for each slice. Byte slices within RawBytes are
19 : // stored in their entirety without any compression, ensuring stability without
20 : // copying.
21 : //
22 : // # Representation
23 : //
24 : // An array of N byte slices encodes N+1 offsets. The beginning of the data
25 : // representation holds an offsets table, in the same encoding as a
26 : // DataTypeUint32 column. The integer offsets may be encoded using smaller width
27 : // integers to save space if all offsets fit within an 8-bit or 16-bit uint.
28 : // Each offset is relative to the beginning of the string data section (after
29 : // the offset table).
30 : //
31 : // The use of UintEncoding conserves space in the common case. In the context of
32 : // CockroachDB, the vast majority of offsets will fit in 16-bits when using 32
33 : // KiB blocks (the size in use by CockroachDB). However, a single value larger
34 : // than 65535 bytes requires an offset too large to fit within 16 bits, in which
35 : // case offsets will be encoded as 32-bit integers.
36 : //
37 : // +-------------------------------------------------------------------+
38 : // | a uint offsets table, usually encoded with 16-bits, |
39 : // | possibly padded for alignment |
40 : // | (see UintEncoding) |
41 : // +-------------------------------------------------------------------+
42 : // | String Data |
43 : // | abcabcada.... |
44 : // +-------------------------------------------------------------------+
45 : //
46 : // The UintEncoding bits of the ColumnEncoding for a RawBytes column describe
47 : // the encoding of the offset table.
48 : type RawBytes struct {
49 : slices int
50 : offsets UnsafeOffsets
51 : start unsafe.Pointer
52 : data unsafe.Pointer
53 : }
54 :
55 : // Assert that RawBytes implements Array[[]byte].
56 : var _ Array[[]byte] = RawBytes{}
57 :
58 : // DecodeRawBytes decodes the structure of a RawBytes, constructing an accessor
59 : // for an array of byte slices constructed by RawBytesBuilder. Count must be the
60 : // number of byte slices within the array.
61 2 : func DecodeRawBytes(b []byte, offset uint32, count int) (rawBytes RawBytes, endOffset uint32) {
62 2 : if count == 0 {
63 2 : return RawBytes{}, offset
64 2 : }
65 2 : offsets, dataOff := DecodeUnsafeOffsets(b, offset, count+1 /* +1 offset */)
66 2 : return RawBytes{
67 2 : slices: count,
68 2 : offsets: offsets,
69 2 : start: unsafe.Pointer(&b[offset]),
70 2 : data: unsafe.Pointer(&b[dataOff]),
71 2 : }, dataOff + offsets.At(count)
72 : }
73 :
74 : // Assert that DecodeRawBytes implements DecodeFunc.
75 : var _ DecodeFunc[RawBytes] = DecodeRawBytes
76 :
// defaultSliceFormatter renders x for human consumption. A slice made up
// solely of printable ASCII (code points 32 through 126) is returned verbatim
// as a string; anything else is returned as a Go-quoted string (%q).
func defaultSliceFormatter(x []byte) string {
	nonPrintable := func(r rune) bool { return r < 32 || r > 126 }
	if bytes.IndexFunc(x, nonPrintable) < 0 {
		return string(x)
	}
	return fmt.Sprintf("%q", x)
}
83 :
84 : func rawBytesToBinFormatter(
85 : f *binfmt.Formatter, tp treeprinter.Node, count int, sliceFormatter func([]byte) string,
86 1 : ) {
87 1 : if count == 0 {
88 1 : return
89 1 : }
90 1 : if sliceFormatter == nil {
91 1 : sliceFormatter = defaultSliceFormatter
92 1 : }
93 :
94 1 : rb, _ := DecodeRawBytes(f.RelativeData(), uint32(f.RelativeOffset()), count)
95 1 : dataOffset := uint64(f.RelativeOffset()) + uint64(uintptr(rb.data)-uintptr(rb.start))
96 1 : n := tp.Child("offsets table")
97 1 : uintsToBinFormatter(f, n, count+1, func(offset, base uint64) string {
98 1 : // NB: base is always zero for RawBytes columns.
99 1 : return fmt.Sprintf("%d [%d overall]", offset+base, offset+base+dataOffset)
100 1 : })
101 1 : n = tp.Child("data")
102 1 : for i := 0; i < rb.slices; i++ {
103 1 : s := rb.At(i)
104 1 : f.HexBytesln(len(s), "data[%d]: %s", i, sliceFormatter(s))
105 1 : }
106 1 : f.ToTreePrinter(n)
107 : }
108 :
109 2 : func (b *RawBytes) ptr(offset uint32) unsafe.Pointer {
110 2 : return unsafe.Pointer(uintptr(b.data) + uintptr(offset))
111 2 : }
112 :
113 : //gcassert:inline
114 2 : func (b *RawBytes) slice(start, end uint32) []byte {
115 2 : return unsafe.Slice((*byte)(b.ptr(start)), end-start)
116 2 : }
117 :
118 : // At returns the []byte at index i. The returned slice should not be mutated.
119 2 : func (b RawBytes) At(i int) []byte {
120 2 : return b.slice(b.offsets.At2(i))
121 2 : }
122 :
123 : // Slices returns the number of []byte slices encoded within the RawBytes.
124 0 : func (b *RawBytes) Slices() int {
125 0 : return b.slices
126 0 : }
127 :
128 : // RawBytesBuilder encodes a column of byte slices.
129 : type RawBytesBuilder struct {
130 : rows int
131 : data []byte
132 : offsets UintBuilder
133 : }
134 :
135 : // Assert that *RawBytesBuilder implements ColumnWriter.
136 : var _ ColumnWriter = (*RawBytesBuilder)(nil)
137 :
138 : // Init initializes the builder for first-time use.
139 2 : func (b *RawBytesBuilder) Init() {
140 2 : b.offsets.Init()
141 2 : b.Reset()
142 2 : }
143 :
144 : // Reset resets the builder to an empty state.
145 2 : func (b *RawBytesBuilder) Reset() {
146 2 : b.rows = 0
147 2 : b.data = b.data[:0]
148 2 : b.offsets.Reset()
149 2 : // Add an initial offset of zero to streamline the logic in RawBytes.At() to
150 2 : // avoid needing a special case for row 0.
151 2 : b.offsets.Set(0, 0)
152 2 : }
153 :
154 : // NumColumns implements ColumnWriter.
155 2 : func (b *RawBytesBuilder) NumColumns() int { return 1 }
156 :
157 : // DataType implements ColumnWriter.
158 2 : func (b *RawBytesBuilder) DataType(int) DataType { return DataTypeBytes }
159 :
160 : // Put appends the provided byte slice to the builder.
161 2 : func (b *RawBytesBuilder) Put(s []byte) {
162 2 : b.data = append(b.data, s...)
163 2 : b.rows++
164 2 : b.offsets.Set(b.rows, uint64(len(b.data)))
165 2 : }
166 :
167 : // PutConcat appends a single byte slice formed by the concatenation of the two
168 : // byte slice arguments.
169 2 : func (b *RawBytesBuilder) PutConcat(s1, s2 []byte) {
170 2 : b.data = append(append(b.data, s1...), s2...)
171 2 : b.rows++
172 2 : b.offsets.Set(b.rows, uint64(len(b.data)))
173 2 : }
174 :
175 : // Rows returns the count of slices that have been added to the builder.
176 0 : func (b *RawBytesBuilder) Rows() int {
177 0 : return b.rows
178 0 : }
179 :
180 : // UnsafeGet returns the i'th slice added to the builder. The returned slice is
181 : // owned by the builder and must not be mutated.
182 2 : func (b *RawBytesBuilder) UnsafeGet(i int) []byte {
183 2 : if b.rows == 0 {
184 2 : return nil
185 2 : }
186 2 : return b.data[b.offsets.array.elems.At(i):b.offsets.array.elems.At(i+1)]
187 : }
188 :
189 : // Finish writes the serialized byte slices to buf starting at offset. The buf
190 : // slice must be sufficiently large to store the serialized output. The caller
191 : // should use [Size] to size buf appropriately before calling Finish.
192 2 : func (b *RawBytesBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
193 2 : if rows == 0 {
194 2 : return offset
195 2 : }
196 2 : dataLen := b.offsets.Get(rows)
197 2 : offset = b.offsets.Finish(0, rows+1, offset, buf)
198 2 : // Copy the data section.
199 2 : return offset + uint32(copy(buf[offset:], b.data[:dataLen]))
200 : }
201 :
202 : // Size computes the size required to encode the byte slices beginning in a
203 : // buffer at the provided offset. The offset is required to ensure proper
204 : // alignment. The returned uint32 is the offset of the first byte after the end
205 : // of the encoded data. To compute the size in bytes, subtract the [offset]
206 : // passed into Size from the returned offset.
207 2 : func (b *RawBytesBuilder) Size(rows int, offset uint32) uint32 {
208 2 : if rows == 0 {
209 2 : return offset
210 2 : }
211 : // Get the size needed to encode the rows+1 offsets.
212 2 : offset = b.offsets.Size(rows+1, offset)
213 2 : // Add the value of offset[rows] since that is the accumulated size of the
214 2 : // first [rows] slices.
215 2 : return offset + uint32(b.offsets.Get(rows))
216 : }
217 :
218 : // WriteDebug implements Encoder.
219 1 : func (b *RawBytesBuilder) WriteDebug(w io.Writer, rows int) {
220 1 : fmt.Fprintf(w, "bytes: %d rows set; %d bytes in data", b.rows, len(b.data))
221 1 : }
|