Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "io"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/pebble/internal/binfmt"
14 : )
15 :
16 : // RawBytes holds an array of byte slices, stored as a concatenated data section
17 : // and a series of offsets for each slice. Byte slices within RawBytes are
18 : // stored in their entirety without any compression, ensuring stability without
19 : // copying.
20 : //
21 : // # Representation
22 : //
23 : // An array of N byte slices encodes N+1 offsets. The beginning of the data
24 : // representation holds an offsets table, in the same encoding as a
25 : // DataTypeUint32 column. The integer offsets may be encoded using smaller width
26 : // integers to save space if all offsets fit within an 8-bit or 16-bit uint.
27 : // Each offset is relative to the beginning of the string data section (after
28 : // the offset table).
29 : //
30 : // The use of UintEncoding conserves space in the common case. In the context of
31 : // CockroachDB, the vast majority of offsets will fit in 16-bits when using 32
32 : // KiB blocks (the size in use by CockroachDB). However, a single value larger
33 : // than 65535 bytes requires an offset too large to fit within 16 bits, in which
34 : // case offsets will be encoded as 32-bit integers.
35 : //
36 : // +-------------------------------------------------------------------+
37 : // | a uint offsets table, usually encoded with 16-bits, |
38 : // | possibly padded for alignment |
39 : // | (see UintEncoding) |
40 : // +-------------------------------------------------------------------+
41 : // | String Data |
42 : // | abcabcada.... |
43 : // +-------------------------------------------------------------------+
44 : //
45 : // The UintEncoding bits of the ColumnEncoding for a RawBytes column describe
46 : // the encoding of the offset table.
47 : type RawBytes struct {
48 : slices int
49 : offsets UnsafeOffsets
50 : start unsafe.Pointer
51 : data unsafe.Pointer
52 : }
53 :
54 : // Assert that RawBytes implements Array[[]byte].
55 : var _ Array[[]byte] = RawBytes{}
56 :
57 : // DecodeRawBytes decodes the structure of a RawBytes, constructing an accessor
58 : // for an array of byte slices constructed by RawBytesBuilder. Count must be the
59 : // number of byte slices within the array.
60 0 : func DecodeRawBytes(b []byte, offset uint32, count int) (rawBytes RawBytes, endOffset uint32) {
61 0 : if count == 0 {
62 0 : return RawBytes{}, offset
63 0 : }
64 0 : offsets, dataOff := DecodeUnsafeOffsets(b, offset, count+1 /* +1 offset */)
65 0 : return RawBytes{
66 0 : slices: count,
67 0 : offsets: offsets,
68 0 : start: unsafe.Pointer(&b[offset]),
69 0 : data: unsafe.Pointer(&b[dataOff]),
70 0 : }, dataOff + offsets.At(count)
71 : }
72 :
73 : // Assert that DecodeRawBytes implements DecodeFunc.
74 : var _ DecodeFunc[RawBytes] = DecodeRawBytes
75 :
// defaultSliceFormatter renders x for human consumption. A slice made up
// solely of printable ASCII (codes 32 through 126) is returned verbatim;
// anything else is returned as a Go-quoted string so that control and
// non-ASCII bytes are visible.
func defaultSliceFormatter(x []byte) string {
	outsidePrintableASCII := func(r rune) bool { return r < 32 || r > 126 }
	if bytes.IndexFunc(x, outsidePrintableASCII) < 0 {
		return string(x)
	}
	return fmt.Sprintf("%q", x)
}
82 :
83 0 : func rawBytesToBinFormatter(f *binfmt.Formatter, count int, sliceFormatter func([]byte) string) {
84 0 : if count == 0 {
85 0 : return
86 0 : }
87 0 : if sliceFormatter == nil {
88 0 : sliceFormatter = defaultSliceFormatter
89 0 : }
90 :
91 0 : rb, _ := DecodeRawBytes(f.RelativeData(), uint32(f.RelativeOffset()), count)
92 0 : dataOffset := uint64(f.RelativeOffset()) + uint64(uintptr(rb.data)-uintptr(rb.start))
93 0 : f.CommentLine("rawbytes")
94 0 : f.CommentLine("offsets table")
95 0 : uintsToBinFormatter(f, count+1, func(offset, base uint64) string {
96 0 : // NB: base is always zero for RawBytes columns.
97 0 : return fmt.Sprintf("%d [%d overall]", offset+base, offset+base+dataOffset)
98 0 : })
99 0 : f.CommentLine("data")
100 0 : for i := 0; i < rb.slices; i++ {
101 0 : s := rb.At(i)
102 0 : f.HexBytesln(len(s), "data[%d]: %s", i, sliceFormatter(s))
103 0 : }
104 : }
105 :
106 0 : func (b *RawBytes) ptr(offset uint32) unsafe.Pointer {
107 0 : return unsafe.Pointer(uintptr(b.data) + uintptr(offset))
108 0 : }
109 :
110 0 : func (b *RawBytes) slice(start, end uint32) []byte {
111 0 : return unsafe.Slice((*byte)(b.ptr(start)), end-start)
112 0 : }
113 :
114 : // At returns the []byte at index i. The returned slice should not be mutated.
115 0 : func (b RawBytes) At(i int) []byte {
116 0 : return b.slice(b.offsets.At(i), b.offsets.At(i+1))
117 0 : }
118 :
119 : // Slices returns the number of []byte slices encoded within the RawBytes.
120 0 : func (b *RawBytes) Slices() int {
121 0 : return b.slices
122 0 : }
123 :
124 : // RawBytesBuilder encodes a column of byte slices.
125 : type RawBytesBuilder struct {
126 : rows int
127 : data []byte
128 : offsets UintBuilder
129 : }
130 :
131 : // Assert that *RawBytesBuilder implements ColumnWriter.
132 : var _ ColumnWriter = (*RawBytesBuilder)(nil)
133 :
134 : // Init initializes the builder for first-time use.
135 0 : func (b *RawBytesBuilder) Init() {
136 0 : b.offsets.Init()
137 0 : b.Reset()
138 0 : }
139 :
140 : // Reset resets the builder to an empty state.
141 0 : func (b *RawBytesBuilder) Reset() {
142 0 : b.rows = 0
143 0 : b.data = b.data[:0]
144 0 : b.offsets.Reset()
145 0 : // Add an initial offset of zero to streamline the logic in RawBytes.At() to
146 0 : // avoid needing a special case for row 0.
147 0 : b.offsets.Set(0, 0)
148 0 : }
149 :
150 : // NumColumns implements ColumnWriter.
151 0 : func (b *RawBytesBuilder) NumColumns() int { return 1 }
152 :
153 : // DataType implements ColumnWriter.
154 0 : func (b *RawBytesBuilder) DataType(int) DataType { return DataTypeBytes }
155 :
156 : // Put appends the provided byte slice to the builder.
157 0 : func (b *RawBytesBuilder) Put(s []byte) {
158 0 : b.data = append(b.data, s...)
159 0 : b.rows++
160 0 : b.offsets.Set(b.rows, uint64(len(b.data)))
161 0 : }
162 :
163 : // PutConcat appends a single byte slice formed by the concatenation of the two
164 : // byte slice arguments.
165 0 : func (b *RawBytesBuilder) PutConcat(s1, s2 []byte) {
166 0 : b.data = append(append(b.data, s1...), s2...)
167 0 : b.rows++
168 0 : b.offsets.Set(b.rows, uint64(len(b.data)))
169 0 : }
170 :
171 : // Rows returns the count of slices that have been added to the builder.
172 0 : func (b *RawBytesBuilder) Rows() int {
173 0 : return b.rows
174 0 : }
175 :
176 : // UnsafeGet returns the i'th slice added to the builder. The returned slice is
177 : // owned by the builder and must not be mutated.
178 0 : func (b *RawBytesBuilder) UnsafeGet(i int) []byte {
179 0 : if b.rows == 0 {
180 0 : return nil
181 0 : }
182 0 : return b.data[b.offsets.array.elems.At(i):b.offsets.array.elems.At(i+1)]
183 : }
184 :
185 : // Finish writes the serialized byte slices to buf starting at offset. The buf
186 : // slice must be sufficiently large to store the serialized output. The caller
187 : // should use [Size] to size buf appropriately before calling Finish.
188 0 : func (b *RawBytesBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
189 0 : if rows == 0 {
190 0 : return offset
191 0 : }
192 0 : dataLen := b.offsets.Get(rows)
193 0 : offset = b.offsets.Finish(0, rows+1, offset, buf)
194 0 : // Copy the data section.
195 0 : return offset + uint32(copy(buf[offset:], b.data[:dataLen]))
196 : }
197 :
198 : // Size computes the size required to encode the byte slices beginning in a
199 : // buffer at the provided offset. The offset is required to ensure proper
200 : // alignment. The returned uint32 is the offset of the first byte after the end
201 : // of the encoded data. To compute the size in bytes, subtract the [offset]
202 : // passed into Size from the returned offset.
203 0 : func (b *RawBytesBuilder) Size(rows int, offset uint32) uint32 {
204 0 : if rows == 0 {
205 0 : return offset
206 0 : }
207 : // Get the size needed to encode the rows+1 offsets.
208 0 : offset = b.offsets.Size(rows+1, offset)
209 0 : // Add the value of offset[rows] since that is the accumulated size of the
210 0 : // first [rows] slices.
211 0 : return offset + uint32(b.offsets.Get(rows))
212 : }
213 :
214 : // WriteDebug implements Encoder.
215 0 : func (b *RawBytesBuilder) WriteDebug(w io.Writer, rows int) {
216 0 : fmt.Fprintf(w, "bytes: %d rows set; %d bytes in data", b.rows, len(b.data))
217 0 : }
|