Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "io"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/pebble/internal/binfmt"
14 : )
15 :
16 : // RawBytes holds an array of byte slices, stored as a concatenated data section
17 : // and a series of offsets for each slice. Byte slices within RawBytes are
18 : // stored in their entirety without any compression, ensuring stability without
19 : // copying.
20 : //
21 : // # Representation
22 : //
23 : // An array of N byte slices encodes N+1 offsets. The beginning of the data
24 : // representation holds an offsets table, in the same encoding as a
25 : // DataTypeUint32 column. The integer offsets may be encoded using smaller width
26 : // integers to save space if all offsets fit within an 8-bit or 16-bit uint.
27 : // Each offset is relative to the beginning of the string data section (after
28 : // the offset table).
29 : //
30 : // The use of UintEncoding conserves space in the common case. In the context of
31 : // CockroachDB, the vast majority of offsets will fit in 16-bits when using 32
32 : // KiB blocks (the size in use by CockroachDB). However, a single value larger
33 : // than 65535 bytes requires an offset too large to fit within 16 bits, in which
34 : // case offsets will be encoded as 32-bit integers.
35 : //
36 : // +-------------------------------------------------------------------+
37 : // | a uint offsets table, usually encoded with 16-bits, |
38 : // | possibly padded for alignment |
39 : // | (see UintEncoding) |
40 : // +-------------------------------------------------------------------+
41 : // | String Data |
42 : // | abcabcada.... |
43 : // +-------------------------------------------------------------------+
44 : //
45 : // The UintEncoding bits of the ColumnEncoding for a RawBytes column describes
46 : // the encoding of the offset table.
47 : type RawBytes struct {
48 : slices int
49 : offsets UnsafeOffsets
50 : start unsafe.Pointer
51 : data unsafe.Pointer
52 : }
53 :
54 : // Assert that RawBytes implements Array[[]byte].
55 : var _ Array[[]byte] = RawBytes{}
56 :
57 : // DecodeRawBytes decodes the structure of a RawBytes, constructing an accessor
58 : // for an array of byte slices constructed by RawBytesBuilder. Count must be the
59 : // number of byte slices within the array.
60 1 : func DecodeRawBytes(b []byte, offset uint32, count int) (rawBytes RawBytes, endOffset uint32) {
61 1 : if count == 0 {
62 1 : return RawBytes{}, offset
63 1 : }
64 1 : offsets, dataOff := DecodeUnsafeOffsets(b, offset, count+1 /* +1 offset */)
65 1 : return RawBytes{
66 1 : slices: count,
67 1 : offsets: offsets,
68 1 : start: unsafe.Pointer(&b[offset]),
69 1 : data: unsafe.Pointer(&b[dataOff]),
70 1 : }, dataOff + offsets.At(count)
71 : }
72 :
73 : // Assert that DecodeRawBytes implements DecodeFunc.
74 : var _ DecodeFunc[RawBytes] = DecodeRawBytes
75 :
// defaultSliceFormatter renders x for human consumption. A slice made up
// entirely of printable ASCII (bytes 32 through 126) is returned verbatim as a
// string; any other slice is rendered with the %q verb so that control and
// non-ASCII bytes are escaped.
func defaultSliceFormatter(x []byte) string {
	outsidePrintableASCII := func(r rune) bool { return r < 32 || r > 126 }
	if bytes.IndexFunc(x, outsidePrintableASCII) < 0 {
		return string(x)
	}
	return fmt.Sprintf("%q", x)
}
82 :
83 1 : func rawBytesToBinFormatter(f *binfmt.Formatter, count int, sliceFormatter func([]byte) string) {
84 1 : if count == 0 {
85 1 : return
86 1 : }
87 1 : if sliceFormatter == nil {
88 1 : sliceFormatter = defaultSliceFormatter
89 1 : }
90 :
91 1 : rb, _ := DecodeRawBytes(f.RelativeData(), uint32(f.RelativeOffset()), count)
92 1 : dataOffset := uint64(f.RelativeOffset()) + uint64(uintptr(rb.data)-uintptr(rb.start))
93 1 : f.CommentLine("rawbytes")
94 1 : f.CommentLine("offsets table")
95 1 : uintsToBinFormatter(f, count+1, func(offset, base uint64) string {
96 1 : // NB: base is always zero for RawBytes columns.
97 1 : return fmt.Sprintf("%d [%d overall]", offset+base, offset+base+dataOffset)
98 1 : })
99 1 : f.CommentLine("data")
100 1 : for i := 0; i < rb.slices; i++ {
101 1 : s := rb.At(i)
102 1 : f.HexBytesln(len(s), "data[%d]: %s", i, sliceFormatter(s))
103 1 : }
104 : }
105 :
106 1 : func (b *RawBytes) ptr(offset uint32) unsafe.Pointer {
107 1 : return unsafe.Pointer(uintptr(b.data) + uintptr(offset))
108 1 : }
109 :
110 : //gcassert:inline
111 1 : func (b *RawBytes) slice(start, end uint32) []byte {
112 1 : return unsafe.Slice((*byte)(b.ptr(start)), end-start)
113 1 : }
114 :
115 : // At returns the []byte at index i. The returned slice should not be mutated.
116 1 : func (b RawBytes) At(i int) []byte {
117 1 : return b.slice(b.offsets.At2(i))
118 1 : }
119 :
120 : // Slices returns the number of []byte slices encoded within the RawBytes.
121 0 : func (b *RawBytes) Slices() int {
122 0 : return b.slices
123 0 : }
124 :
125 : // RawBytesBuilder encodes a column of byte slices.
126 : type RawBytesBuilder struct {
127 : rows int
128 : data []byte
129 : offsets UintBuilder
130 : }
131 :
132 : // Assert that *RawBytesBuilder implements ColumnWriter.
133 : var _ ColumnWriter = (*RawBytesBuilder)(nil)
134 :
135 : // Init initializes the builder for first-time use.
136 1 : func (b *RawBytesBuilder) Init() {
137 1 : b.offsets.Init()
138 1 : b.Reset()
139 1 : }
140 :
141 : // Reset resets the builder to an empty state.
142 1 : func (b *RawBytesBuilder) Reset() {
143 1 : b.rows = 0
144 1 : b.data = b.data[:0]
145 1 : b.offsets.Reset()
146 1 : // Add an initial offset of zero to streamline the logic in RawBytes.At() to
147 1 : // avoid needing a special case for row 0.
148 1 : b.offsets.Set(0, 0)
149 1 : }
150 :
151 : // NumColumns implements ColumnWriter.
152 1 : func (b *RawBytesBuilder) NumColumns() int { return 1 }
153 :
154 : // DataType implements ColumnWriter.
155 1 : func (b *RawBytesBuilder) DataType(int) DataType { return DataTypeBytes }
156 :
157 : // Put appends the provided byte slice to the builder.
158 1 : func (b *RawBytesBuilder) Put(s []byte) {
159 1 : b.data = append(b.data, s...)
160 1 : b.rows++
161 1 : b.offsets.Set(b.rows, uint64(len(b.data)))
162 1 : }
163 :
164 : // PutConcat appends a single byte slice formed by the concatenation of the two
165 : // byte slice arguments.
166 1 : func (b *RawBytesBuilder) PutConcat(s1, s2 []byte) {
167 1 : b.data = append(append(b.data, s1...), s2...)
168 1 : b.rows++
169 1 : b.offsets.Set(b.rows, uint64(len(b.data)))
170 1 : }
171 :
172 : // Rows returns the count of slices that have been added to the builder.
173 0 : func (b *RawBytesBuilder) Rows() int {
174 0 : return b.rows
175 0 : }
176 :
177 : // UnsafeGet returns the i'th slice added to the builder. The returned slice is
178 : // owned by the builder and must not be mutated.
179 1 : func (b *RawBytesBuilder) UnsafeGet(i int) []byte {
180 1 : if b.rows == 0 {
181 1 : return nil
182 1 : }
183 1 : return b.data[b.offsets.array.elems.At(i):b.offsets.array.elems.At(i+1)]
184 : }
185 :
186 : // Finish writes the serialized byte slices to buf starting at offset. The buf
187 : // slice must be sufficiently large to store the serialized output. The caller
188 : // should use [Size] to size buf appropriately before calling Finish.
189 1 : func (b *RawBytesBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
190 1 : if rows == 0 {
191 1 : return offset
192 1 : }
193 1 : dataLen := b.offsets.Get(rows)
194 1 : offset = b.offsets.Finish(0, rows+1, offset, buf)
195 1 : // Copy the data section.
196 1 : return offset + uint32(copy(buf[offset:], b.data[:dataLen]))
197 : }
198 :
199 : // Size computes the size required to encode the byte slices beginning in a
200 : // buffer at the provided offset. The offset is required to ensure proper
201 : // alignment. The returned uint32 is the offset of the first byte after the end
202 : // of the encoded data. To compute the size in bytes, subtract the [offset]
203 : // passed into Size from the returned offset.
204 1 : func (b *RawBytesBuilder) Size(rows int, offset uint32) uint32 {
205 1 : if rows == 0 {
206 1 : return offset
207 1 : }
208 : // Get the size needed to encode the rows+1 offsets.
209 1 : offset = b.offsets.Size(rows+1, offset)
210 1 : // Add the value of offset[rows] since that is the accumulated size of the
211 1 : // first [rows] slices.
212 1 : return offset + uint32(b.offsets.Get(rows))
213 : }
214 :
215 : // WriteDebug implements Encoder.
216 1 : func (b *RawBytesBuilder) WriteDebug(w io.Writer, rows int) {
217 1 : fmt.Fprintf(w, "bytes: %d rows set; %d bytes in data", b.rows, len(b.data))
218 1 : }
|