Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "cmp"
10 : "context"
11 : "encoding/binary"
12 : "fmt"
13 : "io"
14 : "slices"
15 : "sort"
16 : "unsafe"
17 :
18 : "github.com/cockroachdb/pebble/internal/base"
19 : )
20 :
21 : // Layout describes the block organization of an sstable.
22 : type Layout struct {
23 : // NOTE: changes to fields in this struct should also be reflected in
24 : // ValidateBlockChecksums, which validates a static list of BlockHandles
25 : // referenced in this struct.
26 :
27 : Data []BlockHandleWithProperties
28 : Index []BlockHandle
29 : TopIndex BlockHandle
30 : Filter BlockHandle
31 : RangeDel BlockHandle
32 : RangeKey BlockHandle
33 : ValueBlock []BlockHandle
34 : ValueIndex BlockHandle
35 : Properties BlockHandle
36 : MetaIndex BlockHandle
37 : Footer BlockHandle
38 : Format TableFormat
39 : }
40 :
41 : // Describe returns a description of the layout. If the verbose parameter is
42 : // true, details of the structure of each block are returned as well.
43 : func (l *Layout) Describe(
44 : w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte),
45 0 : ) {
46 0 : ctx := context.TODO()
47 0 : type block struct {
48 0 : BlockHandle
49 0 : name string
50 0 : }
51 0 : var blocks []block
52 0 :
53 0 : for i := range l.Data {
54 0 : blocks = append(blocks, block{l.Data[i].BlockHandle, "data"})
55 0 : }
56 0 : for i := range l.Index {
57 0 : blocks = append(blocks, block{l.Index[i], "index"})
58 0 : }
59 0 : if l.TopIndex.Length != 0 {
60 0 : blocks = append(blocks, block{l.TopIndex, "top-index"})
61 0 : }
62 0 : if l.Filter.Length != 0 {
63 0 : blocks = append(blocks, block{l.Filter, "filter"})
64 0 : }
65 0 : if l.RangeDel.Length != 0 {
66 0 : blocks = append(blocks, block{l.RangeDel, "range-del"})
67 0 : }
68 0 : if l.RangeKey.Length != 0 {
69 0 : blocks = append(blocks, block{l.RangeKey, "range-key"})
70 0 : }
71 0 : for i := range l.ValueBlock {
72 0 : blocks = append(blocks, block{l.ValueBlock[i], "value-block"})
73 0 : }
74 0 : if l.ValueIndex.Length != 0 {
75 0 : blocks = append(blocks, block{l.ValueIndex, "value-index"})
76 0 : }
77 0 : if l.Properties.Length != 0 {
78 0 : blocks = append(blocks, block{l.Properties, "properties"})
79 0 : }
80 0 : if l.MetaIndex.Length != 0 {
81 0 : blocks = append(blocks, block{l.MetaIndex, "meta-index"})
82 0 : }
83 0 : if l.Footer.Length != 0 {
84 0 : if l.Footer.Length == levelDBFooterLen {
85 0 : blocks = append(blocks, block{l.Footer, "leveldb-footer"})
86 0 : } else {
87 0 : blocks = append(blocks, block{l.Footer, "footer"})
88 0 : }
89 : }
90 :
91 0 : slices.SortFunc(blocks, func(a, b block) int {
92 0 : return cmp.Compare(a.Offset, b.Offset)
93 0 : })
94 0 : for i := range blocks {
95 0 : b := &blocks[i]
96 0 : fmt.Fprintf(w, "%10d %s (%d)\n", b.Offset, b.name, b.Length)
97 0 :
98 0 : if !verbose {
99 0 : continue
100 : }
101 0 : if b.name == "filter" {
102 0 : continue
103 : }
104 :
105 0 : if b.name == "footer" || b.name == "leveldb-footer" {
106 0 : trailer, offset := make([]byte, b.Length), b.Offset
107 0 : _ = r.readable.ReadAt(ctx, trailer, int64(offset))
108 0 :
109 0 : if b.name == "footer" {
110 0 : checksumType := ChecksumType(trailer[0])
111 0 : fmt.Fprintf(w, "%10d checksum type: %s\n", offset, checksumType)
112 0 : trailer, offset = trailer[1:], offset+1
113 0 : }
114 :
115 0 : metaHandle, n := binary.Uvarint(trailer)
116 0 : metaLen, m := binary.Uvarint(trailer[n:])
117 0 : fmt.Fprintf(w, "%10d meta: offset=%d, length=%d\n", offset, metaHandle, metaLen)
118 0 : trailer, offset = trailer[n+m:], offset+uint64(n+m)
119 0 :
120 0 : indexHandle, n := binary.Uvarint(trailer)
121 0 : indexLen, m := binary.Uvarint(trailer[n:])
122 0 : fmt.Fprintf(w, "%10d index: offset=%d, length=%d\n", offset, indexHandle, indexLen)
123 0 : trailer, offset = trailer[n+m:], offset+uint64(n+m)
124 0 :
125 0 : fmt.Fprintf(w, "%10d [padding]\n", offset)
126 0 :
127 0 : trailing := 12
128 0 : if b.name == "leveldb-footer" {
129 0 : trailing = 8
130 0 : }
131 :
132 0 : offset += uint64(len(trailer) - trailing)
133 0 : trailer = trailer[len(trailer)-trailing:]
134 0 :
135 0 : if b.name == "footer" {
136 0 : version := trailer[:4]
137 0 : fmt.Fprintf(w, "%10d version: %d\n", offset, binary.LittleEndian.Uint32(version))
138 0 : trailer, offset = trailer[4:], offset+4
139 0 : }
140 :
141 0 : magicNumber := trailer
142 0 : fmt.Fprintf(w, "%10d magic number: 0x%x\n", offset, magicNumber)
143 0 :
144 0 : continue
145 : }
146 :
147 0 : h, err := r.readBlock(
148 0 : context.Background(), b.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
149 0 : if err != nil {
150 0 : fmt.Fprintf(w, " [err: %s]\n", err)
151 0 : continue
152 : }
153 :
154 0 : getRestart := func(data []byte, restarts, i int32) int32 {
155 0 : return decodeRestart(data[restarts+4*i:])
156 0 : }
157 :
158 0 : formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) {
159 0 : i := sort.Search(int(numRestarts), func(i int) bool {
160 0 : return getRestart(data, restarts, int32(i)) >= offset
161 0 : })
162 0 : if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset {
163 0 : fmt.Fprintf(w, " [restart]\n")
164 0 : } else {
165 0 : fmt.Fprintf(w, "\n")
166 0 : }
167 : }
168 :
169 0 : formatRestarts := func(data []byte, restarts, numRestarts int32) {
170 0 : for i := int32(0); i < numRestarts; i++ {
171 0 : offset := getRestart(data, restarts, i)
172 0 : fmt.Fprintf(w, "%10d [restart %d]\n",
173 0 : b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset))
174 0 : }
175 : }
176 :
177 0 : formatTrailer := func() {
178 0 : trailer := make([]byte, blockTrailerLen)
179 0 : offset := int64(b.Offset + b.Length)
180 0 : _ = r.readable.ReadAt(ctx, trailer, offset)
181 0 : bt := blockType(trailer[0])
182 0 : checksum := binary.LittleEndian.Uint32(trailer[1:])
183 0 : fmt.Fprintf(w, "%10d [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum)
184 0 : }
185 :
186 0 : var lastKey InternalKey
187 0 : switch b.name {
188 0 : case "data", "range-del", "range-key":
189 0 : iter, _ := newBlockIter(r.Compare, r.Split, h.Get(), NoTransforms)
190 0 : for key, value := iter.First(); key != nil; key, value = iter.Next() {
191 0 : ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset))
192 0 : shared, ptr := decodeVarint(ptr)
193 0 : unshared, ptr := decodeVarint(ptr)
194 0 : value2, _ := decodeVarint(ptr)
195 0 :
196 0 : total := iter.nextOffset - iter.offset
197 0 : // The format of the numbers in the record line is:
198 0 : //
199 0 : // (<total> = <length> [<shared>] + <unshared> + <value>)
200 0 : //
201 0 : // <total> is the total number of bytes for the record.
202 0 : // <length> is the size of the 3 varint encoded integers for <shared>,
203 0 : // <unshared>, and <value>.
204 0 : // <shared> is the number of key bytes shared with the previous key.
205 0 : // <unshared> is the number of unshared key bytes.
206 0 : // <value> is the number of value bytes.
207 0 : fmt.Fprintf(w, "%10d record (%d = %d [%d] + %d + %d)",
208 0 : b.Offset+uint64(iter.offset), total,
209 0 : total-int32(unshared+value2), shared, unshared, value2)
210 0 : formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
211 0 : if fmtRecord != nil {
212 0 : fmt.Fprintf(w, " ")
213 0 : if l.Format < TableFormatPebblev3 {
214 0 : fmtRecord(key, value.InPlaceValue())
215 0 : } else {
216 0 : // InPlaceValue() will succeed even for data blocks where the
217 0 : // actual value is in a different location, since this value was
218 0 : // fetched from a blockIter which does not know about value
219 0 : // blocks.
220 0 : v := value.InPlaceValue()
221 0 : if base.TrailerKind(key.Trailer) != InternalKeyKindSet {
222 0 : fmtRecord(key, v)
223 0 : } else if !isValueHandle(valuePrefix(v[0])) {
224 0 : fmtRecord(key, v[1:])
225 0 : } else {
226 0 : vh := decodeValueHandle(v[1:])
227 0 : fmtRecord(key, []byte(fmt.Sprintf("value handle %+v", vh)))
228 0 : }
229 : }
230 : }
231 :
232 0 : if base.InternalCompare(r.Compare, lastKey, *key) >= 0 {
233 0 : fmt.Fprintf(w, " WARNING: OUT OF ORDER KEYS!\n")
234 0 : }
235 0 : lastKey.Trailer = key.Trailer
236 0 : lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...)
237 : }
238 0 : formatRestarts(iter.data, iter.restarts, iter.numRestarts)
239 0 : formatTrailer()
240 0 : case "index", "top-index":
241 0 : iter, _ := newBlockIter(r.Compare, r.Split, h.Get(), NoTransforms)
242 0 : for key, value := iter.First(); key != nil; key, value = iter.Next() {
243 0 : bh, err := decodeBlockHandleWithProperties(value.InPlaceValue())
244 0 : if err != nil {
245 0 : fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err)
246 0 : continue
247 : }
248 0 : fmt.Fprintf(w, "%10d block:%d/%d",
249 0 : b.Offset+uint64(iter.offset), bh.Offset, bh.Length)
250 0 : formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
251 : }
252 0 : formatRestarts(iter.data, iter.restarts, iter.numRestarts)
253 0 : formatTrailer()
254 0 : case "properties":
255 0 : iter, _ := newRawBlockIter(r.Compare, h.Get())
256 0 : for valid := iter.First(); valid; valid = iter.Next() {
257 0 : fmt.Fprintf(w, "%10d %s (%d)",
258 0 : b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset)
259 0 : formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
260 0 : }
261 0 : formatRestarts(iter.data, iter.restarts, iter.numRestarts)
262 0 : formatTrailer()
263 0 : case "meta-index":
264 0 : iter, _ := newRawBlockIter(r.Compare, h.Get())
265 0 : for valid := iter.First(); valid; valid = iter.Next() {
266 0 : value := iter.Value()
267 0 : var bh BlockHandle
268 0 : var n int
269 0 : var vbih valueBlocksIndexHandle
270 0 : isValueBlocksIndexHandle := false
271 0 : if bytes.Equal(iter.Key().UserKey, []byte(metaValueIndexName)) {
272 0 : vbih, n, err = decodeValueBlocksIndexHandle(value)
273 0 : bh = vbih.h
274 0 : isValueBlocksIndexHandle = true
275 0 : } else {
276 0 : bh, n = decodeBlockHandle(value)
277 0 : }
278 0 : if n == 0 || n != len(value) {
279 0 : fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err)
280 0 : continue
281 : }
282 0 : var vbihStr string
283 0 : if isValueBlocksIndexHandle {
284 0 : vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)",
285 0 : vbih.blockNumByteLength, vbih.blockOffsetByteLength, vbih.blockLengthByteLength)
286 0 : }
287 0 : fmt.Fprintf(w, "%10d %s block:%d/%d%s",
288 0 : b.Offset+uint64(iter.offset), iter.Key().UserKey,
289 0 : bh.Offset, bh.Length, vbihStr)
290 0 : formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
291 : }
292 0 : formatRestarts(iter.data, iter.restarts, iter.numRestarts)
293 0 : formatTrailer()
294 0 : case "value-block":
295 : // We don't peer into the value-block since it can't be interpreted
296 : // without the valueHandles.
297 0 : case "value-index":
298 : // We have already read the value-index to construct the list of
299 : // value-blocks, so no need to do it again.
300 : }
301 :
302 0 : h.Release()
303 : }
304 :
305 0 : last := blocks[len(blocks)-1]
306 0 : fmt.Fprintf(w, "%10d EOF\n", last.Offset+last.Length)
307 : }
|