Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "encoding/binary"
10 : "fmt"
11 : "math"
12 : "reflect"
13 : "sort"
14 : "strings"
15 : "unsafe"
16 :
17 : "github.com/cockroachdb/pebble/internal/intern"
18 : "github.com/cockroachdb/pebble/sstable/rowblk"
19 : )
20 :
const propertiesBlockRestartInterval = math.MaxInt32

var (
	// propTagMap maps a `prop` struct tag to the reflect.StructField that
	// carries it. It is filled in by generateTagMaps.
	propTagMap = make(map[string]reflect.StructField)

	// propOffsetTagMap maps a field's byte offset (reflect.StructField.Offset)
	// to its `prop` struct tag. It is filled in by generateTagMaps.
	propOffsetTagMap = make(map[uintptr]string)

	// propBoolTrue and propBoolFalse are the serialized forms of boolean
	// property values.
	propBoolTrue  = []byte("1")
	propBoolFalse = []byte("0")
)
28 :
29 1 : func generateTagMaps(t reflect.Type, indexPrefix []int) {
30 1 : for i := 0; i < t.NumField(); i++ {
31 1 : f := t.Field(i)
32 1 : if f.Type.Kind() == reflect.Struct {
33 1 : if tag := f.Tag.Get("prop"); i == 0 && tag == "pebble.embbeded_common_properties" {
34 1 : // CommonProperties struct embedded in Properties. Note that since
35 1 : // CommonProperties is placed at the top of properties we can use
36 1 : // the offsets of the fields within CommonProperties to determine
37 1 : // the offsets of those fields within Properties.
38 1 : generateTagMaps(f.Type, []int{i})
39 1 : continue
40 : }
41 0 : panic("pebble: unknown struct type in Properties")
42 : }
43 1 : if tag := f.Tag.Get("prop"); tag != "" {
44 1 : switch f.Type.Kind() {
45 1 : case reflect.Bool:
46 1 : case reflect.Uint32:
47 1 : case reflect.Uint64:
48 1 : case reflect.String:
49 0 : default:
50 0 : panic(fmt.Sprintf("unsupported property field type: %s %s", f.Name, f.Type))
51 : }
52 1 : if len(indexPrefix) > 0 {
53 1 : // Prepend the index prefix so that we can use FieldByIndex on the top-level struct.
54 1 : f.Index = append(indexPrefix[:len(indexPrefix):len(indexPrefix)], f.Index...)
55 1 : }
56 1 : propTagMap[tag] = f
57 1 : propOffsetTagMap[f.Offset] = tag
58 : }
59 : }
60 : }
61 :
62 1 : func init() {
63 1 : generateTagMaps(reflect.TypeOf(Properties{}), nil)
64 1 : }
65 :
66 : // CommonProperties holds properties for either a virtual or a physical sstable. This
67 : // can be used by code which doesn't care to make the distinction between physical
68 : // and virtual sstables properties.
69 : //
70 : // For virtual sstables, fields are constructed through extrapolation upon virtual
71 : // reader construction. See MakeVirtualReader for implementation details.
72 : //
73 : // NB: The values of these properties can affect correctness. For example,
74 : // if NumRangeKeySets == 0, but the sstable actually contains range keys, then
75 : // the iterators will behave incorrectly.
76 : type CommonProperties struct {
77 : // The number of entries in this table.
78 : NumEntries uint64 `prop:"rocksdb.num.entries"`
79 : // Total raw key size.
80 : RawKeySize uint64 `prop:"rocksdb.raw.key.size"`
81 : // Total raw value size.
82 : RawValueSize uint64 `prop:"rocksdb.raw.value.size"`
83 : // Total raw key size of point deletion tombstones. This value is comparable
84 : // to RawKeySize.
85 : RawPointTombstoneKeySize uint64 `prop:"pebble.raw.point-tombstone.key.size"`
86 : // Sum of the raw value sizes carried by point deletion tombstones
87 : // containing size estimates. See the DeleteSized key kind. This value is
88 : // comparable to Raw{Key,Value}Size.
89 : RawPointTombstoneValueSize uint64 `prop:"pebble.raw.point-tombstone.value.size"`
90 : // The number of point deletion entries ("tombstones") in this table that
91 : // carry a size hint indicating the size of the value the tombstone deletes.
92 : NumSizedDeletions uint64 `prop:"pebble.num.deletions.sized"`
93 : // The number of deletion entries in this table, including both point and
94 : // range deletions.
95 : NumDeletions uint64 `prop:"rocksdb.deleted.keys"`
96 : // The number of range deletions in this table.
97 : NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"`
98 : // The number of RANGEKEYDELs in this table.
99 : NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"`
100 : // The number of RANGEKEYSETs in this table.
101 : NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"`
102 : // Total size of value blocks and value index block. Only serialized if > 0.
103 : ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"`
104 : // NumDataBlocks is the number of data blocks in this table.
105 : NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"`
106 : // NumTombstoneDenseBlocks is the number of data blocks in this table that
107 : // are considered tombstone-dense. See the TombstoneDenseBlocksRatio field
108 : // in manifest.TableStats for the criteria used to determine if a data
109 : // block is tombstone-dense.
110 : NumTombstoneDenseBlocks uint64 `prop:"pebble.num.tombstone-dense-blocks"`
111 : // The compression algorithm used to compress blocks.
112 : CompressionName string `prop:"rocksdb.compression"`
113 : // The compression options used to compress blocks.
114 : CompressionOptions string `prop:"rocksdb.compression_options"`
115 : }
116 :
117 : // String is only used for testing purposes.
118 0 : func (c *CommonProperties) String() string {
119 0 : var buf bytes.Buffer
120 0 : v := reflect.ValueOf(*c)
121 0 : loaded := make(map[uintptr]struct{})
122 0 : writeProperties(loaded, v, &buf)
123 0 : return buf.String()
124 0 : }
125 :
126 : // NumPointDeletions is the number of point deletions in the sstable. For virtual
127 : // sstables, this is an estimate.
128 1 : func (c *CommonProperties) NumPointDeletions() uint64 {
129 1 : return c.NumDeletions - c.NumRangeDeletions
130 1 : }
131 :
132 : // Properties holds the sstable property values. The properties are
133 : // automatically populated during sstable creation and load from the properties
134 : // meta block when an sstable is opened.
135 : type Properties struct {
136 : // CommonProperties needs to be at the top of the Properties struct so that the
137 : // offsets of the fields in CommonProperties match the offsets of the embedded
138 : // fields of CommonProperties in Properties.
139 : CommonProperties `prop:"pebble.embbeded_common_properties"`
140 :
141 : // The name of the comparer used in this table.
142 : ComparerName string `prop:"rocksdb.comparator"`
143 : // The total size of all data blocks.
144 : DataSize uint64 `prop:"rocksdb.data.size"`
145 : // The name of the filter policy used in this table. Empty if no filter
146 : // policy is used.
147 : FilterPolicyName string `prop:"rocksdb.filter.policy"`
148 : // The size of filter block.
149 : FilterSize uint64 `prop:"rocksdb.filter.size"`
150 : // Total number of index partitions if kTwoLevelIndexSearch is used.
151 : IndexPartitions uint64 `prop:"rocksdb.index.partitions"`
152 : // The size of index block.
153 : IndexSize uint64 `prop:"rocksdb.index.size"`
154 : // The index type. TODO(peter): add a more detailed description.
155 : IndexType uint32 `prop:"rocksdb.block.based.table.index.type"`
156 : // For formats >= TableFormatPebblev4, this is set to true if the obsolete
157 : // bit is strict for all the point keys.
158 : IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"`
159 : // The name of the key schema used in this table. Empty for formats <=
160 : // TableFormatPebblev4.
161 : KeySchemaName string `prop:"pebble.colblk.schema"`
162 : // The name of the merger used in this table. Empty if no merger is used.
163 : MergerName string `prop:"rocksdb.merge.operator"`
164 : // The number of merge operands in the table.
165 : NumMergeOperands uint64 `prop:"rocksdb.merge.operands"`
166 : // The number of RANGEKEYUNSETs in this table.
167 : NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"`
168 : // The number of value blocks in this table. Only serialized if > 0.
169 : NumValueBlocks uint64 `prop:"pebble.num.value-blocks"`
170 : // The number of values stored in value blocks. Only serialized if > 0.
171 : NumValuesInValueBlocks uint64 `prop:"pebble.num.values.in.value-blocks"`
172 : // A comma separated list of names of the property collectors used in this
173 : // table.
174 : PropertyCollectorNames string `prop:"rocksdb.property.collectors"`
175 : // Total raw rangekey key size.
176 : RawRangeKeyKeySize uint64 `prop:"pebble.raw.range-key.key.size"`
177 : // Total raw rangekey value size.
178 : RawRangeKeyValueSize uint64 `prop:"pebble.raw.range-key.value.size"`
179 : // The total number of keys in this table that were pinned by open snapshots.
180 : SnapshotPinnedKeys uint64 `prop:"pebble.num.snapshot-pinned-keys"`
181 : // The cumulative bytes of keys in this table that were pinned by
182 : // open snapshots. This value is comparable to RawKeySize.
183 : SnapshotPinnedKeySize uint64 `prop:"pebble.raw.snapshot-pinned-keys.size"`
184 : // The cumulative bytes of values in this table that were pinned by
185 : // open snapshots. This value is comparable to RawValueSize.
186 : SnapshotPinnedValueSize uint64 `prop:"pebble.raw.snapshot-pinned-values.size"`
187 : // Size of the top-level index if kTwoLevelIndexSearch is used.
188 : TopLevelIndexSize uint64 `prop:"rocksdb.top-level.index.size"`
189 : // User collected properties. Currently, we only use them to store block
190 : // properties aggregated at the table level.
191 : UserProperties map[string]string
192 :
193 : // Loaded set indicating which fields have been loaded from disk. Indexed by
194 : // the field's byte offset within the struct
195 : // (reflect.StructField.Offset). Only set if the properties have been loaded
196 : // from a file. Only exported for testing purposes.
197 : Loaded map[uintptr]struct{}
198 : }
199 :
200 : // NumPointDeletions returns the number of point deletions in this table.
201 0 : func (p *Properties) NumPointDeletions() uint64 {
202 0 : return p.NumDeletions - p.NumRangeDeletions
203 0 : }
204 :
205 : // NumRangeKeys returns a count of the number of range keys in this table.
206 1 : func (p *Properties) NumRangeKeys() uint64 {
207 1 : return p.NumRangeKeyDels + p.NumRangeKeySets + p.NumRangeKeyUnsets
208 1 : }
209 :
// writeProperties appends one "tag: value\n" line to buf for every field of
// the struct value v that carries a `prop` tag. Struct-typed fields are
// treated as embedded property structs and are descended into recursively.
//
// A field holding its zero value is omitted unless its offset
// (reflect.StructField.Offset, relative to the struct that directly contains
// it) is present in loaded. loaded may be nil.
//
// Panics if a tagged field is not of kind bool, uint32, uint64 or string.
func writeProperties(loaded map[uintptr]struct{}, v reflect.Value, buf *bytes.Buffer) {
	vt := v.Type()
	for i := 0; i < v.NumField(); i++ {
		ft := vt.Field(i)
		if ft.Type.Kind() == reflect.Struct {
			// Embedded struct within the properties.
			writeProperties(loaded, v.Field(i), buf)
			continue
		}
		tag := ft.Tag.Get("prop")
		if tag == "" {
			continue
		}

		f := v.Field(i)
		// Skip printing of zero values which were not loaded from disk.
		// IsZero avoids boxing the field and a zero value into interfaces
		// just to compare them.
		if f.IsZero() {
			if _, ok := loaded[ft.Offset]; !ok {
				continue
			}
		}

		switch ft.Type.Kind() {
		case reflect.Bool:
			fmt.Fprintf(buf, "%s: %t\n", tag, f.Bool())
		case reflect.Uint32, reflect.Uint64:
			fmt.Fprintf(buf, "%s: %d\n", tag, f.Uint())
		case reflect.String:
			fmt.Fprintf(buf, "%s: %s\n", tag, f.String())
		default:
			panic("not reached")
		}
	}
}
248 :
249 0 : func (p *Properties) String() string {
250 0 : var buf bytes.Buffer
251 0 : v := reflect.ValueOf(*p)
252 0 : writeProperties(p.Loaded, v, &buf)
253 0 :
254 0 : // Write the UserProperties.
255 0 : keys := make([]string, 0, len(p.UserProperties))
256 0 : for key := range p.UserProperties {
257 0 : keys = append(keys, key)
258 0 : }
259 0 : sort.Strings(keys)
260 0 : for _, key := range keys {
261 0 : // If there are characters outside of the printable ASCII range, print
262 0 : // the value in hexadecimal.
263 0 : if strings.IndexFunc(p.UserProperties[key], func(r rune) bool { return r < ' ' || r > '~' }) != -1 {
264 0 : fmt.Fprintf(&buf, "%s: hex:%x\n", key, p.UserProperties[key])
265 0 : } else {
266 0 : fmt.Fprintf(&buf, "%s: %s\n", key, p.UserProperties[key])
267 0 : }
268 : }
269 0 : return buf.String()
270 : }
271 :
272 1 : func (p *Properties) load(b []byte, deniedUserProperties map[string]struct{}) error {
273 1 : i, err := rowblk.NewRawIter(bytes.Compare, b)
274 1 : if err != nil {
275 0 : return err
276 0 : }
277 1 : p.Loaded = make(map[uintptr]struct{})
278 1 : v := reflect.ValueOf(p).Elem()
279 1 :
280 1 : for valid := i.First(); valid; valid = i.Next() {
281 1 : if f, ok := propTagMap[string(i.Key().UserKey)]; ok {
282 1 : p.Loaded[f.Offset] = struct{}{}
283 1 : field := v.FieldByIndex(f.Index)
284 1 : switch f.Type.Kind() {
285 0 : case reflect.Bool:
286 0 : field.SetBool(bytes.Equal(i.Value(), propBoolTrue))
287 1 : case reflect.Uint32:
288 1 : field.SetUint(uint64(binary.LittleEndian.Uint32(i.Value())))
289 1 : case reflect.Uint64:
290 1 : n, _ := binary.Uvarint(i.Value())
291 1 : field.SetUint(n)
292 1 : case reflect.String:
293 1 : field.SetString(intern.Bytes(i.Value()))
294 0 : default:
295 0 : panic("not reached")
296 : }
297 1 : continue
298 : }
299 1 : if p.UserProperties == nil {
300 1 : p.UserProperties = make(map[string]string)
301 1 : }
302 :
303 1 : if _, denied := deniedUserProperties[string(i.Key().UserKey)]; !denied {
304 1 : p.UserProperties[intern.Bytes(i.Key().UserKey)] = string(i.Value())
305 1 : }
306 : }
307 1 : return nil
308 : }
309 :
310 0 : func (p *Properties) saveBool(m map[string][]byte, offset uintptr, value bool) {
311 0 : tag := propOffsetTagMap[offset]
312 0 : if value {
313 0 : m[tag] = propBoolTrue
314 0 : } else {
315 0 : m[tag] = propBoolFalse
316 0 : }
317 : }
318 :
319 1 : func (p *Properties) saveUint32(m map[string][]byte, offset uintptr, value uint32) {
320 1 : var buf [4]byte
321 1 : binary.LittleEndian.PutUint32(buf[:], value)
322 1 : m[propOffsetTagMap[offset]] = buf[:]
323 1 : }
324 :
325 0 : func (p *Properties) saveUint64(m map[string][]byte, offset uintptr, value uint64) {
326 0 : var buf [8]byte
327 0 : binary.LittleEndian.PutUint64(buf[:], value)
328 0 : m[propOffsetTagMap[offset]] = buf[:]
329 0 : }
330 :
331 : var _ = (*Properties).saveUint64
332 :
333 1 : func (p *Properties) saveUvarint(m map[string][]byte, offset uintptr, value uint64) {
334 1 : var buf [10]byte
335 1 : n := binary.PutUvarint(buf[:], value)
336 1 : m[propOffsetTagMap[offset]] = buf[:n]
337 1 : }
338 :
339 1 : func (p *Properties) saveString(m map[string][]byte, offset uintptr, value string) {
340 1 : m[propOffsetTagMap[offset]] = []byte(value)
341 1 : }
342 :
343 1 : func (p *Properties) save(tblFormat TableFormat, w *rowblk.Writer) {
344 1 : m := make(map[string][]byte)
345 1 : for k, v := range p.UserProperties {
346 1 : m[k] = []byte(v)
347 1 : }
348 :
349 1 : if p.ComparerName != "" {
350 1 : p.saveString(m, unsafe.Offsetof(p.ComparerName), p.ComparerName)
351 1 : }
352 1 : if p.CompressionName != "" {
353 1 : p.saveString(m, unsafe.Offsetof(p.CompressionName), p.CompressionName)
354 1 : }
355 1 : if p.CompressionOptions != "" {
356 1 : p.saveString(m, unsafe.Offsetof(p.CompressionOptions), p.CompressionOptions)
357 1 : }
358 1 : p.saveUvarint(m, unsafe.Offsetof(p.DataSize), p.DataSize)
359 1 : if p.FilterPolicyName != "" {
360 1 : p.saveString(m, unsafe.Offsetof(p.FilterPolicyName), p.FilterPolicyName)
361 1 : }
362 1 : p.saveUvarint(m, unsafe.Offsetof(p.FilterSize), p.FilterSize)
363 1 : if p.IndexPartitions != 0 {
364 1 : p.saveUvarint(m, unsafe.Offsetof(p.IndexPartitions), p.IndexPartitions)
365 1 : p.saveUvarint(m, unsafe.Offsetof(p.TopLevelIndexSize), p.TopLevelIndexSize)
366 1 : }
367 1 : p.saveUvarint(m, unsafe.Offsetof(p.IndexSize), p.IndexSize)
368 1 : p.saveUint32(m, unsafe.Offsetof(p.IndexType), p.IndexType)
369 1 : if p.IsStrictObsolete {
370 0 : p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete)
371 0 : }
372 1 : if p.KeySchemaName != "" {
373 1 : p.saveString(m, unsafe.Offsetof(p.KeySchemaName), p.KeySchemaName)
374 1 : }
375 1 : if p.MergerName != "" {
376 1 : p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName)
377 1 : }
378 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumDataBlocks), p.NumDataBlocks)
379 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumEntries), p.NumEntries)
380 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions)
381 1 : if p.NumSizedDeletions > 0 {
382 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumSizedDeletions), p.NumSizedDeletions)
383 1 : }
384 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands)
385 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions)
386 1 : // NB: We only write out some properties for Pebble formats. This isn't
387 1 : // strictly necessary because unrecognized properties are interpreted as
388 1 : // user-defined properties, however writing them prevents byte-for-byte
389 1 : // equivalence with RocksDB files that some of our testing requires.
390 1 : if p.RawPointTombstoneKeySize > 0 && tblFormat >= TableFormatPebblev1 {
391 1 : p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneKeySize), p.RawPointTombstoneKeySize)
392 1 : }
393 1 : if p.RawPointTombstoneValueSize > 0 {
394 1 : p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneValueSize), p.RawPointTombstoneValueSize)
395 1 : }
396 1 : if p.NumRangeKeys() > 0 {
397 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyDels), p.NumRangeKeyDels)
398 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeySets), p.NumRangeKeySets)
399 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyUnsets), p.NumRangeKeyUnsets)
400 1 : p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyKeySize), p.RawRangeKeyKeySize)
401 1 : p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyValueSize), p.RawRangeKeyValueSize)
402 1 : }
403 1 : if p.NumValueBlocks > 0 {
404 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumValueBlocks), p.NumValueBlocks)
405 1 : }
406 1 : if p.NumValuesInValueBlocks > 0 {
407 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumValuesInValueBlocks), p.NumValuesInValueBlocks)
408 1 : }
409 1 : if p.PropertyCollectorNames != "" {
410 1 : p.saveString(m, unsafe.Offsetof(p.PropertyCollectorNames), p.PropertyCollectorNames)
411 1 : }
412 1 : if p.SnapshotPinnedKeys > 0 {
413 1 : p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeys), p.SnapshotPinnedKeys)
414 1 : p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeySize), p.SnapshotPinnedKeySize)
415 1 : p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedValueSize), p.SnapshotPinnedValueSize)
416 1 : }
417 1 : p.saveUvarint(m, unsafe.Offsetof(p.RawKeySize), p.RawKeySize)
418 1 : p.saveUvarint(m, unsafe.Offsetof(p.RawValueSize), p.RawValueSize)
419 1 : if p.ValueBlocksSize > 0 {
420 1 : p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize)
421 1 : }
422 1 : if p.NumTombstoneDenseBlocks != 0 {
423 1 : p.saveUvarint(m, unsafe.Offsetof(p.NumTombstoneDenseBlocks), p.NumTombstoneDenseBlocks)
424 1 : }
425 :
426 1 : if tblFormat < TableFormatPebblev1 {
427 0 : m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32)
428 0 : m["rocksdb.fixed.key.length"] = []byte{0x00}
429 0 : m["rocksdb.index.key.is.user.key"] = []byte{0x00}
430 0 : m["rocksdb.index.value.is.delta.encoded"] = []byte{0x00}
431 0 : m["rocksdb.oldest.key.time"] = []byte{0x00}
432 0 : m["rocksdb.creation.time"] = []byte{0x00}
433 0 : m["rocksdb.format.version"] = []byte{0x00}
434 0 : }
435 :
436 1 : keys := make([]string, 0, len(m))
437 1 : for key := range m {
438 1 : keys = append(keys, key)
439 1 : }
440 1 : sort.Strings(keys)
441 1 : for _, key := range keys {
442 1 : w.AddRawString(key, m[key])
443 1 : }
444 : }
|