Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "encoding/binary"
10 : "fmt"
11 : "math"
12 : "reflect"
13 : "sort"
14 : "unsafe"
15 :
16 : "github.com/cockroachdb/pebble/internal/intern"
17 : )
18 :
const (
	// propertiesBlockRestartInterval is fixed at math.MaxInt32.
	// NOTE(review): presumably this is the restart interval used when writing
	// the properties block; the constant is not referenced in this part of the
	// file, so confirm against the writer.
	propertiesBlockRestartInterval = math.MaxInt32

	// propGlobalSeqnumName is the tag of the global sequence number property.
	// Unlike the other uint64 properties, its value is encoded as a
	// fixed-width little-endian integer rather than as a uvarint.
	propGlobalSeqnumName = "rocksdb.external_sst_file.global_seqno"
)

var (
	// propTagMap maps a `prop` struct tag to the struct field that carries it.
	propTagMap = make(map[string]reflect.StructField)

	// propOffsetTagMap maps a field's byte offset within Properties to the
	// field's `prop` tag.
	propOffsetTagMap = make(map[uintptr]string)

	// propBoolTrue is the serialized form of a true bool property.
	propBoolTrue = []byte{'1'}

	// propBoolFalse is the serialized form of a false bool property.
	propBoolFalse = []byte{'0'}
)
27 :
28 2 : func generateTagMaps(t reflect.Type) {
29 2 : for i := 0; i < t.NumField(); i++ {
30 2 : f := t.Field(i)
31 2 : if f.Type.Kind() == reflect.Struct {
32 2 : if tag := f.Tag.Get("prop"); i == 0 && tag == "pebble.embbeded_common_properties" {
33 2 : // CommonProperties struct embedded in Properties. Note that since
34 2 : // CommonProperties is placed at the top of properties we can use
35 2 : // the offsets of the fields within CommonProperties to determine
36 2 : // the offsets of those fields within Properties.
37 2 : generateTagMaps(f.Type)
38 2 : continue
39 : }
40 0 : panic("pebble: unknown struct type in Properties")
41 : }
42 2 : if tag := f.Tag.Get("prop"); tag != "" {
43 2 : switch f.Type.Kind() {
44 2 : case reflect.Bool:
45 2 : case reflect.Uint32:
46 2 : case reflect.Uint64:
47 2 : case reflect.String:
48 0 : default:
49 0 : panic(fmt.Sprintf("unsupported property field type: %s %s", f.Name, f.Type))
50 : }
51 2 : propTagMap[tag] = f
52 2 : propOffsetTagMap[f.Offset] = tag
53 : }
54 : }
55 : }
56 :
57 2 : func init() {
58 2 : t := reflect.TypeOf(Properties{})
59 2 : generateTagMaps(t)
60 2 : }
61 :
// CommonProperties holds the properties shared by physical and virtual
// sstables. Code that does not need to distinguish between the two kinds of
// sstable can operate on this type.
//
// For virtual sstables, the fields are constructed through extrapolation when
// the virtual reader is constructed. See MakeVirtualReader for implementation
// details.
//
// NB: The values of these properties can affect correctness. For example, if
// NumRangeKeySets == 0 but the sstable actually contains range keys, then the
// iterators will behave incorrectly.
type CommonProperties struct {
	// NumEntries is the number of entries in this table.
	NumEntries uint64 `prop:"rocksdb.num.entries"`
	// RawKeySize is the total raw key size.
	RawKeySize uint64 `prop:"rocksdb.raw.key.size"`
	// RawValueSize is the total raw value size.
	RawValueSize uint64 `prop:"rocksdb.raw.value.size"`
	// RawPointTombstoneKeySize is the total raw key size of point deletion
	// tombstones. This value is comparable to RawKeySize.
	RawPointTombstoneKeySize uint64 `prop:"pebble.raw.point-tombstone.key.size"`
	// RawPointTombstoneValueSize is the sum of the raw value sizes carried by
	// point deletion tombstones containing size estimates. See the DeleteSized
	// key kind. This value is comparable to Raw{Key,Value}Size.
	RawPointTombstoneValueSize uint64 `prop:"pebble.raw.point-tombstone.value.size"`
	// NumSizedDeletions is the number of point deletion entries
	// ("tombstones") in this table that carry a size hint indicating the size
	// of the value the tombstone deletes.
	NumSizedDeletions uint64 `prop:"pebble.num.deletions.sized"`
	// NumDeletions is the number of deletion entries in this table, including
	// both point and range deletions.
	NumDeletions uint64 `prop:"rocksdb.deleted.keys"`
	// NumRangeDeletions is the number of range deletions in this table.
	NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"`
	// NumRangeKeyDels is the number of RANGEKEYDELs in this table.
	NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"`
	// NumRangeKeySets is the number of RANGEKEYSETs in this table.
	NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"`
	// ValueBlocksSize is the total size of the value blocks and the value
	// index block. Only serialized if > 0.
	ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"`
}
101 :
102 : // String is only used for testing purposes.
103 1 : func (c *CommonProperties) String() string {
104 1 : var buf bytes.Buffer
105 1 : v := reflect.ValueOf(*c)
106 1 : loaded := make(map[uintptr]struct{})
107 1 : writeProperties(loaded, v, &buf)
108 1 : return buf.String()
109 1 : }
110 :
111 : // NumPointDeletions is the number of point deletions in the sstable. For virtual
112 : // sstables, this is an estimate.
113 2 : func (c *CommonProperties) NumPointDeletions() uint64 {
114 2 : return c.NumDeletions - c.NumRangeDeletions
115 2 : }
116 :
117 : // Properties holds the sstable property values. The properties are
118 : // automatically populated during sstable creation and load from the properties
119 : // meta block when an sstable is opened.
120 : type Properties struct {
121 : // CommonProperties needs to be at the top of the Properties struct so that the
122 : // offsets of the fields in CommonProperties match the offsets of the embedded
123 : // fields of CommonProperties in Properties.
124 : CommonProperties `prop:"pebble.embbeded_common_properties"`
125 :
126 : // The name of the comparer used in this table.
127 : ComparerName string `prop:"rocksdb.comparator"`
128 : // The compression algorithm used to compress blocks.
129 : CompressionName string `prop:"rocksdb.compression"`
130 : // The compression options used to compress blocks.
131 : CompressionOptions string `prop:"rocksdb.compression_options"`
132 : // The total size of all data blocks.
133 : DataSize uint64 `prop:"rocksdb.data.size"`
134 : // The external sstable version format. Version 2 is the one RocksDB has been
135 : // using since 5.13. RocksDB only uses the global sequence number for an
136 : // sstable if this property has been set.
137 : ExternalFormatVersion uint32 `prop:"rocksdb.external_sst_file.version"`
138 : // The name of the filter policy used in this table. Empty if no filter
139 : // policy is used.
140 : FilterPolicyName string `prop:"rocksdb.filter.policy"`
141 : // The size of filter block.
142 : FilterSize uint64 `prop:"rocksdb.filter.size"`
143 : // The global sequence number to use for all entries in the table. Present if
144 : // the table was created externally and ingested whole.
145 : GlobalSeqNum uint64 `prop:"rocksdb.external_sst_file.global_seqno"`
146 : // Total number of index partitions if kTwoLevelIndexSearch is used.
147 : IndexPartitions uint64 `prop:"rocksdb.index.partitions"`
148 : // The size of index block.
149 : IndexSize uint64 `prop:"rocksdb.index.size"`
150 : // The index type. TODO(peter): add a more detailed description.
151 : IndexType uint32 `prop:"rocksdb.block.based.table.index.type"`
152 : // For formats >= TableFormatPebblev4, this is set to true if the obsolete
153 : // bit is strict for all the point keys.
154 : IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"`
155 : // The name of the merger used in this table. Empty if no merger is used.
156 : MergerName string `prop:"rocksdb.merge.operator"`
157 : // The number of blocks in this table.
158 : NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"`
159 : // The number of merge operands in the table.
160 : NumMergeOperands uint64 `prop:"rocksdb.merge.operands"`
161 : // The number of RANGEKEYUNSETs in this table.
162 : NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"`
163 : // The number of value blocks in this table. Only serialized if > 0.
164 : NumValueBlocks uint64 `prop:"pebble.num.value-blocks"`
165 : // The number of values stored in value blocks. Only serialized if > 0.
166 : NumValuesInValueBlocks uint64 `prop:"pebble.num.values.in.value-blocks"`
167 : // The name of the prefix extractor used in this table. Empty if no prefix
168 : // extractor is used.
169 : PrefixExtractorName string `prop:"rocksdb.prefix.extractor.name"`
170 : // If filtering is enabled, was the filter created on the key prefix.
171 : PrefixFiltering bool `prop:"rocksdb.block.based.table.prefix.filtering"`
172 : // A comma separated list of names of the property collectors used in this
173 : // table.
174 : PropertyCollectorNames string `prop:"rocksdb.property.collectors"`
175 : // Total raw rangekey key size.
176 : RawRangeKeyKeySize uint64 `prop:"pebble.raw.range-key.key.size"`
177 : // Total raw rangekey value size.
178 : RawRangeKeyValueSize uint64 `prop:"pebble.raw.range-key.value.size"`
179 : // The total number of keys in this table that were pinned by open snapshots.
180 : SnapshotPinnedKeys uint64 `prop:"pebble.num.snapshot-pinned-keys"`
181 : // The cumulative bytes of keys in this table that were pinned by
182 : // open snapshots. This value is comparable to RawKeySize.
183 : SnapshotPinnedKeySize uint64 `prop:"pebble.raw.snapshot-pinned-keys.size"`
184 : // The cumulative bytes of values in this table that were pinned by
185 : // open snapshots. This value is comparable to RawValueSize.
186 : SnapshotPinnedValueSize uint64 `prop:"pebble.raw.snapshot-pinned-values.size"`
187 : // Size of the top-level index if kTwoLevelIndexSearch is used.
188 : TopLevelIndexSize uint64 `prop:"rocksdb.top-level.index.size"`
189 : // User collected properties.
190 : UserProperties map[string]string
191 : // If filtering is enabled, was the filter created on the whole key.
192 : WholeKeyFiltering bool `prop:"rocksdb.block.based.table.whole.key.filtering"`
193 :
194 : // Loaded set indicating which fields have been loaded from disk. Indexed by
195 : // the field's byte offset within the struct
196 : // (reflect.StructField.Offset). Only set if the properties have been loaded
197 : // from a file. Only exported for testing purposes.
198 : Loaded map[uintptr]struct{}
199 : }
200 :
201 : // NumPointDeletions returns the number of point deletions in this table.
202 1 : func (p *Properties) NumPointDeletions() uint64 {
203 1 : return p.NumDeletions - p.NumRangeDeletions
204 1 : }
205 :
206 : // NumRangeKeys returns a count of the number of range keys in this table.
207 2 : func (p *Properties) NumRangeKeys() uint64 {
208 2 : return p.NumRangeKeyDels + p.NumRangeKeySets + p.NumRangeKeyUnsets
209 2 : }
210 :
// writeProperties appends one "tag: value\n" line to buf for each field of the
// struct value v that carries a `prop` tag, in field declaration order.
//
// Struct-typed fields are recursed into, so the fields of an embedded struct
// are printed in place. Fields without a `prop` tag are skipped.
//
// A field holding its zero value is printed only if its offset
// (reflect.StructField.Offset) is present in loaded; a nil loaded map is
// treated as empty. Offsets of fields inside a nested struct are relative to
// that nested struct.
//
// Panics if a tagged field is not a bool, uint32, uint64 or string.
func writeProperties(loaded map[uintptr]struct{}, v reflect.Value, buf *bytes.Buffer) {
	vt := v.Type()
	for i := 0; i < v.NumField(); i++ {
		ft := vt.Field(i)
		f := v.Field(i)
		if ft.Type.Kind() == reflect.Struct {
			// Embedded struct within the properties.
			writeProperties(loaded, f, buf)
			continue
		}
		tag := ft.Tag.Get("prop")
		if tag == "" {
			continue
		}

		// Skip printing of zero values which were not loaded from disk.
		// IsZero avoids boxing the field and a freshly built zero value into
		// interfaces just to compare them.
		if f.IsZero() {
			if _, ok := loaded[ft.Offset]; !ok {
				continue
			}
		}

		fmt.Fprintf(buf, "%s: ", tag)
		switch ft.Type.Kind() {
		case reflect.Bool:
			fmt.Fprintf(buf, "%t\n", f.Bool())
		case reflect.Uint32, reflect.Uint64:
			fmt.Fprintf(buf, "%d\n", f.Uint())
		case reflect.String:
			fmt.Fprintf(buf, "%s\n", f.String())
		default:
			panic("not reached")
		}
	}
}
249 :
250 1 : func (p *Properties) String() string {
251 1 : var buf bytes.Buffer
252 1 : v := reflect.ValueOf(*p)
253 1 : writeProperties(p.Loaded, v, &buf)
254 1 :
255 1 : // Write the UserProperties.
256 1 : keys := make([]string, 0, len(p.UserProperties))
257 1 : for key := range p.UserProperties {
258 1 : keys = append(keys, key)
259 1 : }
260 1 : sort.Strings(keys)
261 1 : for _, key := range keys {
262 1 : fmt.Fprintf(&buf, "%s: %s\n", key, p.UserProperties[key])
263 1 : }
264 1 : return buf.String()
265 : }
266 :
267 : func (p *Properties) load(
268 : b block, blockOffset uint64, deniedUserProperties map[string]struct{},
269 2 : ) error {
270 2 : i, err := newRawBlockIter(bytes.Compare, b)
271 2 : if err != nil {
272 0 : return err
273 0 : }
274 2 : p.Loaded = make(map[uintptr]struct{})
275 2 : v := reflect.ValueOf(p).Elem()
276 2 : for valid := i.First(); valid; valid = i.Next() {
277 2 : if f, ok := propTagMap[string(i.Key().UserKey)]; ok {
278 2 : p.Loaded[f.Offset] = struct{}{}
279 2 : field := v.FieldByName(f.Name)
280 2 : switch f.Type.Kind() {
281 2 : case reflect.Bool:
282 2 : field.SetBool(bytes.Equal(i.Value(), propBoolTrue))
283 2 : case reflect.Uint32:
284 2 : field.SetUint(uint64(binary.LittleEndian.Uint32(i.Value())))
285 2 : case reflect.Uint64:
286 2 : var n uint64
287 2 : if string(i.Key().UserKey) == propGlobalSeqnumName {
288 2 : n = binary.LittleEndian.Uint64(i.Value())
289 2 : } else {
290 2 : n, _ = binary.Uvarint(i.Value())
291 2 : }
292 2 : field.SetUint(n)
293 2 : case reflect.String:
294 2 : field.SetString(intern.Bytes(i.Value()))
295 0 : default:
296 0 : panic("not reached")
297 : }
298 2 : continue
299 : }
300 2 : if p.UserProperties == nil {
301 2 : p.UserProperties = make(map[string]string)
302 2 : }
303 :
304 2 : if _, denied := deniedUserProperties[string(i.Key().UserKey)]; !denied {
305 2 : p.UserProperties[intern.Bytes(i.Key().UserKey)] = string(i.Value())
306 2 : }
307 : }
308 2 : return nil
309 : }
310 :
311 2 : func (p *Properties) saveBool(m map[string][]byte, offset uintptr, value bool) {
312 2 : tag := propOffsetTagMap[offset]
313 2 : if value {
314 2 : m[tag] = propBoolTrue
315 2 : } else {
316 2 : m[tag] = propBoolFalse
317 2 : }
318 : }
319 :
320 2 : func (p *Properties) saveUint32(m map[string][]byte, offset uintptr, value uint32) {
321 2 : var buf [4]byte
322 2 : binary.LittleEndian.PutUint32(buf[:], value)
323 2 : m[propOffsetTagMap[offset]] = buf[:]
324 2 : }
325 :
326 2 : func (p *Properties) saveUint64(m map[string][]byte, offset uintptr, value uint64) {
327 2 : var buf [8]byte
328 2 : binary.LittleEndian.PutUint64(buf[:], value)
329 2 : m[propOffsetTagMap[offset]] = buf[:]
330 2 : }
331 :
332 2 : func (p *Properties) saveUvarint(m map[string][]byte, offset uintptr, value uint64) {
333 2 : var buf [10]byte
334 2 : n := binary.PutUvarint(buf[:], value)
335 2 : m[propOffsetTagMap[offset]] = buf[:n]
336 2 : }
337 :
338 2 : func (p *Properties) saveString(m map[string][]byte, offset uintptr, value string) {
339 2 : m[propOffsetTagMap[offset]] = []byte(value)
340 2 : }
341 :
342 2 : func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) {
343 2 : m := make(map[string][]byte)
344 2 : for k, v := range p.UserProperties {
345 2 : m[k] = []byte(v)
346 2 : }
347 :
348 2 : if p.ComparerName != "" {
349 2 : p.saveString(m, unsafe.Offsetof(p.ComparerName), p.ComparerName)
350 2 : }
351 2 : if p.CompressionName != "" {
352 2 : p.saveString(m, unsafe.Offsetof(p.CompressionName), p.CompressionName)
353 2 : }
354 2 : if p.CompressionOptions != "" {
355 2 : p.saveString(m, unsafe.Offsetof(p.CompressionOptions), p.CompressionOptions)
356 2 : }
357 2 : p.saveUvarint(m, unsafe.Offsetof(p.DataSize), p.DataSize)
358 2 : if p.ExternalFormatVersion != 0 {
359 2 : p.saveUint32(m, unsafe.Offsetof(p.ExternalFormatVersion), p.ExternalFormatVersion)
360 2 : p.saveUint64(m, unsafe.Offsetof(p.GlobalSeqNum), p.GlobalSeqNum)
361 2 : }
362 2 : if p.FilterPolicyName != "" {
363 2 : p.saveString(m, unsafe.Offsetof(p.FilterPolicyName), p.FilterPolicyName)
364 2 : }
365 2 : p.saveUvarint(m, unsafe.Offsetof(p.FilterSize), p.FilterSize)
366 2 : if p.IndexPartitions != 0 {
367 2 : p.saveUvarint(m, unsafe.Offsetof(p.IndexPartitions), p.IndexPartitions)
368 2 : p.saveUvarint(m, unsafe.Offsetof(p.TopLevelIndexSize), p.TopLevelIndexSize)
369 2 : }
370 2 : p.saveUvarint(m, unsafe.Offsetof(p.IndexSize), p.IndexSize)
371 2 : p.saveUint32(m, unsafe.Offsetof(p.IndexType), p.IndexType)
372 2 : if p.IsStrictObsolete {
373 1 : p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete)
374 1 : }
375 2 : if p.MergerName != "" {
376 2 : p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName)
377 2 : }
378 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumDataBlocks), p.NumDataBlocks)
379 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumEntries), p.NumEntries)
380 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions)
381 2 : if p.NumSizedDeletions > 0 {
382 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumSizedDeletions), p.NumSizedDeletions)
383 2 : }
384 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands)
385 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions)
386 2 : // NB: We only write out some properties for Pebble formats. This isn't
387 2 : // strictly necessary because unrecognized properties are interpreted as
388 2 : // user-defined properties, however writing them prevents byte-for-byte
389 2 : // equivalence with RocksDB files that some of our testing requires.
390 2 : if p.RawPointTombstoneKeySize > 0 && tblFormat >= TableFormatPebblev1 {
391 2 : p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneKeySize), p.RawPointTombstoneKeySize)
392 2 : }
393 2 : if p.RawPointTombstoneValueSize > 0 {
394 2 : p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneValueSize), p.RawPointTombstoneValueSize)
395 2 : }
396 2 : if p.NumRangeKeys() > 0 {
397 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyDels), p.NumRangeKeyDels)
398 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeySets), p.NumRangeKeySets)
399 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyUnsets), p.NumRangeKeyUnsets)
400 2 : p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyKeySize), p.RawRangeKeyKeySize)
401 2 : p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyValueSize), p.RawRangeKeyValueSize)
402 2 : }
403 2 : if p.NumValueBlocks > 0 {
404 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumValueBlocks), p.NumValueBlocks)
405 2 : }
406 2 : if p.NumValuesInValueBlocks > 0 {
407 2 : p.saveUvarint(m, unsafe.Offsetof(p.NumValuesInValueBlocks), p.NumValuesInValueBlocks)
408 2 : }
409 2 : if p.PrefixExtractorName != "" {
410 2 : p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName)
411 2 : }
412 2 : p.saveBool(m, unsafe.Offsetof(p.PrefixFiltering), p.PrefixFiltering)
413 2 : if p.PropertyCollectorNames != "" {
414 2 : p.saveString(m, unsafe.Offsetof(p.PropertyCollectorNames), p.PropertyCollectorNames)
415 2 : }
416 2 : if p.SnapshotPinnedKeys > 0 {
417 2 : p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeys), p.SnapshotPinnedKeys)
418 2 : p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeySize), p.SnapshotPinnedKeySize)
419 2 : p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedValueSize), p.SnapshotPinnedValueSize)
420 2 : }
421 2 : p.saveUvarint(m, unsafe.Offsetof(p.RawKeySize), p.RawKeySize)
422 2 : p.saveUvarint(m, unsafe.Offsetof(p.RawValueSize), p.RawValueSize)
423 2 : if p.ValueBlocksSize > 0 {
424 2 : p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize)
425 2 : }
426 2 : p.saveBool(m, unsafe.Offsetof(p.WholeKeyFiltering), p.WholeKeyFiltering)
427 2 :
428 2 : if tblFormat < TableFormatPebblev1 {
429 1 : m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32)
430 1 : m["rocksdb.fixed.key.length"] = []byte{0x00}
431 1 : m["rocksdb.index.key.is.user.key"] = []byte{0x00}
432 1 : m["rocksdb.index.value.is.delta.encoded"] = []byte{0x00}
433 1 : m["rocksdb.oldest.key.time"] = []byte{0x00}
434 1 : m["rocksdb.creation.time"] = []byte{0x00}
435 1 : m["rocksdb.format.version"] = []byte{0x00}
436 1 : }
437 :
438 2 : keys := make([]string, 0, len(m))
439 2 : for key := range m {
440 2 : keys = append(keys, key)
441 2 : }
442 2 : sort.Strings(keys)
443 2 : for _, key := range keys {
444 2 : w.add(InternalKey{UserKey: []byte(key)}, m[key])
445 2 : }
446 : }
|