// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/cache"
)

// Compression is the per-block compression algorithm to use.
type Compression int

// The available compression types.
const (
	DefaultCompression Compression = iota
	NoCompression
	SnappyCompression
	ZstdCompression
	NCompression
)

var ignoredInternalProperties = map[string]struct{}{
	"rocksdb.column.family.id":             {},
	"rocksdb.fixed.key.length":             {},
	"rocksdb.index.key.is.user.key":        {},
	"rocksdb.index.value.is.delta.encoded": {},
	"rocksdb.oldest.key.time":              {},
	"rocksdb.creation.time":                {},
	"rocksdb.file.creation.time":           {},
	"rocksdb.format.version":               {},
}

func (c Compression) String() string {
	switch c {
	case DefaultCompression:
		return "Default"
	case NoCompression:
		return "NoCompression"
	case SnappyCompression:
		return "Snappy"
	case ZstdCompression:
		return "ZSTD"
	default:
		return "Unknown"
	}
}

// FilterType exports the base.FilterType type.
type FilterType = base.FilterType

// Exported TableFilter constants.
const (
	TableFilter = base.TableFilter
)

// FilterWriter exports the base.FilterWriter type.
type FilterWriter = base.FilterWriter

// FilterPolicy exports the base.FilterPolicy type.
type FilterPolicy = base.FilterPolicy

// TablePropertyCollector provides a hook for collecting user-defined
// properties based on the keys and values stored in an sstable. A new
// TablePropertyCollector is created for an sstable when the sstable is being
// written.
type TablePropertyCollector interface {
	// Add is called with each new entry added to the sstable. While the sstable
	// is itself sorted by key, do not assume that the entries are added in any
	// order. In particular, the ordering of point entries and range tombstones
	// is unspecified.
	Add(key InternalKey, value []byte) error

	// Finish is called when all entries have been added to the sstable. The
	// collected properties (if any) should be added to the specified map. Note
	// that in case of an error during sstable construction, Finish may not be
	// called.
	Finish(userProps map[string]string) error

	// Name returns the name of the property collector.
	Name() string
}
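// As an illustrative sketch only (not part of this package's API): a minimal
// TablePropertyCollector that counts the entries written to a table could
// look like the following. The type name and property key are hypothetical,
// and the snippet assumes "strconv" has been added to the imports. Note that
// Add only increments a counter, so it is insensitive to the unspecified
// ordering of entries noted above.
//
//	type entryCountCollector struct {
//		count int
//	}
//
//	func (c *entryCountCollector) Add(key InternalKey, value []byte) error {
//		c.count++
//		return nil
//	}
//
//	func (c *entryCountCollector) Finish(userProps map[string]string) error {
//		userProps[c.Name()] = strconv.Itoa(c.count)
//		return nil
//	}
//
//	func (c *entryCountCollector) Name() string { return "example.entry.count" }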
// SuffixReplaceableTableCollector is an extension to the TablePropertyCollector
// interface that allows a table property collector to indicate that it supports
// being *updated* during suffix replacement, i.e. when an existing SST in which
// all keys have the same key suffix is updated to have a new suffix.
//
// A collector which supports being updated in such cases must be able to derive
// its updated value from its old value and the change being made to the suffix,
// without needing to be passed each updated K/V.
//
// For example, a collector that only inspects values can simply copy its
// previously computed property as-is, since key-suffix replacement does not
// change values. A collector that depends only on key suffixes, such as one
// that collects mvcc-timestamp bounds from timestamp-suffixed keys, can derive
// its new bounds directly from the new suffix, as it is common to all keys,
// without recomputing them from every key.
type SuffixReplaceableTableCollector interface {
	// UpdateKeySuffixes is called when a table is updated to change the suffix
	// of all keys in the table, and is passed the table's previously collected
	// properties, if any, as well as the old and new suffix.
	UpdateKeySuffixes(oldProps map[string]string, oldSuffix, newSuffix []byte) error
}
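// As an illustrative sketch only: a hypothetical suffixBoundCollector tracking
// suffix-derived bounds (with hypothetical lower/upper fields) could implement
// the update from the new suffix alone, without revisiting any keys, because
// after replacement the new suffix is common to every key in the table.
//
//	func (c *suffixBoundCollector) UpdateKeySuffixes(
//		oldProps map[string]string, oldSuffix, newSuffix []byte,
//	) error {
//		// Every key now carries newSuffix, so both bounds collapse to it;
//		// the original per-key computation does not need to be repeated.
//		c.lower = append(c.lower[:0], newSuffix...)
//		c.upper = append(c.upper[:0], newSuffix...)
//		return nil
//	}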
// ReaderOptions holds the parameters needed for reading an sstable.
type ReaderOptions struct {
	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default is a zero-size cache.
	Cache *cache.Cache

	// DeniedUserProperties specifies user properties that will not be added to
	// sst.Properties.UserProperties.
	DeniedUserProperties map[string]struct{}

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Merge defines the Merge function in use for this keyspace.
	Merge base.Merge

	// Filters is a map from filter policy name to filter policy. It is used for
	// debugging tools which may be used on multiple databases configured with
	// different filter policies. It is not necessary to populate this map
	// during normal usage of a DB.
	Filters map[string]FilterPolicy

	// MergerName defines the associative merge operation to use for merging
	// values written with {Batch,DB}.Merge. The MergerName is checked for
	// consistency with the value stored in the sstable when it was written.
	MergerName string

	// LoggerAndTracer is an optional logger and tracer.
	LoggerAndTracer base.LoggerAndTracer
}

func (o ReaderOptions) ensureDefaults() ReaderOptions {
	if o.Comparer == nil {
		o.Comparer = base.DefaultComparer
	}
	if o.Merge == nil {
		o.Merge = base.DefaultMerger.Merge
	}
	if o.MergerName == "" {
		o.MergerName = base.DefaultMerger.Name
	}
	if o.LoggerAndTracer == nil {
		o.LoggerAndTracer = base.NoopLoggerAndTracer{}
	}
	if o.DeniedUserProperties == nil {
		o.DeniedUserProperties = ignoredInternalProperties
	}
	return o
}
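// As an illustrative sketch only: a debugging tool opening sstables that may
// have been written with a bloom filter could populate ReaderOptions roughly
// as follows. bloom.FilterPolicy(10) is the pebble/bloom implementation
// mentioned in the FilterPolicy documentation below; its import is assumed.
//
//	fp := bloom.FilterPolicy(10)
//	opts := sstable.ReaderOptions{
//		Comparer: base.DefaultComparer,
//		Filters:  map[string]sstable.FilterPolicy{fp.Name(): fp},
//	}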
// WriterOptions holds the parameters used to control building an sstable.
type WriterOptions struct {
	// BlockRestartInterval is the number of keys between restart points
	// for delta encoding of keys.
	//
	// The default value is 16.
	BlockRestartInterval int

	// BlockSize is the target uncompressed size in bytes of each table block.
	//
	// The default value is 4096.
	BlockSize int

	// BlockSizeThreshold finishes a block if the block size is larger than the
	// specified percentage of the target block size and adding the next entry
	// would cause the block to be larger than the target block size.
	//
	// The default value is 90.
	BlockSizeThreshold int

	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default is a nil cache.
	Cache *cache.Cache

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Compression defines the per-block compression to use.
	//
	// The default value (DefaultCompression) uses snappy compression.
	Compression Compression

	// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
	// reduce disk reads for Get calls.
	//
	// One such implementation is bloom.FilterPolicy(10) from the pebble/bloom
	// package.
	//
	// The default value means to use no filter.
	FilterPolicy FilterPolicy

	// FilterType defines whether an existing filter policy is applied at a
	// block-level or table-level. Block-level filters use less memory to create,
	// but are slower to access as a check for the key in the index must first be
	// performed to locate the filter block. A table-level filter will require
	// memory proportional to the number of keys in an sstable to create, but
	// avoids the index lookup when determining if a key is present. Table-level
	// filters should be preferred except under constrained memory situations.
	FilterType FilterType

	// IndexBlockSize is the target uncompressed size in bytes of each index
	// block. When the index block size is larger than this target, two-level
	// indexes are automatically enabled. Setting this option to a large value
	// (such as math.MaxInt32) disables the automatic creation of two-level
	// indexes.
	//
	// The default value is the value of BlockSize.
	IndexBlockSize int

	// MergerName defines the associative merge operation to use for merging
	// values written with {Batch,DB}.Merge. The MergerName is checked for
	// consistency with the value stored in the sstable when it was written.
	MergerName string

	// TableFormat specifies the format version for writing sstables. The default
	// is TableFormatRocksDBv2 which creates RocksDB compatible sstables. Use
	// TableFormatLevelDB to create LevelDB compatible sstables which can be used
	// by a wider range of tools and libraries.
	TableFormat TableFormat

	// IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment
	// in format.go. Must be false if format < TableFormatPebblev4.
	//
	// TODO(bilal): set this when writing shared ssts.
	IsStrictObsolete bool

	// WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is
	// used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the
	// youngest for a userkey.
	WritingToLowestLevel bool

	// TablePropertyCollectors is a list of TablePropertyCollector creation
	// functions. A new TablePropertyCollector is created for each sstable
	// built and lives for the lifetime of the table.
	TablePropertyCollectors []func() TablePropertyCollector

	// BlockPropertyCollectors is a list of BlockPropertyCollector creation
	// functions. A new BlockPropertyCollector is created for each sstable
	// built and lives for the lifetime of writing that table.
	BlockPropertyCollectors []func() BlockPropertyCollector

	// Checksum specifies which checksum to use.
	Checksum ChecksumType

	// Parallelism is used to indicate that the sstable Writer is allowed to
	// compress data blocks and write data blocks to disk in parallel with the
	// Writer client goroutine.
	Parallelism bool

	// ShortAttributeExtractor mirrors
	// Options.Experimental.ShortAttributeExtractor.
	ShortAttributeExtractor base.ShortAttributeExtractor

	// RequiredInPlaceValueBound mirrors
	// Options.Experimental.RequiredInPlaceValueBound.
	RequiredInPlaceValueBound UserKeyPrefixBound
}

func (o WriterOptions) ensureDefaults() WriterOptions {
	if o.BlockRestartInterval <= 0 {
		o.BlockRestartInterval = base.DefaultBlockRestartInterval
	}
	if o.BlockSize <= 0 {
		o.BlockSize = base.DefaultBlockSize
	}
	if o.BlockSizeThreshold <= 0 {
		o.BlockSizeThreshold = base.DefaultBlockSizeThreshold
	}
	if o.Comparer == nil {
		o.Comparer = base.DefaultComparer
	}
	if o.Compression <= DefaultCompression || o.Compression >= NCompression {
		o.Compression = SnappyCompression
	}
	if o.IndexBlockSize <= 0 {
		o.IndexBlockSize = o.BlockSize
	}
	if o.MergerName == "" {
		o.MergerName = base.DefaultMerger.Name
	}
	if o.Checksum == ChecksumTypeNone {
		o.Checksum = ChecksumTypeCRC32c
	}
	// By default, if the table format is not specified, fall back to using the
	// most compatible format.
	if o.TableFormat == TableFormatUnspecified {
		o.TableFormat = TableFormatRocksDBv2
	}
	return o
}
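// As an illustrative sketch only: a caller writing a RocksDB-compatible table
// with a table-level bloom filter might configure the writer as follows; any
// zero-valued fields are filled in by ensureDefaults. The 32 KiB block size is
// an arbitrary example value, and the pebble/bloom import is assumed.
//
//	opts := sstable.WriterOptions{
//		BlockSize:    32 << 10,
//		Compression:  sstable.SnappyCompression,
//		FilterPolicy: bloom.FilterPolicy(10),
//		FilterType:   sstable.TableFilter,
//		TableFormat:  sstable.TableFormatRocksDBv2,
//	}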