Line data Source code
1 : // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 : // of this source code is governed by a BSD-style license that can be found in 3 : // the LICENSE file. 4 : 5 : package sstable 6 : 7 : import ( 8 : "fmt" 9 : 10 : "github.com/cockroachdb/crlib/fifo" 11 : "github.com/cockroachdb/pebble/internal/base" 12 : "github.com/cockroachdb/pebble/internal/sstableinternal" 13 : "github.com/cockroachdb/pebble/sstable/block" 14 : "github.com/cockroachdb/pebble/sstable/colblk" 15 : "github.com/cockroachdb/pebble/sstable/rowblk" 16 : ) 17 : 18 : const ( 19 : // MaximumBlockSize is the maximum permissible size of a block. 20 : MaximumBlockSize = rowblk.MaximumSize 21 : // DefaultNumDeletionsThreshold defines the minimum number of point 22 : // tombstones that must be present in a data block for it to be 23 : // considered tombstone-dense. 24 : DefaultNumDeletionsThreshold = 100 25 : // DefaultDeletionSizeRatioThreshold defines the minimum ratio of the size 26 : // of point tombstones to the size of the data block in order to consider the 27 : // block as tombstone-dense. 28 : DefaultDeletionSizeRatioThreshold = 0.5 29 : ) 30 : 31 : var ignoredInternalProperties = map[string]struct{}{ 32 : "rocksdb.column.family.id": {}, 33 : "rocksdb.fixed.key.length": {}, 34 : "rocksdb.index.key.is.user.key": {}, 35 : "rocksdb.index.value.is.delta.encoded": {}, 36 : "rocksdb.oldest.key.time": {}, 37 : "rocksdb.creation.time": {}, 38 : "rocksdb.file.creation.time": {}, 39 : "rocksdb.format.version": {}, 40 : } 41 : 42 : // FilterType exports the base.FilterType type. 43 : type FilterType = base.FilterType 44 : 45 : // Exported TableFilter constants. 46 : const ( 47 : TableFilter = base.TableFilter 48 : ) 49 : 50 : // FilterWriter exports the base.FilterWriter type. 51 : type FilterWriter = base.FilterWriter 52 : 53 : // FilterPolicy exports the base.FilterPolicy type. 
54 : type FilterPolicy = base.FilterPolicy 55 : 56 : // Comparers is a map from comparer name to comparer. It is used for debugging 57 : // tools which may be used on multiple databases configured with different 58 : // comparers. 59 : type Comparers map[string]*base.Comparer 60 : 61 : // Mergers is a map from merger name to merger. It is used for debugging tools 62 : // which may be used on multiple databases configured with different 63 : // mergers. 64 : type Mergers map[string]*base.Merger 65 : 66 : // KeySchemas is a map from key schema name to key schema. A single database may 67 : // contain sstables with multiple key schemas. 68 : type KeySchemas map[string]*colblk.KeySchema 69 : 70 : // MakeKeySchemas constructs a KeySchemas from a slice of key schemas. 71 1 : func MakeKeySchemas(keySchemas ...*colblk.KeySchema) KeySchemas { 72 1 : m := make(KeySchemas, len(keySchemas)) 73 1 : for _, keySchema := range keySchemas { 74 1 : if _, ok := m[keySchema.Name]; ok { 75 0 : panic(fmt.Sprintf("duplicate key schemas with name %q", keySchema.Name)) 76 : } 77 1 : m[keySchema.Name] = keySchema 78 : } 79 1 : return m 80 : } 81 : 82 : // ReaderOptions holds the parameters needed for reading an sstable. 83 : type ReaderOptions struct { 84 : // LoadBlockSema, if set, is used to limit the number of blocks that can be 85 : // loaded (i.e. read from the filesystem) in parallel. Each load acquires one 86 : // unit from the semaphore for the duration of the read. 87 : LoadBlockSema *fifo.Semaphore 88 : 89 : // User properties specified in this map will not be added to sst.Properties.UserProperties. 90 : DeniedUserProperties map[string]struct{} 91 : 92 : // Comparer defines a total ordering over the space of []byte keys: a 'less 93 : // than' relationship. The same comparison algorithm must be used for reads 94 : // and writes over the lifetime of the DB. 95 : // 96 : // The default value uses the same ordering as bytes.Compare. 
97 : Comparer *Comparer 98 : 99 : // Merger defines the Merge function in use for this keyspace. 100 : Merger *Merger 101 : 102 : Comparers Comparers 103 : Mergers Mergers 104 : // KeySchemas contains the set of known key schemas to use when interpreting 105 : // columnar data blocks. Only used for sstables encoded in format 106 : // TableFormatPebblev5 or higher. 107 : KeySchemas KeySchemas 108 : 109 : // Filters is a map from filter policy name to filter policy. Filters with 110 : // policies that are not in this map will be ignored. 111 : Filters map[string]FilterPolicy 112 : 113 : // Logger is an optional logger and tracer. 114 : LoggerAndTracer base.LoggerAndTracer 115 : 116 : // FilterMetricsTracker is optionally used to track filter metrics. 117 : FilterMetricsTracker *FilterMetricsTracker 118 : 119 : // internal options can only be used from within the pebble package. 120 : internal sstableinternal.ReaderOptions 121 : } 122 : 123 : // SetInternal sets the internal reader options. Note that even though this 124 : // method is public, a caller outside the pebble package can't construct a value 125 : // to pass to it. 126 1 : func (o *ReaderOptions) SetInternal(internalOpts sstableinternal.ReaderOptions) { 127 1 : o.internal = internalOpts 128 1 : } 129 : 130 : // SetInternalCacheOpts sets the internal cache options. Note that even though 131 : // this method is public, a caller outside the pebble package can't construct a 132 : // value to pass to it. 
133 1 : func (o *ReaderOptions) SetInternalCacheOpts(cacheOpts sstableinternal.CacheOptions) { 134 1 : o.internal.CacheOpts = cacheOpts 135 1 : } 136 : 137 1 : func (o ReaderOptions) ensureDefaults() ReaderOptions { 138 1 : if o.Comparer == nil { 139 1 : o.Comparer = base.DefaultComparer 140 1 : } 141 1 : if o.Merger == nil { 142 1 : o.Merger = base.DefaultMerger 143 1 : } 144 1 : if o.LoggerAndTracer == nil { 145 1 : o.LoggerAndTracer = base.NoopLoggerAndTracer{} 146 1 : } 147 1 : if o.DeniedUserProperties == nil { 148 1 : o.DeniedUserProperties = ignoredInternalProperties 149 1 : } 150 1 : if o.KeySchemas == nil { 151 1 : o.KeySchemas = defaultKeySchemas 152 1 : } 153 1 : return o 154 : } 155 : 156 : var defaultKeySchema = colblk.DefaultKeySchema(base.DefaultComparer, 16) 157 : var defaultKeySchemas = MakeKeySchemas(&defaultKeySchema) 158 : 159 : // WriterOptions holds the parameters used to control building an sstable. 160 : type WriterOptions struct { 161 : // BlockRestartInterval is the number of keys between restart points 162 : // for delta encoding of keys. 163 : // 164 : // The default value is 16. 165 : BlockRestartInterval int 166 : 167 : // BlockSize is the target uncompressed size in bytes of each table block. 168 : // 169 : // The default value is 4096. 170 : BlockSize int 171 : 172 : // BlockSizeThreshold finishes a block if the block size is larger than the 173 : // specified percentage of the target block size and adding the next entry 174 : // would cause the block to be larger than the target block size. 175 : // 176 : // The default value is 90. 177 : BlockSizeThreshold int 178 : 179 : // SizeClassAwareThreshold imposes a minimum block size restriction for blocks 180 : // to be flushed, that is computed as the percentage of the target block size. 181 : // Note that this threshold takes precedence over BlockSizeThreshold when 182 : // valid AllocatorSizeClasses are specified. 183 : // 184 : // The default value is 60. 
185 : SizeClassAwareThreshold int 186 : 187 : // Comparer defines a total ordering over the space of []byte keys: a 'less 188 : // than' relationship. The same comparison algorithm must be used for reads 189 : // and writes over the lifetime of the DB. 190 : // 191 : // The default value uses the same ordering as bytes.Compare. 192 : Comparer *Comparer 193 : 194 : // Compression defines the per-block compression to use. 195 : // 196 : // The default value (DefaultCompression) uses snappy compression. 197 : Compression block.Compression 198 : 199 : // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can 200 : // reduce disk reads for Get calls. 201 : // 202 : // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom 203 : // package. 204 : // 205 : // The default value means to use no filter. 206 : FilterPolicy FilterPolicy 207 : 208 : // FilterType defines whether an existing filter policy is applied at a 209 : // block-level or table-level. Block-level filters use less memory to create, 210 : // but are slower to access as a check for the key in the index must first be 211 : // performed to locate the filter block. A table-level filter will require 212 : // memory proportional to the number of keys in an sstable to create, but 213 : // avoids the index lookup when determining if a key is present. Table-level 214 : // filters should be preferred except under constrained memory situations. 215 : FilterType FilterType 216 : 217 : // IndexBlockSize is the target uncompressed size in bytes of each index 218 : // block. When the index block size is larger than this target, two-level 219 : // indexes are automatically enabled. Setting this option to a large value 220 : // (such as math.MaxInt32) disables the automatic creation of two-level 221 : // indexes. 222 : // 223 : // The default value is the value of BlockSize. 
224 : IndexBlockSize int 225 : 226 : // KeySchema describes the schema to use for sstable formats that make use 227 : // of columnar blocks, decomposing keys into their constituent components. 228 : // Ignored if TableFormat <= TableFormatPebblev4. 229 : KeySchema *colblk.KeySchema 230 : 231 : // Merger defines the associative merge operation to use for merging values 232 : // written with {Batch,DB}.Merge. The MergerName is checked for consistency 233 : // with the value stored in the sstable when it was written. 234 : MergerName string 235 : 236 : // TableFormat specifies the format version for writing sstables. The default 237 : // is TableFormatMinSupported. 238 : TableFormat TableFormat 239 : 240 : // IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment 241 : // in format.go. Must be false if format < TableFormatPebblev4. 242 : // 243 : // TODO(bilal): set this when writing shared ssts. 244 : IsStrictObsolete bool 245 : 246 : // WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is 247 : // used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the 248 : // youngest for a userkey. 249 : WritingToLowestLevel bool 250 : 251 : // BlockPropertyCollectors is a list of BlockPropertyCollector creation 252 : // functions. A new BlockPropertyCollector is created for each sstable 253 : // built and lives for the lifetime of writing that table. 254 : BlockPropertyCollectors []func() BlockPropertyCollector 255 : 256 : // Checksum specifies which checksum to use. 257 : Checksum block.ChecksumType 258 : 259 : // Parallelism is used to indicate that the sstable Writer is allowed to 260 : // compress data blocks and write datablocks to disk in parallel with the 261 : // Writer client goroutine. 262 : Parallelism bool 263 : 264 : // ShortAttributeExtractor mirrors 265 : // Options.Experimental.ShortAttributeExtractor. 
266 : ShortAttributeExtractor base.ShortAttributeExtractor 267 : 268 : // RequiredInPlaceValueBound mirrors 269 : // Options.Experimental.RequiredInPlaceValueBound. 270 : RequiredInPlaceValueBound UserKeyPrefixBound 271 : 272 : // DisableValueBlocks is only used for TableFormat >= TableFormatPebblev3, 273 : // and if set to true, does not write any values to value blocks. This is 274 : // only intended for cases where the in-memory buffering of all value blocks 275 : // while writing a sstable is too expensive and likely to cause an OOM. It 276 : // is never set to true by a Pebble DB, and can be set to true when some 277 : // external code is directly generating huge sstables using Pebble's 278 : // sstable.Writer (for example, CockroachDB backups can sometimes write 279 : // 750MB sstables -- see 280 : // https://github.com/cockroachdb/cockroach/issues/117113). 281 : DisableValueBlocks bool 282 : 283 : // AllocatorSizeClasses provides a sorted list containing the supported size 284 : // classes of the underlying memory allocator. This provides hints to the 285 : // writer's flushing policy to select block sizes that preemptively reduce 286 : // internal fragmentation when loaded into the block cache. 287 : AllocatorSizeClasses []int 288 : 289 : // internal options can only be used from within the pebble package. 290 : internal sstableinternal.WriterOptions 291 : 292 : // NumDeletionsThreshold mirrors Options.Experimental.NumDeletionsThreshold. 293 : NumDeletionsThreshold int 294 : 295 : // DeletionSizeRatioThreshold mirrors 296 : // Options.Experimental.DeletionSizeRatioThreshold. 297 : DeletionSizeRatioThreshold float32 298 : 299 : // disableObsoleteCollector is used to disable the obsolete key block property 300 : // collector automatically added by sstable block writers. 301 : disableObsoleteCollector bool 302 : } 303 : 304 : // UserKeyPrefixBound represents a [Lower,Upper) bound of user key prefixes. 305 : // If both are nil, there is no bound specified. 
// Else, Compare(Lower,Upper) must be < 0.
type UserKeyPrefixBound struct {
	// Lower is a lower bound user key prefix.
	Lower []byte
	// Upper is an upper bound user key prefix.
	Upper []byte
}

// IsEmpty reports whether the bound is empty, i.e. both Lower and Upper have
// zero length (nil or empty slices alike).
func (ukb *UserKeyPrefixBound) IsEmpty() bool {
	if len(ukb.Lower) != 0 {
		return false
	}
	return len(ukb.Upper) == 0
}

// JemallocSizeClasses are a subset of available size classes in jemalloc[1],
// suitable for the AllocatorSizeClasses option.
//
// The size classes are used when writing sstables for determining target block
// sizes for flushes, with the goal of reducing internal memory fragmentation
// when the blocks are later loaded into the block cache. The list is sorted in
// ascending order and runs from 16KiB to 320KiB.
//
// [1] https://jemalloc.net/jemalloc.3.html#size_classes
var JemallocSizeClasses = []int{
	16 << 10,
	20 << 10, 24 << 10, 28 << 10, 32 << 10, // 4KiB spacing.
	40 << 10, 48 << 10, 56 << 10, 64 << 10, // 8KiB spacing.
	80 << 10, 96 << 10, 112 << 10, 128 << 10, // 16KiB spacing.
	160 << 10, 192 << 10, 224 << 10, 256 << 10, // 32KiB spacing.
	320 << 10,
}

// SetInternal sets the internal writer options. Note that even though this
// method is public, a caller outside the pebble package can't construct a value
// to pass to it.
340 1 : func (o *WriterOptions) SetInternal(internalOpts sstableinternal.WriterOptions) { 341 1 : o.internal = internalOpts 342 1 : } 343 : 344 1 : func (o WriterOptions) ensureDefaults() WriterOptions { 345 1 : if o.BlockRestartInterval <= 0 { 346 1 : o.BlockRestartInterval = base.DefaultBlockRestartInterval 347 1 : } 348 1 : if o.BlockSize <= 0 { 349 1 : o.BlockSize = base.DefaultBlockSize 350 1 : } 351 1 : if o.BlockSizeThreshold <= 0 { 352 1 : o.BlockSizeThreshold = base.DefaultBlockSizeThreshold 353 1 : } 354 1 : if o.SizeClassAwareThreshold <= 0 { 355 1 : o.SizeClassAwareThreshold = base.SizeClassAwareBlockSizeThreshold 356 1 : } 357 1 : if o.Comparer == nil { 358 1 : o.Comparer = base.DefaultComparer 359 1 : } 360 1 : if o.Compression <= block.DefaultCompression || o.Compression >= block.NCompression { 361 1 : o.Compression = block.SnappyCompression 362 1 : } 363 1 : if o.IndexBlockSize <= 0 { 364 1 : o.IndexBlockSize = o.BlockSize 365 1 : } 366 1 : if o.MergerName == "" { 367 1 : o.MergerName = base.DefaultMerger.Name 368 1 : } 369 1 : if o.Checksum == block.ChecksumTypeNone { 370 1 : o.Checksum = block.ChecksumTypeCRC32c 371 1 : } 372 : // By default, if the table format is not specified, fall back to using the 373 : // most compatible format that is supported by Pebble. 374 1 : if o.TableFormat == TableFormatUnspecified { 375 1 : o.TableFormat = TableFormatMinSupported 376 1 : } 377 1 : if o.NumDeletionsThreshold == 0 { 378 1 : o.NumDeletionsThreshold = DefaultNumDeletionsThreshold 379 1 : } 380 1 : if o.DeletionSizeRatioThreshold == 0 { 381 1 : o.DeletionSizeRatioThreshold = DefaultDeletionSizeRatioThreshold 382 1 : } 383 1 : if o.KeySchema == nil && o.TableFormat.BlockColumnar() { 384 1 : s := colblk.DefaultKeySchema(o.Comparer, 16 /* bundle size */) 385 1 : o.KeySchema = &s 386 1 : } 387 1 : return o 388 : }