Line data Source code
1 : // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 : // of this source code is governed by a BSD-style license that can be found in 3 : // the LICENSE file. 4 : 5 : package sstable 6 : 7 : import ( 8 : "github.com/cockroachdb/fifo" 9 : "github.com/cockroachdb/pebble/internal/base" 10 : "github.com/cockroachdb/pebble/internal/sstableinternal" 11 : "github.com/cockroachdb/pebble/sstable/block" 12 : "github.com/cockroachdb/pebble/sstable/colblk" 13 : "github.com/cockroachdb/pebble/sstable/rowblk" 14 : ) 15 : 16 : const ( 17 : // MaximumBlockSize is the maximum permissible size of a block. 18 : MaximumBlockSize = rowblk.MaximumSize 19 : // DefaultNumDeletionsThreshold defines the minimum number of point 20 : // tombstones that must be present in a data block for it to be 21 : // considered tombstone-dense. 22 : DefaultNumDeletionsThreshold = 100 23 : // DefaultDeletionSizeRatioThreshold defines the minimum ratio of the size 24 : // of point tombstones to the size of the data block in order to consider the 25 : // block as tombstone-dense. 26 : DefaultDeletionSizeRatioThreshold = 0.5 27 : ) 28 : 29 : var ignoredInternalProperties = map[string]struct{}{ 30 : "rocksdb.column.family.id": {}, 31 : "rocksdb.fixed.key.length": {}, 32 : "rocksdb.index.key.is.user.key": {}, 33 : "rocksdb.index.value.is.delta.encoded": {}, 34 : "rocksdb.oldest.key.time": {}, 35 : "rocksdb.creation.time": {}, 36 : "rocksdb.file.creation.time": {}, 37 : "rocksdb.format.version": {}, 38 : } 39 : 40 : // FilterType exports the base.FilterType type. 41 : type FilterType = base.FilterType 42 : 43 : // Exported TableFilter constants. 44 : const ( 45 : TableFilter = base.TableFilter 46 : ) 47 : 48 : // FilterWriter exports the base.FilterWriter type. 49 : type FilterWriter = base.FilterWriter 50 : 51 : // FilterPolicy exports the base.FilterPolicy type. 52 : type FilterPolicy = base.FilterPolicy 53 : 54 : // Comparers is a map from comparer name to comparer. It is used for debugging 55 : // tools which may be used on multiple databases configured with different 56 : // comparers. 57 : type Comparers map[string]*base.Comparer 58 : 59 : // Mergers is a map from merger name to merger. It is used for debugging tools 60 : // which may be used on multiple databases configured with different 61 : // mergers. 62 : type Mergers map[string]*base.Merger 63 : 64 : // ReaderOptions holds the parameters needed for reading an sstable. 65 : type ReaderOptions struct { 66 : // LoadBlockSema, if set, is used to limit the number of blocks that can be 67 : // loaded (i.e. read from the filesystem) in parallel. Each load acquires one 68 : // unit from the semaphore for the duration of the read. 69 : LoadBlockSema *fifo.Semaphore 70 : 71 : // User properties specified in this map will not be added to sst.Properties.UserProperties. 72 : DeniedUserProperties map[string]struct{} 73 : 74 : // Comparer defines a total ordering over the space of []byte keys: a 'less 75 : // than' relationship. The same comparison algorithm must be used for reads 76 : // and writes over the lifetime of the DB. 77 : // 78 : // The default value uses the same ordering as bytes.Compare. 79 : Comparer *Comparer 80 : 81 : // Merger defines the Merge function in use for this keyspace. 82 : Merger *Merger 83 : 84 : Comparers Comparers 85 : Mergers Mergers 86 : 87 : // Filters is a map from filter policy name to filter policy. Filters with 88 : // policies that are not in this map will be ignored. 89 : Filters map[string]FilterPolicy 90 : 91 : // Logger is an optional logger and tracer. 92 : LoggerAndTracer base.LoggerAndTracer 93 : 94 : // FilterMetricsTracker is optionally used to track filter metrics. 95 : FilterMetricsTracker *FilterMetricsTracker 96 : 97 : // internal options can only be used from within the pebble package. 98 : internal sstableinternal.ReaderOptions 99 : } 100 : 101 : // SetInternal sets the internal reader options. Note that even though this 102 : // method is public, a caller outside the pebble package can't construct a value 103 : // to pass to it. 104 1 : func (o *ReaderOptions) SetInternal(internalOpts sstableinternal.ReaderOptions) { 105 1 : o.internal = internalOpts 106 1 : } 107 : 108 : // SetInternalCacheOpts sets the internal cache options. Note that even though 109 : // this method is public, a caller outside the pebble package can't construct a 110 : // value to pass to it. 111 2 : func (o *ReaderOptions) SetInternalCacheOpts(cacheOpts sstableinternal.CacheOptions) { 112 2 : o.internal.CacheOpts = cacheOpts 113 2 : } 114 : 115 2 : func (o ReaderOptions) ensureDefaults() ReaderOptions { 116 2 : if o.Comparer == nil { 117 1 : o.Comparer = base.DefaultComparer 118 1 : } 119 2 : if o.Merger == nil { 120 2 : o.Merger = base.DefaultMerger 121 2 : } 122 2 : if o.LoggerAndTracer == nil { 123 2 : o.LoggerAndTracer = base.NoopLoggerAndTracer{} 124 2 : } 125 2 : if o.DeniedUserProperties == nil { 126 2 : o.DeniedUserProperties = ignoredInternalProperties 127 2 : } 128 2 : return o 129 : } 130 : 131 : // WriterOptions holds the parameters used to control building an sstable. 132 : type WriterOptions struct { 133 : // BlockRestartInterval is the number of keys between restart points 134 : // for delta encoding of keys. 135 : // 136 : // The default value is 16. 137 : BlockRestartInterval int 138 : 139 : // BlockSize is the target uncompressed size in bytes of each table block. 140 : // 141 : // The default value is 4096. 142 : BlockSize int 143 : 144 : // BlockSizeThreshold finishes a block if the block size is larger than the 145 : // specified percentage of the target block size and adding the next entry 146 : // would cause the block to be larger than the target block size. 147 : // 148 : // The default value is 90. 149 : BlockSizeThreshold int 150 : 151 : // SizeClassAwareThreshold imposes a minimum block size restriction for blocks 152 : // to be flushed, that is computed as the percentage of the target block size. 153 : // Note that this threshold takes precedence over BlockSizeThreshold when 154 : // valid AllocatorSizeClasses are specified. 155 : // 156 : // The default value is 60. 157 : SizeClassAwareThreshold int 158 : 159 : // Comparer defines a total ordering over the space of []byte keys: a 'less 160 : // than' relationship. The same comparison algorithm must be used for reads 161 : // and writes over the lifetime of the DB. 162 : // 163 : // The default value uses the same ordering as bytes.Compare. 164 : Comparer *Comparer 165 : 166 : // Compression defines the per-block compression to use. 167 : // 168 : // The default value (DefaultCompression) uses snappy compression. 169 : Compression block.Compression 170 : 171 : // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can 172 : // reduce disk reads for Get calls. 173 : // 174 : // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom 175 : // package. 176 : // 177 : // The default value means to use no filter. 178 : FilterPolicy FilterPolicy 179 : 180 : // FilterType defines whether an existing filter policy is applied at a 181 : // block-level or table-level. Block-level filters use less memory to create, 182 : // but are slower to access as a check for the key in the index must first be 183 : // performed to locate the filter block. A table-level filter will require 184 : // memory proportional to the number of keys in an sstable to create, but 185 : // avoids the index lookup when determining if a key is present. Table-level 186 : // filters should be preferred except under constrained memory situations. 187 : FilterType FilterType 188 : 189 : // IndexBlockSize is the target uncompressed size in bytes of each index 190 : // block. When the index block size is larger than this target, two-level 191 : // indexes are automatically enabled. Setting this option to a large value 192 : // (such as math.MaxInt32) disables the automatic creation of two-level 193 : // indexes. 194 : // 195 : // The default value is the value of BlockSize. 196 : IndexBlockSize int 197 : 198 : // KeySchema describes the schema to use for sstable formats that make use 199 : // of columnar blocks, decomposing keys into their constituent components. 200 : // Ignored if TableFormat <= TableFormatPebblev4. 201 : KeySchema colblk.KeySchema 202 : 203 : // Merger defines the associative merge operation to use for merging values 204 : // written with {Batch,DB}.Merge. The MergerName is checked for consistency 205 : // with the value stored in the sstable when it was written. 206 : MergerName string 207 : 208 : // TableFormat specifies the format version for writing sstables. The default 209 : // is TableFormatMinSupported. 210 : TableFormat TableFormat 211 : 212 : // IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment 213 : // in format.go. Must be false if format < TableFormatPebblev4. 214 : // 215 : // TODO(bilal): set this when writing shared ssts. 216 : IsStrictObsolete bool 217 : 218 : // WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is 219 : // used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the 220 : // youngest for a userkey. 221 : WritingToLowestLevel bool 222 : 223 : // BlockPropertyCollectors is a list of BlockPropertyCollector creation 224 : // functions. A new BlockPropertyCollector is created for each sstable 225 : // built and lives for the lifetime of writing that table. 226 : BlockPropertyCollectors []func() BlockPropertyCollector 227 : 228 : // Checksum specifies which checksum to use. 229 : Checksum block.ChecksumType 230 : 231 : // Parallelism is used to indicate that the sstable Writer is allowed to 232 : // compress data blocks and write datablocks to disk in parallel with the 233 : // Writer client goroutine. 234 : Parallelism bool 235 : 236 : // ShortAttributeExtractor mirrors 237 : // Options.Experimental.ShortAttributeExtractor. 238 : ShortAttributeExtractor base.ShortAttributeExtractor 239 : 240 : // RequiredInPlaceValueBound mirrors 241 : // Options.Experimental.RequiredInPlaceValueBound. 242 : RequiredInPlaceValueBound UserKeyPrefixBound 243 : 244 : // DisableValueBlocks is only used for TableFormat >= TableFormatPebblev3, 245 : // and if set to true, does not write any values to value blocks. This is 246 : // only intended for cases where the in-memory buffering of all value blocks 247 : // while writing a sstable is too expensive and likely to cause an OOM. It 248 : // is never set to true by a Pebble DB, and can be set to true when some 249 : // external code is directly generating huge sstables using Pebble's 250 : // sstable.Writer (for example, CockroachDB backups can sometimes write 251 : // 750MB sstables -- see 252 : // https://github.com/cockroachdb/cockroach/issues/117113). 253 : DisableValueBlocks bool 254 : 255 : // AllocatorSizeClasses provides a sorted list containing the supported size 256 : // classes of the underlying memory allocator. This provides hints to the 257 : // writer's flushing policy to select block sizes that preemptively reduce 258 : // internal fragmentation when loaded into the block cache. 259 : AllocatorSizeClasses []int 260 : 261 : // internal options can only be used from within the pebble package. 262 : internal sstableinternal.WriterOptions 263 : 264 : // NumDeletionsThreshold mirrors Options.Experimental.NumDeletionsThreshold. 265 : NumDeletionsThreshold int 266 : 267 : // DeletionSizeRatioThreshold mirrors 268 : // Options.Experimental.DeletionSizeRatioThreshold. 269 : DeletionSizeRatioThreshold float32 270 : } 271 : 272 : // SetInternal sets the internal writer options. Note that even though this 273 : // method is public, a caller outside the pebble package can't construct a value 274 : // to pass to it. 275 2 : func (o *WriterOptions) SetInternal(internalOpts sstableinternal.WriterOptions) { 276 2 : o.internal = internalOpts 277 2 : } 278 : 279 2 : func (o WriterOptions) ensureDefaults() WriterOptions { 280 2 : if o.BlockRestartInterval <= 0 { 281 1 : o.BlockRestartInterval = base.DefaultBlockRestartInterval 282 1 : } 283 2 : if o.BlockSize <= 0 { 284 1 : o.BlockSize = base.DefaultBlockSize 285 1 : } 286 2 : if o.BlockSizeThreshold <= 0 { 287 1 : o.BlockSizeThreshold = base.DefaultBlockSizeThreshold 288 1 : } 289 2 : if o.SizeClassAwareThreshold <= 0 { 290 2 : o.SizeClassAwareThreshold = base.SizeClassAwareBlockSizeThreshold 291 2 : } 292 2 : if o.Comparer == nil { 293 1 : o.Comparer = base.DefaultComparer 294 1 : } 295 2 : if o.Compression <= block.DefaultCompression || o.Compression >= block.NCompression { 296 1 : o.Compression = block.SnappyCompression 297 1 : } 298 2 : if o.IndexBlockSize <= 0 { 299 1 : o.IndexBlockSize = o.BlockSize 300 1 : } 301 2 : if o.MergerName == "" { 302 1 : o.MergerName = base.DefaultMerger.Name 303 1 : } 304 2 : if o.Checksum == block.ChecksumTypeNone { 305 2 : o.Checksum = block.ChecksumTypeCRC32c 306 2 : } 307 : // By default, if the table format is not specified, fall back to using the 308 : // most compatible format that is supported by Pebble. 309 2 : if o.TableFormat == TableFormatUnspecified { 310 1 : o.TableFormat = TableFormatMinSupported 311 1 : } 312 2 : if o.NumDeletionsThreshold == 0 { 313 1 : o.NumDeletionsThreshold = DefaultNumDeletionsThreshold 314 1 : } 315 2 : if o.DeletionSizeRatioThreshold == 0 { 316 1 : o.DeletionSizeRatioThreshold = DefaultDeletionSizeRatioThreshold 317 1 : } 318 2 : return o 319 : }