Line data Source code
1 : // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 : // of this source code is governed by a BSD-style license that can be found in 3 : // the LICENSE file. 4 : 5 : package sstable 6 : 7 : import ( 8 : "github.com/cockroachdb/fifo" 9 : "github.com/cockroachdb/pebble/internal/base" 10 : "github.com/cockroachdb/pebble/internal/sstableinternal" 11 : "github.com/cockroachdb/pebble/sstable/block" 12 : "github.com/cockroachdb/pebble/sstable/rowblk" 13 : ) 14 : 15 : // MaximumBlockSize is the maximum permissible size of a block. 16 : const MaximumBlockSize = rowblk.MaximumSize 17 : 18 : // Compression is the per-block compression algorithm to use. 19 : type Compression int 20 : 21 : // The available compression types. 22 : const ( 23 : DefaultCompression Compression = iota 24 : NoCompression 25 : SnappyCompression 26 : ZstdCompression 27 : NCompression 28 : ) 29 : 30 : var ignoredInternalProperties = map[string]struct{}{ 31 : "rocksdb.column.family.id": {}, 32 : "rocksdb.fixed.key.length": {}, 33 : "rocksdb.index.key.is.user.key": {}, 34 : "rocksdb.index.value.is.delta.encoded": {}, 35 : "rocksdb.oldest.key.time": {}, 36 : "rocksdb.creation.time": {}, 37 : "rocksdb.file.creation.time": {}, 38 : "rocksdb.format.version": {}, 39 : } 40 : 41 1 : func (c Compression) String() string { 42 1 : switch c { 43 0 : case DefaultCompression: 44 0 : return "Default" 45 1 : case NoCompression: 46 1 : return "NoCompression" 47 1 : case SnappyCompression: 48 1 : return "Snappy" 49 1 : case ZstdCompression: 50 1 : return "ZSTD" 51 0 : default: 52 0 : return "Unknown" 53 : } 54 : } 55 : 56 : // CompressionFromString returns an sstable.Compression from its 57 : // string representation. Inverse of c.String() above. 58 1 : func CompressionFromString(s string) Compression { 59 1 : switch s { 60 0 : case "Default": 61 0 : return DefaultCompression 62 1 : case "NoCompression": 63 1 : return NoCompression 64 1 : case "Snappy": 65 1 : return SnappyCompression 66 1 : case "ZSTD": 67 1 : return ZstdCompression 68 1 : default: 69 1 : return DefaultCompression 70 : } 71 : } 72 : 73 : // FilterType exports the base.FilterType type. 74 : type FilterType = base.FilterType 75 : 76 : // Exported TableFilter constants. 77 : const ( 78 : TableFilter = base.TableFilter 79 : ) 80 : 81 : // FilterWriter exports the base.FilterWriter type. 82 : type FilterWriter = base.FilterWriter 83 : 84 : // FilterPolicy exports the base.FilterPolicy type. 85 : type FilterPolicy = base.FilterPolicy 86 : 87 : // Comparers is a map from comparer name to comparer. It is used for debugging 88 : // tools which may be used on multiple databases configured with different 89 : // comparers. 90 : type Comparers map[string]*base.Comparer 91 : 92 : // Mergers is a map from merger name to merger. It is used for debugging tools 93 : // which may be used on multiple databases configured with different 94 : // mergers. 95 : type Mergers map[string]*base.Merger 96 : 97 : // ReaderOptions holds the parameters needed for reading an sstable. 98 : type ReaderOptions struct { 99 : // LoadBlockSema, if set, is used to limit the number of blocks that can be 100 : // loaded (i.e. read from the filesystem) in parallel. Each load acquires one 101 : // unit from the semaphore for the duration of the read. 102 : LoadBlockSema *fifo.Semaphore 103 : 104 : // User properties specified in this map will not be added to sst.Properties.UserProperties. 105 : DeniedUserProperties map[string]struct{} 106 : 107 : // Comparer defines a total ordering over the space of []byte keys: a 'less 108 : // than' relationship. The same comparison algorithm must be used for reads 109 : // and writes over the lifetime of the DB. 110 : // 111 : // The default value uses the same ordering as bytes.Compare. 112 : Comparer *Comparer 113 : 114 : // Merger defines the Merge function in use for this keyspace. 115 : Merger *Merger 116 : 117 : Comparers Comparers 118 : Mergers Mergers 119 : 120 : // Filters is a map from filter policy name to filter policy. Filters with 121 : // policies that are not in this map will be ignored. 122 : Filters map[string]FilterPolicy 123 : 124 : // Logger is an optional logger and tracer. 125 : LoggerAndTracer base.LoggerAndTracer 126 : 127 : // FilterMetricsTracker is optionally used to track filter metrics. 128 : FilterMetricsTracker *FilterMetricsTracker 129 : 130 : // internal options can only be used from within the pebble package. 131 : internal sstableinternal.ReaderOptions 132 : } 133 : 134 : // SetInternal sets the internal reader options. Note that even though this 135 : // method is public, a caller outside the pebble package can't construct a value 136 : // to pass to it. 137 0 : func (o *ReaderOptions) SetInternal(internalOpts sstableinternal.ReaderOptions) { 138 0 : o.internal = internalOpts 139 0 : } 140 : 141 : // SetInternalCacheOpts sets the internal cache options. Note that even though 142 : // this method is public, a caller outside the pebble package can't construct a 143 : // value to pass to it. 144 1 : func (o *ReaderOptions) SetInternalCacheOpts(cacheOpts sstableinternal.CacheOptions) { 145 1 : o.internal.CacheOpts = cacheOpts 146 1 : } 147 : 148 1 : func (o ReaderOptions) ensureDefaults() ReaderOptions { 149 1 : if o.Comparer == nil { 150 0 : o.Comparer = base.DefaultComparer 151 0 : } 152 1 : if o.Merger == nil { 153 1 : o.Merger = base.DefaultMerger 154 1 : } 155 1 : if o.LoggerAndTracer == nil { 156 1 : o.LoggerAndTracer = base.NoopLoggerAndTracer{} 157 1 : } 158 1 : if o.DeniedUserProperties == nil { 159 1 : o.DeniedUserProperties = ignoredInternalProperties 160 1 : } 161 1 : return o 162 : } 163 : 164 : // WriterOptions holds the parameters used to control building an sstable. 165 : type WriterOptions struct { 166 : // BlockRestartInterval is the number of keys between restart points 167 : // for delta encoding of keys. 168 : // 169 : // The default value is 16. 170 : BlockRestartInterval int 171 : 172 : // BlockSize is the target uncompressed size in bytes of each table block. 173 : // 174 : // The default value is 4096. 175 : BlockSize int 176 : 177 : // BlockSizeThreshold finishes a block if the block size is larger than the 178 : // specified percentage of the target block size and adding the next entry 179 : // would cause the block to be larger than the target block size. 180 : // 181 : // The default value is 90. 182 : BlockSizeThreshold int 183 : 184 : // SizeClassAwareThreshold imposes a minimum block size restriction for blocks 185 : // to be flushed, that is computed as the percentage of the target block size. 186 : // Note that this threshold takes precedence over BlockSizeThreshold when 187 : // valid AllocatorSizeClasses are specified. 188 : // 189 : // The default value is 60. 190 : SizeClassAwareThreshold int 191 : 192 : // Comparer defines a total ordering over the space of []byte keys: a 'less 193 : // than' relationship. The same comparison algorithm must be used for reads 194 : // and writes over the lifetime of the DB. 195 : // 196 : // The default value uses the same ordering as bytes.Compare. 197 : Comparer *Comparer 198 : 199 : // Compression defines the per-block compression to use. 200 : // 201 : // The default value (DefaultCompression) uses snappy compression. 202 : Compression Compression 203 : 204 : // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can 205 : // reduce disk reads for Get calls. 206 : // 207 : // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom 208 : // package. 209 : // 210 : // The default value means to use no filter. 211 : FilterPolicy FilterPolicy 212 : 213 : // FilterType defines whether an existing filter policy is applied at a 214 : // block-level or table-level. Block-level filters use less memory to create, 215 : // but are slower to access as a check for the key in the index must first be 216 : // performed to locate the filter block. A table-level filter will require 217 : // memory proportional to the number of keys in an sstable to create, but 218 : // avoids the index lookup when determining if a key is present. Table-level 219 : // filters should be preferred except under constrained memory situations. 220 : FilterType FilterType 221 : 222 : // IndexBlockSize is the target uncompressed size in bytes of each index 223 : // block. When the index block size is larger than this target, two-level 224 : // indexes are automatically enabled. Setting this option to a large value 225 : // (such as math.MaxInt32) disables the automatic creation of two-level 226 : // indexes. 227 : // 228 : // The default value is the value of BlockSize. 229 : IndexBlockSize int 230 : 231 : // Merger defines the associative merge operation to use for merging values 232 : // written with {Batch,DB}.Merge. The MergerName is checked for consistency 233 : // with the value stored in the sstable when it was written. 234 : MergerName string 235 : 236 : // TableFormat specifies the format version for writing sstables. The default 237 : // is TableFormatMinSupported. 238 : TableFormat TableFormat 239 : 240 : // IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment 241 : // in format.go. Must be false if format < TableFormatPebblev4. 242 : // 243 : // TODO(bilal): set this when writing shared ssts. 244 : IsStrictObsolete bool 245 : 246 : // WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is 247 : // used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the 248 : // youngest for a userkey. 249 : WritingToLowestLevel bool 250 : 251 : // BlockPropertyCollectors is a list of BlockPropertyCollector creation 252 : // functions. A new BlockPropertyCollector is created for each sstable 253 : // built and lives for the lifetime of writing that table. 254 : BlockPropertyCollectors []func() BlockPropertyCollector 255 : 256 : // Checksum specifies which checksum to use. 257 : Checksum block.ChecksumType 258 : 259 : // Parallelism is used to indicate that the sstable Writer is allowed to 260 : // compress data blocks and write datablocks to disk in parallel with the 261 : // Writer client goroutine. 262 : Parallelism bool 263 : 264 : // ShortAttributeExtractor mirrors 265 : // Options.Experimental.ShortAttributeExtractor. 266 : ShortAttributeExtractor base.ShortAttributeExtractor 267 : 268 : // RequiredInPlaceValueBound mirrors 269 : // Options.Experimental.RequiredInPlaceValueBound. 270 : RequiredInPlaceValueBound UserKeyPrefixBound 271 : 272 : // DisableValueBlocks is only used for TableFormat >= TableFormatPebblev3, 273 : // and if set to true, does not write any values to value blocks. This is 274 : // only intended for cases where the in-memory buffering of all value blocks 275 : // while writing a sstable is too expensive and likely to cause an OOM. It 276 : // is never set to true by a Pebble DB, and can be set to true when some 277 : // external code is directly generating huge sstables using Pebble's 278 : // sstable.Writer (for example, CockroachDB backups can sometimes write 279 : // 750MB sstables -- see 280 : // https://github.com/cockroachdb/cockroach/issues/117113). 281 : DisableValueBlocks bool 282 : 283 : // AllocatorSizeClasses provides a sorted list containing the supported size 284 : // classes of the underlying memory allocator. This provides hints to the 285 : // writer's flushing policy to select block sizes that preemptively reduce 286 : // internal fragmentation when loaded into the block cache. 287 : AllocatorSizeClasses []int 288 : 289 : // internal options can only be used from within the pebble package. 290 : internal sstableinternal.WriterOptions 291 : } 292 : 293 : // SetInternal sets the internal writer options. Note that even though this 294 : // method is public, a caller outside the pebble package can't construct a value 295 : // to pass to it. 296 1 : func (o *WriterOptions) SetInternal(internalOpts sstableinternal.WriterOptions) { 297 1 : o.internal = internalOpts 298 1 : } 299 : 300 1 : func (o WriterOptions) ensureDefaults() WriterOptions { 301 1 : if o.BlockRestartInterval <= 0 { 302 0 : o.BlockRestartInterval = base.DefaultBlockRestartInterval 303 0 : } 304 1 : if o.BlockSize <= 0 { 305 0 : o.BlockSize = base.DefaultBlockSize 306 0 : } 307 1 : if o.BlockSizeThreshold <= 0 { 308 0 : o.BlockSizeThreshold = base.DefaultBlockSizeThreshold 309 0 : } 310 1 : if o.SizeClassAwareThreshold <= 0 { 311 1 : o.SizeClassAwareThreshold = base.SizeClassAwareBlockSizeThreshold 312 1 : } 313 1 : if o.Comparer == nil { 314 0 : o.Comparer = base.DefaultComparer 315 0 : } 316 1 : if o.Compression <= DefaultCompression || o.Compression >= NCompression { 317 0 : o.Compression = SnappyCompression 318 0 : } 319 1 : if o.IndexBlockSize <= 0 { 320 0 : o.IndexBlockSize = o.BlockSize 321 0 : } 322 1 : if o.MergerName == "" { 323 0 : o.MergerName = base.DefaultMerger.Name 324 0 : } 325 1 : if o.Checksum == block.ChecksumTypeNone { 326 1 : o.Checksum = block.ChecksumTypeCRC32c 327 1 : } 328 : // By default, if the table format is not specified, fall back to using the 329 : // most compatible format that is supported by Pebble. 330 1 : if o.TableFormat == TableFormatUnspecified { 331 0 : o.TableFormat = TableFormatMinSupported 332 0 : } 333 1 : return o 334 : }