Line data Source code
1 : // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "github.com/cockroachdb/errors"
9 : "github.com/cockroachdb/pebble/internal/base"
10 : "github.com/cockroachdb/pebble/sstable/block"
11 : "github.com/cockroachdb/pebble/sstable/colblk"
12 : "github.com/cockroachdb/pebble/sstable/rowblk"
13 : )
14 :
15 : // TableFormat specifies the format version for sstables. The legacy LevelDB
16 : // format is format version 1.
17 : type TableFormat uint32
18 :
19 : // The available table formats, representing the tuple (magic number, version
20 : // number). Note that these values are not (and should not) be serialized to
21 : // disk. The ordering should follow the order the versions were introduced to
22 : // Pebble (i.e. the history is linear).
23 : const (
24 : TableFormatUnspecified TableFormat = iota
25 : TableFormatLevelDB
26 : TableFormatRocksDBv2
27 : TableFormatPebblev1 // Block properties.
28 : TableFormatPebblev2 // Range keys.
29 : TableFormatPebblev3 // Value blocks.
30 : TableFormatPebblev4 // DELSIZED tombstones.
31 : TableFormatPebblev5 // Columnar blocks.
32 : NumTableFormats
33 :
34 : TableFormatMax = NumTableFormats - 1
35 :
36 : // TableFormatMinSupported is the minimum format supported by Pebble. This
37 : // package still supports older formats for uses outside of Pebble
38 : // (CockroachDB uses it to read data from backups that could be old).
39 : TableFormatMinSupported = TableFormatPebblev1
40 : )
41 :
42 : // TableFormatPebblev4, in addition to DELSIZED, introduces the use of
43 : // InternalKeyKindSSTableInternalObsoleteBit.
44 : //
45 : // 1. Motivation
46 : //
47 : // We have various related problems caused by Pebble snapshots:
48 : //
49 : // - P1: RANGEDELs that delete points in the same sstable, but the points
50 : // happen to not get deleted during compactions because of an open snapshot.
51 : // This causes very expensive iteration, that has been observed in
52 : // production deployments
53 : //
54 : // - P2: When iterating over a foreign sstable (in disaggregated storage), we
55 : // need to do (a) point collapsing to expose at most one point per user key,
56 : // (b) apply RANGEDELs in the sstable to hide deleted points in the same
57 : // sstable. This per-sstable point collapsing iteration needs to be very
58 : // efficient (ideally as efficient from a CPU perspective as iteration over
59 : // regular sstables) since foreign sstables can be very long-lived -- one of
60 : // the goals of disaggregated storage is to scale compute and disk bandwidth
61 : // resources as a function of the hot (from a write perspective) data and
62 : // not the whole data, so we don't want to have to rewrite foreign sstables
63 : // solely to improve read performance.
64 : //
65 : // The ideal solution for P2 would allow user-facing reads to utilize the
66 : // existing SST iterators (with slight modifications) and with no loss of
67 : // efficiency. And for P1 and P2 we would like to skip whole blocks of
68 : // overwritten/deleted points. Even when we can't skip whole blocks, avoiding
69 : // key comparisons at iteration time to discover what points are deleted is
70 : // very desirable, since keys can be long.
71 : //
72 : // We observe that:
73 : //
74 : // - Reads:
75 : // - All user-facing reads in CockroachDB use iterators over the DB, hence
76 : // have a higher read seqnum than all sstables (there are some rare cases
77 : // that can violate this, but those are not important from a performance
78 : // optimization perspective).
79 : //
80 : // - Certain internal-facing reads in CockroachDB use snapshots, but the
81 : // snapshots are shortlived enough that most L5 and L6 sstables will have
82 : // all seqnums lower than the snapshot seqnum.
83 : //
84 : // - Writes:
85 : // - We already do key comparisons between points when writing the sstable
86 : // to ensure that the sstable invariant (monotonically increasing internal
87 : // keys) is not violated. So we know which points share the same userkey,
88 : // and thereby which points are obsolete because there is a more recent
89 : // point in the same sstable.
90 : //
91 : // - The compactionIter knows which point id deleted by a RANGEDEL even if
92 : // the point does need to be written because of a snapshot.
93 : //
94 : // So this known information can be encoded in the sstable at write time and
95 : // utilized for optimized reading.
96 : //
97 : // 2. Solution
98 : //
99 : // We primarily scope the solution to the following point kinds: SET,
100 : // SETWITHDEL, DEL, DELSIZED, SINGLEDEL. These are the ones marked locally
101 : // obsolete, i.e., obsolete within the sstable, and we can guarantee that at
102 : // most one point will be exposed per user key. MERGE keys create more
103 : // complexity: MERGE followed by MERGE causes multiple keys to not be
104 : // obsolete. Same applies for MERGE followed by SET/SETWITHDEL/DEL*. Note
105 : // that:
106 : //
107 : // - For regular sst iteration, the obsolete marking is a performance
108 : // optimization, and multiple keys for the same userkey can be handled by
109 : // higher layers in the iterator tree (specifically pebble.Iterator).
110 : //
111 : // - For foreign sst iteration, we disallow MERGEs to be written to such
112 : // shared ssts (details below).
113 : //
114 : // The key kinds are marked with an obsolete bit
115 : // (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is
116 : // obsolete. This marking is done within blockWriter, based on information
117 : // passed to it by Writer. In turn, Writer uses a combination of key
118 : // comparisons, and information provided by compactionIter to decide whether a
119 : // key-value pair is obsolete. Additionally, a Pebble-internal
120 : // BlockPropertyCollector (obsoleteKeyBlockPropertyCollector) is used to mark
121 : // blocks where all key-value pairs are obsolete. Since the common case is
122 : // non-obsolete blocks, this block property collector uses the empty byte
123 : // slice to represent a non-obsolete block, which consumes no space in
124 : // BlockHandleWithProperties.Props.
125 : //
126 : // At read time, the obsolete bit is only visible to the blockIter, which can
127 : // be optionally configured to hide obsolete points. This hiding is only
128 : // configured for data block iterators for sstables being read by user-facing
129 : // iterators at a seqnum greater than the max seqnum in the sstable.
130 : // Additionally, when this hiding is configured, a Pebble-internal block
131 : // property filter (obsoleteKeyBlockPropertyFilter), is used to skip whole
132 : // blocks that are obsolete.
133 : //
134 : // 2.1 Correctness
135 : //
136 : // Due to the level invariant, the sequence of seqnums for a user key in a
137 : // sstable represents a contiguous subsequence of the seqnums for the userkey
138 : // across the whole LSM, and is more recent than the seqnums in a sstable in a
139 : // lower level. So exposing exactly one point from a sstable for a userkey
140 : // will also mask the points for the userkey in lower levels. If we expose no
141 : // point, because of RANGEDELs, that RANGEDEL will also mask the points in
142 : // lower levels.
143 : //
144 : // Note that we do not need to do anything special at write time for
145 : // SETWITHDEL and SINGLEDEL. This is because these key kinds are treated
146 : // specially only by compactions, which typically do not hide obsolete points
147 : // (see exception below). For regular reads, SETWITHDEL behaves the same as
148 : // SET and SINGLEDEL behaves the same as DEL.
149 : //
150 : // 2.1.1 Compaction reads of a foreign sstable
151 : //
152 : // Compaction reads of a foreign sstable behave like regular reads in that
153 : // only non-obsolete points are exposed. Consider a L5 foreign sstable with
154 : // b.SINGLEDEL that is non-obsolete followed by obsolete b.DEL. And a L6
155 : // foreign sstable with two b.SETs. The SINGLEDEL will be exposed, and not the
156 : // DEL, but this is not a correctness issue since only one of the SETs in the
157 : // L6 sstable will be exposed. However, this works only because we have
158 : // limited the number of foreign sst levels to two, and is extremely fragile.
159 : // For robust correctness, non-obsolete SINGLEDELs in foreign sstables should
160 : // be exposed as DELs.
161 : //
162 : // Additionally, to avoid false positive accounting errors in DELSIZED, we
163 : // should expose them as DEL.
164 : //
165 : // NB: as of writing this comment, we do not have end-to-end support for
166 : // SINGLEDEL for disaggregated storage since pointCollapsingIterator (used by
167 : // ScanInternal) does not support SINGLEDEL. So the disaggregated key spans
168 : // are required to never have SINGLEDELs (which is fine for CockroachDB since
169 : // only the MVCC key space uses disaggregated storage, and SINGLEDELs are only
170 : // used for the non-MVCC locks and intents).
171 : //
172 : // 2.2 Strictness and MERGE
173 : //
174 : // Setting the obsolete bit on point keys is advanced usage, so we support two
175 : // modes, both of which must be truthful when setting the obsolete bit, but
176 : // vary in when they don't set the obsolete bit.
177 : //
178 : // - Non-strict: In this mode, the bit does not need to be set for keys that
179 : // are obsolete. Additionally, any sstable containing MERGE keys can only
180 : // use this mode. An iterator over such an sstable, when configured to
181 : // hideObsoletePoints, can expose multiple internal keys per user key, and
182 : // can expose keys that are deleted by rangedels in the same sstable. This
183 : // is the mode that non-advanced users should use. Pebble without
184 : // disaggregated storage will also use this mode and will best-effort set
185 : // the obsolete bit, to optimize iteration when snapshots have retained many
186 : // obsolete keys.
187 : //
188 : // - Strict: In this mode, every obsolete key must have the obsolete bit set,
189 : // and no MERGE keys are permitted. An iterator over such an sstable, when
190 : // configured to hideObsoletePoints satisfies two properties:
191 : // - S1: will expose at most one internal key per user key, which is the
192 : // most recent one.
193 : // - S2: will never expose keys that are deleted by rangedels in the same
194 : // sstable.
195 : //
196 : // This is the mode for two use cases in disaggregated storage (which will
197 : // exclude parts of the key space that has MERGEs), for levels that contain
198 : // sstables that can become foreign sstables:
199 : // - Pebble compaction output to these levels that can become foreign
200 : // sstables.
201 : //
202 : // - CockroachDB ingest operations that can ingest into the levels that can
203 : // become foreign sstables. Note, these are not sstables corresponding to
204 : // copied data for CockroachDB range snapshots. This case occurs for
205 : // operations like index backfills: these trivially satisfy the strictness
206 : // criteria since they only write one key per userkey.
207 : //
208 : // TODO(sumeer): this latter case is not currently supported, since only
209 : // Writer.AddWithForceObsolete calls are permitted for writing strict
210 : // obsolete sstables. This is done to reduce the likelihood of bugs. One
211 : // simple way to lift this limitation would be to disallow adding any
212 : // RANGEDELs when a Pebble-external writer is trying to construct a strict
213 : // obsolete sstable.
214 :
215 : // parseTableFormat parses the given magic bytes and version into its
216 : // corresponding internal TableFormat.
217 2 : func parseTableFormat(magic []byte, version uint32) (TableFormat, error) {
218 2 : switch string(magic) {
219 1 : case levelDBMagic:
220 1 : return TableFormatLevelDB, nil
221 1 : case rocksDBMagic:
222 1 : if version != rocksDBFormatVersion2 {
223 1 : return TableFormatUnspecified, base.CorruptionErrorf(
224 1 : "(unsupported rocksdb format version %d)", errors.Safe(version))
225 1 : }
226 1 : return TableFormatRocksDBv2, nil
227 2 : case pebbleDBMagic:
228 2 : switch version {
229 1 : case 1:
230 1 : return TableFormatPebblev1, nil
231 2 : case 2:
232 2 : return TableFormatPebblev2, nil
233 2 : case 3:
234 2 : return TableFormatPebblev3, nil
235 2 : case 4:
236 2 : return TableFormatPebblev4, nil
237 2 : case 5:
238 2 : return TableFormatPebblev5, nil
239 1 : default:
240 1 : return TableFormatUnspecified, base.CorruptionErrorf(
241 1 : "(unsupported pebble format version %d)", errors.Safe(version))
242 : }
243 1 : default:
244 1 : return TableFormatUnspecified, base.CorruptionErrorf(
245 1 : "(bad magic number: 0x%x)", magic)
246 : }
247 : }
248 :
249 : // BlockColumnar returns true iff the table format uses the columnar format for
250 : // data, index and keyspan blocks.
251 2 : func (f TableFormat) BlockColumnar() bool {
252 2 : return f >= TableFormatPebblev5
253 2 : }
254 :
255 2 : func (f TableFormat) newIndexIter() block.IndexBlockIterator {
256 2 : if !f.BlockColumnar() {
257 2 : return new(rowblk.IndexIter)
258 2 : }
259 2 : return new(colblk.IndexIter)
260 : }
261 :
262 : // AsTuple returns the TableFormat's (Magic String, Version) tuple.
263 2 : func (f TableFormat) AsTuple() (string, uint32) {
264 2 : switch f {
265 1 : case TableFormatLevelDB:
266 1 : return levelDBMagic, 0
267 1 : case TableFormatRocksDBv2:
268 1 : return rocksDBMagic, 2
269 1 : case TableFormatPebblev1:
270 1 : return pebbleDBMagic, 1
271 2 : case TableFormatPebblev2:
272 2 : return pebbleDBMagic, 2
273 2 : case TableFormatPebblev3:
274 2 : return pebbleDBMagic, 3
275 2 : case TableFormatPebblev4:
276 2 : return pebbleDBMagic, 4
277 2 : case TableFormatPebblev5:
278 2 : return pebbleDBMagic, 5
279 0 : default:
280 0 : panic("sstable: unknown table format version tuple")
281 : }
282 : }
283 :
284 : // String returns the TableFormat (Magic String,Version) tuple.
285 2 : func (f TableFormat) String() string {
286 2 : switch f {
287 2 : case TableFormatUnspecified:
288 2 : return "unspecified"
289 2 : case TableFormatLevelDB:
290 2 : return "(LevelDB)"
291 2 : case TableFormatRocksDBv2:
292 2 : return "(RocksDB,v2)"
293 2 : case TableFormatPebblev1:
294 2 : return "(Pebble,v1)"
295 2 : case TableFormatPebblev2:
296 2 : return "(Pebble,v2)"
297 2 : case TableFormatPebblev3:
298 2 : return "(Pebble,v3)"
299 2 : case TableFormatPebblev4:
300 2 : return "(Pebble,v4)"
301 2 : case TableFormatPebblev5:
302 2 : return "(Pebble,v5)"
303 0 : default:
304 0 : panic("sstable: unknown table format version tuple")
305 : }
306 : }
307 :
308 2 : var tableFormatStrings = func() map[string]TableFormat {
309 2 : strs := make(map[string]TableFormat, NumTableFormats)
310 2 : for f := TableFormatUnspecified; f < NumTableFormats; f++ {
311 2 : strs[f.String()] = f
312 2 : }
313 2 : return strs
314 : }()
315 :
316 : // ParseTableFormatString parses a TableFormat from its human-readable string
317 : // representation.
318 1 : func ParseTableFormatString(s string) (TableFormat, error) {
319 1 : f, ok := tableFormatStrings[s]
320 1 : if !ok {
321 0 : return TableFormatUnspecified, errors.Errorf("unknown table format %q", s)
322 0 : }
323 1 : return f, nil
324 : }
|