Line data Source code
1 : // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "github.com/cockroachdb/errors"
9 : "github.com/cockroachdb/pebble/internal/base"
10 : )
11 :
12 : // TableFormat specifies the format version for sstables. The legacy LevelDB
13 : // format is format version 1.
14 : type TableFormat uint32
15 :
16 : // The available table formats, representing the tuple (magic number, version
17 : // number). Note that these values are not (and should not) be serialized to
18 : // disk. The ordering should follow the order the versions were introduced to
19 : // Pebble (i.e. the history is linear).
20 : const (
21 : TableFormatUnspecified TableFormat = iota
22 : TableFormatLevelDB
23 : TableFormatRocksDBv2
24 : TableFormatPebblev1 // Block properties.
25 : TableFormatPebblev2 // Range keys.
26 : TableFormatPebblev3 // Value blocks.
27 : TableFormatPebblev4 // DELSIZED tombstones.
28 : NumTableFormats
29 :
30 : TableFormatMax = NumTableFormats - 1
31 :
32 : // TableFormatMinSupported is the minimum format supported by Pebble. This
33 : // package still supports older formats for uses outside of Pebble
34 : // (CockroachDB uses it to read data from backups that could be old).
35 : TableFormatMinSupported = TableFormatPebblev1
36 : )
37 :
38 : // TableFormatPebblev4, in addition to DELSIZED, introduces the use of
39 : // InternalKeyKindSSTableInternalObsoleteBit.
40 : //
41 : // 1. Motivation
42 : //
43 : // We have various related problems caused by Pebble snapshots:
44 : //
45 : // - P1: RANGEDELs that delete points in the same sstable, but the points
46 : // happen to not get deleted during compactions because of an open snapshot.
47 : // This causes very expensive iteration, that has been observed in
48 : // production deployments
49 : //
50 : // - P2: When iterating over a foreign sstable (in disaggregated storage), we
51 : // need to do (a) point collapsing to expose at most one point per user key,
52 : // (b) apply RANGEDELs in the sstable to hide deleted points in the same
53 : // sstable. This per-sstable point collapsing iteration needs to be very
54 : // efficient (ideally as efficient from a CPU perspective as iteration over
55 : // regular sstables) since foreign sstables can be very long-lived -- one of
56 : // the goals of disaggregated storage is to scale compute and disk bandwidth
57 : // resources as a function of the hot (from a write perspective) data and
58 : // not the whole data, so we don't want to have to rewrite foreign sstables
59 : // solely to improve read performance.
60 : //
61 : // The ideal solution for P2 would allow user-facing reads to utilize the
62 : // existing SST iterators (with slight modifications) and with no loss of
63 : // efficiency. And for P1 and P2 we would like to skip whole blocks of
64 : // overwritten/deleted points. Even when we can't skip whole blocks, avoiding
65 : // key comparisons at iteration time to discover what points are deleted is
66 : // very desirable, since keys can be long.
67 : //
68 : // We observe that:
69 : //
70 : // - Reads:
71 : // - All user-facing reads in CockroachDB use iterators over the DB, hence
72 : // have a higher read seqnum than all sstables (there are some rare cases
73 : // that can violate this, but those are not important from a performance
74 : // optimization perspective).
75 : //
76 : // - Certain internal-facing reads in CockroachDB use snapshots, but the
77 : // snapshots are shortlived enough that most L5 and L6 sstables will have
78 : // all seqnums lower than the snapshot seqnum.
79 : //
80 : // - Writes:
81 : // - We already do key comparisons between points when writing the sstable
82 : // to ensure that the sstable invariant (monotonically increasing internal
83 : // keys) is not violated. So we know which points share the same userkey,
84 : // and thereby which points are obsolete because there is a more recent
85 : // point in the same sstable.
86 : //
87 : // - The compactionIter knows which point id deleted by a RANGEDEL even if
88 : // the point does need to be written because of a snapshot.
89 : //
90 : // So this known information can be encoded in the sstable at write time and
91 : // utilized for optimized reading.
92 : //
93 : // 2. Solution
94 : //
95 : // We primarily scope the solution to the following point kinds: SET,
96 : // SETWITHDEL, DEL, DELSIZED, SINGLEDEL. These are the ones marked locally
97 : // obsolete, i.e., obsolete within the sstable, and we can guarantee that at
98 : // most one point will be exposed per user key. MERGE keys create more
99 : // complexity: MERGE followed by MERGE causes multiple keys to not be
100 : // obsolete. Same applies for MERGE followed by SET/SETWITHDEL/DEL*. Note
101 : // that:
102 : //
103 : // - For regular sst iteration, the obsolete marking is a performance
104 : // optimization, and multiple keys for the same userkey can be handled by
105 : // higher layers in the iterator tree (specifically pebble.Iterator).
106 : //
107 : // - For foreign sst iteration, we disallow MERGEs to be written to such
108 : // shared ssts (details below).
109 : //
110 : // The key kinds are marked with an obsolete bit
111 : // (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is
112 : // obsolete. This marking is done within blockWriter, based on information
113 : // passed to it by Writer. In turn, Writer uses a combination of key
114 : // comparisons, and information provided by compactionIter to decide whether a
115 : // key-value pair is obsolete. Additionally, a Pebble-internal
116 : // BlockPropertyCollector (obsoleteKeyBlockPropertyCollector) is used to mark
117 : // blocks where all key-value pairs are obsolete. Since the common case is
118 : // non-obsolete blocks, this block property collector uses the empty byte
119 : // slice to represent a non-obsolete block, which consumes no space in
120 : // BlockHandleWithProperties.Props.
121 : //
122 : // At read time, the obsolete bit is only visible to the blockIter, which can
123 : // be optionally configured to hide obsolete points. This hiding is only
124 : // configured for data block iterators for sstables being read by user-facing
125 : // iterators at a seqnum greater than the max seqnum in the sstable.
126 : // Additionally, when this hiding is configured, a Pebble-internal block
127 : // property filter (obsoleteKeyBlockPropertyFilter), is used to skip whole
128 : // blocks that are obsolete.
129 : //
130 : // 2.1 Correctness
131 : //
132 : // Due to the level invariant, the sequence of seqnums for a user key in a
133 : // sstable represents a contiguous subsequence of the seqnums for the userkey
134 : // across the whole LSM, and is more recent than the seqnums in a sstable in a
135 : // lower level. So exposing exactly one point from a sstable for a userkey
136 : // will also mask the points for the userkey in lower levels. If we expose no
137 : // point, because of RANGEDELs, that RANGEDEL will also mask the points in
138 : // lower levels.
139 : //
140 : // Note that we do not need to do anything special at write time for
141 : // SETWITHDEL and SINGLEDEL. This is because these key kinds are treated
142 : // specially only by compactions, which typically do not hide obsolete points
143 : // (see exception below). For regular reads, SETWITHDEL behaves the same as
144 : // SET and SINGLEDEL behaves the same as DEL.
145 : //
146 : // 2.1.1 Compaction reads of a foreign sstable
147 : //
148 : // Compaction reads of a foreign sstable behave like regular reads in that
149 : // only non-obsolete points are exposed. Consider a L5 foreign sstable with
150 : // b.SINGLEDEL that is non-obsolete followed by obsolete b.DEL. And a L6
151 : // foreign sstable with two b.SETs. The SINGLEDEL will be exposed, and not the
152 : // DEL, but this is not a correctness issue since only one of the SETs in the
153 : // L6 sstable will be exposed. However, this works only because we have
154 : // limited the number of foreign sst levels to two, and is extremely fragile.
155 : // For robust correctness, non-obsolete SINGLEDELs in foreign sstables should
156 : // be exposed as DELs.
157 : //
158 : // Additionally, to avoid false positive accounting errors in DELSIZED, we
159 : // should expose them as DEL.
160 : //
161 : // NB: as of writing this comment, we do not have end-to-end support for
162 : // SINGLEDEL for disaggregated storage since pointCollapsingIterator (used by
163 : // ScanInternal) does not support SINGLEDEL. So the disaggregated key spans
164 : // are required to never have SINGLEDELs (which is fine for CockroachDB since
165 : // only the MVCC key space uses disaggregated storage, and SINGLEDELs are only
166 : // used for the non-MVCC locks and intents).
167 : //
168 : // 2.2 Strictness and MERGE
169 : //
170 : // Setting the obsolete bit on point keys is advanced usage, so we support two
171 : // modes, both of which must be truthful when setting the obsolete bit, but
172 : // vary in when they don't set the obsolete bit.
173 : //
174 : // - Non-strict: In this mode, the bit does not need to be set for keys that
175 : // are obsolete. Additionally, any sstable containing MERGE keys can only
176 : // use this mode. An iterator over such an sstable, when configured to
177 : // hideObsoletePoints, can expose multiple internal keys per user key, and
178 : // can expose keys that are deleted by rangedels in the same sstable. This
179 : // is the mode that non-advanced users should use. Pebble without
180 : // disaggregated storage will also use this mode and will best-effort set
181 : // the obsolete bit, to optimize iteration when snapshots have retained many
182 : // obsolete keys.
183 : //
184 : // - Strict: In this mode, every obsolete key must have the obsolete bit set,
185 : // and no MERGE keys are permitted. An iterator over such an sstable, when
186 : // configured to hideObsoletePoints satisfies two properties:
187 : // - S1: will expose at most one internal key per user key, which is the
188 : // most recent one.
189 : // - S2: will never expose keys that are deleted by rangedels in the same
190 : // sstable.
191 : //
192 : // This is the mode for two use cases in disaggregated storage (which will
193 : // exclude parts of the key space that has MERGEs), for levels that contain
194 : // sstables that can become foreign sstables:
195 : // - Pebble compaction output to these levels that can become foreign
196 : // sstables.
197 : //
198 : // - CockroachDB ingest operations that can ingest into the levels that can
199 : // become foreign sstables. Note, these are not sstables corresponding to
200 : // copied data for CockroachDB range snapshots. This case occurs for
201 : // operations like index backfills: these trivially satisfy the strictness
202 : // criteria since they only write one key per userkey.
203 : //
204 : // TODO(sumeer): this latter case is not currently supported, since only
205 : // Writer.AddWithForceObsolete calls are permitted for writing strict
206 : // obsolete sstables. This is done to reduce the likelihood of bugs. One
207 : // simple way to lift this limitation would be to disallow adding any
208 : // RANGEDELs when a Pebble-external writer is trying to construct a strict
209 : // obsolete sstable.
210 :
211 : // ParseTableFormat parses the given magic bytes and version into its
212 : // corresponding internal TableFormat.
213 1 : func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) {
214 1 : switch string(magic) {
215 0 : case levelDBMagic:
216 0 : return TableFormatLevelDB, nil
217 0 : case rocksDBMagic:
218 0 : if version != rocksDBFormatVersion2 {
219 0 : return TableFormatUnspecified, base.CorruptionErrorf(
220 0 : "pebble/table: unsupported rocksdb format version %d", errors.Safe(version),
221 0 : )
222 0 : }
223 0 : return TableFormatRocksDBv2, nil
224 1 : case pebbleDBMagic:
225 1 : switch version {
226 0 : case 1:
227 0 : return TableFormatPebblev1, nil
228 1 : case 2:
229 1 : return TableFormatPebblev2, nil
230 1 : case 3:
231 1 : return TableFormatPebblev3, nil
232 1 : case 4:
233 1 : return TableFormatPebblev4, nil
234 0 : default:
235 0 : return TableFormatUnspecified, base.CorruptionErrorf(
236 0 : "pebble/table: unsupported pebble format version %d", errors.Safe(version),
237 0 : )
238 : }
239 0 : default:
240 0 : return TableFormatUnspecified, base.CorruptionErrorf(
241 0 : "pebble/table: invalid table (bad magic number: 0x%x)", magic,
242 0 : )
243 : }
244 : }
245 :
246 : // AsTuple returns the TableFormat's (Magic String, Version) tuple.
247 1 : func (f TableFormat) AsTuple() (string, uint32) {
248 1 : switch f {
249 0 : case TableFormatLevelDB:
250 0 : return levelDBMagic, 0
251 0 : case TableFormatRocksDBv2:
252 0 : return rocksDBMagic, 2
253 0 : case TableFormatPebblev1:
254 0 : return pebbleDBMagic, 1
255 1 : case TableFormatPebblev2:
256 1 : return pebbleDBMagic, 2
257 1 : case TableFormatPebblev3:
258 1 : return pebbleDBMagic, 3
259 1 : case TableFormatPebblev4:
260 1 : return pebbleDBMagic, 4
261 0 : default:
262 0 : panic("sstable: unknown table format version tuple")
263 : }
264 : }
265 :
266 : // String returns the TableFormat (Magic String,Version) tuple.
267 0 : func (f TableFormat) String() string {
268 0 : switch f {
269 0 : case TableFormatLevelDB:
270 0 : return "(LevelDB)"
271 0 : case TableFormatRocksDBv2:
272 0 : return "(RocksDB,v2)"
273 0 : case TableFormatPebblev1:
274 0 : return "(Pebble,v1)"
275 0 : case TableFormatPebblev2:
276 0 : return "(Pebble,v2)"
277 0 : case TableFormatPebblev3:
278 0 : return "(Pebble,v3)"
279 0 : case TableFormatPebblev4:
280 0 : return "(Pebble,v4)"
281 0 : default:
282 0 : panic("sstable: unknown table format version tuple")
283 : }
284 : }
|