Line data Source code
1 : // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "github.com/cockroachdb/errors"
9 : "github.com/cockroachdb/pebble/internal/base"
10 : )
11 :
12 : // TableFormat specifies the format version for sstables. The legacy LevelDB
13 : // format is format version 1.
14 : type TableFormat uint32
15 :
16 : // The available table formats, representing the tuple (magic number, version
17 : // number). Note that these values are not (and should not) be serialized to
18 : // disk. The ordering should follow the order the versions were introduced to
19 : // Pebble (i.e. the history is linear).
20 : const (
21 : TableFormatUnspecified TableFormat = iota
22 : TableFormatLevelDB
23 : TableFormatRocksDBv2
24 : TableFormatPebblev1 // Block properties.
25 : TableFormatPebblev2 // Range keys.
26 : TableFormatPebblev3 // Value blocks.
27 : TableFormatPebblev4 // DELSIZED tombstones.
28 : NumTableFormats
29 :
30 : TableFormatMax = NumTableFormats - 1
31 : )
32 :
33 : // TableFormatPebblev4, in addition to DELSIZED, introduces the use of
34 : // InternalKeyKindSSTableInternalObsoleteBit.
35 : //
36 : // 1. Motivation
37 : //
38 : // We have various related problems caused by Pebble snapshots:
39 : //
40 : // - P1: RANGEDELs that delete points in the same sstable, but the points
41 : // happen to not get deleted during compactions because of an open snapshot.
42 : // This causes very expensive iteration, that has been observed in
43 : // production deployments
44 : //
45 : // - P2: When iterating over a foreign sstable (in disaggregated storage), we
46 : // need to do (a) point collapsing to expose at most one point per user key,
47 : // (b) apply RANGEDELs in the sstable to hide deleted points in the same
48 : // sstable. This per-sstable point collapsing iteration needs to be very
49 : // efficient (ideally as efficient from a CPU perspective as iteration over
50 : // regular sstables) since foreign sstables can be very long-lived -- one of
51 : // the goals of disaggregated storage is to scale compute and disk bandwidth
52 : // resources as a function of the hot (from a write perspective) data and
53 : // not the whole data, so we don't want to have to rewrite foreign sstables
54 : // solely to improve read performance.
55 : //
56 : // The ideal solution for P2 would allow user-facing reads to utilize the
57 : // existing SST iterators (with slight modifications) and with no loss of
58 : // efficiency. And for P1 and P2 we would like to skip whole blocks of
59 : // overwritten/deleted points. Even when we can't skip whole blocks, avoiding
60 : // key comparisons at iteration time to discover what points are deleted is
61 : // very desirable, since keys can be long.
62 : //
63 : // We observe that:
64 : //
65 : // - Reads:
66 : // - All user-facing reads in CockroachDB use iterators over the DB, hence
67 : // have a higher read seqnum than all sstables (there are some rare cases
68 : // that can violate this, but those are not important from a performance
69 : // optimization perspective).
70 : //
71 : // - Certain internal-facing reads in CockroachDB use snapshots, but the
72 : // snapshots are shortlived enough that most L5 and L6 sstables will have
73 : // all seqnums lower than the snapshot seqnum.
74 : //
75 : // - Writes:
76 : // - We already do key comparisons between points when writing the sstable
77 : // to ensure that the sstable invariant (monotonically increasing internal
78 : // keys) is not violated. So we know which points share the same userkey,
79 : // and thereby which points are obsolete because there is a more recent
80 : // point in the same sstable.
81 : //
82 : // - The compactionIter knows which point id deleted by a RANGEDEL even if
83 : // the point does need to be written because of a snapshot.
84 : //
85 : // So this known information can be encoded in the sstable at write time and
86 : // utilized for optimized reading.
87 : //
88 : // 2. Solution
89 : //
90 : // We primarily scope the solution to the following point kinds: SET,
91 : // SETWITHDEL, DEL, DELSIZED, SINGLEDEL. These are the ones marked locally
92 : // obsolete, i.e., obsolete within the sstable, and we can guarantee that at
93 : // most one point will be exposed per user key. MERGE keys create more
94 : // complexity: MERGE followed by MERGE causes multiple keys to not be
95 : // obsolete. Same applies for MERGE followed by SET/SETWITHDEL/DEL*. Note
96 : // that:
97 : //
98 : // - For regular sst iteration, the obsolete marking is a performance
99 : // optimization, and multiple keys for the same userkey can be handled by
100 : // higher layers in the iterator tree (specifically pebble.Iterator).
101 : //
102 : // - For foreign sst iteration, we disallow MERGEs to be written to such
103 : // shared ssts (details below).
104 : //
105 : // The key kinds are marked with an obsolete bit
106 : // (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is
107 : // obsolete. This marking is done within blockWriter, based on information
108 : // passed to it by Writer. In turn, Writer uses a combination of key
109 : // comparisons, and information provided by compactionIter to decide whether a
110 : // key-value pair is obsolete. Additionally, a Pebble-internal
111 : // BlockPropertyCollector (obsoleteKeyBlockPropertyCollector) is used to mark
112 : // blocks where all key-value pairs are obsolete. Since the common case is
113 : // non-obsolete blocks, this block property collector uses the empty byte
114 : // slice to represent a non-obsolete block, which consumes no space in
115 : // BlockHandleWithProperties.Props.
116 : //
117 : // At read time, the obsolete bit is only visible to the blockIter, which can
118 : // be optionally configured to hide obsolete points. This hiding is only
119 : // configured for data block iterators for sstables being read by user-facing
120 : // iterators at a seqnum greater than the max seqnum in the sstable.
121 : // Additionally, when this hiding is configured, a Pebble-internal block
122 : // property filter (obsoleteKeyBlockPropertyFilter), is used to skip whole
123 : // blocks that are obsolete.
124 : //
125 : // 2.1 Correctness
126 : //
127 : // Due to the level invariant, the sequence of seqnums for a user key in a
128 : // sstable represents a contiguous subsequence of the seqnums for the userkey
129 : // across the whole LSM, and is more recent than the seqnums in a sstable in a
130 : // lower level. So exposing exactly one point from a sstable for a userkey
131 : // will also mask the points for the userkey in lower levels. If we expose no
132 : // point, because of RANGEDELs, that RANGEDEL will also mask the points in
133 : // lower levels.
134 : //
135 : // Note that we do not need to do anything special at write time for
136 : // SETWITHDEL and SINGLEDEL. This is because these key kinds are treated
137 : // specially only by compactions, which do not hide obsolete points. For
138 : // regular reads, SETWITHDEL behaves the same as SET and SINGLEDEL behaves the
139 : // same as DEL.
140 : //
141 : // 2.2 Strictness and MERGE
142 : //
143 : // Setting the obsolete bit on point keys is advanced usage, so we support two
144 : // modes, both of which must be truthful when setting the obsolete bit, but
145 : // vary in when they don't set the obsolete bit.
146 : //
147 : // - Non-strict: In this mode, the bit does not need to be set for keys that
148 : // are obsolete. Additionally, any sstable containing MERGE keys can only
149 : // use this mode. An iterator over such an sstable, when configured to
150 : // hideObsoletePoints, can expose multiple internal keys per user key, and
151 : // can expose keys that are deleted by rangedels in the same sstable. This
152 : // is the mode that non-advanced users should use. Pebble without
153 : // disaggregated storage will also use this mode and will best-effort set
154 : // the obsolete bit, to optimize iteration when snapshots have retained many
155 : // obsolete keys.
156 : //
157 : // - Strict: In this mode, every obsolete key must have the obsolete bit set,
158 : // and no MERGE keys are permitted. An iterator over such an sstable, when
159 : // configured to hideObsoletePoints satisfies two properties:
160 : // - S1: will expose at most one internal key per user key, which is the
161 : // most recent one.
162 : // - S2: will never expose keys that are deleted by rangedels in the same
163 : // sstable.
164 : //
165 : // This is the mode for two use cases in disaggregated storage (which will
166 : // exclude parts of the key space that has MERGEs), for levels that contain
167 : // sstables that can become foreign sstables:
168 : // - Pebble compaction output to these levels that can become foreign
169 : // sstables.
170 : //
171 : // - CockroachDB ingest operations that can ingest into the levels that can
172 : // become foreign sstables. Note, these are not sstables corresponding to
173 : // copied data for CockroachDB range snapshots. This case occurs for
174 : // operations like index backfills: these trivially satisfy the strictness
175 : // criteria since they only write one key per userkey.
176 : //
177 : // TODO(sumeer): this latter case is not currently supported, since only
178 : // Writer.AddWithForceObsolete calls are permitted for writing strict
179 : // obsolete sstables. This is done to reduce the likelihood of bugs. One
180 : // simple way to lift this limitation would be to disallow adding any
181 : // RANGEDELs when a Pebble-external writer is trying to construct a strict
182 : // obsolete sstable.
183 :
184 : // ParseTableFormat parses the given magic bytes and version into its
185 : // corresponding internal TableFormat.
186 1 : func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) {
187 1 : switch string(magic) {
188 0 : case levelDBMagic:
189 0 : return TableFormatLevelDB, nil
190 0 : case rocksDBMagic:
191 0 : if version != rocksDBFormatVersion2 {
192 0 : return TableFormatUnspecified, base.CorruptionErrorf(
193 0 : "pebble/table: unsupported rocksdb format version %d", errors.Safe(version),
194 0 : )
195 0 : }
196 0 : return TableFormatRocksDBv2, nil
197 1 : case pebbleDBMagic:
198 1 : switch version {
199 0 : case 1:
200 0 : return TableFormatPebblev1, nil
201 1 : case 2:
202 1 : return TableFormatPebblev2, nil
203 1 : case 3:
204 1 : return TableFormatPebblev3, nil
205 1 : case 4:
206 1 : return TableFormatPebblev4, nil
207 0 : default:
208 0 : return TableFormatUnspecified, base.CorruptionErrorf(
209 0 : "pebble/table: unsupported pebble format version %d", errors.Safe(version),
210 0 : )
211 : }
212 0 : default:
213 0 : return TableFormatUnspecified, base.CorruptionErrorf(
214 0 : "pebble/table: invalid table (bad magic number: 0x%x)", magic,
215 0 : )
216 : }
217 : }
218 :
219 : // AsTuple returns the TableFormat's (Magic String, Version) tuple.
220 1 : func (f TableFormat) AsTuple() (string, uint32) {
221 1 : switch f {
222 0 : case TableFormatLevelDB:
223 0 : return levelDBMagic, 0
224 0 : case TableFormatRocksDBv2:
225 0 : return rocksDBMagic, 2
226 0 : case TableFormatPebblev1:
227 0 : return pebbleDBMagic, 1
228 1 : case TableFormatPebblev2:
229 1 : return pebbleDBMagic, 2
230 1 : case TableFormatPebblev3:
231 1 : return pebbleDBMagic, 3
232 1 : case TableFormatPebblev4:
233 1 : return pebbleDBMagic, 4
234 0 : default:
235 0 : panic("sstable: unknown table format version tuple")
236 : }
237 : }
238 :
239 : // String returns the TableFormat (Magic String,Version) tuple.
240 0 : func (f TableFormat) String() string {
241 0 : switch f {
242 0 : case TableFormatLevelDB:
243 0 : return "(LevelDB)"
244 0 : case TableFormatRocksDBv2:
245 0 : return "(RocksDB,v2)"
246 0 : case TableFormatPebblev1:
247 0 : return "(Pebble,v1)"
248 0 : case TableFormatPebblev2:
249 0 : return "(Pebble,v2)"
250 0 : case TableFormatPebblev3:
251 0 : return "(Pebble,v3)"
252 0 : case TableFormatPebblev4:
253 0 : return "(Pebble,v4)"
254 0 : default:
255 0 : panic("sstable: unknown table format version tuple")
256 : }
257 : }
|