Line data Source code
1 : // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bufio"
9 : "fmt"
10 : "io"
11 : "math"
12 : "os"
13 : "path/filepath"
14 : "sort"
15 : "strings"
16 : "sync"
17 :
18 : "github.com/cockroachdb/pebble/bloom"
19 : "github.com/cockroachdb/pebble/internal/base"
20 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider"
21 : "github.com/cockroachdb/pebble/vfs"
22 : )
23 :
24 : // testKVs is a key-value map holding test data.
25 : type testKVs map[string]string
26 :
27 : // SortedKeys returns the keys in the map, in sorted order.
28 1 : func (m testKVs) SortedKeys() []string {
29 1 : res := make([]string, 0, len(m))
30 1 : for k := range m {
31 1 : res = append(res, k)
32 1 : }
33 1 : sort.Strings(res)
34 1 : return res
35 : }
36 :
37 : // This variable should not be used directly, only via hamletWordCount().
38 : var hamletWordCountState struct {
39 : once sync.Once
40 : data testKVs
41 : }
42 :
43 : // hamletWordCount returns the data in testdata/h.txt, as a map from word to
44 : // count (as string).
45 1 : func hamletWordCount() testKVs {
46 1 : hamletWordCountState.once.Do(func() {
47 1 : wordCount := make(map[string]string)
48 1 : f, err := os.Open(filepath.FromSlash("testdata/h.txt"))
49 1 : if err != nil {
50 0 : panic(err)
51 : }
52 1 : defer f.Close()
53 1 : r := bufio.NewReader(f)
54 1 :
55 1 : for {
56 1 : s, err := r.ReadBytes('\n')
57 1 : if err == io.EOF {
58 1 : break
59 : }
60 1 : if err != nil {
61 0 : panic(err)
62 : }
63 1 : k := strings.TrimSpace(string(s[8:]))
64 1 : v := strings.TrimSpace(string(s[:8]))
65 1 : wordCount[k] = v
66 : }
67 1 : if len(wordCount) != 1710 {
68 0 : panic(fmt.Sprintf("h.txt entry count: got %d, want %d", len(wordCount), 1710))
69 : }
70 1 : for _, s := range hamletNonsenseWords {
71 1 : if _, ok := wordCount[s]; ok {
72 0 : panic(fmt.Sprintf("nonsense word %q was in h.txt", s))
73 : }
74 : }
75 1 : hamletWordCountState.data = wordCount
76 : })
77 1 : return hamletWordCountState.data
78 : }
79 :
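// hamletLine is a hypothetical helper, included only to illustrate the line
// layout that hamletWordCount assumes: the count occupies the first eight
// columns and the word follows. It is a sketch (the fixtures do not use it),
// and a line it produces round-trips through the parsing above.
func hamletLine(count, word string) string {
	// "%8s" right-aligns the count into the first eight bytes, so s[:8]
	// recovers the count and s[8:] recovers the word after trimming.
	return fmt.Sprintf("%8s %s\n", count, word)
}
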
80 : // hamletNonsenseWords are words that aren't in testdata/h.txt.
81 : var hamletNonsenseWords = []string{
82 : // Edge cases.
83 : "",
84 : "\x00",
85 : "\xff",
86 : "`",
87 : "a\x00",
88 : "aaaaaa",
89 : "pol\x00nius",
90 : "youth\x00",
91 : "youti",
92 : "zzzzzz",
93 : // Capitalized versions of actual words in testdata/h.txt.
94 : "A",
95 : "Hamlet",
96 : "thEE",
97 : "YOUTH",
98 : // The following were generated by http://soybomb.com/tricks/words/
99 : "pectures",
100 : "exectly",
101 : "tricatrippian",
102 : "recens",
103 : "whiratroce",
104 : "troped",
105 : "balmous",
106 : "droppewry",
107 : "toilizing",
108 : "crocias",
109 : "eathrass",
110 : "cheakden",
111 : "speablett",
112 : "skirinies",
113 : "prefing",
114 : "bonufacision",
115 : }
116 :
117 : // buildHamletTestSST creates an sst file containing the hamlet word count data,
118 : // using the given options.
119 : func buildHamletTestSST(
120 : fs vfs.FS,
121 : filename string,
122 : compression Compression,
123 : fp FilterPolicy,
124 : ftype FilterType,
125 : comparer *Comparer,
126 : propCollector func() TablePropertyCollector,
127 : blockSize int,
128 : indexBlockSize int,
129 1 : ) error {
130 1 : wordCount := hamletWordCount()
131 1 : keys := wordCount.SortedKeys()
132 1 :
133 1 : // Write the key/value pairs to a new table, in increasing key order.
134 1 : f0, err := fs.Create(filename)
135 1 : if err != nil {
136 0 : return err
137 0 : }
138 :
139 1 : writerOpts := WriterOptions{
140 1 : BlockSize: blockSize,
141 1 : Comparer: comparer,
142 1 : Compression: compression,
143 1 : FilterPolicy: fp,
144 1 : FilterType: ftype,
145 1 : IndexBlockSize: indexBlockSize,
146 1 : MergerName: "nullptr",
147 1 : TableFormat: fixtureFormat,
148 1 : }
149 1 : if propCollector != nil {
150 1 : writerOpts.TablePropertyCollectors = append(writerOpts.TablePropertyCollectors, propCollector)
151 1 : }
152 :
153 1 : w := NewWriter(objstorageprovider.NewFileWritable(f0), writerOpts)
154 1 : // Use rangeDelV1Format for testing byte equality with RocksDB.
155 1 : w.rangeDelV1Format = true
156 1 : var rangeDelLength int
157 1 : var rangeDelCounter int
158 1 : var rangeDelStart InternalKey
159 1 : for i, k := range keys {
160 1 : v := wordCount[k]
161 1 : ikey := base.MakeInternalKey([]byte(k), 0, InternalKeyKindSet)
162 1 : if err := w.Add(ikey, []byte(v)); err != nil {
163 0 : return err
164 0 : }
165 : // This mirrors the logic in `make-table.cc`. It adds range deletions of
166 : // increasing length for every 100 keys added.
167 1 : if i%100 == 0 {
168 1 : rangeDelStart = ikey.Clone()
169 1 : rangeDelCounter = 0
170 1 : rangeDelLength++
171 1 : }
172 1 : rangeDelCounter++
173 1 :
174 1 : if rangeDelCounter == rangeDelLength {
175 1 : if err := w.DeleteRange(rangeDelStart.UserKey, ikey.UserKey); err != nil {
176 0 : return err
177 0 : }
178 : }
179 : }
180 1 : return w.Close()
181 : }
182 :
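// buildExampleSST is a minimal sketch of invoking buildHamletTestSST directly;
// the in-memory filesystem and the file name are assumptions made for
// illustration, not part of the fixture definitions below. A nil filter policy
// means no filter block is written, and a nil comparer falls back to the
// default comparer.
func buildExampleSST() error {
	return buildHamletTestSST(
		vfs.NewMem(), "example.sst", SnappyCompression,
		nil,              // no filter policy
		base.TableFilter, // filter type (unused without a policy)
		nil,              // default comparer
		nil,              // no property collector
		fixtureBlockSize, fixtureDefaultIndexBlockSize,
	)
}
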
183 : // TestFixtureInfo contains all metadata necessary to generate a test sstable.
184 : type TestFixtureInfo struct {
185 : Filename string
186 : Compression Compression
187 : FullKeyFilter bool
188 : PrefixFilter bool
189 : IndexBlockSize int
190 : UseFixtureComparer bool
191 : KeyCountPropertyCollector bool
192 : }
193 :
194 : // TestFixtures contains all metadata necessary to generate the test SSTs.
195 : var TestFixtures = []TestFixtureInfo{
196 : {
197 : Filename: "h.sst",
198 : Compression: SnappyCompression,
199 : FullKeyFilter: false,
200 : PrefixFilter: false,
201 : IndexBlockSize: fixtureDefaultIndexBlockSize,
202 : UseFixtureComparer: false,
203 : KeyCountPropertyCollector: true,
204 : },
205 : {
206 : Filename: "h.no-compression.sst",
207 : Compression: NoCompression,
208 : FullKeyFilter: false,
209 : PrefixFilter: false,
210 : IndexBlockSize: fixtureDefaultIndexBlockSize,
211 : UseFixtureComparer: false,
212 : KeyCountPropertyCollector: true,
213 : },
214 : {
215 : Filename: "h.table-bloom.sst",
216 : Compression: SnappyCompression,
217 : FullKeyFilter: true,
218 : PrefixFilter: false,
219 : IndexBlockSize: fixtureDefaultIndexBlockSize,
220 : UseFixtureComparer: false,
221 : KeyCountPropertyCollector: false,
222 : },
223 : {
224 : Filename: "h.table-bloom.no-compression.sst",
225 : Compression: NoCompression,
226 : FullKeyFilter: true,
227 : PrefixFilter: false,
228 : IndexBlockSize: fixtureDefaultIndexBlockSize,
229 : UseFixtureComparer: false,
230 : KeyCountPropertyCollector: false,
231 : },
232 : {
233 : Filename: "h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
234 : Compression: NoCompression,
235 : FullKeyFilter: false,
236 : PrefixFilter: true,
237 : IndexBlockSize: fixtureDefaultIndexBlockSize,
238 : UseFixtureComparer: true,
239 : KeyCountPropertyCollector: false,
240 : },
241 : {
242 : Filename: "h.no-compression.two_level_index.sst",
243 : Compression: NoCompression,
244 : FullKeyFilter: false,
245 : PrefixFilter: false,
246 : IndexBlockSize: fixtureSmallIndexBlockSize,
247 : UseFixtureComparer: false,
248 : KeyCountPropertyCollector: true,
249 : },
250 : {
251 : Filename: "h.zstd-compression.sst",
252 : Compression: ZstdCompression,
253 : FullKeyFilter: false,
254 : PrefixFilter: false,
255 : IndexBlockSize: fixtureDefaultIndexBlockSize,
256 : UseFixtureComparer: false,
257 : KeyCountPropertyCollector: true,
258 : },
259 : }
260 :
261 : // Build creates an sst file for the given fixture.
262 1 : func (tf TestFixtureInfo) Build(fs vfs.FS, filename string) error {
263 1 : var fp base.FilterPolicy
264 1 : if tf.FullKeyFilter || tf.PrefixFilter {
265 1 : fp = bloom.FilterPolicy(10)
266 1 : }
267 1 : var comparer *Comparer
268 1 : if tf.UseFixtureComparer {
269 1 : comparer = fixtureComparer
270 1 : }
271 1 : var propCollector func() TablePropertyCollector
272 1 : if tf.KeyCountPropertyCollector {
273 1 : propCollector = func() TablePropertyCollector {
274 1 : return &keyCountPropertyCollector{}
275 1 : }
276 : }
277 :
278 1 : return buildHamletTestSST(
279 1 : fs, filename, tf.Compression, fp, base.TableFilter,
280 1 : comparer,
281 1 : propCollector,
282 1 : fixtureBlockSize,
283 1 : tf.IndexBlockSize,
284 1 : )
285 : }
286 :
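// buildAllFixtures is a minimal sketch of regenerating every fixture into a
// directory; the fs and dir arguments (for example vfs.Default and "testdata")
// are assumptions for this example, not something this package does on its
// own.
func buildAllFixtures(fs vfs.FS, dir string) error {
	for _, tf := range TestFixtures {
		if err := tf.Build(fs, fs.PathJoin(dir, tf.Filename)); err != nil {
			return err
		}
	}
	return nil
}
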
287 : const fixtureDefaultIndexBlockSize = math.MaxInt32
288 : const fixtureSmallIndexBlockSize = 128
289 : const fixtureBlockSize = 2048
290 : const fixtureFormat = TableFormatPebblev1
291 :
292 : type keyCountPropertyCollector struct {
293 : count int
294 : }
295 :
296 1 : func (c *keyCountPropertyCollector) Add(key InternalKey, value []byte) error {
297 1 : c.count++
298 1 : return nil
299 1 : }
300 :
301 1 : func (c *keyCountPropertyCollector) Finish(userProps map[string]string) error {
302 1 : userProps["test.key-count"] = fmt.Sprint(c.count)
303 1 : return nil
304 1 : }
305 :
306 1 : func (c *keyCountPropertyCollector) Name() string {
307 1 : return "KeyCountPropertyCollector"
308 1 : }
309 :
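// exampleKeyCountProperty is a small sketch of what the collector records on
// its own; the two keys are arbitrary and the function exists only for
// illustration.
func exampleKeyCountProperty() (string, error) {
	c := &keyCountPropertyCollector{}
	for _, k := range []string{"alpha", "beta"} {
		ik := base.MakeInternalKey([]byte(k), 0, InternalKeyKindSet)
		if err := c.Add(ik, nil); err != nil {
			return "", err
		}
	}
	props := make(map[string]string)
	if err := c.Finish(props); err != nil {
		return "", err
	}
	// props["test.key-count"] is now "2".
	return props["test.key-count"], nil
}
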
310 1 : var fixtureComparer = func() *Comparer {
311 1 : c := *base.DefaultComparer
312 1 : // NB: this is named as such only to match the built-in RocksDB comparer.
313 1 : c.Name = "leveldb.BytewiseComparator"
314 1 : c.Split = func(a []byte) int {
315 1 : // TODO(tbg): It's difficult to provide a more meaningful prefix extractor
316 1 : // on the given dataset since it's not MVCC, and so it's impossible to come
317 1 : // up with a sensible one. We need to add a better dataset and use that
318 1 : // instead to get confidence that prefix extractors are working as intended.
319 1 : return len(a)
320 1 : }
321 1 : return &c
322 : }()