// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bufio"
	"fmt"
	"io"
	"math"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"sync"

	"github.com/cockroachdb/pebble/bloom"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
	"github.com/cockroachdb/pebble/vfs"
)

// testKVs is a key-value map holding test data.
type testKVs map[string]string

// SortedKeys returns the keys in the map, in sorted order.
func (m testKVs) SortedKeys() []string {
	res := make([]string, 0, len(m))
	for k := range m {
		res = append(res, k)
	}
	sort.Strings(res)
	return res
}

// This variable should not be used directly, only via hamletWordCount().
var hamletWordCountState struct {
	once sync.Once
	data testKVs
}

// hamletWordCount returns the data in testdata/h.txt, as a map from word to
// count (as a string).
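//
// For illustration, a caller might consume the result like the sketch below
// (illustrative only; the word "hamlet" is assumed to appear in the data set):
//
//	wc := hamletWordCount()
//	fmt.Println(wc["hamlet"]) // the count for "hamlet", formatted as a string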
func hamletWordCount() testKVs {
	hamletWordCountState.once.Do(func() {
		wordCount := make(map[string]string)
		f, err := os.Open(filepath.FromSlash("testdata/h.txt"))
		if err != nil {
			panic(err)
		}
		defer f.Close()
		r := bufio.NewReader(f)

		for {
			s, err := r.ReadBytes('\n')
			if err == io.EOF {
				break
			}
			if err != nil {
				panic(err)
			}
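			// Each line holds the count in the first 8 bytes, followed by the
			// word; both are trimmed of surrounding whitespace.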
			k := strings.TrimSpace(string(s[8:]))
			v := strings.TrimSpace(string(s[:8]))
			wordCount[k] = v
		}
		if len(wordCount) != 1710 {
			panic(fmt.Sprintf("h.txt entry count: got %d, want %d", len(wordCount), 1710))
		}
		for _, s := range hamletNonsenseWords {
			if _, ok := wordCount[s]; ok {
				panic(fmt.Sprintf("nonsense word %q was in h.txt", s))
			}
		}
		hamletWordCountState.data = wordCount
	})
	return hamletWordCountState.data
}

// hamletNonsenseWords are words that aren't in testdata/h.txt.
var hamletNonsenseWords = []string{
	// Edge cases.
	"",
	"\x00",
	"\xff",
	"`",
	"a\x00",
	"aaaaaa",
	"pol\x00nius",
	"youth\x00",
	"youti",
	"zzzzzz",
	// Capitalized versions of actual words in testdata/h.txt.
	"A",
	"Hamlet",
	"thEE",
	"YOUTH",
	// The following were generated by http://soybomb.com/tricks/words/
	"pectures",
	"exectly",
	"tricatrippian",
	"recens",
	"whiratroce",
	"troped",
	"balmous",
	"droppewry",
	"toilizing",
	"crocias",
	"eathrass",
	"cheakden",
	"speablett",
	"skirinies",
	"prefing",
	"bonufacision",
}

// buildHamletTestSST creates an sst file containing the hamlet word count data,
// using the given options.
func buildHamletTestSST(
	fs vfs.FS,
	filename string,
	compression Compression,
	fp FilterPolicy,
	ftype FilterType,
	comparer *Comparer,
	blockSize int,
	indexBlockSize int,
) error {
	wordCount := hamletWordCount()
	keys := wordCount.SortedKeys()

	// Write the key/value pairs to a new table, in increasing key order.
	f0, err := fs.Create(filename, vfs.WriteCategoryUnspecified)
	if err != nil {
		return err
	}

	writerOpts := WriterOptions{
		BlockSize:      blockSize,
		Comparer:       comparer,
		Compression:    compression,
		FilterPolicy:   fp,
		FilterType:     ftype,
		IndexBlockSize: indexBlockSize,
		MergerName:     "nullptr",
		TableFormat:    fixtureFormat,
	}

	w := NewWriter(objstorageprovider.NewFileWritable(f0), writerOpts)
	// Use rangeDelV1Format for testing byte equality with RocksDB.
	w.rangeDelV1Format = true
	var rangeDelLength int
	var rangeDelCounter int
	var rangeDelStart InternalKey
	for i, k := range keys {
		v := wordCount[k]
		ikey := base.MakeInternalKey([]byte(k), 0, InternalKeyKindSet)
		if err := w.Add(ikey, []byte(v)); err != nil {
			return err
		}
		// This mirrors the logic in `make-table.cc`. It adds range deletions of
		// increasing length for every 100 keys added.
		if i%100 == 0 {
			rangeDelStart = ikey.Clone()
			rangeDelCounter = 0
			rangeDelLength++
		}
		rangeDelCounter++

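		// Once the current range has accumulated rangeDelLength keys, emit a
		// range deletion from its start key up to (and excluding) the current key.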
		if rangeDelCounter == rangeDelLength {
			if err := w.DeleteRange(rangeDelStart.UserKey, ikey.UserKey); err != nil {
				return err
			}
		}
	}
	return w.Close()
}

// TestFixtureInfo contains all metadata necessary to generate a test sstable.
type TestFixtureInfo struct {
	Filename           string
	Compression        Compression
	FullKeyFilter      bool
	PrefixFilter       bool
	IndexBlockSize     int
	UseFixtureComparer bool
}

// TestFixtures contains all metadata necessary to generate the test SSTs.
var TestFixtures = []TestFixtureInfo{
	{
		Filename:           "h.sst",
		Compression:        SnappyCompression,
		FullKeyFilter:      false,
		PrefixFilter:       false,
		IndexBlockSize:     fixtureDefaultIndexBlockSize,
		UseFixtureComparer: false,
	},
	{
		Filename:           "h.no-compression.sst",
		Compression:        NoCompression,
		FullKeyFilter:      false,
		PrefixFilter:       false,
		IndexBlockSize:     fixtureDefaultIndexBlockSize,
		UseFixtureComparer: false,
	},
	{
		Filename:           "h.table-bloom.sst",
		Compression:        SnappyCompression,
		FullKeyFilter:      true,
		PrefixFilter:       false,
		IndexBlockSize:     fixtureDefaultIndexBlockSize,
		UseFixtureComparer: false,
	},
	{
		Filename:           "h.table-bloom.no-compression.sst",
		Compression:        NoCompression,
		FullKeyFilter:      true,
		PrefixFilter:       false,
		IndexBlockSize:     fixtureDefaultIndexBlockSize,
		UseFixtureComparer: false,
	},
	{
		Filename:           "h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
		Compression:        NoCompression,
		FullKeyFilter:      false,
		PrefixFilter:       true,
		IndexBlockSize:     fixtureDefaultIndexBlockSize,
		UseFixtureComparer: true,
	},
	{
		Filename:           "h.no-compression.two_level_index.sst",
		Compression:        NoCompression,
		FullKeyFilter:      false,
		PrefixFilter:       false,
		IndexBlockSize:     fixtureSmallIndexBlockSize,
		UseFixtureComparer: false,
	},
	{
		Filename:           "h.zstd-compression.sst",
		Compression:        ZstdCompression,
		FullKeyFilter:      false,
		PrefixFilter:       false,
		IndexBlockSize:     fixtureDefaultIndexBlockSize,
		UseFixtureComparer: false,
	},
}

// Build creates an sst file for the given fixture.
func (tf TestFixtureInfo) Build(fs vfs.FS, filename string) error {
	var fp base.FilterPolicy
	if tf.FullKeyFilter || tf.PrefixFilter {
		fp = bloom.FilterPolicy(10)
	}
	var comparer *Comparer
	if tf.UseFixtureComparer {
		comparer = fixtureComparer
	}

	return buildHamletTestSST(
		fs, filename, tf.Compression, fp, base.TableFilter,
		comparer,
		fixtureBlockSize,
		tf.IndexBlockSize,
	)
}
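
// For illustration, the fixtures above might be regenerated with something
// like the following sketch. The destination directory and the use of
// vfs.Default are assumptions made for the example, not part of this file:
//
//	for _, tf := range TestFixtures {
//		dst := filepath.Join("testdata", tf.Filename)
//		if err := tf.Build(vfs.Default, dst); err != nil {
//			panic(err)
//		}
//	}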

const fixtureDefaultIndexBlockSize = math.MaxInt32
const fixtureSmallIndexBlockSize = 128
const fixtureBlockSize = 2048
const fixtureFormat = TableFormatPebblev1

var fixtureComparer = func() *Comparer {
	c := *base.DefaultComparer
	// NB: this is named as such only to match the built-in RocksDB comparer.
	c.Name = "leveldb.BytewiseComparator"
	c.Split = func(a []byte) int {
		// TODO(tbg): It's difficult to provide a more meaningful prefix extractor
		// on the given dataset since it's not MVCC, and so it's impossible to come
		// up with a sensible one. We need to add a better dataset and use that
		// instead to get confidence that prefix extractors are working as intended.
		return len(a)
	}
	return &c
}()