Line data Source code
1 : // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bufio"
9 : "fmt"
10 : "io"
11 : "math"
12 : "os"
13 : "path/filepath"
14 : "sort"
15 : "strings"
16 : "sync"
17 :
18 : "github.com/cockroachdb/pebble/bloom"
19 : "github.com/cockroachdb/pebble/internal/base"
20 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider"
21 : "github.com/cockroachdb/pebble/sstable/block"
22 : "github.com/cockroachdb/pebble/vfs"
23 : )
24 :
25 : // testKVs is a key-value map holding test data.
26 : type testKVs map[string]string
27 :
28 : // SortedKeys returns the keys in the map, in sorted order.
29 1 : func (m testKVs) SortedKeys() []string {
30 1 : res := make([]string, 0, len(m))
31 1 : for k := range m {
32 1 : res = append(res, k)
33 1 : }
34 1 : sort.Strings(res)
35 1 : return res
36 : }
37 :
38 : // This variable should not be used directly, only via hamletWordCount().
39 : var hamletWordCountState struct {
40 : once sync.Once
41 : data testKVs
42 : }
43 :
44 : // hamletWordCount returns the data in testdata/h.txt, as a map from word to
45 : // count (as string).
46 1 : func hamletWordCount() testKVs {
47 1 : hamletWordCountState.once.Do(func() {
48 1 : wordCount := make(map[string]string)
49 1 : f, err := os.Open(filepath.FromSlash("testdata/h.txt"))
50 1 : if err != nil {
51 0 : panic(err)
52 : }
53 1 : defer f.Close()
54 1 : r := bufio.NewReader(f)
55 1 :
56 1 : for {
57 1 : s, err := r.ReadBytes('\n')
58 1 : if err == io.EOF {
59 1 : break
60 : }
61 1 : if err != nil {
62 0 : panic(err)
63 : }
64 1 : k := strings.TrimSpace(string(s[8:]))
65 1 : v := strings.TrimSpace(string(s[:8]))
66 1 : wordCount[k] = v
67 : }
68 1 : if len(wordCount) != 1710 {
69 0 : panic(fmt.Sprintf("h.txt entry count: got %d, want %d", len(wordCount), 1710))
70 : }
71 1 : for _, s := range hamletNonsenseWords {
72 1 : if _, ok := wordCount[s]; ok {
73 0 : panic(fmt.Sprintf("nonsense word %q was in h.txt", s))
74 : }
75 : }
76 1 : hamletWordCountState.data = wordCount
77 : })
78 1 : return hamletWordCountState.data
79 : }
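// Hedged usage sketch (not part of the original file): how a test is expected
// to consume hamletWordCount. Each h.txt line evidently carries an 8-byte
// count column followed by the word (hence the s[:8]/s[8:] slicing above).
// The function name exampleHamletWordCountUsage is hypothetical.
func exampleHamletWordCountUsage() {
	wc := hamletWordCount() // parsed once; memoized via sync.Once
	for _, word := range wc.SortedKeys() {
		count := wc[word] // the count is kept in its original string form
		_ = count
	}
	fmt.Printf("distinct words: %d\n", len(wc)) // sanity-checked above to equal 1710
}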
80 :
81 : // hamletNonsenseWords are words that aren't in testdata/h.txt.
82 : var hamletNonsenseWords = []string{
83 : // Edge cases.
84 : "",
85 : "\x00",
86 : "\xff",
87 : "`",
88 : "a\x00",
89 : "aaaaaa",
90 : "pol\x00nius",
91 : "youth\x00",
92 : "youti",
93 : "zzzzzz",
94 : // Capitalized versions of actual words in testdata/h.txt.
95 : "A",
96 : "Hamlet",
97 : "thEE",
98 : "YOUTH",
99 : // The following were generated by http://soybomb.com/tricks/words/
100 : "pectures",
101 : "exectly",
102 : "tricatrippian",
103 : "recens",
104 : "whiratroce",
105 : "troped",
106 : "balmous",
107 : "droppewry",
108 : "toilizing",
109 : "crocias",
110 : "eathrass",
111 : "cheakden",
112 : "speablett",
113 : "skirinies",
114 : "prefing",
115 : "bonufacision",
116 : }
117 :
118 : // buildHamletTestSST creates an sst file containing the hamlet word count data,
119 : // using the given options.
120 : func buildHamletTestSST(
121 : fs vfs.FS,
122 : filename string,
123 : compression block.Compression,
124 : fp FilterPolicy,
125 : ftype FilterType,
126 : comparer *Comparer,
127 : blockSize int,
128 : indexBlockSize int,
129 1 : ) error {
130 1 : wordCount := hamletWordCount()
131 1 : keys := wordCount.SortedKeys()
132 1 :
133 1 : // Write the key/value pairs to a new table, in increasing key order.
134 1 : f0, err := fs.Create(filename, vfs.WriteCategoryUnspecified)
135 1 : if err != nil {
136 0 : return err
137 0 : }
138 :
139 1 : writerOpts := WriterOptions{
140 1 : BlockSize: blockSize,
141 1 : Comparer: comparer,
142 1 : Compression: compression,
143 1 : FilterPolicy: fp,
144 1 : FilterType: ftype,
145 1 : IndexBlockSize: indexBlockSize,
146 1 : MergerName: "nullptr",
147 1 : TableFormat: fixtureFormat,
148 1 : }
149 1 :
150 1 : w := NewWriter(objstorageprovider.NewFileWritable(f0), writerOpts)
151 1 : // NB: We don't have byte equality with RocksDB any longer.
152 1 : var rangeDelLength int
153 1 : var rangeDelCounter int
154 1 : var rangeDelStart []byte
155 1 : for i, k := range keys {
156 1 : v := wordCount[k]
157 1 : if err := w.Set([]byte(k), []byte(v)); err != nil {
158 0 : return err
159 0 : }
160 : // This mirrors the logic in `make-table.cc`. It adds range deletions of
161 : // increasing length for every 100 keys added.
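// For example, tracing the loop: key index 0 starts a run of length 1 and
// immediately emits the empty tombstone [keys[0], keys[0]); index 100 starts
// a run of length 2, emitting [keys[100], keys[101]) at index 101; index 200
// emits [keys[200], keys[202]) at index 202; and so on.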
162 1 : if i%100 == 0 {
163 1 : rangeDelStart = []byte(k)
164 1 : rangeDelCounter = 0
165 1 : rangeDelLength++
166 1 : }
167 1 : rangeDelCounter++
168 1 :
169 1 : if rangeDelCounter == rangeDelLength {
170 1 : if err := w.DeleteRange(rangeDelStart, []byte(k)); err != nil {
171 0 : return err
172 0 : }
173 : }
174 : }
175 1 : return w.Close()
176 : }
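// Hedged usage sketch (not part of the original file): building the plain
// hamlet fixture into an in-memory filesystem. The function name
// buildDefaultHamletSST and the "h.sst" path are illustrative only; the nil
// filter policy and nil comparer mirror the values passed by Build for
// fixtures that set neither a filter nor UseFixtureComparer.
func buildDefaultHamletSST() error {
	fs := vfs.NewMem()
	return buildHamletTestSST(
		fs, "h.sst",
		block.SnappyCompression,
		nil,              // no filter policy
		base.TableFilter, // the filter type used by all fixtures below
		nil,              // no custom comparer
		fixtureBlockSize,
		fixtureDefaultIndexBlockSize,
	)
}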
177 :
178 : // TestFixtureInfo contains all metadata necessary to generate a test sstable.
179 : type TestFixtureInfo struct {
180 : Filename string
181 : Compression block.Compression
182 : FullKeyFilter bool
183 : PrefixFilter bool
184 : IndexBlockSize int
185 : UseFixtureComparer bool
186 : }
187 :
188 : // TestFixtures contains all metadata necessary to generate the test SSTs.
189 : var TestFixtures = []TestFixtureInfo{
190 : {
191 : Filename: "h.sst",
192 : Compression: block.SnappyCompression,
193 : FullKeyFilter: false,
194 : PrefixFilter: false,
195 : IndexBlockSize: fixtureDefaultIndexBlockSize,
196 : UseFixtureComparer: false,
197 : },
198 : {
199 : Filename: "h.no-compression.sst",
200 : Compression: block.NoCompression,
201 : FullKeyFilter: false,
202 : PrefixFilter: false,
203 : IndexBlockSize: fixtureDefaultIndexBlockSize,
204 : UseFixtureComparer: false,
205 : },
206 : {
207 : Filename: "h.table-bloom.sst",
208 : Compression: block.SnappyCompression,
209 : FullKeyFilter: true,
210 : PrefixFilter: false,
211 : IndexBlockSize: fixtureDefaultIndexBlockSize,
212 : UseFixtureComparer: false,
213 : },
214 : {
215 : Filename: "h.table-bloom.no-compression.sst",
216 : Compression: block.NoCompression,
217 : FullKeyFilter: true,
218 : PrefixFilter: false,
219 : IndexBlockSize: fixtureDefaultIndexBlockSize,
220 : UseFixtureComparer: false,
221 : },
222 : {
223 : Filename: "h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
224 : Compression: block.NoCompression,
225 : FullKeyFilter: false,
226 : PrefixFilter: true,
227 : IndexBlockSize: fixtureDefaultIndexBlockSize,
228 : UseFixtureComparer: true,
229 : },
230 : {
231 : Filename: "h.no-compression.two_level_index.sst",
232 : Compression: block.NoCompression,
233 : FullKeyFilter: false,
234 : PrefixFilter: false,
235 : IndexBlockSize: fixtureSmallIndexBlockSize,
236 : UseFixtureComparer: false,
237 : },
238 : {
239 : Filename: "h.zstd-compression.sst",
240 : Compression: block.ZstdCompression,
241 : FullKeyFilter: false,
242 : PrefixFilter: false,
243 : IndexBlockSize: fixtureDefaultIndexBlockSize,
244 : UseFixtureComparer: false,
245 : },
246 : }
247 :
248 : // Build creates an sst file for the given fixture.
249 1 : func (tf TestFixtureInfo) Build(fs vfs.FS, filename string) error {
250 1 : var fp base.FilterPolicy
251 1 : if tf.FullKeyFilter || tf.PrefixFilter {
252 1 : fp = bloom.FilterPolicy(10)
253 1 : }
254 1 : var comparer *Comparer
255 1 : if tf.UseFixtureComparer {
256 1 : comparer = fixtureComparer
257 1 : }
258 :
259 1 : return buildHamletTestSST(
260 1 : fs, filename, tf.Compression, fp, base.TableFilter,
261 1 : comparer,
262 1 : fixtureBlockSize,
263 1 : tf.IndexBlockSize,
264 1 : )
265 : }
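// Hedged sketch (assumption, not in the original file): regenerating every
// fixture in TestFixtures into a directory on the real filesystem. The
// function name buildAllTestFixtures and the use of vfs.Default are
// illustrative only.
func buildAllTestFixtures(dir string) error {
	fs := vfs.Default
	for _, tf := range TestFixtures {
		if err := tf.Build(fs, fs.PathJoin(dir, tf.Filename)); err != nil {
			return fmt.Errorf("building %s: %w", tf.Filename, err)
		}
	}
	return nil
}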
266 :
267 : const fixtureDefaultIndexBlockSize = math.MaxInt32
268 : const fixtureSmallIndexBlockSize = 128
269 : const fixtureBlockSize = 2048
270 : const fixtureFormat = TableFormatPebblev1
271 :
272 2 : var fixtureComparer = func() *Comparer {
273 2 : c := *base.DefaultComparer
274 2 : // NB: this is named as such only to match the built-in RocksDB comparer.
275 2 : c.Name = "leveldb.BytewiseComparator"
276 2 : c.Split = func(a []byte) int {
277 1 : // TODO(tbg): It's difficult to provide a more meaningful prefix extractor
278 1 : // on the given dataset since it's not MVCC, and so it's impossible to come
279 1 : // up with a sensible one. We need to add a better dataset and use that
280 1 : // instead to get confidence that prefix extractors are working as intended.
281 1 : return len(a)
282 1 : }
283 2 : return &c
284 : }()