/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 * Modifications copyright (C) 2017 Andy Kimball and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
Adapted from RocksDB inline skiplist.

Key differences:
- No optimization for sequential inserts (no "prev").
- No custom comparator.
- Support overwrites. This requires care when we see the same key when inserting.
  For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
  there is no need for values. We don't intend to support versioning. In-place updates of values
  would be more efficient.
- We discard all non-concurrent code.
- We do not support Splices. This simplifies the code a lot.
- No AllocateNode or other pointer arithmetic.
- We combine the findLessThan, findGreaterOrEqual, etc into one function.
*/

/*
Further adapted from Badger: https://github.com/dgraph-io/badger.

Key differences:
- Support for previous pointers - doubly linked lists. Note that it's up to higher
  level code to deal with the intermediate state that occurs during insertion,
  where node A is linked to node B, but node B is not yet linked back to node A.
- Iterator includes mutator functions.
*/

/*
Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl

Key differences:
- Removed support for deletion.
- Removed support for concurrency.
- External storage of keys.
- Node storage grows to an arbitrary size.
*/

package batchskl // import "github.com/cockroachdb/pebble/internal/batchskl"

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"
	"math/rand/v2"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/constants"
)

const (
	maxHeight    = 20
	maxNodeSize  = uint64(unsafe.Sizeof(node{}))
	linksSize    = uint64(unsafe.Sizeof(links{}))
	maxNodesSize = constants.MaxUint32OrInt
)

var (
	// ErrExists indicates that a duplicate record was inserted. This should never
	// happen for normal usage of batchskl as every key should have a unique
	// sequence number.
	ErrExists = errors.New("record with this key already exists")

	// ErrTooManyRecords is a sentinel error returned when the size of the raw
	// nodes slice exceeds the maximum allowed size (currently 1 << 32 - 1). This
	// corresponds to ~117 M skiplist entries.
	ErrTooManyRecords = errors.New("too many records")
)

type links struct {
	next uint32
	prev uint32
}

type node struct {
	// The offset of the start of the record in the storage.
	offset uint32
	// The offset of the start and end of the key in storage.
	keyStart uint32
	keyEnd   uint32
	// A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key
	// during seek operations. The key retrieval can be expensive purely due to
	// cache misses, while the abbreviatedKey stored here will be in the same
	// cache line as the key offsets and the links, making accessing and
	// comparing against it almost free.
	abbreviatedKey uint64
	// Most nodes do not need to use the full height of the link tower, since the
	// probability of each successive level decreases exponentially. Because
	// these elements are never accessed, they do not need to be allocated.
	// Therefore, when a node is allocated, its memory footprint is deliberately
	// truncated to not include unneeded link elements.
	links [maxHeight]links
}
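
// To illustrate the abbreviatedKey optimization above: an abbreviation is any
// 8-byte value whose uint64 ordering agrees with the full key ordering
// whenever the abbreviations differ. A minimal sketch of such a function,
// assuming plain bytewise key ordering (the real implementation is supplied
// by the caller via base.AbbreviatedKey):
//
//	func abbreviate(key []byte) uint64 {
//		var v uint64
//		for i := 0; i < 8 && i < len(key); i++ {
//			v |= uint64(key[i]) << (56 - 8*i)
//		}
//		return v
//	}
//
// If abbreviate(a) < abbreviate(b) then bytes.Compare(a, b) < 0, so a seek can
// usually order two keys from the node alone, falling back to the full key
// comparison only when the abbreviations are equal.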

// Skiplist is a fast, non-concurrent skiplist implementation that supports
// forward and backward iteration. See arenaskl.Skiplist for a concurrent
// skiplist. Keys and values are stored externally from the skiplist via the
// Storage interface. Deletion is not supported. Instead, higher-level code is
// expected to perform deletion via tombstones and needs to process those
// tombstones appropriately during retrieval operations.
type Skiplist struct {
	storage        *[]byte
	cmp            base.Compare
	abbreviatedKey base.AbbreviatedKey
	nodes          []byte
	head           uint32
	tail           uint32
	height         uint32 // Current height: 1 <= height <= maxHeight
	rand           rand.PCG
}

var (
	probabilities [maxHeight]uint32
)

func init() {
	const pValue = 1 / math.E

	// Precompute the skiplist probabilities so that only a single random number
	// needs to be generated and so that the optimal pvalue can be used (inverse
	// of Euler's number).
	p := float64(1.0)
	for i := 0; i < maxHeight; i++ {
		probabilities[i] = uint32(float64(math.MaxUint32) * p)
		p *= pValue
	}
}
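
// With pValue = 1/e the table above works out to probabilities[0] =
// MaxUint32, probabilities[1] ≈ 0.368*MaxUint32, probabilities[2] ≈
// 0.135*MaxUint32, and so on. randomHeight (below) draws one uniform uint32
// and walks this table, so P(height >= k) = (1/e)^(k-1) and the expected
// tower height is 1/(1-1/e) ≈ 1.58 links per node.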

// NewSkiplist constructs and initializes a new, empty skiplist.
func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist {
	s := &Skiplist{}
	s.Init(storage, cmp, abbreviatedKey)
	return s
}

// Reset the fields in the skiplist for reuse.
func (s *Skiplist) Reset() {
	*s = Skiplist{
		nodes:  s.nodes[:0],
		height: 1,
	}
	const batchMaxRetainedSize = 1 << 20 // 1 MB
	if cap(s.nodes) > batchMaxRetainedSize {
		s.nodes = nil
	}
}

// Init the skiplist to empty and re-initialize.
func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) {
	*s = Skiplist{
		storage:        storage,
		cmp:            cmp,
		abbreviatedKey: abbreviatedKey,
		nodes:          s.nodes[:0],
		height:         1,
	}
	s.rand.Seed(0, rand.Uint64())

	const initBufSize = 256
	if cap(s.nodes) < initBufSize {
		s.nodes = make([]byte, 0, initBufSize)
	}

	// Allocate head and tail nodes. While allocating a new node can fail, in the
	// context of initializing the skiplist we consider it unrecoverable.
	var err error
	s.head, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}
	s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}

	// Link all head/tail levels together.
	headNode := s.node(s.head)
	tailNode := s.node(s.tail)
	for i := uint32(0); i < maxHeight; i++ {
		headNode.links[i].next = s.tail
		tailNode.links[i].prev = s.head
	}
}
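
// The head and tail nodes act as full-height sentinels: every search starts
// from head and every level terminates at tail, so the traversal code never
// needs to check for a missing link.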

// Add adds a new key to the skiplist if it does not yet exist. If the record
// already exists, then Add returns ErrExists.
func (s *Skiplist) Add(keyOffset uint32) error {
	data := (*s.storage)[keyOffset+1:]
	v, n := binary.Uvarint(data)
	if n <= 0 {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	data = data[n:]
	if v > uint64(len(data)) {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	keyStart := 1 + keyOffset + uint32(n)
	keyEnd := keyStart + uint32(v)
	key := data[:v]
	abbreviatedKey := s.abbreviatedKey(key)

	// spl holds the list of next and previous links for each level in the
	// skiplist indicating where the new node will be inserted.
	var spl [maxHeight]splice

	// Fast-path for in-order insertion of keys: compare the new key against the
	// last key.
	prev := s.getPrev(s.tail, 0)
	if prevNode := s.node(prev); prev == s.head ||
		abbreviatedKey > prevNode.abbreviatedKey ||
		(abbreviatedKey == prevNode.abbreviatedKey &&
			s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) {
		for level := uint32(0); level < s.height; level++ {
			spl[level].prev = s.getPrev(s.tail, level)
			spl[level].next = s.tail
		}
	} else {
		s.findSplice(key, abbreviatedKey, &spl)
	}

	height := s.randomHeight()
	// Increase s.height as necessary.
	for ; s.height < height; s.height++ {
		spl[s.height].next = s.tail
		spl[s.height].prev = s.head
	}

	// We always insert from the base level and up. After a node is added at the
	// base level, we cannot create it at a higher level first, because a search
	// at that level would have already discovered the node at the base level.
	nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey)
	if err != nil {
		return err
	}
	newNode := s.node(nd)
	for level := uint32(0); level < height; level++ {
		next := spl[level].next
		prev := spl[level].prev
		newNode.links[level].next = next
		newNode.links[level].prev = prev
		s.node(next).links[level].prev = nd
		s.node(prev).links[level].next = nd
	}

	return nil
}
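
// As a minimal sketch of driving Add: a caller appends batch entries to the
// shared storage slice in the layout Add decodes above (one kind byte, a
// uvarint key length, then the key bytes) and hands Add the entry's starting
// offset. The comparator and abbreviator below are illustrative stand-ins for
// the real base.Compare and base.AbbreviatedKey implementations:
//
//	var storage []byte
//	skl := NewSkiplist(&storage, bytes.Compare, abbreviate)
//	add := func(kind byte, key []byte) error {
//		offset := uint32(len(storage))
//		storage = append(storage, kind)
//		storage = binary.AppendUvarint(storage, uint64(len(key)))
//		storage = append(storage, key...)
//		return skl.Add(offset)
//	}
//	_ = add(0x01, []byte("hello")) // 0x01 stands in for a real InternalKeyKind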

// NewIter returns a new Iterator object. The lower and upper bound parameters
// control the range of keys the iterator will return. Specifying nil for the
// lower or upper bound disables the check for that boundary. Note that the
// lower bound is not checked on {SeekGE,First} and the upper bound is not
// checked on {SeekLT,Last}. The user is expected to perform that check. Note
// that it is safe for an iterator to be copied by value.
func (s *Skiplist) NewIter(lower, upper []byte) Iterator {
	return Iterator{list: s, lower: lower, upper: upper}
}

func (s *Skiplist) newNode(
	height,
	offset, keyStart, keyEnd uint32, abbreviatedKey uint64,
) (uint32, error) {
	if height < 1 || height > maxHeight {
		panic("height cannot be less than one or greater than the max height")
	}

	unusedSize := uint64(maxHeight-int(height)) * linksSize
	nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize))
	if err != nil {
		return 0, err
	}
	nd := s.node(nodeOffset)

	nd.offset = offset
	nd.keyStart = keyStart
	nd.keyEnd = keyEnd
	nd.abbreviatedKey = abbreviatedKey
	return nodeOffset, nil
}
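
// To make the truncation above concrete: on a typical 64-bit layout the node
// struct occupies 184 bytes (three uint32 fields, 4 bytes of padding before
// the uint64 abbreviatedKey, and maxHeight*8 = 160 bytes of links), so a
// height-1 node allocates 184 - 19*8 = 32 bytes and a full-height node 184.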

func (s *Skiplist) alloc(size uint32) (uint32, error) {
	offset := uint64(len(s.nodes))

	// We only have a need for memory up to offset + size, but we never want
	// to allocate a node whose tail points into unallocated memory.
	minAllocSize := offset + maxNodeSize
	if uint64(cap(s.nodes)) < minAllocSize {
		allocSize := uint64(cap(s.nodes)) * 2
		if allocSize < minAllocSize {
			allocSize = minAllocSize
		}
		// Cap the allocation at the max allowed size to avoid wasted capacity.
		if allocSize > maxNodesSize {
			// The new record may still not fit within the allocation, in which case
			// we return early with an error. This avoids the panic below when we
			// resize the slice. It also avoids the allocation and copy.
			if uint64(offset)+uint64(size) > maxNodesSize {
				return 0, errors.Wrapf(ErrTooManyRecords,
					"alloc of new record (size=%d) would overflow uint32 (current size=%d)",
					uint64(offset)+uint64(size), offset,
				)
			}
			allocSize = maxNodesSize
		}
		tmp := make([]byte, len(s.nodes), allocSize)
		copy(tmp, s.nodes)
		s.nodes = tmp
	}

	newSize := uint32(offset) + size
	s.nodes = s.nodes[:newSize]
	return uint32(offset), nil
}
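
// node returns a pointer to the node at the given offset, reinterpreting the
// raw bytes in s.nodes. Because link towers are truncated, a short node's
// struct type extends past the bytes it actually allocated; alloc compensates
// by keeping cap(s.nodes) at least maxNodeSize beyond every node offset, so
// this cast never refers to memory beyond the backing array.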
func (s *Skiplist) node(offset uint32) *node {
	return (*node)(unsafe.Pointer(&s.nodes[offset]))
}

func (s *Skiplist) randomHeight() uint32 {
	rnd := uint32(s.rand.Uint64())
	h := uint32(1)
	for h < maxHeight && rnd <= probabilities[h] {
		h++
	}
	return h
}

func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) {
	prev := s.head

	for level := s.height - 1; ; level-- {
		// The code in this loop is the same as findSpliceForLevel(). For some
		// reason, calling findSpliceForLevel() here is much, much slower than the
		// inlined code below. The excess time is also caught up in the final
		// return statement, which makes little sense. Revisit in go1.14 or later
		// if inlining improves.

		next := s.getNext(prev, level)
		for next != s.tail {
			// Assume prev.key < key.
			nextNode := s.node(next)
			nextAbbreviatedKey := nextNode.abbreviatedKey
			if abbreviatedKey < nextAbbreviatedKey {
				// We are done for this level, since prev.key < key < next.key.
				break
			}
			if abbreviatedKey == nextAbbreviatedKey {
				if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
					// We are done for this level, since prev.key < key <= next.key.
					break
				}
			}

			// Keep moving right on this level.
			prev = next
			next = nextNode.links[level].next
		}

		spl[level].prev = prev
		spl[level].next = next
		if level == 0 {
			break
		}
	}
}
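
// On return, spl[level].prev and spl[level].next bracket the insertion point
// at every level: prev.key < key <= next.key, with the head and tail
// sentinels standing in for the missing extremes.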

func (s *Skiplist) findSpliceForLevel(
	key []byte, abbreviatedKey uint64, level, start uint32,
) (prev, next uint32) {
	prev = start
	next = s.getNext(prev, level)

	for next != s.tail {
		// Assume prev.key < key.
		nextNode := s.node(next)
		nextAbbreviatedKey := nextNode.abbreviatedKey
		if abbreviatedKey < nextAbbreviatedKey {
			// We are done for this level, since prev.key < key < next.key.
			break
		}
		if abbreviatedKey == nextAbbreviatedKey {
			if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
				// We are done for this level, since prev.key < key <= next.key.
				break
			}
		}

		// Keep moving right on this level.
		prev = next
		next = nextNode.links[level].next
	}

	return
}

func (s *Skiplist) getKey(nd uint32) base.InternalKey {
	n := s.node(nd)
	kind := base.InternalKeyKind((*s.storage)[n.offset])
	key := (*s.storage)[n.keyStart:n.keyEnd]
	return base.MakeInternalKey(key, base.SeqNum(n.offset)|base.SeqNumBatchBit, kind)
}
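
// getKey reconstructs the InternalKey for a node: the kind byte is read from
// the start of the record, and the sequence number embeds the record's offset
// within the batch, tagged with SeqNumBatchBit to mark it as a batch-local
// entry rather than a committed sequence number.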

func (s *Skiplist) getNext(nd, h uint32) uint32 {
	return s.node(nd).links[h].next
}

func (s *Skiplist) getPrev(nd, h uint32) uint32 {
	return s.node(nd).links[h].prev
}

func (s *Skiplist) debug() string {
	var buf bytes.Buffer
	for level := uint32(0); level < s.height; level++ {
		var count int
		for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) {
			count++
		}
		fmt.Fprintf(&buf, "%d: %d\n", level, count)
	}
	return buf.String()
}

// Silence unused warning.
var _ = (*Skiplist).debug