Line data Source code
1 : // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package manifest
6 :
7 : import (
8 : "bytes"
9 : stdcmp "cmp"
10 : "fmt"
11 : "strings"
12 : "sync"
13 : "sync/atomic"
14 : "unsafe"
15 :
16 : "github.com/cockroachdb/errors"
17 : "github.com/cockroachdb/pebble/internal/base"
18 : "github.com/cockroachdb/pebble/internal/invariants"
19 : )
20 :
21 : type btreeCmp func(*FileMetadata, *FileMetadata) int
22 :
23 1 : func btreeCmpSeqNum(a, b *FileMetadata) int {
24 1 : return a.cmpSeqNum(b)
25 1 : }
26 :
27 1 : func btreeCmpSmallestKey(cmp Compare) btreeCmp {
28 1 : return func(a, b *FileMetadata) int {
29 1 : return a.cmpSmallestKey(b, cmp)
30 1 : }
31 : }
32 :
33 : // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a
34 : // specific ordering of FileMetadata within the tree. It's typically used to
35 : // test consistency checking code that needs to construct a malformed B-Tree.
36 0 : func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp {
37 0 : m := map[*FileMetadata]int{}
38 0 : for i, f := range files {
39 0 : m[f] = i
40 0 : }
41 0 : return func(a, b *FileMetadata) int {
42 0 : ai, aok := m[a]
43 0 : bi, bok := m[b]
44 0 : if !aok || !bok {
45 0 : panic("btreeCmpSliceOrder called with unknown files")
46 : }
47 0 : return stdcmp.Compare(ai, bi)
48 : }
49 : }
50 :
51 : const (
52 : degree = 16
53 : maxItems = 2*degree - 1
54 : minItems = degree - 1
55 : )
56 :
57 : type leafNode struct {
58 : ref atomic.Int32
59 : count int16
60 : leaf bool
61 : // subtreeCount holds the count of files in the entire subtree formed by
62 : // this node. For leaf nodes, subtreeCount is always equal to count. For
63 : // non-leaf nodes, it's the sum of count plus all the children's
64 : // subtreeCounts.
65 : //
66 : // NB: We could move this field to the end of the node struct, since leaf =>
67 : // count=subtreeCount, however the unsafe casting [leafToNode] performs make
68 : // it risky and cumbersome.
69 : subtreeCount int
70 : items [maxItems]*FileMetadata
71 : // annot contains one annotation per annotator, merged over the entire
72 : // node's files (and all descendants for non-leaf nodes). Protected by
73 : // annotMu.
74 : annotMu sync.RWMutex
75 : annot []annotation
76 : }
77 :
78 : type node struct {
79 : leafNode
80 : children [maxItems + 1]*node
81 : }
82 :
83 : //go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct.
84 1 : func leafToNode(ln *leafNode) *node {
85 1 : return (*node)(unsafe.Pointer(ln))
86 1 : }
87 :
88 1 : func newLeafNode() *node {
89 1 : n := leafToNode(new(leafNode))
90 1 : n.leaf = true
91 1 : n.ref.Store(1)
92 1 : return n
93 1 : }
94 :
95 1 : func newNode() *node {
96 1 : n := new(node)
97 1 : n.ref.Store(1)
98 1 : return n
99 1 : }
100 :
101 : // mut creates and returns a mutable node reference. If the node is not shared
102 : // with any other trees then it can be modified in place. Otherwise, it must be
103 : // cloned to ensure unique ownership. In this way, we enforce a copy-on-write
104 : // policy which transparently incorporates the idea of local mutations, like
105 : // Clojure's transients or Haskell's ST monad, where nodes are only copied
106 : // during the first time that they are modified between Clone operations.
107 : //
108 : // When a node is cloned, the provided pointer will be redirected to the new
109 : // mutable node.
110 1 : func mut(n **node) *node {
111 1 : if (*n).ref.Load() == 1 {
112 1 : // Exclusive ownership. Can mutate in place. Still need to lock out
113 1 : // any concurrent writes to annot.
114 1 : (*n).annotMu.Lock()
115 1 : defer (*n).annotMu.Unlock()
116 1 :
117 1 : // Whenever a node will be mutated, reset its annotations to be marked
118 1 : // as uncached. This ensures any future calls to (*node).annotation
119 1 : // will recompute annotations on the modified subtree.
120 1 : for i := range (*n).annot {
121 0 : (*n).annot[i].valid.Store(false)
122 0 : }
123 1 : return *n
124 : }
125 : // If we do not have unique ownership over the node then we
126 : // clone it to gain unique ownership. After doing so, we can
127 : // release our reference to the old node. We pass recursive
128 : // as true because even though we just observed the node's
129 : // reference count to be greater than 1, we might be racing
130 : // with another call to decRef on this node.
131 1 : c := (*n).clone()
132 1 : (*n).decRef(true /* contentsToo */, nil)
133 1 : *n = c
134 1 : // NB: We don't need to clear annotations, because (*node).clone does not
135 1 : // copy them.
136 1 : return *n
137 : }
138 :
139 : // incRef acquires a reference to the node.
140 1 : func (n *node) incRef() {
141 1 : n.ref.Add(1)
142 1 : }
143 :
144 : // decRef releases a reference to the node. If requested, the method will unref
145 : // its items and recurse into child nodes and decrease their refcounts as well.
146 : // Some internal codepaths that manually copy the node's items or children to
147 : // new nodes pass contentsToo=false to preserve existing reference counts during
148 : // operations that should yield a net-zero change to descendant refcounts.
149 : // When a node is released, its contained files are dereferenced.
150 1 : func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) {
151 1 : if n.ref.Add(-1) > 0 {
152 1 : // Other references remain. Can't free.
153 1 : return
154 1 : }
155 :
156 : // Dereference the node's metadata and release child references if
157 : // requested. Some internal callers may not want to propagate the deref
158 : // because they're manually copying the filemetadata and children to other
159 : // nodes, and they want to preserve the existing reference count.
160 1 : if contentsToo {
161 1 : for _, f := range n.items[:n.count] {
162 1 : if f.FileBacking.Unref() == 0 {
163 1 : // There are two sources of node dereferences: tree mutations
164 1 : // and Version dereferences. Files should only be made obsolete
165 1 : // during Version dereferences, during which `obsolete` will be
166 1 : // non-nil.
167 1 : if obsolete == nil {
168 0 : panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum))
169 : }
170 : // Reference counting is performed on the FileBacking. In the case
171 : // of a virtual sstable, this reference counting is performed on
172 : // a FileBacking which is shared by every single virtual sstable
173 : // with the same backing sstable. If the reference count hits 0,
174 : // then we know that the FileBacking won't be required by any
175 : // sstable in Pebble, and that the backing sstable can be deleted.
176 1 : *obsolete = append(*obsolete, f.FileBacking)
177 : }
178 : }
179 1 : if !n.leaf {
180 1 : for i := int16(0); i <= n.count; i++ {
181 1 : n.children[i].decRef(true /* contentsToo */, obsolete)
182 1 : }
183 : }
184 : }
185 : }
186 :
187 : // clone creates a clone of the receiver with a single reference count.
188 1 : func (n *node) clone() *node {
189 1 : var c *node
190 1 : if n.leaf {
191 1 : c = newLeafNode()
192 1 : } else {
193 1 : c = newNode()
194 1 : }
195 : // NB: copy field-by-field without touching n.ref to avoid
196 : // triggering the race detector and looking like a data race.
197 1 : c.count = n.count
198 1 : c.items = n.items
199 1 : c.subtreeCount = n.subtreeCount
200 1 : // Increase the refcount of each contained item.
201 1 : for _, f := range n.items[:n.count] {
202 1 : f.FileBacking.Ref()
203 1 : }
204 1 : if !c.leaf {
205 1 : // Copy children and increase each refcount.
206 1 : c.children = n.children
207 1 : for i := int16(0); i <= c.count; i++ {
208 1 : c.children[i].incRef()
209 1 : }
210 : }
211 1 : return c
212 : }
213 :
214 : // insertAt inserts the provided file and node at the provided index. This
215 : // function is for use only as a helper function for internal B-Tree code.
216 : // Clients should not invoke it directly.
217 1 : func (n *node) insertAt(index int, item *FileMetadata, nd *node) {
218 1 : if index < int(n.count) {
219 1 : copy(n.items[index+1:n.count+1], n.items[index:n.count])
220 1 : if !n.leaf {
221 1 : copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
222 1 : }
223 : }
224 1 : n.items[index] = item
225 1 : if !n.leaf {
226 1 : n.children[index+1] = nd
227 1 : }
228 1 : n.count++
229 : }
230 :
231 : // pushBack inserts the provided file and node at the tail of the node's items.
232 : // This function is for use only as a helper function for internal B-Tree code.
233 : // Clients should not invoke it directly.
234 1 : func (n *node) pushBack(item *FileMetadata, nd *node) {
235 1 : n.items[n.count] = item
236 1 : if !n.leaf {
237 1 : n.children[n.count+1] = nd
238 1 : }
239 1 : n.count++
240 : }
241 :
242 : // pushFront inserts the provided file and node at the head of the
243 : // node's items. This function is for use only as a helper function for internal B-Tree
244 : // code. Clients should not invoke it directly.
245 1 : func (n *node) pushFront(item *FileMetadata, nd *node) {
246 1 : if !n.leaf {
247 1 : copy(n.children[1:n.count+2], n.children[:n.count+1])
248 1 : n.children[0] = nd
249 1 : }
250 1 : copy(n.items[1:n.count+1], n.items[:n.count])
251 1 : n.items[0] = item
252 1 : n.count++
253 : }
254 :
255 : // removeAt removes a value at a given index, pulling all subsequent values
256 : // back. This function is for use only as a helper function for internal B-Tree
257 : // code. Clients should not invoke it directly.
258 1 : func (n *node) removeAt(index int) (*FileMetadata, *node) {
259 1 : var child *node
260 1 : if !n.leaf {
261 1 : child = n.children[index+1]
262 1 : copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
263 1 : n.children[n.count] = nil
264 1 : }
265 1 : n.count--
266 1 : out := n.items[index]
267 1 : copy(n.items[index:n.count], n.items[index+1:n.count+1])
268 1 : n.items[n.count] = nil
269 1 : return out, child
270 : }
271 :
272 : // popBack removes and returns the last element in the list. This function is
273 : // for use only as a helper function for internal B-Tree code. Clients should
274 : // not invoke it directly.
275 1 : func (n *node) popBack() (*FileMetadata, *node) {
276 1 : n.count--
277 1 : out := n.items[n.count]
278 1 : n.items[n.count] = nil
279 1 : if n.leaf {
280 1 : return out, nil
281 1 : }
282 1 : child := n.children[n.count+1]
283 1 : n.children[n.count+1] = nil
284 1 : return out, child
285 : }
286 :
287 : // popFront removes and returns the first element in the list. This function is
288 : // for use only as a helper function for internal B-Tree code. Clients should
289 : // not invoke it directly.
290 1 : func (n *node) popFront() (*FileMetadata, *node) {
291 1 : n.count--
292 1 : var child *node
293 1 : if !n.leaf {
294 1 : child = n.children[0]
295 1 : copy(n.children[:n.count+1], n.children[1:n.count+2])
296 1 : n.children[n.count+1] = nil
297 1 : }
298 1 : out := n.items[0]
299 1 : copy(n.items[:n.count], n.items[1:n.count+1])
300 1 : n.items[n.count] = nil
301 1 : return out, child
302 : }
303 :
304 : // find returns the index where the given item should be inserted into this
305 : // list. 'found' is true if the item already exists in the list at the given
306 : // index.
307 : //
308 : // This function is for use only as a helper function for internal B-Tree code.
309 : // Clients should not invoke it directly.
310 1 : func (n *node) find(bcmp btreeCmp, item *FileMetadata) (index int, found bool) {
311 1 : // Logic copied from sort.Search. Inlining this gave
312 1 : // an 11% speedup on BenchmarkBTreeDeleteInsert.
313 1 : i, j := 0, int(n.count)
314 1 : for i < j {
315 1 : h := int(uint(i+j) >> 1) // avoid overflow when computing h
316 1 : // i ≤ h < j
317 1 : v := bcmp(item, n.items[h])
318 1 : if v == 0 {
319 1 : return h, true
320 1 : } else if v > 0 {
321 1 : i = h + 1
322 1 : } else {
323 1 : j = h
324 1 : }
325 : }
326 1 : return i, false
327 : }
328 :
329 : // split splits the given node at the given index. The current node shrinks,
330 : // and this function returns the item that existed at that index and a new
331 : // node containing all items/children after it.
332 : //
333 : // split is called when we want to perform a transformation like the one
334 : // depicted in the following diagram.
335 : //
336 : // Before:
337 : // +-----------+
338 : // n *node | x y z |
339 : // +--/-/-\-\--+
340 : //
341 : // After:
342 : // +-----------+
343 : // | y | n's parent
344 : // +----/-\----+
345 : // / \
346 : // v v
347 : // +-----------+ +-----------+
348 : // n *node | x | | z | next *node
349 : // +-----------+ +-----------+
350 : //
351 : // split does not perform the complete transformation; the caller is responsible
352 : // for updating the parent appropriately. split splits `n` into two nodes, `n`
353 : // and `next`, returning `next` and the file that separates them. In the diagram
354 : // above, `n.split` removes y and z from `n`, returning y in the first return
355 : // value and `next` in the second return value. The caller is responsible for
356 : // updating n's parent to now contain `y` as the separator between nodes `n` and
357 : // `next`.
358 : //
359 : // This function is for use only as a helper function for internal B-Tree code.
360 : // Clients should not invoke it directly.
361 1 : func (n *node) split(i int) (*FileMetadata, *node) {
362 1 : out := n.items[i]
363 1 : var next *node
364 1 : if n.leaf {
365 1 : next = newLeafNode()
366 1 : } else {
367 1 : next = newNode()
368 1 : }
369 1 : next.count = n.count - int16(i+1)
370 1 : copy(next.items[:], n.items[i+1:n.count])
371 1 : for j := int16(i); j < n.count; j++ {
372 1 : n.items[j] = nil
373 1 : }
374 1 : if !n.leaf {
375 1 : copy(next.children[:], n.children[i+1:n.count+1])
376 1 : descendantsMoved := 0
377 1 : for j := int16(i + 1); j <= n.count; j++ {
378 1 : descendantsMoved += n.children[j].subtreeCount
379 1 : n.children[j] = nil
380 1 : }
381 1 : n.subtreeCount -= descendantsMoved
382 1 : next.subtreeCount += descendantsMoved
383 : }
384 1 : n.count = int16(i)
385 1 : // NB: We subtract one more than `next.count` from n's subtreeCount because
386 1 : // the item at index `i` was removed from `n.items`. We'll return the item
387 1 : // at index `i`, and the caller is responsible for updating the subtree
388 1 : // count of whichever node adopts it.
389 1 : n.subtreeCount -= int(next.count) + 1
390 1 : next.subtreeCount += int(next.count)
391 1 : return out, next
392 : }
393 :
394 : // Insert inserts a item into the subtree rooted at this node, making sure no
395 : // nodes in the subtree exceed maxItems items.
396 1 : func (n *node) Insert(bcmp btreeCmp, item *FileMetadata) error {
397 1 : i, found := n.find(bcmp, item)
398 1 : if found {
399 0 : // cmp provides a total ordering of the files within a level.
400 0 : // If we're inserting a metadata that's equal to an existing item
401 0 : // in the tree, we're inserting a file into a level twice.
402 0 : return errors.Errorf("files %s and %s collided on sort keys",
403 0 : errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
404 0 : }
405 1 : if n.leaf {
406 1 : n.insertAt(i, item, nil)
407 1 : n.subtreeCount++
408 1 : return nil
409 1 : }
410 1 : if n.children[i].count >= maxItems {
411 1 : splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
412 1 : n.insertAt(i, splitLa, splitNode)
413 1 :
414 1 : switch cmp := bcmp(item, n.items[i]); {
415 1 : case cmp < 0:
416 : // no change, we want first split node
417 1 : case cmp > 0:
418 1 : i++ // we want second split node
419 0 : default:
420 0 : // cmp provides a total ordering of the files within a level.
421 0 : // If we're inserting a metadata that's equal to an existing item
422 0 : // in the tree, we're inserting a file into a level twice.
423 0 : return errors.Errorf("files %s and %s collided on sort keys",
424 0 : errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
425 : }
426 : }
427 :
428 1 : err := mut(&n.children[i]).Insert(bcmp, item)
429 1 : if err == nil {
430 1 : n.subtreeCount++
431 1 : }
432 1 : return err
433 : }
434 :
435 : // removeMax removes and returns the maximum item from the subtree rooted at
436 : // this node. This function is for use only as a helper function for internal
437 : // B-Tree code. Clients should not invoke it directly.
438 1 : func (n *node) removeMax() *FileMetadata {
439 1 : if n.leaf {
440 1 : n.count--
441 1 : n.subtreeCount--
442 1 : out := n.items[n.count]
443 1 : n.items[n.count] = nil
444 1 : return out
445 1 : }
446 1 : child := mut(&n.children[n.count])
447 1 : if child.count <= minItems {
448 1 : n.rebalanceOrMerge(int(n.count))
449 1 : return n.removeMax()
450 1 : }
451 1 : n.subtreeCount--
452 1 : return child.removeMax()
453 : }
454 :
455 : // Remove removes a item from the subtree rooted at this node. Returns
456 : // the item that was removed or nil if no matching item was found.
457 1 : func (n *node) Remove(bcmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
458 1 : i, found := n.find(bcmp, item)
459 1 : if n.leaf {
460 1 : if found {
461 1 : out, _ = n.removeAt(i)
462 1 : n.subtreeCount--
463 1 : return out
464 1 : }
465 0 : return nil
466 : }
467 1 : if n.children[i].count <= minItems {
468 1 : // Child not large enough to remove from.
469 1 : n.rebalanceOrMerge(i)
470 1 : return n.Remove(bcmp, item)
471 1 : }
472 1 : child := mut(&n.children[i])
473 1 : if found {
474 1 : // Replace the item being removed with the max item in our left child.
475 1 : out = n.items[i]
476 1 : n.items[i] = child.removeMax()
477 1 : n.subtreeCount--
478 1 : return out
479 1 : }
480 : // File is not in this node and child is large enough to remove from.
481 1 : out = child.Remove(bcmp, item)
482 1 : if out != nil {
483 1 : n.subtreeCount--
484 1 : }
485 1 : return out
486 : }
487 :
488 : // rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a
489 : // item from it while keeping it at or above minItems. This function is for use
490 : // only as a helper function for internal B-Tree code. Clients should not invoke
491 : // it directly.
492 1 : func (n *node) rebalanceOrMerge(i int) {
493 1 : switch {
494 1 : case i > 0 && n.children[i-1].count > minItems:
495 1 : // Rebalance from left sibling.
496 1 : //
497 1 : // +-----------+
498 1 : // | y |
499 1 : // +----/-\----+
500 1 : // / \
501 1 : // v v
502 1 : // +-----------+ +-----------+
503 1 : // | x | | |
504 1 : // +----------\+ +-----------+
505 1 : // \
506 1 : // v
507 1 : // a
508 1 : //
509 1 : // After:
510 1 : //
511 1 : // +-----------+
512 1 : // | x |
513 1 : // +----/-\----+
514 1 : // / \
515 1 : // v v
516 1 : // +-----------+ +-----------+
517 1 : // | | | y |
518 1 : // +-----------+ +/----------+
519 1 : // /
520 1 : // v
521 1 : // a
522 1 : //
523 1 : left := mut(&n.children[i-1])
524 1 : child := mut(&n.children[i])
525 1 : xLa, grandChild := left.popBack()
526 1 : yLa := n.items[i-1]
527 1 : child.pushFront(yLa, grandChild)
528 1 : n.items[i-1] = xLa
529 1 : child.subtreeCount++
530 1 : left.subtreeCount--
531 1 : if grandChild != nil {
532 1 : child.subtreeCount += grandChild.subtreeCount
533 1 : left.subtreeCount -= grandChild.subtreeCount
534 1 : }
535 :
536 1 : case i < int(n.count) && n.children[i+1].count > minItems:
537 1 : // Rebalance from right sibling.
538 1 : //
539 1 : // +-----------+
540 1 : // | y |
541 1 : // +----/-\----+
542 1 : // / \
543 1 : // v v
544 1 : // +-----------+ +-----------+
545 1 : // | | | x |
546 1 : // +-----------+ +/----------+
547 1 : // /
548 1 : // v
549 1 : // a
550 1 : //
551 1 : // After:
552 1 : //
553 1 : // +-----------+
554 1 : // | x |
555 1 : // +----/-\----+
556 1 : // / \
557 1 : // v v
558 1 : // +-----------+ +-----------+
559 1 : // | y | | |
560 1 : // +----------\+ +-----------+
561 1 : // \
562 1 : // v
563 1 : // a
564 1 : //
565 1 : right := mut(&n.children[i+1])
566 1 : child := mut(&n.children[i])
567 1 : xLa, grandChild := right.popFront()
568 1 : yLa := n.items[i]
569 1 : child.pushBack(yLa, grandChild)
570 1 : child.subtreeCount++
571 1 : right.subtreeCount--
572 1 : if grandChild != nil {
573 1 : child.subtreeCount += grandChild.subtreeCount
574 1 : right.subtreeCount -= grandChild.subtreeCount
575 1 : }
576 1 : n.items[i] = xLa
577 :
578 1 : default:
579 1 : // Merge with either the left or right sibling.
580 1 : //
581 1 : // +-----------+
582 1 : // | u y v |
583 1 : // +----/-\----+
584 1 : // / \
585 1 : // v v
586 1 : // +-----------+ +-----------+
587 1 : // | x | | z |
588 1 : // +-----------+ +-----------+
589 1 : //
590 1 : // After:
591 1 : //
592 1 : // +-----------+
593 1 : // | u v |
594 1 : // +-----|-----+
595 1 : // |
596 1 : // v
597 1 : // +-----------+
598 1 : // | x y z |
599 1 : // +-----------+
600 1 : //
601 1 : if i >= int(n.count) {
602 1 : i = int(n.count - 1)
603 1 : }
604 1 : child := mut(&n.children[i])
605 1 : // Make mergeChild mutable, bumping the refcounts on its children if necessary.
606 1 : _ = mut(&n.children[i+1])
607 1 : mergeLa, mergeChild := n.removeAt(i)
608 1 : child.items[child.count] = mergeLa
609 1 : copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
610 1 : if !child.leaf {
611 1 : copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
612 1 : }
613 1 : child.count += mergeChild.count + 1
614 1 : child.subtreeCount += mergeChild.subtreeCount + 1
615 1 :
616 1 : mergeChild.decRef(false /* contentsToo */, nil)
617 : }
618 : }
619 :
620 1 : func (n *node) verifyInvariants() {
621 1 : recomputedSubtreeCount := int(n.count)
622 1 : if !n.leaf {
623 1 : for i := int16(0); i <= n.count; i++ {
624 1 : n.children[i].verifyInvariants()
625 1 : recomputedSubtreeCount += n.children[i].subtreeCount
626 1 : }
627 : }
628 1 : if recomputedSubtreeCount != n.subtreeCount {
629 0 : panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)",
630 0 : recomputedSubtreeCount, n.subtreeCount))
631 : }
632 : }
633 :
634 : // btree is an implementation of a B-Tree.
635 : //
636 : // btree stores FileMetadata in an ordered structure, allowing easy insertion,
637 : // removal, and iteration. The B-Tree stores items in order based on cmp. The
638 : // first level of the LSM uses a cmp function that compares sequence numbers.
639 : // All other levels compare using the FileMetadata.Smallest.
640 : //
641 : // Write operations are not safe for concurrent mutation by multiple
642 : // goroutines, but Read operations are.
643 : type btree struct {
644 : root *node
645 : cmp base.Compare
646 : bcmp btreeCmp
647 : }
648 :
649 : // Release dereferences and clears the root node of the btree, removing all
650 : // items from the btree. In doing so, it decrements contained file counts.
651 : // It returns a slice of newly obsolete backing files, if any.
652 1 : func (t *btree) Release() (obsolete []*FileBacking) {
653 1 : if t.root != nil {
654 1 : t.root.decRef(true /* contentsToo */, &obsolete)
655 1 : t.root = nil
656 1 : }
657 1 : return obsolete
658 : }
659 :
660 : // Clone clones the btree, lazily. It does so in constant time.
661 1 : func (t *btree) Clone() btree {
662 1 : c := *t
663 1 : if c.root != nil {
664 1 : // Incrementing the reference count on the root node is sufficient to
665 1 : // ensure that no node in the cloned tree can be mutated by an actor
666 1 : // holding a reference to the original tree and vice versa. This
667 1 : // property is upheld because the root node in the receiver btree and
668 1 : // the returned btree will both necessarily have a reference count of at
669 1 : // least 2 when this method returns. All tree mutations recursively
670 1 : // acquire mutable node references (see mut) as they traverse down the
671 1 : // tree. The act of acquiring a mutable node reference performs a clone
672 1 : // if a node's reference count is greater than one. Cloning a node (see
673 1 : // clone) increases the reference count on each of its children,
674 1 : // ensuring that they have a reference count of at least 2. This, in
675 1 : // turn, ensures that any of the child nodes that are modified will also
676 1 : // be copied-on-write, recursively ensuring the immutability property
677 1 : // over the entire tree.
678 1 : c.root.incRef()
679 1 : }
680 1 : return c
681 : }
682 :
683 : // Delete removes the provided file from the tree.
684 : // It returns true if the file now has a zero reference count.
685 1 : func (t *btree) Delete(item *FileMetadata) (obsolete bool) {
686 1 : if t.root == nil || t.root.count == 0 {
687 0 : return false
688 0 : }
689 1 : if out := mut(&t.root).Remove(t.bcmp, item); out != nil {
690 1 : obsolete = out.FileBacking.Unref() == 0
691 1 : }
692 1 : if invariants.Enabled {
693 1 : t.root.verifyInvariants()
694 1 : }
695 1 : if t.root.count == 0 {
696 1 : old := t.root
697 1 : if t.root.leaf {
698 1 : t.root = nil
699 1 : } else {
700 1 : t.root = t.root.children[0]
701 1 : }
702 1 : old.decRef(false /* contentsToo */, nil)
703 : }
704 1 : return obsolete
705 : }
706 :
707 : // Insert adds the given item to the tree. If a item in the tree already
708 : // equals the given one, Insert panics.
709 1 : func (t *btree) Insert(item *FileMetadata) error {
710 1 : if t.root == nil {
711 1 : t.root = newLeafNode()
712 1 : } else if t.root.count >= maxItems {
713 1 : splitLa, splitNode := mut(&t.root).split(maxItems / 2)
714 1 : newRoot := newNode()
715 1 : newRoot.count = 1
716 1 : newRoot.items[0] = splitLa
717 1 : newRoot.children[0] = t.root
718 1 : newRoot.children[1] = splitNode
719 1 : newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1
720 1 : t.root = newRoot
721 1 : }
722 1 : item.FileBacking.Ref()
723 1 : err := mut(&t.root).Insert(t.bcmp, item)
724 1 : if invariants.Enabled {
725 1 : t.root.verifyInvariants()
726 1 : }
727 1 : return err
728 : }
729 :
730 : // Iter returns a new iterator object. It is not safe to continue using an
731 : // iterator after modifications are made to the tree. If modifications are made,
732 : // create a new iterator.
733 1 : func (t *btree) Iter() iterator {
734 1 : return iterator{r: t.root, pos: -1, cmp: t.bcmp}
735 1 : }
736 :
737 : // Count returns the number of files contained within the B-Tree.
738 1 : func (t *btree) Count() int {
739 1 : if t.root == nil {
740 1 : return 0
741 1 : }
742 1 : return t.root.subtreeCount
743 : }
744 :
745 : // String returns a string description of the tree. The format is
746 : // similar to the https://en.wikipedia.org/wiki/Newick_format.
747 0 : func (t *btree) String() string {
748 0 : if t.Count() == 0 {
749 0 : return ";"
750 0 : }
751 0 : var b strings.Builder
752 0 : t.root.writeString(&b)
753 0 : return b.String()
754 : }
755 :
756 0 : func (n *node) writeString(b *strings.Builder) {
757 0 : if n.leaf {
758 0 : for i := int16(0); i < n.count; i++ {
759 0 : if i != 0 {
760 0 : b.WriteString(",")
761 0 : }
762 0 : b.WriteString(n.items[i].String())
763 : }
764 0 : return
765 : }
766 0 : for i := int16(0); i <= n.count; i++ {
767 0 : b.WriteString("(")
768 0 : n.children[i].writeString(b)
769 0 : b.WriteString(")")
770 0 : if i < n.count {
771 0 : b.WriteString(n.items[i].String())
772 0 : }
773 : }
774 : }
775 :
776 : // iterStack represents a stack of (node, pos) tuples, which captures
777 : // iteration state as an iterator descends a btree.
778 : type iterStack struct {
779 : // a contains aLen stack frames when an iterator stack is short enough.
780 : // If the iterator stack overflows the capacity of iterStackArr, the stack
781 : // is moved to s and aLen is set to -1.
782 : a iterStackArr
783 : aLen int16 // -1 when using s
784 : s []iterFrame
785 : }
786 :
787 : // Used to avoid allocations for stacks below a certain size.
788 : type iterStackArr [3]iterFrame
789 :
790 : type iterFrame struct {
791 : n *node
792 : pos int16
793 : }
794 :
795 1 : func (is *iterStack) push(f iterFrame) {
796 1 : if is.aLen == -1 {
797 0 : is.s = append(is.s, f)
798 1 : } else if int(is.aLen) == len(is.a) {
799 0 : is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen))
800 0 : copy(is.s, is.a[:])
801 0 : is.s[int(is.aLen)] = f
802 0 : is.aLen = -1
803 1 : } else {
804 1 : is.a[is.aLen] = f
805 1 : is.aLen++
806 1 : }
807 : }
808 :
809 1 : func (is *iterStack) pop() iterFrame {
810 1 : if is.aLen == -1 {
811 0 : f := is.s[len(is.s)-1]
812 0 : is.s = is.s[:len(is.s)-1]
813 0 : return f
814 0 : }
815 1 : is.aLen--
816 1 : return is.a[is.aLen]
817 : }
818 :
819 1 : func (is *iterStack) len() int {
820 1 : if is.aLen == -1 {
821 0 : return len(is.s)
822 0 : }
823 1 : return int(is.aLen)
824 : }
825 :
826 1 : func (is *iterStack) clone() iterStack {
827 1 : // If the iterator is using the embedded iterStackArr, we only need to
828 1 : // copy the struct itself.
829 1 : if is.s == nil {
830 1 : return *is
831 1 : }
832 0 : clone := *is
833 0 : clone.s = make([]iterFrame, len(is.s))
834 0 : copy(clone.s, is.s)
835 0 : return clone
836 : }
837 :
838 1 : func (is *iterStack) nth(n int) (f iterFrame, ok bool) {
839 1 : if is.aLen == -1 {
840 0 : if n >= len(is.s) {
841 0 : return f, false
842 0 : }
843 0 : return is.s[n], true
844 : }
845 1 : if int16(n) >= is.aLen {
846 1 : return f, false
847 1 : }
848 1 : return is.a[n], true
849 : }
850 :
851 1 : func (is *iterStack) reset() {
852 1 : if is.aLen == -1 {
853 0 : is.s = is.s[:0]
854 1 : } else {
855 1 : is.aLen = 0
856 1 : }
857 : }
858 :
859 : // iterator is responsible for search and traversal within a btree.
860 : type iterator struct {
861 : // the root node of the B-Tree.
862 : r *node
863 : // n and pos make up the current position of the iterator.
864 : // If valid, n.items[pos] is the current value of the iterator.
865 : //
866 : // n may be nil iff i.r is nil.
867 : n *node
868 : pos int16
869 : // cmp dictates the ordering of the FileMetadata.
870 : cmp func(*FileMetadata, *FileMetadata) int
871 : // a stack of n's ancestors within the B-Tree, alongside the position
872 : // taken to arrive at n. If non-empty, the bottommost frame of the stack
873 : // will always contain the B-Tree root.
874 : s iterStack
875 : }
876 :
877 : // countLeft returns the count of files that are to the left of the current
878 : // iterator position.
879 1 : func (i *iterator) countLeft() int {
880 1 : if i.r == nil {
881 0 : return 0
882 0 : }
883 :
884 : // Each iterator has a stack of frames marking the path from the root node
885 : // to the current iterator position. All files (n.items) and all subtrees
886 : // (n.children) with indexes less than [pos] are to the left of the current
887 : // iterator position.
888 : //
889 : // +------------------------+ -
890 : // | Root pos:5 | |
891 : // +------------------------+ | stack
892 : // | Root/5 pos:3 | | frames
893 : // +------------------------+ | [i.s]
894 : // | Root/5/3 pos:9 | |
895 : // +========================+ -
896 : // | |
897 : // | i.n: Root/5/3/9 i.pos:2|
898 : // +------------------------+
899 : //
900 1 : var count int
901 1 : // Walk all the ancestors in the iterator stack [i.s], tallying up all the
902 1 : // files and subtrees to the left of the stack frame's position.
903 1 : f, ok := i.s.nth(0)
904 1 : for fi := 0; ok; fi++ {
905 1 : // There are [f.pos] files contained within [f.n.items] that sort to the
906 1 : // left of the subtree the iterator has descended.
907 1 : count += int(f.pos)
908 1 : // Any subtrees that fall before the stack frame's position are entirely
909 1 : // to the left of the iterator's current position.
910 1 : for j := int16(0); j < f.pos; j++ {
911 1 : count += f.n.children[j].subtreeCount
912 1 : }
913 1 : f, ok = i.s.nth(fi + 1)
914 : }
915 :
916 : // The bottommost stack frame is inlined within the iterator struct. Again,
917 : // [i.pos] files fall to the left of the current iterator position.
918 1 : count += int(i.pos)
919 1 : if !i.n.leaf {
920 1 : // NB: Unlike above, we use a `<= i.pos` comparison. The iterator is
921 1 : // positioned at item `i.n.items[i.pos]`, which sorts after everything
922 1 : // in the subtree at `i.n.children[i.pos]`.
923 1 : for j := int16(0); j <= i.pos; j++ {
924 1 : count += i.n.children[j].subtreeCount
925 1 : }
926 : }
927 1 : return count
928 : }
929 :
930 1 : func (i *iterator) clone() iterator {
931 1 : c := *i
932 1 : c.s = i.s.clone()
933 1 : return c
934 1 : }
935 :
936 1 : func (i *iterator) reset() {
937 1 : i.n = i.r
938 1 : i.pos = -1
939 1 : i.s.reset()
940 1 : }
941 :
942 0 : func (i iterator) String() string {
943 0 : var buf bytes.Buffer
944 0 : for n := 0; ; n++ {
945 0 : f, ok := i.s.nth(n)
946 0 : if !ok {
947 0 : break
948 : }
949 0 : fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count)
950 : }
951 0 : if i.r == nil {
952 0 : fmt.Fprintf(&buf, "<nil>: %02d", i.pos)
953 0 : } else {
954 0 : fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count)
955 0 : }
956 0 : return buf.String()
957 : }
958 :
959 1 : func cmpIter(a, b iterator) int {
960 1 : if a.r != b.r {
961 0 : panic("compared iterators from different btrees")
962 : }
963 :
964 : // Each iterator has a stack of frames marking the path from the root node
965 : // to the current iterator position. We walk both paths formed by the
966 : // iterators' stacks simultaneously, descending from the shared root node,
967 : // always comparing nodes at the same level in the tree.
968 : //
969 : // If the iterators' paths ever diverge and point to different nodes, the
970 : // iterators are not equal and we use the node positions to evaluate the
971 : // comparison.
972 : //
973 : // If an iterator's stack ends, we stop descending and use its current
974 : // node and position for the final comparison. One iterator's stack may
975 : // end before another's if one iterator is positioned deeper in the tree.
976 : //
977 : // a b
978 : // +------------------------+ +--------------------------+ -
979 : // | Root pos:5 | = | Root pos:5 | |
980 : // +------------------------+ +--------------------------+ | stack
981 : // | Root/5 pos:3 | = | Root/5 pos:3 | | frames
982 : // +------------------------+ +--------------------------+ |
983 : // | Root/5/3 pos:9 | > | Root/5/3 pos:1 | |
984 : // +========================+ +==========================+ -
985 : // | | | |
986 : // | a.n: Root/5/3/9 a.pos:2| | b.n: Root/5/3/1, b.pos:5 |
987 : // +------------------------+ +--------------------------+
988 :
989 : // Initialize with the iterator's current node and position. These are
990 : // conceptually the most-recent/current frame of the iterator stack.
991 1 : an, apos := a.n, a.pos
992 1 : bn, bpos := b.n, b.pos
993 1 :
994 1 : // aok, bok are set while traversing the iterator's path down the B-Tree.
995 1 : // They're declared in the outer scope because they help distinguish the
996 1 : // sentinel case when both iterators' first frame points to the last child
997 1 : // of the root. If an iterator has no other frames in its stack, it's the
998 1 : // end sentinel state which sorts after everything else.
999 1 : var aok, bok bool
1000 1 : for i := 0; ; i++ {
1001 1 : var af, bf iterFrame
1002 1 : af, aok = a.s.nth(i)
1003 1 : bf, bok = b.s.nth(i)
1004 1 : if !aok || !bok {
1005 1 : if aok {
1006 1 : // Iterator a, unlike iterator b, still has a frame. Set an,
1007 1 : // apos so we compare using the frame from the stack.
1008 1 : an, apos = af.n, af.pos
1009 1 : }
1010 1 : if bok {
1011 1 : // Iterator b, unlike iterator a, still has a frame. Set bn,
1012 1 : // bpos so we compare using the frame from the stack.
1013 1 : bn, bpos = bf.n, bf.pos
1014 1 : }
1015 1 : break
1016 : }
1017 :
1018 : // aok && bok
1019 1 : if af.n != bf.n {
1020 0 : panic("nonmatching nodes during btree iterator comparison")
1021 : }
1022 1 : if v := stdcmp.Compare(af.pos, bf.pos); v != 0 {
1023 1 : return v
1024 1 : }
1025 : // Otherwise continue up both iterators' stacks (equivalently, down the
1026 : // B-Tree away from the root).
1027 : }
1028 :
1029 1 : if aok && bok {
1030 0 : panic("expected one or more stacks to have been exhausted")
1031 : }
1032 1 : if an != bn {
1033 0 : panic("nonmatching nodes during btree iterator comparison")
1034 : }
1035 1 : if v := stdcmp.Compare(apos, bpos); v != 0 {
1036 1 : return v
1037 1 : }
1038 1 : switch {
1039 1 : case aok:
1040 1 : // a is positioned at a leaf child at this position and b is at an
1041 1 : // end sentinel state.
1042 1 : return -1
1043 1 : case bok:
1044 1 : // b is positioned at a leaf child at this position and a is at an
1045 1 : // end sentinel state.
1046 1 : return +1
1047 1 : default:
1048 1 : return 0
1049 : }
1050 : }
1051 :
1052 1 : func (i *iterator) descend(n *node, pos int16) {
1053 1 : i.s.push(iterFrame{n: n, pos: pos})
1054 1 : i.n = n.children[pos]
1055 1 : i.pos = 0
1056 1 : }
1057 :
1058 : // ascend ascends up to the current node's parent and resets the position
1059 : // to the one previously set for this parent node.
1060 1 : func (i *iterator) ascend() {
1061 1 : f := i.s.pop()
1062 1 : i.n = f.n
1063 1 : i.pos = f.pos
1064 1 : }
1065 :
1066 : // seek repositions the iterator over the first file for which fn returns
1067 : // true, mirroring the semantics of the standard library's sort.Search
1068 : // function. Like sort.Search, seek requires the iterator's B-Tree to be
1069 : // ordered such that fn returns false for some (possibly empty) prefix of the
1070 : // tree's files, and then true for the (possibly empty) remainder.
1071 1 : func (i *iterator) seek(fn func(*FileMetadata) bool) {
1072 1 : i.reset()
1073 1 : if i.r == nil {
1074 0 : return
1075 0 : }
1076 :
1077 1 : for {
1078 1 : // Logic copied from sort.Search.
1079 1 : j, k := 0, int(i.n.count)
1080 1 : for j < k {
1081 1 : h := int(uint(j+k) >> 1) // avoid overflow when computing h
1082 1 :
1083 1 : // j ≤ h < k
1084 1 : if !fn(i.n.items[h]) {
1085 1 : j = h + 1 // preserves f(j-1) == false
1086 1 : } else {
1087 1 : k = h // preserves f(k) == true
1088 1 : }
1089 : }
1090 :
1091 1 : i.pos = int16(j)
1092 1 : if i.n.leaf {
1093 1 : if i.pos == i.n.count {
1094 1 : i.next()
1095 1 : }
1096 1 : return
1097 : }
1098 1 : i.descend(i.n, i.pos)
1099 : }
1100 : }
1101 :
1102 : // first seeks to the first item in the btree.
1103 1 : func (i *iterator) first() {
1104 1 : i.reset()
1105 1 : if i.r == nil {
1106 0 : return
1107 0 : }
1108 1 : for !i.n.leaf {
1109 1 : i.descend(i.n, 0)
1110 1 : }
1111 1 : i.pos = 0
1112 : }
1113 :
1114 : // last seeks to the last item in the btree.
1115 1 : func (i *iterator) last() {
1116 1 : i.reset()
1117 1 : if i.r == nil {
1118 0 : return
1119 0 : }
1120 1 : for !i.n.leaf {
1121 1 : i.descend(i.n, i.n.count)
1122 1 : }
1123 1 : i.pos = i.n.count - 1
1124 : }
1125 :
1126 : // next positions the iterator to the item immediately following
1127 : // its current position.
1128 1 : func (i *iterator) next() {
1129 1 : if i.r == nil {
1130 0 : return
1131 0 : }
1132 :
1133 1 : if i.n.leaf {
1134 1 : if i.pos < i.n.count {
1135 1 : i.pos++
1136 1 : }
1137 1 : if i.pos < i.n.count {
1138 1 : return
1139 1 : }
1140 1 : for i.s.len() > 0 && i.pos >= i.n.count {
1141 1 : i.ascend()
1142 1 : }
1143 1 : return
1144 : }
1145 :
1146 1 : i.descend(i.n, i.pos+1)
1147 1 : for !i.n.leaf {
1148 1 : i.descend(i.n, 0)
1149 1 : }
1150 1 : i.pos = 0
1151 : }
1152 :
1153 : // prev positions the iterator to the item immediately preceding
1154 : // its current position.
1155 1 : func (i *iterator) prev() {
1156 1 : if i.r == nil {
1157 0 : return
1158 0 : }
1159 :
1160 1 : if i.n.leaf {
1161 1 : i.pos--
1162 1 : if i.pos >= 0 {
1163 1 : return
1164 1 : }
1165 1 : for i.s.len() > 0 && i.pos < 0 {
1166 1 : i.ascend()
1167 1 : i.pos--
1168 1 : }
1169 1 : return
1170 : }
1171 :
1172 1 : i.descend(i.n, i.pos)
1173 1 : for !i.n.leaf {
1174 0 : i.descend(i.n, i.n.count)
1175 0 : }
1176 1 : i.pos = i.n.count - 1
1177 : }
1178 :
1179 : // valid returns whether the iterator is positioned at a valid position.
1180 1 : func (i *iterator) valid() bool {
1181 1 : return i.r != nil && i.pos >= 0 && i.pos < i.n.count
1182 1 : }
1183 :
1184 : // cur returns the item at the iterator's current position. It is illegal
1185 : // to call cur if the iterator is not valid.
1186 1 : func (i *iterator) cur() *FileMetadata {
1187 1 : if invariants.Enabled && !i.valid() {
1188 0 : panic("btree iterator.cur invoked on invalid iterator")
1189 : }
1190 1 : return i.n.items[i.pos]
1191 : }
|