Line data Source code
1 : // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package manifest
6 :
7 : import (
8 : "bytes"
9 : stdcmp "cmp"
10 : "fmt"
11 : "strings"
12 : "sync/atomic"
13 : "unsafe"
14 :
15 : "github.com/cockroachdb/errors"
16 : "github.com/cockroachdb/pebble/internal/base"
17 : "github.com/cockroachdb/pebble/internal/invariants"
18 : )
19 :
20 : type btreeCmp func(*FileMetadata, *FileMetadata) int
21 :
22 2 : func btreeCmpSeqNum(a, b *FileMetadata) int {
23 2 : return a.cmpSeqNum(b)
24 2 : }
25 :
26 2 : func btreeCmpSmallestKey(cmp Compare) btreeCmp {
27 2 : return func(a, b *FileMetadata) int {
28 2 : return a.cmpSmallestKey(b, cmp)
29 2 : }
30 : }
31 :
32 : // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a
33 : // specific ordering of FileMetadata within the tree. It's typically used to
34 : // test consistency checking code that needs to construct a malformed B-Tree.
35 1 : func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp {
36 1 : m := map[*FileMetadata]int{}
37 1 : for i, f := range files {
38 1 : m[f] = i
39 1 : }
40 1 : return func(a, b *FileMetadata) int {
41 1 : ai, aok := m[a]
42 1 : bi, bok := m[b]
43 1 : if !aok || !bok {
44 0 : panic("btreeCmpSliceOrder called with unknown files")
45 : }
46 1 : return stdcmp.Compare(ai, bi)
47 : }
48 : }
49 :
50 : const (
51 : degree = 16
52 : maxItems = 2*degree - 1
53 : minItems = degree - 1
54 : )
55 :
56 : type leafNode struct {
57 : ref atomic.Int32
58 : count int16
59 : leaf bool
60 : // subtreeCount holds the count of files in the entire subtree formed by
61 : // this node. For leaf nodes, subtreeCount is always equal to count. For
62 : // non-leaf nodes, it's the sum of count plus all the children's
63 : // subtreeCounts.
64 : //
65 : // NB: We could move this field to the end of the node struct, since leaf =>
66 : // count=subtreeCount, however the unsafe casting [leafToNode] performs make
67 : // it risky and cumbersome.
68 : subtreeCount int
69 : items [maxItems]*FileMetadata
70 : // annot contains one annotation per annotator, merged over the entire
71 : // node's files (and all descendants for non-leaf nodes).
72 : annot []annotation
73 : }
74 :
75 : type node struct {
76 : leafNode
77 : children [maxItems + 1]*node
78 : }
79 :
80 : //go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct.
81 2 : func leafToNode(ln *leafNode) *node {
82 2 : return (*node)(unsafe.Pointer(ln))
83 2 : }
84 :
85 2 : func newLeafNode() *node {
86 2 : n := leafToNode(new(leafNode))
87 2 : n.leaf = true
88 2 : n.ref.Store(1)
89 2 : return n
90 2 : }
91 :
92 2 : func newNode() *node {
93 2 : n := new(node)
94 2 : n.ref.Store(1)
95 2 : return n
96 2 : }
97 :
98 : // mut creates and returns a mutable node reference. If the node is not shared
99 : // with any other trees then it can be modified in place. Otherwise, it must be
100 : // cloned to ensure unique ownership. In this way, we enforce a copy-on-write
101 : // policy which transparently incorporates the idea of local mutations, like
102 : // Clojure's transients or Haskell's ST monad, where nodes are only copied
103 : // during the first time that they are modified between Clone operations.
104 : //
105 : // When a node is cloned, the provided pointer will be redirected to the new
106 : // mutable node.
107 2 : func mut(n **node) *node {
108 2 : if (*n).ref.Load() == 1 {
109 2 : // Exclusive ownership. Can mutate in place.
110 2 :
111 2 : // Whenever a node will be mutated, reset its annotations to be marked
112 2 : // as uncached. This ensures any future calls to (*node).annotation
113 2 : // will recompute annotations on the modified subtree.
114 2 : for i := range (*n).annot {
115 1 : (*n).annot[i].valid = false
116 1 : }
117 2 : return *n
118 : }
119 : // If we do not have unique ownership over the node then we
120 : // clone it to gain unique ownership. After doing so, we can
121 : // release our reference to the old node. We pass recursive
122 : // as true because even though we just observed the node's
123 : // reference count to be greater than 1, we might be racing
124 : // with another call to decRef on this node.
125 2 : c := (*n).clone()
126 2 : (*n).decRef(true /* contentsToo */, nil)
127 2 : *n = c
128 2 : // NB: We don't need to clear annotations, because (*node).clone does not
129 2 : // copy them.
130 2 : return *n
131 : }
132 :
133 : // incRef acquires a reference to the node.
134 2 : func (n *node) incRef() {
135 2 : n.ref.Add(1)
136 2 : }
137 :
138 : // decRef releases a reference to the node. If requested, the method will unref
139 : // its items and recurse into child nodes and decrease their refcounts as well.
140 : // Some internal codepaths that manually copy the node's items or children to
141 : // new nodes pass contentsToo=false to preserve existing reference counts during
142 : // operations that should yield a net-zero change to descendant refcounts.
143 : // When a node is released, its contained files are dereferenced.
144 2 : func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) {
145 2 : if n.ref.Add(-1) > 0 {
146 2 : // Other references remain. Can't free.
147 2 : return
148 2 : }
149 :
150 : // Dereference the node's metadata and release child references if
151 : // requested. Some internal callers may not want to propagate the deref
152 : // because they're manually copying the filemetadata and children to other
153 : // nodes, and they want to preserve the existing reference count.
154 2 : if contentsToo {
155 2 : for _, f := range n.items[:n.count] {
156 2 : if f.FileBacking.Unref() == 0 {
157 2 : // There are two sources of node dereferences: tree mutations
158 2 : // and Version dereferences. Files should only be made obsolete
159 2 : // during Version dereferences, during which `obsolete` will be
160 2 : // non-nil.
161 2 : if obsolete == nil {
162 0 : panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum))
163 : }
164 : // Reference counting is performed on the FileBacking. In the case
165 : // of a virtual sstable, this reference counting is performed on
166 : // a FileBacking which is shared by every single virtual sstable
167 : // with the same backing sstable. If the reference count hits 0,
168 : // then we know that the FileBacking won't be required by any
169 : // sstable in Pebble, and that the backing sstable can be deleted.
170 2 : *obsolete = append(*obsolete, f.FileBacking)
171 : }
172 : }
173 2 : if !n.leaf {
174 2 : for i := int16(0); i <= n.count; i++ {
175 2 : n.children[i].decRef(true /* contentsToo */, obsolete)
176 2 : }
177 : }
178 : }
179 : }
180 :
181 : // clone creates a clone of the receiver with a single reference count.
182 2 : func (n *node) clone() *node {
183 2 : var c *node
184 2 : if n.leaf {
185 2 : c = newLeafNode()
186 2 : } else {
187 2 : c = newNode()
188 2 : }
189 : // NB: copy field-by-field without touching n.ref to avoid
190 : // triggering the race detector and looking like a data race.
191 2 : c.count = n.count
192 2 : c.items = n.items
193 2 : c.subtreeCount = n.subtreeCount
194 2 : // Increase the refcount of each contained item.
195 2 : for _, f := range n.items[:n.count] {
196 2 : f.FileBacking.Ref()
197 2 : }
198 2 : if !c.leaf {
199 2 : // Copy children and increase each refcount.
200 2 : c.children = n.children
201 2 : for i := int16(0); i <= c.count; i++ {
202 2 : c.children[i].incRef()
203 2 : }
204 : }
205 2 : return c
206 : }
207 :
208 : // insertAt inserts the provided file and node at the provided index. This
209 : // function is for use only as a helper function for internal B-Tree code.
210 : // Clients should not invoke it directly.
211 2 : func (n *node) insertAt(index int, item *FileMetadata, nd *node) {
212 2 : if index < int(n.count) {
213 2 : copy(n.items[index+1:n.count+1], n.items[index:n.count])
214 2 : if !n.leaf {
215 2 : copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
216 2 : }
217 : }
218 2 : n.items[index] = item
219 2 : if !n.leaf {
220 2 : n.children[index+1] = nd
221 2 : }
222 2 : n.count++
223 : }
224 :
225 : // pushBack inserts the provided file and node at the tail of the node's items.
226 : // This function is for use only as a helper function for internal B-Tree code.
227 : // Clients should not invoke it directly.
228 2 : func (n *node) pushBack(item *FileMetadata, nd *node) {
229 2 : n.items[n.count] = item
230 2 : if !n.leaf {
231 2 : n.children[n.count+1] = nd
232 2 : }
233 2 : n.count++
234 : }
235 :
236 : // pushFront inserts the provided file and node at the head of the
237 : // node's items. This function is for use only as a helper function for internal B-Tree
238 : // code. Clients should not invoke it directly.
239 2 : func (n *node) pushFront(item *FileMetadata, nd *node) {
240 2 : if !n.leaf {
241 2 : copy(n.children[1:n.count+2], n.children[:n.count+1])
242 2 : n.children[0] = nd
243 2 : }
244 2 : copy(n.items[1:n.count+1], n.items[:n.count])
245 2 : n.items[0] = item
246 2 : n.count++
247 : }
248 :
249 : // removeAt removes a value at a given index, pulling all subsequent values
250 : // back. This function is for use only as a helper function for internal B-Tree
251 : // code. Clients should not invoke it directly.
252 2 : func (n *node) removeAt(index int) (*FileMetadata, *node) {
253 2 : var child *node
254 2 : if !n.leaf {
255 2 : child = n.children[index+1]
256 2 : copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
257 2 : n.children[n.count] = nil
258 2 : }
259 2 : n.count--
260 2 : out := n.items[index]
261 2 : copy(n.items[index:n.count], n.items[index+1:n.count+1])
262 2 : n.items[n.count] = nil
263 2 : return out, child
264 : }
265 :
266 : // popBack removes and returns the last element in the list. This function is
267 : // for use only as a helper function for internal B-Tree code. Clients should
268 : // not invoke it directly.
269 2 : func (n *node) popBack() (*FileMetadata, *node) {
270 2 : n.count--
271 2 : out := n.items[n.count]
272 2 : n.items[n.count] = nil
273 2 : if n.leaf {
274 2 : return out, nil
275 2 : }
276 2 : child := n.children[n.count+1]
277 2 : n.children[n.count+1] = nil
278 2 : return out, child
279 : }
280 :
281 : // popFront removes and returns the first element in the list. This function is
282 : // for use only as a helper function for internal B-Tree code. Clients should
283 : // not invoke it directly.
284 2 : func (n *node) popFront() (*FileMetadata, *node) {
285 2 : n.count--
286 2 : var child *node
287 2 : if !n.leaf {
288 2 : child = n.children[0]
289 2 : copy(n.children[:n.count+1], n.children[1:n.count+2])
290 2 : n.children[n.count+1] = nil
291 2 : }
292 2 : out := n.items[0]
293 2 : copy(n.items[:n.count], n.items[1:n.count+1])
294 2 : n.items[n.count] = nil
295 2 : return out, child
296 : }
297 :
298 : // find returns the index where the given item should be inserted into this
299 : // list. 'found' is true if the item already exists in the list at the given
300 : // index.
301 : //
302 : // This function is for use only as a helper function for internal B-Tree code.
303 : // Clients should not invoke it directly.
304 2 : func (n *node) find(bcmp btreeCmp, item *FileMetadata) (index int, found bool) {
305 2 : // Logic copied from sort.Search. Inlining this gave
306 2 : // an 11% speedup on BenchmarkBTreeDeleteInsert.
307 2 : i, j := 0, int(n.count)
308 2 : for i < j {
309 2 : h := int(uint(i+j) >> 1) // avoid overflow when computing h
310 2 : // i ≤ h < j
311 2 : v := bcmp(item, n.items[h])
312 2 : if v == 0 {
313 2 : return h, true
314 2 : } else if v > 0 {
315 2 : i = h + 1
316 2 : } else {
317 2 : j = h
318 2 : }
319 : }
320 2 : return i, false
321 : }
322 :
323 : // split splits the given node at the given index. The current node shrinks,
324 : // and this function returns the item that existed at that index and a new
325 : // node containing all items/children after it.
326 : //
327 : // split is called when we want to perform a transformation like the one
328 : // depicted in the following diagram.
329 : //
330 : // Before:
331 : // +-----------+
332 : // n *node | x y z |
333 : // +--/-/-\-\--+
334 : //
335 : // After:
336 : // +-----------+
337 : // | y | n's parent
338 : // +----/-\----+
339 : // / \
340 : // v v
341 : // +-----------+ +-----------+
342 : // n *node | x | | z | next *node
343 : // +-----------+ +-----------+
344 : //
345 : // split does not perform the complete transformation; the caller is responsible
346 : // for updating the parent appropriately. split splits `n` into two nodes, `n`
347 : // and `next`, returning `next` and the file that separates them. In the diagram
348 : // above, `n.split` removes y and z from `n`, returning y in the first return
349 : // value and `next` in the second return value. The caller is responsible for
350 : // updating n's parent to now contain `y` as the separator between nodes `n` and
351 : // `next`.
352 : //
353 : // This function is for use only as a helper function for internal B-Tree code.
354 : // Clients should not invoke it directly.
355 2 : func (n *node) split(i int) (*FileMetadata, *node) {
356 2 : out := n.items[i]
357 2 : var next *node
358 2 : if n.leaf {
359 2 : next = newLeafNode()
360 2 : } else {
361 2 : next = newNode()
362 2 : }
363 2 : next.count = n.count - int16(i+1)
364 2 : copy(next.items[:], n.items[i+1:n.count])
365 2 : for j := int16(i); j < n.count; j++ {
366 2 : n.items[j] = nil
367 2 : }
368 2 : if !n.leaf {
369 2 : copy(next.children[:], n.children[i+1:n.count+1])
370 2 : descendantsMoved := 0
371 2 : for j := int16(i + 1); j <= n.count; j++ {
372 2 : descendantsMoved += n.children[j].subtreeCount
373 2 : n.children[j] = nil
374 2 : }
375 2 : n.subtreeCount -= descendantsMoved
376 2 : next.subtreeCount += descendantsMoved
377 : }
378 2 : n.count = int16(i)
379 2 : // NB: We subtract one more than `next.count` from n's subtreeCount because
380 2 : // the item at index `i` was removed from `n.items`. We'll return the item
381 2 : // at index `i`, and the caller is responsible for updating the subtree
382 2 : // count of whichever node adopts it.
383 2 : n.subtreeCount -= int(next.count) + 1
384 2 : next.subtreeCount += int(next.count)
385 2 : return out, next
386 : }
387 :
388 : // Insert inserts a item into the subtree rooted at this node, making sure no
389 : // nodes in the subtree exceed maxItems items.
390 2 : func (n *node) Insert(bcmp btreeCmp, item *FileMetadata) error {
391 2 : i, found := n.find(bcmp, item)
392 2 : if found {
393 1 : // cmp provides a total ordering of the files within a level.
394 1 : // If we're inserting a metadata that's equal to an existing item
395 1 : // in the tree, we're inserting a file into a level twice.
396 1 : return errors.Errorf("files %s and %s collided on sort keys",
397 1 : errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
398 1 : }
399 2 : if n.leaf {
400 2 : n.insertAt(i, item, nil)
401 2 : n.subtreeCount++
402 2 : return nil
403 2 : }
404 2 : if n.children[i].count >= maxItems {
405 2 : splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
406 2 : n.insertAt(i, splitLa, splitNode)
407 2 :
408 2 : switch cmp := bcmp(item, n.items[i]); {
409 2 : case cmp < 0:
410 : // no change, we want first split node
411 2 : case cmp > 0:
412 2 : i++ // we want second split node
413 1 : default:
414 1 : // cmp provides a total ordering of the files within a level.
415 1 : // If we're inserting a metadata that's equal to an existing item
416 1 : // in the tree, we're inserting a file into a level twice.
417 1 : return errors.Errorf("files %s and %s collided on sort keys",
418 1 : errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
419 : }
420 : }
421 :
422 2 : err := mut(&n.children[i]).Insert(bcmp, item)
423 2 : if err == nil {
424 2 : n.subtreeCount++
425 2 : }
426 2 : return err
427 : }
428 :
429 : // removeMax removes and returns the maximum item from the subtree rooted at
430 : // this node. This function is for use only as a helper function for internal
431 : // B-Tree code. Clients should not invoke it directly.
432 2 : func (n *node) removeMax() *FileMetadata {
433 2 : if n.leaf {
434 2 : n.count--
435 2 : n.subtreeCount--
436 2 : out := n.items[n.count]
437 2 : n.items[n.count] = nil
438 2 : return out
439 2 : }
440 2 : child := mut(&n.children[n.count])
441 2 : if child.count <= minItems {
442 2 : n.rebalanceOrMerge(int(n.count))
443 2 : return n.removeMax()
444 2 : }
445 2 : n.subtreeCount--
446 2 : return child.removeMax()
447 : }
448 :
449 : // Remove removes a item from the subtree rooted at this node. Returns
450 : // the item that was removed or nil if no matching item was found.
451 2 : func (n *node) Remove(bcmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
452 2 : i, found := n.find(bcmp, item)
453 2 : if n.leaf {
454 2 : if found {
455 2 : out, _ = n.removeAt(i)
456 2 : n.subtreeCount--
457 2 : return out
458 2 : }
459 1 : return nil
460 : }
461 2 : if n.children[i].count <= minItems {
462 2 : // Child not large enough to remove from.
463 2 : n.rebalanceOrMerge(i)
464 2 : return n.Remove(bcmp, item)
465 2 : }
466 2 : child := mut(&n.children[i])
467 2 : if found {
468 2 : // Replace the item being removed with the max item in our left child.
469 2 : out = n.items[i]
470 2 : n.items[i] = child.removeMax()
471 2 : n.subtreeCount--
472 2 : return out
473 2 : }
474 : // File is not in this node and child is large enough to remove from.
475 2 : out = child.Remove(bcmp, item)
476 2 : if out != nil {
477 2 : n.subtreeCount--
478 2 : }
479 2 : return out
480 : }
481 :
482 : // rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a
483 : // item from it while keeping it at or above minItems. This function is for use
484 : // only as a helper function for internal B-Tree code. Clients should not invoke
485 : // it directly.
486 2 : func (n *node) rebalanceOrMerge(i int) {
487 2 : switch {
488 2 : case i > 0 && n.children[i-1].count > minItems:
489 2 : // Rebalance from left sibling.
490 2 : //
491 2 : // +-----------+
492 2 : // | y |
493 2 : // +----/-\----+
494 2 : // / \
495 2 : // v v
496 2 : // +-----------+ +-----------+
497 2 : // | x | | |
498 2 : // +----------\+ +-----------+
499 2 : // \
500 2 : // v
501 2 : // a
502 2 : //
503 2 : // After:
504 2 : //
505 2 : // +-----------+
506 2 : // | x |
507 2 : // +----/-\----+
508 2 : // / \
509 2 : // v v
510 2 : // +-----------+ +-----------+
511 2 : // | | | y |
512 2 : // +-----------+ +/----------+
513 2 : // /
514 2 : // v
515 2 : // a
516 2 : //
517 2 : left := mut(&n.children[i-1])
518 2 : child := mut(&n.children[i])
519 2 : xLa, grandChild := left.popBack()
520 2 : yLa := n.items[i-1]
521 2 : child.pushFront(yLa, grandChild)
522 2 : n.items[i-1] = xLa
523 2 : child.subtreeCount++
524 2 : left.subtreeCount--
525 2 : if grandChild != nil {
526 2 : child.subtreeCount += grandChild.subtreeCount
527 2 : left.subtreeCount -= grandChild.subtreeCount
528 2 : }
529 :
530 2 : case i < int(n.count) && n.children[i+1].count > minItems:
531 2 : // Rebalance from right sibling.
532 2 : //
533 2 : // +-----------+
534 2 : // | y |
535 2 : // +----/-\----+
536 2 : // / \
537 2 : // v v
538 2 : // +-----------+ +-----------+
539 2 : // | | | x |
540 2 : // +-----------+ +/----------+
541 2 : // /
542 2 : // v
543 2 : // a
544 2 : //
545 2 : // After:
546 2 : //
547 2 : // +-----------+
548 2 : // | x |
549 2 : // +----/-\----+
550 2 : // / \
551 2 : // v v
552 2 : // +-----------+ +-----------+
553 2 : // | y | | |
554 2 : // +----------\+ +-----------+
555 2 : // \
556 2 : // v
557 2 : // a
558 2 : //
559 2 : right := mut(&n.children[i+1])
560 2 : child := mut(&n.children[i])
561 2 : xLa, grandChild := right.popFront()
562 2 : yLa := n.items[i]
563 2 : child.pushBack(yLa, grandChild)
564 2 : child.subtreeCount++
565 2 : right.subtreeCount--
566 2 : if grandChild != nil {
567 2 : child.subtreeCount += grandChild.subtreeCount
568 2 : right.subtreeCount -= grandChild.subtreeCount
569 2 : }
570 2 : n.items[i] = xLa
571 :
572 2 : default:
573 2 : // Merge with either the left or right sibling.
574 2 : //
575 2 : // +-----------+
576 2 : // | u y v |
577 2 : // +----/-\----+
578 2 : // / \
579 2 : // v v
580 2 : // +-----------+ +-----------+
581 2 : // | x | | z |
582 2 : // +-----------+ +-----------+
583 2 : //
584 2 : // After:
585 2 : //
586 2 : // +-----------+
587 2 : // | u v |
588 2 : // +-----|-----+
589 2 : // |
590 2 : // v
591 2 : // +-----------+
592 2 : // | x y z |
593 2 : // +-----------+
594 2 : //
595 2 : if i >= int(n.count) {
596 2 : i = int(n.count - 1)
597 2 : }
598 2 : child := mut(&n.children[i])
599 2 : // Make mergeChild mutable, bumping the refcounts on its children if necessary.
600 2 : _ = mut(&n.children[i+1])
601 2 : mergeLa, mergeChild := n.removeAt(i)
602 2 : child.items[child.count] = mergeLa
603 2 : copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
604 2 : if !child.leaf {
605 2 : copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
606 2 : }
607 2 : child.count += mergeChild.count + 1
608 2 : child.subtreeCount += mergeChild.subtreeCount + 1
609 2 :
610 2 : mergeChild.decRef(false /* contentsToo */, nil)
611 : }
612 : }
613 :
614 2 : func (n *node) verifyInvariants() {
615 2 : recomputedSubtreeCount := int(n.count)
616 2 : if !n.leaf {
617 2 : for i := int16(0); i <= n.count; i++ {
618 2 : n.children[i].verifyInvariants()
619 2 : recomputedSubtreeCount += n.children[i].subtreeCount
620 2 : }
621 : }
622 2 : if recomputedSubtreeCount != n.subtreeCount {
623 0 : panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)",
624 0 : recomputedSubtreeCount, n.subtreeCount))
625 : }
626 : }
627 :
628 : // btree is an implementation of a B-Tree.
629 : //
630 : // btree stores FileMetadata in an ordered structure, allowing easy insertion,
631 : // removal, and iteration. The B-Tree stores items in order based on cmp. The
632 : // first level of the LSM uses a cmp function that compares sequence numbers.
633 : // All other levels compare using the FileMetadata.Smallest.
634 : //
635 : // Write operations are not safe for concurrent mutation by multiple
636 : // goroutines, but Read operations are.
637 : type btree struct {
638 : root *node
639 : cmp base.Compare
640 : bcmp btreeCmp
641 : }
642 :
643 : // Release dereferences and clears the root node of the btree, removing all
644 : // items from the btree. In doing so, it decrements contained file counts.
645 : // It returns a slice of newly obsolete backing files, if any.
646 2 : func (t *btree) Release() (obsolete []*FileBacking) {
647 2 : if t.root != nil {
648 2 : t.root.decRef(true /* contentsToo */, &obsolete)
649 2 : t.root = nil
650 2 : }
651 2 : return obsolete
652 : }
653 :
654 : // Clone clones the btree, lazily. It does so in constant time.
655 2 : func (t *btree) Clone() btree {
656 2 : c := *t
657 2 : if c.root != nil {
658 2 : // Incrementing the reference count on the root node is sufficient to
659 2 : // ensure that no node in the cloned tree can be mutated by an actor
660 2 : // holding a reference to the original tree and vice versa. This
661 2 : // property is upheld because the root node in the receiver btree and
662 2 : // the returned btree will both necessarily have a reference count of at
663 2 : // least 2 when this method returns. All tree mutations recursively
664 2 : // acquire mutable node references (see mut) as they traverse down the
665 2 : // tree. The act of acquiring a mutable node reference performs a clone
666 2 : // if a node's reference count is greater than one. Cloning a node (see
667 2 : // clone) increases the reference count on each of its children,
668 2 : // ensuring that they have a reference count of at least 2. This, in
669 2 : // turn, ensures that any of the child nodes that are modified will also
670 2 : // be copied-on-write, recursively ensuring the immutability property
671 2 : // over the entire tree.
672 2 : c.root.incRef()
673 2 : }
674 2 : return c
675 : }
676 :
677 : // Delete removes the provided file from the tree.
678 : // It returns true if the file now has a zero reference count.
679 2 : func (t *btree) Delete(item *FileMetadata) (obsolete bool) {
680 2 : if t.root == nil || t.root.count == 0 {
681 0 : return false
682 0 : }
683 2 : if out := mut(&t.root).Remove(t.bcmp, item); out != nil {
684 2 : obsolete = out.FileBacking.Unref() == 0
685 2 : }
686 2 : if invariants.Enabled {
687 2 : t.root.verifyInvariants()
688 2 : }
689 2 : if t.root.count == 0 {
690 2 : old := t.root
691 2 : if t.root.leaf {
692 2 : t.root = nil
693 2 : } else {
694 2 : t.root = t.root.children[0]
695 2 : }
696 2 : old.decRef(false /* contentsToo */, nil)
697 : }
698 2 : return obsolete
699 : }
700 :
701 : // Insert adds the given item to the tree. If a item in the tree already
702 : // equals the given one, Insert panics.
703 2 : func (t *btree) Insert(item *FileMetadata) error {
704 2 : if t.root == nil {
705 2 : t.root = newLeafNode()
706 2 : } else if t.root.count >= maxItems {
707 2 : splitLa, splitNode := mut(&t.root).split(maxItems / 2)
708 2 : newRoot := newNode()
709 2 : newRoot.count = 1
710 2 : newRoot.items[0] = splitLa
711 2 : newRoot.children[0] = t.root
712 2 : newRoot.children[1] = splitNode
713 2 : newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1
714 2 : t.root = newRoot
715 2 : }
716 2 : item.FileBacking.Ref()
717 2 : err := mut(&t.root).Insert(t.bcmp, item)
718 2 : if invariants.Enabled {
719 2 : t.root.verifyInvariants()
720 2 : }
721 2 : return err
722 : }
723 :
724 : // Iter returns a new iterator object. It is not safe to continue using an
725 : // iterator after modifications are made to the tree. If modifications are made,
726 : // create a new iterator.
727 2 : func (t *btree) Iter() iterator {
728 2 : return iterator{r: t.root, pos: -1, cmp: t.bcmp}
729 2 : }
730 :
731 : // Count returns the number of files contained within the B-Tree.
732 2 : func (t *btree) Count() int {
733 2 : if t.root == nil {
734 2 : return 0
735 2 : }
736 2 : return t.root.subtreeCount
737 : }
738 :
739 : // String returns a string description of the tree. The format is
740 : // similar to the https://en.wikipedia.org/wiki/Newick_format.
741 0 : func (t *btree) String() string {
742 0 : if t.Count() == 0 {
743 0 : return ";"
744 0 : }
745 0 : var b strings.Builder
746 0 : t.root.writeString(&b)
747 0 : return b.String()
748 : }
749 :
750 0 : func (n *node) writeString(b *strings.Builder) {
751 0 : if n.leaf {
752 0 : for i := int16(0); i < n.count; i++ {
753 0 : if i != 0 {
754 0 : b.WriteString(",")
755 0 : }
756 0 : b.WriteString(n.items[i].String())
757 : }
758 0 : return
759 : }
760 0 : for i := int16(0); i <= n.count; i++ {
761 0 : b.WriteString("(")
762 0 : n.children[i].writeString(b)
763 0 : b.WriteString(")")
764 0 : if i < n.count {
765 0 : b.WriteString(n.items[i].String())
766 0 : }
767 : }
768 : }
769 :
770 : // iterStack represents a stack of (node, pos) tuples, which captures
771 : // iteration state as an iterator descends a btree.
772 : type iterStack struct {
773 : // a contains aLen stack frames when an iterator stack is short enough.
774 : // If the iterator stack overflows the capacity of iterStackArr, the stack
775 : // is moved to s and aLen is set to -1.
776 : a iterStackArr
777 : aLen int16 // -1 when using s
778 : s []iterFrame
779 : }
780 :
781 : // Used to avoid allocations for stacks below a certain size.
782 : type iterStackArr [3]iterFrame
783 :
784 : type iterFrame struct {
785 : n *node
786 : pos int16
787 : }
788 :
789 2 : func (is *iterStack) push(f iterFrame) {
790 2 : if is.aLen == -1 {
791 1 : is.s = append(is.s, f)
792 2 : } else if int(is.aLen) == len(is.a) {
793 1 : is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen))
794 1 : copy(is.s, is.a[:])
795 1 : is.s[int(is.aLen)] = f
796 1 : is.aLen = -1
797 2 : } else {
798 2 : is.a[is.aLen] = f
799 2 : is.aLen++
800 2 : }
801 : }
802 :
803 2 : func (is *iterStack) pop() iterFrame {
804 2 : if is.aLen == -1 {
805 1 : f := is.s[len(is.s)-1]
806 1 : is.s = is.s[:len(is.s)-1]
807 1 : return f
808 1 : }
809 2 : is.aLen--
810 2 : return is.a[is.aLen]
811 : }
812 :
813 2 : func (is *iterStack) len() int {
814 2 : if is.aLen == -1 {
815 1 : return len(is.s)
816 1 : }
817 2 : return int(is.aLen)
818 : }
819 :
820 2 : func (is *iterStack) clone() iterStack {
821 2 : // If the iterator is using the embedded iterStackArr, we only need to
822 2 : // copy the struct itself.
823 2 : if is.s == nil {
824 2 : return *is
825 2 : }
826 0 : clone := *is
827 0 : clone.s = make([]iterFrame, len(is.s))
828 0 : copy(clone.s, is.s)
829 0 : return clone
830 : }
831 :
832 2 : func (is *iterStack) nth(n int) (f iterFrame, ok bool) {
833 2 : if is.aLen == -1 {
834 0 : if n >= len(is.s) {
835 0 : return f, false
836 0 : }
837 0 : return is.s[n], true
838 : }
839 2 : if int16(n) >= is.aLen {
840 2 : return f, false
841 2 : }
842 2 : return is.a[n], true
843 : }
844 :
845 2 : func (is *iterStack) reset() {
846 2 : if is.aLen == -1 {
847 1 : is.s = is.s[:0]
848 2 : } else {
849 2 : is.aLen = 0
850 2 : }
851 : }
852 :
853 : // iterator is responsible for search and traversal within a btree.
854 : type iterator struct {
855 : // the root node of the B-Tree.
856 : r *node
857 : // n and pos make up the current position of the iterator.
858 : // If valid, n.items[pos] is the current value of the iterator.
859 : //
860 : // n may be nil iff i.r is nil.
861 : n *node
862 : pos int16
863 : // cmp dictates the ordering of the FileMetadata.
864 : cmp func(*FileMetadata, *FileMetadata) int
865 : // a stack of n's ancestors within the B-Tree, alongside the position
866 : // taken to arrive at n. If non-empty, the bottommost frame of the stack
867 : // will always contain the B-Tree root.
868 : s iterStack
869 : }
870 :
871 : // countLeft returns the count of files that are to the left of the current
872 : // iterator position.
873 2 : func (i *iterator) countLeft() int {
874 2 : if i.r == nil {
875 0 : return 0
876 0 : }
877 :
878 : // Each iterator has a stack of frames marking the path from the root node
879 : // to the current iterator position. All files (n.items) and all subtrees
880 : // (n.children) with indexes less than [pos] are to the left of the current
881 : // iterator position.
882 : //
883 : // +------------------------+ -
884 : // | Root pos:5 | |
885 : // +------------------------+ | stack
886 : // | Root/5 pos:3 | | frames
887 : // +------------------------+ | [i.s]
888 : // | Root/5/3 pos:9 | |
889 : // +========================+ -
890 : // | |
891 : // | i.n: Root/5/3/9 i.pos:2|
892 : // +------------------------+
893 : //
894 2 : var count int
895 2 : // Walk all the ancestors in the iterator stack [i.s], tallying up all the
896 2 : // files and subtrees to the left of the stack frame's position.
897 2 : f, ok := i.s.nth(0)
898 2 : for fi := 0; ok; fi++ {
899 2 : // There are [f.pos] files contained within [f.n.items] that sort to the
900 2 : // left of the subtree the iterator has descended.
901 2 : count += int(f.pos)
902 2 : // Any subtrees that fall before the stack frame's position are entirely
903 2 : // to the left of the iterator's current position.
904 2 : for j := int16(0); j < f.pos; j++ {
905 2 : count += f.n.children[j].subtreeCount
906 2 : }
907 2 : f, ok = i.s.nth(fi + 1)
908 : }
909 :
910 : // The bottommost stack frame is inlined within the iterator struct. Again,
911 : // [i.pos] files fall to the left of the current iterator position.
912 2 : count += int(i.pos)
913 2 : if !i.n.leaf {
914 2 : // NB: Unlike above, we use a `<= i.pos` comparison. The iterator is
915 2 : // positioned at item `i.n.items[i.pos]`, which sorts after everything
916 2 : // in the subtree at `i.n.children[i.pos]`.
917 2 : for j := int16(0); j <= i.pos; j++ {
918 2 : count += i.n.children[j].subtreeCount
919 2 : }
920 : }
921 2 : return count
922 : }
923 :
924 2 : func (i *iterator) clone() iterator {
925 2 : c := *i
926 2 : c.s = i.s.clone()
927 2 : return c
928 2 : }
929 :
930 2 : func (i *iterator) reset() {
931 2 : i.n = i.r
932 2 : i.pos = -1
933 2 : i.s.reset()
934 2 : }
935 :
936 0 : func (i iterator) String() string {
937 0 : var buf bytes.Buffer
938 0 : for n := 0; ; n++ {
939 0 : f, ok := i.s.nth(n)
940 0 : if !ok {
941 0 : break
942 : }
943 0 : fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count)
944 : }
945 0 : if i.r == nil {
946 0 : fmt.Fprintf(&buf, "<nil>: %02d", i.pos)
947 0 : } else {
948 0 : fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count)
949 0 : }
950 0 : return buf.String()
951 : }
952 :
953 2 : func cmpIter(a, b iterator) int {
954 2 : if a.r != b.r {
955 0 : panic("compared iterators from different btrees")
956 : }
957 :
958 : // Each iterator has a stack of frames marking the path from the root node
959 : // to the current iterator position. We walk both paths formed by the
960 : // iterators' stacks simultaneously, descending from the shared root node,
961 : // always comparing nodes at the same level in the tree.
962 : //
963 : // If the iterators' paths ever diverge and point to different nodes, the
964 : // iterators are not equal and we use the node positions to evaluate the
965 : // comparison.
966 : //
967 : // If an iterator's stack ends, we stop descending and use its current
968 : // node and position for the final comparison. One iterator's stack may
969 : // end before another's if one iterator is positioned deeper in the tree.
970 : //
971 : // a b
972 : // +------------------------+ +--------------------------+ -
973 : // | Root pos:5 | = | Root pos:5 | |
974 : // +------------------------+ +--------------------------+ | stack
975 : // | Root/5 pos:3 | = | Root/5 pos:3 | | frames
976 : // +------------------------+ +--------------------------+ |
977 : // | Root/5/3 pos:9 | > | Root/5/3 pos:1 | |
978 : // +========================+ +==========================+ -
979 : // | | | |
980 : // | a.n: Root/5/3/9 a.pos:2| | b.n: Root/5/3/1, b.pos:5 |
981 : // +------------------------+ +--------------------------+
982 :
983 : // Initialize with the iterator's current node and position. These are
984 : // conceptually the most-recent/current frame of the iterator stack.
985 2 : an, apos := a.n, a.pos
986 2 : bn, bpos := b.n, b.pos
987 2 :
988 2 : // aok, bok are set while traversing the iterator's path down the B-Tree.
989 2 : // They're declared in the outer scope because they help distinguish the
990 2 : // sentinel case when both iterators' first frame points to the last child
991 2 : // of the root. If an iterator has no other frames in its stack, it's the
992 2 : // end sentinel state which sorts after everything else.
993 2 : var aok, bok bool
994 2 : for i := 0; ; i++ {
995 2 : var af, bf iterFrame
996 2 : af, aok = a.s.nth(i)
997 2 : bf, bok = b.s.nth(i)
998 2 : if !aok || !bok {
999 2 : if aok {
1000 2 : // Iterator a, unlike iterator b, still has a frame. Set an,
1001 2 : // apos so we compare using the frame from the stack.
1002 2 : an, apos = af.n, af.pos
1003 2 : }
1004 2 : if bok {
1005 2 : // Iterator b, unlike iterator a, still has a frame. Set bn,
1006 2 : // bpos so we compare using the frame from the stack.
1007 2 : bn, bpos = bf.n, bf.pos
1008 2 : }
1009 2 : break
1010 : }
1011 :
1012 : // aok && bok
1013 2 : if af.n != bf.n {
1014 0 : panic("nonmatching nodes during btree iterator comparison")
1015 : }
1016 2 : if v := stdcmp.Compare(af.pos, bf.pos); v != 0 {
1017 2 : return v
1018 2 : }
1019 : // Otherwise continue up both iterators' stacks (equivalently, down the
1020 : // B-Tree away from the root).
1021 : }
1022 :
1023 2 : if aok && bok {
1024 0 : panic("expected one or more stacks to have been exhausted")
1025 : }
1026 2 : if an != bn {
1027 0 : panic("nonmatching nodes during btree iterator comparison")
1028 : }
1029 2 : if v := stdcmp.Compare(apos, bpos); v != 0 {
1030 2 : return v
1031 2 : }
1032 2 : switch {
1033 2 : case aok:
1034 2 : // a is positioned at a leaf child at this position and b is at an
1035 2 : // end sentinel state.
1036 2 : return -1
1037 2 : case bok:
1038 2 : // b is positioned at a leaf child at this position and a is at an
1039 2 : // end sentinel state.
1040 2 : return +1
1041 2 : default:
1042 2 : return 0
1043 : }
1044 : }
1045 :
1046 2 : func (i *iterator) descend(n *node, pos int16) {
1047 2 : i.s.push(iterFrame{n: n, pos: pos})
1048 2 : i.n = n.children[pos]
1049 2 : i.pos = 0
1050 2 : }
1051 :
1052 : // ascend ascends up to the current node's parent and resets the position
1053 : // to the one previously set for this parent node.
1054 2 : func (i *iterator) ascend() {
1055 2 : f := i.s.pop()
1056 2 : i.n = f.n
1057 2 : i.pos = f.pos
1058 2 : }
1059 :
1060 : // seek repositions the iterator over the first file for which fn returns
1061 : // true, mirroring the semantics of the standard library's sort.Search
1062 : // function. Like sort.Search, seek requires the iterator's B-Tree to be
1063 : // ordered such that fn returns false for some (possibly empty) prefix of the
1064 : // tree's files, and then true for the (possibly empty) remainder.
1065 2 : func (i *iterator) seek(fn func(*FileMetadata) bool) {
1066 2 : i.reset()
1067 2 : if i.r == nil {
1068 0 : return
1069 0 : }
1070 :
1071 2 : for {
1072 2 : // Logic copied from sort.Search.
1073 2 : j, k := 0, int(i.n.count)
1074 2 : for j < k {
1075 2 : h := int(uint(j+k) >> 1) // avoid overflow when computing h
1076 2 :
1077 2 : // j ≤ h < k
1078 2 : if !fn(i.n.items[h]) {
1079 2 : j = h + 1 // preserves f(j-1) == false
1080 2 : } else {
1081 2 : k = h // preserves f(k) == true
1082 2 : }
1083 : }
1084 :
1085 2 : i.pos = int16(j)
1086 2 : if i.n.leaf {
1087 2 : if i.pos == i.n.count {
1088 2 : i.next()
1089 2 : }
1090 2 : return
1091 : }
1092 2 : i.descend(i.n, i.pos)
1093 : }
1094 : }
1095 :
1096 : // first seeks to the first item in the btree.
1097 2 : func (i *iterator) first() {
1098 2 : i.reset()
1099 2 : if i.r == nil {
1100 1 : return
1101 1 : }
1102 2 : for !i.n.leaf {
1103 2 : i.descend(i.n, 0)
1104 2 : }
1105 2 : i.pos = 0
1106 : }
1107 :
1108 : // last seeks to the last item in the btree.
1109 2 : func (i *iterator) last() {
1110 2 : i.reset()
1111 2 : if i.r == nil {
1112 1 : return
1113 1 : }
1114 2 : for !i.n.leaf {
1115 2 : i.descend(i.n, i.n.count)
1116 2 : }
1117 2 : i.pos = i.n.count - 1
1118 : }
1119 :
1120 : // next positions the iterator to the item immediately following
1121 : // its current position.
1122 2 : func (i *iterator) next() {
1123 2 : if i.r == nil {
1124 0 : return
1125 0 : }
1126 :
1127 2 : if i.n.leaf {
1128 2 : if i.pos < i.n.count {
1129 2 : i.pos++
1130 2 : }
1131 2 : if i.pos < i.n.count {
1132 2 : return
1133 2 : }
1134 2 : for i.s.len() > 0 && i.pos >= i.n.count {
1135 2 : i.ascend()
1136 2 : }
1137 2 : return
1138 : }
1139 :
1140 2 : i.descend(i.n, i.pos+1)
1141 2 : for !i.n.leaf {
1142 2 : i.descend(i.n, 0)
1143 2 : }
1144 2 : i.pos = 0
1145 : }
1146 :
1147 : // prev positions the iterator to the item immediately preceding
1148 : // its current position.
1149 2 : func (i *iterator) prev() {
1150 2 : if i.r == nil {
1151 0 : return
1152 0 : }
1153 :
1154 2 : if i.n.leaf {
1155 2 : i.pos--
1156 2 : if i.pos >= 0 {
1157 2 : return
1158 2 : }
1159 2 : for i.s.len() > 0 && i.pos < 0 {
1160 2 : i.ascend()
1161 2 : i.pos--
1162 2 : }
1163 2 : return
1164 : }
1165 :
1166 2 : i.descend(i.n, i.pos)
1167 2 : for !i.n.leaf {
1168 1 : i.descend(i.n, i.n.count)
1169 1 : }
1170 2 : i.pos = i.n.count - 1
1171 : }
1172 :
1173 : // valid returns whether the iterator is positioned at a valid position.
1174 2 : func (i *iterator) valid() bool {
1175 2 : return i.r != nil && i.pos >= 0 && i.pos < i.n.count
1176 2 : }
1177 :
1178 : // cur returns the item at the iterator's current position. It is illegal
1179 : // to call cur if the iterator is not valid.
1180 2 : func (i *iterator) cur() *FileMetadata {
1181 2 : if invariants.Enabled && !i.valid() {
1182 0 : panic("btree iterator.cur invoked on invalid iterator")
1183 : }
1184 2 : return i.n.items[i.pos]
1185 : }
|