Line data Source code
1 : // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package manifest
6 :
7 : import (
8 : "bytes"
9 : stdcmp "cmp"
10 : "fmt"
11 : "strings"
12 : "sync/atomic"
13 : "unsafe"
14 :
15 : "github.com/cockroachdb/errors"
16 : "github.com/cockroachdb/pebble/internal/base"
17 : "github.com/cockroachdb/pebble/internal/invariants"
18 : )
19 :
20 : type btreeCmp func(*FileMetadata, *FileMetadata) int
21 :
22 1 : func btreeCmpSeqNum(a, b *FileMetadata) int {
23 1 : return a.cmpSeqNum(b)
24 1 : }
25 :
26 1 : func btreeCmpSmallestKey(cmp Compare) btreeCmp {
27 1 : return func(a, b *FileMetadata) int {
28 1 : return a.cmpSmallestKey(b, cmp)
29 1 : }
30 : }
31 :
32 : // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a
33 : // specific ordering of FileMetadata within the tree. It's typically used to
34 : // test consistency checking code that needs to construct a malformed B-Tree.
35 1 : func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp {
36 1 : m := map[*FileMetadata]int{}
37 1 : for i, f := range files {
38 1 : m[f] = i
39 1 : }
40 1 : return func(a, b *FileMetadata) int {
41 1 : ai, aok := m[a]
42 1 : bi, bok := m[b]
43 1 : if !aok || !bok {
44 0 : panic("btreeCmpSliceOrder called with unknown files")
45 : }
46 1 : return stdcmp.Compare(ai, bi)
47 : }
48 : }
49 :
50 : const (
51 : degree = 16
52 : maxItems = 2*degree - 1
53 : minItems = degree - 1
54 : )
55 :
56 : type leafNode struct {
57 : ref atomic.Int32
58 : count int16
59 : leaf bool
60 : // subtreeCount holds the count of files in the entire subtree formed by
61 : // this node. For leaf nodes, subtreeCount is always equal to count. For
62 : // non-leaf nodes, it's the sum of count plus all the children's
63 : // subtreeCounts.
64 : //
65 : // NB: We could move this field to the end of the node struct, since leaf =>
66 : // count=subtreeCount, however the unsafe casting [leafToNode] performs make
67 : // it risky and cumbersome.
68 : subtreeCount int
69 : items [maxItems]*FileMetadata
70 : // annot contains one annotation per annotator, merged over the entire
71 : // node's files (and all descendants for non-leaf nodes).
72 : annot []annotation
73 : }
74 :
75 : type node struct {
76 : leafNode
77 : children [maxItems + 1]*node
78 : }
79 :
80 : //go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct.
81 1 : func leafToNode(ln *leafNode) *node {
82 1 : return (*node)(unsafe.Pointer(ln))
83 1 : }
84 :
85 1 : func newLeafNode() *node {
86 1 : n := leafToNode(new(leafNode))
87 1 : n.leaf = true
88 1 : n.ref.Store(1)
89 1 : return n
90 1 : }
91 :
92 1 : func newNode() *node {
93 1 : n := new(node)
94 1 : n.ref.Store(1)
95 1 : return n
96 1 : }
97 :
98 : // mut creates and returns a mutable node reference. If the node is not shared
99 : // with any other trees then it can be modified in place. Otherwise, it must be
100 : // cloned to ensure unique ownership. In this way, we enforce a copy-on-write
101 : // policy which transparently incorporates the idea of local mutations, like
102 : // Clojure's transients or Haskell's ST monad, where nodes are only copied
103 : // during the first time that they are modified between Clone operations.
104 : //
105 : // When a node is cloned, the provided pointer will be redirected to the new
106 : // mutable node.
107 1 : func mut(n **node) *node {
108 1 : if (*n).ref.Load() == 1 {
109 1 : // Exclusive ownership. Can mutate in place.
110 1 :
111 1 : // Whenever a node will be mutated, reset its annotations to be marked
112 1 : // as uncached. This ensures any future calls to (*node).annotation
113 1 : // will recompute annotations on the modified subtree.
114 1 : for i := range (*n).annot {
115 1 : (*n).annot[i].valid = false
116 1 : }
117 1 : return *n
118 : }
119 : // If we do not have unique ownership over the node then we
120 : // clone it to gain unique ownership. After doing so, we can
121 : // release our reference to the old node. We pass recursive
122 : // as true because even though we just observed the node's
123 : // reference count to be greater than 1, we might be racing
124 : // with another call to decRef on this node.
125 1 : c := (*n).clone()
126 1 : (*n).decRef(true /* contentsToo */, nil)
127 1 : *n = c
128 1 : // NB: We don't need to clear annotations, because (*node).clone does not
129 1 : // copy them.
130 1 : return *n
131 : }
132 :
133 : // incRef acquires a reference to the node.
134 1 : func (n *node) incRef() {
135 1 : n.ref.Add(1)
136 1 : }
137 :
138 : // decRef releases a reference to the node. If requested, the method will unref
139 : // its items and recurse into child nodes and decrease their refcounts as well.
140 : // Some internal codepaths that manually copy the node's items or children to
141 : // new nodes pass contentsToo=false to preserve existing reference counts during
142 : // operations that should yield a net-zero change to descendant refcounts.
143 : // When a node is released, its contained files are dereferenced.
144 1 : func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) {
145 1 : if n.ref.Add(-1) > 0 {
146 1 : // Other references remain. Can't free.
147 1 : return
148 1 : }
149 :
150 : // Dereference the node's metadata and release child references if
151 : // requested. Some internal callers may not want to propagate the deref
152 : // because they're manually copying the filemetadata and children to other
153 : // nodes, and they want to preserve the existing reference count.
154 1 : if contentsToo {
155 1 : for _, f := range n.items[:n.count] {
156 1 : if f.FileBacking.Unref() == 0 {
157 1 : // There are two sources of node dereferences: tree mutations
158 1 : // and Version dereferences. Files should only be made obsolete
159 1 : // during Version dereferences, during which `obsolete` will be
160 1 : // non-nil.
161 1 : if obsolete == nil {
162 0 : panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum))
163 : }
164 : // Reference counting is performed on the FileBacking. In the case
165 : // of a virtual sstable, this reference counting is performed on
166 : // a FileBacking which is shared by every single virtual sstable
167 : // with the same backing sstable. If the reference count hits 0,
168 : // then we know that the FileBacking won't be required by any
169 : // sstable in Pebble, and that the backing sstable can be deleted.
170 1 : *obsolete = append(*obsolete, f.FileBacking)
171 : }
172 : }
173 1 : if !n.leaf {
174 1 : for i := int16(0); i <= n.count; i++ {
175 1 : n.children[i].decRef(true /* contentsToo */, obsolete)
176 1 : }
177 : }
178 : }
179 : }
180 :
181 : // clone creates a clone of the receiver with a single reference count.
182 1 : func (n *node) clone() *node {
183 1 : var c *node
184 1 : if n.leaf {
185 1 : c = newLeafNode()
186 1 : } else {
187 1 : c = newNode()
188 1 : }
189 : // NB: copy field-by-field without touching n.ref to avoid
190 : // triggering the race detector and looking like a data race.
191 1 : c.count = n.count
192 1 : c.items = n.items
193 1 : c.subtreeCount = n.subtreeCount
194 1 : // Increase the refcount of each contained item.
195 1 : for _, f := range n.items[:n.count] {
196 1 : f.FileBacking.Ref()
197 1 : }
198 1 : if !c.leaf {
199 1 : // Copy children and increase each refcount.
200 1 : c.children = n.children
201 1 : for i := int16(0); i <= c.count; i++ {
202 1 : c.children[i].incRef()
203 1 : }
204 : }
205 1 : return c
206 : }
207 :
208 : // insertAt inserts the provided file and node at the provided index. This
209 : // function is for use only as a helper function for internal B-Tree code.
210 : // Clients should not invoke it directly.
211 1 : func (n *node) insertAt(index int, item *FileMetadata, nd *node) {
212 1 : if index < int(n.count) {
213 1 : copy(n.items[index+1:n.count+1], n.items[index:n.count])
214 1 : if !n.leaf {
215 1 : copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
216 1 : }
217 : }
218 1 : n.items[index] = item
219 1 : if !n.leaf {
220 1 : n.children[index+1] = nd
221 1 : }
222 1 : n.count++
223 : }
224 :
225 : // pushBack inserts the provided file and node at the tail of the node's items.
226 : // This function is for use only as a helper function for internal B-Tree code.
227 : // Clients should not invoke it directly.
228 1 : func (n *node) pushBack(item *FileMetadata, nd *node) {
229 1 : n.items[n.count] = item
230 1 : if !n.leaf {
231 1 : n.children[n.count+1] = nd
232 1 : }
233 1 : n.count++
234 : }
235 :
236 : // pushFront inserts the provided file and node at the head of the
237 : // node's items. This function is for use only as a helper function for internal B-Tree
238 : // code. Clients should not invoke it directly.
239 1 : func (n *node) pushFront(item *FileMetadata, nd *node) {
240 1 : if !n.leaf {
241 1 : copy(n.children[1:n.count+2], n.children[:n.count+1])
242 1 : n.children[0] = nd
243 1 : }
244 1 : copy(n.items[1:n.count+1], n.items[:n.count])
245 1 : n.items[0] = item
246 1 : n.count++
247 : }
248 :
249 : // removeAt removes a value at a given index, pulling all subsequent values
250 : // back. This function is for use only as a helper function for internal B-Tree
251 : // code. Clients should not invoke it directly.
252 1 : func (n *node) removeAt(index int) (*FileMetadata, *node) {
253 1 : var child *node
254 1 : if !n.leaf {
255 1 : child = n.children[index+1]
256 1 : copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
257 1 : n.children[n.count] = nil
258 1 : }
259 1 : n.count--
260 1 : out := n.items[index]
261 1 : copy(n.items[index:n.count], n.items[index+1:n.count+1])
262 1 : n.items[n.count] = nil
263 1 : return out, child
264 : }
265 :
266 : // popBack removes and returns the last element in the list. This function is
267 : // for use only as a helper function for internal B-Tree code. Clients should
268 : // not invoke it directly.
269 1 : func (n *node) popBack() (*FileMetadata, *node) {
270 1 : n.count--
271 1 : out := n.items[n.count]
272 1 : n.items[n.count] = nil
273 1 : if n.leaf {
274 1 : return out, nil
275 1 : }
276 1 : child := n.children[n.count+1]
277 1 : n.children[n.count+1] = nil
278 1 : return out, child
279 : }
280 :
281 : // popFront removes and returns the first element in the list. This function is
282 : // for use only as a helper function for internal B-Tree code. Clients should
283 : // not invoke it directly.
284 1 : func (n *node) popFront() (*FileMetadata, *node) {
285 1 : n.count--
286 1 : var child *node
287 1 : if !n.leaf {
288 1 : child = n.children[0]
289 1 : copy(n.children[:n.count+1], n.children[1:n.count+2])
290 1 : n.children[n.count+1] = nil
291 1 : }
292 1 : out := n.items[0]
293 1 : copy(n.items[:n.count], n.items[1:n.count+1])
294 1 : n.items[n.count] = nil
295 1 : return out, child
296 : }
297 :
298 : // find returns the index where the given item should be inserted into this
299 : // list. 'found' is true if the item already exists in the list at the given
300 : // index.
301 : //
302 : // This function is for use only as a helper function for internal B-Tree code.
303 : // Clients should not invoke it directly.
304 1 : func (n *node) find(bcmp btreeCmp, item *FileMetadata) (index int, found bool) {
305 1 : // Logic copied from sort.Search. Inlining this gave
306 1 : // an 11% speedup on BenchmarkBTreeDeleteInsert.
307 1 : i, j := 0, int(n.count)
308 1 : for i < j {
309 1 : h := int(uint(i+j) >> 1) // avoid overflow when computing h
310 1 : // i ≤ h < j
311 1 : v := bcmp(item, n.items[h])
312 1 : if v == 0 {
313 1 : return h, true
314 1 : } else if v > 0 {
315 1 : i = h + 1
316 1 : } else {
317 1 : j = h
318 1 : }
319 : }
320 1 : return i, false
321 : }
322 :
323 : // split splits the given node at the given index. The current node shrinks,
324 : // and this function returns the item that existed at that index and a new
325 : // node containing all items/children after it.
326 : //
327 : // split is called when we want to perform a transformation like the one
328 : // depicted in the following diagram.
329 : //
330 : // Before:
331 : // +-----------+
332 : // n *node | x y z |
333 : // +--/-/-\-\--+
334 : //
335 : // After:
336 : // +-----------+
337 : // | y | n's parent
338 : // +----/-\----+
339 : // / \
340 : // v v
341 : // +-----------+ +-----------+
342 : // n *node | x | | z | next *node
343 : // +-----------+ +-----------+
344 : //
345 : // split does not perform the complete transformation; the caller is responsible
346 : // for updating the parent appropriately. split splits `n` into two nodes, `n`
347 : // and `next`, returning `next` and the file that separates them. In the diagram
348 : // above, `n.split` removes y and z from `n`, returning y in the first return
349 : // value and `next` in the second return value. The caller is responsible for
350 : // updating n's parent to now contain `y` as the separator between nodes `n` and
351 : // `next`.
352 : //
353 : // This function is for use only as a helper function for internal B-Tree code.
354 : // Clients should not invoke it directly.
355 1 : func (n *node) split(i int) (*FileMetadata, *node) {
356 1 : out := n.items[i]
357 1 : var next *node
358 1 : if n.leaf {
359 1 : next = newLeafNode()
360 1 : } else {
361 1 : next = newNode()
362 1 : }
363 1 : next.count = n.count - int16(i+1)
364 1 : copy(next.items[:], n.items[i+1:n.count])
365 1 : for j := int16(i); j < n.count; j++ {
366 1 : n.items[j] = nil
367 1 : }
368 1 : if !n.leaf {
369 1 : copy(next.children[:], n.children[i+1:n.count+1])
370 1 : descendantsMoved := 0
371 1 : for j := int16(i + 1); j <= n.count; j++ {
372 1 : descendantsMoved += n.children[j].subtreeCount
373 1 : n.children[j] = nil
374 1 : }
375 1 : n.subtreeCount -= descendantsMoved
376 1 : next.subtreeCount += descendantsMoved
377 : }
378 1 : n.count = int16(i)
379 1 : // NB: We subtract one more than `next.count` from n's subtreeCount because
380 1 : // the item at index `i` was removed from `n.items`. We'll return the item
381 1 : // at index `i`, and the caller is responsible for updating the subtree
382 1 : // count of whichever node adopts it.
383 1 : n.subtreeCount -= int(next.count) + 1
384 1 : next.subtreeCount += int(next.count)
385 1 : return out, next
386 : }
387 :
388 : // Insert inserts a item into the subtree rooted at this node, making sure no
389 : // nodes in the subtree exceed maxItems items.
390 1 : func (n *node) Insert(bcmp btreeCmp, item *FileMetadata) error {
391 1 : i, found := n.find(bcmp, item)
392 1 : if found {
393 1 : // cmp provides a total ordering of the files within a level.
394 1 : // If we're inserting a metadata that's equal to an existing item
395 1 : // in the tree, we're inserting a file into a level twice.
396 1 : return errors.Errorf("files %s and %s collided on sort keys",
397 1 : errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
398 1 : }
399 1 : if n.leaf {
400 1 : n.insertAt(i, item, nil)
401 1 : n.subtreeCount++
402 1 : return nil
403 1 : }
404 1 : if n.children[i].count >= maxItems {
405 1 : splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
406 1 : n.insertAt(i, splitLa, splitNode)
407 1 :
408 1 : switch cmp := bcmp(item, n.items[i]); {
409 1 : case cmp < 0:
410 : // no change, we want first split node
411 1 : case cmp > 0:
412 1 : i++ // we want second split node
413 0 : default:
414 0 : // cmp provides a total ordering of the files within a level.
415 0 : // If we're inserting a metadata that's equal to an existing item
416 0 : // in the tree, we're inserting a file into a level twice.
417 0 : return errors.Errorf("files %s and %s collided on sort keys",
418 0 : errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
419 : }
420 : }
421 :
422 1 : err := mut(&n.children[i]).Insert(bcmp, item)
423 1 : if err == nil {
424 1 : n.subtreeCount++
425 1 : }
426 1 : return err
427 : }
428 :
429 : // removeMax removes and returns the maximum item from the subtree rooted at
430 : // this node. This function is for use only as a helper function for internal
431 : // B-Tree code. Clients should not invoke it directly.
432 1 : func (n *node) removeMax() *FileMetadata {
433 1 : if n.leaf {
434 1 : n.count--
435 1 : n.subtreeCount--
436 1 : out := n.items[n.count]
437 1 : n.items[n.count] = nil
438 1 : return out
439 1 : }
440 1 : child := mut(&n.children[n.count])
441 1 : if child.count <= minItems {
442 0 : n.rebalanceOrMerge(int(n.count))
443 0 : return n.removeMax()
444 0 : }
445 1 : n.subtreeCount--
446 1 : return child.removeMax()
447 : }
448 :
449 : // Remove removes a item from the subtree rooted at this node. Returns
450 : // the item that was removed or nil if no matching item was found.
451 1 : func (n *node) Remove(bcmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
452 1 : i, found := n.find(bcmp, item)
453 1 : if n.leaf {
454 1 : if found {
455 1 : out, _ = n.removeAt(i)
456 1 : n.subtreeCount--
457 1 : return out
458 1 : }
459 1 : return nil
460 : }
461 1 : if n.children[i].count <= minItems {
462 1 : // Child not large enough to remove from.
463 1 : n.rebalanceOrMerge(i)
464 1 : return n.Remove(bcmp, item)
465 1 : }
466 1 : child := mut(&n.children[i])
467 1 : if found {
468 1 : // Replace the item being removed with the max item in our left child.
469 1 : out = n.items[i]
470 1 : n.items[i] = child.removeMax()
471 1 : n.subtreeCount--
472 1 : return out
473 1 : }
474 : // File is not in this node and child is large enough to remove from.
475 1 : out = child.Remove(bcmp, item)
476 1 : if out != nil {
477 1 : n.subtreeCount--
478 1 : }
479 1 : return out
480 : }
481 :
482 : // rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a
483 : // item from it while keeping it at or above minItems. This function is for use
484 : // only as a helper function for internal B-Tree code. Clients should not invoke
485 : // it directly.
486 1 : func (n *node) rebalanceOrMerge(i int) {
487 1 : switch {
488 1 : case i > 0 && n.children[i-1].count > minItems:
489 1 : // Rebalance from left sibling.
490 1 : //
491 1 : // +-----------+
492 1 : // | y |
493 1 : // +----/-\----+
494 1 : // / \
495 1 : // v v
496 1 : // +-----------+ +-----------+
497 1 : // | x | | |
498 1 : // +----------\+ +-----------+
499 1 : // \
500 1 : // v
501 1 : // a
502 1 : //
503 1 : // After:
504 1 : //
505 1 : // +-----------+
506 1 : // | x |
507 1 : // +----/-\----+
508 1 : // / \
509 1 : // v v
510 1 : // +-----------+ +-----------+
511 1 : // | | | y |
512 1 : // +-----------+ +/----------+
513 1 : // /
514 1 : // v
515 1 : // a
516 1 : //
517 1 : left := mut(&n.children[i-1])
518 1 : child := mut(&n.children[i])
519 1 : xLa, grandChild := left.popBack()
520 1 : yLa := n.items[i-1]
521 1 : child.pushFront(yLa, grandChild)
522 1 : n.items[i-1] = xLa
523 1 : child.subtreeCount++
524 1 : left.subtreeCount--
525 1 : if grandChild != nil {
526 1 : child.subtreeCount += grandChild.subtreeCount
527 1 : left.subtreeCount -= grandChild.subtreeCount
528 1 : }
529 :
530 1 : case i < int(n.count) && n.children[i+1].count > minItems:
531 1 : // Rebalance from right sibling.
532 1 : //
533 1 : // +-----------+
534 1 : // | y |
535 1 : // +----/-\----+
536 1 : // / \
537 1 : // v v
538 1 : // +-----------+ +-----------+
539 1 : // | | | x |
540 1 : // +-----------+ +/----------+
541 1 : // /
542 1 : // v
543 1 : // a
544 1 : //
545 1 : // After:
546 1 : //
547 1 : // +-----------+
548 1 : // | x |
549 1 : // +----/-\----+
550 1 : // / \
551 1 : // v v
552 1 : // +-----------+ +-----------+
553 1 : // | y | | |
554 1 : // +----------\+ +-----------+
555 1 : // \
556 1 : // v
557 1 : // a
558 1 : //
559 1 : right := mut(&n.children[i+1])
560 1 : child := mut(&n.children[i])
561 1 : xLa, grandChild := right.popFront()
562 1 : yLa := n.items[i]
563 1 : child.pushBack(yLa, grandChild)
564 1 : child.subtreeCount++
565 1 : right.subtreeCount--
566 1 : if grandChild != nil {
567 1 : child.subtreeCount += grandChild.subtreeCount
568 1 : right.subtreeCount -= grandChild.subtreeCount
569 1 : }
570 1 : n.items[i] = xLa
571 :
572 1 : default:
573 1 : // Merge with either the left or right sibling.
574 1 : //
575 1 : // +-----------+
576 1 : // | u y v |
577 1 : // +----/-\----+
578 1 : // / \
579 1 : // v v
580 1 : // +-----------+ +-----------+
581 1 : // | x | | z |
582 1 : // +-----------+ +-----------+
583 1 : //
584 1 : // After:
585 1 : //
586 1 : // +-----------+
587 1 : // | u v |
588 1 : // +-----|-----+
589 1 : // |
590 1 : // v
591 1 : // +-----------+
592 1 : // | x y z |
593 1 : // +-----------+
594 1 : //
595 1 : if i >= int(n.count) {
596 1 : i = int(n.count - 1)
597 1 : }
598 1 : child := mut(&n.children[i])
599 1 : // Make mergeChild mutable, bumping the refcounts on its children if necessary.
600 1 : _ = mut(&n.children[i+1])
601 1 : mergeLa, mergeChild := n.removeAt(i)
602 1 : child.items[child.count] = mergeLa
603 1 : copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
604 1 : if !child.leaf {
605 1 : copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
606 1 : }
607 1 : child.count += mergeChild.count + 1
608 1 : child.subtreeCount += mergeChild.subtreeCount + 1
609 1 :
610 1 : mergeChild.decRef(false /* contentsToo */, nil)
611 : }
612 : }
613 :
614 1 : func (n *node) verifyInvariants() {
615 1 : recomputedSubtreeCount := int(n.count)
616 1 : if !n.leaf {
617 1 : for i := int16(0); i <= n.count; i++ {
618 1 : n.children[i].verifyInvariants()
619 1 : recomputedSubtreeCount += n.children[i].subtreeCount
620 1 : }
621 : }
622 1 : if recomputedSubtreeCount != n.subtreeCount {
623 0 : panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)",
624 0 : recomputedSubtreeCount, n.subtreeCount))
625 : }
626 : }
627 :
628 : // btree is an implementation of a B-Tree.
629 : //
630 : // btree stores FileMetadata in an ordered structure, allowing easy insertion,
631 : // removal, and iteration. The B-Tree stores items in order based on cmp. The
632 : // first level of the LSM uses a cmp function that compares sequence numbers.
633 : // All other levels compare using the FileMetadata.Smallest.
634 : //
635 : // Write operations are not safe for concurrent mutation by multiple
636 : // goroutines, but Read operations are.
637 : type btree struct {
638 : root *node
639 : cmp base.Compare
640 : bcmp btreeCmp
641 : }
642 :
643 : // Release dereferences and clears the root node of the btree, removing all
644 : // items from the btree. In doing so, it decrements contained file counts.
645 : // It returns a slice of newly obsolete backing files, if any.
646 1 : func (t *btree) Release() (obsolete []*FileBacking) {
647 1 : if t.root != nil {
648 1 : t.root.decRef(true /* contentsToo */, &obsolete)
649 1 : t.root = nil
650 1 : }
651 1 : return obsolete
652 : }
653 :
654 : // Clone clones the btree, lazily. It does so in constant time.
655 1 : func (t *btree) Clone() btree {
656 1 : c := *t
657 1 : if c.root != nil {
658 1 : // Incrementing the reference count on the root node is sufficient to
659 1 : // ensure that no node in the cloned tree can be mutated by an actor
660 1 : // holding a reference to the original tree and vice versa. This
661 1 : // property is upheld because the root node in the receiver btree and
662 1 : // the returned btree will both necessarily have a reference count of at
663 1 : // least 2 when this method returns. All tree mutations recursively
664 1 : // acquire mutable node references (see mut) as they traverse down the
665 1 : // tree. The act of acquiring a mutable node reference performs a clone
666 1 : // if a node's reference count is greater than one. Cloning a node (see
667 1 : // clone) increases the reference count on each of its children,
668 1 : // ensuring that they have a reference count of at least 2. This, in
669 1 : // turn, ensures that any of the child nodes that are modified will also
670 1 : // be copied-on-write, recursively ensuring the immutability property
671 1 : // over the entire tree.
672 1 : c.root.incRef()
673 1 : }
674 1 : return c
675 : }
676 :
677 : // Delete removes the provided file from the tree.
678 : // It returns true if the file now has a zero reference count.
679 1 : func (t *btree) Delete(item *FileMetadata) (obsolete bool) {
680 1 : if t.root == nil || t.root.count == 0 {
681 0 : return false
682 0 : }
683 1 : if out := mut(&t.root).Remove(t.bcmp, item); out != nil {
684 1 : obsolete = out.FileBacking.Unref() == 0
685 1 : }
686 1 : if invariants.Enabled {
687 1 : t.root.verifyInvariants()
688 1 : }
689 1 : if t.root.count == 0 {
690 1 : old := t.root
691 1 : if t.root.leaf {
692 1 : t.root = nil
693 1 : } else {
694 1 : t.root = t.root.children[0]
695 1 : }
696 1 : old.decRef(false /* contentsToo */, nil)
697 : }
698 1 : return obsolete
699 : }
700 :
701 : // Insert adds the given item to the tree. If a item in the tree already
702 : // equals the given one, Insert panics.
703 1 : func (t *btree) Insert(item *FileMetadata) error {
704 1 : if t.root == nil {
705 1 : t.root = newLeafNode()
706 1 : } else if t.root.count >= maxItems {
707 1 : splitLa, splitNode := mut(&t.root).split(maxItems / 2)
708 1 : newRoot := newNode()
709 1 : newRoot.count = 1
710 1 : newRoot.items[0] = splitLa
711 1 : newRoot.children[0] = t.root
712 1 : newRoot.children[1] = splitNode
713 1 : newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1
714 1 : t.root = newRoot
715 1 : }
716 1 : item.FileBacking.Ref()
717 1 : err := mut(&t.root).Insert(t.bcmp, item)
718 1 : if invariants.Enabled {
719 1 : t.root.verifyInvariants()
720 1 : }
721 1 : return err
722 : }
723 :
724 : // Iter returns a new iterator object. It is not safe to continue using an
725 : // iterator after modifications are made to the tree. If modifications are made,
726 : // create a new iterator.
727 1 : func (t *btree) Iter() iterator {
728 1 : return iterator{r: t.root, pos: -1, cmp: t.bcmp}
729 1 : }
730 :
731 : // Count returns the number of files contained within the B-Tree.
732 1 : func (t *btree) Count() int {
733 1 : if t.root == nil {
734 1 : return 0
735 1 : }
736 1 : return t.root.subtreeCount
737 : }
738 :
739 : // String returns a string description of the tree. The format is
740 : // similar to the https://en.wikipedia.org/wiki/Newick_format.
741 0 : func (t *btree) String() string {
742 0 : if t.Count() == 0 {
743 0 : return ";"
744 0 : }
745 0 : var b strings.Builder
746 0 : t.root.writeString(&b)
747 0 : return b.String()
748 : }
749 :
750 0 : func (n *node) writeString(b *strings.Builder) {
751 0 : if n.leaf {
752 0 : for i := int16(0); i < n.count; i++ {
753 0 : if i != 0 {
754 0 : b.WriteString(",")
755 0 : }
756 0 : b.WriteString(n.items[i].String())
757 : }
758 0 : return
759 : }
760 0 : for i := int16(0); i <= n.count; i++ {
761 0 : b.WriteString("(")
762 0 : n.children[i].writeString(b)
763 0 : b.WriteString(")")
764 0 : if i < n.count {
765 0 : b.WriteString(n.items[i].String())
766 0 : }
767 : }
768 : }
769 :
770 : // iterStack represents a stack of (node, pos) tuples, which captures
771 : // iteration state as an iterator descends a btree.
772 : type iterStack struct {
773 : // a contains aLen stack frames when an iterator stack is short enough.
774 : // If the iterator stack overflows the capacity of iterStackArr, the stack
775 : // is moved to s and aLen is set to -1.
776 : a iterStackArr
777 : aLen int16 // -1 when using s
778 : s []iterFrame
779 : }
780 :
781 : // Used to avoid allocations for stacks below a certain size.
782 : type iterStackArr [3]iterFrame
783 :
784 : type iterFrame struct {
785 : n *node
786 : pos int16
787 : }
788 :
789 1 : func (is *iterStack) push(f iterFrame) {
790 1 : if is.aLen == -1 {
791 1 : is.s = append(is.s, f)
792 1 : } else if int(is.aLen) == len(is.a) {
793 1 : is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen))
794 1 : copy(is.s, is.a[:])
795 1 : is.s[int(is.aLen)] = f
796 1 : is.aLen = -1
797 1 : } else {
798 1 : is.a[is.aLen] = f
799 1 : is.aLen++
800 1 : }
801 : }
802 :
803 1 : func (is *iterStack) pop() iterFrame {
804 1 : if is.aLen == -1 {
805 1 : f := is.s[len(is.s)-1]
806 1 : is.s = is.s[:len(is.s)-1]
807 1 : return f
808 1 : }
809 1 : is.aLen--
810 1 : return is.a[is.aLen]
811 : }
812 :
813 1 : func (is *iterStack) len() int {
814 1 : if is.aLen == -1 {
815 1 : return len(is.s)
816 1 : }
817 1 : return int(is.aLen)
818 : }
819 :
820 1 : func (is *iterStack) clone() iterStack {
821 1 : // If the iterator is using the embedded iterStackArr, we only need to
822 1 : // copy the struct itself.
823 1 : if is.s == nil {
824 1 : return *is
825 1 : }
826 0 : clone := *is
827 0 : clone.s = make([]iterFrame, len(is.s))
828 0 : copy(clone.s, is.s)
829 0 : return clone
830 : }
831 :
832 1 : func (is *iterStack) nth(n int) (f iterFrame, ok bool) {
833 1 : if is.aLen == -1 {
834 0 : if n >= len(is.s) {
835 0 : return f, false
836 0 : }
837 0 : return is.s[n], true
838 : }
839 1 : if int16(n) >= is.aLen {
840 1 : return f, false
841 1 : }
842 1 : return is.a[n], true
843 : }
844 :
845 1 : func (is *iterStack) reset() {
846 1 : if is.aLen == -1 {
847 1 : is.s = is.s[:0]
848 1 : } else {
849 1 : is.aLen = 0
850 1 : }
851 : }
852 :
853 : // iterator is responsible for search and traversal within a btree.
854 : type iterator struct {
855 : // the root node of the B-Tree.
856 : r *node
857 : // n and pos make up the current position of the iterator.
858 : // If valid, n.items[pos] is the current value of the iterator.
859 : //
860 : // n may be nil iff i.r is nil.
861 : n *node
862 : pos int16
863 : // cmp dictates the ordering of the FileMetadata.
864 : cmp func(*FileMetadata, *FileMetadata) int
865 : // a stack of n's ancestors within the B-Tree, alongside the position
866 : // taken to arrive at n. If non-empty, the bottommost frame of the stack
867 : // will always contain the B-Tree root.
868 : s iterStack
869 : }
870 :
871 : // countLeft returns the count of files that are to the left of the current
872 : // iterator position.
873 1 : func (i *iterator) countLeft() int {
874 1 : if i.r == nil {
875 0 : return 0
876 0 : }
877 :
878 : // Each iterator has a stack of frames marking the path from the root node
879 : // to the current iterator position. All files (n.items) and all subtrees
880 : // (n.children) with indexes less than [pos] are to the left of the current
881 : // iterator position.
882 : //
883 : // +------------------------+ -
884 : // | Root pos:5 | |
885 : // +------------------------+ | stack
886 : // | Root/5 pos:3 | | frames
887 : // +------------------------+ | [i.s]
888 : // | Root/5/3 pos:9 | |
889 : // +========================+ -
890 : // | |
891 : // | i.n: Root/5/3/9 i.pos:2|
892 : // +------------------------+
893 : //
894 1 : var count int
895 1 : // Walk all the ancestors in the iterator stack [i.s], tallying up all the
896 1 : // files and subtrees to the left of the stack frame's position.
897 1 : f, ok := i.s.nth(0)
898 1 : for fi := 0; ok; fi++ {
899 1 : // There are [f.pos] files contained within [f.n.items] that sort to the
900 1 : // left of the subtree the iterator has descended.
901 1 : count += int(f.pos)
902 1 : // Any subtrees that fall before the stack frame's position are entirely
903 1 : // to the left of the iterator's current position.
904 1 : for j := int16(0); j < f.pos; j++ {
905 1 : count += f.n.children[j].subtreeCount
906 1 : }
907 1 : f, ok = i.s.nth(fi + 1)
908 : }
909 :
910 : // The bottommost stack frame is inlined within the iterator struct. Again,
911 : // [i.pos] files fall to the left of the current iterator position.
912 1 : count += int(i.pos)
913 1 : if !i.n.leaf {
914 1 : // NB: Unlike above, we use a `<= i.pos` comparison. The iterator is
915 1 : // positioned at item `i.n.items[i.pos]`, which sorts after everything
916 1 : // in the subtree at `i.n.children[i.pos]`.
917 1 : for j := int16(0); j <= i.pos; j++ {
918 1 : count += i.n.children[j].subtreeCount
919 1 : }
920 : }
921 1 : return count
922 : }
923 :
924 1 : func (i *iterator) clone() iterator {
925 1 : c := *i
926 1 : c.s = i.s.clone()
927 1 : return c
928 1 : }
929 :
930 1 : func (i *iterator) reset() {
931 1 : i.n = i.r
932 1 : i.pos = -1
933 1 : i.s.reset()
934 1 : }
935 :
936 0 : func (i iterator) String() string {
937 0 : var buf bytes.Buffer
938 0 : for n := 0; ; n++ {
939 0 : f, ok := i.s.nth(n)
940 0 : if !ok {
941 0 : break
942 : }
943 0 : fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count)
944 : }
945 0 : if i.r == nil {
946 0 : fmt.Fprintf(&buf, "<nil>: %02d", i.pos)
947 0 : } else {
948 0 : fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count)
949 0 : }
950 0 : return buf.String()
951 : }
952 :
953 1 : func cmpIter(a, b iterator) int {
954 1 : if a.r != b.r {
955 0 : panic("compared iterators from different btrees")
956 : }
957 :
958 : // Each iterator has a stack of frames marking the path from the root node
959 : // to the current iterator position. We walk both paths formed by the
960 : // iterators' stacks simultaneously, descending from the shared root node,
961 : // always comparing nodes at the same level in the tree.
962 : //
963 : // If the iterators' paths ever diverge and point to different nodes, the
964 : // iterators are not equal and we use the node positions to evaluate the
965 : // comparison.
966 : //
967 : // If an iterator's stack ends, we stop descending and use its current
968 : // node and position for the final comparison. One iterator's stack may
969 : // end before another's if one iterator is positioned deeper in the tree.
970 : //
971 : // a b
972 : // +------------------------+ +--------------------------+ -
973 : // | Root pos:5 | = | Root pos:5 | |
974 : // +------------------------+ +--------------------------+ | stack
975 : // | Root/5 pos:3 | = | Root/5 pos:3 | | frames
976 : // +------------------------+ +--------------------------+ |
977 : // | Root/5/3 pos:9 | > | Root/5/3 pos:1 | |
978 : // +========================+ +==========================+ -
979 : // | | | |
980 : // | a.n: Root/5/3/9 a.pos:2| | b.n: Root/5/3/1, b.pos:5 |
981 : // +------------------------+ +--------------------------+
982 :
983 : // Initialize with the iterator's current node and position. These are
984 : // conceptually the most-recent/current frame of the iterator stack.
985 1 : an, apos := a.n, a.pos
986 1 : bn, bpos := b.n, b.pos
987 1 :
988 1 : // aok, bok are set while traversing the iterator's path down the B-Tree.
989 1 : // They're declared in the outer scope because they help distinguish the
990 1 : // sentinel case when both iterators' first frame points to the last child
991 1 : // of the root. If an iterator has no other frames in its stack, it's the
992 1 : // end sentinel state which sorts after everything else.
993 1 : var aok, bok bool
994 1 : for i := 0; ; i++ {
995 1 : var af, bf iterFrame
996 1 : af, aok = a.s.nth(i)
997 1 : bf, bok = b.s.nth(i)
998 1 : if !aok || !bok {
999 1 : if aok {
1000 1 : // Iterator a, unlike iterator b, still has a frame. Set an,
1001 1 : // apos so we compare using the frame from the stack.
1002 1 : an, apos = af.n, af.pos
1003 1 : }
1004 1 : if bok {
1005 1 : // Iterator b, unlike iterator a, still has a frame. Set bn,
1006 1 : // bpos so we compare using the frame from the stack.
1007 1 : bn, bpos = bf.n, bf.pos
1008 1 : }
1009 1 : break
1010 : }
1011 :
1012 : // aok && bok
1013 1 : if af.n != bf.n {
1014 0 : panic("nonmatching nodes during btree iterator comparison")
1015 : }
1016 1 : if v := stdcmp.Compare(af.pos, bf.pos); v != 0 {
1017 1 : return v
1018 1 : }
1019 : // Otherwise continue up both iterators' stacks (equivalently, down the
1020 : // B-Tree away from the root).
1021 : }
1022 :
1023 1 : if aok && bok {
1024 0 : panic("expected one or more stacks to have been exhausted")
1025 : }
1026 1 : if an != bn {
1027 0 : panic("nonmatching nodes during btree iterator comparison")
1028 : }
1029 1 : if v := stdcmp.Compare(apos, bpos); v != 0 {
1030 1 : return v
1031 1 : }
1032 1 : switch {
1033 1 : case aok:
1034 1 : // a is positioned at a leaf child at this position and b is at an
1035 1 : // end sentinel state.
1036 1 : return -1
1037 1 : case bok:
1038 1 : // b is positioned at a leaf child at this position and a is at an
1039 1 : // end sentinel state.
1040 1 : return +1
1041 1 : default:
1042 1 : return 0
1043 : }
1044 : }
1045 :
1046 1 : func (i *iterator) descend(n *node, pos int16) {
1047 1 : i.s.push(iterFrame{n: n, pos: pos})
1048 1 : i.n = n.children[pos]
1049 1 : i.pos = 0
1050 1 : }
1051 :
1052 : // ascend ascends up to the current node's parent and resets the position
1053 : // to the one previously set for this parent node.
1054 1 : func (i *iterator) ascend() {
1055 1 : f := i.s.pop()
1056 1 : i.n = f.n
1057 1 : i.pos = f.pos
1058 1 : }
1059 :
1060 : // seek repositions the iterator over the first file for which fn returns
1061 : // true, mirroring the semantics of the standard library's sort.Search
1062 : // function. Like sort.Search, seek requires the iterator's B-Tree to be
1063 : // ordered such that fn returns false for some (possibly empty) prefix of the
1064 : // tree's files, and then true for the (possibly empty) remainder.
1065 1 : func (i *iterator) seek(fn func(*FileMetadata) bool) {
1066 1 : i.reset()
1067 1 : if i.r == nil {
1068 0 : return
1069 0 : }
1070 :
1071 1 : for {
1072 1 : // Logic copied from sort.Search.
1073 1 : j, k := 0, int(i.n.count)
1074 1 : for j < k {
1075 1 : h := int(uint(j+k) >> 1) // avoid overflow when computing h
1076 1 :
1077 1 : // j ≤ h < k
1078 1 : if !fn(i.n.items[h]) {
1079 1 : j = h + 1 // preserves f(j-1) == false
1080 1 : } else {
1081 1 : k = h // preserves f(k) == true
1082 1 : }
1083 : }
1084 :
1085 1 : i.pos = int16(j)
1086 1 : if i.n.leaf {
1087 1 : if i.pos == i.n.count {
1088 1 : i.next()
1089 1 : }
1090 1 : return
1091 : }
1092 1 : i.descend(i.n, i.pos)
1093 : }
1094 : }
1095 :
1096 : // first seeks to the first item in the btree.
1097 1 : func (i *iterator) first() {
1098 1 : i.reset()
1099 1 : if i.r == nil {
1100 1 : return
1101 1 : }
1102 1 : for !i.n.leaf {
1103 1 : i.descend(i.n, 0)
1104 1 : }
1105 1 : i.pos = 0
1106 : }
1107 :
1108 : // last seeks to the last item in the btree.
1109 1 : func (i *iterator) last() {
1110 1 : i.reset()
1111 1 : if i.r == nil {
1112 1 : return
1113 1 : }
1114 1 : for !i.n.leaf {
1115 1 : i.descend(i.n, i.n.count)
1116 1 : }
1117 1 : i.pos = i.n.count - 1
1118 : }
1119 :
1120 : // next positions the iterator to the item immediately following
1121 : // its current position.
1122 1 : func (i *iterator) next() {
1123 1 : if i.r == nil {
1124 0 : return
1125 0 : }
1126 :
1127 1 : if i.n.leaf {
1128 1 : if i.pos < i.n.count {
1129 1 : i.pos++
1130 1 : }
1131 1 : if i.pos < i.n.count {
1132 1 : return
1133 1 : }
1134 1 : for i.s.len() > 0 && i.pos >= i.n.count {
1135 1 : i.ascend()
1136 1 : }
1137 1 : return
1138 : }
1139 :
1140 1 : i.descend(i.n, i.pos+1)
1141 1 : for !i.n.leaf {
1142 1 : i.descend(i.n, 0)
1143 1 : }
1144 1 : i.pos = 0
1145 : }
1146 :
1147 : // prev positions the iterator to the item immediately preceding
1148 : // its current position.
1149 1 : func (i *iterator) prev() {
1150 1 : if i.r == nil {
1151 0 : return
1152 0 : }
1153 :
1154 1 : if i.n.leaf {
1155 1 : i.pos--
1156 1 : if i.pos >= 0 {
1157 1 : return
1158 1 : }
1159 1 : for i.s.len() > 0 && i.pos < 0 {
1160 1 : i.ascend()
1161 1 : i.pos--
1162 1 : }
1163 1 : return
1164 : }
1165 :
1166 1 : i.descend(i.n, i.pos)
1167 1 : for !i.n.leaf {
1168 1 : i.descend(i.n, i.n.count)
1169 1 : }
1170 1 : i.pos = i.n.count - 1
1171 : }
1172 :
1173 : // valid returns whether the iterator is positioned at a valid position.
1174 1 : func (i *iterator) valid() bool {
1175 1 : return i.r != nil && i.pos >= 0 && i.pos < i.n.count
1176 1 : }
1177 :
1178 : // cur returns the item at the iterator's current position. It is illegal
1179 : // to call cur if the iterator is not valid.
1180 1 : func (i *iterator) cur() *FileMetadata {
1181 1 : if invariants.Enabled && !i.valid() {
1182 0 : panic("btree iterator.cur invoked on invalid iterator")
1183 : }
1184 1 : return i.n.items[i.pos]
1185 : }
|