LCOV - code coverage report
Current view: top level - pebble/internal/batchskl - skl.go
Test: 2024-12-25 08:17Z 78d53457 - meta test only.lcov
Date: 2024-12-25 08:18:29
Coverage:   Lines: 187 / 228 (82.0 %)   Functions: 0 / 0 (-)

          Line data    Source code
       1             : /*
       2             :  * Copyright 2017 Dgraph Labs, Inc. and Contributors
       3             :  * Modifications copyright (C) 2017 Andy Kimball and Contributors
       4             :  *
       5             :  * Licensed under the Apache License, Version 2.0 (the "License")
       6             :  * you may not use this file except in compliance with the License.
       7             :  * You may obtain a copy of the License at
       8             :  *
       9             :  *     http://www.apache.org/licenses/LICENSE-2.0
      10             :  *
      11             :  * Unless required by applicable law or agreed to in writing, software
      12             :  * distributed under the License is distributed on an "AS IS" BASIS,
      13             :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      14             :  * See the License for the specific language governing permissions and
      15             :  * limitations under the License.
      16             :  */
      17             : 
      18             : /*
      19             : Adapted from RocksDB inline skiplist.
      20             : 
      21             : Key differences:
      22             : - No optimization for sequential inserts (no "prev").
      23             : - No custom comparator.
      24             : - Support overwrites. This requires care when we see the same key when inserting.
      25             :   For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
      26             :         there is no need for values. We don't intend to support versioning. In-place updates of values
      27             :         would be more efficient.
      28             : - We discard all non-concurrent code.
      29             : - We do not support Splices. This simplifies the code a lot.
      30             : - No AllocateNode or other pointer arithmetic.
      31             : - We combine the findLessThan, findGreaterOrEqual, etc into one function.
      32             : */
      33             : 
      34             : /*
      35             : Further adapted from Badger: https://github.com/dgraph-io/badger.
      36             : 
      37             : Key differences:
      38             : - Support for previous pointers - doubly linked lists. Note that it's up to higher
      39             :   level code to deal with the intermediate state that occurs during insertion,
      40             :   where node A is linked to node B, but node B is not yet linked back to node A.
      41             : - Iterator includes mutator functions.
      42             : */
      43             : 
      44             : /*
      45             : Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl
      46             : 
      47             : Key differences:
      48             : - Removed support for deletion.
      49             : - Removed support for concurrency.
      50             : - External storage of keys.
      51             : - Node storage grows to an arbitrary size.
      52             : */
      53             : 
      54             : package batchskl // import "github.com/cockroachdb/pebble/internal/batchskl"
      55             : 
      56             : import (
      57             :         "bytes"
      58             :         "encoding/binary"
      59             :         "fmt"
      60             :         "math"
      61             :         "math/rand/v2"
      62             :         "unsafe"
      63             : 
      64             :         "github.com/cockroachdb/errors"
      65             :         "github.com/cockroachdb/pebble/internal/base"
      66             :         "github.com/cockroachdb/pebble/internal/constants"
      67             : )
      68             : 
      69             : const (
      70             :         maxHeight    = 20
      71             :         maxNodeSize  = uint64(unsafe.Sizeof(node{}))
      72             :         linksSize    = uint64(unsafe.Sizeof(links{}))
      73             :         maxNodesSize = constants.MaxUint32OrInt
      74             : )
      75             : 
      76             : var (
      77             :         // ErrExists indicates that a duplicate record was inserted. This should never
      78             :         // happen for normal usage of batchskl as every key should have a unique
      79             :         // sequence number.
      80             :         ErrExists = errors.New("record with this key already exists")
      81             : 
      82             :         // ErrTooManyRecords is a sentinel error returned when the size of the raw
      83             :         // nodes slice exceeds the maximum allowed size (currently 1 << 32 - 1). This
      84             :         // corresponds to ~117 M skiplist entries.
      85             :         ErrTooManyRecords = errors.New("too many records")
      86             : )
      87             : 
      88             : type links struct {
      89             :         next uint32
      90             :         prev uint32
      91             : }
      92             : 
      93             : type node struct {
      94             :         // The offset of the start of the record in the storage.
      95             :         offset uint32
      96             :         // The offset of the start and end of the key in storage.
      97             :         keyStart uint32
      98             :         keyEnd   uint32
      99             :         // A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key
     100             :         // during seek operations. The key retrieval can be expensive purely due to
     101             :         // cache misses while the abbreviatedKey stored here will be in the same
     102             :         // cache line as the key and the links making accessing and comparing against
     103             :         // it almost free.
     104             :         abbreviatedKey uint64
     105             :         // Most nodes do not need to use the full height of the link tower, since the
     106             :         // probability of each successive level decreases exponentially. Because
     107             :         // these elements are never accessed, they do not need to be allocated.
     108             :         // Therefore, when a node is allocated, its memory footprint is deliberately
     109             :         // truncated to not include unneeded link elements.
     110             :         links [maxHeight]links
     111             : }
     112             : 
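
The abbreviatedKey field above deserves a standalone illustration. The sketch below is not part of this package: the real abbreviation is supplied by the caller through the base.AbbreviatedKey callback passed to NewSkiplist/Init, and packing the first eight key bytes big-endian is only one plausible implementation. It shows why a mismatch in the 8-byte abbreviation decides an ordering without touching the full keys, while ties fall back to an ordinary byte comparison, mirroring the checks in findSplice below.

    // Sketch only: a hypothetical 8-byte abbreviation and the comparison fast
    // path it enables.
    package main

    import (
            "bytes"
            "encoding/binary"
            "fmt"
    )

    // abbreviate packs the first eight bytes of key big-endian, zero-padding
    // shorter keys, so the numeric order of the result is consistent with the
    // byte order of the key prefixes.
    func abbreviate(key []byte) uint64 {
            var buf [8]byte
            copy(buf[:], key)
            return binary.BigEndian.Uint64(buf[:])
    }

    // compare decides most orderings from the abbreviations alone and falls
    // back to a full key comparison only on a tie.
    func compare(a, b []byte) int {
            aa, ab := abbreviate(a), abbreviate(b)
            if aa != ab {
                    if aa < ab {
                            return -1
                    }
                    return +1
            }
            return bytes.Compare(a, b)
    }

    func main() {
            fmt.Println(compare([]byte("apple"), []byte("banana")))        // -1, decided by the abbreviations alone
            fmt.Println(compare([]byte("prefix00a"), []byte("prefix00b"))) // -1, tie on the first 8 bytes, full compare
    }
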
      113             : // Skiplist is a fast, non-concurrent skiplist implementation that supports
     114             : // forward and backward iteration. See arenaskl.Skiplist for a concurrent
     115             : // skiplist. Keys and values are stored externally from the skiplist via the
     116             : // Storage interface. Deletion is not supported. Instead, higher-level code is
     117             : // expected to perform deletion via tombstones and needs to process those
     118             : // tombstones appropriately during retrieval operations.
     119             : type Skiplist struct {
     120             :         storage        *[]byte
     121             :         cmp            base.Compare
     122             :         abbreviatedKey base.AbbreviatedKey
     123             :         nodes          []byte
     124             :         head           uint32
     125             :         tail           uint32
     126             :         height         uint32 // Current height: 1 <= height <= maxHeight
     127             :         rand           rand.PCG
     128             : }
     129             : 
     130             : var (
     131             :         probabilities [maxHeight]uint32
     132             : )
     133             : 
     134           1 : func init() {
     135           1 :         const pValue = 1 / math.E
     136           1 : 
     137           1 :         // Precompute the skiplist probabilities so that only a single random number
     138           1 :         // needs to be generated and so that the optimal pvalue can be used (inverse
     139           1 :         // of Euler's number).
     140           1 :         p := float64(1.0)
     141           1 :         for i := 0; i < maxHeight; i++ {
     142           1 :                 probabilities[i] = uint32(float64(math.MaxUint32) * p)
     143           1 :                 p *= pValue
     144           1 :         }
     145             : }
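
To make the precomputation above concrete, the small standalone sketch below (not part of this package) prints the first few thresholds. randomHeight keeps incrementing h while its single uint32 draw is <= probabilities[h], so a node reaches a tower of height at least h+1 with probability roughly (1/e)^h; the expected height of a node is therefore 1 + 1/e + 1/e^2 + ... = 1/(1 - 1/e), about 1.58 links.

    // Sketch only: the same table computation as init above, printed for the
    // first few levels.
    package main

    import (
            "fmt"
            "math"
    )

    func main() {
            p := 1.0
            for i := 0; i < 4; i++ {
                    threshold := uint32(float64(math.MaxUint32) * p)
                    fmt.Printf("probabilities[%d] = %10d  (%.1f%% of MaxUint32)\n", i, threshold, 100*p)
                    p *= 1 / math.E
            }
    }
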
     146             : 
     147             : // NewSkiplist constructs and initializes a new, empty skiplist.
     148           1 : func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist {
     149           1 :         s := &Skiplist{}
     150           1 :         s.Init(storage, cmp, abbreviatedKey)
     151           1 :         return s
     152           1 : }
     153             : 
     154             : // Reset the fields in the skiplist for reuse.
     155           0 : func (s *Skiplist) Reset() {
     156           0 :         *s = Skiplist{
     157           0 :                 nodes:  s.nodes[:0],
     158           0 :                 height: 1,
     159           0 :         }
     160           0 :         const batchMaxRetainedSize = 1 << 20 // 1 MB
     161           0 :         if cap(s.nodes) > batchMaxRetainedSize {
     162           0 :                 s.nodes = nil
     163           0 :         }
     164             : }
     165             : 
     166             : // Init the skiplist to empty and re-initialize.
     167           1 : func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) {
     168           1 :         *s = Skiplist{
     169           1 :                 storage:        storage,
     170           1 :                 cmp:            cmp,
     171           1 :                 abbreviatedKey: abbreviatedKey,
     172           1 :                 nodes:          s.nodes[:0],
     173           1 :                 height:         1,
     174           1 :         }
     175           1 :         s.rand.Seed(0, rand.Uint64())
     176           1 : 
     177           1 :         const initBufSize = 256
     178           1 :         if cap(s.nodes) < initBufSize {
     179           1 :                 s.nodes = make([]byte, 0, initBufSize)
     180           1 :         }
     181             : 
     182             :         // Allocate head and tail nodes. While allocating a new node can fail, in the
     183             :         // context of initializing the skiplist we consider it unrecoverable.
     184           1 :         var err error
     185           1 :         s.head, err = s.newNode(maxHeight, 0, 0, 0, 0)
     186           1 :         if err != nil {
     187           0 :                 panic(err)
     188             :         }
     189           1 :         s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0)
     190           1 :         if err != nil {
     191           0 :                 panic(err)
     192             :         }
     193             : 
     194             :         // Link all head/tail levels together.
     195           1 :         headNode := s.node(s.head)
     196           1 :         tailNode := s.node(s.tail)
     197           1 :         for i := uint32(0); i < maxHeight; i++ {
     198           1 :                 headNode.links[i].next = s.tail
     199           1 :                 tailNode.links[i].prev = s.head
     200           1 :         }
     201             : }
     202             : 
     203             : // Add adds a new key to the skiplist if it does not yet exist. If the record
      204             : // already exists, then Add returns ErrExists.
     205           1 : func (s *Skiplist) Add(keyOffset uint32) error {
     206           1 :         data := (*s.storage)[keyOffset+1:]
     207           1 :         v, n := binary.Uvarint(data)
     208           1 :         if n <= 0 {
     209           0 :                 return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
     210           0 :         }
     211           1 :         data = data[n:]
     212           1 :         if v > uint64(len(data)) {
     213           0 :                 return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
     214           0 :         }
     215           1 :         keyStart := 1 + keyOffset + uint32(n)
     216           1 :         keyEnd := keyStart + uint32(v)
     217           1 :         key := data[:v]
     218           1 :         abbreviatedKey := s.abbreviatedKey(key)
     219           1 : 
     220           1 :         // spl holds the list of next and previous links for each level in the
     221           1 :         // skiplist indicating where the new node will be inserted.
     222           1 :         var spl [maxHeight]splice
     223           1 : 
     224           1 :         // Fast-path for in-order insertion of keys: compare the new key against the
     225           1 :         // last key.
     226           1 :         prev := s.getPrev(s.tail, 0)
     227           1 :         if prevNode := s.node(prev); prev == s.head ||
     228           1 :                 abbreviatedKey > prevNode.abbreviatedKey ||
     229           1 :                 (abbreviatedKey == prevNode.abbreviatedKey &&
     230           1 :                         s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) {
     231           1 :                 for level := uint32(0); level < s.height; level++ {
     232           1 :                         spl[level].prev = s.getPrev(s.tail, level)
     233           1 :                         spl[level].next = s.tail
     234           1 :                 }
     235           1 :         } else {
     236           1 :                 s.findSplice(key, abbreviatedKey, &spl)
     237           1 :         }
     238             : 
     239           1 :         height := s.randomHeight()
     240           1 :         // Increase s.height as necessary.
     241           1 :         for ; s.height < height; s.height++ {
     242           1 :                 spl[s.height].next = s.tail
     243           1 :                 spl[s.height].prev = s.head
     244           1 :         }
     245             : 
      246             :         // We always insert from the base level and up. After a node is added at the
      247             :         // base level, we cannot create a node in the level above because it would
      248             :         // already have been discovered in the base level.
     249           1 :         nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey)
     250           1 :         if err != nil {
     251           0 :                 return err
     252           0 :         }
     253           1 :         newNode := s.node(nd)
     254           1 :         for level := uint32(0); level < height; level++ {
     255           1 :                 next := spl[level].next
     256           1 :                 prev := spl[level].prev
     257           1 :                 newNode.links[level].next = next
     258           1 :                 newNode.links[level].prev = prev
     259           1 :                 s.node(next).links[level].prev = nd
     260           1 :                 s.node(prev).links[level].next = nd
     261           1 :         }
     262             : 
     263           1 :         return nil
     264             : }
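
Add only interprets a small prefix of the record it is handed: the byte at keyOffset is the record kind (re-read later by getKey), followed by a uvarint key length and the key bytes; anything after the key is not examined here. The sketch below, outside this package, builds that layout by hand. The appendEntry helper and the kind value 1 are hypothetical stand-ins; real entries are written by pebble's batch encoder.

    // Sketch only: the storage layout Add expects at keyOffset.
    package main

    import (
            "encoding/binary"
            "fmt"
    )

    // appendEntry appends one record: kind byte, uvarint key length, key bytes,
    // then (hypothetically, as in a SET record) a uvarint value length and the
    // value bytes. Add parses only the kind, the key length, and the key.
    func appendEntry(storage []byte, kind byte, key, value []byte) ([]byte, uint32) {
            keyOffset := uint32(len(storage))
            storage = append(storage, kind)
            storage = binary.AppendUvarint(storage, uint64(len(key)))
            storage = append(storage, key...)
            storage = binary.AppendUvarint(storage, uint64(len(value)))
            storage = append(storage, value...)
            return storage, keyOffset
    }

    func main() {
            var storage []byte
            storage, keyOffset := appendEntry(storage, 1 /* stand-in kind */, []byte("key"), []byte("val"))
            fmt.Println(keyOffset, storage) // 0 [1 3 107 101 121 3 118 97 108]
            // A Skiplist initialized over &storage could now be handed keyOffset via Add.
    }
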
     265             : 
     266             : // NewIter returns a new Iterator object. The lower and upper bound parameters
      267             : // control the range of keys the iterator will return. Specifying nil for the
      268             : // lower or upper bound disables the check for that boundary. Note that the lower
      269             : // bound is not checked on {SeekGE,First} and the upper bound is not checked on
      270             : // {SeekLT,Last}. The user is expected to perform those checks. Note that it is
     271             : // safe for an iterator to be copied by value.
     272           1 : func (s *Skiplist) NewIter(lower, upper []byte) Iterator {
     273           1 :         return Iterator{list: s, lower: lower, upper: upper}
     274           1 : }
     275             : 
     276             : func (s *Skiplist) newNode(
     277             :         height,
     278             :         offset, keyStart, keyEnd uint32, abbreviatedKey uint64,
     279           1 : ) (uint32, error) {
     280           1 :         if height < 1 || height > maxHeight {
     281           0 :                 panic("height cannot be less than one or greater than the max height")
     282             :         }
     283             : 
     284           1 :         unusedSize := uint64(maxHeight-int(height)) * linksSize
     285           1 :         nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize))
     286           1 :         if err != nil {
     287           0 :                 return 0, err
     288           0 :         }
     289           1 :         nd := s.node(nodeOffset)
     290           1 : 
     291           1 :         nd.offset = offset
     292           1 :         nd.keyStart = keyStart
     293           1 :         nd.keyEnd = keyEnd
     294           1 :         nd.abbreviatedKey = abbreviatedKey
     295           1 :         return nodeOffset, nil
     296             : }
     297             : 
     298           1 : func (s *Skiplist) alloc(size uint32) (uint32, error) {
     299           1 :         offset := uint64(len(s.nodes))
     300           1 : 
     301           1 :         // We only have a need for memory up to offset + size, but we never want
     302           1 :         // to allocate a node whose tail points into unallocated memory.
     303           1 :         minAllocSize := offset + maxNodeSize
     304           1 :         if uint64(cap(s.nodes)) < minAllocSize {
     305           1 :                 allocSize := uint64(cap(s.nodes)) * 2
     306           1 :                 if allocSize < minAllocSize {
     307           0 :                         allocSize = minAllocSize
     308           0 :                 }
     309             :                 // Cap the allocation at the max allowed size to avoid wasted capacity.
     310           1 :                 if allocSize > maxNodesSize {
     311           0 :                         // The new record may still not fit within the allocation, in which case
     312           0 :                         // we return early with an error. This avoids the panic below when we
     313           0 :                         // resize the slice. It also avoids the allocation and copy.
     314           0 :                         if uint64(offset)+uint64(size) > maxNodesSize {
     315           0 :                                 return 0, errors.Wrapf(ErrTooManyRecords,
     316           0 :                                         "alloc of new record (size=%d) would overflow uint32 (current size=%d)",
     317           0 :                                         uint64(offset)+uint64(size), offset,
     318           0 :                                 )
     319           0 :                         }
     320           0 :                         allocSize = maxNodesSize
     321             :                 }
     322           1 :                 tmp := make([]byte, len(s.nodes), allocSize)
     323           1 :                 copy(tmp, s.nodes)
     324           1 :                 s.nodes = tmp
     325             :         }
     326             : 
     327           1 :         newSize := uint32(offset) + size
     328           1 :         s.nodes = s.nodes[:newSize]
     329           1 :         return uint32(offset), nil
     330             : }
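
Some rough arithmetic on these limits, assuming the node layout above on a 64-bit build: the fixed header (offset, keyStart, keyEnd, 4 bytes of padding, abbreviatedKey) is 24 bytes and each tower level adds an 8-byte links pair, so a truncated node of height h occupies 24 + 8*h bytes and a full-height node is maxNodeSize = 24 + 20*8 = 184 bytes. With the expected height of about 1.58 levels noted after init, an average node is roughly 24 + 8*1.58, about 37 bytes, so the maxNodesSize cap of 2^32 - 1 bytes holds on the order of 4.29e9 / 37, about 117 million nodes, which is where the "~117 M skiplist entries" figure in the ErrTooManyRecords comment comes from. The doubling growth policy above means the nodes slice is reallocated only O(log n) times on the way to that limit.
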
     331             : 
     332           1 : func (s *Skiplist) node(offset uint32) *node {
     333           1 :         return (*node)(unsafe.Pointer(&s.nodes[offset]))
     334           1 : }
     335             : 
     336           1 : func (s *Skiplist) randomHeight() uint32 {
     337           1 :         rnd := uint32(s.rand.Uint64())
     338           1 :         h := uint32(1)
     339           1 :         for h < maxHeight && rnd <= probabilities[h] {
     340           1 :                 h++
     341           1 :         }
     342           1 :         return h
     343             : }
     344             : 
     345           1 : func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) {
     346           1 :         prev := s.head
     347           1 : 
     348           1 :         for level := s.height - 1; ; level-- {
     349           1 :                 // The code in this loop is the same as findSpliceForLevel(). For some
      350           1 :                 // reason, calling findSpliceForLevel() here is much slower than the
      351           1 :                 // inlined code below. The excess time also shows up in the final
      352           1 :                 // return statement, which makes little sense. Revisit in Go 1.14 or
      353           1 :                 // later if inlining improves.
     354           1 : 
     355           1 :                 next := s.getNext(prev, level)
     356           1 :                 for next != s.tail {
     357           1 :                         // Assume prev.key < key.
     358           1 :                         nextNode := s.node(next)
     359           1 :                         nextAbbreviatedKey := nextNode.abbreviatedKey
     360           1 :                         if abbreviatedKey < nextAbbreviatedKey {
     361           1 :                                 // We are done for this level, since prev.key < key < next.key.
     362           1 :                                 break
     363             :                         }
     364           1 :                         if abbreviatedKey == nextAbbreviatedKey {
     365           1 :                                 if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
     366           1 :                                         // We are done for this level, since prev.key < key <= next.key.
     367           1 :                                         break
     368             :                                 }
     369             :                         }
     370             : 
     371             :                         // Keep moving right on this level.
     372           1 :                         prev = next
     373           1 :                         next = nextNode.links[level].next
     374             :                 }
     375             : 
     376           1 :                 spl[level].prev = prev
     377           1 :                 spl[level].next = next
     378           1 :                 if level == 0 {
     379           1 :                         break
     380             :                 }
     381             :         }
     382             : }
     383             : 
     384             : func (s *Skiplist) findSpliceForLevel(
     385             :         key []byte, abbreviatedKey uint64, level, start uint32,
     386           1 : ) (prev, next uint32) {
     387           1 :         prev = start
     388           1 :         next = s.getNext(prev, level)
     389           1 : 
     390           1 :         for next != s.tail {
     391           1 :                 // Assume prev.key < key.
     392           1 :                 nextNode := s.node(next)
     393           1 :                 nextAbbreviatedKey := nextNode.abbreviatedKey
     394           1 :                 if abbreviatedKey < nextAbbreviatedKey {
     395           1 :                         // We are done for this level, since prev.key < key < next.key.
     396           1 :                         break
     397             :                 }
     398           1 :                 if abbreviatedKey == nextAbbreviatedKey {
     399           1 :                         if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
      400           1 :                                 // We are done for this level, since prev.key < key <= next.key.
     401           1 :                                 break
     402             :                         }
     403             :                 }
     404             : 
     405             :                 // Keep moving right on this level.
     406           1 :                 prev = next
     407           1 :                 next = nextNode.links[level].next
     408             :         }
     409             : 
     410           1 :         return
     411             : }
     412             : 
     413           1 : func (s *Skiplist) getKey(nd uint32) base.InternalKey {
     414           1 :         n := s.node(nd)
     415           1 :         kind := base.InternalKeyKind((*s.storage)[n.offset])
     416           1 :         key := (*s.storage)[n.keyStart:n.keyEnd]
     417           1 :         return base.MakeInternalKey(key, base.SeqNum(n.offset)|base.SeqNumBatchBit, kind)
     418           1 : }
     419             : 
     420           1 : func (s *Skiplist) getNext(nd, h uint32) uint32 {
     421           1 :         return s.node(nd).links[h].next
     422           1 : }
     423             : 
     424           1 : func (s *Skiplist) getPrev(nd, h uint32) uint32 {
     425           1 :         return s.node(nd).links[h].prev
     426           1 : }
     427             : 
     428           0 : func (s *Skiplist) debug() string {
     429           0 :         var buf bytes.Buffer
     430           0 :         for level := uint32(0); level < s.height; level++ {
     431           0 :                 var count int
     432           0 :                 for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) {
     433           0 :                         count++
     434           0 :                 }
     435           0 :                 fmt.Fprintf(&buf, "%d: %d\n", level, count)
     436             :         }
     437           0 :         return buf.String()
     438             : }
     439             : 
     440             : // Silence unused warning.
     441             : var _ = (*Skiplist).debug

Generated by: LCOV version 1.14