Line data Source code
1 : // Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "iter"
10 :
11 : "github.com/cockroachdb/pebble/internal/invariants"
12 : )
13 :
14 : // BitmapRunLengthEncoder encodes a bitmap. It uses a run-length encoding for
15 : // runs of all ones and all zeros. The encoding is intended for compactly
16 : // representing bitmaps with significant spatial locality and when readers only
17 : // read the bitmap sequentially.
18 : //
19 : // The BitmapRunLengthEncoder considers groups of 8-bits of the bitmap at a
20 : // time. If one of these 8-bit groups (a byte) contains all set bits (0xFF) or
21 : // all unset bits (0x00), the encoder uses a run-length encoding for the byte
22 : // and any consecutive identical bytes. All other bytes are encoded as
23 : // themselves.
24 : //
25 : // For example, an all-set bitmap of 1024 bits would be encoded as the 0xFF byte
26 : // followed by the varint encoding of 128 (1024/8 = 128).
27 : type BitmapRunLengthEncoder struct {
28 : // buf holds the encoded bitmap up until the beginning of the pending run of
29 : // all-set bits indicated by allSetRunLength.
30 : buf []byte
31 : // allSetRunLength is the count of consecutive pending bytes that are all
32 : // set (preceding the byte currently being buffered in currByte). When the
33 : // current byte is finished, if allSetRunLength is greater than zero, the
34 : // all-set sequence must be encoded to buf before the current byte.
35 : allSetRunLength int
36 : // currByte is the current byte being buffered. After the first call to Set,
37 : // currByte always has at least 1 set bit. currByteIndex is the index of the
38 : // buffered byte in a fully-materialized bitmap. That is, currByte
39 : // accumulates the bits within the bitmap in indices [currByteIndex*8,
40 : // (currByteIndex+1)*8).
41 : currByte byte
42 : currByteIndex int
43 : }
44 :
45 : // Init initializes or resets the encoder to its initial state.
46 1 : func (bb *BitmapRunLengthEncoder) Init() {
47 1 : invariants.MaybeMangle(bb.buf)
48 1 : bb.buf = bb.buf[:0]
49 1 : bb.allSetRunLength = 0
50 1 : bb.currByte = 0
51 1 : bb.currByteIndex = -1
52 1 : }
53 :
54 : // Set records that the i'th bit in the bitmap is Set. Set must be called with i
55 : // in increasing order.
56 1 : func (bb *BitmapRunLengthEncoder) Set(i int) {
57 1 : // bi is the index of the byte that would contain the i'th bit in a
58 1 : // fully-materialized bitmap
59 1 : bi := i / 8
60 1 :
61 1 : // If the i'th bit falls into the same byte as the previous call to set,
62 1 : // we can just set the bit in currByte.
63 1 : if bi == bb.currByteIndex {
64 1 : bb.currByte |= 1 << (i % 8)
65 1 : return
66 1 : }
67 :
68 : // The i'th bit falls into a new byte. We may need to finish encoding
69 : // previous bits whose state is buffered on the encoder.
70 :
71 : // Append what we accumulated within currByte.
72 1 : switch bb.currByte {
73 1 : case 0x00:
74 : // If and only if this is the first call to set, currByte is all
75 : // zeros and we have nothing to append.
76 1 : case 0xFF:
77 1 : // The current byte contains all set bits. Increment the count of the
78 1 : // current pending run.
79 1 : bb.allSetRunLength++
80 1 : default:
81 1 : // The current byte contains a mix of set and unset bits. If there's a
82 1 : // pending all-ones run, encode it.
83 1 : bb.maybeFlushAllSetRun()
84 1 : // And append the mixed byte to buf.
85 1 : bb.buf = append(bb.buf, bb.currByte)
86 : }
87 :
88 : // If there's a gap between the index of the byte we just finished and the
89 : // index of the new byte we're beginning, we need to append a run of zeros.
90 1 : if n := bi - bb.currByteIndex - 1; n > 0 {
91 1 : bb.maybeFlushAllSetRun()
92 1 : bb.buf = append(bb.buf, 0x00)
93 1 : bb.buf = binary.AppendUvarint(bb.buf, uint64(n))
94 1 : }
95 :
96 : // Save the bit on currByte.
97 1 : bb.currByteIndex = bi
98 1 : bb.currByte = 1 << (i % 8)
99 : }
100 :
101 1 : func (bb *BitmapRunLengthEncoder) maybeFlushAllSetRun() {
102 1 : if bb.allSetRunLength > 0 {
103 1 : bb.buf = append(bb.buf, 0xFF)
104 1 : bb.buf = binary.AppendUvarint(bb.buf, uint64(bb.allSetRunLength))
105 1 : bb.allSetRunLength = 0
106 1 : }
107 : }
108 :
109 : // FinishAndAppend appends the encoded bitmap to buf and returns the result.
110 1 : func (bb *BitmapRunLengthEncoder) FinishAndAppend(buf []byte) []byte {
111 1 : // Finish the current pending state.
112 1 : switch bb.currByte {
113 1 : case 0xFF:
114 1 : bb.allSetRunLength++
115 1 : bb.maybeFlushAllSetRun()
116 0 : case 0x00:
117 : // Only possible if Set was never called. Do nothing.
118 1 : default:
119 1 : bb.maybeFlushAllSetRun()
120 1 : bb.buf = append(bb.buf, bb.currByte)
121 : }
122 1 : return append(buf, bb.buf...)
123 : }
124 :
125 : // Size returns the current number of bytes in the buf, accounting for the buf
126 : // state when our current pending byte has been written. This is useful for
127 : // estimating the size of the bitmap without having to copy the result of
128 : // FinishAndAppend.
129 1 : func (bb *BitmapRunLengthEncoder) Size() int {
130 1 : switch bb.currByte {
131 1 : case 0xFF:
132 1 : // 1 byte for 0xFF + n bytes for varint encoding of
133 1 : // (allSetRunLength + 1).
134 1 : v := bb.allSetRunLength + 1
135 1 : varintSize := 1
136 1 : for v >= 0x80 {
137 0 : v >>= 7
138 0 : varintSize++
139 0 : }
140 1 : return len(bb.buf) + 1 + varintSize
141 0 : case 0x00:
142 0 : // Only possible if Set was never called. Do nothing.
143 0 : return len(bb.buf)
144 1 : default:
145 1 : if bb.allSetRunLength > 0 {
146 1 : // 1 byte for 0xFF + n bytes for varint encoding of
147 1 : // allSetRunLength + 1 byte for current byte.
148 1 : v := bb.allSetRunLength
149 1 : varintSize := 1
150 1 : for v >= 0x80 {
151 0 : v >>= 7
152 0 : varintSize++
153 0 : }
154 1 : return len(bb.buf) + 1 + varintSize + 1
155 : }
156 : // For mixed byte case without pending all-set run: just 1 byte for
157 : // current byte
158 1 : return len(bb.buf) + 1
159 : }
160 : }
161 :
162 : // IterSetBitsInRunLengthBitmap returns an iterator over the indices of set bits
163 : // within the provided run-length encoded bitmap.
164 1 : func IterSetBitsInRunLengthBitmap(bitmap []byte) iter.Seq[int] {
165 1 : return func(yield func(int) bool) {
166 1 : // i is the index of the current byte in the bitmap.
167 1 : i := 0
168 1 : // bit is the index of the current bit in the overall bitmap.
169 1 : bit := 0
170 1 : for i < len(bitmap) {
171 1 : b := bitmap[i]
172 1 : i++
173 1 : switch b {
174 1 : case 0x00:
175 1 : // Decode the varint. A decoded value of v indicates that there
176 1 : // are 8*v consecutive unset bits in the bitmap.
177 1 : v, n := binary.Uvarint(bitmap[i:])
178 1 : i += n
179 1 : bit += 8 * int(v)
180 1 : case 0xFF:
181 1 : // Decode the varint. A decoded value of v indicates that there
182 1 : // are 8*v consecutive set bits in the bitmap.
183 1 : v, n := binary.Uvarint(bitmap[i:])
184 1 : i += n
185 1 : for j := range 8 * int(v) {
186 1 : if !yield(bit + j) {
187 0 : return
188 0 : }
189 : }
190 1 : bit += 8 * int(v)
191 1 : default:
192 1 : // b is a byte that is neither all ones nor all zeros. Check
193 1 : // each bit and yield the index of any set bits to the iterator.
194 1 : for j := range 8 {
195 1 : if b&(1<<j) == 0 {
196 1 : continue
197 : }
198 1 : if !yield(bit + j) {
199 0 : return
200 0 : }
201 : }
202 1 : bit += 8
203 : }
204 : }
205 : }
206 : }
|