/src/regex/regex-automata/src/util/sparse_set.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | This module defines a sparse set data structure. Its most interesting |
3 | | properties are: |
4 | | |
5 | | * They preserve insertion order. |
6 | | * Set membership testing is done in constant time. |
7 | | * Set insertion is done in constant time. |
8 | | * Clearing the set is done in constant time. |
9 | | |
10 | | The cost for doing this is that the capacity of the set needs to be known up |
11 | | front, and the elements in the set are limited to state identifiers. |
12 | | |
13 | | These sets are principally used when traversing an NFA state graph. This |
14 | | happens at search time, for example, in the PikeVM. It also happens during DFA |
15 | | determinization. |
16 | | */ |
17 | | |
18 | | use alloc::{vec, vec::Vec}; |
19 | | |
20 | | use crate::util::primitives::StateID; |
21 | | |
22 | | /// A pairse of sparse sets. |
23 | | /// |
24 | | /// This is useful when one needs to compute NFA epsilon closures from a |
25 | | /// previous set of states derived from an epsilon closure. One set can be the |
26 | | /// starting states where as the other set can be the destination states after |
27 | | /// following the transitions for a particular byte of input. |
28 | | /// |
29 | | /// There is no significance to 'set1' or 'set2'. They are both sparse sets of |
30 | | /// the same size. |
31 | | /// |
32 | | /// The members of this struct are exposed so that callers may borrow 'set1' |
33 | | /// and 'set2' individually without being force to borrow both at the same |
34 | | /// time. |
35 | | #[derive(Clone, Debug)] |
36 | | pub(crate) struct SparseSets { |
37 | | pub(crate) set1: SparseSet, |
38 | | pub(crate) set2: SparseSet, |
39 | | } |
40 | | |
41 | | impl SparseSets { |
42 | | /// Create a new pair of sparse sets where each set has the given capacity. |
43 | | /// |
44 | | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
45 | 114k | pub(crate) fn new(capacity: usize) -> SparseSets { |
46 | 114k | SparseSets { |
47 | 114k | set1: SparseSet::new(capacity), |
48 | 114k | set2: SparseSet::new(capacity), |
49 | 114k | } |
50 | 114k | } |
51 | | |
52 | | /// Resizes these sparse sets to have the new capacity given. |
53 | | /// |
54 | | /// The sets are automatically cleared. |
55 | | /// |
56 | | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
57 | | #[inline] |
58 | 0 | pub(crate) fn resize(&mut self, new_capacity: usize) { |
59 | 0 | self.set1.resize(new_capacity); |
60 | 0 | self.set2.resize(new_capacity); |
61 | 0 | } |
62 | | |
63 | | /// Clear both sparse sets. |
64 | 16.2M | pub(crate) fn clear(&mut self) { |
65 | 16.2M | self.set1.clear(); |
66 | 16.2M | self.set2.clear(); |
67 | 16.2M | } |
68 | | |
69 | | /// Swap set1 with set2. |
70 | 716k | pub(crate) fn swap(&mut self) { |
71 | 716k | core::mem::swap(&mut self.set1, &mut self.set2); |
72 | 716k | } |
73 | | |
74 | | /// Returns the memory usage, in bytes, used by this pair of sparse sets. |
75 | 2.47M | pub(crate) fn memory_usage(&self) -> usize { |
76 | 2.47M | self.set1.memory_usage() + self.set2.memory_usage() |
77 | 2.47M | } |
78 | | } |
79 | | |
80 | | /// A sparse set used for representing ordered NFA states. |
81 | | /// |
82 | | /// This supports constant time addition and membership testing. Clearing an |
83 | | /// entire set can also be done in constant time. Iteration yields elements |
84 | | /// in the order in which they were inserted. |
85 | | /// |
86 | | /// The data structure is based on: https://research.swtch.com/sparse |
87 | | /// Note though that we don't actually use uninitialized memory. We generally |
88 | | /// reuse sparse sets, so the initial allocation cost is bareable. However, its |
89 | | /// other properties listed above are extremely useful. |
90 | | #[derive(Clone)] |
91 | | pub(crate) struct SparseSet { |
92 | | /// The number of elements currently in this set. |
93 | | len: usize, |
94 | | /// Dense contains the ids in the order in which they were inserted. |
95 | | dense: Vec<StateID>, |
96 | | /// Sparse maps ids to their location in dense. |
97 | | /// |
98 | | /// A state ID is in the set if and only if |
99 | | /// sparse[id] < len && id == dense[sparse[id]]. |
100 | | /// |
101 | | /// Note that these are indices into 'dense'. It's a little weird to use |
102 | | /// StateID here, but we know our length can never exceed the bounds of |
103 | | /// StateID (enforced by 'resize') and StateID will be at most 4 bytes |
104 | | /// where as a usize is likely double that in most cases. |
105 | | sparse: Vec<StateID>, |
106 | | } |
107 | | |
108 | | impl SparseSet { |
109 | | /// Create a new sparse set with the given capacity. |
110 | | /// |
111 | | /// Sparse sets have a fixed size and they cannot grow. Attempting to |
112 | | /// insert more distinct elements than the total capacity of the set will |
113 | | /// result in a panic. |
114 | | /// |
115 | | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
116 | | #[inline] |
117 | 478k | pub(crate) fn new(capacity: usize) -> SparseSet { |
118 | 478k | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |
119 | 478k | set.resize(capacity); |
120 | 478k | set |
121 | 478k | } |
122 | | |
123 | | /// Resizes this sparse set to have the new capacity given. |
124 | | /// |
125 | | /// This set is automatically cleared. |
126 | | /// |
127 | | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
128 | | #[inline] |
129 | 560k | pub(crate) fn resize(&mut self, new_capacity: usize) { |
130 | 560k | assert!( |
131 | 560k | new_capacity <= StateID::LIMIT, |
132 | 0 | "sparse set capacity cannot excced {:?}", |
133 | | StateID::LIMIT |
134 | | ); |
135 | 560k | self.clear(); |
136 | 560k | self.dense.resize(new_capacity, StateID::ZERO); |
137 | 560k | self.sparse.resize(new_capacity, StateID::ZERO); |
138 | 560k | } |
139 | | |
140 | | /// Returns the capacity of this set. |
141 | | /// |
142 | | /// The capacity represents a fixed limit on the number of distinct |
143 | | /// elements that are allowed in this set. The capacity cannot be changed. |
144 | | #[inline] |
145 | 5.90G | pub(crate) fn capacity(&self) -> usize { |
146 | 5.90G | self.dense.len() |
147 | 5.90G | } |
148 | | |
149 | | /// Returns the number of elements in this set. |
150 | | #[inline] |
151 | 12.4G | pub(crate) fn len(&self) -> usize { |
152 | 12.4G | self.len |
153 | 12.4G | } |
154 | | |
155 | | /// Returns true if and only if this set is empty. |
156 | | #[inline] |
157 | 19.0M | pub(crate) fn is_empty(&self) -> bool { |
158 | 19.0M | self.len() == 0 |
159 | 19.0M | } |
160 | | |
161 | | /// Insert the state ID value into this set and return true if the given |
162 | | /// state ID was not previously in this set. |
163 | | /// |
164 | | /// This operation is idempotent. If the given value is already in this |
165 | | /// set, then this is a no-op. |
166 | | /// |
167 | | /// If more than `capacity` ids are inserted, then this panics. |
168 | | /// |
169 | | /// This is marked as inline(always) since the compiler won't inline it |
170 | | /// otherwise, and it's a fairly hot piece of code in DFA determinization. |
171 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
172 | 6.52G | pub(crate) fn insert(&mut self, id: StateID) -> bool { |
173 | 6.52G | if self.contains(id) { |
174 | 628M | return false; |
175 | 5.90G | } |
176 | 5.90G | |
177 | 5.90G | let i = self.len(); |
178 | 5.90G | assert!( |
179 | 5.90G | i < self.capacity(), |
180 | 0 | "{:?} exceeds capacity of {:?} when inserting {:?}", |
181 | 0 | i, |
182 | 0 | self.capacity(), |
183 | | id, |
184 | | ); |
185 | | // OK since i < self.capacity() and self.capacity() is guaranteed to |
186 | | // be <= StateID::LIMIT. |
187 | 5.90G | let index = StateID::new_unchecked(i); |
188 | 5.90G | self.dense[index] = id; |
189 | 5.90G | self.sparse[id] = index; |
190 | 5.90G | self.len += 1; |
191 | 5.90G | true |
192 | 6.52G | } |
193 | | |
194 | | /// Returns true if and only if this set contains the given value. |
195 | | #[inline] |
196 | 6.52G | pub(crate) fn contains(&self, id: StateID) -> bool { |
197 | 6.52G | let index = self.sparse[id]; |
198 | 6.52G | index.as_usize() < self.len() && self.dense[index] == id |
199 | 6.52G | } |
200 | | |
201 | | /// Clear this set such that it has no members. |
202 | | #[inline] |
203 | 39.4M | pub(crate) fn clear(&mut self) { |
204 | 39.4M | self.len = 0; |
205 | 39.4M | } |
206 | | |
207 | | #[inline] |
208 | 36.4M | pub(crate) fn iter(&self) -> SparseSetIter<'_> { |
209 | 36.4M | SparseSetIter(self.dense[..self.len()].iter()) |
210 | 36.4M | } |
211 | | |
212 | | /// Returns the heap memory usage, in bytes, used by this sparse set. |
213 | | #[inline] |
214 | 4.95M | pub(crate) fn memory_usage(&self) -> usize { |
215 | 4.95M | self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE |
216 | 4.95M | } |
217 | | } |
218 | | |
219 | | impl core::fmt::Debug for SparseSet { |
220 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
221 | 0 | let elements: Vec<StateID> = self.iter().collect(); |
222 | 0 | f.debug_tuple("SparseSet").field(&elements).finish() |
223 | 0 | } |
224 | | } |
225 | | |
226 | | /// An iterator over all elements in a sparse set. |
227 | | /// |
228 | | /// The lifetime `'a` refers to the lifetime of the set being iterated over. |
229 | | #[derive(Debug)] |
230 | | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); |
231 | | |
232 | | impl<'a> Iterator for SparseSetIter<'a> { |
233 | | type Item = StateID; |
234 | | |
235 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
236 | 5.78G | fn next(&mut self) -> Option<StateID> { |
237 | 5.78G | self.0.next().map(|&id| id) |
238 | 5.78G | } |
239 | | } |