Coverage Report

Created: 2025-07-23 07:16

/src/regex/regex-automata/src/util/sparse_set.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
This module defines a sparse set data structure. Its most interesting
3
properties are:
4
5
* They preserve insertion order.
6
* Set membership testing is done in constant time.
7
* Set insertion is done in constant time.
8
* Clearing the set is done in constant time.
9
10
The cost for doing this is that the capacity of the set needs to be known up
11
front, and the elements in the set are limited to state identifiers.
12
13
These sets are principally used when traversing an NFA state graph. This
14
happens at search time, for example, in the PikeVM. It also happens during DFA
15
determinization.
16
*/
17
18
use alloc::{vec, vec::Vec};
19
20
use crate::util::primitives::StateID;
21
22
/// A pairse of sparse sets.
23
///
24
/// This is useful when one needs to compute NFA epsilon closures from a
25
/// previous set of states derived from an epsilon closure. One set can be the
26
/// starting states where as the other set can be the destination states after
27
/// following the transitions for a particular byte of input.
28
///
29
/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
30
/// the same size.
31
///
32
/// The members of this struct are exposed so that callers may borrow 'set1'
33
/// and 'set2' individually without being force to borrow both at the same
34
/// time.
35
#[derive(Clone, Debug)]
36
pub(crate) struct SparseSets {
37
    pub(crate) set1: SparseSet,
38
    pub(crate) set2: SparseSet,
39
}
40
41
impl SparseSets {
42
    /// Create a new pair of sparse sets where each set has the given capacity.
43
    ///
44
    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
45
114k
    pub(crate) fn new(capacity: usize) -> SparseSets {
46
114k
        SparseSets {
47
114k
            set1: SparseSet::new(capacity),
48
114k
            set2: SparseSet::new(capacity),
49
114k
        }
50
114k
    }
51
52
    /// Resizes these sparse sets to have the new capacity given.
53
    ///
54
    /// The sets are automatically cleared.
55
    ///
56
    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
57
    #[inline]
58
0
    pub(crate) fn resize(&mut self, new_capacity: usize) {
59
0
        self.set1.resize(new_capacity);
60
0
        self.set2.resize(new_capacity);
61
0
    }
62
63
    /// Clear both sparse sets.
64
16.2M
    pub(crate) fn clear(&mut self) {
65
16.2M
        self.set1.clear();
66
16.2M
        self.set2.clear();
67
16.2M
    }
68
69
    /// Swap set1 with set2.
70
716k
    pub(crate) fn swap(&mut self) {
71
716k
        core::mem::swap(&mut self.set1, &mut self.set2);
72
716k
    }
73
74
    /// Returns the memory usage, in bytes, used by this pair of sparse sets.
75
2.47M
    pub(crate) fn memory_usage(&self) -> usize {
76
2.47M
        self.set1.memory_usage() + self.set2.memory_usage()
77
2.47M
    }
78
}
79
80
/// A sparse set used for representing ordered NFA states.
81
///
82
/// This supports constant time addition and membership testing. Clearing an
83
/// entire set can also be done in constant time. Iteration yields elements
84
/// in the order in which they were inserted.
85
///
86
/// The data structure is based on: https://research.swtch.com/sparse
87
/// Note though that we don't actually use uninitialized memory. We generally
88
/// reuse sparse sets, so the initial allocation cost is bareable. However, its
89
/// other properties listed above are extremely useful.
90
#[derive(Clone)]
91
pub(crate) struct SparseSet {
92
    /// The number of elements currently in this set.
93
    len: usize,
94
    /// Dense contains the ids in the order in which they were inserted.
95
    dense: Vec<StateID>,
96
    /// Sparse maps ids to their location in dense.
97
    ///
98
    /// A state ID is in the set if and only if
99
    /// sparse[id] < len && id == dense[sparse[id]].
100
    ///
101
    /// Note that these are indices into 'dense'. It's a little weird to use
102
    /// StateID here, but we know our length can never exceed the bounds of
103
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
104
    /// where as a usize is likely double that in most cases.
105
    sparse: Vec<StateID>,
106
}
107
108
impl SparseSet {
109
    /// Create a new sparse set with the given capacity.
110
    ///
111
    /// Sparse sets have a fixed size and they cannot grow. Attempting to
112
    /// insert more distinct elements than the total capacity of the set will
113
    /// result in a panic.
114
    ///
115
    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
116
    #[inline]
117
478k
    pub(crate) fn new(capacity: usize) -> SparseSet {
118
478k
        let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
119
478k
        set.resize(capacity);
120
478k
        set
121
478k
    }
122
123
    /// Resizes this sparse set to have the new capacity given.
124
    ///
125
    /// This set is automatically cleared.
126
    ///
127
    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
128
    #[inline]
129
560k
    pub(crate) fn resize(&mut self, new_capacity: usize) {
130
560k
        assert!(
131
560k
            new_capacity <= StateID::LIMIT,
132
0
            "sparse set capacity cannot excced {:?}",
133
            StateID::LIMIT
134
        );
135
560k
        self.clear();
136
560k
        self.dense.resize(new_capacity, StateID::ZERO);
137
560k
        self.sparse.resize(new_capacity, StateID::ZERO);
138
560k
    }
139
140
    /// Returns the capacity of this set.
141
    ///
142
    /// The capacity represents a fixed limit on the number of distinct
143
    /// elements that are allowed in this set. The capacity cannot be changed.
144
    #[inline]
145
5.90G
    pub(crate) fn capacity(&self) -> usize {
146
5.90G
        self.dense.len()
147
5.90G
    }
148
149
    /// Returns the number of elements in this set.
150
    #[inline]
151
12.4G
    pub(crate) fn len(&self) -> usize {
152
12.4G
        self.len
153
12.4G
    }
154
155
    /// Returns true if and only if this set is empty.
156
    #[inline]
157
19.0M
    pub(crate) fn is_empty(&self) -> bool {
158
19.0M
        self.len() == 0
159
19.0M
    }
160
161
    /// Insert the state ID value into this set and return true if the given
162
    /// state ID was not previously in this set.
163
    ///
164
    /// This operation is idempotent. If the given value is already in this
165
    /// set, then this is a no-op.
166
    ///
167
    /// If more than `capacity` ids are inserted, then this panics.
168
    ///
169
    /// This is marked as inline(always) since the compiler won't inline it
170
    /// otherwise, and it's a fairly hot piece of code in DFA determinization.
171
    #[cfg_attr(feature = "perf-inline", inline(always))]
172
6.52G
    pub(crate) fn insert(&mut self, id: StateID) -> bool {
173
6.52G
        if self.contains(id) {
174
628M
            return false;
175
5.90G
        }
176
5.90G
177
5.90G
        let i = self.len();
178
5.90G
        assert!(
179
5.90G
            i < self.capacity(),
180
0
            "{:?} exceeds capacity of {:?} when inserting {:?}",
181
0
            i,
182
0
            self.capacity(),
183
            id,
184
        );
185
        // OK since i < self.capacity() and self.capacity() is guaranteed to
186
        // be <= StateID::LIMIT.
187
5.90G
        let index = StateID::new_unchecked(i);
188
5.90G
        self.dense[index] = id;
189
5.90G
        self.sparse[id] = index;
190
5.90G
        self.len += 1;
191
5.90G
        true
192
6.52G
    }
193
194
    /// Returns true if and only if this set contains the given value.
195
    #[inline]
196
6.52G
    pub(crate) fn contains(&self, id: StateID) -> bool {
197
6.52G
        let index = self.sparse[id];
198
6.52G
        index.as_usize() < self.len() && self.dense[index] == id
199
6.52G
    }
200
201
    /// Clear this set such that it has no members.
202
    #[inline]
203
39.4M
    pub(crate) fn clear(&mut self) {
204
39.4M
        self.len = 0;
205
39.4M
    }
206
207
    #[inline]
208
36.4M
    pub(crate) fn iter(&self) -> SparseSetIter<'_> {
209
36.4M
        SparseSetIter(self.dense[..self.len()].iter())
210
36.4M
    }
211
212
    /// Returns the heap memory usage, in bytes, used by this sparse set.
213
    #[inline]
214
4.95M
    pub(crate) fn memory_usage(&self) -> usize {
215
4.95M
        self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE
216
4.95M
    }
217
}
218
219
impl core::fmt::Debug for SparseSet {
220
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
221
0
        let elements: Vec<StateID> = self.iter().collect();
222
0
        f.debug_tuple("SparseSet").field(&elements).finish()
223
0
    }
224
}
225
226
/// An iterator over all elements in a sparse set.
227
///
228
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
229
#[derive(Debug)]
230
pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
231
232
impl<'a> Iterator for SparseSetIter<'a> {
233
    type Item = StateID;
234
235
    #[cfg_attr(feature = "perf-inline", inline(always))]
236
5.78G
    fn next(&mut self) -> Option<StateID> {
237
5.78G
        self.0.next().map(|&id| id)
238
5.78G
    }
239
}