/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.1.10/src/nfa/map.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This module contains a couple simple and purpose built hash maps. The key |
2 | | // trade off they make is that they serve as caches rather than true maps. That |
3 | | // is, inserting a new entry may cause eviction of another entry. This gives |
4 | | // us two things. First, there's less overhead associated with inserts and |
5 | | // lookups. Secondly, it lets us control our memory usage. |
6 | | // |
7 | | // These maps are used in some fairly hot code when generating NFA states for |
8 | | // large Unicode character classes. |
9 | | // |
10 | | // Instead of exposing a rich hashmap entry API, we just permit the caller |
11 | | // to produce a hash of the key directly. The hash can then be reused for both |
12 | | // lookups and insertions at the cost of leaking things a bit. But these are |
13 | | // for internal use only, so it's fine. |
14 | | // |
15 | | // The Utf8BoundedMap is used for Daciuk's algorithm for constructing a |
16 | | // (almost) minimal DFA for large Unicode character classes in linear time. |
17 | | // (Daciuk's algorithm is always used when compiling forward NFAs. For reverse |
18 | | // NFAs, it's only used when the compiler is configured to 'shrink' the NFA, |
19 | | // since there's a bit more expense in the reverse direction.) |
20 | | // |
21 | | // The Utf8SuffixMap is used when compiling large Unicode character classes for |
22 | | // reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive |
23 | | // construction of UTF-8 automata by caching common suffixes. This doesn't |
24 | | // get the same space savings as Daciuk's algorithm, but it's basically as |
25 | | // fast as the naive approach and typically winds up using less memory (since |
26 | | // it generates smaller NFAs) despite the presence of the cache. |
27 | | // |
28 | | // These maps effectively represent caching mechanisms for CState::Sparse and |
29 | | // CState::Range, respectively. The former represents a single NFA state with |
30 | | // many transitions of equivalent priority while the latter represents a single |
31 | | // NFA state with a single transition. (Neither state ever has or is an |
32 | | // epsilon transition.) Thus, they have different key types. It's likely we |
33 | | // could make one generic map, but the machinery didn't seem worth it. They |
34 | | // are simple enough. |
35 | | |
36 | | use nfa::{StateID, Transition}; |
37 | | |
38 | | // Basic FNV-1a hash constants as described in: |
39 | | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
40 | | const PRIME: u64 = 1099511628211; |
41 | | const INIT: u64 = 14695981039346656037; |
42 | | |
43 | | /// A bounded hash map where the key is a sequence of NFA transitions and the |
44 | | /// value is a pre-existing NFA state ID. |
45 | | /// |
46 | | /// std's hashmap can be used for this, however, this map has two important |
47 | | /// advantages. Firstly, it has lower overhead. Secondly, it permits us to |
48 | | /// control our memory usage by limited the number of slots. In general, the |
49 | | /// cost here is that this map acts as a cache. That is, inserting a new entry |
50 | | /// may remove an old entry. We are okay with this, since it does not impact |
51 | | /// correctness in the cases where it is used. The only effect that dropping |
52 | | /// states from the cache has is that the resulting NFA generated may be bigger |
53 | | /// than it otherwise would be. |
54 | | /// |
55 | | /// This improves benchmarks that compile large Unicode character classes, |
56 | | /// since it makes the generation of (almost) minimal UTF-8 automaton faster. |
57 | | /// Specifically, one could observe the difference with std's hashmap via |
58 | | /// something like the following benchmark: |
59 | | /// |
60 | | /// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'" |
61 | | /// |
62 | | /// But to observe that difference, you'd have to modify the code to use |
63 | | /// std's hashmap. |
64 | | /// |
65 | | /// It is quite possible that there is a better way to approach this problem. |
66 | | /// For example, if there happens to be a very common state that collides with |
67 | | /// a lot of less frequent states, then we could wind up with very poor caching |
68 | | /// behavior. Alas, the effectiveness of this cache has not been measured. |
69 | | /// Instead, ad hoc experiments suggest that it is "good enough." Additional |
70 | | /// smarts (such as an LRU eviction policy) have to be weighed against the |
71 | | /// amount of extra time they cost. |
72 | | #[derive(Clone, Debug)] |
73 | | pub struct Utf8BoundedMap { |
74 | | /// The current version of this map. Only entries with matching versions |
75 | | /// are considered during lookups. If an entry is found with a mismatched |
76 | | /// version, then the map behaves as if the entry does not exist. |
77 | | version: u16, |
78 | | /// The total number of entries this map can store. |
79 | | capacity: usize, |
80 | | /// The actual entries, keyed by hash. Collisions between different states |
81 | | /// result in the old state being dropped. |
82 | | map: Vec<Utf8BoundedEntry>, |
83 | | } |
84 | | |
85 | | /// An entry in this map. |
86 | | #[derive(Clone, Debug, Default)] |
87 | | struct Utf8BoundedEntry { |
88 | | /// The version of the map used to produce this entry. If this entry's |
89 | | /// version does not match the current version of the map, then the map |
90 | | /// should behave as if this entry does not exist. |
91 | | version: u16, |
92 | | /// The key, which is a sorted sequence of non-overlapping NFA transitions. |
93 | | key: Vec<Transition>, |
94 | | /// The state ID corresponding to the state containing the transitions in |
95 | | /// this entry. |
96 | | val: StateID, |
97 | | } |
98 | | |
99 | | impl Utf8BoundedMap { |
100 | | /// Create a new bounded map with the given capacity. The map will never |
101 | | /// grow beyond the given size. |
102 | | /// |
103 | | /// Note that this does not allocate. Instead, callers must call `clear` |
104 | | /// before using this map. `clear` will allocate space if necessary. |
105 | | /// |
106 | | /// This avoids the need to pay for the allocation of this map when |
107 | | /// compiling regexes that lack large Unicode character classes. |
108 | 0 | pub fn new(capacity: usize) -> Utf8BoundedMap { |
109 | 0 | assert!(capacity > 0); |
110 | 0 | Utf8BoundedMap { version: 0, capacity, map: vec![] } |
111 | 0 | } |
112 | | |
113 | | /// Clear this map of all entries, but permit the reuse of allocation |
114 | | /// if possible. |
115 | | /// |
116 | | /// This must be called before the map can be used. |
117 | 0 | pub fn clear(&mut self) { |
118 | 0 | if self.map.is_empty() { |
119 | 0 | self.map = vec![Utf8BoundedEntry::default(); self.capacity]; |
120 | 0 | } else { |
121 | 0 | self.version = self.version.wrapping_add(1); |
122 | 0 | if self.version == 0 { |
123 | 0 | self.map = vec![Utf8BoundedEntry::default(); self.capacity]; |
124 | 0 | } |
125 | | } |
126 | 0 | } |
127 | | |
128 | | /// Return a hash of the given transitions. |
129 | 0 | pub fn hash(&self, key: &[Transition]) -> usize { |
130 | 0 | let mut h = INIT; |
131 | 0 | for t in key { |
132 | 0 | h = (h ^ (t.start as u64)).wrapping_mul(PRIME); |
133 | 0 | h = (h ^ (t.end as u64)).wrapping_mul(PRIME); |
134 | 0 | h = (h ^ (t.next as u64)).wrapping_mul(PRIME); |
135 | 0 | } |
136 | 0 | (h as usize) % self.map.len() |
137 | 0 | } |
138 | | |
139 | | /// Retrieve the cached state ID corresponding to the given key. The hash |
140 | | /// given must have been computed with `hash` using the same key value. |
141 | | /// |
142 | | /// If there is no cached state with the given transitions, then None is |
143 | | /// returned. |
144 | 0 | pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> { |
145 | 0 | let entry = &self.map[hash]; |
146 | 0 | if entry.version != self.version { |
147 | 0 | return None; |
148 | 0 | } |
149 | 0 | // There may be a hash collision, so we need to confirm real equality. |
150 | 0 | if entry.key != key { |
151 | 0 | return None; |
152 | 0 | } |
153 | 0 | Some(entry.val) |
154 | 0 | } |
155 | | |
156 | | /// Add a cached state to this map with the given key. Callers should |
157 | | /// ensure that `state_id` points to a state that contains precisely the |
158 | | /// NFA transitions given. |
159 | | /// |
160 | | /// `hash` must have been computed using the `hash` method with the same |
161 | | /// key. |
162 | 0 | pub fn set( |
163 | 0 | &mut self, |
164 | 0 | key: Vec<Transition>, |
165 | 0 | hash: usize, |
166 | 0 | state_id: StateID, |
167 | 0 | ) { |
168 | 0 | self.map[hash] = |
169 | 0 | Utf8BoundedEntry { version: self.version, key, val: state_id }; |
170 | 0 | } |
171 | | } |
172 | | |
173 | | /// A cache of suffixes used to modestly compress UTF-8 automata for large |
174 | | /// Unicode character classes. |
175 | | #[derive(Clone, Debug)] |
176 | | pub struct Utf8SuffixMap { |
177 | | /// The current version of this map. Only entries with matching versions |
178 | | /// are considered during lookups. If an entry is found with a mismatched |
179 | | /// version, then the map behaves as if the entry does not exist. |
180 | | version: u16, |
181 | | /// The total number of entries this map can store. |
182 | | capacity: usize, |
183 | | /// The actual entries, keyed by hash. Collisions between different states |
184 | | /// result in the old state being dropped. |
185 | | map: Vec<Utf8SuffixEntry>, |
186 | | } |
187 | | |
188 | | /// A key that uniquely identifies an NFA state. It is a triple that represents |
189 | | /// a transition from one state for a particular byte range. |
190 | | #[derive(Clone, Debug, Default, Eq, PartialEq)] |
191 | | pub struct Utf8SuffixKey { |
192 | | pub from: StateID, |
193 | | pub start: u8, |
194 | | pub end: u8, |
195 | | } |
196 | | |
197 | | /// An entry in this map. |
198 | | #[derive(Clone, Debug, Default)] |
199 | | struct Utf8SuffixEntry { |
200 | | /// The version of the map used to produce this entry. If this entry's |
201 | | /// version does not match the current version of the map, then the map |
202 | | /// should behave as if this entry does not exist. |
203 | | version: u16, |
204 | | /// The key, which consists of a transition in a particular state. |
205 | | key: Utf8SuffixKey, |
206 | | /// The identifier that the transition in the key maps to. |
207 | | val: StateID, |
208 | | } |
209 | | |
210 | | impl Utf8SuffixMap { |
211 | | /// Create a new bounded map with the given capacity. The map will never |
212 | | /// grow beyond the given size. |
213 | | /// |
214 | | /// Note that this does not allocate. Instead, callers must call `clear` |
215 | | /// before using this map. `clear` will allocate space if necessary. |
216 | | /// |
217 | | /// This avoids the need to pay for the allocation of this map when |
218 | | /// compiling regexes that lack large Unicode character classes. |
219 | 0 | pub fn new(capacity: usize) -> Utf8SuffixMap { |
220 | 0 | assert!(capacity > 0); |
221 | 0 | Utf8SuffixMap { version: 0, capacity, map: vec![] } |
222 | 0 | } |
223 | | |
224 | | /// Clear this map of all entries, but permit the reuse of allocation |
225 | | /// if possible. |
226 | | /// |
227 | | /// This must be called before the map can be used. |
228 | 0 | pub fn clear(&mut self) { |
229 | 0 | if self.map.is_empty() { |
230 | 0 | self.map = vec![Utf8SuffixEntry::default(); self.capacity]; |
231 | 0 | } else { |
232 | 0 | self.version = self.version.wrapping_add(1); |
233 | 0 | if self.version == 0 { |
234 | 0 | self.map = vec![Utf8SuffixEntry::default(); self.capacity]; |
235 | 0 | } |
236 | | } |
237 | 0 | } |
238 | | |
239 | | /// Return a hash of the given transition. |
240 | 0 | pub fn hash(&self, key: &Utf8SuffixKey) -> usize { |
241 | | // Basic FNV-1a hash as described: |
242 | | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
243 | | const PRIME: u64 = 1099511628211; |
244 | | const INIT: u64 = 14695981039346656037; |
245 | | |
246 | 0 | let mut h = INIT; |
247 | 0 | h = (h ^ (key.from as u64)).wrapping_mul(PRIME); |
248 | 0 | h = (h ^ (key.start as u64)).wrapping_mul(PRIME); |
249 | 0 | h = (h ^ (key.end as u64)).wrapping_mul(PRIME); |
250 | 0 | (h as usize) % self.map.len() |
251 | 0 | } |
252 | | |
253 | | /// Retrieve the cached state ID corresponding to the given key. The hash |
254 | | /// given must have been computed with `hash` using the same key value. |
255 | | /// |
256 | | /// If there is no cached state with the given key, then None is returned. |
257 | 0 | pub fn get( |
258 | 0 | &mut self, |
259 | 0 | key: &Utf8SuffixKey, |
260 | 0 | hash: usize, |
261 | 0 | ) -> Option<StateID> { |
262 | 0 | let entry = &self.map[hash]; |
263 | 0 | if entry.version != self.version { |
264 | 0 | return None; |
265 | 0 | } |
266 | 0 | if key != &entry.key { |
267 | 0 | return None; |
268 | 0 | } |
269 | 0 | Some(entry.val) |
270 | 0 | } |
271 | | |
272 | | /// Add a cached state to this map with the given key. Callers should |
273 | | /// ensure that `state_id` points to a state that contains precisely the |
274 | | /// NFA transition given. |
275 | | /// |
276 | | /// `hash` must have been computed using the `hash` method with the same |
277 | | /// key. |
278 | 0 | pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { |
279 | 0 | self.map[hash] = |
280 | 0 | Utf8SuffixEntry { version: self.version, key, val: state_id }; |
281 | 0 | } |
282 | | } |