/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.1.10/src/nfa/map.rs

Source (jump to first uncovered line)
// This module contains a couple simple and purpose built hash maps. The key
// trade off they make is that they serve as caches rather than true maps. That
// is, inserting a new entry may cause eviction of another entry. This gives
// us two things. First, there's less overhead associated with inserts and
// lookups. Secondly, it lets us control our memory usage.
//
// These maps are used in some fairly hot code when generating NFA states for
// large Unicode character classes.
//
// Instead of exposing a rich hashmap entry API, we just permit the caller
// to produce a hash of the key directly. The hash can then be reused for both
// lookups and insertions at the cost of leaking things a bit. But these are
// for internal use only, so it's fine.
//
// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
// (almost) minimal DFA for large Unicode character classes in linear time.
// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
// since there's a bit more expense in the reverse direction.)
//
// The Utf8SuffixMap is used when compiling large Unicode character classes for
// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
// construction of UTF-8 automata by caching common suffixes. This doesn't
// get the same space savings as Daciuk's algorithm, but it's basically as
// fast as the naive approach and typically winds up using less memory (since
// it generates smaller NFAs) despite the presence of the cache.
//
// These maps effectively represent caching mechanisms for CState::Sparse and
// CState::Range, respectively. The former represents a single NFA state with
// many transitions of equivalent priority while the latter represents a single
// NFA state with a single transition. (Neither state ever has or is an
// epsilon transition.) Thus, they have different key types. It's likely we
// could make one generic map, but the machinery didn't seem worth it. They
// are simple enough.

use nfa::{StateID, Transition};

// Basic FNV-1a hash constants as described in:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
const PRIME: u64 = 1099511628211;
const INIT: u64 = 14695981039346656037;

/// A bounded hash map where the key is a sequence of NFA transitions and the
/// value is a pre-existing NFA state ID.
///
/// std's hashmap can be used for this, however, this map has two important
/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
/// control our memory usage by limited the number of slots. In general, the
/// cost here is that this map acts as a cache. That is, inserting a new entry
/// may remove an old entry. We are okay with this, since it does not impact
/// correctness in the cases where it is used. The only effect that dropping
/// states from the cache has is that the resulting NFA generated may be bigger
/// than it otherwise would be.
///
/// This improves benchmarks that compile large Unicode character classes,
/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
///   hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
///
/// It is quite possible that there is a better way to approach this problem.
/// For example, if there happens to be a very common state that collides with
/// a lot of less frequent states, then we could wind up with very poor caching
/// behavior. Alas, the effectiveness of this cache has not been measured.
/// Instead, ad hoc experiments suggest that it is "good enough." Additional
/// smarts (such as an LRU eviction policy) have to be weighed against the
/// amount of extra time they cost.
#[derive(Clone, Debug)]
pub struct Utf8BoundedMap {
    /// The current version of this map. Only entries with matching versions
    /// are considered during lookups. If an entry is found with a mismatched
    /// version, then the map behaves as if the entry does not exist.
    version: u16,
    /// The total number of entries this map can store.
    capacity: usize,
    /// The actual entries, keyed by hash. Collisions between different states
    /// result in the old state being dropped.
    map: Vec<Utf8BoundedEntry>,
}

/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8BoundedEntry {
    /// The version of the map used to produce this entry. If this entry's
    /// version does not match the current version of the map, then the map
    /// should behave as if this entry does not exist.
    version: u16,
    /// The key, which is a sorted sequence of non-overlapping NFA transitions.
    key: Vec<Transition>,
    /// The state ID corresponding to the state containing the transitions in
    /// this entry.
    val: StateID,
}

impl Utf8BoundedMap {
    /// Create a new bounded map with the given capacity. The map will never
    /// grow beyond the given size.
    ///
    /// Note that this does not allocate. Instead, callers must call `clear`
    /// before using this map. `clear` will allocate space if necessary.
    ///
    /// This avoids the need to pay for the allocation of this map when
    /// compiling regexes that lack large Unicode character classes.
    pub fn new(capacity: usize) -> Utf8BoundedMap {
        assert!(capacity > 0);
        Utf8BoundedMap { version: 0, capacity, map: vec![] }
    }

    /// Clear this map of all entries, but permit the reuse of allocation
    /// if possible.
    ///
    /// This must be called before the map can be used.
    pub fn clear(&mut self) {
        if self.map.is_empty() {
            self.map = vec![Utf8BoundedEntry::default(); self.capacity];
        } else {
            self.version = self.version.wrapping_add(1);
            if self.version == 0 {
                self.map = vec![Utf8BoundedEntry::default(); self.capacity];
            }
        }
    }

    /// Return a hash of the given transitions.
    pub fn hash(&self, key: &[Transition]) -> usize {
        let mut h = INIT;
        for t in key {
            h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
            h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
            h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
        }
        (h as usize) % self.map.len()
    }

    /// Retrieve the cached state ID corresponding to the given key. The hash
    /// given must have been computed with `hash` using the same key value.
    ///
    /// If there is no cached state with the given transitions, then None is
    /// returned.
    pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
        let entry = &self.map[hash];
        if entry.version != self.version {
            return None;
        }
        // There may be a hash collision, so we need to confirm real equality.
        if entry.key != key {
            return None;
        }
        Some(entry.val)
    }

    /// Add a cached state to this map with the given key. Callers should
    /// ensure that `state_id` points to a state that contains precisely the
    /// NFA transitions given.
    ///
    /// `hash` must have been computed using the `hash` method with the same
    /// key.
    pub fn set(
        &mut self,
        key: Vec<Transition>,
        hash: usize,
        state_id: StateID,
    ) {
        self.map[hash] =
            Utf8BoundedEntry { version: self.version, key, val: state_id };
    }
}

/// A cache of suffixes used to modestly compress UTF-8 automata for large
/// Unicode character classes.
#[derive(Clone, Debug)]
pub struct Utf8SuffixMap {
    /// The current version of this map. Only entries with matching versions
    /// are considered during lookups. If an entry is found with a mismatched
    /// version, then the map behaves as if the entry does not exist.
    version: u16,
    /// The total number of entries this map can store.
    capacity: usize,
    /// The actual entries, keyed by hash. Collisions between different states
    /// result in the old state being dropped.
    map: Vec<Utf8SuffixEntry>,
}

/// A key that uniquely identifies an NFA state. It is a triple that represents
/// a transition from one state for a particular byte range.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Utf8SuffixKey {
    pub from: StateID,
    pub start: u8,
    pub end: u8,
}

/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8SuffixEntry {
    /// The version of the map used to produce this entry. If this entry's
    /// version does not match the current version of the map, then the map
    /// should behave as if this entry does not exist.
    version: u16,
    /// The key, which consists of a transition in a particular state.
    key: Utf8SuffixKey,
    /// The identifier that the transition in the key maps to.
    val: StateID,
}

impl Utf8SuffixMap {
    /// Create a new bounded map with the given capacity. The map will never
    /// grow beyond the given size.
    ///
    /// Note that this does not allocate. Instead, callers must call `clear`
    /// before using this map. `clear` will allocate space if necessary.
    ///
    /// This avoids the need to pay for the allocation of this map when
    /// compiling regexes that lack large Unicode character classes.
    pub fn new(capacity: usize) -> Utf8SuffixMap {
        assert!(capacity > 0);
        Utf8SuffixMap { version: 0, capacity, map: vec![] }
    }

    /// Clear this map of all entries, but permit the reuse of allocation
    /// if possible.
    ///
    /// This must be called before the map can be used.
    pub fn clear(&mut self) {
        if self.map.is_empty() {
            self.map = vec![Utf8SuffixEntry::default(); self.capacity];
        } else {
            self.version = self.version.wrapping_add(1);
            if self.version == 0 {
                self.map = vec![Utf8SuffixEntry::default(); self.capacity];
            }
        }
    }

    /// Return a hash of the given transition.
    pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
        // Basic FNV-1a hash as described:
        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
        const PRIME: u64 = 1099511628211;
        const INIT: u64 = 14695981039346656037;

        let mut h = INIT;
        h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
        h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
        h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
        (h as usize) % self.map.len()
    }

    /// Retrieve the cached state ID corresponding to the given key. The hash
    /// given must have been computed with `hash` using the same key value.
    ///
    /// If there is no cached state with the given key, then None is returned.
    pub fn get(
        &mut self,
        key: &Utf8SuffixKey,
        hash: usize,
    ) -> Option<StateID> {
        let entry = &self.map[hash];
        if entry.version != self.version {
            return None;
        }
        if key != &entry.key {
            return None;
        }
        Some(entry.val)
    }

    /// Add a cached state to this map with the given key. Callers should
    /// ensure that `state_id` points to a state that contains precisely the
    /// NFA transition given.
    ///
    /// `hash` must have been computed using the `hash` method with the same
    /// key.
    pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
        self.map[hash] =
            Utf8SuffixEntry { version: self.version, key, val: state_id };
    }
}

Coverage Report

Created: 2024-12-17 06:15

Line	Count	Source (jump to first uncovered line)
1		// This module contains a couple simple and purpose built hash maps. The key
2		// trade off they make is that they serve as caches rather than true maps. That
3		// is, inserting a new entry may cause eviction of another entry. This gives
4		// us two things. First, there's less overhead associated with inserts and
5		// lookups. Secondly, it lets us control our memory usage.
6		//
7		// These maps are used in some fairly hot code when generating NFA states for
8		// large Unicode character classes.
9		//
10		// Instead of exposing a rich hashmap entry API, we just permit the caller
11		// to produce a hash of the key directly. The hash can then be reused for both
12		// lookups and insertions at the cost of leaking things a bit. But these are
13		// for internal use only, so it's fine.
14		//
15		// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
16		// (almost) minimal DFA for large Unicode character classes in linear time.
17		// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
18		// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
19		// since there's a bit more expense in the reverse direction.)
20		//
21		// The Utf8SuffixMap is used when compiling large Unicode character classes for
22		// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
23		// construction of UTF-8 automata by caching common suffixes. This doesn't
24		// get the same space savings as Daciuk's algorithm, but it's basically as
25		// fast as the naive approach and typically winds up using less memory (since
26		// it generates smaller NFAs) despite the presence of the cache.
27		//
28		// These maps effectively represent caching mechanisms for CState::Sparse and
29		// CState::Range, respectively. The former represents a single NFA state with
30		// many transitions of equivalent priority while the latter represents a single
31		// NFA state with a single transition. (Neither state ever has or is an
32		// epsilon transition.) Thus, they have different key types. It's likely we
33		// could make one generic map, but the machinery didn't seem worth it. They
34		// are simple enough.
35
36		use nfa::{StateID, Transition};
37
38		// Basic FNV-1a hash constants as described in:
39		// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
40		const PRIME: u64 = 1099511628211;
41		const INIT: u64 = 14695981039346656037;
42
43		/// A bounded hash map where the key is a sequence of NFA transitions and the
44		/// value is a pre-existing NFA state ID.
45		///
46		/// std's hashmap can be used for this, however, this map has two important
47		/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
48		/// control our memory usage by limited the number of slots. In general, the
49		/// cost here is that this map acts as a cache. That is, inserting a new entry
50		/// may remove an old entry. We are okay with this, since it does not impact
51		/// correctness in the cases where it is used. The only effect that dropping
52		/// states from the cache has is that the resulting NFA generated may be bigger
53		/// than it otherwise would be.
54		///
55		/// This improves benchmarks that compile large Unicode character classes,
56		/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
57		/// Specifically, one could observe the difference with std's hashmap via
58		/// something like the following benchmark:
59		///
60		/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
61		///
62		/// But to observe that difference, you'd have to modify the code to use
63		/// std's hashmap.
64		///
65		/// It is quite possible that there is a better way to approach this problem.
66		/// For example, if there happens to be a very common state that collides with
67		/// a lot of less frequent states, then we could wind up with very poor caching
68		/// behavior. Alas, the effectiveness of this cache has not been measured.
69		/// Instead, ad hoc experiments suggest that it is "good enough." Additional
70		/// smarts (such as an LRU eviction policy) have to be weighed against the
71		/// amount of extra time they cost.
72		#[derive(Clone, Debug)]
73		pub struct Utf8BoundedMap {
74		/// The current version of this map. Only entries with matching versions
75		/// are considered during lookups. If an entry is found with a mismatched
76		/// version, then the map behaves as if the entry does not exist.
77		version: u16,
78		/// The total number of entries this map can store.
79		capacity: usize,
80		/// The actual entries, keyed by hash. Collisions between different states
81		/// result in the old state being dropped.
82		map: Vec<Utf8BoundedEntry>,
83		}
84
85		/// An entry in this map.
86		#[derive(Clone, Debug, Default)]
87		struct Utf8BoundedEntry {
88		/// The version of the map used to produce this entry. If this entry's
89		/// version does not match the current version of the map, then the map
90		/// should behave as if this entry does not exist.
91		version: u16,
92		/// The key, which is a sorted sequence of non-overlapping NFA transitions.
93		key: Vec<Transition>,
94		/// The state ID corresponding to the state containing the transitions in
95		/// this entry.
96		val: StateID,
97		}
98
99		impl Utf8BoundedMap {
100		/// Create a new bounded map with the given capacity. The map will never
101		/// grow beyond the given size.
102		///
103		/// Note that this does not allocate. Instead, callers must call `clear`
104		/// before using this map. `clear` will allocate space if necessary.
105		///
106		/// This avoids the need to pay for the allocation of this map when
107		/// compiling regexes that lack large Unicode character classes.
108	0	pub fn new(capacity: usize) -> Utf8BoundedMap {
109	0	assert!(capacity > 0);
110	0	Utf8BoundedMap { version: 0, capacity, map: vec![] }
111	0	}
112
113		/// Clear this map of all entries, but permit the reuse of allocation
114		/// if possible.
115		///
116		/// This must be called before the map can be used.
117	0	pub fn clear(&mut self) {
118	0	if self.map.is_empty() {
119	0	self.map = vec![Utf8BoundedEntry::default(); self.capacity];
120	0	} else {
121	0	self.version = self.version.wrapping_add(1);
122	0	if self.version == 0 {
123	0	self.map = vec![Utf8BoundedEntry::default(); self.capacity];
124	0	}
125		}
126	0	}
127
128		/// Return a hash of the given transitions.
129	0	pub fn hash(&self, key: &[Transition]) -> usize {
130	0	let mut h = INIT;
131	0	for t in key {
132	0	h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
133	0	h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
134	0	h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
135	0	}
136	0	(h as usize) % self.map.len()
137	0	}
138
139		/// Retrieve the cached state ID corresponding to the given key. The hash
140		/// given must have been computed with `hash` using the same key value.
141		///
142		/// If there is no cached state with the given transitions, then None is
143		/// returned.
144	0	pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
145	0	let entry = &self.map[hash];
146	0	if entry.version != self.version {
147	0	return None;
148	0	}
149	0	// There may be a hash collision, so we need to confirm real equality.
150	0	if entry.key != key {
151	0	return None;
152	0	}
153	0	Some(entry.val)
154	0	}
155
156		/// Add a cached state to this map with the given key. Callers should
157		/// ensure that `state_id` points to a state that contains precisely the
158		/// NFA transitions given.
159		///
160		/// `hash` must have been computed using the `hash` method with the same
161		/// key.
162	0	pub fn set(
163	0	&mut self,
164	0	key: Vec<Transition>,
165	0	hash: usize,
166	0	state_id: StateID,
167	0	) {
168	0	self.map[hash] =
169	0	Utf8BoundedEntry { version: self.version, key, val: state_id };
170	0	}
171		}
172
173		/// A cache of suffixes used to modestly compress UTF-8 automata for large
174		/// Unicode character classes.
175		#[derive(Clone, Debug)]
176		pub struct Utf8SuffixMap {
177		/// The current version of this map. Only entries with matching versions
178		/// are considered during lookups. If an entry is found with a mismatched
179		/// version, then the map behaves as if the entry does not exist.
180		version: u16,
181		/// The total number of entries this map can store.
182		capacity: usize,
183		/// The actual entries, keyed by hash. Collisions between different states
184		/// result in the old state being dropped.
185		map: Vec<Utf8SuffixEntry>,
186		}
187
188		/// A key that uniquely identifies an NFA state. It is a triple that represents
189		/// a transition from one state for a particular byte range.
190		#[derive(Clone, Debug, Default, Eq, PartialEq)]
191		pub struct Utf8SuffixKey {
192		pub from: StateID,
193		pub start: u8,
194		pub end: u8,
195		}
196
197		/// An entry in this map.
198		#[derive(Clone, Debug, Default)]
199		struct Utf8SuffixEntry {
200		/// The version of the map used to produce this entry. If this entry's
201		/// version does not match the current version of the map, then the map
202		/// should behave as if this entry does not exist.
203		version: u16,
204		/// The key, which consists of a transition in a particular state.
205		key: Utf8SuffixKey,
206		/// The identifier that the transition in the key maps to.
207		val: StateID,
208		}
209
210		impl Utf8SuffixMap {
211		/// Create a new bounded map with the given capacity. The map will never
212		/// grow beyond the given size.
213		///
214		/// Note that this does not allocate. Instead, callers must call `clear`
215		/// before using this map. `clear` will allocate space if necessary.
216		///
217		/// This avoids the need to pay for the allocation of this map when
218		/// compiling regexes that lack large Unicode character classes.
219	0	pub fn new(capacity: usize) -> Utf8SuffixMap {
220	0	assert!(capacity > 0);
221	0	Utf8SuffixMap { version: 0, capacity, map: vec![] }
222	0	}
223
224		/// Clear this map of all entries, but permit the reuse of allocation
225		/// if possible.
226		///
227		/// This must be called before the map can be used.
228	0	pub fn clear(&mut self) {
229	0	if self.map.is_empty() {
230	0	self.map = vec![Utf8SuffixEntry::default(); self.capacity];
231	0	} else {
232	0	self.version = self.version.wrapping_add(1);
233	0	if self.version == 0 {
234	0	self.map = vec![Utf8SuffixEntry::default(); self.capacity];
235	0	}
236		}
237	0	}
238
239		/// Return a hash of the given transition.
240	0	pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
241		// Basic FNV-1a hash as described:
242		// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
243		const PRIME: u64 = 1099511628211;
244		const INIT: u64 = 14695981039346656037;
245
246	0	let mut h = INIT;
247	0	h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
248	0	h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
249	0	h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
250	0	(h as usize) % self.map.len()
251	0	}
252
253		/// Retrieve the cached state ID corresponding to the given key. The hash
254		/// given must have been computed with `hash` using the same key value.
255		///
256		/// If there is no cached state with the given key, then None is returned.
257	0	pub fn get(
258	0	&mut self,
259	0	key: &Utf8SuffixKey,
260	0	hash: usize,
261	0	) -> Option<StateID> {
262	0	let entry = &self.map[hash];
263	0	if entry.version != self.version {
264	0	return None;
265	0	}
266	0	if key != &entry.key {
267	0	return None;
268	0	}
269	0	Some(entry.val)
270	0	}
271
272		/// Add a cached state to this map with the given key. Callers should
273		/// ensure that `state_id` points to a state that contains precisely the
274		/// NFA transition given.
275		///
276		/// `hash` must have been computed using the `hash` method with the same
277		/// key.
278	0	pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
279	0	self.map[hash] =
280	0	Utf8SuffixEntry { version: self.version, key, val: state_id };
281	0	}
282		}