/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.2.0/src/util/mod.rs

Source
/*!
TODO
*/

use core::{ascii, fmt, str};

#[cfg(feature = "alloc")]
use alloc::vec::Vec;

pub mod alphabet;
pub(crate) mod bytes;
#[cfg(feature = "alloc")]
pub(crate) mod determinize;
pub mod id;
#[cfg(feature = "alloc")]
pub(crate) mod lazy;
pub(crate) mod matchtypes;
pub mod prefilter;
#[cfg(feature = "alloc")]
pub(crate) mod sparse_set;
pub(crate) mod start;
#[cfg(feature = "alloc")]
pub(crate) mod syntax;

/// The offset, in bytes, that a match is delayed by in the DFAs generated by
/// this crate. (This includes lazy DFAs.)
///
/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
/// and $. In particular, both of these operators may require the
/// identification of the end of input in order to confirm a match. Not only
/// does this mean that all matches must therefore be delayed by a single byte,
/// but that a special EOI value is added to the alphabet of all DFAs. (Which
/// means that even though the alphabet of a DFA is typically all byte values,
/// the actual maximum alphabet size is 257 due to the extra EOI value.)
///
/// Since we delay matches by only 1 byte, this can't fully support a
/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
/// DFAs in this crate do not support it. (It's not as simple as just
/// increasing the match offset to do it---otherwise we would---but building
/// the full Unicode-aware word boundary detection into an automaton is quite
/// tricky.)
pub(crate) const MATCH_OFFSET: usize = 1;

/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
pub(crate) struct DebugByte(pub u8);

impl fmt::Debug for DebugByte {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // 10 bytes is enough to cover any output from ascii::escape_default.
        let mut bytes = [0u8; 10];
        let mut len = 0;
        for (i, mut b) in ascii::escape_default(self.0).enumerate() {
            // capitalize \xab to \xAB
            if i >= 2 && b'a' <= b && b <= b'f' {
                b -= 32;
            }
            bytes[len] = b;
            len += 1;
        }
        write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
    }
}

/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
/// value is guaranteed to be greater than `i`.
///
/// Generally speaking, this should only be called on `text` when it is
/// permitted to assume that it is valid UTF-8 and where either `i >=
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
#[inline(always)]
pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
    let b = match text.get(i) {
        None => return i.checked_add(1).unwrap(),
        Some(&b) => b,
    };
    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
    // other than just start at the next byte.
    let inc = utf8_len(b).unwrap_or(1);
    i.checked_add(inc).unwrap()
}

/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[inline(always)]
pub(crate) fn is_word_byte(b: u8) -> bool {
    match b {
        b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
        _ => false,
    }
}

/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
#[inline(always)]
pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let len = match utf8_len(bytes[0]) {
        None => return Some(Err(bytes[0])),
        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
        Some(1) => return Some(Ok(bytes[0] as char)),
        Some(len) => len,
    };
    match str::from_utf8(&bytes[..len]) {
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
        Err(_) => Some(Err(bytes[0])),
    }
}

/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the end of the given byte
/// slice, then the last byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
#[inline(always)]
pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let mut start = bytes.len() - 1;
    let limit = bytes.len().saturating_sub(4);
    while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
        start -= 1;
    }
    match decode_utf8(&bytes[start..]) {
        None => None,
        Some(Ok(ch)) => Some(Ok(ch)),
        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
    }
}

/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
#[inline(always)]
fn utf8_len(byte: u8) -> Option<usize> {
    if byte <= 0x7F {
        return Some(1);
    } else if byte & 0b1100_0000 == 0b1000_0000 {
        return None;
    } else if byte <= 0b1101_1111 {
        Some(2)
    } else if byte <= 0b1110_1111 {
        Some(3)
    } else if byte <= 0b1111_0111 {
        Some(4)
    } else {
        None
    }
}

/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
#[inline(always)]
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    (b & 0b1100_0000) != 0b1000_0000
}

#[cfg(feature = "alloc")]
#[inline(always)]
pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
    use core::{ptr, sync::atomic::AtomicPtr};

    use crate::{
        dfa::{
            dense::{self, DFA},
            Automaton,
        },
        util::lazy,
    };

    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());

    let dfa = lazy::get_or_init(&WORD, || {
        // TODO: Should we use a lazy DFA here instead? It does complicate
        // things somewhat, since we then need a mutable cache, which probably
        // means a thread local.
        dense::Builder::new()
            .configure(dense::Config::new().anchored(true))
            .build(r"\w")
            .unwrap()
    });
    // This is OK since '\w' contains no look-around.
    let mut sid = dfa.universal_start_state();
    while at < bytes.len() {
        let byte = bytes[at];
        sid = dfa.next_state(sid, byte);
        at += 1;
        if dfa.is_special_state(sid) {
            if dfa.is_match_state(sid) {
                return true;
            } else if dfa.is_dead_state(sid) {
                return false;
            }
        }
    }
    dfa.is_match_state(dfa.next_eoi_state(sid))
}

#[cfg(feature = "alloc")]
#[inline(always)]
pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
    use core::{ptr, sync::atomic::AtomicPtr};

    use crate::{
        dfa::{
            dense::{self, DFA},
            Automaton,
        },
        nfa::thompson::NFA,
    };

    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());

    let dfa = lazy::get_or_init(&WORD, || {
        dense::Builder::new()
            .configure(dense::Config::new().anchored(true))
            .thompson(NFA::config().reverse(true).shrink(true))
            .build(r"\w")
            .unwrap()
    });

    // This is OK since '\w' contains no look-around.
    let mut sid = dfa.universal_start_state();
    while at > 0 {
        at -= 1;
        let byte = bytes[at];
        sid = dfa.next_state(sid, byte);
        if dfa.is_special_state(sid) {
            if dfa.is_match_state(sid) {
                return true;
            } else if dfa.is_dead_state(sid) {
                return false;
            }
        }
    }
    dfa.is_match_state(dfa.next_eoi_state(sid))
}

Coverage Report

Created: 2025-11-11 06:52

Line	Count	Source
1		/*!
2		TODO
3		*/
4
5		use core::{ascii, fmt, str};
6
7		#[cfg(feature = "alloc")]
8		use alloc::vec::Vec;
9
10		pub mod alphabet;
11		pub(crate) mod bytes;
12		#[cfg(feature = "alloc")]
13		pub(crate) mod determinize;
14		pub mod id;
15		#[cfg(feature = "alloc")]
16		pub(crate) mod lazy;
17		pub(crate) mod matchtypes;
18		pub mod prefilter;
19		#[cfg(feature = "alloc")]
20		pub(crate) mod sparse_set;
21		pub(crate) mod start;
22		#[cfg(feature = "alloc")]
23		pub(crate) mod syntax;
24
25		/// The offset, in bytes, that a match is delayed by in the DFAs generated by
26		/// this crate. (This includes lazy DFAs.)
27		///
28		/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
29		/// and $. In particular, both of these operators may require the
30		/// identification of the end of input in order to confirm a match. Not only
31		/// does this mean that all matches must therefore be delayed by a single byte,
32		/// but that a special EOI value is added to the alphabet of all DFAs. (Which
33		/// means that even though the alphabet of a DFA is typically all byte values,
34		/// the actual maximum alphabet size is 257 due to the extra EOI value.)
35		///
36		/// Since we delay matches by only 1 byte, this can't fully support a
37		/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
38		/// DFAs in this crate do not support it. (It's not as simple as just
39		/// increasing the match offset to do it---otherwise we would---but building
40		/// the full Unicode-aware word boundary detection into an automaton is quite
41		/// tricky.)
42		pub(crate) const MATCH_OFFSET: usize = 1;
43
44		/// A type that wraps a single byte with a convenient fmt::Debug impl that
45		/// escapes the byte.
46		pub(crate) struct DebugByte(pub u8);
47
48		impl fmt::Debug for DebugByte {
49	0	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
50		// 10 bytes is enough to cover any output from ascii::escape_default.
51	0	let mut bytes = [0u8; 10];
52	0	let mut len = 0;
53	0	for (i, mut b) in ascii::escape_default(self.0).enumerate() {
54		// capitalize \xab to \xAB
55	0	if i >= 2 && b'a' <= b && b <= b'f' {
56	0	b -= 32;
57	0	}
58	0	bytes[len] = b;
59	0	len += 1;
60		}
61	0	write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
62	0	}
63		}
64
65		/// Returns the smallest possible index of the next valid UTF-8 sequence
66		/// starting after `i`.
67		///
68		/// For all inputs, including invalid UTF-8 and any value of `i`, the return
69		/// value is guaranteed to be greater than `i`.
70		///
71		/// Generally speaking, this should only be called on `text` when it is
72		/// permitted to assume that it is valid UTF-8 and where either `i >=
73		/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
74		#[inline(always)]
75	0	pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
76	0	let b = match text.get(i) {
77	0	None => return i.checked_add(1).unwrap(),
78	0	Some(&b) => b,
79		};
80		// For cases where we see an invalid UTF-8 byte, there isn't much we can do
81		// other than just start at the next byte.
82	0	let inc = utf8_len(b).unwrap_or(1);
83	0	i.checked_add(inc).unwrap()
84	0	}
85
86		/// Returns true if and only if the given byte is considered a word character.
87		/// This only applies to ASCII.
88		///
89		/// This was copied from regex-syntax so that we can use it to determine the
90		/// starting DFA state while searching without depending on regex-syntax. The
91		/// definition is never going to change, so there's no maintenance/bit-rot
92		/// hazard here.
93		#[inline(always)]
94	0	pub(crate) fn is_word_byte(b: u8) -> bool {
95	0	match b {
96	0	b'_' \| b'0'..=b'9' \| b'a'..=b'z' \| b'A'..=b'Z' => true,
97	0	_ => false,
98		}
99	0	}
100
101		/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
102		///
103		/// If no valid encoding of a codepoint exists at the beginning of the given
104		/// byte slice, then the first byte is returned instead.
105		///
106		/// This returns `None` if and only if `bytes` is empty.
107		#[inline(always)]
108	0	pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
109	0	if bytes.is_empty() {
110	0	return None;
111	0	}
112	0	let len = match utf8_len(bytes[0]) {
113	0	None => return Some(Err(bytes[0])),
114	0	Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
115	0	Some(1) => return Some(Ok(bytes[0] as char)),
116	0	Some(len) => len,
117		};
118	0	match str::from_utf8(&bytes[..len]) {
119	0	Ok(s) => Some(Ok(s.chars().next().unwrap())),
120	0	Err(_) => Some(Err(bytes[0])),
121		}
122	0	}
123
124		/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
125		///
126		/// If no valid encoding of a codepoint exists at the end of the given byte
127		/// slice, then the last byte is returned instead.
128		///
129		/// This returns `None` if and only if `bytes` is empty.
130		#[inline(always)]
131	0	pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
132	0	if bytes.is_empty() {
133	0	return None;
134	0	}
135	0	let mut start = bytes.len() - 1;
136	0	let limit = bytes.len().saturating_sub(4);
137	0	while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
138	0	start -= 1;
139	0	}
140	0	match decode_utf8(&bytes[start..]) {
141	0	None => None,
142	0	Some(Ok(ch)) => Some(Ok(ch)),
143	0	Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
144		}
145	0	}
146
147		/// Given a UTF-8 leading byte, this returns the total number of code units
148		/// in the following encoded codepoint.
149		///
150		/// If the given byte is not a valid UTF-8 leading byte, then this returns
151		/// `None`.
152		#[inline(always)]
153	0	fn utf8_len(byte: u8) -> Option<usize> {
154	0	if byte <= 0x7F {
155	0	return Some(1);
156	0	} else if byte & 0b1100_0000 == 0b1000_0000 {
157	0	return None;
158	0	} else if byte <= 0b1101_1111 {
159	0	Some(2)
160	0	} else if byte <= 0b1110_1111 {
161	0	Some(3)
162	0	} else if byte <= 0b1111_0111 {
163	0	Some(4)
164		} else {
165	0	None
166		}
167	0	}
168
169		/// Returns true if and only if the given byte is either a valid leading UTF-8
170		/// byte, or is otherwise an invalid byte that can never appear anywhere in a
171		/// valid UTF-8 sequence.
172		#[inline(always)]
173	0	fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
174		// In the ASCII case, the most significant bit is never set. The leading
175		// byte of a 2/3/4-byte sequence always has the top two most significant
176		// bits set. For bytes that can never appear anywhere in valid UTF-8, this
177		// also returns true, since every such byte has its two most significant
178		// bits set:
179		//
180		// \xC0 :: 11000000
181		// \xC1 :: 11000001
182		// \xF5 :: 11110101
183		// \xF6 :: 11110110
184		// \xF7 :: 11110111
185		// \xF8 :: 11111000
186		// \xF9 :: 11111001
187		// \xFA :: 11111010
188		// \xFB :: 11111011
189		// \xFC :: 11111100
190		// \xFD :: 11111101
191		// \xFE :: 11111110
192		// \xFF :: 11111111
193	0	(b & 0b1100_0000) != 0b1000_0000
194	0	}
195
196		#[cfg(feature = "alloc")]
197		#[inline(always)]
198		pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
199		use core::{ptr, sync::atomic::AtomicPtr};
200
201		use crate::{
202		dfa::{
203		dense::{self, DFA},
204		Automaton,
205		},
206		util::lazy,
207		};
208
209		static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
210
211		let dfa = lazy::get_or_init(&WORD, \|\| {
212		// TODO: Should we use a lazy DFA here instead? It does complicate
213		// things somewhat, since we then need a mutable cache, which probably
214		// means a thread local.
215		dense::Builder::new()
216		.configure(dense::Config::new().anchored(true))
217		.build(r"\w")
218		.unwrap()
219		});
220		// This is OK since '\w' contains no look-around.
221		let mut sid = dfa.universal_start_state();
222		while at < bytes.len() {
223		let byte = bytes[at];
224		sid = dfa.next_state(sid, byte);
225		at += 1;
226		if dfa.is_special_state(sid) {
227		if dfa.is_match_state(sid) {
228		return true;
229		} else if dfa.is_dead_state(sid) {
230		return false;
231		}
232		}
233		}
234		dfa.is_match_state(dfa.next_eoi_state(sid))
235		}
236
237		#[cfg(feature = "alloc")]
238		#[inline(always)]
239		pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
240		use core::{ptr, sync::atomic::AtomicPtr};
241
242		use crate::{
243		dfa::{
244		dense::{self, DFA},
245		Automaton,
246		},
247		nfa::thompson::NFA,
248		};
249
250		static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
251
252		let dfa = lazy::get_or_init(&WORD, \|\| {
253		dense::Builder::new()
254		.configure(dense::Config::new().anchored(true))
255		.thompson(NFA::config().reverse(true).shrink(true))
256		.build(r"\w")
257		.unwrap()
258		});
259
260		// This is OK since '\w' contains no look-around.
261		let mut sid = dfa.universal_start_state();
262		while at > 0 {
263		at -= 1;
264		let byte = bytes[at];
265		sid = dfa.next_state(sid, byte);
266		if dfa.is_special_state(sid) {
267		if dfa.is_match_state(sid) {
268		return true;
269		} else if dfa.is_dead_state(sid) {
270		return false;
271		}
272		}
273		}
274		dfa.is_match_state(dfa.next_eoi_state(sid))
275		}