/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/util/utf8.rs

Source (jump to first uncovered line)
/*!
Utilities for dealing with UTF-8.

This module provides some UTF-8 related helper routines, including an
incremental decoder.
*/

/// Reports whether `b` is an ASCII word byte, i.e. one of `[0-9A-Za-z_]`.
///
/// Only ASCII is considered; every byte `>= 0x80` yields `false`.
///
/// This mirrors the definition in regex-syntax so that the starting DFA state
/// can be determined while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard in duplicating it here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    /// Builds the 256-entry membership table at compile time.
    const fn build_table() -> [bool; 256] {
        let mut table = [false; 256];
        let mut i = 0usize;
        while i < 256 {
            // `i` is always below 256 here, so the cast to `u8` is lossless.
            // (`as` is used because trait-based conversions cannot be called
            // from a const fn.)
            table[i] = matches!(
                i as u8,
                b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
            );
            i += 1;
        }
        table
    }
    const IS_WORD: [bool; 256] = build_table();
    IS_WORD[usize::from(b)]
}

/// Decodes the first UTF-8 encoded codepoint found at the start of `bytes`.
///
/// The result is one of:
///
/// * `None` if and only if `bytes` is empty.
/// * `Some(Ok(ch))` when `bytes` begins with a valid encoding of `ch`.
/// * `Some(Err(b))` otherwise, where `b` is the first byte of `bytes`.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for a
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
/// crate, then please file an issue and discuss your use case.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
    let first = *bytes.first()?;
    let width = match len(first) {
        // ASCII needs no validation at all.
        Some(1) => return Some(Ok(char::from(first))),
        // A multi-byte sequence that fits in what we were given.
        Some(n) if n <= bytes.len() => n,
        // Either not a leading byte, or the sequence is truncated.
        _ => return Some(Err(first)),
    };
    // Let the standard library do the actual validation (overlong forms,
    // surrogates, out-of-range values, bad continuation bytes).
    let decoded = core::str::from_utf8(&bytes[..width])
        // `width >= 2` valid bytes always contain at least one char.
        .map(|s| s.chars().next().unwrap())
        .map_err(|_| first);
    Some(decoded)
}

/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the end of the given byte
/// slice, then the last byte is returned instead. A codepoint only counts
/// when its encoding ends exactly where `bytes` ends: a valid codepoint that
/// is followed by stray continuation bytes (e.g. `a\x80`) yields
/// `Some(Err(0x80))`, not `Some(Ok('a'))`.
///
/// This returns `None` if and only if `bytes` is empty.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
    let last = *bytes.last()?;
    // A codepoint is encoded in at most 4 bytes, so never look further back
    // than that. Walk backwards over continuation bytes until we hit
    // something that could start a sequence (or is outright invalid).
    let limit = bytes.len().saturating_sub(4);
    let mut start = bytes.len() - 1;
    while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
        start -= 1;
    }
    // `bytes[start..]` is non-empty, so `decode` cannot return `None` here;
    // `?` merely forwards it if it ever did.
    match decode(&bytes[start..])? {
        // `decode` only looks at the *front* of what it is given. Make sure
        // the codepoint it found actually reaches the end of the slice,
        // otherwise trailing garbage would be silently ignored.
        Ok(ch) if start + ch.len_utf8() == bytes.len() => Some(Ok(ch)),
        _ => Some(Err(last)),
    }
}

/// Maps a UTF-8 leading byte to the total number of code units (bytes) in
/// the encoded codepoint it introduces.
///
/// Returns `None` when the byte cannot start a sequence: continuation bytes
/// (`0x80..=0xBF`) and bytes `0xF8..=0xFF`.
///
/// Note that this classifies by bit pattern only. Bytes such as `0xC0`,
/// `0xC1` and `0xF5..=0xF7` still report a length of 2 or 4 respectively.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    match byte {
        // 0xxxxxxx: ASCII.
        0x00..=0x7F => Some(1),
        // 10xxxxxx: continuation byte, never a leader.
        0x80..=0xBF => None,
        // 110xxxxx
        0xC0..=0xDF => Some(2),
        // 1110xxxx
        0xE0..=0xEF => Some(3),
        // 11110xxx
        0xF0..=0xF7 => Some(4),
        // 11111xxx: not used by UTF-8.
        0xF8..=0xFF => None,
    }
}

/// Reports whether offset `i` in `bytes` lies on a UTF-8 codepoint boundary.
///
/// The offset `bytes.len()` is always a boundary; any offset beyond it never
/// is.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    // Out of range: the end of the slice denotes the empty string, which is a
    // valid boundary. Anything past the end cannot sensibly be called one.
    let at_end = i == bytes.len();
    bytes.get(i).map_or(at_end, |&b| {
        // Continuation bytes are exactly those of the form 10xxxxxx. ASCII
        // bytes (0xxxxxxx) and leading bytes (11xxxxxx) both begin a new
        // codepoint, so everything that is not a continuation byte is a
        // boundary.
        (b & 0b1100_0000) != 0b1000_0000
    })
}

/// Reports whether `b` is anything other than a UTF-8 continuation byte.
///
/// That is, this returns true if and only if `b` is either a valid leading
/// UTF-8 byte (including ASCII), or is otherwise an invalid byte that can
/// never appear anywhere in a valid UTF-8 sequence.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // Continuation bytes have the bit pattern 10xxxxxx, i.e. they occupy
    // exactly the range 0x80..=0xBF. Everything else qualifies:
    //
    //   * ASCII never has its most significant bit set.
    //   * The leading byte of a 2/3/4-byte sequence always has its two most
    //     significant bits set.
    //   * Bytes that can never appear in valid UTF-8 (\xC0, \xC1 and
    //     \xF5 through \xFF) also all have their two most significant bits
    //     set.
    !matches!(b, 0x80..=0xBF)
}

/*
/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
/// value is guaranteed to be greater than `i`. (If there is no value greater
/// than `i` that fits in `usize`, then this panics.)
///
/// Generally speaking, this should only be called on `text` when it is
/// permitted to assume that it is valid UTF-8 and where either `i >=
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
///
/// NOTE: This method was used in a previous conception of iterators where we
/// specifically tried to skip over empty matches that split a codepoint by
/// simply requiring that our next search begin at the beginning of a codepoint.
/// But we ended up changing that technique to always advance by 1 byte and
/// then filter out matches that split a codepoint after-the-fact. Thus, we no
/// longer use this method. But I've kept it around in case we want to switch
/// back to this approach. Its guarantees are a little subtle, so I'd prefer
/// not to rebuild it from whole cloth.
pub(crate) fn next(text: &[u8], i: usize) -> usize {
    let b = match text.get(i) {
        None => return i.checked_add(1).unwrap(),
        Some(&b) => b,
    };
    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
    // other than just start at the next byte.
    let inc = len(b).unwrap_or(1);
    i.checked_add(inc).unwrap()
}
*/

Coverage Report

Created: 2025-07-11 07:02

Line	Count	Source (jump to first uncovered line)
1		/*!
2		Utilities for dealing with UTF-8.
3
4		This module provides some UTF-8 related helper routines, including an
5		incremental decoder.
6		*/
7
8		/// Returns true if and only if the given byte is considered a word character.
9		/// This only applies to ASCII.
10		///
11		/// This was copied from regex-syntax so that we can use it to determine the
12		/// starting DFA state while searching without depending on regex-syntax. The
13		/// definition is never going to change, so there's no maintenance/bit-rot
14		/// hazard here.
15		#[cfg_attr(feature = "perf-inline", inline(always))]
16	0	pub(crate) fn is_word_byte(b: u8) -> bool {
17	0	const fn mkwordset() -> [bool; 256] {
18	0	// FIXME: Use as_usize() once const functions in traits are stable.
19	0	let mut set = [false; 256];
20	0	set[b'_' as usize] = true;
21	0
22	0	let mut byte = b'0';
23	0	while byte <= b'9' {
24	0	set[byte as usize] = true;
25	0	byte += 1;
26	0	}
27	0	byte = b'A';
28	0	while byte <= b'Z' {
29	0	set[byte as usize] = true;
30	0	byte += 1;
31	0	}
32	0	byte = b'a';
33	0	while byte <= b'z' {
34	0	set[byte as usize] = true;
35	0	byte += 1;
36	0	}
37	0	set
38	0	}
39		const WORD: [bool; 256] = mkwordset();
40	0	WORD[b as usize]
41	0	}
42
43		/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
44		///
45		/// If no valid encoding of a codepoint exists at the beginning of the given
46		/// byte slice, then the first byte is returned instead.
47		///
48		/// This returns `None` if and only if `bytes` is empty.
49		///
50		/// This never panics.
51		///
52		/// WARNING: This is not designed for performance. If you're looking for a
53		/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
54		/// crate, then please file an issue and discuss your use case.
55		#[cfg_attr(feature = "perf-inline", inline(always))]
56	0	pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
57	0	if bytes.is_empty() {
58	0	return None;
59	0	}
60	0	let len = match len(bytes[0]) {
61	0	None => return Some(Err(bytes[0])),
62	0	Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
63	0	Some(1) => return Some(Ok(char::from(bytes[0]))),
64	0	Some(len) => len,
65	0	};
66	0	match core::str::from_utf8(&bytes[..len]) {
67	0	Ok(s) => Some(Ok(s.chars().next().unwrap())),
68	0	Err(_) => Some(Err(bytes[0])),
69		}
70	0	}
71
72		/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
73		///
74		/// If no valid encoding of a codepoint exists at the end of the given byte
75		/// slice, then the last byte is returned instead.
76		///
77		/// This returns `None` if and only if `bytes` is empty.
78		#[cfg_attr(feature = "perf-inline", inline(always))]
79	0	pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
80	0	if bytes.is_empty() {
81	0	return None;
82	0	}
83	0	let mut start = bytes.len() - 1;
84	0	let limit = bytes.len().saturating_sub(4);
85	0	while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
86	0	start -= 1;
87	0	}
88	0	match decode(&bytes[start..]) {
89	0	None => None,
90	0	Some(Ok(ch)) => Some(Ok(ch)),
91	0	Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
92		}
93	0	}
94
95		/// Given a UTF-8 leading byte, this returns the total number of code units
96		/// in the following encoded codepoint.
97		///
98		/// If the given byte is not a valid UTF-8 leading byte, then this returns
99		/// `None`.
100		#[cfg_attr(feature = "perf-inline", inline(always))]
101	0	fn len(byte: u8) -> Option<usize> {
102	0	if byte <= 0x7F {
103	0	return Some(1);
104	0	} else if byte & 0b1100_0000 == 0b1000_0000 {
105	0	return None;
106	0	} else if byte <= 0b1101_1111 {
107	0	Some(2)
108	0	} else if byte <= 0b1110_1111 {
109	0	Some(3)
110	0	} else if byte <= 0b1111_0111 {
111	0	Some(4)
112		} else {
113	0	None
114		}
115	0	}
116
117		/// Returns true if and only if the given offset in the given bytes falls on a
118		/// valid UTF-8 encoded codepoint boundary.
119		///
120		/// If `bytes` is not valid UTF-8, then the behavior of this routine is
121		/// unspecified.
122		#[cfg_attr(feature = "perf-inline", inline(always))]
123	0	pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
124	0	match bytes.get(i) {
125		// The position at the end of the bytes always represents an empty
126		// string, which is a valid boundary. But anything after that doesn't
127		// make much sense to call valid a boundary.
128	0	None => i == bytes.len(),
129		// Other than ASCII (where the most significant bit is never set),
130		// valid starting bytes always have their most significant two bits
131		// set, where as continuation bytes never have their second most
132		// significant bit set. Therefore, this only returns true when bytes[i]
133		// corresponds to a byte that begins a valid UTF-8 encoding of a
134		// Unicode scalar value.
135	0	Some(&b) => b <= 0b0111_1111 \|\| b >= 0b1100_0000,
136		}
137	0	}
138
139		/// Returns true if and only if the given byte is either a valid leading UTF-8
140		/// byte, or is otherwise an invalid byte that can never appear anywhere in a
141		/// valid UTF-8 sequence.
142		#[cfg_attr(feature = "perf-inline", inline(always))]
143	0	fn is_leading_or_invalid_byte(b: u8) -> bool {
144	0	// In the ASCII case, the most significant bit is never set. The leading
145	0	// byte of a 2/3/4-byte sequence always has the top two most significant
146	0	// bits set. For bytes that can never appear anywhere in valid UTF-8, this
147	0	// also returns true, since every such byte has its two most significant
148	0	// bits set:
149	0	//
150	0	// \xC0 :: 11000000
151	0	// \xC1 :: 11000001
152	0	// \xF5 :: 11110101
153	0	// \xF6 :: 11110110
154	0	// \xF7 :: 11110111
155	0	// \xF8 :: 11111000
156	0	// \xF9 :: 11111001
157	0	// \xFA :: 11111010
158	0	// \xFB :: 11111011
159	0	// \xFC :: 11111100
160	0	// \xFD :: 11111101
161	0	// \xFE :: 11111110
162	0	// \xFF :: 11111111
163	0	(b & 0b1100_0000) != 0b1000_0000
164	0	}
165
166		/*
167		/// Returns the smallest possible index of the next valid UTF-8 sequence
168		/// starting after `i`.
169		///
170		/// For all inputs, including invalid UTF-8 and any value of `i`, the return
171		/// value is guaranteed to be greater than `i`. (If there is no value greater
172		/// than `i` that fits in `usize`, then this panics.)
173		///
174		/// Generally speaking, this should only be called on `text` when it is
175		/// permitted to assume that it is valid UTF-8 and where either `i >=
176		/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
177		///
178		/// NOTE: This method was used in a previous conception of iterators where we
179		/// specifically tried to skip over empty matches that split a codepoint by
180		/// simply requiring that our next search begin at the beginning of codepoint.
181		/// But we ended up changing that technique to always advance by 1 byte and
182		/// then filter out matches that split a codepoint after-the-fact. Thus, we no
183		/// longer use this method. But I've kept it around in case we want to switch
184		/// back to this approach. Its guarantees are a little subtle, so I'd prefer
185		/// not to rebuild it from whole cloth.
186		pub(crate) fn next(text: &[u8], i: usize) -> usize {
187		let b = match text.get(i) {
188		None => return i.checked_add(1).unwrap(),
189		Some(&b) => b,
190		};
191		// For cases where we see an invalid UTF-8 byte, there isn't much we can do
192		// other than just start at the next byte.
193		let inc = len(b).unwrap_or(1);
194		i.checked_add(inc).unwrap()
195		}
196		*/