Coverage Report

Created: 2025-11-11 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.2.0/src/util/mod.rs
Line
Count
Source
1
/*!
2
TODO
3
*/
4
5
use core::{ascii, fmt, str};
6
7
#[cfg(feature = "alloc")]
8
use alloc::vec::Vec;
9
10
pub mod alphabet;
11
pub(crate) mod bytes;
12
#[cfg(feature = "alloc")]
13
pub(crate) mod determinize;
14
pub mod id;
15
#[cfg(feature = "alloc")]
16
pub(crate) mod lazy;
17
pub(crate) mod matchtypes;
18
pub mod prefilter;
19
#[cfg(feature = "alloc")]
20
pub(crate) mod sparse_set;
21
pub(crate) mod start;
22
#[cfg(feature = "alloc")]
23
pub(crate) mod syntax;
24
25
/// The offset, in bytes, that a match is delayed by in the DFAs generated by
26
/// this crate. (This includes lazy DFAs.)
27
///
28
/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
29
/// and $. In particular, both of these operators may require the
30
/// identification of the end of input in order to confirm a match. Not only
31
/// does this mean that all matches must therefore be delayed by a single byte,
32
/// but that a special EOI value is added to the alphabet of all DFAs. (Which
33
/// means that even though the alphabet of a DFA is typically all byte values,
34
/// the actual maximum alphabet size is 257 due to the extra EOI value.)
35
///
36
/// Since we delay matches by only 1 byte, this can't fully support a
37
/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
38
/// DFAs in this crate do not support it. (It's not as simple as just
39
/// increasing the match offset to do it---otherwise we would---but building
40
/// the full Unicode-aware word boundary detection into an automaton is quite
41
/// tricky.)
42
pub(crate) const MATCH_OFFSET: usize = 1;
43
44
/// A type that wraps a single byte with a convenient fmt::Debug impl that
45
/// escapes the byte.
46
pub(crate) struct DebugByte(pub u8);
47
48
impl fmt::Debug for DebugByte {
49
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
50
        // 10 bytes is enough to cover any output from ascii::escape_default.
51
0
        let mut bytes = [0u8; 10];
52
0
        let mut len = 0;
53
0
        for (i, mut b) in ascii::escape_default(self.0).enumerate() {
54
            // capitalize \xab to \xAB
55
0
            if i >= 2 && b'a' <= b && b <= b'f' {
56
0
                b -= 32;
57
0
            }
58
0
            bytes[len] = b;
59
0
            len += 1;
60
        }
61
0
        write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
62
0
    }
63
}
64
65
/// Returns the smallest possible index of the next valid UTF-8 sequence
66
/// starting after `i`.
67
///
68
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
69
/// value is guaranteed to be greater than `i`.
70
///
71
/// Generally speaking, this should only be called on `text` when it is
72
/// permitted to assume that it is valid UTF-8 and where either `i >=
73
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
74
#[inline(always)]
75
0
pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
76
0
    let b = match text.get(i) {
77
0
        None => return i.checked_add(1).unwrap(),
78
0
        Some(&b) => b,
79
    };
80
    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
81
    // other than just start at the next byte.
82
0
    let inc = utf8_len(b).unwrap_or(1);
83
0
    i.checked_add(inc).unwrap()
84
0
}
85
86
/// Returns true if and only if the given byte is considered a word character.
87
/// This only applies to ASCII.
88
///
89
/// This was copied from regex-syntax so that we can use it to determine the
90
/// starting DFA state while searching without depending on regex-syntax. The
91
/// definition is never going to change, so there's no maintenance/bit-rot
92
/// hazard here.
93
#[inline(always)]
94
0
pub(crate) fn is_word_byte(b: u8) -> bool {
95
0
    match b {
96
0
        b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
97
0
        _ => false,
98
    }
99
0
}
100
101
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
102
///
103
/// If no valid encoding of a codepoint exists at the beginning of the given
104
/// byte slice, then the first byte is returned instead.
105
///
106
/// This returns `None` if and only if `bytes` is empty.
107
#[inline(always)]
108
0
pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
109
0
    if bytes.is_empty() {
110
0
        return None;
111
0
    }
112
0
    let len = match utf8_len(bytes[0]) {
113
0
        None => return Some(Err(bytes[0])),
114
0
        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
115
0
        Some(1) => return Some(Ok(bytes[0] as char)),
116
0
        Some(len) => len,
117
    };
118
0
    match str::from_utf8(&bytes[..len]) {
119
0
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
120
0
        Err(_) => Some(Err(bytes[0])),
121
    }
122
0
}
123
124
/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
125
///
126
/// If no valid encoding of a codepoint exists at the end of the given byte
127
/// slice, then the last byte is returned instead.
128
///
129
/// This returns `None` if and only if `bytes` is empty.
130
#[inline(always)]
131
0
pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
132
0
    if bytes.is_empty() {
133
0
        return None;
134
0
    }
135
0
    let mut start = bytes.len() - 1;
136
0
    let limit = bytes.len().saturating_sub(4);
137
0
    while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
138
0
        start -= 1;
139
0
    }
140
0
    match decode_utf8(&bytes[start..]) {
141
0
        None => None,
142
0
        Some(Ok(ch)) => Some(Ok(ch)),
143
0
        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
144
    }
145
0
}
146
147
/// Given a UTF-8 leading byte, this returns the total number of code units
148
/// in the following encoded codepoint.
149
///
150
/// If the given byte is not a valid UTF-8 leading byte, then this returns
151
/// `None`.
152
#[inline(always)]
153
0
fn utf8_len(byte: u8) -> Option<usize> {
154
0
    if byte <= 0x7F {
155
0
        return Some(1);
156
0
    } else if byte & 0b1100_0000 == 0b1000_0000 {
157
0
        return None;
158
0
    } else if byte <= 0b1101_1111 {
159
0
        Some(2)
160
0
    } else if byte <= 0b1110_1111 {
161
0
        Some(3)
162
0
    } else if byte <= 0b1111_0111 {
163
0
        Some(4)
164
    } else {
165
0
        None
166
    }
167
0
}
168
169
/// Returns true if and only if the given byte is either a valid leading UTF-8
170
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
171
/// valid UTF-8 sequence.
172
#[inline(always)]
173
0
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
174
    // In the ASCII case, the most significant bit is never set. The leading
175
    // byte of a 2/3/4-byte sequence always has the top two most significant
176
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
177
    // also returns true, since every such byte has its two most significant
178
    // bits set:
179
    //
180
    //     \xC0 :: 11000000
181
    //     \xC1 :: 11000001
182
    //     \xF5 :: 11110101
183
    //     \xF6 :: 11110110
184
    //     \xF7 :: 11110111
185
    //     \xF8 :: 11111000
186
    //     \xF9 :: 11111001
187
    //     \xFA :: 11111010
188
    //     \xFB :: 11111011
189
    //     \xFC :: 11111100
190
    //     \xFD :: 11111101
191
    //     \xFE :: 11111110
192
    //     \xFF :: 11111111
193
0
    (b & 0b1100_0000) != 0b1000_0000
194
0
}
195
196
#[cfg(feature = "alloc")]
197
#[inline(always)]
198
pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
199
    use core::{ptr, sync::atomic::AtomicPtr};
200
201
    use crate::{
202
        dfa::{
203
            dense::{self, DFA},
204
            Automaton,
205
        },
206
        util::lazy,
207
    };
208
209
    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
210
211
    let dfa = lazy::get_or_init(&WORD, || {
212
        // TODO: Should we use a lazy DFA here instead? It does complicate
213
        // things somewhat, since we then need a mutable cache, which probably
214
        // means a thread local.
215
        dense::Builder::new()
216
            .configure(dense::Config::new().anchored(true))
217
            .build(r"\w")
218
            .unwrap()
219
    });
220
    // This is OK since '\w' contains no look-around.
221
    let mut sid = dfa.universal_start_state();
222
    while at < bytes.len() {
223
        let byte = bytes[at];
224
        sid = dfa.next_state(sid, byte);
225
        at += 1;
226
        if dfa.is_special_state(sid) {
227
            if dfa.is_match_state(sid) {
228
                return true;
229
            } else if dfa.is_dead_state(sid) {
230
                return false;
231
            }
232
        }
233
    }
234
    dfa.is_match_state(dfa.next_eoi_state(sid))
235
}
236
237
#[cfg(feature = "alloc")]
238
#[inline(always)]
239
pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
240
    use core::{ptr, sync::atomic::AtomicPtr};
241
242
    use crate::{
243
        dfa::{
244
            dense::{self, DFA},
245
            Automaton,
246
        },
247
        nfa::thompson::NFA,
248
    };
249
250
    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
251
252
    let dfa = lazy::get_or_init(&WORD, || {
253
        dense::Builder::new()
254
            .configure(dense::Config::new().anchored(true))
255
            .thompson(NFA::config().reverse(true).shrink(true))
256
            .build(r"\w")
257
            .unwrap()
258
    });
259
260
    // This is OK since '\w' contains no look-around.
261
    let mut sid = dfa.universal_start_state();
262
    while at > 0 {
263
        at -= 1;
264
        let byte = bytes[at];
265
        sid = dfa.next_state(sid, byte);
266
        if dfa.is_special_state(sid) {
267
            if dfa.is_match_state(sid) {
268
                return true;
269
            } else if dfa.is_dead_state(sid) {
270
                return false;
271
            }
272
        }
273
    }
274
    dfa.is_match_state(dfa.next_eoi_state(sid))
275
}