/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.2.0/src/util/mod.rs
Line | Count | Source |
1 | | /*! |
2 | | TODO |
3 | | */ |
4 | | |
5 | | use core::{ascii, fmt, str}; |
6 | | |
7 | | #[cfg(feature = "alloc")] |
8 | | use alloc::vec::Vec; |
9 | | |
10 | | pub mod alphabet; |
11 | | pub(crate) mod bytes; |
12 | | #[cfg(feature = "alloc")] |
13 | | pub(crate) mod determinize; |
14 | | pub mod id; |
15 | | #[cfg(feature = "alloc")] |
16 | | pub(crate) mod lazy; |
17 | | pub(crate) mod matchtypes; |
18 | | pub mod prefilter; |
19 | | #[cfg(feature = "alloc")] |
20 | | pub(crate) mod sparse_set; |
21 | | pub(crate) mod start; |
22 | | #[cfg(feature = "alloc")] |
23 | | pub(crate) mod syntax; |
24 | | |
25 | | /// The offset, in bytes, that a match is delayed by in the DFAs generated by |
26 | | /// this crate. (This includes lazy DFAs.) |
27 | | /// |
28 | | /// The purpose of this delay is to support look-ahead such as \b (ASCII-only) |
29 | | /// and $. In particular, both of these operators may require the |
30 | | /// identification of the end of input in order to confirm a match. Not only |
31 | | /// does this mean that all matches must therefore be delayed by a single byte, |
32 | | /// but that a special EOI value is added to the alphabet of all DFAs. (Which |
33 | | /// means that even though the alphabet of a DFA is typically all byte values, |
34 | | /// the actual maximum alphabet size is 257 due to the extra EOI value.) |
35 | | /// |
36 | | /// Since we delay matches by only 1 byte, this can't fully support a |
37 | | /// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed, |
38 | | /// DFAs in this crate do not support it. (It's not as simple as just |
39 | | /// increasing the match offset to do it---otherwise we would---but building |
40 | | /// the full Unicode-aware word boundary detection into an automaton is quite |
41 | | /// tricky.) |
42 | | pub(crate) const MATCH_OFFSET: usize = 1; |
43 | | |
44 | | /// A type that wraps a single byte with a convenient fmt::Debug impl that |
45 | | /// escapes the byte. |
46 | | pub(crate) struct DebugByte(pub u8); |
47 | | |
48 | | impl fmt::Debug for DebugByte { |
49 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
50 | | // 10 bytes is enough to cover any output from ascii::escape_default. |
51 | 0 | let mut bytes = [0u8; 10]; |
52 | 0 | let mut len = 0; |
53 | 0 | for (i, mut b) in ascii::escape_default(self.0).enumerate() { |
54 | | // capitalize \xab to \xAB |
55 | 0 | if i >= 2 && b'a' <= b && b <= b'f' { |
56 | 0 | b -= 32; |
57 | 0 | } |
58 | 0 | bytes[len] = b; |
59 | 0 | len += 1; |
60 | | } |
61 | 0 | write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap()) |
62 | 0 | } |
63 | | } |
64 | | |
65 | | /// Returns the smallest possible index of the next valid UTF-8 sequence |
66 | | /// starting after `i`. |
67 | | /// |
68 | | /// For all inputs, including invalid UTF-8 and any value of `i`, the return |
69 | | /// value is guaranteed to be greater than `i`. |
70 | | /// |
71 | | /// Generally speaking, this should only be called on `text` when it is |
72 | | /// permitted to assume that it is valid UTF-8 and where either `i >= |
73 | | /// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. |
74 | | #[inline(always)] |
75 | 0 | pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize { |
76 | 0 | let b = match text.get(i) { |
77 | 0 | None => return i.checked_add(1).unwrap(), |
78 | 0 | Some(&b) => b, |
79 | | }; |
80 | | // For cases where we see an invalid UTF-8 byte, there isn't much we can do |
81 | | // other than just start at the next byte. |
82 | 0 | let inc = utf8_len(b).unwrap_or(1); |
83 | 0 | i.checked_add(inc).unwrap() |
84 | 0 | } |
85 | | |
86 | | /// Returns true if and only if the given byte is considered a word character. |
87 | | /// This only applies to ASCII. |
88 | | /// |
89 | | /// This was copied from regex-syntax so that we can use it to determine the |
90 | | /// starting DFA state while searching without depending on regex-syntax. The |
91 | | /// definition is never going to change, so there's no maintenance/bit-rot |
92 | | /// hazard here. |
93 | | #[inline(always)] |
94 | 0 | pub(crate) fn is_word_byte(b: u8) -> bool { |
95 | 0 | match b { |
96 | 0 | b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, |
97 | 0 | _ => false, |
98 | | } |
99 | 0 | } |
100 | | |
101 | | /// Decodes the next UTF-8 encoded codepoint from the given byte slice. |
102 | | /// |
103 | | /// If no valid encoding of a codepoint exists at the beginning of the given |
104 | | /// byte slice, then the first byte is returned instead. |
105 | | /// |
106 | | /// This returns `None` if and only if `bytes` is empty. |
107 | | #[inline(always)] |
108 | 0 | pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { |
109 | 0 | if bytes.is_empty() { |
110 | 0 | return None; |
111 | 0 | } |
112 | 0 | let len = match utf8_len(bytes[0]) { |
113 | 0 | None => return Some(Err(bytes[0])), |
114 | 0 | Some(len) if len > bytes.len() => return Some(Err(bytes[0])), |
115 | 0 | Some(1) => return Some(Ok(bytes[0] as char)), |
116 | 0 | Some(len) => len, |
117 | | }; |
118 | 0 | match str::from_utf8(&bytes[..len]) { |
119 | 0 | Ok(s) => Some(Ok(s.chars().next().unwrap())), |
120 | 0 | Err(_) => Some(Err(bytes[0])), |
121 | | } |
122 | 0 | } |
123 | | |
124 | | /// Decodes the last UTF-8 encoded codepoint from the given byte slice. |
125 | | /// |
126 | | /// If no valid encoding of a codepoint exists at the end of the given byte |
127 | | /// slice, then the last byte is returned instead. |
128 | | /// |
129 | | /// This returns `None` if and only if `bytes` is empty. |
130 | | #[inline(always)] |
131 | 0 | pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { |
132 | 0 | if bytes.is_empty() { |
133 | 0 | return None; |
134 | 0 | } |
135 | 0 | let mut start = bytes.len() - 1; |
136 | 0 | let limit = bytes.len().saturating_sub(4); |
137 | 0 | while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) { |
138 | 0 | start -= 1; |
139 | 0 | } |
140 | 0 | match decode_utf8(&bytes[start..]) { |
141 | 0 | None => None, |
142 | 0 | Some(Ok(ch)) => Some(Ok(ch)), |
143 | 0 | Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), |
144 | | } |
145 | 0 | } |
146 | | |
147 | | /// Given a UTF-8 leading byte, this returns the total number of code units |
148 | | /// in the following encoded codepoint. |
149 | | /// |
150 | | /// If the given byte is not a valid UTF-8 leading byte, then this returns |
151 | | /// `None`. |
152 | | #[inline(always)] |
153 | 0 | fn utf8_len(byte: u8) -> Option<usize> { |
154 | 0 | if byte <= 0x7F { |
155 | 0 | return Some(1); |
156 | 0 | } else if byte & 0b1100_0000 == 0b1000_0000 { |
157 | 0 | return None; |
158 | 0 | } else if byte <= 0b1101_1111 { |
159 | 0 | Some(2) |
160 | 0 | } else if byte <= 0b1110_1111 { |
161 | 0 | Some(3) |
162 | 0 | } else if byte <= 0b1111_0111 { |
163 | 0 | Some(4) |
164 | | } else { |
165 | 0 | None |
166 | | } |
167 | 0 | } |
168 | | |
169 | | /// Returns true if and only if the given byte is either a valid leading UTF-8 |
170 | | /// byte, or is otherwise an invalid byte that can never appear anywhere in a |
171 | | /// valid UTF-8 sequence. |
172 | | #[inline(always)] |
173 | 0 | fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { |
174 | | // In the ASCII case, the most significant bit is never set. The leading |
175 | | // byte of a 2/3/4-byte sequence always has the top two most significant |
176 | | // bits set. For bytes that can never appear anywhere in valid UTF-8, this |
177 | | // also returns true, since every such byte has its two most significant |
178 | | // bits set: |
179 | | // |
180 | | // \xC0 :: 11000000 |
181 | | // \xC1 :: 11000001 |
182 | | // \xF5 :: 11110101 |
183 | | // \xF6 :: 11110110 |
184 | | // \xF7 :: 11110111 |
185 | | // \xF8 :: 11111000 |
186 | | // \xF9 :: 11111001 |
187 | | // \xFA :: 11111010 |
188 | | // \xFB :: 11111011 |
189 | | // \xFC :: 11111100 |
190 | | // \xFD :: 11111101 |
191 | | // \xFE :: 11111110 |
192 | | // \xFF :: 11111111 |
193 | 0 | (b & 0b1100_0000) != 0b1000_0000 |
194 | 0 | } |
195 | | |
196 | | #[cfg(feature = "alloc")] |
197 | | #[inline(always)] |
198 | | pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool { |
199 | | use core::{ptr, sync::atomic::AtomicPtr}; |
200 | | |
201 | | use crate::{ |
202 | | dfa::{ |
203 | | dense::{self, DFA}, |
204 | | Automaton, |
205 | | }, |
206 | | util::lazy, |
207 | | }; |
208 | | |
209 | | static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); |
210 | | |
211 | | let dfa = lazy::get_or_init(&WORD, || { |
212 | | // TODO: Should we use a lazy DFA here instead? It does complicate |
213 | | // things somewhat, since we then need a mutable cache, which probably |
214 | | // means a thread local. |
215 | | dense::Builder::new() |
216 | | .configure(dense::Config::new().anchored(true)) |
217 | | .build(r"\w") |
218 | | .unwrap() |
219 | | }); |
220 | | // This is OK since '\w' contains no look-around. |
221 | | let mut sid = dfa.universal_start_state(); |
222 | | while at < bytes.len() { |
223 | | let byte = bytes[at]; |
224 | | sid = dfa.next_state(sid, byte); |
225 | | at += 1; |
226 | | if dfa.is_special_state(sid) { |
227 | | if dfa.is_match_state(sid) { |
228 | | return true; |
229 | | } else if dfa.is_dead_state(sid) { |
230 | | return false; |
231 | | } |
232 | | } |
233 | | } |
234 | | dfa.is_match_state(dfa.next_eoi_state(sid)) |
235 | | } |
236 | | |
237 | | #[cfg(feature = "alloc")] |
238 | | #[inline(always)] |
239 | | pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool { |
240 | | use core::{ptr, sync::atomic::AtomicPtr}; |
241 | | |
242 | | use crate::{ |
243 | | dfa::{ |
244 | | dense::{self, DFA}, |
245 | | Automaton, |
246 | | }, |
247 | | nfa::thompson::NFA, |
248 | | }; |
249 | | |
250 | | static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); |
251 | | |
252 | | let dfa = lazy::get_or_init(&WORD, || { |
253 | | dense::Builder::new() |
254 | | .configure(dense::Config::new().anchored(true)) |
255 | | .thompson(NFA::config().reverse(true).shrink(true)) |
256 | | .build(r"\w") |
257 | | .unwrap() |
258 | | }); |
259 | | |
260 | | // This is OK since '\w' contains no look-around. |
261 | | let mut sid = dfa.universal_start_state(); |
262 | | while at > 0 { |
263 | | at -= 1; |
264 | | let byte = bytes[at]; |
265 | | sid = dfa.next_state(sid, byte); |
266 | | if dfa.is_special_state(sid) { |
267 | | if dfa.is_match_state(sid) { |
268 | | return true; |
269 | | } else if dfa.is_dead_state(sid) { |
270 | | return false; |
271 | | } |
272 | | } |
273 | | } |
274 | | dfa.is_match_state(dfa.next_eoi_state(sid)) |
275 | | } |