/rust/registry/src/github.com-1ecc6299db9ec823/regex-1.4.3/src/input.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use std::char; |
2 | | use std::cmp::Ordering; |
3 | | use std::fmt; |
4 | | use std::ops; |
5 | | use std::u32; |
6 | | |
7 | | use syntax; |
8 | | |
9 | | use literal::LiteralSearcher; |
10 | | use prog::InstEmptyLook; |
11 | | use utf8::{decode_last_utf8, decode_utf8}; |
12 | | |
13 | | /// Represents a location in the input. |
14 | 0 | #[derive(Clone, Copy, Debug)] |
15 | | pub struct InputAt { |
16 | | pos: usize, |
17 | | c: Char, |
18 | | byte: Option<u8>, |
19 | | len: usize, |
20 | | } |
21 | | |
22 | | impl InputAt { |
23 | | /// Returns true iff this position is at the beginning of the input. |
24 | | pub fn is_start(&self) -> bool { |
25 | | self.pos == 0 |
26 | | } |
27 | | |
28 | | /// Returns true iff this position is past the end of the input. |
29 | | pub fn is_end(&self) -> bool { |
30 | | self.c.is_none() && self.byte.is_none() |
31 | | } |
32 | | |
33 | | /// Returns the character at this position. |
34 | | /// |
35 | | /// If this position is just before or after the input, then an absent |
36 | | /// character is returned. |
37 | | pub fn char(&self) -> Char { |
38 | | self.c |
39 | | } |
40 | | |
41 | | /// Returns the byte at this position. |
42 | | pub fn byte(&self) -> Option<u8> { |
43 | | self.byte |
44 | | } |
45 | | |
46 | | /// Returns the UTF-8 width of the character at this position. |
47 | | pub fn len(&self) -> usize { |
48 | | self.len |
49 | | } |
50 | | |
51 | | /// Returns whether the UTF-8 width of the character at this position |
52 | | /// is zero. |
53 | | pub fn is_empty(&self) -> bool { |
54 | | self.len == 0 |
55 | | } |
56 | | |
57 | | /// Returns the byte offset of this position. |
58 | | pub fn pos(&self) -> usize { |
59 | | self.pos |
60 | | } |
61 | | |
62 | | /// Returns the byte offset of the next position in the input. |
63 | | pub fn next_pos(&self) -> usize { |
64 | | self.pos + self.len |
65 | | } |
66 | | } |
67 | | |
68 | | /// An abstraction over input used in the matching engines. |
69 | | pub trait Input: fmt::Debug { |
70 | | /// Return an encoding of the position at byte offset `i`. |
71 | | fn at(&self, i: usize) -> InputAt; |
72 | | |
73 | | /// Return the Unicode character occurring next to `at`. |
74 | | /// |
75 | | /// If no such character could be decoded, then `Char` is absent. |
76 | | fn next_char(&self, at: InputAt) -> Char; |
77 | | |
78 | | /// Return the Unicode character occurring previous to `at`. |
79 | | /// |
80 | | /// If no such character could be decoded, then `Char` is absent. |
81 | | fn previous_char(&self, at: InputAt) -> Char; |
82 | | |
83 | | /// Return true if the given empty width instruction matches at the |
84 | | /// input position given. |
85 | | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; |
86 | | |
87 | | /// Scan the input for a matching prefix. |
88 | | fn prefix_at( |
89 | | &self, |
90 | | prefixes: &LiteralSearcher, |
91 | | at: InputAt, |
92 | | ) -> Option<InputAt>; |
93 | | |
94 | | /// The number of bytes in the input. |
95 | | fn len(&self) -> usize; |
96 | | |
97 | | /// Whether the input is empty. |
98 | 0 | fn is_empty(&self) -> bool { |
99 | 0 | self.len() == 0 |
100 | 0 | } |
101 | | |
102 | | /// Return the given input as a sequence of bytes. |
103 | | fn as_bytes(&self) -> &[u8]; |
104 | | } |
105 | | |
106 | | impl<'a, T: Input> Input for &'a T { |
107 | 0 | fn at(&self, i: usize) -> InputAt { |
108 | 0 | (**self).at(i) |
109 | 0 | } |
110 | | |
111 | 0 | fn next_char(&self, at: InputAt) -> Char { |
112 | 0 | (**self).next_char(at) |
113 | 0 | } |
114 | | |
115 | 0 | fn previous_char(&self, at: InputAt) -> Char { |
116 | 0 | (**self).previous_char(at) |
117 | 0 | } |
118 | | |
119 | 0 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
120 | 0 | (**self).is_empty_match(at, empty) |
121 | 0 | } |
122 | | |
123 | 0 | fn prefix_at( |
124 | 0 | &self, |
125 | 0 | prefixes: &LiteralSearcher, |
126 | 0 | at: InputAt, |
127 | 0 | ) -> Option<InputAt> { |
128 | 0 | (**self).prefix_at(prefixes, at) |
129 | 0 | } |
130 | | |
131 | 0 | fn len(&self) -> usize { |
132 | 0 | (**self).len() |
133 | 0 | } |
134 | | |
135 | 0 | fn as_bytes(&self) -> &[u8] { |
136 | 0 | (**self).as_bytes() |
137 | 0 | } |
138 | | } |
139 | | |
140 | | /// An input reader over characters. |
141 | 0 | #[derive(Clone, Copy, Debug)] |
142 | | pub struct CharInput<'t>(&'t [u8]); |
143 | | |
144 | | impl<'t> CharInput<'t> { |
145 | | /// Return a new character input reader for the given string. |
146 | | pub fn new(s: &'t [u8]) -> CharInput<'t> { |
147 | | CharInput(s) |
148 | | } |
149 | | } |
150 | | |
151 | | impl<'t> ops::Deref for CharInput<'t> { |
152 | | type Target = [u8]; |
153 | | |
154 | | fn deref(&self) -> &[u8] { |
155 | | self.0 |
156 | | } |
157 | | } |
158 | | |
159 | | impl<'t> Input for CharInput<'t> { |
160 | 0 | fn at(&self, i: usize) -> InputAt { |
161 | 0 | if i >= self.len() { |
162 | 0 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
163 | | } else { |
164 | 0 | let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); |
165 | 0 | InputAt { pos: i, c: c, byte: None, len: c.len_utf8() } |
166 | | } |
167 | 0 | } |
168 | | |
169 | | fn next_char(&self, at: InputAt) -> Char { |
170 | | at.char() |
171 | | } |
172 | | |
173 | 0 | fn previous_char(&self, at: InputAt) -> Char { |
174 | 0 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
175 | 0 | } |
176 | | |
177 | | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
178 | | use prog::EmptyLook::*; |
179 | | match empty.look { |
180 | | StartLine => { |
181 | | let c = self.previous_char(at); |
182 | | at.pos() == 0 || c == '\n' |
183 | | } |
184 | | EndLine => { |
185 | | let c = self.next_char(at); |
186 | | at.pos() == self.len() || c == '\n' |
187 | | } |
188 | | StartText => at.pos() == 0, |
189 | | EndText => at.pos() == self.len(), |
190 | | WordBoundary => { |
191 | | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
192 | | c1.is_word_char() != c2.is_word_char() |
193 | | } |
194 | | NotWordBoundary => { |
195 | | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
196 | | c1.is_word_char() == c2.is_word_char() |
197 | | } |
198 | | WordBoundaryAscii => { |
199 | | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
200 | | c1.is_word_byte() != c2.is_word_byte() |
201 | | } |
202 | | NotWordBoundaryAscii => { |
203 | | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
204 | | c1.is_word_byte() == c2.is_word_byte() |
205 | | } |
206 | | } |
207 | | } |
208 | | |
209 | 0 | fn prefix_at( |
210 | 0 | &self, |
211 | 0 | prefixes: &LiteralSearcher, |
212 | 0 | at: InputAt, |
213 | 0 | ) -> Option<InputAt> { |
214 | 0 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
215 | 0 | } |
216 | | |
217 | | fn len(&self) -> usize { |
218 | | self.0.len() |
219 | | } |
220 | | |
221 | | fn as_bytes(&self) -> &[u8] { |
222 | | self.0 |
223 | | } |
224 | | } |
225 | | |
226 | | /// An input reader over bytes. |
227 | 0 | #[derive(Clone, Copy, Debug)] |
228 | | pub struct ByteInput<'t> { |
229 | | text: &'t [u8], |
230 | | only_utf8: bool, |
231 | | } |
232 | | |
233 | | impl<'t> ByteInput<'t> { |
234 | | /// Return a new byte-based input reader for the given string. |
235 | | pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { |
236 | | ByteInput { text: text, only_utf8: only_utf8 } |
237 | | } |
238 | | } |
239 | | |
240 | | impl<'t> ops::Deref for ByteInput<'t> { |
241 | | type Target = [u8]; |
242 | | |
243 | | fn deref(&self) -> &[u8] { |
244 | | self.text |
245 | | } |
246 | | } |
247 | | |
248 | | impl<'t> Input for ByteInput<'t> { |
249 | 0 | fn at(&self, i: usize) -> InputAt { |
250 | 0 | if i >= self.len() { |
251 | 0 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
252 | | } else { |
253 | 0 | InputAt { |
254 | 0 | pos: i, |
255 | 0 | c: None.into(), |
256 | 0 | byte: self.get(i).cloned(), |
257 | 0 | len: 1, |
258 | 0 | } |
259 | | } |
260 | 0 | } |
261 | | |
262 | 0 | fn next_char(&self, at: InputAt) -> Char { |
263 | 0 | decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() |
264 | 0 | } |
265 | | |
266 | 0 | fn previous_char(&self, at: InputAt) -> Char { |
267 | 0 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
268 | 0 | } |
269 | | |
270 | 0 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
271 | 0 | use prog::EmptyLook::*; |
272 | 0 | match empty.look { |
273 | 0 | StartLine => { |
274 | 0 | let c = self.previous_char(at); |
275 | 0 | at.pos() == 0 || c == '\n' |
276 | | } |
277 | | EndLine => { |
278 | 0 | let c = self.next_char(at); |
279 | 0 | at.pos() == self.len() || c == '\n' |
280 | | } |
281 | 0 | StartText => at.pos() == 0, |
282 | 0 | EndText => at.pos() == self.len(), |
283 | | WordBoundary => { |
284 | 0 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
285 | 0 | c1.is_word_char() != c2.is_word_char() |
286 | | } |
287 | | NotWordBoundary => { |
288 | 0 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
289 | 0 | c1.is_word_char() == c2.is_word_char() |
290 | | } |
291 | | WordBoundaryAscii => { |
292 | 0 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
293 | 0 | if self.only_utf8 { |
294 | | // If we must match UTF-8, then we can't match word |
295 | | // boundaries at invalid UTF-8. |
296 | 0 | if c1.is_none() && !at.is_start() { |
297 | 0 | return false; |
298 | 0 | } |
299 | 0 | if c2.is_none() && !at.is_end() { |
300 | 0 | return false; |
301 | 0 | } |
302 | 0 | } |
303 | 0 | c1.is_word_byte() != c2.is_word_byte() |
304 | | } |
305 | | NotWordBoundaryAscii => { |
306 | 0 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
307 | 0 | if self.only_utf8 { |
308 | | // If we must match UTF-8, then we can't match word |
309 | | // boundaries at invalid UTF-8. |
310 | 0 | if c1.is_none() && !at.is_start() { |
311 | 0 | return false; |
312 | 0 | } |
313 | 0 | if c2.is_none() && !at.is_end() { |
314 | 0 | return false; |
315 | 0 | } |
316 | 0 | } |
317 | 0 | c1.is_word_byte() == c2.is_word_byte() |
318 | | } |
319 | | } |
320 | 0 | } |
321 | | |
322 | 0 | fn prefix_at( |
323 | 0 | &self, |
324 | 0 | prefixes: &LiteralSearcher, |
325 | 0 | at: InputAt, |
326 | 0 | ) -> Option<InputAt> { |
327 | 0 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
328 | 0 | } |
329 | | |
330 | | fn len(&self) -> usize { |
331 | | self.text.len() |
332 | | } |
333 | | |
334 | | fn as_bytes(&self) -> &[u8] { |
335 | | self.text |
336 | | } |
337 | | } |
338 | | |
339 | | /// An inline representation of `Option<char>`. |
340 | | /// |
341 | | /// This eliminates the need to do case analysis on `Option<char>` to determine |
342 | | /// ordinality with other characters. |
343 | | /// |
344 | | /// (The `Option<char>` is not related to encoding. Instead, it is used in the |
345 | | /// matching engines to represent the beginning and ending boundaries of the |
346 | | /// search text.) |
347 | 0 | #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] |
348 | | pub struct Char(u32); |
349 | | |
350 | | impl fmt::Debug for Char { |
351 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
352 | 0 | match char::from_u32(self.0) { |
353 | 0 | None => write!(f, "Empty"), |
354 | 0 | Some(c) => write!(f, "{:?}", c), |
355 | | } |
356 | 0 | } |
357 | | } |
358 | | |
359 | | impl Char { |
360 | | /// Returns true iff the character is absent. |
361 | | #[inline] |
362 | | pub fn is_none(self) -> bool { |
363 | | self.0 == u32::MAX |
364 | | } |
365 | | |
366 | | /// Returns the length of the character's UTF-8 encoding. |
367 | | /// |
368 | | /// If the character is absent, then `1` is returned. |
369 | | #[inline] |
370 | 0 | pub fn len_utf8(self) -> usize { |
371 | 0 | char::from_u32(self.0).map_or(1, |c| c.len_utf8()) |
372 | 0 | } |
373 | | |
374 | | /// Returns true iff the character is a word character. |
375 | | /// |
376 | | /// If the character is absent, then false is returned. |
377 | | pub fn is_word_char(self) -> bool { |
378 | | // is_word_character can panic if the Unicode data for \w isn't |
379 | | // available. However, our compiler ensures that if a Unicode word |
380 | | // boundary is used, then the data must also be available. If it isn't, |
381 | | // then the compiler returns an error. |
382 | | char::from_u32(self.0).map_or(false, syntax::is_word_character) |
383 | | } |
384 | | |
385 | | /// Returns true iff the byte is a word byte. |
386 | | /// |
387 | | /// If the byte is absent, then false is returned. |
388 | | pub fn is_word_byte(self) -> bool { |
389 | 0 | match char::from_u32(self.0) { |
390 | 0 | Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8), |
391 | 0 | None | Some(_) => false, |
392 | | } |
393 | 0 | } |
394 | | } |
395 | | |
396 | | impl From<char> for Char { |
397 | | fn from(c: char) -> Char { |
398 | | Char(c as u32) |
399 | | } |
400 | | } |
401 | | |
402 | | impl From<Option<char>> for Char { |
403 | 0 | fn from(c: Option<char>) -> Char { |
404 | 0 | c.map_or(Char(u32::MAX), |c| c.into()) |
405 | 0 | } |
406 | | } |
407 | | |
408 | | impl PartialEq<char> for Char { |
409 | | #[inline] |
410 | | fn eq(&self, other: &char) -> bool { |
411 | | self.0 == *other as u32 |
412 | | } |
413 | | } |
414 | | |
415 | | impl PartialEq<Char> for char { |
416 | | #[inline] |
417 | | fn eq(&self, other: &Char) -> bool { |
418 | | *self as u32 == other.0 |
419 | | } |
420 | | } |
421 | | |
422 | | impl PartialOrd<char> for Char { |
423 | | #[inline] |
424 | | fn partial_cmp(&self, other: &char) -> Option<Ordering> { |
425 | | self.0.partial_cmp(&(*other as u32)) |
426 | | } |
427 | | } |
428 | | |
429 | | impl PartialOrd<Char> for char { |
430 | | #[inline] |
431 | | fn partial_cmp(&self, other: &Char) -> Option<Ordering> { |
432 | | (*self as u32).partial_cmp(&other.0) |
433 | | } |
434 | | } |