Coverage Report

Created: 2021-03-22 08:29

/rust/registry/src/github.com-1ecc6299db9ec823/regex-1.4.3/src/input.rs
Line
Count
Source (jump to first uncovered line)
1
use std::char;
2
use std::cmp::Ordering;
3
use std::fmt;
4
use std::ops;
5
use std::u32;
6
7
use syntax;
8
9
use literal::LiteralSearcher;
10
use prog::InstEmptyLook;
11
use utf8::{decode_last_utf8, decode_utf8};
12
13
/// Represents a location in the input.
14
0
#[derive(Clone, Copy, Debug)]
15
pub struct InputAt {
16
    pos: usize,
17
    c: Char,
18
    byte: Option<u8>,
19
    len: usize,
20
}
21
22
impl InputAt {
23
    /// Returns true iff this position is at the beginning of the input.
24
    pub fn is_start(&self) -> bool {
25
        self.pos == 0
26
    }
27
28
    /// Returns true iff this position is past the end of the input.
29
    pub fn is_end(&self) -> bool {
30
        self.c.is_none() && self.byte.is_none()
31
    }
32
33
    /// Returns the character at this position.
34
    ///
35
    /// If this position is just before or after the input, then an absent
36
    /// character is returned.
37
    pub fn char(&self) -> Char {
38
        self.c
39
    }
40
41
    /// Returns the byte at this position.
42
    pub fn byte(&self) -> Option<u8> {
43
        self.byte
44
    }
45
46
    /// Returns the UTF-8 width of the character at this position.
47
    pub fn len(&self) -> usize {
48
        self.len
49
    }
50
51
    /// Returns whether the UTF-8 width of the character at this position
52
    /// is zero.
53
    pub fn is_empty(&self) -> bool {
54
        self.len == 0
55
    }
56
57
    /// Returns the byte offset of this position.
58
    pub fn pos(&self) -> usize {
59
        self.pos
60
    }
61
62
    /// Returns the byte offset of the next position in the input.
63
    pub fn next_pos(&self) -> usize {
64
        self.pos + self.len
65
    }
66
}
67
68
/// An abstraction over input used in the matching engines.
69
pub trait Input: fmt::Debug {
70
    /// Return an encoding of the position at byte offset `i`.
71
    fn at(&self, i: usize) -> InputAt;
72
73
    /// Return the Unicode character occurring next to `at`.
74
    ///
75
    /// If no such character could be decoded, then `Char` is absent.
76
    fn next_char(&self, at: InputAt) -> Char;
77
78
    /// Return the Unicode character occurring previous to `at`.
79
    ///
80
    /// If no such character could be decoded, then `Char` is absent.
81
    fn previous_char(&self, at: InputAt) -> Char;
82
83
    /// Return true if the given empty width instruction matches at the
84
    /// input position given.
85
    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
86
87
    /// Scan the input for a matching prefix.
88
    fn prefix_at(
89
        &self,
90
        prefixes: &LiteralSearcher,
91
        at: InputAt,
92
    ) -> Option<InputAt>;
93
94
    /// The number of bytes in the input.
95
    fn len(&self) -> usize;
96
97
    /// Whether the input is empty.
98
0
    fn is_empty(&self) -> bool {
99
0
        self.len() == 0
100
0
    }
101
102
    /// Return the given input as a sequence of bytes.
103
    fn as_bytes(&self) -> &[u8];
104
}
105
106
impl<'a, T: Input> Input for &'a T {
107
0
    fn at(&self, i: usize) -> InputAt {
108
0
        (**self).at(i)
109
0
    }
110
111
0
    fn next_char(&self, at: InputAt) -> Char {
112
0
        (**self).next_char(at)
113
0
    }
114
115
0
    fn previous_char(&self, at: InputAt) -> Char {
116
0
        (**self).previous_char(at)
117
0
    }
118
119
0
    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
120
0
        (**self).is_empty_match(at, empty)
121
0
    }
122
123
0
    fn prefix_at(
124
0
        &self,
125
0
        prefixes: &LiteralSearcher,
126
0
        at: InputAt,
127
0
    ) -> Option<InputAt> {
128
0
        (**self).prefix_at(prefixes, at)
129
0
    }
130
131
0
    fn len(&self) -> usize {
132
0
        (**self).len()
133
0
    }
134
135
0
    fn as_bytes(&self) -> &[u8] {
136
0
        (**self).as_bytes()
137
0
    }
138
}
139
140
/// Reads the input one character at a time.
///
/// Wraps the borrowed bytes of the text being searched.
#[derive(Clone, Copy, Debug)]
pub struct CharInput<'t>(&'t [u8]);

impl<'t> CharInput<'t> {
    /// Builds a character-oriented reader over the bytes `s`.
    pub fn new(s: &'t [u8]) -> CharInput<'t> {
        CharInput(s)
    }
}

/// Dereferences to the wrapped byte slice so slice operations (indexing,
/// sub-slicing) can be applied to the reader directly.
impl<'t> ops::Deref for CharInput<'t> {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        let CharInput(bytes) = *self;
        bytes
    }
}
158
159
impl<'t> Input for CharInput<'t> {
160
0
    fn at(&self, i: usize) -> InputAt {
161
0
        if i >= self.len() {
162
0
            InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
163
        } else {
164
0
            let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
165
0
            InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
166
        }
167
0
    }
168
169
    fn next_char(&self, at: InputAt) -> Char {
170
        at.char()
171
    }
172
173
0
    fn previous_char(&self, at: InputAt) -> Char {
174
0
        decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
175
0
    }
176
177
    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
178
        use prog::EmptyLook::*;
179
        match empty.look {
180
            StartLine => {
181
                let c = self.previous_char(at);
182
                at.pos() == 0 || c == '\n'
183
            }
184
            EndLine => {
185
                let c = self.next_char(at);
186
                at.pos() == self.len() || c == '\n'
187
            }
188
            StartText => at.pos() == 0,
189
            EndText => at.pos() == self.len(),
190
            WordBoundary => {
191
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
192
                c1.is_word_char() != c2.is_word_char()
193
            }
194
            NotWordBoundary => {
195
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
196
                c1.is_word_char() == c2.is_word_char()
197
            }
198
            WordBoundaryAscii => {
199
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
200
                c1.is_word_byte() != c2.is_word_byte()
201
            }
202
            NotWordBoundaryAscii => {
203
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
204
                c1.is_word_byte() == c2.is_word_byte()
205
            }
206
        }
207
    }
208
209
0
    fn prefix_at(
210
0
        &self,
211
0
        prefixes: &LiteralSearcher,
212
0
        at: InputAt,
213
0
    ) -> Option<InputAt> {
214
0
        prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
215
0
    }
216
217
    fn len(&self) -> usize {
218
        self.0.len()
219
    }
220
221
    fn as_bytes(&self) -> &[u8] {
222
        self.0
223
    }
224
}
225
226
/// Reads the input one byte at a time.
///
/// Holds the borrowed text together with a flag (`only_utf8`) that the
/// ASCII word-boundary checks consult to reject positions adjacent to
/// bytes that do not decode as UTF-8.
#[derive(Clone, Copy, Debug)]
pub struct ByteInput<'t> {
    text: &'t [u8],
    only_utf8: bool,
}

impl<'t> ByteInput<'t> {
    /// Builds a byte-oriented reader over `text` with the given
    /// `only_utf8` setting.
    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
        ByteInput { text, only_utf8 }
    }
}

/// Dereferences to the wrapped byte slice so slice operations (indexing,
/// sub-slicing) can be applied to the reader directly.
impl<'t> ops::Deref for ByteInput<'t> {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        let ByteInput { text, .. } = *self;
        text
    }
}
247
248
impl<'t> Input for ByteInput<'t> {
249
0
    fn at(&self, i: usize) -> InputAt {
250
0
        if i >= self.len() {
251
0
            InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
252
        } else {
253
0
            InputAt {
254
0
                pos: i,
255
0
                c: None.into(),
256
0
                byte: self.get(i).cloned(),
257
0
                len: 1,
258
0
            }
259
        }
260
0
    }
261
262
0
    fn next_char(&self, at: InputAt) -> Char {
263
0
        decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
264
0
    }
265
266
0
    fn previous_char(&self, at: InputAt) -> Char {
267
0
        decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
268
0
    }
269
270
0
    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
271
0
        use prog::EmptyLook::*;
272
0
        match empty.look {
273
0
            StartLine => {
274
0
                let c = self.previous_char(at);
275
0
                at.pos() == 0 || c == '\n'
276
            }
277
            EndLine => {
278
0
                let c = self.next_char(at);
279
0
                at.pos() == self.len() || c == '\n'
280
            }
281
0
            StartText => at.pos() == 0,
282
0
            EndText => at.pos() == self.len(),
283
            WordBoundary => {
284
0
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
285
0
                c1.is_word_char() != c2.is_word_char()
286
            }
287
            NotWordBoundary => {
288
0
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
289
0
                c1.is_word_char() == c2.is_word_char()
290
            }
291
            WordBoundaryAscii => {
292
0
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
293
0
                if self.only_utf8 {
294
                    // If we must match UTF-8, then we can't match word
295
                    // boundaries at invalid UTF-8.
296
0
                    if c1.is_none() && !at.is_start() {
297
0
                        return false;
298
0
                    }
299
0
                    if c2.is_none() && !at.is_end() {
300
0
                        return false;
301
0
                    }
302
0
                }
303
0
                c1.is_word_byte() != c2.is_word_byte()
304
            }
305
            NotWordBoundaryAscii => {
306
0
                let (c1, c2) = (self.previous_char(at), self.next_char(at));
307
0
                if self.only_utf8 {
308
                    // If we must match UTF-8, then we can't match word
309
                    // boundaries at invalid UTF-8.
310
0
                    if c1.is_none() && !at.is_start() {
311
0
                        return false;
312
0
                    }
313
0
                    if c2.is_none() && !at.is_end() {
314
0
                        return false;
315
0
                    }
316
0
                }
317
0
                c1.is_word_byte() == c2.is_word_byte()
318
            }
319
        }
320
0
    }
321
322
0
    fn prefix_at(
323
0
        &self,
324
0
        prefixes: &LiteralSearcher,
325
0
        at: InputAt,
326
0
    ) -> Option<InputAt> {
327
0
        prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
328
0
    }
329
330
    fn len(&self) -> usize {
331
        self.text.len()
332
    }
333
334
    fn as_bytes(&self) -> &[u8] {
335
        self.text
336
    }
337
}
338
339
/// An inline representation of `Option<char>`.
340
///
341
/// This eliminates the need to do case analysis on `Option<char>` to determine
342
/// ordinality with other characters.
343
///
344
/// (The `Option<char>` is not related to encoding. Instead, it is used in the
345
/// matching engines to represent the beginning and ending boundaries of the
346
/// search text.)
347
0
#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
348
pub struct Char(u32);
349
350
impl fmt::Debug for Char {
351
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
352
0
        match char::from_u32(self.0) {
353
0
            None => write!(f, "Empty"),
354
0
            Some(c) => write!(f, "{:?}", c),
355
        }
356
0
    }
357
}
358
359
impl Char {
360
    /// Returns true iff the character is absent.
361
    #[inline]
362
    pub fn is_none(self) -> bool {
363
        self.0 == u32::MAX
364
    }
365
366
    /// Returns the length of the character's UTF-8 encoding.
367
    ///
368
    /// If the character is absent, then `1` is returned.
369
    #[inline]
370
0
    pub fn len_utf8(self) -> usize {
371
0
        char::from_u32(self.0).map_or(1, |c| c.len_utf8())
372
0
    }
373
374
    /// Returns true iff the character is a word character.
375
    ///
376
    /// If the character is absent, then false is returned.
377
    pub fn is_word_char(self) -> bool {
378
        // is_word_character can panic if the Unicode data for \w isn't
379
        // available. However, our compiler ensures that if a Unicode word
380
        // boundary is used, then the data must also be available. If it isn't,
381
        // then the compiler returns an error.
382
        char::from_u32(self.0).map_or(false, syntax::is_word_character)
383
    }
384
385
    /// Returns true iff the byte is a word byte.
386
    ///
387
    /// If the byte is absent, then false is returned.
388
    pub fn is_word_byte(self) -> bool {
389
0
        match char::from_u32(self.0) {
390
0
            Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
391
0
            None | Some(_) => false,
392
        }
393
0
    }
394
}
395
396
impl From<char> for Char {
397
    fn from(c: char) -> Char {
398
        Char(c as u32)
399
    }
400
}
401
402
impl From<Option<char>> for Char {
403
0
    fn from(c: Option<char>) -> Char {
404
0
        c.map_or(Char(u32::MAX), |c| c.into())
405
0
    }
406
}
407
408
impl PartialEq<char> for Char {
409
    #[inline]
410
    fn eq(&self, other: &char) -> bool {
411
        self.0 == *other as u32
412
    }
413
}
414
415
/// Equality between a plain `char` and a `Char`, decided by comparing
/// scalar values.
impl PartialEq<Char> for char {
    #[inline]
    fn eq(&self, other: &Char) -> bool {
        let lhs = *self as u32;
        other.0 == lhs
    }
}
421
422
impl PartialOrd<char> for Char {
423
    #[inline]
424
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
425
        self.0.partial_cmp(&(*other as u32))
426
    }
427
}
428
429
impl PartialOrd<Char> for char {
430
    #[inline]
431
    fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
432
        (*self as u32).partial_cmp(&other.0)
433
    }
434
}