Coverage Report

Created: 2024-08-22 06:13

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.7/src/util/look.rs
Line
Count
Source
1
/*!
2
Types and routines for working with look-around assertions.
3
4
This module principally defines two types:
5
6
* [`Look`] enumerates all of the assertions supported by this crate.
7
* [`LookSet`] provides a way to efficiently store a set of [`Look`] values.
8
* [`LookMatcher`] provides routines for checking whether a `Look` or a
9
`LookSet` matches at a particular position in a haystack.
10
*/
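// A brief sketch of how these three types fit together, using the public
// `regex_automata::util::look` API documented in this file:
//
//     use regex_automata::util::look::{Look, LookMatcher, LookSet};
//
//     let set = LookSet::singleton(Look::Start).insert(Look::WordAscii);
//     let lookm = LookMatcher::new();
//     // Both assertions hold at offset 0 of "abc": it is the start of the
//     // haystack and an ASCII word boundary.
//     for look in set.iter() {
//         assert!(lookm.matches(look, b"abc", 0));
//     }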
11
12
// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` was basically
13
// copied verbatim from the regex-syntax crate. I would have no problems using
14
// the regex-syntax types and defining the matching routines (only found
15
// in this crate) as free functions, except the `Look` and `LookSet` types
16
// are used in lots of places. Including in places we expect to work when
17
// regex-syntax is *not* enabled, such as in the definition of the NFA itself.
18
//
19
// Thankfully the code we copy is pretty simple and there isn't much of it.
20
// Otherwise, the rest of this module deals with *matching* the assertions,
21
// which is not something that regex-syntax handles.
22
23
use crate::util::{escape::DebugByte, utf8};
24
25
/// A look-around assertion.
26
///
27
/// An assertion matches at a position between characters in a haystack.
28
/// Namely, it does not actually "consume" any input as most parts of a regular
29
/// expression do. Assertions are a way of stating that some property must be
30
/// true at a particular point during matching.
31
///
32
/// For example, `(?m)^[a-z]+$` is a pattern that:
33
///
34
/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That
35
/// occurs at either the beginning of the haystack, or immediately following
36
/// a `\n` character.
37
/// * Looks for one or more occurrences of `[a-z]`.
38
/// * Once `[a-z]+` has matched as much as it can, an overall match is only
39
/// reported when `[a-z]+` stops just before a `\n`.
40
///
41
/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
42
///
43
/// Assertions are also called "look-around," "look-behind" and "look-ahead."
44
/// Specifically, some assertions are look-behind (like `^`), other assertions
45
/// are look-ahead (like `$`) and yet other assertions are both look-ahead and
46
/// look-behind (like `\b`).
47
///
48
/// # Assertions in an NFA
49
///
50
/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
51
/// thought of as a conditional epsilon transition. That is, a matching engine
52
/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
53
/// moving through conditional epsilon transitions when their condition
54
/// is satisfied at whatever position the `PikeVM` is currently at in the
55
/// haystack.
56
///
57
/// How assertions are handled in a `DFA` is trickier, since a DFA does not
58
/// have epsilon transitions at all. In this case, they are compiled into the
59
/// automaton itself, at the expense of more states than what would be required
60
/// without an assertion.
61
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
62
pub enum Look {
63
    /// Match the beginning of text. Specifically, this matches at the starting
64
    /// position of the input.
65
    Start = 1 << 0,
66
    /// Match the end of text. Specifically, this matches at the ending
67
    /// position of the input.
68
    End = 1 << 1,
69
    /// Match the beginning of a line or the beginning of text. Specifically,
70
    /// this matches at the starting position of the input, or at the position
71
    /// immediately following a `\n` character.
72
    StartLF = 1 << 2,
73
    /// Match the end of a line or the end of text. Specifically, this matches
74
    /// at the end position of the input, or at the position immediately
75
    /// preceding a `\n` character.
76
    EndLF = 1 << 3,
77
    /// Match the beginning of a line or the beginning of text. Specifically,
78
    /// this matches at the starting position of the input, or at the position
79
    /// immediately following either a `\r` or `\n` character, but never after
80
    /// a `\r` when a `\n` follows.
81
    StartCRLF = 1 << 4,
82
    /// Match the end of a line or the end of text. Specifically, this matches
83
    /// at the end position of the input, or at the position immediately
84
    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
85
    /// precedes it.
86
    EndCRLF = 1 << 5,
87
    /// Match an ASCII-only word boundary. That is, this matches a position
88
    /// where the left adjacent character and right adjacent character
89
    /// correspond to a word and non-word or a non-word and word character.
90
    WordAscii = 1 << 6,
91
    /// Match an ASCII-only negation of a word boundary.
92
    WordAsciiNegate = 1 << 7,
93
    /// Match a Unicode-aware word boundary. That is, this matches a position
94
    /// where the left adjacent character and right adjacent character
95
    /// correspond to a word and non-word or a non-word and word character.
96
    WordUnicode = 1 << 8,
97
    /// Match a Unicode-aware negation of a word boundary.
98
    WordUnicodeNegate = 1 << 9,
99
    /// Match the start of an ASCII-only word boundary. That is, this matches a
100
    /// position at either the beginning of the haystack or where the previous
101
    /// character is not a word character and the following character is a word
102
    /// character.
103
    WordStartAscii = 1 << 10,
104
    /// Match the end of an ASCII-only word boundary. That is, this matches
105
    /// a position at either the end of the haystack or where the previous
106
    /// character is a word character and the following character is not a word
107
    /// character.
108
    WordEndAscii = 1 << 11,
109
    /// Match the start of a Unicode word boundary. That is, this matches a
110
    /// position at either the beginning of the haystack or where the previous
111
    /// character is not a word character and the following character is a word
112
    /// character.
113
    WordStartUnicode = 1 << 12,
114
    /// Match the end of a Unicode word boundary. That is, this matches a
115
    /// position at either the end of the haystack or where the previous
116
    /// character is a word character and the following character is not a word
117
    /// character.
118
    WordEndUnicode = 1 << 13,
119
    /// Match the start half of an ASCII-only word boundary. That is, this
120
    /// matches a position at either the beginning of the haystack or where the
121
    /// previous character is not a word character.
122
    WordStartHalfAscii = 1 << 14,
123
    /// Match the end half of an ASCII-only word boundary. That is, this
124
    /// matches a position at either the end of the haystack or where the
125
    /// following character is not a word character.
126
    WordEndHalfAscii = 1 << 15,
127
    /// Match the start half of a Unicode word boundary. That is, this matches
128
    /// a position at either the beginning of the haystack or where the
129
    /// previous character is not a word character.
130
    WordStartHalfUnicode = 1 << 16,
131
    /// Match the end half of a Unicode word boundary. That is, this matches
132
    /// a position at either the end of the haystack or where the following
133
    /// character is not a word character.
134
    WordEndHalfUnicode = 1 << 17,
135
}
136
137
impl Look {
138
    /// Flip the look-around assertion to its equivalent for reverse searches.
139
    /// For example, `StartLF` gets translated to `EndLF`.
140
    ///
141
    /// Some assertions, such as `WordUnicode`, remain the same since they
142
    /// match the same positions regardless of the direction of the search.
143
    #[inline]
144
0
    pub const fn reversed(self) -> Look {
145
0
        match self {
146
0
            Look::Start => Look::End,
147
0
            Look::End => Look::Start,
148
0
            Look::StartLF => Look::EndLF,
149
0
            Look::EndLF => Look::StartLF,
150
0
            Look::StartCRLF => Look::EndCRLF,
151
0
            Look::EndCRLF => Look::StartCRLF,
152
0
            Look::WordAscii => Look::WordAscii,
153
0
            Look::WordAsciiNegate => Look::WordAsciiNegate,
154
0
            Look::WordUnicode => Look::WordUnicode,
155
0
            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
156
0
            Look::WordStartAscii => Look::WordEndAscii,
157
0
            Look::WordEndAscii => Look::WordStartAscii,
158
0
            Look::WordStartUnicode => Look::WordEndUnicode,
159
0
            Look::WordEndUnicode => Look::WordStartUnicode,
160
0
            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
161
0
            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
162
0
            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
163
0
            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
164
        }
165
0
    }
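    // A short usage sketch of the behavior documented above, using the public
    // `regex_automata::util::look::Look` API:
    //
    //     use regex_automata::util::look::Look;
    //
    //     assert_eq!(Look::StartLF.reversed(), Look::EndLF);
    //     // Word boundaries match the same positions in either direction.
    //     assert_eq!(Look::WordUnicode.reversed(), Look::WordUnicode);
    //     // Reversing twice always yields the original assertion.
    //     assert_eq!(Look::Start.reversed().reversed(), Look::Start);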
166
167
    /// Return the underlying representation of this look-around enumeration
168
    /// as an integer. Giving the return value to the [`Look::from_repr`]
169
    /// constructor is guaranteed to return the same look-around variant that
170
    /// one started with within a semver compatible release of this crate.
171
    #[inline]
172
0
    pub const fn as_repr(self) -> u32 {
173
0
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
174
0
        // actual int.
175
0
        self as u32
176
0
    }
177
178
    /// Given the underlying representation of a `Look` value, return the
179
    /// corresponding `Look` value if the representation is valid. Otherwise
180
    /// `None` is returned.
181
    #[inline]
182
0
    pub const fn from_repr(repr: u32) -> Option<Look> {
183
0
        match repr {
184
0
            0b00_0000_0000_0000_0001 => Some(Look::Start),
185
0
            0b00_0000_0000_0000_0010 => Some(Look::End),
186
0
            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
187
0
            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
188
0
            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
189
0
            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
190
0
            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
191
0
            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
192
0
            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
193
0
            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
194
0
            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
195
0
            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
196
0
            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
197
0
            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
198
0
            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
199
0
            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
200
0
            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
201
0
            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
202
0
            _ => None,
203
        }
204
0
    }
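    // A sketch of the `as_repr`/`from_repr` round trip described above:
    //
    //     use regex_automata::util::look::Look;
    //
    //     let repr = Look::WordAscii.as_repr();
    //     assert_eq!(repr, 1 << 6);
    //     assert_eq!(Look::from_repr(repr), Some(Look::WordAscii));
    //     // Anything that is not exactly one valid bit yields None.
    //     assert_eq!(Look::from_repr(0), None);
    //     assert_eq!(Look::from_repr(3), None);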
205
206
    /// Returns a convenient single codepoint representation of this
207
    /// look-around assertion. Each assertion is guaranteed to be represented
208
    /// by a distinct character.
209
    ///
210
    /// This is useful for representing a look-around assertion in
211
    /// human friendly but succinct output intended for a programmer working on
212
    /// regex internals.
213
    #[inline]
214
0
    pub const fn as_char(self) -> char {
215
0
        match self {
216
0
            Look::Start => 'A',
217
0
            Look::End => 'z',
218
0
            Look::StartLF => '^',
219
0
            Look::EndLF => '$',
220
0
            Look::StartCRLF => 'r',
221
0
            Look::EndCRLF => 'R',
222
0
            Look::WordAscii => 'b',
223
0
            Look::WordAsciiNegate => 'B',
224
0
            Look::WordUnicode => '𝛃',
225
0
            Look::WordUnicodeNegate => '𝚩',
226
0
            Look::WordStartAscii => '<',
227
0
            Look::WordEndAscii => '>',
228
0
            Look::WordStartUnicode => '〈',
229
0
            Look::WordEndUnicode => '〉',
230
0
            Look::WordStartHalfAscii => '◁',
231
0
            Look::WordEndHalfAscii => '▷',
232
0
            Look::WordStartHalfUnicode => '◀',
233
0
            Look::WordEndHalfUnicode => '▶',
234
        }
235
0
    }
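    // A quick illustration of the single-character mapping above:
    //
    //     use regex_automata::util::look::Look;
    //
    //     assert_eq!(Look::Start.as_char(), 'A');
    //     assert_eq!(Look::WordAscii.as_char(), 'b');
    //     assert_eq!(Look::WordAsciiNegate.as_char(), 'B');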
236
}
237
238
/// LookSet is a memory-efficient set of look-around assertions.
239
///
240
/// This is useful for efficiently tracking look-around assertions. For
241
/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties
242
/// that return `LookSet`s.
243
#[derive(Clone, Copy, Default, Eq, PartialEq)]
244
pub struct LookSet {
245
    /// The underlying representation of this set. It is exposed so that the
246
    /// set can be stored somewhere efficiently. The representation is that
247
    /// of a bitset, where each assertion occupies the bit set in
248
    /// `Look::as_repr()`.
249
    ///
250
    /// Note that users of this internal representation must permit the full
251
    /// range of `u32` values to be represented. For example, even if the
252
    /// current implementation only makes use of the 18 least significant bits,
253
    /// it may use more bits in a future semver compatible release.
254
    pub bits: u32,
255
}
256
257
impl LookSet {
258
    /// Create an empty set of look-around assertions.
259
    #[inline]
260
0
    pub fn empty() -> LookSet {
261
0
        LookSet { bits: 0 }
262
0
    }
263
264
    /// Create a full set of look-around assertions.
265
    ///
266
    /// This set contains all possible look-around assertions.
267
    #[inline]
268
0
    pub fn full() -> LookSet {
269
0
        LookSet { bits: !0 }
270
0
    }
271
272
    /// Create a look-around set containing the look-around assertion given.
273
    ///
274
    /// This is a convenience routine for creating an empty set and inserting
275
    /// one look-around assertion.
276
    #[inline]
277
0
    pub fn singleton(look: Look) -> LookSet {
278
0
        LookSet::empty().insert(look)
279
0
    }
280
281
    /// Returns the total number of look-around assertions in this set.
282
    #[inline]
283
0
    pub fn len(self) -> usize {
284
0
        // OK because max value always fits in a u8, which in turn always
285
0
        // fits in a usize, regardless of target.
286
0
        usize::try_from(self.bits.count_ones()).unwrap()
287
0
    }
288
289
    /// Returns true if and only if this set is empty.
290
    #[inline]
291
0
    pub fn is_empty(self) -> bool {
292
0
        self.len() == 0
293
0
    }
294
295
    /// Returns true if and only if the given look-around assertion is in this
296
    /// set.
297
    #[inline]
298
0
    pub fn contains(self, look: Look) -> bool {
299
0
        self.bits & look.as_repr() != 0
300
0
    }
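    // A small sketch of basic `LookSet` construction and membership checks,
    // using the API shown in this file:
    //
    //     use regex_automata::util::look::{Look, LookSet};
    //
    //     let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordAscii);
    //     assert!(set.contains(Look::StartLF));
    //     assert!(!set.contains(Look::EndLF));
    //     assert_eq!(set.len(), 2);
    //     assert_eq!(LookSet::singleton(Look::End), LookSet::empty().insert(Look::End));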
301
302
    /// Returns true if and only if this set contains any anchor assertions.
303
    /// This includes both "start/end of haystack" and "start/end of line."
304
    #[inline]
305
0
    pub fn contains_anchor(&self) -> bool {
306
0
        self.contains_anchor_haystack() || self.contains_anchor_line()
307
0
    }
308
309
    /// Returns true if and only if this set contains any "start/end of
310
    /// haystack" anchors. This doesn't include "start/end of line" anchors.
311
    #[inline]
312
0
    pub fn contains_anchor_haystack(&self) -> bool {
313
0
        self.contains(Look::Start) || self.contains(Look::End)
314
0
    }
315
316
    /// Returns true if and only if this set contains any "start/end of line"
317
    /// anchors. This doesn't include "start/end of haystack" anchors. This
318
    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
319
    #[inline]
320
0
    pub fn contains_anchor_line(&self) -> bool {
321
0
        self.contains(Look::StartLF)
322
0
            || self.contains(Look::EndLF)
323
0
            || self.contains(Look::StartCRLF)
324
0
            || self.contains(Look::EndCRLF)
325
0
    }
326
327
    /// Returns true if and only if this set contains any "start/end of line"
328
    /// anchors that only treat `\n` as line terminators. This does not include
329
    /// haystack anchors or CRLF aware line anchors.
330
    #[inline]
331
0
    pub fn contains_anchor_lf(&self) -> bool {
332
0
        self.contains(Look::StartLF) || self.contains(Look::EndLF)
333
0
    }
334
335
    /// Returns true if and only if this set contains any "start/end of line"
336
    /// anchors that are CRLF-aware. This doesn't include "start/end of
337
    /// haystack" or "start/end of line-feed" anchors.
338
    #[inline]
339
0
    pub fn contains_anchor_crlf(&self) -> bool {
340
0
        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
341
0
    }
342
343
    /// Returns true if and only if this set contains any word boundary or
344
    /// negated word boundary assertions. This includes both Unicode and ASCII
345
    /// word boundaries.
346
    #[inline]
347
0
    pub fn contains_word(self) -> bool {
348
0
        self.contains_word_unicode() || self.contains_word_ascii()
349
0
    }
350
351
    /// Returns true if and only if this set contains any Unicode word boundary
352
    /// or negated Unicode word boundary assertions.
353
    #[inline]
354
0
    pub fn contains_word_unicode(self) -> bool {
355
0
        self.contains(Look::WordUnicode)
356
0
            || self.contains(Look::WordUnicodeNegate)
357
0
            || self.contains(Look::WordStartUnicode)
358
0
            || self.contains(Look::WordEndUnicode)
359
0
            || self.contains(Look::WordStartHalfUnicode)
360
0
            || self.contains(Look::WordEndHalfUnicode)
361
0
    }
362
363
    /// Returns true if and only if this set contains any ASCII word boundary
364
    /// or negated ASCII word boundary assertions.
365
    #[inline]
366
0
    pub fn contains_word_ascii(self) -> bool {
367
0
        self.contains(Look::WordAscii)
368
0
            || self.contains(Look::WordAsciiNegate)
369
0
            || self.contains(Look::WordStartAscii)
370
0
            || self.contains(Look::WordEndAscii)
371
0
            || self.contains(Look::WordStartHalfAscii)
372
0
            || self.contains(Look::WordEndHalfAscii)
373
0
    }
374
375
    /// Returns an iterator over all of the look-around assertions in this set.
376
    #[inline]
377
0
    pub fn iter(self) -> LookSetIter {
378
0
        LookSetIter { set: self }
379
0
    }
380
381
    /// Return a new set that is equivalent to the original, but with the given
382
    /// assertion added to it. If the assertion is already in the set, then the
383
    /// returned set is equivalent to the original.
384
    #[inline]
385
0
    pub fn insert(self, look: Look) -> LookSet {
386
0
        LookSet { bits: self.bits | look.as_repr() }
387
0
    }
388
389
    /// Updates this set in place with the result of inserting the given
390
    /// assertion into this set.
391
    #[inline]
392
0
    pub fn set_insert(&mut self, look: Look) {
393
0
        *self = self.insert(look);
394
0
    }
395
396
    /// Return a new set that is equivalent to the original, but with the given
397
    /// assertion removed from it. If the assertion is not in the set, then the
398
    /// returned set is equivalent to the original.
399
    #[inline]
400
0
    pub fn remove(self, look: Look) -> LookSet {
401
0
        LookSet { bits: self.bits & !look.as_repr() }
402
0
    }
403
404
    /// Updates this set in place with the result of removing the given
405
    /// assertion from this set.
406
    #[inline]
407
0
    pub fn set_remove(&mut self, look: Look) {
408
0
        *self = self.remove(look);
409
0
    }
410
411
    /// Returns a new set that is the result of subtracting the given set from
412
    /// this set.
413
    #[inline]
414
0
    pub fn subtract(self, other: LookSet) -> LookSet {
415
0
        LookSet { bits: self.bits & !other.bits }
416
0
    }
417
418
    /// Updates this set in place with the result of subtracting the given set
419
    /// from this set.
420
    #[inline]
421
0
    pub fn set_subtract(&mut self, other: LookSet) {
422
0
        *self = self.subtract(other);
423
0
    }
424
425
    /// Returns a new set that is the union of this and the one given.
426
    #[inline]
427
0
    pub fn union(self, other: LookSet) -> LookSet {
428
0
        LookSet { bits: self.bits | other.bits }
429
0
    }
430
431
    /// Updates this set in place with the result of unioning it with the one
432
    /// given.
433
    #[inline]
434
0
    pub fn set_union(&mut self, other: LookSet) {
435
0
        *self = self.union(other);
436
0
    }
437
438
    /// Returns a new set that is the intersection of this and the one given.
439
    #[inline]
440
0
    pub fn intersect(self, other: LookSet) -> LookSet {
441
0
        LookSet { bits: self.bits & other.bits }
442
0
    }
443
444
    /// Updates this set in place with the result of intersecting it with the
445
    /// one given.
446
    #[inline]
447
0
    pub fn set_intersect(&mut self, other: LookSet) {
448
0
        *self = self.intersect(other);
449
0
    }
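    // The set operations above are plain bitset algebra. A brief sketch:
    //
    //     use regex_automata::util::look::{Look, LookSet};
    //
    //     let a = LookSet::singleton(Look::Start).insert(Look::WordAscii);
    //     let b = LookSet::singleton(Look::WordAscii).insert(Look::End);
    //     assert_eq!(a.union(b).len(), 3);
    //     assert_eq!(a.intersect(b), LookSet::singleton(Look::WordAscii));
    //     assert_eq!(a.subtract(b), LookSet::singleton(Look::Start));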
450
451
    /// Return a `LookSet` from the slice given as a native endian 32-bit
452
    /// integer.
453
    ///
454
    /// # Panics
455
    ///
456
    /// This panics if `slice.len() < 4`.
457
    #[inline]
458
0
    pub fn read_repr(slice: &[u8]) -> LookSet {
459
0
        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
460
0
        LookSet { bits }
461
0
    }
462
463
    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
464
    /// of the slice given.
465
    ///
466
    /// # Panics
467
    ///
468
    /// This panics if `slice.len() < 4`.
469
    #[inline]
470
0
    pub fn write_repr(self, slice: &mut [u8]) {
471
0
        let raw = self.bits.to_ne_bytes();
472
0
        slice[0] = raw[0];
473
0
        slice[1] = raw[1];
474
0
        slice[2] = raw[2];
475
0
        slice[3] = raw[3];
476
0
    }
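    // A round-trip sketch for the serialization helpers above:
    //
    //     use regex_automata::util::look::{Look, LookSet};
    //
    //     let set = LookSet::singleton(Look::Start).insert(Look::WordUnicode);
    //     let mut buf = [0u8; 4];
    //     set.write_repr(&mut buf);
    //     assert_eq!(LookSet::read_repr(&buf), set);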
477
478
    /// Checks that all assertions in this set can be matched.
479
    ///
480
    /// Some assertions, such as Unicode word boundaries, require optional (but
481
    /// enabled by default) tables that may not be available. If there are
482
    /// assertions in this set that require tables that are not available, then
483
    /// this will return an error.
484
    ///
485
    /// Specifically, this returns an error when the
486
    /// `unicode-word-boundary` feature is _not_ enabled _and_ this set
487
    /// contains a Unicode word boundary assertion.
488
    ///
489
    /// It can be useful to use this on the result of
490
    /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
491
    /// when building a matcher engine to ensure methods like
492
    /// [`LookMatcher::matches_set`] do not panic at search time.
493
0
    pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
494
0
        if self.contains_word_unicode() {
495
0
            UnicodeWordBoundaryError::check()?;
496
0
        }
497
0
        Ok(())
498
0
    }
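    // A sketch of the intended usage: check availability once, up front, so
    // that later calls to `LookMatcher::matches_set` cannot panic.
    //
    //     use regex_automata::util::look::{Look, LookSet};
    //
    //     // ASCII word boundaries never require the optional Unicode tables.
    //     assert!(LookSet::singleton(Look::WordAscii).available().is_ok());
    //     // A set with a Unicode word boundary errors only when the
    //     // `unicode-word-boundary` feature is disabled.
    //     let _ = LookSet::singleton(Look::WordUnicode).available();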
499
}
500
501
impl core::fmt::Debug for LookSet {
502
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
503
0
        if self.is_empty() {
504
0
            return write!(f, "∅");
505
0
        }
506
0
        for look in self.iter() {
507
0
            write!(f, "{}", look.as_char())?;
508
        }
509
0
        Ok(())
510
0
    }
511
}
512
513
/// An iterator over all look-around assertions in a [`LookSet`].
514
///
515
/// This iterator is created by [`LookSet::iter`].
516
#[derive(Clone, Debug)]
517
pub struct LookSetIter {
518
    set: LookSet,
519
}
520
521
impl Iterator for LookSetIter {
522
    type Item = Look;
523
524
    #[inline]
525
0
    fn next(&mut self) -> Option<Look> {
526
0
        if self.set.is_empty() {
527
0
            return None;
528
0
        }
529
0
        // We'll never have more than u8::MAX distinct look-around assertions,
530
0
        // so 'bit' will always fit into a u16.
531
0
        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
532
0
        let look = Look::from_repr(1 << bit)?;
533
0
        self.set = self.set.remove(look);
534
0
        Some(look)
535
0
    }
536
}
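// Iteration yields assertions in ascending order of their bit representation,
// since `next` always extracts the lowest set bit. A brief sketch:
//
//     use regex_automata::util::look::{Look, LookSet};
//
//     let set = LookSet::empty().insert(Look::WordAscii).insert(Look::Start);
//     let looks: Vec<Look> = set.iter().collect();
//     assert_eq!(looks, vec![Look::Start, Look::WordAscii]);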
537
538
/// A matcher for look-around assertions.
539
///
540
/// This matcher permits configuring aspects of how look-around assertions are
541
/// matched.
542
///
543
/// # Example
544
///
545
/// A `LookMatcher` can change the line terminator used for matching multi-line
546
/// anchors such as `(?m:^)` and `(?m:$)`.
547
///
548
/// ```
549
/// use regex_automata::{
550
///     nfa::thompson::{self, pikevm::PikeVM},
551
///     util::look::LookMatcher,
552
///     Match, Input,
553
/// };
554
///
555
/// let mut lookm = LookMatcher::new();
556
/// lookm.set_line_terminator(b'\x00');
557
///
558
/// let re = PikeVM::builder()
559
///     .thompson(thompson::Config::new().look_matcher(lookm))
560
///     .build(r"(?m)^[a-z]+$")?;
561
/// let mut cache = re.create_cache();
562
///
563
/// // Multi-line assertions now use NUL as a terminator.
564
/// assert_eq!(
565
///     Some(Match::must(0, 1..4)),
566
///     re.find(&mut cache, b"\x00abc\x00"),
567
/// );
568
/// // ... and \n is no longer recognized as a terminator.
569
/// assert_eq!(
570
///     None,
571
///     re.find(&mut cache, b"\nabc\n"),
572
/// );
573
///
574
/// # Ok::<(), Box<dyn std::error::Error>>(())
575
/// ```
576
#[derive(Clone, Debug)]
577
pub struct LookMatcher {
578
    lineterm: DebugByte,
579
}
580
581
impl LookMatcher {
582
    /// Creates a new default matcher for look-around assertions.
583
0
    pub fn new() -> LookMatcher {
584
0
        LookMatcher { lineterm: DebugByte(b'\n') }
585
0
    }
586
587
    /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
588
    ///
589
    /// Namely, instead of `^` matching after `\n` and `$` matching immediately
590
    /// before a `\n`, this will cause it to match after and before the byte
591
    /// given.
592
    ///
593
    /// It can occasionally be useful to use this to configure the line
594
    /// terminator to the NUL byte when searching binary data.
595
    ///
596
    /// Note that this does not apply to CRLF-aware line anchors such as
597
    /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
598
    /// use `\r` and `\n`.
599
0
    pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher {
600
0
        self.lineterm.0 = byte;
601
0
        self
602
0
    }
603
604
    /// Returns the line terminator that was configured for this matcher.
605
    ///
606
    /// If no line terminator was configured, then this returns `\n`.
607
    ///
608
    /// Note that the line terminator should only be used for matching `(?m:^)`
609
    /// and `(?m:$)` assertions. It specifically should _not_ be used for
610
    /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.
611
0
    pub fn get_line_terminator(&self) -> u8 {
612
0
        self.lineterm.0
613
0
    }
614
615
    /// Returns true when the position `at` in `haystack` satisfies the given
616
    /// look-around assertion.
617
    ///
618
    /// # Panics
619
    ///
620
    /// This panics when testing any Unicode word boundary assertion in this
621
    /// set and when the Unicode word data is not available. Specifically, this
622
    /// only occurs when the `unicode-word-boundary` feature is not enabled.
623
    ///
624
    /// Since it's generally expected that this routine is called inside of
625
    /// a matching engine, callers should check the error condition when
626
    /// building the matching engine. If there is a Unicode word boundary
627
    /// in the matcher and the data isn't available, then the matcher should
628
    /// fail to build.
629
    ///
630
    /// Callers can check the error condition with [`LookSet::available`].
631
    ///
632
    /// This also may panic when `at > haystack.len()`. Note that `at ==
633
    /// haystack.len()` is legal and guaranteed not to panic.
634
    #[inline]
635
0
    pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
636
0
        self.matches_inline(look, haystack, at)
637
0
    }
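    // A usage sketch for `matches`, using the `LookMatcher` API shown here:
    //
    //     use regex_automata::util::look::{Look, LookMatcher};
    //
    //     let lookm = LookMatcher::new();
    //     let hay = b"abc 123";
    //     assert!(lookm.matches(Look::Start, hay, 0));
    //     // An ASCII word boundary sits between 'c' and the space...
    //     assert!(lookm.matches(Look::WordAscii, hay, 3));
    //     // ...but not between 'b' and 'c'.
    //     assert!(!lookm.matches(Look::WordAscii, hay, 2));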
638
639
    /// Like `matches`, but forcefully inlined.
640
    ///
641
    /// # Panics
642
    ///
643
    /// This panics when testing any Unicode word boundary assertion in this
644
    /// set and when the Unicode word data is not available. Specifically, this
645
    /// only occurs when the `unicode-word-boundary` feature is not enabled.
646
    ///
647
    /// Since it's generally expected that this routine is called inside of
648
    /// a matching engine, callers should check the error condition when
649
    /// building the matching engine. If there is a Unicode word boundary
650
    /// in the matcher and the data isn't available, then the matcher should
651
    /// fail to build.
652
    ///
653
    /// Callers can check the error condition with [`LookSet::available`].
654
    ///
655
    /// This also may panic when `at > haystack.len()`. Note that `at ==
656
    /// haystack.len()` is legal and guaranteed not to panic.
657
    #[cfg_attr(feature = "perf-inline", inline(always))]
658
0
    pub(crate) fn matches_inline(
659
0
        &self,
660
0
        look: Look,
661
0
        haystack: &[u8],
662
0
        at: usize,
663
0
    ) -> bool {
664
0
        match look {
665
0
            Look::Start => self.is_start(haystack, at),
666
0
            Look::End => self.is_end(haystack, at),
667
0
            Look::StartLF => self.is_start_lf(haystack, at),
668
0
            Look::EndLF => self.is_end_lf(haystack, at),
669
0
            Look::StartCRLF => self.is_start_crlf(haystack, at),
670
0
            Look::EndCRLF => self.is_end_crlf(haystack, at),
671
0
            Look::WordAscii => self.is_word_ascii(haystack, at),
672
0
            Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),
673
0
            Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),
674
            Look::WordUnicodeNegate => {
675
0
                self.is_word_unicode_negate(haystack, at).unwrap()
676
            }
677
0
            Look::WordStartAscii => self.is_word_start_ascii(haystack, at),
678
0
            Look::WordEndAscii => self.is_word_end_ascii(haystack, at),
679
            Look::WordStartUnicode => {
680
0
                self.is_word_start_unicode(haystack, at).unwrap()
681
            }
682
            Look::WordEndUnicode => {
683
0
                self.is_word_end_unicode(haystack, at).unwrap()
684
            }
685
            Look::WordStartHalfAscii => {
686
0
                self.is_word_start_half_ascii(haystack, at)
687
            }
688
            Look::WordEndHalfAscii => {
689
0
                self.is_word_end_half_ascii(haystack, at)
690
            }
691
            Look::WordStartHalfUnicode => {
692
0
                self.is_word_start_half_unicode(haystack, at).unwrap()
693
            }
694
            Look::WordEndHalfUnicode => {
695
0
                self.is_word_end_half_unicode(haystack, at).unwrap()
696
            }
697
        }
698
0
    }
699
700
    /// Returns true when _all_ of the assertions in the given set match at the
701
    /// given position in the haystack.
702
    ///
703
    /// # Panics
704
    ///
705
    /// This panics when testing any Unicode word boundary assertion in this
706
    /// set and when the Unicode word data is not available. Specifically, this
707
    /// only occurs when the `unicode-word-boundary` feature is not enabled.
708
    ///
709
    /// Since it's generally expected that this routine is called inside of
710
    /// a matching engine, callers should check the error condition when
711
    /// building the matching engine. If there is a Unicode word boundary
712
    /// in the matcher and the data isn't available, then the matcher should
713
    /// fail to build.
714
    ///
715
    /// Callers can check the error condition with [`LookSet::available`].
716
    ///
717
    /// This also may panic when `at > haystack.len()`. Note that `at ==
718
    /// haystack.len()` is legal and guaranteed not to panic.
719
    #[inline]
720
0
    pub fn matches_set(
721
0
        &self,
722
0
        set: LookSet,
723
0
        haystack: &[u8],
724
0
        at: usize,
725
0
    ) -> bool {
726
0
        self.matches_set_inline(set, haystack, at)
727
0
    }
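    // `matches_set` only returns true when every assertion in the set holds
    // at the given position. A brief sketch:
    //
    //     use regex_automata::util::look::{Look, LookMatcher, LookSet};
    //
    //     let lookm = LookMatcher::new();
    //     let set = LookSet::singleton(Look::StartLF).insert(Look::WordAscii);
    //     assert!(lookm.matches_set(set, b"abc", 0));
    //     assert!(!lookm.matches_set(set, b"abc", 1));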
728
729
    /// Like `matches_set`, but forcefully inlined for perf.
730
    #[cfg_attr(feature = "perf-inline", inline(always))]
731
0
    pub(crate) fn matches_set_inline(
732
0
        &self,
733
0
        set: LookSet,
734
0
        haystack: &[u8],
735
0
        at: usize,
736
0
    ) -> bool {
737
0
        // This used to use LookSet::iter with Look::matches on each element,
738
0
        // but that proved to be quite disastrous for perf. The manual "if
739
0
        // the set has this assertion, check it" turns out to be quite a bit
740
0
        // faster.
741
0
        if set.contains(Look::Start) {
742
0
            if !self.is_start(haystack, at) {
743
0
                return false;
744
0
            }
745
0
        }
746
0
        if set.contains(Look::End) {
747
0
            if !self.is_end(haystack, at) {
748
0
                return false;
749
0
            }
750
0
        }
751
0
        if set.contains(Look::StartLF) {
752
0
            if !self.is_start_lf(haystack, at) {
753
0
                return false;
754
0
            }
755
0
        }
756
0
        if set.contains(Look::EndLF) {
757
0
            if !self.is_end_lf(haystack, at) {
758
0
                return false;
759
0
            }
760
0
        }
761
0
        if set.contains(Look::StartCRLF) {
762
0
            if !self.is_start_crlf(haystack, at) {
763
0
                return false;
764
0
            }
765
0
        }
766
0
        if set.contains(Look::EndCRLF) {
767
0
            if !self.is_end_crlf(haystack, at) {
768
0
                return false;
769
0
            }
770
0
        }
771
0
        if set.contains(Look::WordAscii) {
772
0
            if !self.is_word_ascii(haystack, at) {
773
0
                return false;
774
0
            }
775
0
        }
776
0
        if set.contains(Look::WordAsciiNegate) {
777
0
            if !self.is_word_ascii_negate(haystack, at) {
778
0
                return false;
779
0
            }
780
0
        }
781
0
        if set.contains(Look::WordUnicode) {
782
0
            if !self.is_word_unicode(haystack, at).unwrap() {
783
0
                return false;
784
0
            }
785
0
        }
786
0
        if set.contains(Look::WordUnicodeNegate) {
787
0
            if !self.is_word_unicode_negate(haystack, at).unwrap() {
788
0
                return false;
789
0
            }
790
0
        }
791
0
        if set.contains(Look::WordStartAscii) {
792
0
            if !self.is_word_start_ascii(haystack, at) {
793
0
                return false;
794
0
            }
795
0
        }
796
0
        if set.contains(Look::WordEndAscii) {
797
0
            if !self.is_word_end_ascii(haystack, at) {
798
0
                return false;
799
0
            }
800
0
        }
801
0
        if set.contains(Look::WordStartUnicode) {
802
0
            if !self.is_word_start_unicode(haystack, at).unwrap() {
803
0
                return false;
804
0
            }
805
0
        }
806
0
        if set.contains(Look::WordEndUnicode) {
807
0
            if !self.is_word_end_unicode(haystack, at).unwrap() {
808
0
                return false;
809
0
            }
810
0
        }
811
0
        if set.contains(Look::WordStartHalfAscii) {
812
0
            if !self.is_word_start_half_ascii(haystack, at) {
813
0
                return false;
814
0
            }
815
0
        }
816
0
        if set.contains(Look::WordEndHalfAscii) {
817
0
            if !self.is_word_end_half_ascii(haystack, at) {
818
0
                return false;
819
0
            }
820
0
        }
821
0
        if set.contains(Look::WordStartHalfUnicode) {
822
0
            if !self.is_word_start_half_unicode(haystack, at).unwrap() {
823
0
                return false;
824
0
            }
825
0
        }
826
0
        if set.contains(Look::WordEndHalfUnicode) {
827
0
            if !self.is_word_end_half_unicode(haystack, at).unwrap() {
828
0
                return false;
829
0
            }
830
0
        }
831
0
        true
832
0
    }
833
834
    /// Split up the given byte classes into equivalence classes in a way that
835
    /// is consistent with this look-around assertion.
836
    #[cfg(feature = "alloc")]
837
    pub(crate) fn add_to_byteset(
838
        &self,
839
        look: Look,
840
        set: &mut crate::util::alphabet::ByteClassSet,
841
    ) {
842
        match look {
843
            Look::Start | Look::End => {}
844
            Look::StartLF | Look::EndLF => {
845
                set.set_range(self.lineterm.0, self.lineterm.0);
846
            }
847
            Look::StartCRLF | Look::EndCRLF => {
848
                set.set_range(b'\r', b'\r');
849
                set.set_range(b'\n', b'\n');
850
            }
851
            Look::WordAscii
852
            | Look::WordAsciiNegate
853
            | Look::WordUnicode
854
            | Look::WordUnicodeNegate
855
            | Look::WordStartAscii
856
            | Look::WordEndAscii
857
            | Look::WordStartUnicode
858
            | Look::WordEndUnicode
859
            | Look::WordStartHalfAscii
860
            | Look::WordEndHalfAscii
861
            | Look::WordStartHalfUnicode
862
            | Look::WordEndHalfUnicode => {
863
                // We need to mark all ranges of bytes whose pairs result in
864
                // evaluating \b differently. This isn't technically correct
865
                // for Unicode word boundaries, but DFAs can't handle those
866
                // anyway, and thus, the byte classes don't need to either
867
                // since they are themselves only used in DFAs.
868
                //
869
                // FIXME: It seems like the calls to 'set_range' here are
870
                // completely invariant, which means we could just hard-code
871
                // them here without needing to write a loop. And we only need
872
                // to do this dance at most once per regex.
873
                //
874
                // FIXME: Is this correct for \B?
875
                let iswb = utf8::is_word_byte;
876
                // This unwrap is OK because we guard every use of 'asu8' with
877
                // a check that the input is <= 255.
878
                let asu8 = |b: u16| u8::try_from(b).unwrap();
879
                let mut b1: u16 = 0;
880
                let mut b2: u16;
881
                while b1 <= 255 {
882
                    b2 = b1 + 1;
883
                    while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) {
884
                        b2 += 1;
885
                    }
886
                    // The guards above guarantee that b2 can never get any
887
                    // bigger.
888
                    assert!(b2 <= 256);
889
                    // Subtracting 1 from b2 is always OK because it is always
890
                    // at least 1 greater than b1, and the assert above
891
                    // guarantees that the asu8 conversion will succeed.
892
                    set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap()));
893
                    b1 = b2;
894
                }
895
            }
896
        }
897
    }
898
899
    /// Returns true when [`Look::Start`] is satisfied `at` the given position
900
    /// in `haystack`.
901
    ///
902
    /// # Panics
903
    ///
904
    /// This may panic when `at > haystack.len()`. Note that `at ==
905
    /// haystack.len()` is legal and guaranteed not to panic.
906
    #[inline]
907
0
    pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool {
908
0
        at == 0
909
0
    }
910
911
    /// Returns true when [`Look::End`] is satisfied `at` the given position in
912
    /// `haystack`.
913
    ///
914
    /// # Panics
915
    ///
916
    /// This may panic when `at > haystack.len()`. Note that `at ==
917
    /// haystack.len()` is legal and guaranteed not to panic.
918
    #[inline]
919
0
    pub fn is_end(&self, haystack: &[u8], at: usize) -> bool {
920
0
        at == haystack.len()
921
0
    }
922
923
    /// Returns true when [`Look::StartLF`] is satisfied `at` the given
924
    /// position in `haystack`.
925
    ///
926
    /// # Panics
927
    ///
928
    /// This may panic when `at > haystack.len()`. Note that `at ==
929
    /// haystack.len()` is legal and guaranteed not to panic.
930
    #[inline]
931
0
    pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool {
932
0
        self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0
933
0
    }
934
935
    /// Returns true when [`Look::EndLF`] is satisfied `at` the given position
936
    /// in `haystack`.
937
    ///
938
    /// # Panics
939
    ///
940
    /// This may panic when `at > haystack.len()`. Note that `at ==
941
    /// haystack.len()` is legal and guaranteed not to panic.
942
    #[inline]
943
0
    pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool {
944
0
        self.is_end(haystack, at) || haystack[at] == self.lineterm.0
945
0
    }
946
947
    /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given
948
    /// position in `haystack`.
949
    ///
950
    /// # Panics
951
    ///
952
    /// This may panic when `at > haystack.len()`. Note that `at ==
953
    /// haystack.len()` is legal and guaranteed not to panic.
954
    #[inline]
955
0
    pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool {
956
0
        self.is_start(haystack, at)
957
0
            || haystack[at - 1] == b'\n'
958
0
            || (haystack[at - 1] == b'\r'
959
0
                && (at >= haystack.len() || haystack[at] != b'\n'))
960
0
    }
961
962
    /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given
963
    /// position in `haystack`.
964
    ///
965
    /// # Panics
966
    ///
967
    /// This may panic when `at > haystack.len()`. Note that `at ==
968
    /// haystack.len()` is legal and guaranteed not to panic.
969
    #[inline]
970
0
    pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool {
971
0
        self.is_end(haystack, at)
972
0
            || haystack[at] == b'\r'
973
0
            || (haystack[at] == b'\n'
974
0
                && (at == 0 || haystack[at - 1] != b'\r'))
975
0
    }
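    // A sketch of how the CRLF-aware anchors treat "\r\n" as a single
    // terminator:
    //
    //     use regex_automata::util::look::LookMatcher;
    //
    //     let lookm = LookMatcher::new();
    //     let hay = b"a\r\nb";
    //     // `(?Rm:$)` matches just before the '\r'...
    //     assert!(lookm.is_end_crlf(hay, 1));
    //     // ...but not between the '\r' and the '\n'.
    //     assert!(!lookm.is_end_crlf(hay, 2));
    //     // `(?Rm:^)` matches just after the '\n', but not between '\r' and '\n'.
    //     assert!(lookm.is_start_crlf(hay, 3));
    //     assert!(!lookm.is_start_crlf(hay, 2));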
976
977
    /// Returns true when [`Look::WordAscii`] is satisfied `at` the given
978
    /// position in `haystack`.
979
    ///
980
    /// # Panics
981
    ///
982
    /// This may panic when `at > haystack.len()`. Note that `at ==
983
    /// haystack.len()` is legal and guaranteed not to panic.
984
    #[inline]
985
0
    pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool {
986
0
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
987
0
        let word_after =
988
0
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
989
0
        word_before != word_after
990
0
    }
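    // A quick sketch of the ASCII word boundary check above:
    //
    //     use regex_automata::util::look::LookMatcher;
    //
    //     let lookm = LookMatcher::new();
    //     let hay = b"ab!";
    //     assert!(lookm.is_word_ascii(hay, 0));  // start of haystack, 'a' is a word byte
    //     assert!(!lookm.is_word_ascii(hay, 1)); // 'a' and 'b' are both word bytes
    //     assert!(lookm.is_word_ascii(hay, 2));  // 'b' is a word byte, '!' is not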
991
992
    /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given
993
    /// position in `haystack`.
994
    ///
995
    /// # Panics
996
    ///
997
    /// This may panic when `at > haystack.len()`. Note that `at ==
998
    /// haystack.len()` is legal and guaranteed not to panic.
999
    #[inline]
1000
0
    pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool {
1001
0
        !self.is_word_ascii(haystack, at)
1002
0
    }
1003
1004
    /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given
1005
    /// position in `haystack`.
1006
    ///
1007
    /// # Panics
1008
    ///
1009
    /// This may panic when `at > haystack.len()`. Note that `at ==
1010
    /// haystack.len()` is legal and guaranteed not to panic.
1011
    ///
1012
    /// # Errors
1013
    ///
1014
    /// This returns an error when Unicode word boundary tables
1015
    /// are not available. Specifically, this only occurs when the
1016
    /// `unicode-word-boundary` feature is not enabled.
1017
    #[inline]
1018
0
    pub fn is_word_unicode(
1019
0
        &self,
1020
0
        haystack: &[u8],
1021
0
        at: usize,
1022
0
    ) -> Result<bool, UnicodeWordBoundaryError> {
1023
0
        let word_before = is_word_char::rev(haystack, at)?;
1024
0
        let word_after = is_word_char::fwd(haystack, at)?;
1025
0
        Ok(word_before != word_after)
1026
0
    }
1027
1028
    /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the
1029
    /// given position in `haystack`.
1030
    ///
1031
    /// # Panics
1032
    ///
1033
    /// This may panic when `at > haystack.len()`. Note that `at ==
1034
    /// haystack.len()` is legal and guaranteed not to panic.
1035
    ///
1036
    /// # Errors
1037
    ///
1038
    /// This returns an error when Unicode word boundary tables
1039
    /// are not available. Specifically, this only occurs when the
1040
    /// `unicode-word-boundary` feature is not enabled.
1041
    #[inline]
1042
0
    pub fn is_word_unicode_negate(
1043
0
        &self,
1044
0
        haystack: &[u8],
1045
0
        at: usize,
1046
0
    ) -> Result<bool, UnicodeWordBoundaryError> {
1047
        // This is pretty subtle. Why do we need to do UTF-8 decoding here?
1048
        // Well... at time of writing, the is_word_char_{fwd,rev} routines will
1049
        // only return true if there is a valid UTF-8 encoding of a "word"
1050
        // codepoint, and false in every other case (including invalid UTF-8).
1051
        // This means that in regions of invalid UTF-8 (which might be a
1052
        // subset of valid UTF-8!), it would result in \B matching. While this
1053
        // would be questionable in the context of truly invalid UTF-8, it is
1054
        // *certainly* wrong to report match boundaries that split the encoding
1055
        // of a codepoint. So to work around this, we ensure that we can decode
1056
        // a codepoint on either side of `at`. If either direction fails, then
1057
        // we don't permit \B to match at all.
1058
        //
1059
        // Now, this isn't exactly optimal from a perf perspective. We could
1060
        // try and detect this in is_word_char::{fwd,rev}, but it's not clear
1061
        // if it's worth it. \B is, after all, rarely used. Even worse,
1062
        // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this
1063
        // will wind up doing UTF-8 decoding twice. Owch. We could fix this
1064
        // with more code complexity, but it just doesn't feel worth it for \B.
1065
        //
1066
        // And in particular, we do *not* have to do this with \b, because \b
1067
        // *requires* that at least one side of `at` be a "word" codepoint,
1068
        // which in turn implies one side of `at` must be valid UTF-8. This in
1069
        // turn implies that \b can never split a valid UTF-8 encoding of a
1070
        // codepoint. In the case where one side of `at` is truly invalid UTF-8
1071
        // and the other side IS a word codepoint, then we want \b to match
1072
        // since it represents a valid UTF-8 boundary. It also makes sense. For
1073
        // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
1074
        //
1075
        // Note also that this is not just '!is_word_unicode(..)' like it is
1076
        // for the ASCII case. For example, neither \b nor \B is satisfied
1077
        // within invalid UTF-8 sequences.
1078
0
        let word_before = at > 0
1079
0
            && match utf8::decode_last(&haystack[..at]) {
1080
0
                None | Some(Err(_)) => return Ok(false),
1081
0
                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1082
            };
1083
0
        let word_after = at < haystack.len()
1084
0
            && match utf8::decode(&haystack[at..]) {
1085
0
                None | Some(Err(_)) => return Ok(false),
1086
0
                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1087
            };
1088
0
        Ok(word_before == word_after)
1089
0
    }
1090
1091
    /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
1092
    /// position in `haystack`.
1093
    ///
1094
    /// # Panics
1095
    ///
1096
    /// This may panic when `at > haystack.len()`. Note that `at ==
1097
    /// haystack.len()` is legal and guaranteed not to panic.
1098
    #[inline]
1099
0
    pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
1100
0
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1101
0
        let word_after =
1102
0
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
1103
0
        !word_before && word_after
1104
0
    }
1105
1106
    /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
1107
    /// position in `haystack`.
1108
    ///
1109
    /// # Panics
1110
    ///
1111
    /// This may panic when `at > haystack.len()`. Note that `at ==
1112
    /// haystack.len()` is legal and guaranteed not to panic.
1113
    #[inline]
1114
0
    pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
1115
0
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1116
0
        let word_after =
1117
0
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
1118
0
        word_before && !word_after
1119
0
    }
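    // A brief sketch contrasting the one-sided ASCII assertions above:
    //
    //     use regex_automata::util::look::LookMatcher;
    //
    //     let lookm = LookMatcher::new();
    //     assert!(lookm.is_word_start_ascii(b"abc", 0));
    //     assert!(!lookm.is_word_start_ascii(b"abc", 3));
    //     assert!(lookm.is_word_end_ascii(b"abc", 3));
    //     assert!(!lookm.is_word_end_ascii(b"abc", 0));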
1120
1121
    /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
1122
    /// given position in `haystack`.
1123
    ///
1124
    /// # Panics
1125
    ///
1126
    /// This may panic when `at > haystack.len()`. Note that `at ==
1127
    /// haystack.len()` is legal and guaranteed not to panic.
1128
    ///
1129
    /// # Errors
1130
    ///
1131
    /// This returns an error when Unicode word boundary tables
1132
    /// are not available. Specifically, this only occurs when the
1133
    /// `unicode-word-boundary` feature is not enabled.
1134
    #[inline]
1135
0
    pub fn is_word_start_unicode(
1136
0
        &self,
1137
0
        haystack: &[u8],
1138
0
        at: usize,
1139
0
    ) -> Result<bool, UnicodeWordBoundaryError> {
1140
0
        let word_before = is_word_char::rev(haystack, at)?;
1141
0
        let word_after = is_word_char::fwd(haystack, at)?;
1142
0
        Ok(!word_before && word_after)
1143
0
    }
1144
1145
    /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
1146
    /// given position in `haystack`.
1147
    ///
1148
    /// # Panics
1149
    ///
1150
    /// This may panic when `at > haystack.len()`. Note that `at ==
1151
    /// haystack.len()` is legal and guaranteed not to panic.
1152
    ///
1153
    /// # Errors
1154
    ///
1155
    /// This returns an error when Unicode word boundary tables
1156
    /// are not available. Specifically, this only occurs when the
1157
    /// `unicode-word-boundary` feature is not enabled.
1158
    #[inline]
1159
0
    pub fn is_word_end_unicode(
1160
0
        &self,
1161
0
        haystack: &[u8],
1162
0
        at: usize,
1163
0
    ) -> Result<bool, UnicodeWordBoundaryError> {
1164
0
        let word_before = is_word_char::rev(haystack, at)?;
1165
0
        let word_after = is_word_char::fwd(haystack, at)?;
1166
0
        Ok(word_before && !word_after)
1167
0
    }
1168
1169
    /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
1170
    /// given position in `haystack`.
1171
    ///
1172
    /// # Panics
1173
    ///
1174
    /// This may panic when `at > haystack.len()`. Note that `at ==
1175
    /// haystack.len()` is legal and guaranteed not to panic.
1176
    #[inline]
1177
0
    pub fn is_word_start_half_ascii(
1178
0
        &self,
1179
0
        haystack: &[u8],
1180
0
        at: usize,
1181
0
    ) -> bool {
1182
0
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1183
0
        !word_before
1184
0
    }
1185
1186
    /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
1187
    /// given position in `haystack`.
1188
    ///
1189
    /// # Panics
1190
    ///
1191
    /// This may panic when `at > haystack.len()`. Note that `at ==
1192
    /// haystack.len()` is legal and guaranteed not to panic.
1193
    #[inline]
1194
0
    pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
1195
0
        let word_after =
1196
0
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
1197
0
        !word_after
1198
0
    }
1199
1200
    /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
1201
    /// given position in `haystack`.
1202
    ///
1203
    /// # Panics
1204
    ///
1205
    /// This may panic when `at > haystack.len()`. Note that `at ==
1206
    /// haystack.len()` is legal and guaranteed not to panic.
1207
    ///
1208
    /// # Errors
1209
    ///
1210
    /// This returns an error when Unicode word boundary tables
1211
    /// are not available. Specifically, this only occurs when the
1212
    /// `unicode-word-boundary` feature is not enabled.
1213
    #[inline]
1214
0
    pub fn is_word_start_half_unicode(
1215
0
        &self,
1216
0
        haystack: &[u8],
1217
0
        at: usize,
1218
0
    ) -> Result<bool, UnicodeWordBoundaryError> {
1219
        // See `is_word_unicode_negate` for why we need to do this. We don't
1220
        // need to do it for `is_word_start_unicode` because that guarantees
1221
        // that the position matched falls on a valid UTF-8 boundary given
1222
        // that the right side must be in \w.
1223
0
        let word_before = at > 0
1224
0
            && match utf8::decode_last(&haystack[..at]) {
1225
0
                None | Some(Err(_)) => return Ok(false),
1226
0
                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1227
            };
1228
0
        Ok(!word_before)
1229
0
    }
1230
1231
    /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
1232
    /// given position in `haystack`.
1233
    ///
1234
    /// # Panics
1235
    ///
1236
    /// This may panic when `at > haystack.len()`. Note that `at ==
1237
    /// haystack.len()` is legal and guaranteed not to panic.
1238
    ///
1239
    /// # Errors
1240
    ///
1241
    /// This returns an error when Unicode word boundary tables
1242
    /// are not available. Specifically, this only occurs when the
1243
    /// `unicode-word-boundary` feature is not enabled.
1244
    #[inline]
1245
0
    pub fn is_word_end_half_unicode(
1246
0
        &self,
1247
0
        haystack: &[u8],
1248
0
        at: usize,
1249
0
    ) -> Result<bool, UnicodeWordBoundaryError> {
1250
        // See `is_word_unicode_negate` for why we need to do this. We don't
1251
        // need to do it for `is_word_end_unicode` because that guarantees
1252
        // that the position matched falls on a valid UTF-8 boundary given
1253
        // that the left side must be in \w.
1254
0
        let word_after = at < haystack.len()
1255
0
            && match utf8::decode(&haystack[at..]) {
1256
0
                None | Some(Err(_)) => return Ok(false),
1257
0
                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1258
            };
1259
0
        Ok(!word_after)
1260
0
    }
1261
}
1262
1263
impl Default for LookMatcher {
1264
0
    fn default() -> LookMatcher {
1265
0
        LookMatcher::new()
1266
0
    }
1267
}
1268
1269
/// An error that occurs when the Unicode-aware `\w` class is unavailable.
1270
///
1271
/// This error can occur when the data tables necessary for the Unicode aware
1272
/// Perl character class `\w` are unavailable. The `\w` class is used to
1273
/// determine whether a codepoint is considered a word character or not when
1274
/// determining whether a Unicode aware `\b` (or `\B`) matches at a particular
1275
/// position.
1276
///
1277
/// This error can only occur when the `unicode-word-boundary` feature is
1278
/// disabled.
1279
#[derive(Clone, Debug)]
1280
pub struct UnicodeWordBoundaryError(());
1281
1282
impl UnicodeWordBoundaryError {
1283
    #[cfg(not(feature = "unicode-word-boundary"))]
1284
0
    pub(crate) fn new() -> UnicodeWordBoundaryError {
1285
0
        UnicodeWordBoundaryError(())
1286
0
    }
1287
1288
    /// Returns an error if and only if Unicode word boundary data is
1289
    /// unavailable.
1290
0
    pub fn check() -> Result<(), UnicodeWordBoundaryError> {
1291
0
        is_word_char::check()
1292
0
    }
1293
}
1294
1295
#[cfg(feature = "std")]
1296
impl std::error::Error for UnicodeWordBoundaryError {}
1297
1298
impl core::fmt::Display for UnicodeWordBoundaryError {
1299
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1300
0
        write!(
1301
0
            f,
1302
0
            "Unicode-aware \\b and \\B are unavailable because the \
1303
0
             requisite data tables are missing, please enable the \
1304
0
             unicode-word-boundary feature"
1305
0
        )
1306
0
    }
1307
}
1308
1309
// Below are FOUR different ways for checking whether a "word"
1310
// codepoint exists at a particular position in the haystack. The four
1311
// different approaches are, in order of preference:
1312
//
1313
// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the
1314
// first call, and then use that DFA for all subsequent calls.
1315
// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.
1316
// 3. Do UTF-8 decoding and use our own 'perl_word' table.
1317
// 4. Return an error.
1318
//
1319
// The reason for all of these approaches is a combination of perf and
1320
// permitting one to build regex-automata without the Unicode data necessary
1321
// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would
1322
// still work.)
1323
//
1324
// The DFA approach is the fastest, but it requires the regex parser, the
1325
// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to
1326
// bring in, but if it's available, it's (probably) the best we can do.
1327
//
1328
// Approaches (2) and (3) are effectively equivalent, but (2) reuses the
1329
// data in regex-syntax and avoids duplicating it in regex-automata.
1330
//
1331
// Finally, (4) unconditionally returns an error since the requisite data isn't
1332
// available anywhere.
1333
//
1334
// There are actually more approaches possible that we didn't implement. For
1335
// example, if the DFA builder is available but the syntax parser is not, we
1336
// could technically hand construct our own NFA from the 'perl_word' data
1337
// table. But to avoid some pretty hairy code duplication, we would in turn
1338
// need to pull the UTF-8 compiler out of the NFA compiler. Yikes.
1339
//
1340
// A possibly more sensible alternative is to use a lazy DFA when the full
1341
// DFA builder isn't available...
1342
//
1343
// Yet another choice would be to build the full DFA and then embed it into the
1344
// source. Then we'd only need to bring in the DFA search runtime, which is
1345
// considerably smaller than the DFA builder code. The problem here is that the
1346
// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,
1347
// we'd need to build regex-cli, which depends on regex-automata in order to
1348
// build some part of regex-automata. But to be honest, something like this has
1349
// to be allowed somehow? I just don't know what the right process is.
1350
//
1351
// There are perhaps other choices as well. Why did I stop at these 4? Because
1352
// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
1353
// approach eventually, as the benefits of the DFA approach are somewhat
1354
// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
1355
// the commands below no longer work. If necessary, we should recreate
1356
// the benchmark from whole cloth in rebar.)
1357
//
1358
//   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
1359
//
1360
// Then I changed the code below so that the util/unicode_data/perl_word table
1361
// was used and re-ran the benchmark:
1362
//
1363
//   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv
1364
//
1365
// And compared them:
1366
//
1367
//   $ regex-cli bench diff dfa.csv table.csv
1368
//   benchmark                             engine                 dfa        table
1369
//   ---------                             ------                 ---        -----
1370
//   internal/count/boundary-words-holmes  regex/automata/pikevm  18.6 MB/s  12.9 MB/s
1371
//
1372
// Which is a nice improvement.
1373
//
1374
// UPDATE: It turns out that it takes approximately 22ms to build the reverse
1375
// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in
1376
// the grand scheme of things, but that is a significant latency cost. So I'm not
1377
// sure that's a good idea. I then tried using a lazy DFA instead, and that
1378
// eliminated the overhead, but since the lazy DFA requires mutable working
1379
// memory, that requires introducing a 'Cache' for every simultaneous call.
1380
//
1381
// I ended up deciding for now to just keep the "UTF-8 decode and check the
1382
// table." The DFA and lazy DFA approaches are still below, but commented out.
1383
//
1384
// [1]: https://github.com/BurntSushi/ucd-generate/issues/11
1385
1386
/*
1387
/// A module that looks for word codepoints using lazy DFAs.
1388
#[cfg(all(
1389
    feature = "unicode-word-boundary",
1390
    feature = "syntax",
1391
    feature = "unicode-perl",
1392
    feature = "hybrid"
1393
))]
1394
mod is_word_char {
1395
    use alloc::vec::Vec;
1396
1397
    use crate::{
1398
        hybrid::dfa::{Cache, DFA},
1399
        nfa::thompson::NFA,
1400
        util::{lazy::Lazy, pool::Pool, primitives::StateID},
1401
        Anchored, Input,
1402
    };
1403
1404
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1405
        Ok(())
1406
    }
1407
1408
    #[cfg_attr(feature = "perf-inline", inline(always))]
1409
    pub(super) fn fwd(
1410
        haystack: &[u8],
1411
        mut at: usize,
1412
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1413
        static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap());
1414
        static CACHE: Lazy<Pool<Cache>> =
1415
            Lazy::new(|| Pool::new(|| WORD.create_cache()));
1416
        let dfa = Lazy::get(&WORD);
1417
        let mut cache = Lazy::get(&CACHE).get();
1418
        let mut sid = dfa
1419
            .start_state_forward(
1420
                &mut cache,
1421
                &Input::new("").anchored(Anchored::Yes),
1422
            )
1423
            .unwrap();
1424
        while at < haystack.len() {
1425
            let byte = haystack[at];
1426
            sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1427
            at += 1;
1428
            if sid.is_tagged() {
1429
                if sid.is_match() {
1430
                    return Ok(true);
1431
                } else if sid.is_dead() {
1432
                    return Ok(false);
1433
                }
1434
            }
1435
        }
1436
        Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1437
    }
1438
1439
    #[cfg_attr(feature = "perf-inline", inline(always))]
1440
    pub(super) fn rev(
1441
        haystack: &[u8],
1442
        mut at: usize,
1443
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1444
        static WORD: Lazy<DFA> = Lazy::new(|| {
1445
            DFA::builder()
1446
                .thompson(NFA::config().reverse(true))
1447
                .build(r"\w")
1448
                .unwrap()
1449
        });
1450
        static CACHE: Lazy<Pool<Cache>> =
1451
            Lazy::new(|| Pool::new(|| WORD.create_cache()));
1452
        let dfa = Lazy::get(&WORD);
1453
        let mut cache = Lazy::get(&CACHE).get();
1454
        let mut sid = dfa
1455
            .start_state_reverse(
1456
                &mut cache,
1457
                &Input::new("").anchored(Anchored::Yes),
1458
            )
1459
            .unwrap();
1460
        while at > 0 {
1461
            at -= 1;
1462
            let byte = haystack[at];
1463
            sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1464
            if sid.is_tagged() {
1465
                if sid.is_match() {
1466
                    return Ok(true);
1467
                } else if sid.is_dead() {
1468
                    return Ok(false);
1469
                }
1470
            }
1471
        }
1472
        Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1473
    }
1474
}
1475
*/
1476
1477
/*
1478
/// A module that looks for word codepoints using fully compiled DFAs.
1479
#[cfg(all(
1480
    feature = "unicode-word-boundary",
1481
    feature = "syntax",
1482
    feature = "unicode-perl",
1483
    feature = "dfa-build"
1484
))]
1485
mod is_word_char {
1486
    use alloc::vec::Vec;
1487
1488
    use crate::{
1489
        dfa::{dense::DFA, Automaton, StartKind},
1490
        nfa::thompson::NFA,
1491
        util::{lazy::Lazy, primitives::StateID},
1492
        Anchored, Input,
1493
    };
1494
1495
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1496
        Ok(())
1497
    }
1498
1499
    #[cfg_attr(feature = "perf-inline", inline(always))]
1500
    pub(super) fn fwd(
1501
        haystack: &[u8],
1502
        mut at: usize,
1503
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1504
        static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1505
            let dfa = DFA::builder()
1506
                .configure(DFA::config().start_kind(StartKind::Anchored))
1507
                .build(r"\w")
1508
                .unwrap();
1509
            // OK because our regex has no look-around.
1510
            let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1511
            (dfa, start_id)
1512
        });
1513
        let &(ref dfa, mut sid) = Lazy::get(&WORD);
1514
        while at < haystack.len() {
1515
            let byte = haystack[at];
1516
            sid = dfa.next_state(sid, byte);
1517
            at += 1;
1518
            if dfa.is_special_state(sid) {
1519
                if dfa.is_match_state(sid) {
1520
                    return Ok(true);
1521
                } else if dfa.is_dead_state(sid) {
1522
                    return Ok(false);
1523
                }
1524
            }
1525
        }
1526
        Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1527
    }
1528
1529
    #[cfg_attr(feature = "perf-inline", inline(always))]
1530
    pub(super) fn rev(
1531
        haystack: &[u8],
1532
        mut at: usize,
1533
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1534
        static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1535
            let dfa = DFA::builder()
1536
                .configure(DFA::config().start_kind(StartKind::Anchored))
1537
                // From ad hoc measurements, it looks like setting
1538
                // shrink==false is slightly faster than shrink==true. I kind
1539
                // of feel like this indicates that shrinking is probably a
1540
                // failure, although it can help in some cases. Sigh.
1541
                .thompson(NFA::config().reverse(true).shrink(false))
1542
                .build(r"\w")
1543
                .unwrap();
1544
            // OK because our regex has no look-around.
1545
            let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1546
            (dfa, start_id)
1547
        });
1548
        let &(ref dfa, mut sid) = Lazy::get(&WORD);
1549
        while at > 0 {
1550
            at -= 1;
1551
            let byte = haystack[at];
1552
            sid = dfa.next_state(sid, byte);
1553
            if dfa.is_special_state(sid) {
1554
                if dfa.is_match_state(sid) {
1555
                    return Ok(true);
1556
                } else if dfa.is_dead_state(sid) {
1557
                    return Ok(false);
1558
                }
1559
            }
1560
        }
1561
        Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1562
    }
1563
}
1564
*/
1565
1566
/// A module that looks for word codepoints using regex-syntax's data tables.
1567
#[cfg(all(
1568
    feature = "unicode-word-boundary",
1569
    feature = "syntax",
1570
    feature = "unicode-perl",
1571
))]
1572
mod is_word_char {
1573
    use regex_syntax::try_is_word_character;
1574
1575
    use crate::util::utf8;
1576
1577
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1578
        Ok(())
1579
    }
1580
1581
    #[cfg_attr(feature = "perf-inline", inline(always))]
1582
    pub(super) fn fwd(
1583
        haystack: &[u8],
1584
        at: usize,
1585
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
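        // Decode the codepoint starting at `at`. The end of the haystack and
        // invalid UTF-8 both count as "not a word character" here.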
1586
        Ok(match utf8::decode(&haystack[at..]) {
1587
            None | Some(Err(_)) => false,
1588
            Some(Ok(ch)) => try_is_word_character(ch).expect(
1589
                "since unicode-word-boundary, syntax and unicode-perl \
1590
                 are all enabled, it is expected that \
1591
                 try_is_word_character succeeds",
1592
            ),
1593
        })
1594
    }
1595
1596
    #[cfg_attr(feature = "perf-inline", inline(always))]
1597
    pub(super) fn rev(
1598
        haystack: &[u8],
1599
        at: usize,
1600
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1601
        Ok(match utf8::decode_last(&haystack[..at]) {
1602
            None | Some(Err(_)) => false,
1603
            Some(Ok(ch)) => try_is_word_character(ch).expect(
1604
                "since unicode-word-boundary, syntax and unicode-perl \
1605
                 are all enabled, it is expected that \
1606
                 try_is_word_character succeeds",
1607
            ),
1608
        })
1609
    }
1610
}
1611
1612
/// A module that looks for word codepoints using regex-automata's data tables
1613
/// (which are only compiled when regex-syntax's tables aren't available).
1614
///
1615
/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for
1616
/// perl_word.
1617
#[cfg(all(
1618
    feature = "unicode-word-boundary",
1619
    not(all(feature = "syntax", feature = "unicode-perl")),
1620
))]
1621
mod is_word_char {
1622
    use crate::util::utf8;
1623
1624
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1625
        Ok(())
1626
    }
1627
1628
    #[cfg_attr(feature = "perf-inline", inline(always))]
1629
    pub(super) fn fwd(
1630
        haystack: &[u8],
1631
        at: usize,
1632
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1633
        Ok(match utf8::decode(&haystack[at..]) {
1634
            None | Some(Err(_)) => false,
1635
            Some(Ok(ch)) => is_word_character(ch),
1636
        })
1637
    }
1638
1639
    #[cfg_attr(feature = "perf-inline", inline(always))]
1640
    pub(super) fn rev(
1641
        haystack: &[u8],
1642
        at: usize,
1643
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1644
        Ok(match utf8::decode_last(&haystack[..at]) {
1645
            None | Some(Err(_)) => false,
1646
            Some(Ok(ch)) => is_word_character(ch),
1647
        })
1648
    }
1649
1650
    #[cfg_attr(feature = "perf-inline", inline(always))]
1651
    fn is_word_character(c: char) -> bool {
1652
        use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
1653
1654
        if u8::try_from(c).map_or(false, utf8::is_word_byte) {
1655
            return true;
1656
        }
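        // The binary search below relies on PERL_WORD being a sorted list of
        // non-overlapping, inclusive codepoint ranges.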
1657
        PERL_WORD
1658
            .binary_search_by(|&(start, end)| {
1659
                use core::cmp::Ordering;
1660
1661
                if start <= c && c <= end {
1662
                    Ordering::Equal
1663
                } else if start > c {
1664
                    Ordering::Greater
1665
                } else {
1666
                    Ordering::Less
1667
                }
1668
            })
1669
            .is_ok()
1670
    }
1671
}
1672
1673
/// A module that always returns an error if the `unicode-word-boundary`
1674
/// feature is disabled. In that case, regex-automata will not
1675
/// include its own data tables even if regex-syntax is disabled.
1676
#[cfg(not(feature = "unicode-word-boundary"))]
1677
mod is_word_char {
1678
0
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1679
0
        Err(super::UnicodeWordBoundaryError::new())
1680
0
    }
1681
1682
    #[cfg_attr(feature = "perf-inline", inline(always))]
1683
0
    pub(super) fn fwd(
1684
0
        _bytes: &[u8],
1685
0
        _at: usize,
1686
0
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1687
0
        Err(super::UnicodeWordBoundaryError::new())
1688
0
    }
1689
1690
    #[cfg_attr(feature = "perf-inline", inline(always))]
1691
0
    pub(super) fn rev(
1692
0
        _bytes: &[u8],
1693
0
        _at: usize,
1694
0
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1695
0
        Err(super::UnicodeWordBoundaryError::new())
1696
0
    }
1697
}
1698
1699
#[cfg(test)]
1700
mod tests {
1701
    use super::*;
1702
1703
    macro_rules! testlook {
1704
        ($look:expr, $haystack:expr, $at:expr) => {
1705
            LookMatcher::default().matches($look, $haystack.as_bytes(), $at)
1706
        };
1707
    }
1708
1709
    #[test]
1710
    fn look_matches_start_line() {
1711
        let look = Look::StartLF;
1712
1713
        assert!(testlook!(look, "", 0));
1714
        assert!(testlook!(look, "\n", 0));
1715
        assert!(testlook!(look, "\n", 1));
1716
        assert!(testlook!(look, "a", 0));
1717
        assert!(testlook!(look, "\na", 1));
1718
1719
        assert!(!testlook!(look, "a", 1));
1720
        assert!(!testlook!(look, "a\na", 1));
1721
    }
1722
1723
    #[test]
1724
    fn look_matches_end_line() {
1725
        let look = Look::EndLF;
1726
1727
        assert!(testlook!(look, "", 0));
1728
        assert!(testlook!(look, "\n", 1));
1729
        assert!(testlook!(look, "\na", 0));
1730
        assert!(testlook!(look, "\na", 2));
1731
        assert!(testlook!(look, "a\na", 1));
1732
1733
        assert!(!testlook!(look, "a", 0));
1734
        assert!(!testlook!(look, "\na", 1));
1735
        assert!(!testlook!(look, "a\na", 0));
1736
        assert!(!testlook!(look, "a\na", 2));
1737
    }
1738
1739
    #[test]
1740
    fn look_matches_start_text() {
1741
        let look = Look::Start;
1742
1743
        assert!(testlook!(look, "", 0));
1744
        assert!(testlook!(look, "\n", 0));
1745
        assert!(testlook!(look, "a", 0));
1746
1747
        assert!(!testlook!(look, "\n", 1));
1748
        assert!(!testlook!(look, "\na", 1));
1749
        assert!(!testlook!(look, "a", 1));
1750
        assert!(!testlook!(look, "a\na", 1));
1751
    }
1752
1753
    #[test]
1754
    fn look_matches_end_text() {
1755
        let look = Look::End;
1756
1757
        assert!(testlook!(look, "", 0));
1758
        assert!(testlook!(look, "\n", 1));
1759
        assert!(testlook!(look, "\na", 2));
1760
1761
        assert!(!testlook!(look, "\na", 0));
1762
        assert!(!testlook!(look, "a\na", 1));
1763
        assert!(!testlook!(look, "a", 0));
1764
        assert!(!testlook!(look, "\na", 1));
1765
        assert!(!testlook!(look, "a\na", 0));
1766
        assert!(!testlook!(look, "a\na", 2));
1767
    }
1768
1769
    #[test]
1770
    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1771
    fn look_matches_word_unicode() {
1772
        let look = Look::WordUnicode;
1773
1774
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1775
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1776
1777
        // Simple ASCII word boundaries.
1778
        assert!(testlook!(look, "a", 0));
1779
        assert!(testlook!(look, "a", 1));
1780
        assert!(testlook!(look, "a ", 1));
1781
        assert!(testlook!(look, " a ", 1));
1782
        assert!(testlook!(look, " a ", 2));
1783
1784
        // Unicode word boundaries with a non-ASCII codepoint.
1785
        assert!(testlook!(look, "𝛃", 0));
1786
        assert!(testlook!(look, "𝛃", 4));
1787
        assert!(testlook!(look, "𝛃 ", 4));
1788
        assert!(testlook!(look, " 𝛃 ", 1));
1789
        assert!(testlook!(look, " 𝛃 ", 5));
1790
1791
        // Unicode word boundaries between non-ASCII codepoints.
1792
        assert!(testlook!(look, "𝛃𐆀", 0));
1793
        assert!(testlook!(look, "𝛃𐆀", 4));
1794
1795
        // Non word boundaries for ASCII.
1796
        assert!(!testlook!(look, "", 0));
1797
        assert!(!testlook!(look, "ab", 1));
1798
        assert!(!testlook!(look, "a ", 2));
1799
        assert!(!testlook!(look, " a ", 0));
1800
        assert!(!testlook!(look, " a ", 3));
1801
1802
        // Non word boundaries with a non-ASCII codepoint.
1803
        assert!(!testlook!(look, "𝛃b", 4));
1804
        assert!(!testlook!(look, "𝛃 ", 5));
1805
        assert!(!testlook!(look, " 𝛃 ", 0));
1806
        assert!(!testlook!(look, " 𝛃 ", 6));
1807
        assert!(!testlook!(look, "𝛃", 1));
1808
        assert!(!testlook!(look, "𝛃", 2));
1809
        assert!(!testlook!(look, "𝛃", 3));
1810
1811
        // Non word boundaries with non-ASCII codepoints.
1812
        assert!(!testlook!(look, "𝛃𐆀", 1));
1813
        assert!(!testlook!(look, "𝛃𐆀", 2));
1814
        assert!(!testlook!(look, "𝛃𐆀", 3));
1815
        assert!(!testlook!(look, "𝛃𐆀", 5));
1816
        assert!(!testlook!(look, "𝛃𐆀", 6));
1817
        assert!(!testlook!(look, "𝛃𐆀", 7));
1818
        assert!(!testlook!(look, "𝛃𐆀", 8));
1819
    }
1820
1821
    #[test]
1822
    fn look_matches_word_ascii() {
1823
        let look = Look::WordAscii;
1824
1825
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1826
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1827
1828
        // Simple ASCII word boundaries.
1829
        assert!(testlook!(look, "a", 0));
1830
        assert!(testlook!(look, "a", 1));
1831
        assert!(testlook!(look, "a ", 1));
1832
        assert!(testlook!(look, " a ", 1));
1833
        assert!(testlook!(look, " a ", 2));
1834
1835
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
1836
        // an ASCII word boundary, none of these match.
1837
        assert!(!testlook!(look, "𝛃", 0));
1838
        assert!(!testlook!(look, "𝛃", 4));
1839
        assert!(!testlook!(look, "𝛃 ", 4));
1840
        assert!(!testlook!(look, " 𝛃 ", 1));
1841
        assert!(!testlook!(look, " 𝛃 ", 5));
1842
1843
        // Unicode word boundaries between non-ASCII codepoints. Again, since
1844
        // this is an ASCII word boundary, none of these match.
1845
        assert!(!testlook!(look, "𝛃𐆀", 0));
1846
        assert!(!testlook!(look, "𝛃𐆀", 4));
1847
1848
        // Non word boundaries for ASCII.
1849
        assert!(!testlook!(look, "", 0));
1850
        assert!(!testlook!(look, "ab", 1));
1851
        assert!(!testlook!(look, "a ", 2));
1852
        assert!(!testlook!(look, " a ", 0));
1853
        assert!(!testlook!(look, " a ", 3));
1854
1855
        // Non word boundaries with a non-ASCII codepoint.
1856
        assert!(testlook!(look, "𝛃b", 4));
1857
        assert!(!testlook!(look, "𝛃 ", 5));
1858
        assert!(!testlook!(look, " 𝛃 ", 0));
1859
        assert!(!testlook!(look, " 𝛃 ", 6));
1860
        assert!(!testlook!(look, "𝛃", 1));
1861
        assert!(!testlook!(look, "𝛃", 2));
1862
        assert!(!testlook!(look, "𝛃", 3));
1863
1864
        // Non word boundaries with non-ASCII codepoints.
1865
        assert!(!testlook!(look, "𝛃𐆀", 1));
1866
        assert!(!testlook!(look, "𝛃𐆀", 2));
1867
        assert!(!testlook!(look, "𝛃𐆀", 3));
1868
        assert!(!testlook!(look, "𝛃𐆀", 5));
1869
        assert!(!testlook!(look, "𝛃𐆀", 6));
1870
        assert!(!testlook!(look, "𝛃𐆀", 7));
1871
        assert!(!testlook!(look, "𝛃𐆀", 8));
1872
    }
1873
1874
    #[test]
1875
    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1876
    fn look_matches_word_unicode_negate() {
1877
        let look = Look::WordUnicodeNegate;
1878
1879
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1880
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1881
1882
        // Simple ASCII word boundaries.
1883
        assert!(!testlook!(look, "a", 0));
1884
        assert!(!testlook!(look, "a", 1));
1885
        assert!(!testlook!(look, "a ", 1));
1886
        assert!(!testlook!(look, " a ", 1));
1887
        assert!(!testlook!(look, " a ", 2));
1888
1889
        // Unicode word boundaries with a non-ASCII codepoint.
1890
        assert!(!testlook!(look, "𝛃", 0));
1891
        assert!(!testlook!(look, "𝛃", 4));
1892
        assert!(!testlook!(look, "𝛃 ", 4));
1893
        assert!(!testlook!(look, " 𝛃 ", 1));
1894
        assert!(!testlook!(look, " 𝛃 ", 5));
1895
1896
        // Unicode word boundaries between non-ASCII codepoints.
1897
        assert!(!testlook!(look, "𝛃𐆀", 0));
1898
        assert!(!testlook!(look, "𝛃𐆀", 4));
1899
1900
        // Non word boundaries for ASCII.
1901
        assert!(testlook!(look, "", 0));
1902
        assert!(testlook!(look, "ab", 1));
1903
        assert!(testlook!(look, "a ", 2));
1904
        assert!(testlook!(look, " a ", 0));
1905
        assert!(testlook!(look, " a ", 3));
1906
1907
        // Non word boundaries with a non-ASCII codepoint.
1908
        assert!(testlook!(look, "𝛃b", 4));
1909
        assert!(testlook!(look, "𝛃 ", 5));
1910
        assert!(testlook!(look, " 𝛃 ", 0));
1911
        assert!(testlook!(look, " 𝛃 ", 6));
1912
        // These don't match because they could otherwise return an offset that
1913
        // splits the UTF-8 encoding of a codepoint.
1914
        assert!(!testlook!(look, "𝛃", 1));
1915
        assert!(!testlook!(look, "𝛃", 2));
1916
        assert!(!testlook!(look, "𝛃", 3));
1917
1918
        // Non word boundaries with non-ASCII codepoints. These also don't
1919
        // match because they could otherwise return an offset that splits the
1920
        // UTF-8 encoding of a codepoint.
1921
        assert!(!testlook!(look, "𝛃𐆀", 1));
1922
        assert!(!testlook!(look, "𝛃𐆀", 2));
1923
        assert!(!testlook!(look, "𝛃𐆀", 3));
1924
        assert!(!testlook!(look, "𝛃𐆀", 5));
1925
        assert!(!testlook!(look, "𝛃𐆀", 6));
1926
        assert!(!testlook!(look, "𝛃𐆀", 7));
1927
        // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
1928
        // of the haystack. So the "end" of the haystack isn't a word and 𐆀
1929
        // isn't a word, thus, \B matches.
1930
        assert!(testlook!(look, "𝛃𐆀", 8));
1931
    }
1932
1933
    #[test]
1934
    fn look_matches_word_ascii_negate() {
1935
        let look = Look::WordAsciiNegate;
1936
1937
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1938
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1939
1940
        // Simple ASCII word boundaries.
1941
        assert!(!testlook!(look, "a", 0));
1942
        assert!(!testlook!(look, "a", 1));
1943
        assert!(!testlook!(look, "a ", 1));
1944
        assert!(!testlook!(look, " a ", 1));
1945
        assert!(!testlook!(look, " a ", 2));
1946
1947
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
1948
        // an ASCII word boundary, none of these match.
1949
        assert!(testlook!(look, "𝛃", 0));
1950
        assert!(testlook!(look, "𝛃", 4));
1951
        assert!(testlook!(look, "𝛃 ", 4));
1952
        assert!(testlook!(look, " 𝛃 ", 1));
1953
        assert!(testlook!(look, " 𝛃 ", 5));
1954
1955
        // Unicode word boundaries between non-ASCII codepoints. Again, since
1956
        // this is an ASCII word boundary, none of these match.
1957
        assert!(testlook!(look, "𝛃𐆀", 0));
1958
        assert!(testlook!(look, "𝛃𐆀", 4));
1959
1960
        // Non word boundaries for ASCII.
1961
        assert!(testlook!(look, "", 0));
1962
        assert!(testlook!(look, "ab", 1));
1963
        assert!(testlook!(look, "a ", 2));
1964
        assert!(testlook!(look, " a ", 0));
1965
        assert!(testlook!(look, " a ", 3));
1966
1967
        // Non word boundaries with a non-ASCII codepoint.
1968
        assert!(!testlook!(look, "𝛃b", 4));
1969
        assert!(testlook!(look, "𝛃 ", 5));
1970
        assert!(testlook!(look, " 𝛃 ", 0));
1971
        assert!(testlook!(look, " 𝛃 ", 6));
1972
        assert!(testlook!(look, "𝛃", 1));
1973
        assert!(testlook!(look, "𝛃", 2));
1974
        assert!(testlook!(look, "𝛃", 3));
1975
1976
        // Non word boundaries with non-ASCII codepoints.
1977
        assert!(testlook!(look, "𝛃𐆀", 1));
1978
        assert!(testlook!(look, "𝛃𐆀", 2));
1979
        assert!(testlook!(look, "𝛃𐆀", 3));
1980
        assert!(testlook!(look, "𝛃𐆀", 5));
1981
        assert!(testlook!(look, "𝛃𐆀", 6));
1982
        assert!(testlook!(look, "𝛃𐆀", 7));
1983
        assert!(testlook!(look, "𝛃𐆀", 8));
1984
    }
1985
1986
    #[test]
1987
    fn look_matches_word_start_ascii() {
1988
        let look = Look::WordStartAscii;
1989
1990
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1991
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1992
1993
        // Simple ASCII word boundaries.
1994
        assert!(testlook!(look, "a", 0));
1995
        assert!(!testlook!(look, "a", 1));
1996
        assert!(!testlook!(look, "a ", 1));
1997
        assert!(testlook!(look, " a ", 1));
1998
        assert!(!testlook!(look, " a ", 2));
1999
2000
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2001
        // an ASCII word boundary, none of these match.
2002
        assert!(!testlook!(look, "𝛃", 0));
2003
        assert!(!testlook!(look, "𝛃", 4));
2004
        assert!(!testlook!(look, "𝛃 ", 4));
2005
        assert!(!testlook!(look, " 𝛃 ", 1));
2006
        assert!(!testlook!(look, " 𝛃 ", 5));
2007
2008
        // Unicode word boundaries between non-ASCII codepoints. Again, since
2009
        // this is an ASCII word boundary, none of these match.
2010
        assert!(!testlook!(look, "𝛃𐆀", 0));
2011
        assert!(!testlook!(look, "𝛃𐆀", 4));
2012
2013
        // Non word boundaries for ASCII.
2014
        assert!(!testlook!(look, "", 0));
2015
        assert!(!testlook!(look, "ab", 1));
2016
        assert!(!testlook!(look, "a ", 2));
2017
        assert!(!testlook!(look, " a ", 0));
2018
        assert!(!testlook!(look, " a ", 3));
2019
2020
        // Non word boundaries with a non-ASCII codepoint.
2021
        assert!(testlook!(look, "𝛃b", 4));
2022
        assert!(!testlook!(look, "b𝛃", 1));
2023
        assert!(!testlook!(look, "𝛃 ", 5));
2024
        assert!(!testlook!(look, " 𝛃 ", 0));
2025
        assert!(!testlook!(look, " 𝛃 ", 6));
2026
        assert!(!testlook!(look, "𝛃", 1));
2027
        assert!(!testlook!(look, "𝛃", 2));
2028
        assert!(!testlook!(look, "𝛃", 3));
2029
2030
        // Non word boundaries with non-ASCII codepoints.
2031
        assert!(!testlook!(look, "𝛃𐆀", 1));
2032
        assert!(!testlook!(look, "𝛃𐆀", 2));
2033
        assert!(!testlook!(look, "𝛃𐆀", 3));
2034
        assert!(!testlook!(look, "𝛃𐆀", 5));
2035
        assert!(!testlook!(look, "𝛃𐆀", 6));
2036
        assert!(!testlook!(look, "𝛃𐆀", 7));
2037
        assert!(!testlook!(look, "𝛃𐆀", 8));
2038
    }
2039
2040
    #[test]
2041
    fn look_matches_word_end_ascii() {
2042
        let look = Look::WordEndAscii;
2043
2044
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2045
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2046
2047
        // Simple ASCII word boundaries.
2048
        assert!(!testlook!(look, "a", 0));
2049
        assert!(testlook!(look, "a", 1));
2050
        assert!(testlook!(look, "a ", 1));
2051
        assert!(!testlook!(look, " a ", 1));
2052
        assert!(testlook!(look, " a ", 2));
2053
2054
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2055
        // an ASCII word boundary, none of these match.
2056
        assert!(!testlook!(look, "𝛃", 0));
2057
        assert!(!testlook!(look, "𝛃", 4));
2058
        assert!(!testlook!(look, "𝛃 ", 4));
2059
        assert!(!testlook!(look, " 𝛃 ", 1));
2060
        assert!(!testlook!(look, " 𝛃 ", 5));
2061
2062
        // Unicode word boundaries between non-ASCII codepoints. Again, since
2063
        // this is an ASCII word boundary, none of these match.
2064
        assert!(!testlook!(look, "𝛃𐆀", 0));
2065
        assert!(!testlook!(look, "𝛃𐆀", 4));
2066
2067
        // Non word boundaries for ASCII.
2068
        assert!(!testlook!(look, "", 0));
2069
        assert!(!testlook!(look, "ab", 1));
2070
        assert!(!testlook!(look, "a ", 2));
2071
        assert!(!testlook!(look, " a ", 0));
2072
        assert!(!testlook!(look, " a ", 3));
2073
2074
        // Non word boundaries with a non-ASCII codepoint.
2075
        assert!(!testlook!(look, "𝛃b", 4));
2076
        assert!(testlook!(look, "b𝛃", 1));
2077
        assert!(!testlook!(look, "𝛃 ", 5));
2078
        assert!(!testlook!(look, " 𝛃 ", 0));
2079
        assert!(!testlook!(look, " 𝛃 ", 6));
2080
        assert!(!testlook!(look, "𝛃", 1));
2081
        assert!(!testlook!(look, "𝛃", 2));
2082
        assert!(!testlook!(look, "𝛃", 3));
2083
2084
        // Non word boundaries with non-ASCII codepoints.
2085
        assert!(!testlook!(look, "𝛃𐆀", 1));
2086
        assert!(!testlook!(look, "𝛃𐆀", 2));
2087
        assert!(!testlook!(look, "𝛃𐆀", 3));
2088
        assert!(!testlook!(look, "𝛃𐆀", 5));
2089
        assert!(!testlook!(look, "𝛃𐆀", 6));
2090
        assert!(!testlook!(look, "𝛃𐆀", 7));
2091
        assert!(!testlook!(look, "𝛃𐆀", 8));
2092
    }
2093
2094
    #[test]
2095
    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2096
    fn look_matches_word_start_unicode() {
2097
        let look = Look::WordStartUnicode;
2098
2099
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2100
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2101
2102
        // Simple ASCII word boundaries.
2103
        assert!(testlook!(look, "a", 0));
2104
        assert!(!testlook!(look, "a", 1));
2105
        assert!(!testlook!(look, "a ", 1));
2106
        assert!(testlook!(look, " a ", 1));
2107
        assert!(!testlook!(look, " a ", 2));
2108
2109
        // Unicode word boundaries with a non-ASCII codepoint.
2110
        assert!(testlook!(look, "𝛃", 0));
2111
        assert!(!testlook!(look, "𝛃", 4));
2112
        assert!(!testlook!(look, "𝛃 ", 4));
2113
        assert!(testlook!(look, " 𝛃 ", 1));
2114
        assert!(!testlook!(look, " 𝛃 ", 5));
2115
2116
        // Unicode word boundaries between non-ASCII codepoints.
2117
        assert!(testlook!(look, "𝛃𐆀", 0));
2118
        assert!(!testlook!(look, "𝛃𐆀", 4));
2119
2120
        // Non word boundaries for ASCII.
2121
        assert!(!testlook!(look, "", 0));
2122
        assert!(!testlook!(look, "ab", 1));
2123
        assert!(!testlook!(look, "a ", 2));
2124
        assert!(!testlook!(look, " a ", 0));
2125
        assert!(!testlook!(look, " a ", 3));
2126
2127
        // Non word boundaries with a non-ASCII codepoint.
2128
        assert!(!testlook!(look, "𝛃b", 4));
2129
        assert!(!testlook!(look, "b𝛃", 1));
2130
        assert!(!testlook!(look, "𝛃 ", 5));
2131
        assert!(!testlook!(look, " 𝛃 ", 0));
2132
        assert!(!testlook!(look, " 𝛃 ", 6));
2133
        assert!(!testlook!(look, "𝛃", 1));
2134
        assert!(!testlook!(look, "𝛃", 2));
2135
        assert!(!testlook!(look, "𝛃", 3));
2136
2137
        // Non word boundaries with non-ASCII codepoints.
2138
        assert!(!testlook!(look, "𝛃𐆀", 1));
2139
        assert!(!testlook!(look, "𝛃𐆀", 2));
2140
        assert!(!testlook!(look, "𝛃𐆀", 3));
2141
        assert!(!testlook!(look, "𝛃𐆀", 5));
2142
        assert!(!testlook!(look, "𝛃𐆀", 6));
2143
        assert!(!testlook!(look, "𝛃𐆀", 7));
2144
        assert!(!testlook!(look, "𝛃𐆀", 8));
2145
    }
2146
2147
    #[test]
2148
    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2149
    fn look_matches_word_end_unicode() {
2150
        let look = Look::WordEndUnicode;
2151
2152
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2153
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2154
2155
        // Simple ASCII word boundaries.
2156
        assert!(!testlook!(look, "a", 0));
2157
        assert!(testlook!(look, "a", 1));
2158
        assert!(testlook!(look, "a ", 1));
2159
        assert!(!testlook!(look, " a ", 1));
2160
        assert!(testlook!(look, " a ", 2));
2161
2162
        // Unicode word boundaries with a non-ASCII codepoint.
2163
        assert!(!testlook!(look, "𝛃", 0));
2164
        assert!(testlook!(look, "𝛃", 4));
2165
        assert!(testlook!(look, "𝛃 ", 4));
2166
        assert!(!testlook!(look, " 𝛃 ", 1));
2167
        assert!(testlook!(look, " 𝛃 ", 5));
2168
2169
        // Unicode word boundaries between non-ASCII codepoints.
2170
        assert!(!testlook!(look, "𝛃𐆀", 0));
2171
        assert!(testlook!(look, "𝛃𐆀", 4));
2172
2173
        // Non word boundaries for ASCII.
2174
        assert!(!testlook!(look, "", 0));
2175
        assert!(!testlook!(look, "ab", 1));
2176
        assert!(!testlook!(look, "a ", 2));
2177
        assert!(!testlook!(look, " a ", 0));
2178
        assert!(!testlook!(look, " a ", 3));
2179
2180
        // Non word boundaries with a non-ASCII codepoint.
2181
        assert!(!testlook!(look, "𝛃b", 4));
2182
        assert!(!testlook!(look, "b𝛃", 1));
2183
        assert!(!testlook!(look, "𝛃 ", 5));
2184
        assert!(!testlook!(look, " 𝛃 ", 0));
2185
        assert!(!testlook!(look, " 𝛃 ", 6));
2186
        assert!(!testlook!(look, "𝛃", 1));
2187
        assert!(!testlook!(look, "𝛃", 2));
2188
        assert!(!testlook!(look, "𝛃", 3));
2189
2190
        // Non word boundaries with non-ASCII codepoints.
2191
        assert!(!testlook!(look, "𝛃𐆀", 1));
2192
        assert!(!testlook!(look, "𝛃𐆀", 2));
2193
        assert!(!testlook!(look, "𝛃𐆀", 3));
2194
        assert!(!testlook!(look, "𝛃𐆀", 5));
2195
        assert!(!testlook!(look, "𝛃𐆀", 6));
2196
        assert!(!testlook!(look, "𝛃𐆀", 7));
2197
        assert!(!testlook!(look, "𝛃𐆀", 8));
2198
    }
2199
2200
    #[test]
2201
    fn look_matches_word_start_half_ascii() {
2202
        let look = Look::WordStartHalfAscii;
2203
2204
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2205
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2206
2207
        // Simple ASCII word boundaries.
2208
        assert!(testlook!(look, "a", 0));
2209
        assert!(!testlook!(look, "a", 1));
2210
        assert!(!testlook!(look, "a ", 1));
2211
        assert!(testlook!(look, " a ", 1));
2212
        assert!(!testlook!(look, " a ", 2));
2213
2214
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2215
        // a half ASCII boundary and non-ASCII codepoints are never ASCII
        // word characters, all of these match.
2216
        assert!(testlook!(look, "𝛃", 0));
2217
        assert!(testlook!(look, "𝛃", 4));
2218
        assert!(testlook!(look, "𝛃 ", 4));
2219
        assert!(testlook!(look, " 𝛃 ", 1));
2220
        assert!(testlook!(look, " 𝛃 ", 5));
2221
2222
        // Unicode word boundaries between non-ASCII codepoints. Again, since
2223
        // this is a half ASCII boundary, both of these match.
2224
        assert!(testlook!(look, "𝛃𐆀", 0));
2225
        assert!(testlook!(look, "𝛃𐆀", 4));
2226
2227
        // Non word boundaries for ASCII.
2228
        assert!(testlook!(look, "", 0));
2229
        assert!(!testlook!(look, "ab", 1));
2230
        assert!(testlook!(look, "a ", 2));
2231
        assert!(testlook!(look, " a ", 0));
2232
        assert!(testlook!(look, " a ", 3));
2233
2234
        // Non word boundaries with a non-ASCII codepoint.
2235
        assert!(testlook!(look, "𝛃b", 4));
2236
        assert!(!testlook!(look, "b𝛃", 1));
2237
        assert!(testlook!(look, "𝛃 ", 5));
2238
        assert!(testlook!(look, " 𝛃 ", 0));
2239
        assert!(testlook!(look, " 𝛃 ", 6));
2240
        assert!(testlook!(look, "𝛃", 1));
2241
        assert!(testlook!(look, "𝛃", 2));
2242
        assert!(testlook!(look, "𝛃", 3));
2243
2244
        // Non word boundaries with non-ASCII codepoints.
2245
        assert!(testlook!(look, "𝛃𐆀", 1));
2246
        assert!(testlook!(look, "𝛃𐆀", 2));
2247
        assert!(testlook!(look, "𝛃𐆀", 3));
2248
        assert!(testlook!(look, "𝛃𐆀", 5));
2249
        assert!(testlook!(look, "𝛃𐆀", 6));
2250
        assert!(testlook!(look, "𝛃𐆀", 7));
2251
        assert!(testlook!(look, "𝛃𐆀", 8));
2252
    }
2253
2254
    #[test]
2255
    fn look_matches_word_end_half_ascii() {
2256
        let look = Look::WordEndHalfAscii;
2257
2258
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2259
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2260
2261
        // Simple ASCII word boundaries.
2262
        assert!(!testlook!(look, "a", 0));
2263
        assert!(testlook!(look, "a", 1));
2264
        assert!(testlook!(look, "a ", 1));
2265
        assert!(!testlook!(look, " a ", 1));
2266
        assert!(testlook!(look, " a ", 2));
2267
2268
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2269
        // a half ASCII boundary and non-ASCII codepoints are never ASCII
        // word characters, all of these match.
2270
        assert!(testlook!(look, "𝛃", 0));
2271
        assert!(testlook!(look, "𝛃", 4));
2272
        assert!(testlook!(look, "𝛃 ", 4));
2273
        assert!(testlook!(look, " 𝛃 ", 1));
2274
        assert!(testlook!(look, " 𝛃 ", 5));
2275
2276
        // Unicode word boundaries between non-ASCII codepoints. Again, since
2277
        // this is a half ASCII boundary, both of these match.
2278
        assert!(testlook!(look, "𝛃𐆀", 0));
2279
        assert!(testlook!(look, "𝛃𐆀", 4));
2280
2281
        // Non word boundaries for ASCII.
2282
        assert!(testlook!(look, "", 0));
2283
        assert!(!testlook!(look, "ab", 1));
2284
        assert!(testlook!(look, "a ", 2));
2285
        assert!(testlook!(look, " a ", 0));
2286
        assert!(testlook!(look, " a ", 3));
2287
2288
        // Non word boundaries with a non-ASCII codepoint.
2289
        assert!(!testlook!(look, "𝛃b", 4));
2290
        assert!(testlook!(look, "b𝛃", 1));
2291
        assert!(testlook!(look, "𝛃 ", 5));
2292
        assert!(testlook!(look, " 𝛃 ", 0));
2293
        assert!(testlook!(look, " 𝛃 ", 6));
2294
        assert!(testlook!(look, "𝛃", 1));
2295
        assert!(testlook!(look, "𝛃", 2));
2296
        assert!(testlook!(look, "𝛃", 3));
2297
2298
        // Non word boundaries with non-ASCII codepoints.
2299
        assert!(testlook!(look, "𝛃𐆀", 1));
2300
        assert!(testlook!(look, "𝛃𐆀", 2));
2301
        assert!(testlook!(look, "𝛃𐆀", 3));
2302
        assert!(testlook!(look, "𝛃𐆀", 5));
2303
        assert!(testlook!(look, "𝛃𐆀", 6));
2304
        assert!(testlook!(look, "𝛃𐆀", 7));
2305
        assert!(testlook!(look, "𝛃𐆀", 8));
2306
    }
2307
2308
    #[test]
2309
    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2310
    fn look_matches_word_start_half_unicode() {
2311
        let look = Look::WordStartHalfUnicode;
2312
2313
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2314
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2315
2316
        // Simple ASCII word boundaries.
2317
        assert!(testlook!(look, "a", 0));
2318
        assert!(!testlook!(look, "a", 1));
2319
        assert!(!testlook!(look, "a ", 1));
2320
        assert!(testlook!(look, " a ", 1));
2321
        assert!(!testlook!(look, " a ", 2));
2322
2323
        // Unicode word boundaries with a non-ASCII codepoint.
2324
        assert!(testlook!(look, "𝛃", 0));
2325
        assert!(!testlook!(look, "𝛃", 4));
2326
        assert!(!testlook!(look, "𝛃 ", 4));
2327
        assert!(testlook!(look, " 𝛃 ", 1));
2328
        assert!(!testlook!(look, " 𝛃 ", 5));
2329
2330
        // Unicode word boundaries between non-ASCII codepoints.
2331
        assert!(testlook!(look, "𝛃𐆀", 0));
2332
        assert!(!testlook!(look, "𝛃𐆀", 4));
2333
2334
        // Non word boundaries for ASCII.
2335
        assert!(testlook!(look, "", 0));
2336
        assert!(!testlook!(look, "ab", 1));
2337
        assert!(testlook!(look, "a ", 2));
2338
        assert!(testlook!(look, " a ", 0));
2339
        assert!(testlook!(look, " a ", 3));
2340
2341
        // Non word boundaries with a non-ASCII codepoint.
2342
        assert!(!testlook!(look, "𝛃b", 4));
2343
        assert!(!testlook!(look, "b𝛃", 1));
2344
        assert!(testlook!(look, "𝛃 ", 5));
2345
        assert!(testlook!(look, " 𝛃 ", 0));
2346
        assert!(testlook!(look, " 𝛃 ", 6));
2347
        assert!(!testlook!(look, "𝛃", 1));
2348
        assert!(!testlook!(look, "𝛃", 2));
2349
        assert!(!testlook!(look, "𝛃", 3));
2350
2351
        // Non word boundaries with non-ASCII codepoints.
2352
        assert!(!testlook!(look, "𝛃𐆀", 1));
2353
        assert!(!testlook!(look, "𝛃𐆀", 2));
2354
        assert!(!testlook!(look, "𝛃𐆀", 3));
2355
        assert!(!testlook!(look, "𝛃𐆀", 5));
2356
        assert!(!testlook!(look, "𝛃𐆀", 6));
2357
        assert!(!testlook!(look, "𝛃𐆀", 7));
2358
        assert!(testlook!(look, "𝛃𐆀", 8));
2359
    }
2360
2361
    #[test]
2362
    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2363
    fn look_matches_word_end_half_unicode() {
2364
        let look = Look::WordEndHalfUnicode;
2365
2366
        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2367
        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2368
2369
        // Simple ASCII word boundaries.
2370
        assert!(!testlook!(look, "a", 0));
2371
        assert!(testlook!(look, "a", 1));
2372
        assert!(testlook!(look, "a ", 1));
2373
        assert!(!testlook!(look, " a ", 1));
2374
        assert!(testlook!(look, " a ", 2));
2375
2376
        // Unicode word boundaries with a non-ASCII codepoint.
2377
        assert!(!testlook!(look, "𝛃", 0));
2378
        assert!(testlook!(look, "𝛃", 4));
2379
        assert!(testlook!(look, "𝛃 ", 4));
2380
        assert!(!testlook!(look, " 𝛃 ", 1));
2381
        assert!(testlook!(look, " 𝛃 ", 5));
2382
2383
        // Unicode word boundaries between non-ASCII codepoints.
2384
        assert!(!testlook!(look, "𝛃𐆀", 0));
2385
        assert!(testlook!(look, "𝛃𐆀", 4));
2386
2387
        // Non word boundaries for ASCII.
2388
        assert!(testlook!(look, "", 0));
2389
        assert!(!testlook!(look, "ab", 1));
2390
        assert!(testlook!(look, "a ", 2));
2391
        assert!(testlook!(look, " a ", 0));
2392
        assert!(testlook!(look, " a ", 3));
2393
2394
        // Non word boundaries with a non-ASCII codepoint.
2395
        assert!(!testlook!(look, "𝛃b", 4));
2396
        assert!(!testlook!(look, "b𝛃", 1));
2397
        assert!(testlook!(look, "𝛃 ", 5));
2398
        assert!(testlook!(look, " 𝛃 ", 0));
2399
        assert!(testlook!(look, " 𝛃 ", 6));
2400
        assert!(!testlook!(look, "𝛃", 1));
2401
        assert!(!testlook!(look, "𝛃", 2));
2402
        assert!(!testlook!(look, "𝛃", 3));
2403
2404
        // Non word boundaries with non-ASCII codepoints.
2405
        assert!(!testlook!(look, "𝛃𐆀", 1));
2406
        assert!(!testlook!(look, "𝛃𐆀", 2));
2407
        assert!(!testlook!(look, "𝛃𐆀", 3));
2408
        assert!(!testlook!(look, "𝛃𐆀", 5));
2409
        assert!(!testlook!(look, "𝛃𐆀", 6));
2410
        assert!(!testlook!(look, "𝛃𐆀", 7));
2411
        assert!(testlook!(look, "𝛃𐆀", 8));
2412
    }
2413
2414
    #[test]
2415
    fn look_set() {
2416
        let mut f = LookSet::default();
2417
        assert!(!f.contains(Look::Start));
2418
        assert!(!f.contains(Look::End));
2419
        assert!(!f.contains(Look::StartLF));
2420
        assert!(!f.contains(Look::EndLF));
2421
        assert!(!f.contains(Look::WordUnicode));
2422
        assert!(!f.contains(Look::WordUnicodeNegate));
2423
        assert!(!f.contains(Look::WordAscii));
2424
        assert!(!f.contains(Look::WordAsciiNegate));
2425
2426
        f = f.insert(Look::Start);
2427
        assert!(f.contains(Look::Start));
2428
        f = f.remove(Look::Start);
2429
        assert!(!f.contains(Look::Start));
2430
2431
        f = f.insert(Look::End);
2432
        assert!(f.contains(Look::End));
2433
        f = f.remove(Look::End);
2434
        assert!(!f.contains(Look::End));
2435
2436
        f = f.insert(Look::StartLF);
2437
        assert!(f.contains(Look::StartLF));
2438
        f = f.remove(Look::StartLF);
2439
        assert!(!f.contains(Look::StartLF));
2440
2441
        f = f.insert(Look::EndLF);
2442
        assert!(f.contains(Look::EndLF));
2443
        f = f.remove(Look::EndLF);
2444
        assert!(!f.contains(Look::EndLF));
2445
2446
        f = f.insert(Look::StartCRLF);
2447
        assert!(f.contains(Look::StartCRLF));
2448
        f = f.remove(Look::StartCRLF);
2449
        assert!(!f.contains(Look::StartCRLF));
2450
2451
        f = f.insert(Look::EndCRLF);
2452
        assert!(f.contains(Look::EndCRLF));
2453
        f = f.remove(Look::EndCRLF);
2454
        assert!(!f.contains(Look::EndCRLF));
2455
2456
        f = f.insert(Look::WordUnicode);
2457
        assert!(f.contains(Look::WordUnicode));
2458
        f = f.remove(Look::WordUnicode);
2459
        assert!(!f.contains(Look::WordUnicode));
2460
2461
        f = f.insert(Look::WordUnicodeNegate);
2462
        assert!(f.contains(Look::WordUnicodeNegate));
2463
        f = f.remove(Look::WordUnicodeNegate);
2464
        assert!(!f.contains(Look::WordUnicodeNegate));
2465
2466
        f = f.insert(Look::WordAscii);
2467
        assert!(f.contains(Look::WordAscii));
2468
        f = f.remove(Look::WordAscii);
2469
        assert!(!f.contains(Look::WordAscii));
2470
2471
        f = f.insert(Look::WordAsciiNegate);
2472
        assert!(f.contains(Look::WordAsciiNegate));
2473
        f = f.remove(Look::WordAsciiNegate);
2474
        assert!(!f.contains(Look::WordAsciiNegate));
2475
2476
        f = f.insert(Look::WordStartAscii);
2477
        assert!(f.contains(Look::WordStartAscii));
2478
        f = f.remove(Look::WordStartAscii);
2479
        assert!(!f.contains(Look::WordStartAscii));
2480
2481
        f = f.insert(Look::WordEndAscii);
2482
        assert!(f.contains(Look::WordEndAscii));
2483
        f = f.remove(Look::WordEndAscii);
2484
        assert!(!f.contains(Look::WordEndAscii));
2485
2486
        f = f.insert(Look::WordStartUnicode);
2487
        assert!(f.contains(Look::WordStartUnicode));
2488
        f = f.remove(Look::WordStartUnicode);
2489
        assert!(!f.contains(Look::WordStartUnicode));
2490
2491
        f = f.insert(Look::WordEndUnicode);
2492
        assert!(f.contains(Look::WordEndUnicode));
2493
        f = f.remove(Look::WordEndUnicode);
2494
        assert!(!f.contains(Look::WordEndUnicode));
2495
2496
        f = f.insert(Look::WordStartHalfAscii);
2497
        assert!(f.contains(Look::WordStartHalfAscii));
2498
        f = f.remove(Look::WordStartHalfAscii);
2499
        assert!(!f.contains(Look::WordStartHalfAscii));
2500
2501
        f = f.insert(Look::WordEndHalfAscii);
2502
        assert!(f.contains(Look::WordEndHalfAscii));
2503
        f = f.remove(Look::WordEndHalfAscii);
2504
        assert!(!f.contains(Look::WordEndHalfAscii));
2505
2506
        f = f.insert(Look::WordStartHalfUnicode);
2507
        assert!(f.contains(Look::WordStartHalfUnicode));
2508
        f = f.remove(Look::WordStartHalfUnicode);
2509
        assert!(!f.contains(Look::WordStartHalfUnicode));
2510
2511
        f = f.insert(Look::WordEndHalfUnicode);
2512
        assert!(f.contains(Look::WordEndHalfUnicode));
2513
        f = f.remove(Look::WordEndHalfUnicode);
2514
        assert!(!f.contains(Look::WordEndHalfUnicode));
2515
    }
2516
2517
    #[test]
2518
    fn look_set_iter() {
2519
        let set = LookSet::empty();
2520
        assert_eq!(0, set.iter().count());
2521
2522
        let set = LookSet::full();
2523
        assert_eq!(18, set.iter().count());
2524
2525
        let set =
2526
            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
2527
        assert_eq!(2, set.iter().count());
2528
2529
        let set = LookSet::empty().insert(Look::StartLF);
2530
        assert_eq!(1, set.iter().count());
2531
2532
        let set = LookSet::empty().insert(Look::WordAsciiNegate);
2533
        assert_eq!(1, set.iter().count());
2534
2535
        let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
2536
        assert_eq!(1, set.iter().count());
2537
    }
2538
2539
    #[test]
2540
    #[cfg(feature = "alloc")]
2541
    fn look_set_debug() {
2542
        let res = alloc::format!("{:?}", LookSet::empty());
2543
        assert_eq!("∅", res);
2544
        let res = alloc::format!("{:?}", LookSet::full());
2545
        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
2546
    }
2547
}