/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.4.13/src/util/look.rs
Line  | Count  | Source  | 
1  |  | /*!  | 
2  |  | Types and routines for working with look-around assertions.  | 
3  |  |  | 
4  |  | This module principally defines three types:  | 
5  |  |  | 
6  |  | * [`Look`] enumerates all of the assertions supported by this crate.  | 
7  |  | * [`LookSet`] provides a way to efficiently store a set of [`Look`] values.  | 
8  |  | * [`LookMatcher`] provides routines for checking whether a `Look` or a  | 
9  |  | `LookSet` matches at a particular position in a haystack.  | 
10  |  | */  | 
11  |  |  | 
12  |  | // LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` was basically  | 
13  |  | // copied verbatim from the regex-syntax crate. I would have no problems using  | 
14  |  | // the regex-syntax types and defining the matching routines (only found  | 
15  |  | // in this crate) as free functions, except the `Look` and `LookSet` types  | 
16  |  | // are used in lots of places. Including in places we expect to work when  | 
17  |  | // regex-syntax is *not* enabled, such as in the definition of the NFA itself.  | 
18  |  | //  | 
19  |  | // Thankfully the code we copy is pretty simple and there isn't much of it.  | 
20  |  | // Otherwise, the rest of this module deals with *matching* the assertions,  | 
21  |  | // which is not something that regex-syntax handles.  | 
22  |  |  | 
23  |  | use crate::util::{escape::DebugByte, utf8}; | 
24  |  |  | 
25  |  | /// A look-around assertion.  | 
26  |  | ///  | 
27  |  | /// An assertion matches at a position between characters in a haystack.  | 
28  |  | /// Namely, it does not actually "consume" any input as most parts of a regular  | 
29  |  | /// expression do. Assertions are a way of stating that some property must be  | 
30  |  | /// true at a particular point during matching.  | 
31  |  | ///  | 
32  |  | /// For example, `(?m)^[a-z]+$` is a pattern that:  | 
33  |  | ///  | 
34  |  | /// * Scans the haystack for a position at which `(?m:^)` is satisfied. That  | 
35  |  | /// occurs at either the beginning of the haystack, or immediately following  | 
36  |  | /// a `\n` character.  | 
37  |  | /// * Looks for one or more occurrences of `[a-z]`.  | 
38  |  | /// * Once `[a-z]+` has matched as much as it can, an overall match is only  | 
39  |  | /// reported when `[a-z]+` stops at the end of the haystack or before a `\n`.  | 
40  |  | ///  | 
41  |  | /// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.  | 
42  |  | ///  | 
43  |  | /// Assertions are also called "look-around," "look-behind" and "look-ahead."  | 
44  |  | /// Specifically, some assertions are look-behind (like `^`), other assertions  | 
45  |  | /// are look-ahead (like `$`) and yet other assertions are both look-ahead and  | 
46  |  | /// look-behind (like `\b`).  | 
47  |  | ///  | 
48  |  | /// # Assertions in an NFA  | 
49  |  | ///  | 
50  |  | /// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be  | 
51  |  | /// thought of as a conditional epsilon transition. That is, a matching engine  | 
52  |  | /// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits  | 
53  |  | /// moving through conditional epsilon transitions when their condition  | 
54  |  | /// is satisfied at whatever position the `PikeVM` is currently at in the  | 
55  |  | /// haystack.  | 
56  |  | ///  | 
57  |  | /// How assertions are handled in a `DFA` is trickier, since a DFA does not  | 
58  |  | /// have epsilon transitions at all. In this case, they are compiled into the  | 
59  |  | /// automaton itself, at the expense of more states than what would be required  | 
60  |  | /// without an assertion.  | 
61  |  | #[derive(Clone, Copy, Debug, Eq, PartialEq)]  | 
62  |  | pub enum Look { | 
63  |  |     /// Match the beginning of text. Specifically, this matches at the starting  | 
64  |  |     /// position of the input.  | 
65  |  |     Start = 1 << 0,  | 
66  |  |     /// Match the end of text. Specifically, this matches at the ending  | 
67  |  |     /// position of the input.  | 
68  |  |     End = 1 << 1,  | 
69  |  |     /// Match the beginning of a line or the beginning of text. Specifically,  | 
70  |  |     /// this matches at the starting position of the input, or at the position  | 
71  |  |     /// immediately following a `\n` character.  | 
72  |  |     StartLF = 1 << 2,  | 
73  |  |     /// Match the end of a line or the end of text. Specifically, this matches  | 
74  |  |     /// at the end position of the input, or at the position immediately  | 
75  |  |     /// preceding a `\n` character.  | 
76  |  |     EndLF = 1 << 3,  | 
77  |  |     /// Match the beginning of a line or the beginning of text. Specifically,  | 
78  |  |     /// this matches at the starting position of the input, or at the position  | 
79  |  |     /// immediately following either a `\r` or `\n` character, but never after  | 
80  |  |     /// a `\r` when a `\n` follows.  | 
81  |  |     StartCRLF = 1 << 4,  | 
82  |  |     /// Match the end of a line or the end of text. Specifically, this matches  | 
83  |  |     /// at the end position of the input, or at the position immediately  | 
84  |  |     /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`  | 
85  |  |     /// precedes it.  | 
86  |  |     EndCRLF = 1 << 5,  | 
87  |  |     /// Match an ASCII-only word boundary. That is, this matches a position  | 
88  |  |     /// where the left adjacent character and right adjacent character  | 
89  |  |     /// correspond to a word and non-word or a non-word and word character.  | 
90  |  |     WordAscii = 1 << 6,  | 
91  |  |     /// Match an ASCII-only negation of a word boundary.  | 
92  |  |     WordAsciiNegate = 1 << 7,  | 
93  |  |     /// Match a Unicode-aware word boundary. That is, this matches a position  | 
94  |  |     /// where the left adjacent character and right adjacent character  | 
95  |  |     /// correspond to a word and non-word or a non-word and word character.  | 
96  |  |     WordUnicode = 1 << 8,  | 
97  |  |     /// Match a Unicode-aware negation of a word boundary.  | 
98  |  |     WordUnicodeNegate = 1 << 9,  | 
99  |  |     /// Match the start of an ASCII-only word boundary. That is, this matches a  | 
100  |  |     /// position at either the beginning of the haystack or where the previous  | 
101  |  |     /// character is not a word character and the following character is a word  | 
102  |  |     /// character.  | 
103  |  |     WordStartAscii = 1 << 10,  | 
104  |  |     /// Match the end of an ASCII-only word boundary. That is, this matches  | 
105  |  |     /// a position at either the end of the haystack or where the previous  | 
106  |  |     /// character is a word character and the following character is not a word  | 
107  |  |     /// character.  | 
108  |  |     WordEndAscii = 1 << 11,  | 
109  |  |     /// Match the start of a Unicode word boundary. That is, this matches a  | 
110  |  |     /// position at either the beginning of the haystack or where the previous  | 
111  |  |     /// character is not a word character and the following character is a word  | 
112  |  |     /// character.  | 
113  |  |     WordStartUnicode = 1 << 12,  | 
114  |  |     /// Match the end of a Unicode word boundary. That is, this matches a  | 
115  |  |     /// position at either the end of the haystack or where the previous  | 
116  |  |     /// character is a word character and the following character is not a word  | 
117  |  |     /// character.  | 
118  |  |     WordEndUnicode = 1 << 13,  | 
119  |  |     /// Match the start half of an ASCII-only word boundary. That is, this  | 
120  |  |     /// matches a position at either the beginning of the haystack or where the  | 
121  |  |     /// previous character is not a word character.  | 
122  |  |     WordStartHalfAscii = 1 << 14,  | 
123  |  |     /// Match the end half of an ASCII-only word boundary. That is, this  | 
124  |  |     /// matches a position at either the end of the haystack or where the  | 
125  |  |     /// following character is not a word character.  | 
126  |  |     WordEndHalfAscii = 1 << 15,  | 
127  |  |     /// Match the start half of a Unicode word boundary. That is, this matches  | 
128  |  |     /// a position at either the beginning of the haystack or where the  | 
129  |  |     /// previous character is not a word character.  | 
130  |  |     WordStartHalfUnicode = 1 << 16,  | 
131  |  |     /// Match the end half of a Unicode word boundary. That is, this matches  | 
132  |  |     /// a position at either the end of the haystack or where the following  | 
133  |  |     /// character is not a word character.  | 
134  |  |     WordEndHalfUnicode = 1 << 17,  | 
135  |  | }  | 
136  |  |  | 
137  |  | impl Look { | 
138  |  |     /// Flip the look-around assertion to its equivalent for reverse searches.  | 
139  |  |     /// For example, `StartLF` gets translated to `EndLF`.  | 
140  |  |     ///  | 
141  |  |     /// Some assertions, such as `WordUnicode`, remain the same since they  | 
142  |  |     /// match the same positions regardless of the direction of the search.  | 
143  |  |     #[inline]  | 
144  | 0  |     pub const fn reversed(self) -> Look { | 
145  | 0  |         match self { | 
146  | 0  |             Look::Start => Look::End,  | 
147  | 0  |             Look::End => Look::Start,  | 
148  | 0  |             Look::StartLF => Look::EndLF,  | 
149  | 0  |             Look::EndLF => Look::StartLF,  | 
150  | 0  |             Look::StartCRLF => Look::EndCRLF,  | 
151  | 0  |             Look::EndCRLF => Look::StartCRLF,  | 
152  | 0  |             Look::WordAscii => Look::WordAscii,  | 
153  | 0  |             Look::WordAsciiNegate => Look::WordAsciiNegate,  | 
154  | 0  |             Look::WordUnicode => Look::WordUnicode,  | 
155  | 0  |             Look::WordUnicodeNegate => Look::WordUnicodeNegate,  | 
156  | 0  |             Look::WordStartAscii => Look::WordEndAscii,  | 
157  | 0  |             Look::WordEndAscii => Look::WordStartAscii,  | 
158  | 0  |             Look::WordStartUnicode => Look::WordEndUnicode,  | 
159  | 0  |             Look::WordEndUnicode => Look::WordStartUnicode,  | 
160  | 0  |             Look::WordStartHalfAscii => Look::WordEndHalfAscii,  | 
161  | 0  |             Look::WordEndHalfAscii => Look::WordStartHalfAscii,  | 
162  | 0  |             Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,  | 
163  | 0  |             Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,  | 
164  |  |         }  | 
165  | 0  |     }  | 
166  |  |  | 
167  |  |     /// Return the underlying representation of this look-around enumeration  | 
168  |  |     /// as an integer. Giving the return value to the [`Look::from_repr`]  | 
169  |  |     /// constructor is guaranteed to return the same look-around variant that  | 
170  |  |     /// one started with within a semver compatible release of this crate.  | 
171  |  |     #[inline]  | 
172  | 0  |     pub const fn as_repr(self) -> u32 { | 
173  |  |         // AFAIK, 'as' is the only way to zero-cost convert an int enum to an  | 
174  |  |         // actual int.  | 
175  | 0  |         self as u32  | 
176  | 0  |     }  | 
177  |  |  | 
178  |  |     /// Given the underlying representation of a `Look` value, return the  | 
179  |  |     /// corresponding `Look` value if the representation is valid. Otherwise  | 
180  |  |     /// `None` is returned.  | 
181  |  |     #[inline]  | 
182  | 0  |     pub const fn from_repr(repr: u32) -> Option<Look> { | 
183  | 0  |         match repr { | 
184  | 0  |             0b00_0000_0000_0000_0001 => Some(Look::Start),  | 
185  | 0  |             0b00_0000_0000_0000_0010 => Some(Look::End),  | 
186  | 0  |             0b00_0000_0000_0000_0100 => Some(Look::StartLF),  | 
187  | 0  |             0b00_0000_0000_0000_1000 => Some(Look::EndLF),  | 
188  | 0  |             0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),  | 
189  | 0  |             0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),  | 
190  | 0  |             0b00_0000_0000_0100_0000 => Some(Look::WordAscii),  | 
191  | 0  |             0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),  | 
192  | 0  |             0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),  | 
193  | 0  |             0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),  | 
194  | 0  |             0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),  | 
195  | 0  |             0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),  | 
196  | 0  |             0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),  | 
197  | 0  |             0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),  | 
198  | 0  |             0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),  | 
199  | 0  |             0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),  | 
200  | 0  |             0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),  | 
201  | 0  |             0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),  | 
202  | 0  |             _ => None,  | 
203  |  |         }  | 
204  | 0  |     }  | 
205  |  |  | 
206  |  |     /// Returns a convenient single codepoint representation of this  | 
207  |  |     /// look-around assertion. Each assertion is guaranteed to be represented  | 
208  |  |     /// by a distinct character.  | 
209  |  |     ///  | 
210  |  |     /// This is useful for representing a look-around assertion in human  | 
211  |  |     /// friendly but succinct output intended for a programmer working on  | 
212  |  |     /// regex internals.  | 
213  |  |     #[inline]  | 
214  | 0  |     pub const fn as_char(self) -> char { | 
215  | 0  |         match self { | 
216  | 0  |             Look::Start => 'A',  | 
217  | 0  |             Look::End => 'z',  | 
218  | 0  |             Look::StartLF => '^',  | 
219  | 0  |             Look::EndLF => '$',  | 
220  | 0  |             Look::StartCRLF => 'r',  | 
221  | 0  |             Look::EndCRLF => 'R',  | 
222  | 0  |             Look::WordAscii => 'b',  | 
223  | 0  |             Look::WordAsciiNegate => 'B',  | 
224  | 0  |             Look::WordUnicode => '𝛃',  | 
225  | 0  |             Look::WordUnicodeNegate => '𝚩',  | 
226  | 0  |             Look::WordStartAscii => '<',  | 
227  | 0  |             Look::WordEndAscii => '>',  | 
228  | 0  |             Look::WordStartUnicode => '〈',  | 
229  | 0  |             Look::WordEndUnicode => '〉',  | 
230  | 0  |             Look::WordStartHalfAscii => '◁',  | 
231  | 0  |             Look::WordEndHalfAscii => '▷',  | 
232  | 0  |             Look::WordStartHalfUnicode => '◀',  | 
233  | 0  |             Look::WordEndHalfUnicode => '▶',  | 
234  |  |         }  | 
235  | 0  |     }  | 
236  |  | }  | 
237  |  |  | 
238  |  | /// LookSet is a memory-efficient set of look-around assertions.  | 
239  |  | ///  | 
240  |  | /// This is useful for efficiently tracking look-around assertions. For  | 
241  |  | /// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties  | 
242  |  | /// that return `LookSet`s.  | 
243  |  | #[derive(Clone, Copy, Default, Eq, PartialEq)]  | 
244  |  | pub struct LookSet { | 
245  |  |     /// The underlying representation of this set is exposed to make it  | 
246  |  |     /// possible to store it somewhere efficiently. The representation is that  | 
247  |  |     /// of a bitset, where each assertion occupies the single bit that is set  | 
248  |  |     /// in the value returned by `Look::as_repr()`.  | 
249  |  |     ///  | 
250  |  |     /// Note that users of this internal representation must permit the full  | 
251  |  |     /// range of `u32` values to be represented. For example, even if the  | 
252  |  |     /// current implementation only makes use of the 18 least significant bits,  | 
253  |  |     /// it may use more bits in a future semver compatible release.  | 
254  |  |     pub bits: u32,  | 
255  |  | }  | 
256  |  |  | 
257  |  | impl LookSet { | 
258  |  |     /// Create an empty set of look-around assertions.  | 
259  |  |     #[inline]  | 
260  | 0  |     pub fn empty() -> LookSet { | 
261  | 0  |         LookSet { bits: 0 } | 
262  | 0  |     }  | 
263  |  |  | 
264  |  |     /// Create a full set of look-around assertions.  | 
265  |  |     ///  | 
266  |  |     /// This set contains all possible look-around assertions.  | 
267  |  |     #[inline]  | 
268  | 0  |     pub fn full() -> LookSet { | 
269  | 0  |         LookSet { bits: !0 } | 
270  | 0  |     }  | 
271  |  |  | 
272  |  |     /// Create a look-around set containing the look-around assertion given.  | 
273  |  |     ///  | 
274  |  |     /// This is a convenience routine for creating an empty set and inserting  | 
275  |  |     /// one look-around assertion.  | 
276  |  |     #[inline]  | 
277  | 0  |     pub fn singleton(look: Look) -> LookSet { | 
278  | 0  |         LookSet::empty().insert(look)  | 
279  | 0  |     }  | 
280  |  |  | 
281  |  |     /// Returns the total number of look-around assertions in this set.  | 
282  |  |     #[inline]  | 
283  | 0  |     pub fn len(self) -> usize { | 
284  |  |         // OK because max value always fits in a u8, which in turn always  | 
285  |  |         // fits in a usize, regardless of target.  | 
286  | 0  |         usize::try_from(self.bits.count_ones()).unwrap()  | 
287  | 0  |     }  | 
288  |  |  | 
289  |  |     /// Returns true if and only if this set is empty.  | 
290  |  |     #[inline]  | 
291  | 0  |     pub fn is_empty(self) -> bool { | 
292  | 0  |         self.len() == 0  | 
293  | 0  |     }  | 
294  |  |  | 
295  |  |     /// Returns true if and only if the given look-around assertion is in this  | 
296  |  |     /// set.  | 
297  |  |     #[inline]  | 
298  | 0  |     pub fn contains(self, look: Look) -> bool { | 
299  | 0  |         self.bits & look.as_repr() != 0  | 
300  | 0  |     }  | 
301  |  |  | 
302  |  |     /// Returns true if and only if this set contains any anchor assertions.  | 
303  |  |     /// This includes both "start/end of haystack" and "start/end of line."  | 
304  |  |     #[inline]  | 
305  | 0  |     pub fn contains_anchor(&self) -> bool { | 
306  | 0  |         self.contains_anchor_haystack() || self.contains_anchor_line()  | 
307  | 0  |     }  | 
308  |  |  | 
309  |  |     /// Returns true if and only if this set contains any "start/end of  | 
310  |  |     /// haystack" anchors. This doesn't include "start/end of line" anchors.  | 
311  |  |     #[inline]  | 
312  | 0  |     pub fn contains_anchor_haystack(&self) -> bool { | 
313  | 0  |         self.contains(Look::Start) || self.contains(Look::End)  | 
314  | 0  |     }  | 
315  |  |  | 
316  |  |     /// Returns true if and only if this set contains any "start/end of line"  | 
317  |  |     /// anchors. This doesn't include "start/end of haystack" anchors. This  | 
318  |  |     /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.  | 
319  |  |     #[inline]  | 
320  | 0  |     pub fn contains_anchor_line(&self) -> bool { | 
321  | 0  |         self.contains(Look::StartLF)  | 
322  | 0  |             || self.contains(Look::EndLF)  | 
323  | 0  |             || self.contains(Look::StartCRLF)  | 
324  | 0  |             || self.contains(Look::EndCRLF)  | 
325  | 0  |     }  | 
326  |  |  | 
327  |  |     /// Returns true if and only if this set contains any "start/end of line"  | 
328  |  |     /// anchors that only treat `\n` as line terminators. This does not include  | 
329  |  |     /// haystack anchors or CRLF aware line anchors.  | 
330  |  |     #[inline]  | 
331  | 0  |     pub fn contains_anchor_lf(&self) -> bool { | 
332  | 0  |         self.contains(Look::StartLF) || self.contains(Look::EndLF)  | 
333  | 0  |     }  | 
334  |  |  | 
335  |  |     /// Returns true if and only if this set contains any "start/end of line"  | 
336  |  |     /// anchors that are CRLF-aware. This doesn't include "start/end of  | 
337  |  |     /// haystack" or "start/end of line-feed" anchors.  | 
338  |  |     #[inline]  | 
339  | 0  |     pub fn contains_anchor_crlf(&self) -> bool { | 
340  | 0  |         self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)  | 
341  | 0  |     }  | 
342  |  |  | 
343  |  |     /// Returns true if and only if this set contains any word boundary or  | 
344  |  |     /// negated word boundary assertions. This includes both Unicode and ASCII  | 
345  |  |     /// word boundaries.  | 
346  |  |     #[inline]  | 
347  | 0  |     pub fn contains_word(self) -> bool { | 
348  | 0  |         self.contains_word_unicode() || self.contains_word_ascii()  | 
349  | 0  |     }  | 
350  |  |  | 
351  |  |     /// Returns true if and only if this set contains any Unicode word boundary  | 
352  |  |     /// or negated Unicode word boundary assertions.  | 
353  |  |     #[inline]  | 
354  | 0  |     pub fn contains_word_unicode(self) -> bool { | 
355  | 0  |         self.contains(Look::WordUnicode)  | 
356  | 0  |             || self.contains(Look::WordUnicodeNegate)  | 
357  | 0  |             || self.contains(Look::WordStartUnicode)  | 
358  | 0  |             || self.contains(Look::WordEndUnicode)  | 
359  | 0  |             || self.contains(Look::WordStartHalfUnicode)  | 
360  | 0  |             || self.contains(Look::WordEndHalfUnicode)  | 
361  | 0  |     }  | 
362  |  |  | 
363  |  |     /// Returns true if and only if this set contains any ASCII word boundary  | 
364  |  |     /// or negated ASCII word boundary assertions.  | 
365  |  |     #[inline]  | 
366  | 0  |     pub fn contains_word_ascii(self) -> bool { | 
367  | 0  |         self.contains(Look::WordAscii)  | 
368  | 0  |             || self.contains(Look::WordAsciiNegate)  | 
369  | 0  |             || self.contains(Look::WordStartAscii)  | 
370  | 0  |             || self.contains(Look::WordEndAscii)  | 
371  | 0  |             || self.contains(Look::WordStartHalfAscii)  | 
372  | 0  |             || self.contains(Look::WordEndHalfAscii)  | 
373  | 0  |     }  | 
374  |  |  | 
375  |  |     /// Returns an iterator over all of the look-around assertions in this set.  | 
376  |  |     #[inline]  | 
377  | 0  |     pub fn iter(self) -> LookSetIter { | 
378  | 0  |         LookSetIter { set: self } | 
379  | 0  |     }  | 
380  |  |  | 
381  |  |     /// Return a new set that is equivalent to the original, but with the given  | 
382  |  |     /// assertion added to it. If the assertion is already in the set, then the  | 
383  |  |     /// returned set is equivalent to the original.  | 
384  |  |     #[inline]  | 
385  | 0  |     pub fn insert(self, look: Look) -> LookSet { | 
386  | 0  |         LookSet { bits: self.bits | look.as_repr() } | 
387  | 0  |     }  | 
388  |  |  | 
389  |  |     /// Updates this set in place with the result of inserting the given  | 
390  |  |     /// assertion into this set.  | 
391  |  |     #[inline]  | 
392  | 0  |     pub fn set_insert(&mut self, look: Look) { | 
393  | 0  |         *self = self.insert(look);  | 
394  | 0  |     }  | 
395  |  |  | 
396  |  |     /// Return a new set that is equivalent to the original, but with the given  | 
397  |  |     /// assertion removed from it. If the assertion is not in the set, then the  | 
398  |  |     /// returned set is equivalent to the original.  | 
399  |  |     #[inline]  | 
400  | 0  |     pub fn remove(self, look: Look) -> LookSet { | 
401  | 0  |         LookSet { bits: self.bits & !look.as_repr() } | 
402  | 0  |     }  | 
403  |  |  | 
404  |  |     /// Updates this set in place with the result of removing the given  | 
405  |  |     /// assertion from this set.  | 
406  |  |     #[inline]  | 
407  | 0  |     pub fn set_remove(&mut self, look: Look) { | 
408  | 0  |         *self = self.remove(look);  | 
409  | 0  |     }  | 
410  |  |  | 
411  |  |     /// Returns a new set that is the result of subtracting the given set from  | 
412  |  |     /// this set.  | 
413  |  |     #[inline]  | 
414  | 0  |     pub fn subtract(self, other: LookSet) -> LookSet { | 
415  | 0  |         LookSet { bits: self.bits & !other.bits } | 
416  | 0  |     }  | 
417  |  |  | 
418  |  |     /// Updates this set in place with the result of subtracting the given set  | 
419  |  |     /// from this set.  | 
420  |  |     #[inline]  | 
421  | 0  |     pub fn set_subtract(&mut self, other: LookSet) { | 
422  | 0  |         *self = self.subtract(other);  | 
423  | 0  |     }  | 
424  |  |  | 
425  |  |     /// Returns a new set that is the union of this and the one given.  | 
426  |  |     #[inline]  | 
427  | 0  |     pub fn union(self, other: LookSet) -> LookSet { | 
428  | 0  |         LookSet { bits: self.bits | other.bits } | 
429  | 0  |     }  | 
430  |  |  | 
431  |  |     /// Updates this set in place with the result of unioning it with the one  | 
432  |  |     /// given.  | 
433  |  |     #[inline]  | 
434  | 0  |     pub fn set_union(&mut self, other: LookSet) { | 
435  | 0  |         *self = self.union(other);  | 
436  | 0  |     }  | 
437  |  |  | 
438  |  |     /// Returns a new set that is the intersection of this and the one given.  | 
439  |  |     #[inline]  | 
440  | 0  |     pub fn intersect(self, other: LookSet) -> LookSet { | 
441  | 0  |         LookSet { bits: self.bits & other.bits } | 
442  | 0  |     }  | 
443  |  |  | 
444  |  |     /// Updates this set in place with the result of intersecting it with the  | 
445  |  |     /// one given.  | 
446  |  |     #[inline]  | 
447  | 0  |     pub fn set_intersect(&mut self, other: LookSet) { | 
448  | 0  |         *self = self.intersect(other);  | 
449  | 0  |     }  | 
450  |  |  | 
451  |  |     /// Return a `LookSet` from the slice given as a native endian 32-bit  | 
452  |  |     /// integer.  | 
453  |  |     ///  | 
454  |  |     /// # Panics  | 
455  |  |     ///  | 
456  |  |     /// This panics if `slice.len() < 4`.  | 
457  |  |     #[inline]  | 
458  | 0  |     pub fn read_repr(slice: &[u8]) -> LookSet { | 
459  | 0  |         let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());  | 
460  | 0  |         LookSet { bits } | 
461  | 0  |     }  | 
462  |  |  | 
463  |  |     /// Write a `LookSet` as a native endian 32-bit integer to the beginning  | 
464  |  |     /// of the slice given.  | 
465  |  |     ///  | 
466  |  |     /// # Panics  | 
467  |  |     ///  | 
468  |  |     /// This panics if `slice.len() < 4`.  | 
469  |  |     #[inline]  | 
470  | 0  |     pub fn write_repr(self, slice: &mut [u8]) { | 
471  | 0  |         let raw = self.bits.to_ne_bytes();  | 
472  | 0  |         slice[0] = raw[0];  | 
473  | 0  |         slice[1] = raw[1];  | 
474  | 0  |         slice[2] = raw[2];  | 
475  | 0  |         slice[3] = raw[3];  | 
476  | 0  |     }  | 
477  |  |  | 
478  |  |     /// Checks that all assertions in this set can be matched.  | 
479  |  |     ///  | 
480  |  |     /// Some assertions, such as Unicode word boundaries, require optional (but  | 
481  |  |     /// enabled by default) tables that may not be available. If there are  | 
482  |  |     /// assertions in this set that require tables that are not available, then  | 
483  |  |     /// this will return an error.  | 
484  |  |     ///  | 
485  |  |     /// Specifically, this returns an error when the  | 
486  |  |     /// `unicode-word-boundary` feature is _not_ enabled _and_ this set  | 
487  |  |     /// contains a Unicode word boundary assertion.  | 
488  |  |     ///  | 
489  |  |     /// It can be useful to use this on the result of  | 
490  |  |     /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)  | 
491  |  |     /// when building a matcher engine to ensure methods like  | 
492  |  |     /// [`LookMatcher::matches_set`] do not panic at search time.  | 
493  | 0  |     pub fn available(self) -> Result<(), UnicodeWordBoundaryError> { | 
494  | 0  |         if self.contains_word_unicode() { | 
495  | 0  |             UnicodeWordBoundaryError::check()?;  | 
496  | 0  |         }  | 
497  | 0  |         Ok(())  | 
498  | 0  |     }  | 
499  |  | }  | 
500  |  |  | 
// Debug output is "∅" for the empty set; otherwise it is the concatenation of
// `Look::as_char` for every assertion yielded by `LookSet::iter`.
501  |  | impl core::fmt::Debug for LookSet { | 
502  | 0  |     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { | 
503  | 0  |         if self.is_empty() { | 
504  | 0  |             return write!(f, "∅");  | 
505  | 0  |         }  | 
506  | 0  |         for look in self.iter() { | 
507  | 0  |             write!(f, "{}", look.as_char())?; | 
508  |  |         }  | 
509  | 0  |         Ok(())  | 
510  | 0  |     }  | 
511  |  | }  | 
512  |  |  | 
513  |  | /// An iterator over all look-around assertions in a [`LookSet`].  | 
514  |  | ///  | 
515  |  | /// Created by [`LookSet::iter`]; yields assertions from the lowest bit up.  | 
516  |  | #[derive(Clone, Debug)]  | 
517  |  | pub struct LookSetIter { | 
518  |  |     set: LookSet,  | 
519  |  | }  | 
520  |  |  | 
521  |  | impl Iterator for LookSetIter { | 
522  |  |     type Item = Look;  | 
523  |  |  | 
524  |  |     #[inline]  | 
525  | 0  |     fn next(&mut self) -> Option<Look> { | 
526  | 0  |         if self.set.is_empty() { | 
527  | 0  |             return None;  | 
528  | 0  |         }  | 
529  |  |         // 'bits' is a non-zero u32 here, so 'trailing_zeros' is at most 31 and  | 
530  |  |         // 'bit' will always fit into a u16.  | 
531  | 0  |         let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();  | 
532  | 0  |         let look = Look::from_repr(1 << bit)?;  | 
533  | 0  |         self.set = self.set.remove(look);  | 
534  | 0  |         Some(look)  | 
535  | 0  |     }  | 
536  |  | }  | 
537  |  |  | 
538  |  | /// A matcher for look-around assertions.  | 
539  |  | ///  | 
540  |  | /// This matcher permits configuring aspects of how look-around assertions are  | 
541  |  | /// matched. At present, that is the line terminator byte.  | 
542  |  | ///  | 
543  |  | /// # Example  | 
544  |  | ///  | 
545  |  | /// A `LookMatcher` can change the line terminator used for matching multi-line  | 
546  |  | /// anchors such as `(?m:^)` and `(?m:$)`.  | 
547  |  | ///  | 
548  |  | /// ```  | 
549  |  | /// use regex_automata::{ | 
550  |  | ///     nfa::thompson::{self, pikevm::PikeVM}, | 
551  |  | ///     util::look::LookMatcher,  | 
552  |  | ///     Match, Input,  | 
553  |  | /// };  | 
554  |  | ///  | 
555  |  | /// let mut lookm = LookMatcher::new();  | 
556  |  | /// lookm.set_line_terminator(b'\x00');  | 
557  |  | ///  | 
558  |  | /// let re = PikeVM::builder()  | 
559  |  | ///     .thompson(thompson::Config::new().look_matcher(lookm))  | 
560  |  | ///     .build(r"(?m)^[a-z]+$")?;  | 
561  |  | /// let mut cache = re.create_cache();  | 
562  |  | ///  | 
563  |  | /// // Multi-line assertions now use NUL as a terminator.  | 
564  |  | /// assert_eq!(  | 
565  |  | ///     Some(Match::must(0, 1..4)),  | 
566  |  | ///     re.find(&mut cache, b"\x00abc\x00"),  | 
567  |  | /// );  | 
568  |  | /// // ... and \n is no longer recognized as a terminator.  | 
569  |  | /// assert_eq!(  | 
570  |  | ///     None,  | 
571  |  | ///     re.find(&mut cache, b"\nabc\n"),  | 
572  |  | /// );  | 
573  |  | ///  | 
574  |  | /// # Ok::<(), Box<dyn std::error::Error>>(())  | 
575  |  | /// ```  | 
576  |  | #[derive(Clone, Debug)]  | 
577  |  | pub struct LookMatcher { | 
578  |  |     lineterm: DebugByte,  | 
579  |  | }  | 
580  |  |  | 
581  |  | impl LookMatcher { | 
582  |  |     /// Creates a new default look-around matcher with `\n` as line terminator.  | 
583  | 0  |     pub fn new() -> LookMatcher { | 
584  | 0  |         LookMatcher { lineterm: DebugByte(b'\n') } | 
585  | 0  |     }  | 
586  |  |  | 
587  |  |     /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.  | 
588  |  |     ///  | 
589  |  |     /// Namely, instead of `^` matching after `\n` and `$` matching immediately  | 
590  |  |     /// before a `\n`, this will cause them to match after and before the byte  | 
591  |  |     /// given.  | 
592  |  |     ///  | 
593  |  |     /// It can occasionally be useful to use this to configure the line  | 
594  |  |     /// terminator to the NUL byte when searching binary data.  | 
595  |  |     ///  | 
596  |  |     /// Note that this does not apply to CRLF-aware line anchors such as  | 
597  |  |     /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to  | 
598  |  |     /// use `\r` and `\n`.  | 
599  | 0  |     pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher { | 
600  | 0  |         self.lineterm.0 = byte;  | 
601  | 0  |         self  | 
602  | 0  |     }  | 
603  |  |  | 
604  |  |     /// Returns the line terminator that was configured for this matcher.  | 
605  |  |     ///  | 
606  |  |     /// If no line terminator was configured, then this returns `\n`.  | 
607  |  |     ///  | 
608  |  |     /// Note that the line terminator should only be used for matching `(?m:^)`  | 
609  |  |     /// and `(?m:$)` assertions. It specifically should _not_ be used for  | 
610  |  |     /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.  | 
611  | 0  |     pub fn get_line_terminator(&self) -> u8 { | 
612  | 0  |         self.lineterm.0  | 
613  | 0  |     }  | 
614  |  |  | 
615  |  |     /// Returns true when the position `at` in `haystack` satisfies the given  | 
616  |  |     /// look-around assertion.  | 
617  |  |     ///  | 
618  |  |     /// # Panics  | 
619  |  |     ///  | 
620  |  |     /// This panics when testing any Unicode word boundary assertion in this  | 
621  |  |     /// set and when the Unicode word data is not available. Specifically, this  | 
622  |  |     /// only occurs when the `unicode-word-boundary` feature is not enabled.  | 
623  |  |     ///  | 
624  |  |     /// Since it's generally expected that this routine is called inside of  | 
625  |  |     /// a matching engine, callers should check the error condition when  | 
626  |  |     /// building the matching engine. If there is a Unicode word boundary  | 
627  |  |     /// in the matcher and the data isn't available, then the matcher should  | 
628  |  |     /// fail to build.  | 
629  |  |     ///  | 
630  |  |     /// Callers can check the error condition with [`LookSet::available`].  | 
631  |  |     ///  | 
632  |  |     /// This also may panic when `at > haystack.len()`. Note that `at ==  | 
633  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
634  |  |     #[inline]  | 
635  | 0  |     pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool { | 
636  | 0  |         self.matches_inline(look, haystack, at)  | 
637  | 0  |     }  | 
638  |  |  | 
639  |  |     /// Like `matches`, but forcefully inlined.  | 
640  |  |     ///  | 
641  |  |     /// # Panics  | 
642  |  |     ///  | 
643  |  |     /// This panics when testing any Unicode word boundary assertion in this  | 
644  |  |     /// set and when the Unicode word data is not available. Specifically, this  | 
645  |  |     /// only occurs when the `unicode-word-boundary` feature is not enabled.  | 
646  |  |     ///  | 
647  |  |     /// Since it's generally expected that this routine is called inside of  | 
648  |  |     /// a matching engine, callers should check the error condition when  | 
649  |  |     /// building the matching engine. If there is a Unicode word boundary  | 
650  |  |     /// in the matcher and the data isn't available, then the matcher should  | 
651  |  |     /// fail to build.  | 
652  |  |     ///  | 
653  |  |     /// Callers can check the error condition with [`LookSet::available`].  | 
654  |  |     ///  | 
655  |  |     /// This also may panic when `at > haystack.len()`. Note that `at ==  | 
656  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
657  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
658  | 0  |     pub(crate) fn matches_inline(  | 
659  | 0  |         &self,  | 
660  | 0  |         look: Look,  | 
661  | 0  |         haystack: &[u8],  | 
662  | 0  |         at: usize,  | 
663  | 0  |     ) -> bool { | 
664  | 0  |         match look { | 
665  | 0  |             Look::Start => self.is_start(haystack, at),  | 
666  | 0  |             Look::End => self.is_end(haystack, at),  | 
667  | 0  |             Look::StartLF => self.is_start_lf(haystack, at),  | 
668  | 0  |             Look::EndLF => self.is_end_lf(haystack, at),  | 
669  | 0  |             Look::StartCRLF => self.is_start_crlf(haystack, at),  | 
670  | 0  |             Look::EndCRLF => self.is_end_crlf(haystack, at),  | 
671  | 0  |             Look::WordAscii => self.is_word_ascii(haystack, at),  | 
672  | 0  |             Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),  | 
673  | 0  |             Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),  | 
674  |  |             Look::WordUnicodeNegate => { | 
675  | 0  |                 self.is_word_unicode_negate(haystack, at).unwrap()  | 
676  |  |             }  | 
677  | 0  |             Look::WordStartAscii => self.is_word_start_ascii(haystack, at),  | 
678  | 0  |             Look::WordEndAscii => self.is_word_end_ascii(haystack, at),  | 
679  |  |             Look::WordStartUnicode => { | 
680  | 0  |                 self.is_word_start_unicode(haystack, at).unwrap()  | 
681  |  |             }  | 
682  |  |             Look::WordEndUnicode => { | 
683  | 0  |                 self.is_word_end_unicode(haystack, at).unwrap()  | 
684  |  |             }  | 
685  |  |             Look::WordStartHalfAscii => { | 
686  | 0  |                 self.is_word_start_half_ascii(haystack, at)  | 
687  |  |             }  | 
688  |  |             Look::WordEndHalfAscii => { | 
689  | 0  |                 self.is_word_end_half_ascii(haystack, at)  | 
690  |  |             }  | 
691  |  |             Look::WordStartHalfUnicode => { | 
692  | 0  |                 self.is_word_start_half_unicode(haystack, at).unwrap()  | 
693  |  |             }  | 
694  |  |             Look::WordEndHalfUnicode => { | 
695  | 0  |                 self.is_word_end_half_unicode(haystack, at).unwrap()  | 
696  |  |             }  | 
697  |  |         }  | 
698  | 0  |     }  | 
699  |  |  | 
700  |  |     /// Returns true when _all_ of the assertions in the given set match at the  | 
701  |  |     /// given position in the haystack.  | 
702  |  |     ///  | 
703  |  |     /// # Panics  | 
704  |  |     ///  | 
705  |  |     /// This panics when testing any Unicode word boundary assertion in this  | 
706  |  |     /// set and when the Unicode word data is not available. Specifically, this  | 
707  |  |     /// only occurs when the `unicode-word-boundary` feature is not enabled.  | 
708  |  |     ///  | 
709  |  |     /// Since it's generally expected that this routine is called inside of  | 
710  |  |     /// a matching engine, callers should check the error condition when  | 
711  |  |     /// building the matching engine. If there is a Unicode word boundary  | 
712  |  |     /// in the matcher and the data isn't available, then the matcher should  | 
713  |  |     /// fail to build.  | 
714  |  |     ///  | 
715  |  |     /// Callers can check the error condition with [`LookSet::available`].  | 
716  |  |     ///  | 
717  |  |     /// This also may panic when `at > haystack.len()`. Note that `at ==  | 
718  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
719  |  |     #[inline]  | 
720  | 0  |     pub fn matches_set(  | 
721  | 0  |         &self,  | 
722  | 0  |         set: LookSet,  | 
723  | 0  |         haystack: &[u8],  | 
724  | 0  |         at: usize,  | 
725  | 0  |     ) -> bool { | 
726  | 0  |         self.matches_set_inline(set, haystack, at)  | 
727  | 0  |     }  | 
728  |  |  | 
729  |  |     /// Like `LookSet::matches`, but forcefully inlined for perf.  | 
730  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
731  | 0  |     pub(crate) fn matches_set_inline(  | 
732  | 0  |         &self,  | 
733  | 0  |         set: LookSet,  | 
734  | 0  |         haystack: &[u8],  | 
735  | 0  |         at: usize,  | 
736  | 0  |     ) -> bool { | 
737  |  |         // This used to use LookSet::iter with Look::matches on each element,  | 
738  |  |         // but that proved to be quite disastrous for perf. The manual "if  | 
739  |  |         // the set has this assertion, check it" turns out to be quite a bit  | 
740  |  |         // faster.  | 
741  | 0  |         if set.contains(Look::Start) { | 
742  | 0  |             if !self.is_start(haystack, at) { | 
743  | 0  |                 return false;  | 
744  | 0  |             }  | 
745  | 0  |         }  | 
746  | 0  |         if set.contains(Look::End) { | 
747  | 0  |             if !self.is_end(haystack, at) { | 
748  | 0  |                 return false;  | 
749  | 0  |             }  | 
750  | 0  |         }  | 
751  | 0  |         if set.contains(Look::StartLF) { | 
752  | 0  |             if !self.is_start_lf(haystack, at) { | 
753  | 0  |                 return false;  | 
754  | 0  |             }  | 
755  | 0  |         }  | 
756  | 0  |         if set.contains(Look::EndLF) { | 
757  | 0  |             if !self.is_end_lf(haystack, at) { | 
758  | 0  |                 return false;  | 
759  | 0  |             }  | 
760  | 0  |         }  | 
761  | 0  |         if set.contains(Look::StartCRLF) { | 
762  | 0  |             if !self.is_start_crlf(haystack, at) { | 
763  | 0  |                 return false;  | 
764  | 0  |             }  | 
765  | 0  |         }  | 
766  | 0  |         if set.contains(Look::EndCRLF) { | 
767  | 0  |             if !self.is_end_crlf(haystack, at) { | 
768  | 0  |                 return false;  | 
769  | 0  |             }  | 
770  | 0  |         }  | 
771  | 0  |         if set.contains(Look::WordAscii) { | 
772  | 0  |             if !self.is_word_ascii(haystack, at) { | 
773  | 0  |                 return false;  | 
774  | 0  |             }  | 
775  | 0  |         }  | 
776  | 0  |         if set.contains(Look::WordAsciiNegate) { | 
777  | 0  |             if !self.is_word_ascii_negate(haystack, at) { | 
778  | 0  |                 return false;  | 
779  | 0  |             }  | 
780  | 0  |         }  | 
781  | 0  |         if set.contains(Look::WordUnicode) { | 
782  | 0  |             if !self.is_word_unicode(haystack, at).unwrap() { | 
783  | 0  |                 return false;  | 
784  | 0  |             }  | 
785  | 0  |         }  | 
786  | 0  |         if set.contains(Look::WordUnicodeNegate) { | 
787  | 0  |             if !self.is_word_unicode_negate(haystack, at).unwrap() { | 
788  | 0  |                 return false;  | 
789  | 0  |             }  | 
790  | 0  |         }  | 
791  | 0  |         if set.contains(Look::WordStartAscii) { | 
792  | 0  |             if !self.is_word_start_ascii(haystack, at) { | 
793  | 0  |                 return false;  | 
794  | 0  |             }  | 
795  | 0  |         }  | 
796  | 0  |         if set.contains(Look::WordEndAscii) { | 
797  | 0  |             if !self.is_word_end_ascii(haystack, at) { | 
798  | 0  |                 return false;  | 
799  | 0  |             }  | 
800  | 0  |         }  | 
801  | 0  |         if set.contains(Look::WordStartUnicode) { | 
802  | 0  |             if !self.is_word_start_unicode(haystack, at).unwrap() { | 
803  | 0  |                 return false;  | 
804  | 0  |             }  | 
805  | 0  |         }  | 
806  | 0  |         if set.contains(Look::WordEndUnicode) { | 
807  | 0  |             if !self.is_word_end_unicode(haystack, at).unwrap() { | 
808  | 0  |                 return false;  | 
809  | 0  |             }  | 
810  | 0  |         }  | 
811  | 0  |         if set.contains(Look::WordStartHalfAscii) { | 
812  | 0  |             if !self.is_word_start_half_ascii(haystack, at) { | 
813  | 0  |                 return false;  | 
814  | 0  |             }  | 
815  | 0  |         }  | 
816  | 0  |         if set.contains(Look::WordEndHalfAscii) { | 
817  | 0  |             if !self.is_word_end_half_ascii(haystack, at) { | 
818  | 0  |                 return false;  | 
819  | 0  |             }  | 
820  | 0  |         }  | 
821  | 0  |         if set.contains(Look::WordStartHalfUnicode) { | 
822  | 0  |             if !self.is_word_start_half_unicode(haystack, at).unwrap() { | 
823  | 0  |                 return false;  | 
824  | 0  |             }  | 
825  | 0  |         }  | 
826  | 0  |         if set.contains(Look::WordEndHalfUnicode) { | 
827  | 0  |             if !self.is_word_end_half_unicode(haystack, at).unwrap() { | 
828  | 0  |                 return false;  | 
829  | 0  |             }  | 
830  | 0  |         }  | 
831  | 0  |         true  | 
832  | 0  |     }  | 
833  |  |  | 
834  |  |     /// Split up the given byte classes into equivalence classes in a way that  | 
835  |  |     /// is consistent with this look-around assertion.  | 
836  |  |     #[cfg(feature = "alloc")]  | 
837  | 0  |     pub(crate) fn add_to_byteset(  | 
838  | 0  |         &self,  | 
839  | 0  |         look: Look,  | 
840  | 0  |         set: &mut crate::util::alphabet::ByteClassSet,  | 
841  | 0  |     ) { | 
842  | 0  |         match look { | 
843  | 0  |             Look::Start | Look::End => {} | 
844  | 0  |             Look::StartLF | Look::EndLF => { | 
845  | 0  |                 set.set_range(self.lineterm.0, self.lineterm.0);  | 
846  | 0  |             }  | 
847  | 0  |             Look::StartCRLF | Look::EndCRLF => { | 
848  | 0  |                 set.set_range(b'\r', b'\r');  | 
849  | 0  |                 set.set_range(b'\n', b'\n');  | 
850  | 0  |             }  | 
851  |  |             Look::WordAscii  | 
852  |  |             | Look::WordAsciiNegate  | 
853  |  |             | Look::WordUnicode  | 
854  |  |             | Look::WordUnicodeNegate  | 
855  |  |             | Look::WordStartAscii  | 
856  |  |             | Look::WordEndAscii  | 
857  |  |             | Look::WordStartUnicode  | 
858  |  |             | Look::WordEndUnicode  | 
859  |  |             | Look::WordStartHalfAscii  | 
860  |  |             | Look::WordEndHalfAscii  | 
861  |  |             | Look::WordStartHalfUnicode  | 
862  |  |             | Look::WordEndHalfUnicode => { | 
863  |  |                 // We need to mark all ranges of bytes whose pairs result in  | 
864  |  |                 // evaluating \b differently. This isn't technically correct  | 
865  |  |                 // for Unicode word boundaries, but DFAs can't handle those  | 
866  |  |                 // anyway, and thus, the byte classes don't need to either  | 
867  |  |                 // since they are themselves only used in DFAs.  | 
868  |  |                 //  | 
869  |  |                 // FIXME: It seems like the calls to 'set_range' here are  | 
870  |  |                 // completely invariant, which means we could just hard-code  | 
871  |  |                 // them here without needing to write a loop. And we only need  | 
872  |  |                 // to do this dance at most once per regex.  | 
873  |  |                 //  | 
874  |  |                 // FIXME: Is this correct for \B?  | 
875  | 0  |                 let iswb = utf8::is_word_byte;  | 
876  |  |                 // This unwrap is OK because we guard every use of 'asu8' with  | 
877  |  |                 // a check that the input is <= 255.  | 
878  | 0  |                 let asu8 = |b: u16| u8::try_from(b).unwrap();  | 
879  | 0  |                 let mut b1: u16 = 0;  | 
880  |  |                 let mut b2: u16;  | 
881  | 0  |                 while b1 <= 255 { | 
882  | 0  |                     b2 = b1 + 1;  | 
883  | 0  |                     while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) { | 
884  | 0  |                         b2 += 1;  | 
885  | 0  |                     }  | 
886  |  |                     // The guards above guarantee that b2 can never get any  | 
887  |  |                     // bigger.  | 
888  | 0  |                     assert!(b2 <= 256);  | 
889  |  |                     // Subtracting 1 from b2 is always OK because it is always  | 
890  |  |                     // at least 1 greater than b1, and the assert above  | 
891  |  |                     // guarantees that the asu8 conversion will succeed.  | 
892  | 0  |                     set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap()));  | 
893  | 0  |                     b1 = b2;  | 
894  |  |                 }  | 
895  |  |             }  | 
896  |  |         }  | 
897  | 0  |     }  | 
898  |  |  | 
899  |  |     /// Returns true when [`Look::Start`] is satisfied `at` the given position  | 
900  |  |     /// in `haystack`.  | 
901  |  |     ///  | 
902  |  |     /// # Panics  | 
903  |  |     ///  | 
904  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
905  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
906  |  |     #[inline]  | 
907  | 0  |     pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool { | 
908  | 0  |         at == 0  | 
909  | 0  |     }  | 
910  |  |  | 
911  |  |     /// Returns true when [`Look::End`] is satisfied `at` the given position in  | 
912  |  |     /// `haystack`.  | 
913  |  |     ///  | 
914  |  |     /// # Panics  | 
915  |  |     ///  | 
916  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
917  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
918  |  |     #[inline]  | 
919  | 0  |     pub fn is_end(&self, haystack: &[u8], at: usize) -> bool { | 
920  | 0  |         at == haystack.len()  | 
921  | 0  |     }  | 
922  |  |  | 
923  |  |     /// Returns true when [`Look::StartLF`] is satisfied `at` the given  | 
924  |  |     /// position in `haystack`.  | 
925  |  |     ///  | 
926  |  |     /// # Panics  | 
927  |  |     ///  | 
928  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
929  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
930  |  |     #[inline]  | 
931  | 0  |     pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool { | 
932  | 0  |         self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0  | 
933  | 0  |     }  | 
934  |  |  | 
935  |  |     /// Returns true when [`Look::EndLF`] is satisfied `at` the given position  | 
936  |  |     /// in `haystack`.  | 
937  |  |     ///  | 
938  |  |     /// # Panics  | 
939  |  |     ///  | 
940  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
941  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
942  |  |     #[inline]  | 
943  | 0  |     pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool { | 
944  | 0  |         self.is_end(haystack, at) || haystack[at] == self.lineterm.0  | 
945  | 0  |     }  | 
946  |  |  | 
947  |  |     /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given  | 
948  |  |     /// position in `haystack`.  | 
949  |  |     ///  | 
950  |  |     /// # Panics  | 
951  |  |     ///  | 
952  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
953  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
954  |  |     #[inline]  | 
955  | 0  |     pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool { | 
956  | 0  |         self.is_start(haystack, at)  | 
957  | 0  |             || haystack[at - 1] == b'\n'  | 
958  | 0  |             || (haystack[at - 1] == b'\r'  | 
959  | 0  |                 && (at >= haystack.len() || haystack[at] != b'\n'))  | 
960  | 0  |     }  | 
961  |  |  | 
962  |  |     /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given  | 
963  |  |     /// position in `haystack`.  | 
964  |  |     ///  | 
965  |  |     /// # Panics  | 
966  |  |     ///  | 
967  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
968  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
969  |  |     #[inline]  | 
970  | 0  |     pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool { | 
971  | 0  |         self.is_end(haystack, at)  | 
972  | 0  |             || haystack[at] == b'\r'  | 
973  | 0  |             || (haystack[at] == b'\n'  | 
974  | 0  |                 && (at == 0 || haystack[at - 1] != b'\r'))  | 
975  | 0  |     }  | 
976  |  |  | 
977  |  |     /// Returns true when [`Look::WordAscii`] is satisfied `at` the given  | 
978  |  |     /// position in `haystack`.  | 
979  |  |     ///  | 
980  |  |     /// # Panics  | 
981  |  |     ///  | 
982  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
983  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
984  |  |     #[inline]  | 
985  | 0  |     pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool { | 
986  | 0  |         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);  | 
987  | 0  |         let word_after =  | 
988  | 0  |             at < haystack.len() && utf8::is_word_byte(haystack[at]);  | 
989  | 0  |         word_before != word_after  | 
990  | 0  |     }  | 
991  |  |  | 
992  |  |     /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given  | 
993  |  |     /// position in `haystack`.  | 
994  |  |     ///  | 
995  |  |     /// # Panics  | 
996  |  |     ///  | 
997  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
998  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
999  |  |     #[inline]  | 
1000  | 0  |     pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool { | 
1001  | 0  |         !self.is_word_ascii(haystack, at)  | 
1002  | 0  |     }  | 
1003  |  |  | 
1004  |  |     /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given  | 
1005  |  |     /// position in `haystack`.  | 
1006  |  |     ///  | 
1007  |  |     /// # Panics  | 
1008  |  |     ///  | 
1009  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1010  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1011  |  |     ///  | 
1012  |  |     /// # Errors  | 
1013  |  |     ///  | 
1014  |  |     /// This returns an error when Unicode word boundary tables  | 
1015  |  |     /// are not available. Specifically, this only occurs when the  | 
1016  |  |     /// `unicode-word-boundary` feature is not enabled.  | 
1017  |  |     #[inline]  | 
1018  | 0  |     pub fn is_word_unicode(  | 
1019  | 0  |         &self,  | 
1020  | 0  |         haystack: &[u8],  | 
1021  | 0  |         at: usize,  | 
1022  | 0  |     ) -> Result<bool, UnicodeWordBoundaryError> { | 
1023  | 0  |         let word_before = is_word_char::rev(haystack, at)?;  | 
1024  | 0  |         let word_after = is_word_char::fwd(haystack, at)?;  | 
1025  | 0  |         Ok(word_before != word_after)  | 
1026  | 0  |     }  | 
1027  |  |  | 
1028  |  |     /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the  | 
1029  |  |     /// given position in `haystack`.  | 
1030  |  |     ///  | 
1031  |  |     /// # Panics  | 
1032  |  |     ///  | 
1033  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1034  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1035  |  |     ///  | 
1036  |  |     /// # Errors  | 
1037  |  |     ///  | 
1038  |  |     /// This returns an error when Unicode word boundary tables  | 
1039  |  |     /// are not available. Specifically, this only occurs when the  | 
1040  |  |     /// `unicode-word-boundary` feature is not enabled.  | 
1041  |  |     #[inline]  | 
1042  | 0  |     pub fn is_word_unicode_negate(  | 
1043  | 0  |         &self,  | 
1044  | 0  |         haystack: &[u8],  | 
1045  | 0  |         at: usize,  | 
1046  | 0  |     ) -> Result<bool, UnicodeWordBoundaryError> { | 
1047  |  |         // This is pretty subtle. Why do we need to do UTF-8 decoding here?  | 
1048  |  |         // Well... at time of writing, the is_word_char_{fwd,rev} routines will | 
1049  |  |         // only return true if there is a valid UTF-8 encoding of a "word"  | 
1050  |  |         // codepoint, and false in every other case (including invalid UTF-8).  | 
1051  |  |         // This means that in regions of invalid UTF-8 (which might be a  | 
1052  |  |         // subset of valid UTF-8!), it would result in \B matching. While this  | 
1053  |  |         // would be questionable in the context of truly invalid UTF-8, it is  | 
1054  |  |         // *certainly* wrong to report match boundaries that split the encoding  | 
1055  |  |         // of a codepoint. So to work around this, we ensure that we can decode  | 
1056  |  |         // a codepoint on either side of `at`. If either direction fails, then  | 
1057  |  |         // we don't permit \B to match at all.  | 
1058  |  |         //  | 
1059  |  |         // Now, this isn't exactly optimal from a perf perspective. We could  | 
1060  |  |         // try and detect this in is_word_char::{fwd,rev}, but it's not clear | 
1061  |  |         // if it's worth it. \B is, after all, rarely used. Even worse,  | 
1062  |  |         // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this | 
1063  |  |         // will wind up doing UTF-8 decoding twice. Ouch. We could fix this  | 
1064  |  |         // with more code complexity, but it just doesn't feel worth it for \B.  | 
1065  |  |         //  | 
1066  |  |         // And in particular, we do *not* have to do this with \b, because \b  | 
1067  |  |         // *requires* that at least one side of `at` be a "word" codepoint,  | 
1068  |  |         // which in turn implies one side of `at` must be valid UTF-8. This in  | 
1069  |  |         // turn implies that \b can never split a valid UTF-8 encoding of a  | 
1070  |  |         // codepoint. In the case where one side of `at` is truly invalid UTF-8  | 
1071  |  |         // and the other side IS a word codepoint, then we want \b to match  | 
1072  |  |         // since it represents a valid UTF-8 boundary. It also makes sense. For  | 
1073  |  |         // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.  | 
1074  |  |         //  | 
1075  |  |         // Note also that this is not just '!is_word_unicode(..)' like it is  | 
1076  |  |         // for the ASCII case. For example, neither \b nor \B is satisfied  | 
1077  |  |         // within invalid UTF-8 sequences.  | 
1078  | 0  |         let word_before = at > 0  | 
1079  | 0  |             && match utf8::decode_last(&haystack[..at]) { | 
1080  | 0  |                 None | Some(Err(_)) => return Ok(false),  | 
1081  | 0  |                 Some(Ok(_)) => is_word_char::rev(haystack, at)?,  | 
1082  |  |             };  | 
1083  | 0  |         let word_after = at < haystack.len()  | 
1084  | 0  |             && match utf8::decode(&haystack[at..]) { | 
1085  | 0  |                 None | Some(Err(_)) => return Ok(false),  | 
1086  | 0  |                 Some(Ok(_)) => is_word_char::fwd(haystack, at)?,  | 
1087  |  |             };  | 
1088  | 0  |         Ok(word_before == word_after)  | 
1089  | 0  |     }  | 
1090  |  |  | 
1091  |  |     /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given  | 
1092  |  |     /// position in `haystack`.  | 
1093  |  |     ///  | 
1094  |  |     /// # Panics  | 
1095  |  |     ///  | 
1096  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1097  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1098  |  |     #[inline]  | 
1099  | 0  |     pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { | 
1100  | 0  |         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);  | 
1101  | 0  |         let word_after =  | 
1102  | 0  |             at < haystack.len() && utf8::is_word_byte(haystack[at]);  | 
1103  | 0  |         !word_before && word_after  | 
1104  | 0  |     }  | 
1105  |  |  | 
1106  |  |     /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given  | 
1107  |  |     /// position in `haystack`.  | 
1108  |  |     ///  | 
1109  |  |     /// # Panics  | 
1110  |  |     ///  | 
1111  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1112  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1113  |  |     #[inline]  | 
1114  | 0  |     pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool { | 
1115  | 0  |         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);  | 
1116  | 0  |         let word_after =  | 
1117  | 0  |             at < haystack.len() && utf8::is_word_byte(haystack[at]);  | 
1118  | 0  |         word_before && !word_after  | 
1119  | 0  |     }  | 
1120  |  |  | 
1121  |  |     /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the  | 
1122  |  |     /// given position in `haystack`.  | 
1123  |  |     ///  | 
1124  |  |     /// # Panics  | 
1125  |  |     ///  | 
1126  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1127  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1128  |  |     ///  | 
1129  |  |     /// # Errors  | 
1130  |  |     ///  | 
1131  |  |     /// This returns an error when Unicode word boundary tables  | 
1132  |  |     /// are not available. Specifically, this only occurs when the  | 
1133  |  |     /// `unicode-word-boundary` feature is not enabled.  | 
1134  |  |     #[inline]  | 
1135  | 0  |     pub fn is_word_start_unicode(  | 
1136  | 0  |         &self,  | 
1137  | 0  |         haystack: &[u8],  | 
1138  | 0  |         at: usize,  | 
1139  | 0  |     ) -> Result<bool, UnicodeWordBoundaryError> { | 
1140  | 0  |         let word_before = is_word_char::rev(haystack, at)?;  | 
1141  | 0  |         let word_after = is_word_char::fwd(haystack, at)?;  | 
1142  | 0  |         Ok(!word_before && word_after)  | 
1143  | 0  |     }  | 
1144  |  |  | 
1145  |  |     /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the  | 
1146  |  |     /// given position in `haystack`.  | 
1147  |  |     ///  | 
1148  |  |     /// # Panics  | 
1149  |  |     ///  | 
1150  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1151  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1152  |  |     ///  | 
1153  |  |     /// # Errors  | 
1154  |  |     ///  | 
1155  |  |     /// This returns an error when Unicode word boundary tables  | 
1156  |  |     /// are not available. Specifically, this only occurs when the  | 
1157  |  |     /// `unicode-word-boundary` feature is not enabled.  | 
1158  |  |     #[inline]  | 
1159  | 0  |     pub fn is_word_end_unicode(  | 
1160  | 0  |         &self,  | 
1161  | 0  |         haystack: &[u8],  | 
1162  | 0  |         at: usize,  | 
1163  | 0  |     ) -> Result<bool, UnicodeWordBoundaryError> { | 
1164  | 0  |         let word_before = is_word_char::rev(haystack, at)?;  | 
1165  | 0  |         let word_after = is_word_char::fwd(haystack, at)?;  | 
1166  | 0  |         Ok(word_before && !word_after)  | 
1167  | 0  |     }  | 
1168  |  |  | 
1169  |  |     /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the  | 
1170  |  |     /// given position in `haystack`.  | 
1171  |  |     ///  | 
1172  |  |     /// # Panics  | 
1173  |  |     ///  | 
1174  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1175  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1176  |  |     #[inline]  | 
1177  | 0  |     pub fn is_word_start_half_ascii(  | 
1178  | 0  |         &self,  | 
1179  | 0  |         haystack: &[u8],  | 
1180  | 0  |         at: usize,  | 
1181  | 0  |     ) -> bool { | 
1182  | 0  |         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);  | 
1183  | 0  |         !word_before  | 
1184  | 0  |     }  | 
1185  |  |  | 
1186  |  |     /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the  | 
1187  |  |     /// given position in `haystack`.  | 
1188  |  |     ///  | 
1189  |  |     /// # Panics  | 
1190  |  |     ///  | 
1191  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1192  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1193  |  |     #[inline]  | 
1194  | 0  |     pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool { | 
1195  | 0  |         let word_after =  | 
1196  | 0  |             at < haystack.len() && utf8::is_word_byte(haystack[at]);  | 
1197  | 0  |         !word_after  | 
1198  | 0  |     }  | 
1199  |  |  | 
1200  |  |     /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the  | 
1201  |  |     /// given position in `haystack`.  | 
1202  |  |     ///  | 
1203  |  |     /// # Panics  | 
1204  |  |     ///  | 
1205  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1206  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1207  |  |     ///  | 
1208  |  |     /// # Errors  | 
1209  |  |     ///  | 
1210  |  |     /// This returns an error when Unicode word boundary tables  | 
1211  |  |     /// are not available. Specifically, this only occurs when the  | 
1212  |  |     /// `unicode-word-boundary` feature is not enabled.  | 
1213  |  |     #[inline]  | 
1214  | 0  |     pub fn is_word_start_half_unicode(  | 
1215  | 0  |         &self,  | 
1216  | 0  |         haystack: &[u8],  | 
1217  | 0  |         at: usize,  | 
1218  | 0  |     ) -> Result<bool, UnicodeWordBoundaryError> { | 
1219  |  |         // See `is_word_unicode_negate` for why we need to do this. We don't  | 
1220  |  |         // need to do it for `is_word_start_unicode` because that guarantees  | 
1221  |  |         // that the position matched falls on a valid UTF-8 boundary given  | 
1222  |  |         // that the right side must be in \w.  | 
1223  | 0  |         let word_before = at > 0  | 
1224  | 0  |             && match utf8::decode_last(&haystack[..at]) { | 
1225  | 0  |                 None | Some(Err(_)) => return Ok(false),  | 
1226  | 0  |                 Some(Ok(_)) => is_word_char::rev(haystack, at)?,  | 
1227  |  |             };  | 
1228  | 0  |         Ok(!word_before)  | 
1229  | 0  |     }  | 
1230  |  |  | 
1231  |  |     /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the  | 
1232  |  |     /// given position in `haystack`.  | 
1233  |  |     ///  | 
1234  |  |     /// # Panics  | 
1235  |  |     ///  | 
1236  |  |     /// This may panic when `at > haystack.len()`. Note that `at ==  | 
1237  |  |     /// haystack.len()` is legal and guaranteed not to panic.  | 
1238  |  |     ///  | 
1239  |  |     /// # Errors  | 
1240  |  |     ///  | 
1241  |  |     /// This returns an error when Unicode word boundary tables  | 
1242  |  |     /// are not available. Specifically, this only occurs when the  | 
1243  |  |     /// `unicode-word-boundary` feature is not enabled.  | 
1244  |  |     #[inline]  | 
1245  | 0  |     pub fn is_word_end_half_unicode(  | 
1246  | 0  |         &self,  | 
1247  | 0  |         haystack: &[u8],  | 
1248  | 0  |         at: usize,  | 
1249  | 0  |     ) -> Result<bool, UnicodeWordBoundaryError> { | 
1250  |  |         // See `is_word_unicode_negate` for why we need to do this. We don't  | 
1251  |  |         // need to do it for `is_word_end_unicode` because that guarantees  | 
1252  |  |         // that the position matched falls on a valid UTF-8 boundary given  | 
1253  |  |         // that the left side must be in \w.  | 
1254  | 0  |         let word_after = at < haystack.len()  | 
1255  | 0  |             && match utf8::decode(&haystack[at..]) { | 
1256  | 0  |                 None | Some(Err(_)) => return Ok(false),  | 
1257  | 0  |                 Some(Ok(_)) => is_word_char::fwd(haystack, at)?,  | 
1258  |  |             };  | 
1259  | 0  |         Ok(!word_after)  | 
1260  | 0  |     }  | 
1261  |  | }  | 
1262  |  |  | 
1263  |  | impl Default for LookMatcher { | 
1264  | 0  |     fn default() -> LookMatcher { | 
1265  | 0  |         LookMatcher::new()  | 
1266  | 0  |     }  | 
1267  |  | }  | 
1268  |  |  | 
/// The error reported when the Unicode-aware `\w` class cannot be consulted.
///
/// Deciding whether a Unicode-aware `\b` (or `\B`) matches at some position
/// requires knowing whether the neighboring codepoints are word characters,
/// which in turn requires the data tables for the Unicode-aware Perl
/// character class `\w`. This error is returned when those tables are
/// unavailable.
///
/// This error can only occur when the `unicode-word-boundary` feature is
/// disabled.
#[derive(Debug, Clone)]
pub struct UnicodeWordBoundaryError(());
1281  |  |  | 
1282  |  | impl UnicodeWordBoundaryError { | 
1283  |  |     #[cfg(not(feature = "unicode-word-boundary"))]  | 
1284  |  |     pub(crate) fn new() -> UnicodeWordBoundaryError { | 
1285  |  |         UnicodeWordBoundaryError(())  | 
1286  |  |     }  | 
1287  |  |  | 
1288  |  |     /// Returns an error if and only if Unicode word boundary data is  | 
1289  |  |     /// unavailable.  | 
1290  | 0  |     pub fn check() -> Result<(), UnicodeWordBoundaryError> { | 
1291  | 0  |         is_word_char::check()  | 
1292  | 0  |     }  | 
1293  |  | }  | 
1294  |  |  | 
// Marker impl only: the error wraps `()` and overrides none of the `Error`
// trait's methods. It is gated on `std` because it names
// `std::error::Error`.
#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordBoundaryError {}
1297  |  |  | 
1298  |  | impl core::fmt::Display for UnicodeWordBoundaryError { | 
1299  | 0  |     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { | 
1300  | 0  |         write!(  | 
1301  | 0  |             f,  | 
1302  | 0  |             "Unicode-aware \\b and \\B are unavailable because the \  | 
1303  | 0  |              requisite data tables are missing, please enable the \  | 
1304  | 0  |              unicode-word-boundary feature"  | 
1305  |  |         )  | 
1306  | 0  |     }  | 
1307  |  | }  | 
1308  |  |  | 
1309  |  | // Below are FOUR different ways for checking whether a "word"  | 
1310  |  | // codepoint exists at a particular position in the haystack. The four  | 
1311  |  | // different approaches are, in order of preference:  | 
1312  |  | //  | 
1313  |  | // 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the  | 
1314  |  | // first call, and then use that DFA for all subsequent calls.  | 
1315  |  | // 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.  | 
1316  |  | // 3. Do UTF-8 decoding and use our own 'perl_word' table.  | 
1317  |  | // 4. Return an error.  | 
1318  |  | //  | 
1319  |  | // The reason for all of these approaches is a combination of perf and  | 
1320  |  | // permitting one to build regex-automata without the Unicode data necessary  | 
1321  |  | // for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would  | 
1322  |  | // still work.)  | 
1323  |  | //  | 
1324  |  | // The DFA approach is the fastest, but it requires the regex parser, the  | 
1325  |  | // NFA compiler, the DFA builder and the DFA search runtime. That's a lot to  | 
1326  |  | // bring in, but if it's available, it's (probably) the best we can do.  | 
1327  |  | //  | 
1328  |  | // Approaches (2) and (3) are effectively equivalent, but (2) reuses the  | 
1329  |  | // data in regex-syntax and avoids duplicating it in regex-automata.  | 
1330  |  | //  | 
1331  |  | // Finally, (4) unconditionally returns an error since the requisite data isn't  | 
1332  |  | // available anywhere.  | 
1333  |  | //  | 
1334  |  | // There are actually more approaches possible that we didn't implement. For  | 
1335  |  | // example, if the DFA builder is available but the syntax parser is not, we  | 
1336  |  | // could technically hand construct our own NFA from the 'perl_word' data  | 
1337  |  | // table. But to avoid some pretty hairy code duplication, we would in turn  | 
1338  |  | // need to pull the UTF-8 compiler out of the NFA compiler. Yikes.  | 
1339  |  | //  | 
1340  |  | // A possibly more sensible alternative is to use a lazy DFA when the full  | 
1341  |  | // DFA builder isn't available...  | 
1342  |  | //  | 
1343  |  | // Yet another choice would be to build the full DFA and then embed it into the  | 
1344  |  | // source. Then we'd only need to bring in the DFA search runtime, which is  | 
1345  |  | // considerably smaller than the DFA builder code. The problem here is that the  | 
1346  |  | // Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,  | 
1347  |  | // we'd need to build regex-cli, which depends on regex-automata in order to  | 
1348  |  | // build some part of regex-automata. But to be honest, something like this has  | 
1349  |  | // to be allowed somehow? I just don't know what the right process is.  | 
1350  |  | //  | 
1351  |  | // There are perhaps other choices as well. Why did I stop at these 4? Because  | 
1352  |  | // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA  | 
1353  |  | // approach eventually, as the benefits of the DFA approach are somewhat  | 
1354  |  | // compelling. The 'boundary-words-holmes' benchmark tests this. (Note that  | 
1355  |  | // the commands below no longer work. If necessary, we should re-capitulate  | 
1356  |  | // the benchmark from whole cloth in rebar.)  | 
1357  |  | //  | 
1358  |  | //   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv  | 
1359  |  | //  | 
1360  |  | // Then I changed the code below so that the util/unicode_data/perl_word table  | 
1361  |  | // was used and re-ran the benchmark:  | 
1362  |  | //  | 
1363  |  | //   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv  | 
1364  |  | //  | 
1365  |  | // And compared them:  | 
1366  |  | //  | 
1367  |  | //   $ regex-cli bench diff dfa.csv table.csv  | 
1368  |  | //   benchmark                             engine                 dfa        table  | 
1369  |  | //   ---------                             ------                 ---        -----  | 
1370  |  | //   internal/count/boundary-words-holmes  regex/automata/pikevm  18.6 MB/s  12.9 MB/s  | 
1371  |  | //  | 
1372  |  | // Which is a nice improvement.  | 
1373  |  | //  | 
1374  |  | // UPDATE: It turns out that it takes approximately 22ms to build the reverse  | 
1375  |  | // DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in  | 
1376  |  | // the grand scheme of things, but that is a significant latency cost. So I'm not  | 
1377  |  | // sure that's a good idea. I then tried using a lazy DFA instead, and that  | 
1378  |  | // eliminated the overhead, but since the lazy DFA requires mutable working  | 
1379  |  | // memory, that requires introducing a 'Cache' for every simultaneous call.  | 
1380  |  | //  | 
1381  |  | // I ended up deciding for now to just keep the "UTF-8 decode and check the  | 
1382  |  | // table." The DFA and lazy DFA approaches are still below, but commented out.  | 
1383  |  | //  | 
1384  |  | // [1]: https://github.com/BurntSushi/ucd-generate/issues/11  | 
1385  |  |  | 
1386  |  | /*  | 
1387  |  | /// A module that looks for word codepoints using lazy DFAs.  | 
1388  |  | #[cfg(all(  | 
1389  |  |     feature = "unicode-word-boundary",  | 
1390  |  |     feature = "syntax",  | 
1391  |  |     feature = "unicode-perl",  | 
1392  |  |     feature = "hybrid"  | 
1393  |  | ))]  | 
1394  |  | mod is_word_char { | 
1395  |  |     use alloc::vec::Vec;  | 
1396  |  |  | 
1397  |  |     use crate::{ | 
1398  |  |         hybrid::dfa::{Cache, DFA}, | 
1399  |  |         nfa::thompson::NFA,  | 
1400  |  |         util::{lazy::Lazy, pool::Pool, primitives::StateID}, | 
1401  |  |         Anchored, Input,  | 
1402  |  |     };  | 
1403  |  |  | 
1404  |  |     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { | 
1405  |  |         Ok(())  | 
1406  |  |     }  | 
1407  |  |  | 
1408  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
1409  |  |     pub(super) fn fwd(  | 
1410  |  |         haystack: &[u8],  | 
1411  |  |         mut at: usize,  | 
1412  |  |     ) -> Result<bool, super::UnicodeWordBoundaryError> { | 
1413  |  |         static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap());  | 
1414  |  |         static CACHE: Lazy<Pool<Cache>> =  | 
1415  |  |             Lazy::new(|| Pool::new(|| WORD.create_cache()));  | 
1416  |  |         let dfa = Lazy::get(&WORD);  | 
1417  |  |         let mut cache = Lazy::get(&CACHE).get();  | 
1418  |  |         let mut sid = dfa  | 
1419  |  |             .start_state_forward(  | 
1420  |  |                 &mut cache,  | 
1421  |  |                 &Input::new("").anchored(Anchored::Yes), | 
1422  |  |             )  | 
1423  |  |             .unwrap();  | 
1424  |  |         while at < haystack.len() { | 
1425  |  |             let byte = haystack[at];  | 
1426  |  |             sid = dfa.next_state(&mut cache, sid, byte).unwrap();  | 
1427  |  |             at += 1;  | 
1428  |  |             if sid.is_tagged() { | 
1429  |  |                 if sid.is_match() { | 
1430  |  |                     return Ok(true);  | 
1431  |  |                 } else if sid.is_dead() { | 
1432  |  |                     return Ok(false);  | 
1433  |  |                 }  | 
1434  |  |             }  | 
1435  |  |         }  | 
1436  |  |         Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())  | 
1437  |  |     }  | 
1438  |  |  | 
1439  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
1440  |  |     pub(super) fn rev(  | 
1441  |  |         haystack: &[u8],  | 
1442  |  |         mut at: usize,  | 
1443  |  |     ) -> Result<bool, super::UnicodeWordBoundaryError> { | 
1444  |  |         static WORD: Lazy<DFA> = Lazy::new(|| { | 
1445  |  |             DFA::builder()  | 
1446  |  |                 .thompson(NFA::config().reverse(true))  | 
1447  |  |                 .build(r"\w")  | 
1448  |  |                 .unwrap()  | 
1449  |  |         });  | 
1450  |  |         static CACHE: Lazy<Pool<Cache>> =  | 
1451  |  |             Lazy::new(|| Pool::new(|| WORD.create_cache()));  | 
1452  |  |         let dfa = Lazy::get(&WORD);  | 
1453  |  |         let mut cache = Lazy::get(&CACHE).get();  | 
1454  |  |         let mut sid = dfa  | 
1455  |  |             .start_state_reverse(  | 
1456  |  |                 &mut cache,  | 
1457  |  |                 &Input::new("").anchored(Anchored::Yes), | 
1458  |  |             )  | 
1459  |  |             .unwrap();  | 
1460  |  |         while at > 0 { | 
1461  |  |             at -= 1;  | 
1462  |  |             let byte = haystack[at];  | 
1463  |  |             sid = dfa.next_state(&mut cache, sid, byte).unwrap();  | 
1464  |  |             if sid.is_tagged() { | 
1465  |  |                 if sid.is_match() { | 
1466  |  |                     return Ok(true);  | 
1467  |  |                 } else if sid.is_dead() { | 
1468  |  |                     return Ok(false);  | 
1469  |  |                 }  | 
1470  |  |             }  | 
1471  |  |         }  | 
1472  |  |         Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())  | 
1473  |  |     }  | 
1474  |  | }  | 
1475  |  | */  | 
1476  |  |  | 
1477  |  | /*  | 
1478  |  | /// A module that looks for word codepoints using fully compiled DFAs.  | 
1479  |  | #[cfg(all(  | 
1480  |  |     feature = "unicode-word-boundary",  | 
1481  |  |     feature = "syntax",  | 
1482  |  |     feature = "unicode-perl",  | 
1483  |  |     feature = "dfa-build"  | 
1484  |  | ))]  | 
1485  |  | mod is_word_char { | 
1486  |  |     use alloc::vec::Vec;  | 
1487  |  |  | 
1488  |  |     use crate::{ | 
1489  |  |         dfa::{dense::DFA, Automaton, StartKind}, | 
1490  |  |         nfa::thompson::NFA,  | 
1491  |  |         util::{lazy::Lazy, primitives::StateID}, | 
1492  |  |         Anchored, Input,  | 
1493  |  |     };  | 
1494  |  |  | 
1495  |  |     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { | 
1496  |  |         Ok(())  | 
1497  |  |     }  | 
1498  |  |  | 
1499  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
1500  |  |     pub(super) fn fwd(  | 
1501  |  |         haystack: &[u8],  | 
1502  |  |         mut at: usize,  | 
1503  |  |     ) -> Result<bool, super::UnicodeWordBoundaryError> { | 
1504  |  |         static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { | 
1505  |  |             let dfa = DFA::builder()  | 
1506  |  |                 .configure(DFA::config().start_kind(StartKind::Anchored))  | 
1507  |  |                 .build(r"\w")  | 
1508  |  |                 .unwrap();  | 
1509  |  |             // OK because our regex has no look-around.  | 
1510  |  |             let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();  | 
1511  |  |             (dfa, start_id)  | 
1512  |  |         });  | 
1513  |  |         let &(ref dfa, mut sid) = Lazy::get(&WORD);  | 
1514  |  |         while at < haystack.len() { | 
1515  |  |             let byte = haystack[at];  | 
1516  |  |             sid = dfa.next_state(sid, byte);  | 
1517  |  |             at += 1;  | 
1518  |  |             if dfa.is_special_state(sid) { | 
1519  |  |                 if dfa.is_match_state(sid) { | 
1520  |  |                     return Ok(true);  | 
1521  |  |                 } else if dfa.is_dead_state(sid) { | 
1522  |  |                     return Ok(false);  | 
1523  |  |                 }  | 
1524  |  |             }  | 
1525  |  |         }  | 
1526  |  |         Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))  | 
1527  |  |     }  | 
1528  |  |  | 
1529  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
1530  |  |     pub(super) fn rev(  | 
1531  |  |         haystack: &[u8],  | 
1532  |  |         mut at: usize,  | 
1533  |  |     ) -> Result<bool, super::UnicodeWordBoundaryError> { | 
1534  |  |         static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { | 
1535  |  |             let dfa = DFA::builder()  | 
1536  |  |                 .configure(DFA::config().start_kind(StartKind::Anchored))  | 
1537  |  |                 // From ad hoc measurements, it looks like setting  | 
1538  |  |                 // shrink==false is slightly faster than shrink==true. I kind  | 
1539  |  |                 // of feel like this indicates that shrinking is probably a  | 
1540  |  |                 // failure, although it can help in some cases. Sigh.  | 
1541  |  |                 .thompson(NFA::config().reverse(true).shrink(false))  | 
1542  |  |                 .build(r"\w")  | 
1543  |  |                 .unwrap();  | 
1544  |  |             // OK because our regex has no look-around.  | 
1545  |  |             let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();  | 
1546  |  |             (dfa, start_id)  | 
1547  |  |         });  | 
1548  |  |         let &(ref dfa, mut sid) = Lazy::get(&WORD);  | 
1549  |  |         while at > 0 { | 
1550  |  |             at -= 1;  | 
1551  |  |             let byte = haystack[at];  | 
1552  |  |             sid = dfa.next_state(sid, byte);  | 
1553  |  |             if dfa.is_special_state(sid) { | 
1554  |  |                 if dfa.is_match_state(sid) { | 
1555  |  |                     return Ok(true);  | 
1556  |  |                 } else if dfa.is_dead_state(sid) { | 
1557  |  |                     return Ok(false);  | 
1558  |  |                 }  | 
1559  |  |             }  | 
1560  |  |         }  | 
1561  |  |         Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))  | 
1562  |  |     }  | 
1563  |  | }  | 
1564  |  | */  | 
1565  |  |  | 
/// Word codepoint detection backed by regex-syntax's data tables.
///
/// This variant is compiled when the `unicode-word-boundary`, `syntax` and
/// `unicode-perl` features are all enabled.
#[cfg(all(
    feature = "unicode-word-boundary",
    feature = "syntax",
    feature = "unicode-perl",
))]
mod is_word_char {
    use regex_syntax::try_is_word_character;

    use crate::util::utf8;

    /// Always succeeds, since this variant has the tables it needs.
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
        Ok(())
    }

    /// Reports whether `haystack[at..]` begins with a validly encoded word
    /// codepoint. Empty or invalidly encoded input yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn fwd(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let next = utf8::decode(&haystack[at..]);
        Ok(matches!(next, Some(Ok(ch)) if is_word_character(ch)))
    }

    /// Reports whether `haystack[..at]` ends with a validly encoded word
    /// codepoint. Empty or invalidly encoded input yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn rev(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let prev = utf8::decode_last(&haystack[..at]);
        Ok(matches!(prev, Some(Ok(ch)) if is_word_character(ch)))
    }

    /// Looks up `ch` in regex-syntax's `\w` tables, panicking with a
    /// descriptive message if the lookup reports the tables as missing.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_word_character(ch: char) -> bool {
        try_is_word_character(ch).expect(
            "since unicode-word-boundary, syntax and unicode-perl \
             are all enabled, it is expected that \
             try_is_word_character succeeds",
        )
    }
}
1611  |  |  | 
/// Word codepoint detection backed by regex-automata's own data tables
/// (which are only compiled when regex-syntax's tables aren't available).
///
/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for
/// perl_word.
#[cfg(all(
    feature = "unicode-word-boundary",
    not(all(feature = "syntax", feature = "unicode-perl")),
))]
mod is_word_char {
    use crate::util::utf8;

    /// Always succeeds, since this variant has the tables it needs.
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
        Ok(())
    }

    /// Reports whether `haystack[at..]` begins with a validly encoded word
    /// codepoint. Empty or invalidly encoded input yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn fwd(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let next = utf8::decode(&haystack[at..]);
        Ok(matches!(next, Some(Ok(ch)) if is_word_character(ch)))
    }

    /// Reports whether `haystack[..at]` ends with a validly encoded word
    /// codepoint. Empty or invalidly encoded input yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn rev(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let prev = utf8::decode_last(&haystack[..at]);
        Ok(matches!(prev, Some(Ok(ch)) if is_word_character(ch)))
    }

    /// Reports whether `c` is a word codepoint: either it fits in a byte
    /// that `utf8::is_word_byte` accepts, or it falls inside one of the
    /// inclusive ranges of the `PERL_WORD` table.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_word_character(c: char) -> bool {
        use core::cmp::Ordering;

        use crate::util::unicode_data::perl_word::PERL_WORD;

        // Try the byte-sized check before searching the table.
        if u8::try_from(c).map_or(false, utf8::is_word_byte) {
            return true;
        }
        // Order each range relative to `c`; `Equal` means `c` is inside it.
        let found = PERL_WORD.binary_search_by(|&(lo, hi)| {
            if c < lo {
                Ordering::Greater
            } else if c > hi {
                Ordering::Less
            } else {
                Ordering::Equal
            }
        });
        found.is_ok()
    }
}
1672  |  |  | 
1673  |  | /// A module that always returns an error if Unicode word boundaries are  | 
1674  |  | /// disabled. When this feature is disabled, then regex-automata will not  | 
1675  |  | /// include its own data tables even if regex-syntax is disabled.  | 
1676  |  | #[cfg(not(feature = "unicode-word-boundary"))]  | 
1677  |  | mod is_word_char { | 
1678  |  |     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { | 
1679  |  |         Err(super::UnicodeWordBoundaryError::new())  | 
1680  |  |     }  | 
1681  |  |  | 
1682  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
1683  |  |     pub(super) fn fwd(  | 
1684  |  |         _bytes: &[u8],  | 
1685  |  |         _at: usize,  | 
1686  |  |     ) -> Result<bool, super::UnicodeWordBoundaryError> { | 
1687  |  |         Err(super::UnicodeWordBoundaryError::new())  | 
1688  |  |     }  | 
1689  |  |  | 
1690  |  |     #[cfg_attr(feature = "perf-inline", inline(always))]  | 
1691  |  |     pub(super) fn rev(  | 
1692  |  |         _bytes: &[u8],  | 
1693  |  |         _at: usize,  | 
1694  |  |     ) -> Result<bool, super::UnicodeWordBoundaryError> { | 
1695  |  |         Err(super::UnicodeWordBoundaryError::new())  | 
1696  |  |     }  | 
1697  |  | }  | 
1698  |  |  | 
1699  |  | #[cfg(test)]  | 
1700  |  | mod tests { | 
1701  |  |     use super::*;  | 
1702  |  |  | 
1703  |  |     macro_rules! testlook { | 
1704  |  |         ($look:expr, $haystack:expr, $at:expr) => { | 
1705  |  |             LookMatcher::default().matches($look, $haystack.as_bytes(), $at)  | 
1706  |  |         };  | 
1707  |  |     }  | 
1708  |  |  | 
1709  |  |     #[test]  | 
1710  |  |     fn look_matches_start_line() { | 
1711  |  |         let look = Look::StartLF;  | 
1712  |  |  | 
1713  |  |         assert!(testlook!(look, "", 0));  | 
1714  |  |         assert!(testlook!(look, "\n", 0));  | 
1715  |  |         assert!(testlook!(look, "\n", 1));  | 
1716  |  |         assert!(testlook!(look, "a", 0));  | 
1717  |  |         assert!(testlook!(look, "\na", 1));  | 
1718  |  |  | 
1719  |  |         assert!(!testlook!(look, "a", 1));  | 
1720  |  |         assert!(!testlook!(look, "a\na", 1));  | 
1721  |  |     }  | 
1722  |  |  | 
1723  |  |     #[test]  | 
1724  |  |     fn look_matches_end_line() { | 
1725  |  |         let look = Look::EndLF;  | 
1726  |  |  | 
1727  |  |         assert!(testlook!(look, "", 0));  | 
1728  |  |         assert!(testlook!(look, "\n", 1));  | 
1729  |  |         assert!(testlook!(look, "\na", 0));  | 
1730  |  |         assert!(testlook!(look, "\na", 2));  | 
1731  |  |         assert!(testlook!(look, "a\na", 1));  | 
1732  |  |  | 
1733  |  |         assert!(!testlook!(look, "a", 0));  | 
1734  |  |         assert!(!testlook!(look, "\na", 1));  | 
1735  |  |         assert!(!testlook!(look, "a\na", 0));  | 
1736  |  |         assert!(!testlook!(look, "a\na", 2));  | 
1737  |  |     }  | 
1738  |  |  | 
1739  |  |     #[test]  | 
1740  |  |     fn look_matches_start_text() { | 
1741  |  |         let look = Look::Start;  | 
1742  |  |  | 
1743  |  |         assert!(testlook!(look, "", 0));  | 
1744  |  |         assert!(testlook!(look, "\n", 0));  | 
1745  |  |         assert!(testlook!(look, "a", 0));  | 
1746  |  |  | 
1747  |  |         assert!(!testlook!(look, "\n", 1));  | 
1748  |  |         assert!(!testlook!(look, "\na", 1));  | 
1749  |  |         assert!(!testlook!(look, "a", 1));  | 
1750  |  |         assert!(!testlook!(look, "a\na", 1));  | 
1751  |  |     }  | 
1752  |  |  | 
1753  |  |     #[test]  | 
1754  |  |     fn look_matches_end_text() { | 
1755  |  |         let look = Look::End;  | 
1756  |  |  | 
1757  |  |         assert!(testlook!(look, "", 0));  | 
1758  |  |         assert!(testlook!(look, "\n", 1));  | 
1759  |  |         assert!(testlook!(look, "\na", 2));  | 
1760  |  |  | 
1761  |  |         assert!(!testlook!(look, "\na", 0));  | 
1762  |  |         assert!(!testlook!(look, "a\na", 1));  | 
1763  |  |         assert!(!testlook!(look, "a", 0));  | 
1764  |  |         assert!(!testlook!(look, "\na", 1));  | 
1765  |  |         assert!(!testlook!(look, "a\na", 0));  | 
1766  |  |         assert!(!testlook!(look, "a\na", 2));  | 
1767  |  |     }  | 
1768  |  |  | 
1769  |  |     #[test]  | 
1770  |  |     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]  | 
1771  |  |     fn look_matches_word_unicode() { | 
1772  |  |         let look = Look::WordUnicode;  | 
1773  |  |  | 
1774  |  |         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
1775  |  |         // \xF0\x90\x86\x80 = 𐆀 (not in \w)  | 
1776  |  |  | 
1777  |  |         // Simple ASCII word boundaries.  | 
1778  |  |         assert!(testlook!(look, "a", 0));  | 
1779  |  |         assert!(testlook!(look, "a", 1));  | 
1780  |  |         assert!(testlook!(look, "a ", 1));  | 
1781  |  |         assert!(testlook!(look, " a ", 1));  | 
1782  |  |         assert!(testlook!(look, " a ", 2));  | 
1783  |  |  | 
1784  |  |         // Unicode word boundaries with a non-ASCII codepoint.  | 
1785  |  |         assert!(testlook!(look, "𝛃", 0));  | 
1786  |  |         assert!(testlook!(look, "𝛃", 4));  | 
1787  |  |         assert!(testlook!(look, "𝛃 ", 4));  | 
1788  |  |         assert!(testlook!(look, " 𝛃 ", 1));  | 
1789  |  |         assert!(testlook!(look, " 𝛃 ", 5));  | 
1790  |  |  | 
1791  |  |         // Unicode word boundaries between non-ASCII codepoints.  | 
1792  |  |         assert!(testlook!(look, "𝛃𐆀", 0));  | 
1793  |  |         assert!(testlook!(look, "𝛃𐆀", 4));  | 
1794  |  |  | 
1795  |  |         // Non word boundaries for ASCII.  | 
1796  |  |         assert!(!testlook!(look, "", 0));  | 
1797  |  |         assert!(!testlook!(look, "ab", 1));  | 
1798  |  |         assert!(!testlook!(look, "a ", 2));  | 
1799  |  |         assert!(!testlook!(look, " a ", 0));  | 
1800  |  |         assert!(!testlook!(look, " a ", 3));  | 
1801  |  |  | 
1802  |  |         // Non word boundaries with a non-ASCII codepoint.  | 
1803  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
1804  |  |         assert!(!testlook!(look, "𝛃 ", 5));  | 
1805  |  |         assert!(!testlook!(look, " 𝛃 ", 0));  | 
1806  |  |         assert!(!testlook!(look, " 𝛃 ", 6));  | 
1807  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
1808  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
1809  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
1810  |  |  | 
1811  |  |         // Non word boundaries with non-ASCII codepoints.  | 
1812  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
1813  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
1814  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
1815  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
1816  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
1817  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
1818  |  |         assert!(!testlook!(look, "𝛃𐆀", 8));  | 
1819  |  |     }  | 
1820  |  |  | 
1821  |  |     #[test]  | 
1822  |  |     fn look_matches_word_ascii() { | 
1823  |  |         let look = Look::WordAscii;  | 
1824  |  |  | 
1825  |  |         // Exercises `Look::WordAscii` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
1826  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
1827  |  |  | 
1828  |  |         // Simple ASCII word boundaries: both sides of the lone word byte `a` match.  | 
1829  |  |         assert!(testlook!(look, "a", 0));  | 
1830  |  |         assert!(testlook!(look, "a", 1));  | 
1831  |  |         assert!(testlook!(look, "a ", 1));  | 
1832  |  |         assert!(testlook!(look, " a ", 1));  | 
1833  |  |         assert!(testlook!(look, " a ", 2));  | 
1834  |  |  | 
1835  |  |         // Unicode word boundaries with a non-ASCII codepoint. Since this is  | 
1836  |  |         // an ASCII word boundary, none of these match.  | 
1837  |  |         assert!(!testlook!(look, "𝛃", 0));  | 
1838  |  |         assert!(!testlook!(look, "𝛃", 4));  | 
1839  |  |         assert!(!testlook!(look, "𝛃 ", 4));  | 
1840  |  |         assert!(!testlook!(look, " 𝛃 ", 1));  | 
1841  |  |         assert!(!testlook!(look, " 𝛃 ", 5));  | 
1842  |  |  | 
1843  |  |         // Unicode word boundaries between non-ASCII codepoints. Again, since  | 
1844  |  |         // this is an ASCII word boundary, none of these match.  | 
1845  |  |         assert!(!testlook!(look, "𝛃𐆀", 0));  | 
1846  |  |         assert!(!testlook!(look, "𝛃𐆀", 4));  | 
1847  |  |  | 
1848  |  |         // Non word boundaries for ASCII.  | 
1849  |  |         assert!(!testlook!(look, "", 0));  | 
1850  |  |         assert!(!testlook!(look, "ab", 1));  | 
1851  |  |         assert!(!testlook!(look, "a ", 2));  | 
1852  |  |         assert!(!testlook!(look, " a ", 0));  | 
1853  |  |         assert!(!testlook!(look, " a ", 3));  | 
1854  |  |  | 
1855  |  |         // Positions that are not Unicode word boundaries, with a non-ASCII codepoint. The first one DOES match here: under ASCII rules 𝛃 is not a word character but `b` is.  | 
1856  |  |         assert!(testlook!(look, "𝛃b", 4));  | 
1857  |  |         assert!(!testlook!(look, "𝛃 ", 5));  | 
1858  |  |         assert!(!testlook!(look, " 𝛃 ", 0));  | 
1859  |  |         assert!(!testlook!(look, " 𝛃 ", 6));  | 
1860  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
1861  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
1862  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
1863  |  |  | 
1864  |  |         // Non word boundaries with non-ASCII codepoints (offsets inside or at the end of 𐆀).  | 
1865  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
1866  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
1867  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
1868  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
1869  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
1870  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
1871  |  |         assert!(!testlook!(look, "𝛃𐆀", 8));  | 
1872  |  |     }  | 
1873  |  |  | 
1874  |  |     #[test]  | 
1875  |  |     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]  | 
1876  |  |     fn look_matches_word_unicode_negate() { | 
1877  |  |         let look = Look::WordUnicodeNegate;  | 
1878  |  |  | 
1879  |  |         // Exercises `Look::WordUnicodeNegate` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
1880  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
1881  |  |  | 
1882  |  |         // Simple ASCII word boundaries: the negated assertion matches none of them.  | 
1883  |  |         assert!(!testlook!(look, "a", 0));  | 
1884  |  |         assert!(!testlook!(look, "a", 1));  | 
1885  |  |         assert!(!testlook!(look, "a ", 1));  | 
1886  |  |         assert!(!testlook!(look, " a ", 1));  | 
1887  |  |         assert!(!testlook!(look, " a ", 2));  | 
1888  |  |  | 
1889  |  |         // Unicode word boundaries with a non-ASCII codepoint: again no match.  | 
1890  |  |         assert!(!testlook!(look, "𝛃", 0));  | 
1891  |  |         assert!(!testlook!(look, "𝛃", 4));  | 
1892  |  |         assert!(!testlook!(look, "𝛃 ", 4));  | 
1893  |  |         assert!(!testlook!(look, " 𝛃 ", 1));  | 
1894  |  |         assert!(!testlook!(look, " 𝛃 ", 5));  | 
1895  |  |  | 
1896  |  |         // Unicode word boundaries between non-ASCII codepoints: no match either.  | 
1897  |  |         assert!(!testlook!(look, "𝛃𐆀", 0));  | 
1898  |  |         assert!(!testlook!(look, "𝛃𐆀", 4));  | 
1899  |  |  | 
1900  |  |         // Non word boundaries for ASCII: these are where the negated assertion matches.  | 
1901  |  |         assert!(testlook!(look, "", 0));  | 
1902  |  |         assert!(testlook!(look, "ab", 1));  | 
1903  |  |         assert!(testlook!(look, "a ", 2));  | 
1904  |  |         assert!(testlook!(look, " a ", 0));  | 
1905  |  |         assert!(testlook!(look, " a ", 3));  | 
1906  |  |  | 
1907  |  |         // Non word boundaries with a non-ASCII codepoint, at offsets that fall between codepoints.  | 
1908  |  |         assert!(testlook!(look, "𝛃b", 4));  | 
1909  |  |         assert!(testlook!(look, "𝛃 ", 5));  | 
1910  |  |         assert!(testlook!(look, " 𝛃 ", 0));  | 
1911  |  |         assert!(testlook!(look, " 𝛃 ", 6));  | 
1912  |  |         // These don't match because they could otherwise return an offset that  | 
1913  |  |         // splits the UTF-8 encoding of a codepoint.  | 
1914  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
1915  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
1916  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
1917  |  |  | 
1918  |  |         // Non word boundaries with non-ASCII codepoints. These also don't  | 
1919  |  |         // match because they could otherwise return an offset that splits the  | 
1920  |  |         // UTF-8 encoding of a codepoint.  | 
1921  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
1922  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
1923  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
1924  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
1925  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
1926  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
1927  |  |         // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end  | 
1928  |  |         // of the haystack. So the "end" of the haystack isn't a word and 𐆀  | 
1929  |  |         // isn't a word, thus, \B matches.  | 
1930  |  |         assert!(testlook!(look, "𝛃𐆀", 8));  | 
1931  |  |     }  | 
1932  |  |  | 
1933  |  |     #[test]  | 
1934  |  |     fn look_matches_word_ascii_negate() { | 
1935  |  |         let look = Look::WordAsciiNegate;  | 
1936  |  |  | 
1937  |  |         // Exercises `Look::WordAsciiNegate` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
1938  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
1939  |  |  | 
1940  |  |         // Simple ASCII word boundaries: the negated assertion matches none of them.  | 
1941  |  |         assert!(!testlook!(look, "a", 0));  | 
1942  |  |         assert!(!testlook!(look, "a", 1));  | 
1943  |  |         assert!(!testlook!(look, "a ", 1));  | 
1944  |  |         assert!(!testlook!(look, " a ", 1));  | 
1945  |  |         assert!(!testlook!(look, " a ", 2));  | 
1946  |  |  | 
1947  |  |         // Unicode word boundaries with a non-ASCII codepoint. None of these is  | 
1948  |  |         // an ASCII word boundary, so the negated ASCII assertion matches all of them.  | 
1949  |  |         assert!(testlook!(look, "𝛃", 0));  | 
1950  |  |         assert!(testlook!(look, "𝛃", 4));  | 
1951  |  |         assert!(testlook!(look, "𝛃 ", 4));  | 
1952  |  |         assert!(testlook!(look, " 𝛃 ", 1));  | 
1953  |  |         assert!(testlook!(look, " 𝛃 ", 5));  | 
1954  |  |  | 
1955  |  |         // Unicode word boundaries between non-ASCII codepoints. Again, neither is  | 
1956  |  |         // an ASCII word boundary, so the negated ASCII assertion matches both.  | 
1957  |  |         assert!(testlook!(look, "𝛃𐆀", 0));  | 
1958  |  |         assert!(testlook!(look, "𝛃𐆀", 4));  | 
1959  |  |  | 
1960  |  |         // Non word boundaries for ASCII: the negated assertion matches each one.  | 
1961  |  |         assert!(testlook!(look, "", 0));  | 
1962  |  |         assert!(testlook!(look, "ab", 1));  | 
1963  |  |         assert!(testlook!(look, "a ", 2));  | 
1964  |  |         assert!(testlook!(look, " a ", 0));  | 
1965  |  |         assert!(testlook!(look, " a ", 3));  | 
1966  |  |  | 
1967  |  |         // Positions that are not Unicode word boundaries, with a non-ASCII codepoint. The first one does NOT match here: under ASCII rules 𝛃 is not a word character but `b` is, so offset 4 is an ASCII boundary.  | 
1968  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
1969  |  |         assert!(testlook!(look, "𝛃 ", 5));  | 
1970  |  |         assert!(testlook!(look, " 𝛃 ", 0));  | 
1971  |  |         assert!(testlook!(look, " 𝛃 ", 6));  | 
1972  |  |         assert!(testlook!(look, "𝛃", 1));  | 
1973  |  |         assert!(testlook!(look, "𝛃", 2));  | 
1974  |  |         assert!(testlook!(look, "𝛃", 3));  | 
1975  |  |  | 
1976  |  |         // Non word boundaries with non-ASCII codepoints. Unlike the Unicode variant, offsets inside a codepoint match too.  | 
1977  |  |         assert!(testlook!(look, "𝛃𐆀", 1));  | 
1978  |  |         assert!(testlook!(look, "𝛃𐆀", 2));  | 
1979  |  |         assert!(testlook!(look, "𝛃𐆀", 3));  | 
1980  |  |         assert!(testlook!(look, "𝛃𐆀", 5));  | 
1981  |  |         assert!(testlook!(look, "𝛃𐆀", 6));  | 
1982  |  |         assert!(testlook!(look, "𝛃𐆀", 7));  | 
1983  |  |         assert!(testlook!(look, "𝛃𐆀", 8));  | 
1984  |  |     }  | 
1985  |  |  | 
1986  |  |     #[test]  | 
1987  |  |     fn look_matches_word_start_ascii() { | 
1988  |  |         let look = Look::WordStartAscii;  | 
1989  |  |  | 
1990  |  |         // Exercises `Look::WordStartAscii` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
1991  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
1992  |  |  | 
1993  |  |         // Simple ASCII word boundaries: only the offsets immediately before `a` match.  | 
1994  |  |         assert!(testlook!(look, "a", 0));  | 
1995  |  |         assert!(!testlook!(look, "a", 1));  | 
1996  |  |         assert!(!testlook!(look, "a ", 1));  | 
1997  |  |         assert!(testlook!(look, " a ", 1));  | 
1998  |  |         assert!(!testlook!(look, " a ", 2));  | 
1999  |  |  | 
2000  |  |         // Unicode word boundaries with a non-ASCII codepoint. Since this is  | 
2001  |  |         // an ASCII word boundary, none of these match.  | 
2002  |  |         assert!(!testlook!(look, "𝛃", 0));  | 
2003  |  |         assert!(!testlook!(look, "𝛃", 4));  | 
2004  |  |         assert!(!testlook!(look, "𝛃 ", 4));  | 
2005  |  |         assert!(!testlook!(look, " 𝛃 ", 1));  | 
2006  |  |         assert!(!testlook!(look, " 𝛃 ", 5));  | 
2007  |  |  | 
2008  |  |         // Unicode word boundaries between non-ASCII codepoints. Again, since  | 
2009  |  |         // this is an ASCII word boundary, none of these match.  | 
2010  |  |         assert!(!testlook!(look, "𝛃𐆀", 0));  | 
2011  |  |         assert!(!testlook!(look, "𝛃𐆀", 4));  | 
2012  |  |  | 
2013  |  |         // Non word boundaries for ASCII.  | 
2014  |  |         assert!(!testlook!(look, "", 0));  | 
2015  |  |         assert!(!testlook!(look, "ab", 1));  | 
2016  |  |         assert!(!testlook!(look, "a ", 2));  | 
2017  |  |         assert!(!testlook!(look, " a ", 0));  | 
2018  |  |         assert!(!testlook!(look, " a ", 3));  | 
2019  |  |  | 
2020  |  |         // Positions that are not Unicode word boundaries, with a non-ASCII codepoint. The first one DOES match: under ASCII rules the ASCII word `b` starts right after the non-word bytes of 𝛃.  | 
2021  |  |         assert!(testlook!(look, "𝛃b", 4));  | 
2022  |  |         assert!(!testlook!(look, "b𝛃", 1));  | 
2023  |  |         assert!(!testlook!(look, "𝛃 ", 5));  | 
2024  |  |         assert!(!testlook!(look, " 𝛃 ", 0));  | 
2025  |  |         assert!(!testlook!(look, " 𝛃 ", 6));  | 
2026  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
2027  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
2028  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
2029  |  |  | 
2030  |  |         // Non word boundaries with non-ASCII codepoints (offsets inside or at the end of 𐆀).  | 
2031  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
2032  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
2033  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
2034  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
2035  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
2036  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
2037  |  |         assert!(!testlook!(look, "𝛃𐆀", 8));  | 
2038  |  |     }  | 
2039  |  |  | 
2040  |  |     #[test]  | 
2041  |  |     fn look_matches_word_end_ascii() { | 
2042  |  |         let look = Look::WordEndAscii;  | 
2043  |  |  | 
2044  |  |         // Exercises `Look::WordEndAscii` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
2045  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
2046  |  |  | 
2047  |  |         // Simple ASCII word boundaries: only the offsets immediately after `a` match.  | 
2048  |  |         assert!(!testlook!(look, "a", 0));  | 
2049  |  |         assert!(testlook!(look, "a", 1));  | 
2050  |  |         assert!(testlook!(look, "a ", 1));  | 
2051  |  |         assert!(!testlook!(look, " a ", 1));  | 
2052  |  |         assert!(testlook!(look, " a ", 2));  | 
2053  |  |  | 
2054  |  |         // Unicode word boundaries with a non-ASCII codepoint. Since this is  | 
2055  |  |         // an ASCII word boundary, none of these match.  | 
2056  |  |         assert!(!testlook!(look, "𝛃", 0));  | 
2057  |  |         assert!(!testlook!(look, "𝛃", 4));  | 
2058  |  |         assert!(!testlook!(look, "𝛃 ", 4));  | 
2059  |  |         assert!(!testlook!(look, " 𝛃 ", 1));  | 
2060  |  |         assert!(!testlook!(look, " 𝛃 ", 5));  | 
2061  |  |  | 
2062  |  |         // Unicode word boundaries between non-ASCII codepoints. Again, since  | 
2063  |  |         // this is an ASCII word boundary, none of these match.  | 
2064  |  |         assert!(!testlook!(look, "𝛃𐆀", 0));  | 
2065  |  |         assert!(!testlook!(look, "𝛃𐆀", 4));  | 
2066  |  |  | 
2067  |  |         // Non word boundaries for ASCII.  | 
2068  |  |         assert!(!testlook!(look, "", 0));  | 
2069  |  |         assert!(!testlook!(look, "ab", 1));  | 
2070  |  |         assert!(!testlook!(look, "a ", 2));  | 
2071  |  |         assert!(!testlook!(look, " a ", 0));  | 
2072  |  |         assert!(!testlook!(look, " a ", 3));  | 
2073  |  |  | 
2074  |  |         // Positions that are not Unicode word boundaries, with a non-ASCII codepoint. The second one DOES match: under ASCII rules the ASCII word `b` ends right before the non-word bytes of 𝛃.  | 
2075  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
2076  |  |         assert!(testlook!(look, "b𝛃", 1));  | 
2077  |  |         assert!(!testlook!(look, "𝛃 ", 5));  | 
2078  |  |         assert!(!testlook!(look, " 𝛃 ", 0));  | 
2079  |  |         assert!(!testlook!(look, " 𝛃 ", 6));  | 
2080  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
2081  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
2082  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
2083  |  |  | 
2084  |  |         // Non word boundaries with non-ASCII codepoints (offsets inside or at the end of 𐆀).  | 
2085  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
2086  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
2087  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
2088  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
2089  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
2090  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
2091  |  |         assert!(!testlook!(look, "𝛃𐆀", 8));  | 
2092  |  |     }  | 
2093  |  |  | 
2094  |  |     #[test]  | 
2095  |  |     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]  | 
2096  |  |     fn look_matches_word_start_unicode() { | 
2097  |  |         let look = Look::WordStartUnicode;  | 
2098  |  |  | 
2099  |  |         // Exercises `Look::WordStartUnicode` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
2100  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
2101  |  |  | 
2102  |  |         // Simple ASCII word boundaries: only the offsets immediately before `a` match.  | 
2103  |  |         assert!(testlook!(look, "a", 0));  | 
2104  |  |         assert!(!testlook!(look, "a", 1));  | 
2105  |  |         assert!(!testlook!(look, "a ", 1));  | 
2106  |  |         assert!(testlook!(look, " a ", 1));  | 
2107  |  |         assert!(!testlook!(look, " a ", 2));  | 
2108  |  |  | 
2109  |  |         // Unicode word boundaries with a non-ASCII codepoint: only the offsets where 𝛃 begins match.  | 
2110  |  |         assert!(testlook!(look, "𝛃", 0));  | 
2111  |  |         assert!(!testlook!(look, "𝛃", 4));  | 
2112  |  |         assert!(!testlook!(look, "𝛃 ", 4));  | 
2113  |  |         assert!(testlook!(look, " 𝛃 ", 1));  | 
2114  |  |         assert!(!testlook!(look, " 𝛃 ", 5));  | 
2115  |  |  | 
2116  |  |         // Unicode word boundaries between non-ASCII codepoints: the start of 𝛃 matches, its end does not.  | 
2117  |  |         assert!(testlook!(look, "𝛃𐆀", 0));  | 
2118  |  |         assert!(!testlook!(look, "𝛃𐆀", 4));  | 
2119  |  |  | 
2120  |  |         // Non word boundaries for ASCII.  | 
2121  |  |         assert!(!testlook!(look, "", 0));  | 
2122  |  |         assert!(!testlook!(look, "ab", 1));  | 
2123  |  |         assert!(!testlook!(look, "a ", 2));  | 
2124  |  |         assert!(!testlook!(look, " a ", 0));  | 
2125  |  |         assert!(!testlook!(look, " a ", 3));  | 
2126  |  |  | 
2127  |  |         // Non word boundaries with a non-ASCII codepoint. `𝛃b` and `b𝛃` are two adjacent word characters, so no word starts between them.  | 
2128  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
2129  |  |         assert!(!testlook!(look, "b𝛃", 1));  | 
2130  |  |         assert!(!testlook!(look, "𝛃 ", 5));  | 
2131  |  |         assert!(!testlook!(look, " 𝛃 ", 0));  | 
2132  |  |         assert!(!testlook!(look, " 𝛃 ", 6));  | 
2133  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
2134  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
2135  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
2136  |  |  | 
2137  |  |         // Non word boundaries with non-ASCII codepoints (offsets inside a codepoint, or after the non-word 𐆀).  | 
2138  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
2139  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
2140  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
2141  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
2142  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
2143  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
2144  |  |         assert!(!testlook!(look, "𝛃𐆀", 8));  | 
2145  |  |     }  | 
2146  |  |  | 
2147  |  |     #[test]  | 
2148  |  |     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]  | 
2149  |  |     fn look_matches_word_end_unicode() { | 
2150  |  |         let look = Look::WordEndUnicode;  | 
2151  |  |  | 
2152  |  |         // Exercises `Look::WordEndUnicode` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
2153  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
2154  |  |  | 
2155  |  |         // Simple ASCII word boundaries: only the offsets immediately after `a` match.  | 
2156  |  |         assert!(!testlook!(look, "a", 0));  | 
2157  |  |         assert!(testlook!(look, "a", 1));  | 
2158  |  |         assert!(testlook!(look, "a ", 1));  | 
2159  |  |         assert!(!testlook!(look, " a ", 1));  | 
2160  |  |         assert!(testlook!(look, " a ", 2));  | 
2161  |  |  | 
2162  |  |         // Unicode word boundaries with a non-ASCII codepoint: only the offsets where 𝛃 ends match.  | 
2163  |  |         assert!(!testlook!(look, "𝛃", 0));  | 
2164  |  |         assert!(testlook!(look, "𝛃", 4));  | 
2165  |  |         assert!(testlook!(look, "𝛃 ", 4));  | 
2166  |  |         assert!(!testlook!(look, " 𝛃 ", 1));  | 
2167  |  |         assert!(testlook!(look, " 𝛃 ", 5));  | 
2168  |  |  | 
2169  |  |         // Unicode word boundaries between non-ASCII codepoints: the end of 𝛃 matches, its start does not.  | 
2170  |  |         assert!(!testlook!(look, "𝛃𐆀", 0));  | 
2171  |  |         assert!(testlook!(look, "𝛃𐆀", 4));  | 
2172  |  |  | 
2173  |  |         // Non word boundaries for ASCII.  | 
2174  |  |         assert!(!testlook!(look, "", 0));  | 
2175  |  |         assert!(!testlook!(look, "ab", 1));  | 
2176  |  |         assert!(!testlook!(look, "a ", 2));  | 
2177  |  |         assert!(!testlook!(look, " a ", 0));  | 
2178  |  |         assert!(!testlook!(look, " a ", 3));  | 
2179  |  |  | 
2180  |  |         // Non word boundaries with a non-ASCII codepoint. `𝛃b` and `b𝛃` are two adjacent word characters, so no word ends between them.  | 
2181  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
2182  |  |         assert!(!testlook!(look, "b𝛃", 1));  | 
2183  |  |         assert!(!testlook!(look, "𝛃 ", 5));  | 
2184  |  |         assert!(!testlook!(look, " 𝛃 ", 0));  | 
2185  |  |         assert!(!testlook!(look, " 𝛃 ", 6));  | 
2186  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
2187  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
2188  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
2189  |  |  | 
2190  |  |         // Non word boundaries with non-ASCII codepoints (offsets inside a codepoint, or after the non-word 𐆀).  | 
2191  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
2192  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
2193  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
2194  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
2195  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
2196  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
2197  |  |         assert!(!testlook!(look, "𝛃𐆀", 8));  | 
2198  |  |     }  | 
2199  |  |  | 
2200  |  |     #[test]  | 
2201  |  |     fn look_matches_word_start_half_ascii() { | 
2202  |  |         let look = Look::WordStartHalfAscii;  | 
2203  |  |  | 
2204  |  |         // Exercises `Look::WordStartHalfAscii` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
2205  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
2206  |  |  | 
2207  |  |         // Simple ASCII word boundaries: the offsets directly after `a` are the ones that fail.  | 
2208  |  |         assert!(testlook!(look, "a", 0));  | 
2209  |  |         assert!(!testlook!(look, "a", 1));  | 
2210  |  |         assert!(!testlook!(look, "a ", 1));  | 
2211  |  |         assert!(testlook!(look, " a ", 1));  | 
2212  |  |         assert!(!testlook!(look, " a ", 2));  | 
2213  |  |  | 
2214  |  |         // Unicode word boundaries with a non-ASCII codepoint. Since this is an ASCII  | 
2215  |  |         // assertion, 𝛃 is not treated as a word character and all of these match.  | 
2216  |  |         assert!(testlook!(look, "𝛃", 0));  | 
2217  |  |         assert!(testlook!(look, "𝛃", 4));  | 
2218  |  |         assert!(testlook!(look, "𝛃 ", 4));  | 
2219  |  |         assert!(testlook!(look, " 𝛃 ", 1));  | 
2220  |  |         assert!(testlook!(look, " 𝛃 ", 5));  | 
2221  |  |  | 
2222  |  |         // Unicode word boundaries between non-ASCII codepoints. Again, since this is  | 
2223  |  |         // an ASCII assertion, no ASCII word byte precedes either offset and both match.  | 
2224  |  |         assert!(testlook!(look, "𝛃𐆀", 0));  | 
2225  |  |         assert!(testlook!(look, "𝛃𐆀", 4));  | 
2226  |  |  | 
2227  |  |         // Non word boundaries for ASCII. Only `"ab", 1` fails; it is the only offset here directly after an ASCII word byte.  | 
2228  |  |         assert!(testlook!(look, "", 0));  | 
2229  |  |         assert!(!testlook!(look, "ab", 1));  | 
2230  |  |         assert!(testlook!(look, "a ", 2));  | 
2231  |  |         assert!(testlook!(look, " a ", 0));  | 
2232  |  |         assert!(testlook!(look, " a ", 3));  | 
2233  |  |  | 
2234  |  |         // Positions that are not Unicode word boundaries, with a non-ASCII codepoint. Only `"b𝛃", 1` fails, since it directly follows the ASCII word byte `b`.  | 
2235  |  |         assert!(testlook!(look, "𝛃b", 4));  | 
2236  |  |         assert!(!testlook!(look, "b𝛃", 1));  | 
2237  |  |         assert!(testlook!(look, "𝛃 ", 5));  | 
2238  |  |         assert!(testlook!(look, " 𝛃 ", 0));  | 
2239  |  |         assert!(testlook!(look, " 𝛃 ", 6));  | 
2240  |  |         assert!(testlook!(look, "𝛃", 1));  | 
2241  |  |         assert!(testlook!(look, "𝛃", 2));  | 
2242  |  |         assert!(testlook!(look, "𝛃", 3));  | 
2243  |  |  | 
2244  |  |         // Non word boundaries with non-ASCII codepoints. All match, including offsets inside a codepoint.  | 
2245  |  |         assert!(testlook!(look, "𝛃𐆀", 1));  | 
2246  |  |         assert!(testlook!(look, "𝛃𐆀", 2));  | 
2247  |  |         assert!(testlook!(look, "𝛃𐆀", 3));  | 
2248  |  |         assert!(testlook!(look, "𝛃𐆀", 5));  | 
2249  |  |         assert!(testlook!(look, "𝛃𐆀", 6));  | 
2250  |  |         assert!(testlook!(look, "𝛃𐆀", 7));  | 
2251  |  |         assert!(testlook!(look, "𝛃𐆀", 8));  | 
2252  |  |     }  | 
2253  |  |  | 
2254  |  |     #[test]  | 
2255  |  |     fn look_matches_word_end_half_ascii() { | 
2256  |  |         let look = Look::WordEndHalfAscii;  | 
2257  |  |  | 
2258  |  |         // Exercises `Look::WordEndHalfAscii` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
2259  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
2260  |  |  | 
2261  |  |         // Simple ASCII word boundaries: the offsets directly before `a` are the ones that fail.  | 
2262  |  |         assert!(!testlook!(look, "a", 0));  | 
2263  |  |         assert!(testlook!(look, "a", 1));  | 
2264  |  |         assert!(testlook!(look, "a ", 1));  | 
2265  |  |         assert!(!testlook!(look, " a ", 1));  | 
2266  |  |         assert!(testlook!(look, " a ", 2));  | 
2267  |  |  | 
2268  |  |         // Unicode word boundaries with a non-ASCII codepoint. Since this is an ASCII  | 
2269  |  |         // assertion, 𝛃 is not treated as a word character and all of these match.  | 
2270  |  |         assert!(testlook!(look, "𝛃", 0));  | 
2271  |  |         assert!(testlook!(look, "𝛃", 4));  | 
2272  |  |         assert!(testlook!(look, "𝛃 ", 4));  | 
2273  |  |         assert!(testlook!(look, " 𝛃 ", 1));  | 
2274  |  |         assert!(testlook!(look, " 𝛃 ", 5));  | 
2275  |  |  | 
2276  |  |         // Unicode word boundaries between non-ASCII codepoints. Again, since this is  | 
2277  |  |         // an ASCII assertion, no ASCII word byte follows either offset and both match.  | 
2278  |  |         assert!(testlook!(look, "𝛃𐆀", 0));  | 
2279  |  |         assert!(testlook!(look, "𝛃𐆀", 4));  | 
2280  |  |  | 
2281  |  |         // Non word boundaries for ASCII. Only `"ab", 1` fails; it is the only offset here directly before an ASCII word byte.  | 
2282  |  |         assert!(testlook!(look, "", 0));  | 
2283  |  |         assert!(!testlook!(look, "ab", 1));  | 
2284  |  |         assert!(testlook!(look, "a ", 2));  | 
2285  |  |         assert!(testlook!(look, " a ", 0));  | 
2286  |  |         assert!(testlook!(look, " a ", 3));  | 
2287  |  |  | 
2288  |  |         // Positions that are not Unicode word boundaries, with a non-ASCII codepoint. Only `"𝛃b", 4` fails, since it directly precedes the ASCII word byte `b`.  | 
2289  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
2290  |  |         assert!(testlook!(look, "b𝛃", 1));  | 
2291  |  |         assert!(testlook!(look, "𝛃 ", 5));  | 
2292  |  |         assert!(testlook!(look, " 𝛃 ", 0));  | 
2293  |  |         assert!(testlook!(look, " 𝛃 ", 6));  | 
2294  |  |         assert!(testlook!(look, "𝛃", 1));  | 
2295  |  |         assert!(testlook!(look, "𝛃", 2));  | 
2296  |  |         assert!(testlook!(look, "𝛃", 3));  | 
2297  |  |  | 
2298  |  |         // Non word boundaries with non-ASCII codepoints. All match, including offsets inside a codepoint.  | 
2299  |  |         assert!(testlook!(look, "𝛃𐆀", 1));  | 
2300  |  |         assert!(testlook!(look, "𝛃𐆀", 2));  | 
2301  |  |         assert!(testlook!(look, "𝛃𐆀", 3));  | 
2302  |  |         assert!(testlook!(look, "𝛃𐆀", 5));  | 
2303  |  |         assert!(testlook!(look, "𝛃𐆀", 6));  | 
2304  |  |         assert!(testlook!(look, "𝛃𐆀", 7));  | 
2305  |  |         assert!(testlook!(look, "𝛃𐆀", 8));  | 
2306  |  |     }  | 
2307  |  |  | 
2308  |  |     #[test]  | 
2309  |  |     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]  | 
2310  |  |     fn look_matches_word_start_half_unicode() { | 
2311  |  |         let look = Look::WordStartHalfUnicode;  | 
2312  |  |  | 
2313  |  |         // Exercises `Look::WordStartHalfUnicode` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
2314  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
2315  |  |  | 
2316  |  |         // Simple ASCII word boundaries: the offsets directly after `a` are the ones that fail.  | 
2317  |  |         assert!(testlook!(look, "a", 0));  | 
2318  |  |         assert!(!testlook!(look, "a", 1));  | 
2319  |  |         assert!(!testlook!(look, "a ", 1));  | 
2320  |  |         assert!(testlook!(look, " a ", 1));  | 
2321  |  |         assert!(!testlook!(look, " a ", 2));  | 
2322  |  |  | 
2323  |  |         // Unicode word boundaries with a non-ASCII codepoint: the offsets directly after 𝛃 fail.  | 
2324  |  |         assert!(testlook!(look, "𝛃", 0));  | 
2325  |  |         assert!(!testlook!(look, "𝛃", 4));  | 
2326  |  |         assert!(!testlook!(look, "𝛃 ", 4));  | 
2327  |  |         assert!(testlook!(look, " 𝛃 ", 1));  | 
2328  |  |         assert!(!testlook!(look, " 𝛃 ", 5));  | 
2329  |  |  | 
2330  |  |         // Unicode word boundaries between non-ASCII codepoints: offset 4 follows the word codepoint 𝛃 and fails.  | 
2331  |  |         assert!(testlook!(look, "𝛃𐆀", 0));  | 
2332  |  |         assert!(!testlook!(look, "𝛃𐆀", 4));  | 
2333  |  |  | 
2334  |  |         // Non word boundaries for ASCII. Only `"ab", 1` fails; it is the only offset here directly after a word character.  | 
2335  |  |         assert!(testlook!(look, "", 0));  | 
2336  |  |         assert!(!testlook!(look, "ab", 1));  | 
2337  |  |         assert!(testlook!(look, "a ", 2));  | 
2338  |  |         assert!(testlook!(look, " a ", 0));  | 
2339  |  |         assert!(testlook!(look, " a ", 3));  | 
2340  |  |  | 
2341  |  |         // Non word boundaries with a non-ASCII codepoint. Offsets after a word character or inside 𝛃 fail; the others match.  | 
2342  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
2343  |  |         assert!(!testlook!(look, "b𝛃", 1));  | 
2344  |  |         assert!(testlook!(look, "𝛃 ", 5));  | 
2345  |  |         assert!(testlook!(look, " 𝛃 ", 0));  | 
2346  |  |         assert!(testlook!(look, " 𝛃 ", 6));  | 
2347  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
2348  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
2349  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
2350  |  |  | 
2351  |  |         // Non word boundaries with non-ASCII codepoints. Offsets inside a codepoint fail; offset 8 matches because it follows the non-word codepoint 𐆀.  | 
2352  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
2353  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
2354  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
2355  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
2356  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
2357  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
2358  |  |         assert!(testlook!(look, "𝛃𐆀", 8));  | 
2359  |  |     }  | 
2360  |  |  | 
2361  |  |     #[test]  | 
2362  |  |     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]  | 
2363  |  |     fn look_matches_word_end_half_unicode() { | 
2364  |  |         let look = Look::WordEndHalfUnicode;  | 
2365  |  |  | 
2366  |  |         // Exercises `Look::WordEndHalfUnicode` at byte offsets. Legend: \xF0\x9D\x9B\x83 = 𝛃 (in \w)  | 
2367  |  |         // and \xF0\x90\x86\x80 = 𐆀 (not in \w); each codepoint is 4 bytes of UTF-8.  | 
2368  |  |  | 
2369  |  |         // Simple ASCII word boundaries: the offsets directly before `a` are the ones that fail.  | 
2370  |  |         assert!(!testlook!(look, "a", 0));  | 
2371  |  |         assert!(testlook!(look, "a", 1));  | 
2372  |  |         assert!(testlook!(look, "a ", 1));  | 
2373  |  |         assert!(!testlook!(look, " a ", 1));  | 
2374  |  |         assert!(testlook!(look, " a ", 2));  | 
2375  |  |  | 
2376  |  |         // Unicode word boundaries with a non-ASCII codepoint: the offsets directly before 𝛃 fail.  | 
2377  |  |         assert!(!testlook!(look, "𝛃", 0));  | 
2378  |  |         assert!(testlook!(look, "𝛃", 4));  | 
2379  |  |         assert!(testlook!(look, "𝛃 ", 4));  | 
2380  |  |         assert!(!testlook!(look, " 𝛃 ", 1));  | 
2381  |  |         assert!(testlook!(look, " 𝛃 ", 5));  | 
2382  |  |  | 
2383  |  |         // Unicode word boundaries between non-ASCII codepoints: offset 0 precedes the word codepoint 𝛃 and fails.  | 
2384  |  |         assert!(!testlook!(look, "𝛃𐆀", 0));  | 
2385  |  |         assert!(testlook!(look, "𝛃𐆀", 4));  | 
2386  |  |  | 
2387  |  |         // Non word boundaries for ASCII. Only `"ab", 1` fails; it is the only offset here directly before a word character.  | 
2388  |  |         assert!(testlook!(look, "", 0));  | 
2389  |  |         assert!(!testlook!(look, "ab", 1));  | 
2390  |  |         assert!(testlook!(look, "a ", 2));  | 
2391  |  |         assert!(testlook!(look, " a ", 0));  | 
2392  |  |         assert!(testlook!(look, " a ", 3));  | 
2393  |  |  | 
2394  |  |         // Non word boundaries with a non-ASCII codepoint. Offsets before a word character or inside 𝛃 fail; the others match.  | 
2395  |  |         assert!(!testlook!(look, "𝛃b", 4));  | 
2396  |  |         assert!(!testlook!(look, "b𝛃", 1));  | 
2397  |  |         assert!(testlook!(look, "𝛃 ", 5));  | 
2398  |  |         assert!(testlook!(look, " 𝛃 ", 0));  | 
2399  |  |         assert!(testlook!(look, " 𝛃 ", 6));  | 
2400  |  |         assert!(!testlook!(look, "𝛃", 1));  | 
2401  |  |         assert!(!testlook!(look, "𝛃", 2));  | 
2402  |  |         assert!(!testlook!(look, "𝛃", 3));  | 
2403  |  |  | 
2404  |  |         // Non word boundaries with non-ASCII codepoints. Offsets inside a codepoint fail; offset 8 matches because it is the end of the haystack.  | 
2405  |  |         assert!(!testlook!(look, "𝛃𐆀", 1));  | 
2406  |  |         assert!(!testlook!(look, "𝛃𐆀", 2));  | 
2407  |  |         assert!(!testlook!(look, "𝛃𐆀", 3));  | 
2408  |  |         assert!(!testlook!(look, "𝛃𐆀", 5));  | 
2409  |  |         assert!(!testlook!(look, "𝛃𐆀", 6));  | 
2410  |  |         assert!(!testlook!(look, "𝛃𐆀", 7));  | 
2411  |  |         assert!(testlook!(look, "𝛃𐆀", 8));  | 
2412  |  |     }  | 
2413  |  |  | 
#[test]
fn look_set() {
    // A default-constructed set must not report any of these assertions as
    // present.
    let initially_absent = [
        Look::Start,
        Look::End,
        Look::StartLF,
        Look::EndLF,
        Look::WordUnicode,
        Look::WordUnicodeNegate,
        Look::WordAscii,
        Look::WordAsciiNegate,
    ];
    let mut set = LookSet::default();
    for &look in initially_absent.iter() {
        assert!(!set.contains(look));
    }

    // For every assertion, in turn: inserting it must make `contains` report
    // it, and removing it must make `contains` stop reporting it. The same
    // set value is threaded through all round trips.
    let round_trip = [
        Look::Start,
        Look::End,
        Look::StartLF,
        Look::EndLF,
        Look::StartCRLF,
        Look::EndCRLF,
        Look::WordUnicode,
        Look::WordUnicodeNegate,
        Look::WordAscii,
        Look::WordAsciiNegate,
        Look::WordStartAscii,
        Look::WordEndAscii,
        Look::WordStartUnicode,
        Look::WordEndUnicode,
        Look::WordStartHalfAscii,
        Look::WordEndHalfAscii,
        Look::WordStartHalfUnicode,
        Look::WordEndHalfUnicode,
    ];
    for &look in round_trip.iter() {
        set = set.insert(look);
        assert!(set.contains(look));
        set = set.remove(look);
        assert!(!set.contains(look));
    }
}
2516  |  |  | 
#[test]
fn look_set_iter() {
    // Each case pairs a set with the number of items its iterator must
    // yield: nothing for the empty set, 18 for the full set, and one item
    // per inserted assertion otherwise.
    let cases = [
        (LookSet::empty(), 0),
        (LookSet::full(), 18),
        (
            LookSet::empty()
                .insert(Look::StartLF)
                .insert(Look::WordUnicode),
            2,
        ),
        (LookSet::empty().insert(Look::StartLF), 1),
        (LookSet::empty().insert(Look::WordAsciiNegate), 1),
        (LookSet::empty().insert(Look::WordEndHalfUnicode), 1),
    ];
    for &(set, expected) in cases.iter() {
        assert_eq!(expected, set.iter().count());
    }
}
2538  |  |  | 
#[test]
#[cfg(feature = "alloc")]
fn look_set_debug() {
    // The empty set has a dedicated `Debug` rendering.
    assert_eq!("∅", alloc::format!("{:?}", LookSet::empty()));

    // The full set's `Debug` output must be exactly this string.
    let rendered_full = alloc::format!("{:?}", LookSet::full());
    assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", rendered_full);
}
2547  |  | }  |