/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.7/src/util/look.rs
Line | Count | Source |
1 | | /*! |
2 | | Types and routines for working with look-around assertions. |
3 | | |
4 | | This module principally defines three types: |
5 | | |
6 | | * [`Look`] enumerates all of the assertions supported by this crate. |
7 | | * [`LookSet`] provides a way to efficiently store a set of [`Look`] values. |
8 | | * [`LookMatcher`] provides routines for checking whether a `Look` or a |
9 | | `LookSet` matches at a particular position in a haystack. |
10 | | */ |
11 | | |
12 | | // LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` was basically |
13 | | // copied verbatim from the regex-syntax crate. I would have no problems using |
14 | | // the regex-syntax types and defining the matching routines (only found |
15 | | // in this crate) as free functions, except the `Look` and `LookSet` types |
16 | | // are used in lots of places. Including in places we expect to work when |
17 | | // regex-syntax is *not* enabled, such as in the definition of the NFA itself. |
18 | | // |
19 | | // Thankfully the code we copy is pretty simple and there isn't much of it. |
20 | | // Otherwise, the rest of this module deals with *matching* the assertions, |
21 | | // which is not something that regex-syntax handles. |
22 | | |
23 | | use crate::util::{escape::DebugByte, utf8}; |
24 | | |
25 | | /// A look-around assertion. |
26 | | /// |
27 | | /// An assertion matches at a position between characters in a haystack. |
28 | | /// Namely, unlike most parts of a regular expression, it does not actually |
29 | | /// "consume" any input. Assertions are a way of stating that some property must be |
30 | | /// true at a particular point during matching. |
31 | | /// |
32 | | /// For example, `(?m)^[a-z]+$` is a pattern that: |
33 | | /// |
34 | | /// * Scans the haystack for a position at which `(?m:^)` is satisfied. That |
35 | | /// occurs at either the beginning of the haystack, or immediately following |
36 | | /// a `\n` character. |
37 | | /// * Looks for one or more occurrences of `[a-z]`. |
38 | | /// * Once `[a-z]+` has matched as much as it can, an overall match is only |
39 | | /// reported when `[a-z]+` stops just before a `\n` or at the end of the haystack. |
40 | | /// |
41 | | /// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not. |
42 | | /// |
43 | | /// Assertions are also called "look-around," "look-behind" and "look-ahead." |
44 | | /// Specifically, some assertions are look-behind (like `^`), other assertions |
45 | | /// are look-ahead (like `$`) and yet other assertions are both look-ahead and |
46 | | /// look-behind (like `\b`). |
47 | | /// |
48 | | /// # Assertions in an NFA |
49 | | /// |
50 | | /// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be |
51 | | /// thought of as a conditional epsilon transition. That is, a matching engine |
52 | | /// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits |
53 | | /// moving through conditional epsilon transitions when their condition |
54 | | /// is satisfied at whatever position the `PikeVM` is currently at in the |
55 | | /// haystack. |
56 | | /// |
57 | | /// How assertions are handled in a `DFA` is trickier, since a DFA does not |
58 | | /// have epsilon transitions at all. In this case, they are compiled into the |
59 | | /// automaton itself, at the expense of more states than what would be required |
60 | | /// without an assertion. |
61 | | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
62 | | pub enum Look { |
63 | | /// Match the beginning of text. Specifically, this matches at the starting |
64 | | /// position of the input. |
65 | | Start = 1 << 0, |
66 | | /// Match the end of text. Specifically, this matches at the ending |
67 | | /// position of the input. |
68 | | End = 1 << 1, |
69 | | /// Match the beginning of a line or the beginning of text. Specifically, |
70 | | /// this matches at the starting position of the input, or at the position |
71 | | /// immediately following a `\n` character. |
72 | | StartLF = 1 << 2, |
73 | | /// Match the end of a line or the end of text. Specifically, this matches |
74 | | /// at the end position of the input, or at the position immediately |
75 | | /// preceding a `\n` character. |
76 | | EndLF = 1 << 3, |
77 | | /// Match the beginning of a line or the beginning of text. Specifically, |
78 | | /// this matches at the starting position of the input, or at the position |
79 | | /// immediately following either a `\r` or `\n` character, but never after |
80 | | /// a `\r` when a `\n` follows. |
81 | | StartCRLF = 1 << 4, |
82 | | /// Match the end of a line or the end of text. Specifically, this matches |
83 | | /// at the end position of the input, or at the position immediately |
84 | | /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` |
85 | | /// precedes it. |
86 | | EndCRLF = 1 << 5, |
87 | | /// Match an ASCII-only word boundary. That is, this matches a position |
88 | | /// where the left adjacent character and right adjacent character |
89 | | /// correspond to a word and non-word or a non-word and word character. |
90 | | WordAscii = 1 << 6, |
91 | | /// Match an ASCII-only negation of a word boundary. |
92 | | WordAsciiNegate = 1 << 7, |
93 | | /// Match a Unicode-aware word boundary. That is, this matches a position |
94 | | /// where the left adjacent character and right adjacent character |
95 | | /// correspond to a word and non-word or a non-word and word character. |
96 | | WordUnicode = 1 << 8, |
97 | | /// Match a Unicode-aware negation of a word boundary. |
98 | | WordUnicodeNegate = 1 << 9, |
99 | | /// Match the start of an ASCII-only word boundary. That is, this matches a |
100 | | /// position at either the beginning of the haystack or where the previous |
101 | | /// character is not a word character and the following character is a word |
102 | | /// character. |
103 | | WordStartAscii = 1 << 10, |
104 | | /// Match the end of an ASCII-only word boundary. That is, this matches |
105 | | /// a position at either the end of the haystack or where the previous |
106 | | /// character is a word character and the following character is not a word |
107 | | /// character. |
108 | | WordEndAscii = 1 << 11, |
109 | | /// Match the start of a Unicode word boundary. That is, this matches a |
110 | | /// position at either the beginning of the haystack or where the previous |
111 | | /// character is not a word character and the following character is a word |
112 | | /// character. |
113 | | WordStartUnicode = 1 << 12, |
114 | | /// Match the end of a Unicode word boundary. That is, this matches a |
115 | | /// position at either the end of the haystack or where the previous |
116 | | /// character is a word character and the following character is not a word |
117 | | /// character. |
118 | | WordEndUnicode = 1 << 13, |
119 | | /// Match the start half of an ASCII-only word boundary. That is, this |
120 | | /// matches a position at either the beginning of the haystack or where the |
121 | | /// previous character is not a word character. |
122 | | WordStartHalfAscii = 1 << 14, |
123 | | /// Match the end half of an ASCII-only word boundary. That is, this |
124 | | /// matches a position at either the end of the haystack or where the |
125 | | /// following character is not a word character. |
126 | | WordEndHalfAscii = 1 << 15, |
127 | | /// Match the start half of a Unicode word boundary. That is, this matches |
128 | | /// a position at either the beginning of the haystack or where the |
129 | | /// previous character is not a word character. |
130 | | WordStartHalfUnicode = 1 << 16, |
131 | | /// Match the end half of a Unicode word boundary. That is, this matches |
132 | | /// a position at either the end of the haystack or where the following |
133 | | /// character is not a word character. |
134 | | WordEndHalfUnicode = 1 << 17, |
135 | | } |
136 | | |
137 | | impl Look { |
138 | | /// Flip the look-around assertion to its equivalent for reverse searches. |
139 | | /// For example, `StartLF` gets translated to `EndLF`. |
140 | | /// |
141 | | /// Some assertions, such as `WordUnicode`, remain the same since they |
142 | | /// match the same positions regardless of the direction of the search. |
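| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch of the expected behavior: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::Look; |
| | /// |
| | /// assert_eq!(Look::End, Look::Start.reversed()); |
| | /// assert_eq!(Look::StartLF, Look::EndLF.reversed()); |
| | /// // Word boundaries look the same in both directions. |
| | /// assert_eq!(Look::WordAscii, Look::WordAscii.reversed()); |
| | /// ``` |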
143 | | #[inline] |
144 | 0 | pub const fn reversed(self) -> Look { |
145 | 0 | match self { |
146 | 0 | Look::Start => Look::End, |
147 | 0 | Look::End => Look::Start, |
148 | 0 | Look::StartLF => Look::EndLF, |
149 | 0 | Look::EndLF => Look::StartLF, |
150 | 0 | Look::StartCRLF => Look::EndCRLF, |
151 | 0 | Look::EndCRLF => Look::StartCRLF, |
152 | 0 | Look::WordAscii => Look::WordAscii, |
153 | 0 | Look::WordAsciiNegate => Look::WordAsciiNegate, |
154 | 0 | Look::WordUnicode => Look::WordUnicode, |
155 | 0 | Look::WordUnicodeNegate => Look::WordUnicodeNegate, |
156 | 0 | Look::WordStartAscii => Look::WordEndAscii, |
157 | 0 | Look::WordEndAscii => Look::WordStartAscii, |
158 | 0 | Look::WordStartUnicode => Look::WordEndUnicode, |
159 | 0 | Look::WordEndUnicode => Look::WordStartUnicode, |
160 | 0 | Look::WordStartHalfAscii => Look::WordEndHalfAscii, |
161 | 0 | Look::WordEndHalfAscii => Look::WordStartHalfAscii, |
162 | 0 | Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, |
163 | 0 | Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, |
164 | | } |
165 | 0 | } |
166 | | |
167 | | /// Return the underlying representation of this look-around enumeration |
168 | | /// as an integer. Giving the return value to the [`Look::from_repr`] |
169 | | /// constructor is guaranteed to return the same look-around variant that |
170 | | /// one started with within a semver compatible release of this crate. |
171 | | #[inline] |
172 | 0 | pub const fn as_repr(self) -> u32 { |
173 | 0 | // AFAIK, 'as' is the only way to zero-cost convert an int enum to an |
174 | 0 | // actual int. |
175 | 0 | self as u32 |
176 | 0 | } |
177 | | |
178 | | /// Given the underlying representation of a `Look` value, return the |
179 | | /// corresponding `Look` value if the representation is valid. Otherwise |
180 | | /// `None` is returned. |
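| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch of the `as_repr`/`from_repr` round trip: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::Look; |
| | /// |
| | /// let repr = Look::WordAscii.as_repr(); |
| | /// assert_eq!(Some(Look::WordAscii), Look::from_repr(repr)); |
| | /// // A value that isn't exactly one assertion bit is rejected. |
| | /// assert_eq!(None, Look::from_repr(3)); |
| | /// ``` |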
181 | | #[inline] |
182 | 0 | pub const fn from_repr(repr: u32) -> Option<Look> { |
183 | 0 | match repr { |
184 | 0 | 0b00_0000_0000_0000_0001 => Some(Look::Start), |
185 | 0 | 0b00_0000_0000_0000_0010 => Some(Look::End), |
186 | 0 | 0b00_0000_0000_0000_0100 => Some(Look::StartLF), |
187 | 0 | 0b00_0000_0000_0000_1000 => Some(Look::EndLF), |
188 | 0 | 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), |
189 | 0 | 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), |
190 | 0 | 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), |
191 | 0 | 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), |
192 | 0 | 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), |
193 | 0 | 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), |
194 | 0 | 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), |
195 | 0 | 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), |
196 | 0 | 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), |
197 | 0 | 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), |
198 | 0 | 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), |
199 | 0 | 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), |
200 | 0 | 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), |
201 | 0 | 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), |
202 | 0 | _ => None, |
203 | | } |
204 | 0 | } |
205 | | |
206 | | /// Returns a convenient single codepoint representation of this |
207 | | /// look-around assertion. Each assertion is guaranteed to be represented |
208 | | /// by a distinct character. |
209 | | /// |
210 | | /// This is useful for representing a look-around assertion in human |
211 | | /// friendly but succinct output intended for a programmer working on |
212 | | /// regex internals. |
213 | | #[inline] |
214 | 0 | pub const fn as_char(self) -> char { |
215 | 0 | match self { |
216 | 0 | Look::Start => 'A', |
217 | 0 | Look::End => 'z', |
218 | 0 | Look::StartLF => '^', |
219 | 0 | Look::EndLF => '$', |
220 | 0 | Look::StartCRLF => 'r', |
221 | 0 | Look::EndCRLF => 'R', |
222 | 0 | Look::WordAscii => 'b', |
223 | 0 | Look::WordAsciiNegate => 'B', |
224 | 0 | Look::WordUnicode => '𝛃', |
225 | 0 | Look::WordUnicodeNegate => '𝚩', |
226 | 0 | Look::WordStartAscii => '<', |
227 | 0 | Look::WordEndAscii => '>', |
228 | 0 | Look::WordStartUnicode => '〈', |
229 | 0 | Look::WordEndUnicode => '〉', |
230 | 0 | Look::WordStartHalfAscii => '◁', |
231 | 0 | Look::WordEndHalfAscii => '▷', |
232 | 0 | Look::WordStartHalfUnicode => '◀', |
233 | 0 | Look::WordEndHalfUnicode => '▶', |
234 | | } |
235 | 0 | } |
236 | | } |
237 | | |
238 | | /// LookSet is a memory-efficient set of look-around assertions. |
239 | | /// |
240 | | /// This is useful for efficiently tracking look-around assertions. For |
241 | | /// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties |
242 | | /// that return `LookSet`s. |
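| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch of how set membership maps to bits of the underlying |
| | /// representation: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::{Look, LookSet}; |
| | /// |
| | /// let set = LookSet::empty().insert(Look::Start).insert(Look::WordAscii); |
| | /// assert_eq!(2, set.len()); |
| | /// assert!(set.contains(Look::Start)); |
| | /// assert!(!set.contains(Look::End)); |
| | /// // Each member occupies the bit given by `Look::as_repr`. |
| | /// assert_eq!(set.bits, Look::Start.as_repr() | Look::WordAscii.as_repr()); |
| | /// ``` |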
243 | | #[derive(Clone, Copy, Default, Eq, PartialEq)] |
244 | | pub struct LookSet { |
245 | | /// The underlying representation of this set is exposed so that it can be |
246 | | /// stored somewhere efficiently. The representation is that |
247 | | /// of a bitset, where each assertion occupies bit `i` where |
248 | | /// `i = Look::as_repr()`. |
249 | | /// |
250 | | /// Note that users of this internal representation must permit the full |
251 | | /// range of `u32` values to be represented. For example, even if the |
252 | | /// current implementation only makes use of the 18 least significant bits, |
253 | | /// it may use more bits in a future semver compatible release. |
254 | | pub bits: u32, |
255 | | } |
256 | | |
257 | | impl LookSet { |
258 | | /// Create an empty set of look-around assertions. |
259 | | #[inline] |
260 | 0 | pub fn empty() -> LookSet { |
261 | 0 | LookSet { bits: 0 } |
262 | 0 | } |
263 | | |
264 | | /// Create a full set of look-around assertions. |
265 | | /// |
266 | | /// This set contains all possible look-around assertions. |
267 | | #[inline] |
268 | 0 | pub fn full() -> LookSet { |
269 | 0 | LookSet { bits: !0 } |
270 | 0 | } |
271 | | |
272 | | /// Create a look-around set containing the look-around assertion given. |
273 | | /// |
274 | | /// This is a convenience routine for creating an empty set and inserting |
275 | | /// one look-around assertion. |
276 | | #[inline] |
277 | 0 | pub fn singleton(look: Look) -> LookSet { |
278 | 0 | LookSet::empty().insert(look) |
279 | 0 | } |
280 | | |
281 | | /// Returns the total number of look-around assertions in this set. |
282 | | #[inline] |
283 | 0 | pub fn len(self) -> usize { |
284 | 0 | // OK because max value always fits in a u8, which in turn always |
285 | 0 | // fits in a usize, regardless of target. |
286 | 0 | usize::try_from(self.bits.count_ones()).unwrap() |
287 | 0 | } |
288 | | |
289 | | /// Returns true if and only if this set is empty. |
290 | | #[inline] |
291 | 0 | pub fn is_empty(self) -> bool { |
292 | 0 | self.len() == 0 |
293 | 0 | } |
294 | | |
295 | | /// Returns true if and only if the given look-around assertion is in this |
296 | | /// set. |
297 | | #[inline] |
298 | 0 | pub fn contains(self, look: Look) -> bool { |
299 | 0 | self.bits & look.as_repr() != 0 |
300 | 0 | } |
301 | | |
302 | | /// Returns true if and only if this set contains any anchor assertions. |
303 | | /// This includes both "start/end of haystack" and "start/end of line." |
304 | | #[inline] |
305 | 0 | pub fn contains_anchor(&self) -> bool { |
306 | 0 | self.contains_anchor_haystack() || self.contains_anchor_line() |
307 | 0 | } |
308 | | |
309 | | /// Returns true if and only if this set contains any "start/end of |
310 | | /// haystack" anchors. This doesn't include "start/end of line" anchors. |
311 | | #[inline] |
312 | 0 | pub fn contains_anchor_haystack(&self) -> bool { |
313 | 0 | self.contains(Look::Start) || self.contains(Look::End) |
314 | 0 | } |
315 | | |
316 | | /// Returns true if and only if this set contains any "start/end of line" |
317 | | /// anchors. This doesn't include "start/end of haystack" anchors. This |
318 | | /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. |
319 | | #[inline] |
320 | 0 | pub fn contains_anchor_line(&self) -> bool { |
321 | 0 | self.contains(Look::StartLF) |
322 | 0 | || self.contains(Look::EndLF) |
323 | 0 | || self.contains(Look::StartCRLF) |
324 | 0 | || self.contains(Look::EndCRLF) |
325 | 0 | } |
326 | | |
327 | | /// Returns true if and only if this set contains any "start/end of line" |
328 | | /// anchors that only treat `\n` as line terminators. This does not include |
329 | | /// haystack anchors or CRLF aware line anchors. |
330 | | #[inline] |
331 | 0 | pub fn contains_anchor_lf(&self) -> bool { |
332 | 0 | self.contains(Look::StartLF) || self.contains(Look::EndLF) |
333 | 0 | } |
334 | | |
335 | | /// Returns true if and only if this set contains any "start/end of line" |
336 | | /// anchors that are CRLF-aware. This doesn't include "start/end of |
337 | | /// haystack" or "start/end of line-feed" anchors. |
338 | | #[inline] |
339 | 0 | pub fn contains_anchor_crlf(&self) -> bool { |
340 | 0 | self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) |
341 | 0 | } |
342 | | |
343 | | /// Returns true if and only if this set contains any word boundary or |
344 | | /// negated word boundary assertions. This includes both Unicode and ASCII |
345 | | /// word boundaries. |
346 | | #[inline] |
347 | 0 | pub fn contains_word(self) -> bool { |
348 | 0 | self.contains_word_unicode() || self.contains_word_ascii() |
349 | 0 | } |
350 | | |
351 | | /// Returns true if and only if this set contains any Unicode word boundary |
352 | | /// or negated Unicode word boundary assertions. |
353 | | #[inline] |
354 | 0 | pub fn contains_word_unicode(self) -> bool { |
355 | 0 | self.contains(Look::WordUnicode) |
356 | 0 | || self.contains(Look::WordUnicodeNegate) |
357 | 0 | || self.contains(Look::WordStartUnicode) |
358 | 0 | || self.contains(Look::WordEndUnicode) |
359 | 0 | || self.contains(Look::WordStartHalfUnicode) |
360 | 0 | || self.contains(Look::WordEndHalfUnicode) |
361 | 0 | } |
362 | | |
363 | | /// Returns true if and only if this set contains any ASCII word boundary |
364 | | /// or negated ASCII word boundary assertions. |
365 | | #[inline] |
366 | 0 | pub fn contains_word_ascii(self) -> bool { |
367 | 0 | self.contains(Look::WordAscii) |
368 | 0 | || self.contains(Look::WordAsciiNegate) |
369 | 0 | || self.contains(Look::WordStartAscii) |
370 | 0 | || self.contains(Look::WordEndAscii) |
371 | 0 | || self.contains(Look::WordStartHalfAscii) |
372 | 0 | || self.contains(Look::WordEndHalfAscii) |
373 | 0 | } |
374 | | |
375 | | /// Returns an iterator over all of the look-around assertions in this set. |
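| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch showing that assertions are yielded starting from the |
| | /// lowest bit: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::{Look, LookSet}; |
| | /// |
| | /// let set = LookSet::empty().insert(Look::WordAscii).insert(Look::Start); |
| | /// let mut it = set.iter(); |
| | /// assert_eq!(Some(Look::Start), it.next()); |
| | /// assert_eq!(Some(Look::WordAscii), it.next()); |
| | /// assert_eq!(None, it.next()); |
| | /// ``` |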
376 | | #[inline] |
377 | 0 | pub fn iter(self) -> LookSetIter { |
378 | 0 | LookSetIter { set: self } |
379 | 0 | } |
380 | | |
381 | | /// Return a new set that is equivalent to the original, but with the given |
382 | | /// assertion added to it. If the assertion is already in the set, then the |
383 | | /// returned set is equivalent to the original. |
384 | | #[inline] |
385 | 0 | pub fn insert(self, look: Look) -> LookSet { |
386 | 0 | LookSet { bits: self.bits | look.as_repr() } |
387 | 0 | } |
388 | | |
389 | | /// Updates this set in place with the result of inserting the given |
390 | | /// assertion into this set. |
391 | | #[inline] |
392 | 0 | pub fn set_insert(&mut self, look: Look) { |
393 | 0 | *self = self.insert(look); |
394 | 0 | } |
395 | | |
396 | | /// Return a new set that is equivalent to the original, but with the given |
397 | | /// assertion removed from it. If the assertion is not in the set, then the |
398 | | /// returned set is equivalent to the original. |
399 | | #[inline] |
400 | 0 | pub fn remove(self, look: Look) -> LookSet { |
401 | 0 | LookSet { bits: self.bits & !look.as_repr() } |
402 | 0 | } |
403 | | |
404 | | /// Updates this set in place with the result of removing the given |
405 | | /// assertion from this set. |
406 | | #[inline] |
407 | 0 | pub fn set_remove(&mut self, look: Look) { |
408 | 0 | *self = self.remove(look); |
409 | 0 | } |
410 | | |
411 | | /// Returns a new set that is the result of subtracting the given set from |
412 | | /// this set. |
413 | | #[inline] |
414 | 0 | pub fn subtract(self, other: LookSet) -> LookSet { |
415 | 0 | LookSet { bits: self.bits & !other.bits } |
416 | 0 | } |
417 | | |
418 | | /// Updates this set in place with the result of subtracting the given set |
419 | | /// from this set. |
420 | | #[inline] |
421 | 0 | pub fn set_subtract(&mut self, other: LookSet) { |
422 | 0 | *self = self.subtract(other); |
423 | 0 | } |
424 | | |
425 | | /// Returns a new set that is the union of this and the one given. |
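| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch of the set algebra on offer: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::{Look, LookSet}; |
| | /// |
| | /// let a = LookSet::singleton(Look::Start).insert(Look::WordAscii); |
| | /// let b = LookSet::singleton(Look::WordAscii).insert(Look::End); |
| | /// assert_eq!(3, a.union(b).len()); |
| | /// assert_eq!(LookSet::singleton(Look::WordAscii), a.intersect(b)); |
| | /// assert_eq!(LookSet::singleton(Look::Start), a.subtract(b)); |
| | /// ``` |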
426 | | #[inline] |
427 | 0 | pub fn union(self, other: LookSet) -> LookSet { |
428 | 0 | LookSet { bits: self.bits | other.bits } |
429 | 0 | } |
430 | | |
431 | | /// Updates this set in place with the result of unioning it with the one |
432 | | /// given. |
433 | | #[inline] |
434 | 0 | pub fn set_union(&mut self, other: LookSet) { |
435 | 0 | *self = self.union(other); |
436 | 0 | } |
437 | | |
438 | | /// Returns a new set that is the intersection of this and the one given. |
439 | | #[inline] |
440 | 0 | pub fn intersect(self, other: LookSet) -> LookSet { |
441 | 0 | LookSet { bits: self.bits & other.bits } |
442 | 0 | } |
443 | | |
444 | | /// Updates this set in place with the result of intersecting it with the |
445 | | /// one given. |
446 | | #[inline] |
447 | 0 | pub fn set_intersect(&mut self, other: LookSet) { |
448 | 0 | *self = self.intersect(other); |
449 | 0 | } |
450 | | |
451 | | /// Return a `LookSet` built from the first 4 bytes of the given slice, |
452 | | /// interpreted as a native endian 32-bit integer. |
453 | | /// |
454 | | /// # Panics |
455 | | /// |
456 | | /// This panics if `slice.len() < 4`. |
457 | | #[inline] |
458 | 0 | pub fn read_repr(slice: &[u8]) -> LookSet { |
459 | 0 | let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); |
460 | 0 | LookSet { bits } |
461 | 0 | } |
462 | | |
463 | | /// Write a `LookSet` as a native endian 32-bit integer to the beginning |
464 | | /// of the slice given. |
465 | | /// |
466 | | /// # Panics |
467 | | /// |
468 | | /// This panics if `slice.len() < 4`. |
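| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch of a `write_repr`/`read_repr` round trip: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::{Look, LookSet}; |
| | /// |
| | /// let set = LookSet::empty().insert(Look::StartLF).insert(Look::EndLF); |
| | /// let mut buf = [0u8; 4]; |
| | /// set.write_repr(&mut buf); |
| | /// assert_eq!(set, LookSet::read_repr(&buf)); |
| | /// ``` |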
469 | | #[inline] |
470 | 0 | pub fn write_repr(self, slice: &mut [u8]) { |
471 | 0 | let raw = self.bits.to_ne_bytes(); |
472 | 0 | slice[0] = raw[0]; |
473 | 0 | slice[1] = raw[1]; |
474 | 0 | slice[2] = raw[2]; |
475 | 0 | slice[3] = raw[3]; |
476 | 0 | } |
477 | | |
478 | | /// Checks that all assertions in this set can be matched. |
479 | | /// |
480 | | /// Some assertions, such as Unicode word boundaries, require optional (but |
481 | | /// enabled by default) tables that may not be available. If there are |
482 | | /// assertions in this set that require tables that are not available, then |
483 | | /// this will return an error. |
484 | | /// |
485 | | /// Specifically, this returns an error when the |
486 | | /// `unicode-word-boundary` feature is _not_ enabled _and_ this set |
487 | | /// contains a Unicode word boundary assertion. |
488 | | /// |
489 | | /// It can be useful to use this on the result of |
490 | | /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any) |
491 | | /// when building a matcher engine to ensure methods like |
492 | | /// [`LookMatcher::matches_set`] do not panic at search time. |
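| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch: a set containing only ASCII assertions is always |
| | /// available, regardless of which crate features are enabled. |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::{Look, LookSet}; |
| | /// |
| | /// let set = LookSet::empty().insert(Look::WordAscii).insert(Look::StartLF); |
| | /// assert!(set.available().is_ok()); |
| | /// ``` |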
493 | 0 | pub fn available(self) -> Result<(), UnicodeWordBoundaryError> { |
494 | 0 | if self.contains_word_unicode() { |
495 | 0 | UnicodeWordBoundaryError::check()?; |
496 | 0 | } |
497 | 0 | Ok(()) |
498 | 0 | } |
499 | | } |
500 | | |
501 | | impl core::fmt::Debug for LookSet { |
502 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
503 | 0 | if self.is_empty() { |
504 | 0 | return write!(f, "∅"); |
505 | 0 | } |
506 | 0 | for look in self.iter() { |
507 | 0 | write!(f, "{}", look.as_char())?; |
508 | | } |
509 | 0 | Ok(()) |
510 | 0 | } |
511 | | } |
512 | | |
513 | | /// An iterator over all look-around assertions in a [`LookSet`]. |
514 | | /// |
515 | | /// This iterator is created by [`LookSet::iter`]. |
516 | | #[derive(Clone, Debug)] |
517 | | pub struct LookSetIter { |
518 | | set: LookSet, |
519 | | } |
520 | | |
521 | | impl Iterator for LookSetIter { |
522 | | type Item = Look; |
523 | | |
524 | | #[inline] |
525 | 0 | fn next(&mut self) -> Option<Look> { |
526 | 0 | if self.set.is_empty() { |
527 | 0 | return None; |
528 | 0 | } |
529 | 0 | // We'll never have more than u8::MAX distinct look-around assertions, |
530 | 0 | // so 'bit' will always fit into a u16. |
531 | 0 | let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); |
532 | 0 | let look = Look::from_repr(1 << bit)?; |
533 | 0 | self.set = self.set.remove(look); |
534 | 0 | Some(look) |
535 | 0 | } |
536 | | } |
537 | | |
538 | | /// A matcher for look-around assertions. |
539 | | /// |
540 | | /// This matcher permits configuring aspects of how look-around assertions are |
541 | | /// matched. |
542 | | /// |
543 | | /// # Example |
544 | | /// |
545 | | /// A `LookMatcher` can change the line terminator used for matching multi-line |
546 | | /// anchors such as `(?m:^)` and `(?m:$)`. |
547 | | /// |
548 | | /// ``` |
549 | | /// use regex_automata::{ |
550 | | /// nfa::thompson::{self, pikevm::PikeVM}, |
551 | | /// util::look::LookMatcher, |
552 | | /// Match, Input, |
553 | | /// }; |
554 | | /// |
555 | | /// let mut lookm = LookMatcher::new(); |
556 | | /// lookm.set_line_terminator(b'\x00'); |
557 | | /// |
558 | | /// let re = PikeVM::builder() |
559 | | /// .thompson(thompson::Config::new().look_matcher(lookm)) |
560 | | /// .build(r"(?m)^[a-z]+$")?; |
561 | | /// let mut cache = re.create_cache(); |
562 | | /// |
563 | | /// // Multi-line assertions now use NUL as a terminator. |
564 | | /// assert_eq!( |
565 | | /// Some(Match::must(0, 1..4)), |
566 | | /// re.find(&mut cache, b"\x00abc\x00"), |
567 | | /// ); |
568 | | /// // ... and \n is no longer recognized as a terminator. |
569 | | /// assert_eq!( |
570 | | /// None, |
571 | | /// re.find(&mut cache, b"\nabc\n"), |
572 | | /// ); |
573 | | /// |
574 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
575 | | /// ``` |
576 | | #[derive(Clone, Debug)] |
577 | | pub struct LookMatcher { |
578 | | lineterm: DebugByte, |
579 | | } |
580 | | |
581 | | impl LookMatcher { |
582 | | /// Creates a new default matcher for look-around assertions. |
583 | 0 | pub fn new() -> LookMatcher { |
584 | 0 | LookMatcher { lineterm: DebugByte(b'\n') } |
585 | 0 | } |
586 | | |
587 | | /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`. |
588 | | /// |
589 | | /// Namely, instead of `^` matching after `\n` and `$` matching immediately |
590 | | /// before a `\n`, this will cause it to match after and before the byte |
591 | | /// given. |
592 | | /// |
593 | | /// It can occasionally be useful to use this to configure the line |
594 | | /// terminator to the NUL byte when searching binary data. |
595 | | /// |
596 | | /// Note that this does not apply to CRLF-aware line anchors such as |
597 | | /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to |
598 | | /// use `\r` and `\n`. |
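| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch using the low level assertion routines directly: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::LookMatcher; |
| | /// |
| | /// let mut lookm = LookMatcher::new(); |
| | /// lookm.set_line_terminator(b'\x00'); |
| | /// assert_eq!(b'\x00', lookm.get_line_terminator()); |
| | /// // (?m:^) style assertions now trigger after a NUL byte... |
| | /// assert!(lookm.is_start_lf(b"\x00abc", 1)); |
| | /// // ... and no longer after a \n. |
| | /// assert!(!lookm.is_start_lf(b"\nabc", 1)); |
| | /// ``` |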
599 | 0 | pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher { |
600 | 0 | self.lineterm.0 = byte; |
601 | 0 | self |
602 | 0 | } |
603 | | |
604 | | /// Returns the line terminator that was configured for this matcher. |
605 | | /// |
606 | | /// If no line terminator was configured, then this returns `\n`. |
607 | | /// |
608 | | /// Note that the line terminator should only be used for matching `(?m:^)` |
609 | | /// and `(?m:$)` assertions. It specifically should _not_ be used for |
610 | | /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`. |
611 | 0 | pub fn get_line_terminator(&self) -> u8 { |
612 | 0 | self.lineterm.0 |
613 | 0 | } |
614 | | |
615 | | /// Returns true when the position `at` in `haystack` satisfies the given |
616 | | /// look-around assertion. |
617 | | /// |
618 | | /// # Panics |
619 | | /// |
620 | | /// This panics when testing any Unicode word boundary assertion in this |
621 | | /// set and when the Unicode word data is not available. Specifically, this |
622 | | /// only occurs when the `unicode-word-boundary` feature is not enabled. |
623 | | /// |
624 | | /// Since it's generally expected that this routine is called inside of |
625 | | /// a matching engine, callers should check the error condition when |
626 | | /// building the matching engine. If there is a Unicode word boundary |
627 | | /// in the matcher and the data isn't available, then the matcher should |
628 | | /// fail to build. |
629 | | /// |
630 | | /// Callers can check the error condition with [`LookSet::available`]. |
631 | | /// |
632 | | /// This also may panic when `at > haystack.len()`. Note that `at == |
633 | | /// haystack.len()` is legal and guaranteed not to panic. |
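| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch using only ASCII assertions, which never panic: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::{Look, LookMatcher}; |
| | /// |
| | /// let lookm = LookMatcher::new(); |
| | /// let hay = b"quick brown"; |
| | /// assert!(lookm.matches(Look::Start, hay, 0)); |
| | /// assert!(lookm.matches(Look::WordAscii, hay, 5)); |
| | /// assert!(!lookm.matches(Look::WordAscii, hay, 4)); |
| | /// assert!(lookm.matches(Look::End, hay, hay.len())); |
| | /// ``` |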
634 | | #[inline] |
635 | 0 | pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool { |
636 | 0 | self.matches_inline(look, haystack, at) |
637 | 0 | } |
638 | | |
639 | | /// Like `matches`, but forcefully inlined. |
640 | | /// |
641 | | /// # Panics |
642 | | /// |
643 | | /// This panics when testing any Unicode word boundary assertion in this |
644 | | /// set and when the Unicode word data is not available. Specifically, this |
645 | | /// only occurs when the `unicode-word-boundary` feature is not enabled. |
646 | | /// |
647 | | /// Since it's generally expected that this routine is called inside of |
648 | | /// a matching engine, callers should check the error condition when |
649 | | /// building the matching engine. If there is a Unicode word boundary |
650 | | /// in the matcher and the data isn't available, then the matcher should |
651 | | /// fail to build. |
652 | | /// |
653 | | /// Callers can check the error condition with [`LookSet::available`]. |
654 | | /// |
655 | | /// This also may panic when `at > haystack.len()`. Note that `at == |
656 | | /// haystack.len()` is legal and guaranteed not to panic. |
657 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
658 | 0 | pub(crate) fn matches_inline( |
659 | 0 | &self, |
660 | 0 | look: Look, |
661 | 0 | haystack: &[u8], |
662 | 0 | at: usize, |
663 | 0 | ) -> bool { |
664 | 0 | match look { |
665 | 0 | Look::Start => self.is_start(haystack, at), |
666 | 0 | Look::End => self.is_end(haystack, at), |
667 | 0 | Look::StartLF => self.is_start_lf(haystack, at), |
668 | 0 | Look::EndLF => self.is_end_lf(haystack, at), |
669 | 0 | Look::StartCRLF => self.is_start_crlf(haystack, at), |
670 | 0 | Look::EndCRLF => self.is_end_crlf(haystack, at), |
671 | 0 | Look::WordAscii => self.is_word_ascii(haystack, at), |
672 | 0 | Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at), |
673 | 0 | Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(), |
674 | | Look::WordUnicodeNegate => { |
675 | 0 | self.is_word_unicode_negate(haystack, at).unwrap() |
676 | | } |
677 | 0 | Look::WordStartAscii => self.is_word_start_ascii(haystack, at), |
678 | 0 | Look::WordEndAscii => self.is_word_end_ascii(haystack, at), |
679 | | Look::WordStartUnicode => { |
680 | 0 | self.is_word_start_unicode(haystack, at).unwrap() |
681 | | } |
682 | | Look::WordEndUnicode => { |
683 | 0 | self.is_word_end_unicode(haystack, at).unwrap() |
684 | | } |
685 | | Look::WordStartHalfAscii => { |
686 | 0 | self.is_word_start_half_ascii(haystack, at) |
687 | | } |
688 | | Look::WordEndHalfAscii => { |
689 | 0 | self.is_word_end_half_ascii(haystack, at) |
690 | | } |
691 | | Look::WordStartHalfUnicode => { |
692 | 0 | self.is_word_start_half_unicode(haystack, at).unwrap() |
693 | | } |
694 | | Look::WordEndHalfUnicode => { |
695 | 0 | self.is_word_end_half_unicode(haystack, at).unwrap() |
696 | | } |
697 | | } |
698 | 0 | } |
699 | | |
700 | | /// Returns true when _all_ of the assertions in the given set match at the |
701 | | /// given position in the haystack. |
702 | | /// |
703 | | /// # Panics |
704 | | /// |
705 | | /// This panics when testing any Unicode word boundary assertion in this |
706 | | /// set and when the Unicode word data is not available. Specifically, this |
707 | | /// only occurs when the `unicode-word-boundary` feature is not enabled. |
708 | | /// |
709 | | /// Since it's generally expected that this routine is called inside of |
710 | | /// a matching engine, callers should check the error condition when |
711 | | /// building the matching engine. If there is a Unicode word boundary |
712 | | /// in the matcher and the data isn't available, then the matcher should |
713 | | /// fail to build. |
714 | | /// |
715 | | /// Callers can check the error condition with [`LookSet::available`]. |
716 | | /// |
717 | | /// This also may panic when `at > haystack.len()`. Note that `at == |
718 | | /// haystack.len()` is legal and guaranteed not to panic. |
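| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch: every assertion in the set must hold at the position. |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::{Look, LookMatcher, LookSet}; |
| | /// |
| | /// let lookm = LookMatcher::new(); |
| | /// let set = LookSet::empty() |
| | ///     .insert(Look::Start) |
| | ///     .insert(Look::WordStartAscii); |
| | /// assert!(lookm.matches_set(set, b"abc", 0)); |
| | /// // Look::Start fails at offset 1, so the whole set fails. |
| | /// assert!(!lookm.matches_set(set, b"abc", 1)); |
| | /// ``` |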
719 | | #[inline] |
720 | 0 | pub fn matches_set( |
721 | 0 | &self, |
722 | 0 | set: LookSet, |
723 | 0 | haystack: &[u8], |
724 | 0 | at: usize, |
725 | 0 | ) -> bool { |
726 | 0 | self.matches_set_inline(set, haystack, at) |
727 | 0 | } |
728 | | |
729 | | /// Like `LookMatcher::matches_set`, but forcefully inlined for perf. |
730 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
731 | 0 | pub(crate) fn matches_set_inline( |
732 | 0 | &self, |
733 | 0 | set: LookSet, |
734 | 0 | haystack: &[u8], |
735 | 0 | at: usize, |
736 | 0 | ) -> bool { |
737 | 0 | // This used to use LookSet::iter with Look::matches on each element, |
738 | 0 | // but that proved to be quite disastrous for perf. The manual "if |
739 | 0 | // the set has this assertion, check it" turns out to be quite a bit |
740 | 0 | // faster. |
741 | 0 | if set.contains(Look::Start) { |
742 | 0 | if !self.is_start(haystack, at) { |
743 | 0 | return false; |
744 | 0 | } |
745 | 0 | } |
746 | 0 | if set.contains(Look::End) { |
747 | 0 | if !self.is_end(haystack, at) { |
748 | 0 | return false; |
749 | 0 | } |
750 | 0 | } |
751 | 0 | if set.contains(Look::StartLF) { |
752 | 0 | if !self.is_start_lf(haystack, at) { |
753 | 0 | return false; |
754 | 0 | } |
755 | 0 | } |
756 | 0 | if set.contains(Look::EndLF) { |
757 | 0 | if !self.is_end_lf(haystack, at) { |
758 | 0 | return false; |
759 | 0 | } |
760 | 0 | } |
761 | 0 | if set.contains(Look::StartCRLF) { |
762 | 0 | if !self.is_start_crlf(haystack, at) { |
763 | 0 | return false; |
764 | 0 | } |
765 | 0 | } |
766 | 0 | if set.contains(Look::EndCRLF) { |
767 | 0 | if !self.is_end_crlf(haystack, at) { |
768 | 0 | return false; |
769 | 0 | } |
770 | 0 | } |
771 | 0 | if set.contains(Look::WordAscii) { |
772 | 0 | if !self.is_word_ascii(haystack, at) { |
773 | 0 | return false; |
774 | 0 | } |
775 | 0 | } |
776 | 0 | if set.contains(Look::WordAsciiNegate) { |
777 | 0 | if !self.is_word_ascii_negate(haystack, at) { |
778 | 0 | return false; |
779 | 0 | } |
780 | 0 | } |
781 | 0 | if set.contains(Look::WordUnicode) { |
782 | 0 | if !self.is_word_unicode(haystack, at).unwrap() { |
783 | 0 | return false; |
784 | 0 | } |
785 | 0 | } |
786 | 0 | if set.contains(Look::WordUnicodeNegate) { |
787 | 0 | if !self.is_word_unicode_negate(haystack, at).unwrap() { |
788 | 0 | return false; |
789 | 0 | } |
790 | 0 | } |
791 | 0 | if set.contains(Look::WordStartAscii) { |
792 | 0 | if !self.is_word_start_ascii(haystack, at) { |
793 | 0 | return false; |
794 | 0 | } |
795 | 0 | } |
796 | 0 | if set.contains(Look::WordEndAscii) { |
797 | 0 | if !self.is_word_end_ascii(haystack, at) { |
798 | 0 | return false; |
799 | 0 | } |
800 | 0 | } |
801 | 0 | if set.contains(Look::WordStartUnicode) { |
802 | 0 | if !self.is_word_start_unicode(haystack, at).unwrap() { |
803 | 0 | return false; |
804 | 0 | } |
805 | 0 | } |
806 | 0 | if set.contains(Look::WordEndUnicode) { |
807 | 0 | if !self.is_word_end_unicode(haystack, at).unwrap() { |
808 | 0 | return false; |
809 | 0 | } |
810 | 0 | } |
811 | 0 | if set.contains(Look::WordStartHalfAscii) { |
812 | 0 | if !self.is_word_start_half_ascii(haystack, at) { |
813 | 0 | return false; |
814 | 0 | } |
815 | 0 | } |
816 | 0 | if set.contains(Look::WordEndHalfAscii) { |
817 | 0 | if !self.is_word_end_half_ascii(haystack, at) { |
818 | 0 | return false; |
819 | 0 | } |
820 | 0 | } |
821 | 0 | if set.contains(Look::WordStartHalfUnicode) { |
822 | 0 | if !self.is_word_start_half_unicode(haystack, at).unwrap() { |
823 | 0 | return false; |
824 | 0 | } |
825 | 0 | } |
826 | 0 | if set.contains(Look::WordEndHalfUnicode) { |
827 | 0 | if !self.is_word_end_half_unicode(haystack, at).unwrap() { |
828 | 0 | return false; |
829 | 0 | } |
830 | 0 | } |
831 | 0 | true |
832 | 0 | } |
833 | | |
834 | | /// Split up the given byte classes into equivalence classes in a way that |
835 | | /// is consistent with this look-around assertion. |
836 | | #[cfg(feature = "alloc")] |
837 | | pub(crate) fn add_to_byteset( |
838 | | &self, |
839 | | look: Look, |
840 | | set: &mut crate::util::alphabet::ByteClassSet, |
841 | | ) { |
842 | | match look { |
843 | | Look::Start | Look::End => {} |
844 | | Look::StartLF | Look::EndLF => { |
845 | | set.set_range(self.lineterm.0, self.lineterm.0); |
846 | | } |
847 | | Look::StartCRLF | Look::EndCRLF => { |
848 | | set.set_range(b'\r', b'\r'); |
849 | | set.set_range(b'\n', b'\n'); |
850 | | } |
851 | | Look::WordAscii |
852 | | | Look::WordAsciiNegate |
853 | | | Look::WordUnicode |
854 | | | Look::WordUnicodeNegate |
855 | | | Look::WordStartAscii |
856 | | | Look::WordEndAscii |
857 | | | Look::WordStartUnicode |
858 | | | Look::WordEndUnicode |
859 | | | Look::WordStartHalfAscii |
860 | | | Look::WordEndHalfAscii |
861 | | | Look::WordStartHalfUnicode |
862 | | | Look::WordEndHalfUnicode => { |
863 | | // We need to mark all ranges of bytes whose pairs result in |
864 | | // evaluating \b differently. This isn't technically correct |
865 | | // for Unicode word boundaries, but DFAs can't handle those |
866 | | // anyway, and thus, the byte classes don't need to either |
867 | | // since they are themselves only used in DFAs. |
868 | | // |
869 | | // FIXME: It seems like the calls to 'set_range' here are |
870 | | // completely invariant, which means we could just hard-code |
871 | | // them here without needing to write a loop. And we only need |
872 | | // to do this dance at most once per regex. |
873 | | // |
874 | | // FIXME: Is this correct for \B? |
875 | | let iswb = utf8::is_word_byte; |
876 | | // This unwrap is OK because we guard every use of 'asu8' with |
877 | | // a check that the input is <= 255. |
878 | | let asu8 = |b: u16| u8::try_from(b).unwrap(); |
879 | | let mut b1: u16 = 0; |
880 | | let mut b2: u16; |
881 | | while b1 <= 255 { |
882 | | b2 = b1 + 1; |
883 | | while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) { |
884 | | b2 += 1; |
885 | | } |
886 | | // The guards above guarantee that b2 can never get any |
887 | | // bigger. |
888 | | assert!(b2 <= 256); |
889 | | // Subtracting 1 from b2 is always OK because it is always |
890 | | // at least 1 greater than b1, and the assert above |
891 | | // guarantees that the asu8 conversion will succeed. |
892 | | set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap())); |
893 | | b1 = b2; |
894 | | } |
895 | | } |
896 | | } |
897 | | } |
898 | | |
899 | | /// Returns true when [`Look::Start`] is satisfied `at` the given position |
900 | | /// in `haystack`. |
901 | | /// |
902 | | /// # Panics |
903 | | /// |
904 | | /// This may panic when `at > haystack.len()`. Note that `at == |
905 | | /// haystack.len()` is legal and guaranteed not to panic. |
906 | | #[inline] |
907 | 0 | pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool { |
908 | 0 | at == 0 |
909 | 0 | } |
910 | | |
911 | | /// Returns true when [`Look::End`] is satisfied `at` the given position in |
912 | | /// `haystack`. |
913 | | /// |
914 | | /// # Panics |
915 | | /// |
916 | | /// This may panic when `at > haystack.len()`. Note that `at == |
917 | | /// haystack.len()` is legal and guaranteed not to panic. |
918 | | #[inline] |
919 | 0 | pub fn is_end(&self, haystack: &[u8], at: usize) -> bool { |
920 | 0 | at == haystack.len() |
921 | 0 | } |
922 | | |
923 | | /// Returns true when [`Look::StartLF`] is satisfied `at` the given |
924 | | /// position in `haystack`. |
925 | | /// |
926 | | /// # Panics |
927 | | /// |
928 | | /// This may panic when `at > haystack.len()`. Note that `at == |
929 | | /// haystack.len()` is legal and guaranteed not to panic. |
930 | | #[inline] |
931 | 0 | pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool { |
932 | 0 | self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0 |
933 | 0 | } |
934 | | |
935 | | /// Returns true when [`Look::EndLF`] is satisfied `at` the given position |
936 | | /// in `haystack`. |
937 | | /// |
938 | | /// # Panics |
939 | | /// |
940 | | /// This may panic when `at > haystack.len()`. Note that `at == |
941 | | /// haystack.len()` is legal and guaranteed not to panic. |
942 | | #[inline] |
943 | 0 | pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool { |
944 | 0 | self.is_end(haystack, at) || haystack[at] == self.lineterm.0 |
945 | 0 | } |
946 | | |
947 | | /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given |
948 | | /// position in `haystack`. |
949 | | /// |
950 | | /// # Panics |
951 | | /// |
952 | | /// This may panic when `at > haystack.len()`. Note that `at == |
953 | | /// haystack.len()` is legal and guaranteed not to panic. |
954 | | #[inline] |
955 | 0 | pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool { |
956 | 0 | self.is_start(haystack, at) |
957 | 0 | || haystack[at - 1] == b'\n' |
958 | 0 | || (haystack[at - 1] == b'\r' |
959 | 0 | && (at >= haystack.len() || haystack[at] != b'\n')) |
960 | 0 | } |
961 | | |
962 | | /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given |
963 | | /// position in `haystack`. |
964 | | /// |
965 | | /// # Panics |
966 | | /// |
967 | | /// This may panic when `at > haystack.len()`. Note that `at == |
968 | | /// haystack.len()` is legal and guaranteed not to panic. |
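| | /// |
| | /// # Example |
| | /// |
| | /// A small sketch of how a `\r\n` pair is treated as a single terminator: |
| | /// |
| | /// ``` |
| | /// use regex_automata::util::look::LookMatcher; |
| | /// |
| | /// let lookm = LookMatcher::new(); |
| | /// let hay = b"a\r\nb"; |
| | /// // The line ends just before the \r... |
| | /// assert!(lookm.is_end_crlf(hay, 1)); |
| | /// // ... but not between the \r and the \n. |
| | /// assert!(!lookm.is_end_crlf(hay, 2)); |
| | /// // The next line starts just after the \n. |
| | /// assert!(lookm.is_start_crlf(hay, 3)); |
| | /// ``` |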
969 | | #[inline] |
970 | 0 | pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool { |
971 | 0 | self.is_end(haystack, at) |
972 | 0 | || haystack[at] == b'\r' |
973 | 0 | || (haystack[at] == b'\n' |
974 | 0 | && (at == 0 || haystack[at - 1] != b'\r')) |
975 | 0 | } |
976 | | |
977 | | /// Returns true when [`Look::WordAscii`] is satisfied `at` the given |
978 | | /// position in `haystack`. |
979 | | /// |
980 | | /// # Panics |
981 | | /// |
982 | | /// This may panic when `at > haystack.len()`. Note that `at == |
983 | | /// haystack.len()` is legal and guaranteed not to panic. |
984 | | #[inline] |
985 | 0 | pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool { |
986 | 0 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
987 | 0 | let word_after = |
988 | 0 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
989 | 0 | word_before != word_after |
990 | 0 | } |
991 | | |
992 | | /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given |
993 | | /// position in `haystack`. |
994 | | /// |
995 | | /// # Panics |
996 | | /// |
997 | | /// This may panic when `at > haystack.len()`. Note that `at == |
998 | | /// haystack.len()` is legal and guaranteed not to panic. |
999 | | #[inline] |
1000 | 0 | pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool { |
1001 | 0 | !self.is_word_ascii(haystack, at) |
1002 | 0 | } |
1003 | | |
1004 | | /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given |
1005 | | /// position in `haystack`. |
1006 | | /// |
1007 | | /// # Panics |
1008 | | /// |
1009 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1010 | | /// haystack.len()` is legal and guaranteed not to panic. |
1011 | | /// |
1012 | | /// # Errors |
1013 | | /// |
1014 | | /// This returns an error when Unicode word boundary tables |
1015 | | /// are not available. Specifically, this only occurs when the |
1016 | | /// `unicode-word-boundary` feature is not enabled. |
1017 | | #[inline] |
1018 | 0 | pub fn is_word_unicode( |
1019 | 0 | &self, |
1020 | 0 | haystack: &[u8], |
1021 | 0 | at: usize, |
1022 | 0 | ) -> Result<bool, UnicodeWordBoundaryError> { |
1023 | 0 | let word_before = is_word_char::rev(haystack, at)?; |
1024 | 0 | let word_after = is_word_char::fwd(haystack, at)?; |
1025 | 0 | Ok(word_before != word_after) |
1026 | 0 | } |
1027 | | |
1028 | | /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the |
1029 | | /// given position in `haystack`. |
1030 | | /// |
1031 | | /// # Panics |
1032 | | /// |
1033 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1034 | | /// haystack.len()` is legal and guaranteed not to panic. |
1035 | | /// |
1036 | | /// # Errors |
1037 | | /// |
1038 | | /// This returns an error when Unicode word boundary tables |
1039 | | /// are not available. Specifically, this only occurs when the |
1040 | | /// `unicode-word-boundary` feature is not enabled. |
1041 | | #[inline] |
1042 | 0 | pub fn is_word_unicode_negate( |
1043 | 0 | &self, |
1044 | 0 | haystack: &[u8], |
1045 | 0 | at: usize, |
1046 | 0 | ) -> Result<bool, UnicodeWordBoundaryError> { |
1047 | | // This is pretty subtle. Why do we need to do UTF-8 decoding here? |
1048 | | // Well... at time of writing, the is_word_char_{fwd,rev} routines will |
1049 | | // only return true if there is a valid UTF-8 encoding of a "word" |
1050 | | // codepoint, and false in every other case (including invalid UTF-8). |
1051 | | // This means that in regions of invalid UTF-8 (which might sit inside |
1052 | | // otherwise valid UTF-8!), it would result in \B matching. While this |
1053 | | // would be questionable in the context of truly invalid UTF-8, it is |
1054 | | // *certainly* wrong to report match boundaries that split the encoding |
1055 | | // of a codepoint. So to work around this, we ensure that we can decode |
1056 | | // a codepoint on either side of `at`. If either direction fails, then |
1057 | | // we don't permit \B to match at all. |
1058 | | // |
1059 | | // Now, this isn't exactly optimal from a perf perspective. We could |
1060 | | // try and detect this in is_word_char::{fwd,rev}, but it's not clear |
1061 | | // if it's worth it. \B is, after all, rarely used. Even worse, |
1062 | | // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this |
1063 | | // will wind up doing UTF-8 decoding twice. Owch. We could fix this |
1064 | | // with more code complexity, but it just doesn't feel worth it for \B. |
1065 | | // |
1066 | | // And in particular, we do *not* have to do this with \b, because \b |
1067 | | // *requires* that at least one side of `at` be a "word" codepoint, |
1068 | | // which in turn implies one side of `at` must be valid UTF-8. This in |
1069 | | // turn implies that \b can never split a valid UTF-8 encoding of a |
1070 | | // codepoint. In the case where one side of `at` is truly invalid UTF-8 |
1071 | | // and the other side IS a word codepoint, then we want \b to match |
1072 | | // since it represents a valid UTF-8 boundary. It also makes sense. For |
1073 | | // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. |
1074 | | // |
1075 | | // Note also that this is not just '!is_word_unicode(..)' like it is |
1076 | | // for the ASCII case. For example, neither \b nor \B is satisfied |
1077 | | // within invalid UTF-8 sequences. |
1078 | 0 | let word_before = at > 0 |
1079 | 0 | && match utf8::decode_last(&haystack[..at]) { |
1080 | 0 | None | Some(Err(_)) => return Ok(false), |
1081 | 0 | Some(Ok(_)) => is_word_char::rev(haystack, at)?, |
1082 | | }; |
1083 | 0 | let word_after = at < haystack.len() |
1084 | 0 | && match utf8::decode(&haystack[at..]) { |
1085 | 0 | None | Some(Err(_)) => return Ok(false), |
1086 | 0 | Some(Ok(_)) => is_word_char::fwd(haystack, at)?, |
1087 | | }; |
1088 | 0 | Ok(word_before == word_after) |
1089 | 0 | } |
1090 | | |
1091 | | /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given |
1092 | | /// position in `haystack`. |
1093 | | /// |
1094 | | /// # Panics |
1095 | | /// |
1096 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1097 | | /// haystack.len()` is legal and guaranteed not to panic. |
1098 | | #[inline] |
1099 | 0 | pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { |
1100 | 0 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
1101 | 0 | let word_after = |
1102 | 0 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
1103 | 0 | !word_before && word_after |
1104 | 0 | } |
1105 | | |
1106 | | /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given |
1107 | | /// position in `haystack`. |
1108 | | /// |
1109 | | /// # Panics |
1110 | | /// |
1111 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1112 | | /// haystack.len()` is legal and guaranteed not to panic. |
1113 | | #[inline] |
1114 | 0 | pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool { |
1115 | 0 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
1116 | 0 | let word_after = |
1117 | 0 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
1118 | 0 | word_before && !word_after |
1119 | 0 | } |
1120 | | |
1121 | | /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the |
1122 | | /// given position in `haystack`. |
1123 | | /// |
1124 | | /// # Panics |
1125 | | /// |
1126 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1127 | | /// haystack.len()` is legal and guaranteed not to panic. |
1128 | | /// |
1129 | | /// # Errors |
1130 | | /// |
1131 | | /// This returns an error when Unicode word boundary tables |
1132 | | /// are not available. Specifically, this only occurs when the |
1133 | | /// `unicode-word-boundary` feature is not enabled. |
1134 | | #[inline] |
1135 | 0 | pub fn is_word_start_unicode( |
1136 | 0 | &self, |
1137 | 0 | haystack: &[u8], |
1138 | 0 | at: usize, |
1139 | 0 | ) -> Result<bool, UnicodeWordBoundaryError> { |
1140 | 0 | let word_before = is_word_char::rev(haystack, at)?; |
1141 | 0 | let word_after = is_word_char::fwd(haystack, at)?; |
1142 | 0 | Ok(!word_before && word_after) |
1143 | 0 | } |
1144 | | |
1145 | | /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the |
1146 | | /// given position in `haystack`. |
1147 | | /// |
1148 | | /// # Panics |
1149 | | /// |
1150 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1151 | | /// haystack.len()` is legal and guaranteed not to panic. |
1152 | | /// |
1153 | | /// # Errors |
1154 | | /// |
1155 | | /// This returns an error when Unicode word boundary tables |
1156 | | /// are not available. Specifically, this only occurs when the |
1157 | | /// `unicode-word-boundary` feature is not enabled. |
1158 | | #[inline] |
1159 | 0 | pub fn is_word_end_unicode( |
1160 | 0 | &self, |
1161 | 0 | haystack: &[u8], |
1162 | 0 | at: usize, |
1163 | 0 | ) -> Result<bool, UnicodeWordBoundaryError> { |
1164 | 0 | let word_before = is_word_char::rev(haystack, at)?; |
1165 | 0 | let word_after = is_word_char::fwd(haystack, at)?; |
1166 | 0 | Ok(word_before && !word_after) |
1167 | 0 | } |
1168 | | |
1169 | | /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the |
1170 | | /// given position in `haystack`. |
1171 | | /// |
1172 | | /// # Panics |
1173 | | /// |
1174 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1175 | | /// haystack.len()` is legal and guaranteed not to panic. |
1176 | | #[inline] |
1177 | 0 | pub fn is_word_start_half_ascii( |
1178 | 0 | &self, |
1179 | 0 | haystack: &[u8], |
1180 | 0 | at: usize, |
1181 | 0 | ) -> bool { |
1182 | 0 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
1183 | 0 | !word_before |
1184 | 0 | } |
1185 | | |
1186 | | /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the |
1187 | | /// given position in `haystack`. |
1188 | | /// |
1189 | | /// # Panics |
1190 | | /// |
1191 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1192 | | /// haystack.len()` is legal and guaranteed not to panic. |
1193 | | #[inline] |
1194 | 0 | pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool { |
1195 | 0 | let word_after = |
1196 | 0 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
1197 | 0 | !word_after |
1198 | 0 | } |
1199 | | |
1200 | | /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the |
1201 | | /// given position in `haystack`. |
1202 | | /// |
1203 | | /// # Panics |
1204 | | /// |
1205 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1206 | | /// haystack.len()` is legal and guaranteed not to panic. |
1207 | | /// |
1208 | | /// # Errors |
1209 | | /// |
1210 | | /// This returns an error when Unicode word boundary tables |
1211 | | /// are not available. Specifically, this only occurs when the |
1212 | | /// `unicode-word-boundary` feature is not enabled. |
1213 | | #[inline] |
1214 | 0 | pub fn is_word_start_half_unicode( |
1215 | 0 | &self, |
1216 | 0 | haystack: &[u8], |
1217 | 0 | at: usize, |
1218 | 0 | ) -> Result<bool, UnicodeWordBoundaryError> { |
1219 | | // See `is_word_unicode_negate` for why we need to do this. We don't |
1220 | | // need to do it for `is_word_start_unicode` because that guarantees |
1221 | | // that the position matched falls on a valid UTF-8 boundary given |
1222 | | // that the right side must be in \w. |
1223 | 0 | let word_before = at > 0 |
1224 | 0 | && match utf8::decode_last(&haystack[..at]) { |
1225 | 0 | None | Some(Err(_)) => return Ok(false), |
1226 | 0 | Some(Ok(_)) => is_word_char::rev(haystack, at)?, |
1227 | | }; |
1228 | 0 | Ok(!word_before) |
1229 | 0 | } |
1230 | | |
1231 | | /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the |
1232 | | /// given position in `haystack`. |
1233 | | /// |
1234 | | /// # Panics |
1235 | | /// |
1236 | | /// This may panic when `at > haystack.len()`. Note that `at == |
1237 | | /// haystack.len()` is legal and guaranteed not to panic. |
1238 | | /// |
1239 | | /// # Errors |
1240 | | /// |
1241 | | /// This returns an error when Unicode word boundary tables |
1242 | | /// are not available. Specifically, this only occurs when the |
1243 | | /// `unicode-word-boundary` feature is not enabled. |
1244 | | #[inline] |
1245 | 0 | pub fn is_word_end_half_unicode( |
1246 | 0 | &self, |
1247 | 0 | haystack: &[u8], |
1248 | 0 | at: usize, |
1249 | 0 | ) -> Result<bool, UnicodeWordBoundaryError> { |
1250 | | // See `is_word_unicode_negate` for why we need to do this. We don't |
1251 | | // need to do it for `is_word_end_unicode` because that guarantees |
1252 | | // that the position matched falls on a valid UTF-8 boundary given |
1253 | | // that the left side must be in \w. |
1254 | 0 | let word_after = at < haystack.len() |
1255 | 0 | && match utf8::decode(&haystack[at..]) { |
1256 | 0 | None | Some(Err(_)) => return Ok(false), |
1257 | 0 | Some(Ok(_)) => is_word_char::fwd(haystack, at)?, |
1258 | | }; |
1259 | 0 | Ok(!word_after) |
1260 | 0 | } |
1261 | | } |
1262 | | |
1263 | | impl Default for LookMatcher { |
1264 | 0 | fn default() -> LookMatcher { |
1265 | 0 | LookMatcher::new() |
1266 | 0 | } |
1267 | | } |
1268 | | |
1269 | | /// An error that occurs when the Unicode-aware `\w` class is unavailable. |
1270 | | /// |
1271 | | /// This error can occur when the data tables necessary for the Unicode aware |
1272 | | /// Perl character class `\w` are unavailable. The `\w` class is used to |
1273 | | /// determine whether a codepoint is considered a word character or not when |
1274 | | /// determining whether a Unicode aware `\b` (or `\B`) matches at a particular |
1275 | | /// position. |
1276 | | /// |
1277 | | /// This error can only occur when the `unicode-word-boundary` feature is |
1278 | | /// disabled. |
1279 | | #[derive(Clone, Debug)] |
1280 | | pub struct UnicodeWordBoundaryError(()); |
1281 | | |
1282 | | impl UnicodeWordBoundaryError { |
1283 | | #[cfg(not(feature = "unicode-word-boundary"))] |
1284 | 0 | pub(crate) fn new() -> UnicodeWordBoundaryError { |
1285 | 0 | UnicodeWordBoundaryError(()) |
1286 | 0 | } |
1287 | | |
1288 | | /// Returns an error if and only if Unicode word boundary data is |
1289 | | /// unavailable. |
1290 | 0 | pub fn check() -> Result<(), UnicodeWordBoundaryError> { |
1291 | 0 | is_word_char::check() |
1292 | 0 | } |
1293 | | } |
1294 | | |
1295 | | #[cfg(feature = "std")] |
1296 | | impl std::error::Error for UnicodeWordBoundaryError {} |
1297 | | |
1298 | | impl core::fmt::Display for UnicodeWordBoundaryError { |
1299 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
1300 | 0 | write!( |
1301 | 0 | f, |
1302 | 0 | "Unicode-aware \\b and \\B are unavailable because the \ |
1303 | 0 | requisite data tables are missing, please enable the \ |
1304 | 0 | unicode-word-boundary feature" |
1305 | 0 | ) |
1306 | 0 | } |
1307 | | } |
1308 | | |
1309 | | // Below are FOUR different ways for checking whether a "word"
1310 | | // codepoint exists at a particular position in the haystack. The four |
1311 | | // different approaches are, in order of preference: |
1312 | | // |
1313 | | // 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the |
1314 | | // first call, and then use that DFA for all subsequent calls. |
1315 | | // 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available. |
1316 | | // 3. Do UTF-8 decoding and use our own 'perl_word' table. |
1317 | | // 4. Return an error. |
1318 | | // |
1319 | | // The reason for all of these approaches is a combination of perf and |
1320 | | // permitting one to build regex-automata without the Unicode data necessary |
1321 | | // for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would |
1322 | | // still work.) |
1323 | | // |
1324 | | // The DFA approach is the fastest, but it requires the regex parser, the |
1325 | | // NFA compiler, the DFA builder and the DFA search runtime. That's a lot to |
1326 | | // bring in, but if it's available, it's (probably) the best we can do. |
1327 | | // |
1328 | | // Approaches (2) and (3) are effectively equivalent, but (2) reuses the |
1329 | | // data in regex-syntax and avoids duplicating it in regex-automata. |
1330 | | // |
1331 | | // Finally, (4) unconditionally returns an error since the requisite data isn't |
1332 | | // available anywhere. |
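 | | //
 | | // Whichever strategy gets compiled in, every `is_word_char` module below
 | | // exposes the same minimal interface (sketched here; the modules themselves
 | | // spell the error type as `super::UnicodeWordBoundaryError`):
 | | //
 | | //     pub(super) fn check() -> Result<(), UnicodeWordBoundaryError>
 | | //     pub(super) fn fwd(haystack: &[u8], at: usize)
 | | //         -> Result<bool, UnicodeWordBoundaryError>
 | | //     pub(super) fn rev(haystack: &[u8], at: usize)
 | | //         -> Result<bool, UnicodeWordBoundaryError>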
1333 | | // |
1334 | | // There are actually more approaches possible that we didn't implement. For |
1335 | | // example, if the DFA builder is available but the syntax parser is not, we |
1336 | | // could technically hand construct our own NFA from the 'perl_word' data |
1337 | | // table. But to avoid some pretty hairy code duplication, we would in turn |
1338 | | // need to pull the UTF-8 compiler out of the NFA compiler. Yikes. |
1339 | | // |
1340 | | // A possibly more sensible alternative is to use a lazy DFA when the full |
1341 | | // DFA builder isn't available... |
1342 | | // |
1343 | | // Yet another choice would be to build the full DFA and then embed it into the |
1344 | | // source. Then we'd only need to bring in the DFA search runtime, which is |
1345 | | // considerably smaller than the DFA builder code. The problem here is that the |
1346 | | // Debian people have spooked me[1] into avoiding cyclic dependencies. Namely, |
1347 | | // we'd need to build regex-cli, which depends on regex-automata in order to |
1348 | | // build some part of regex-automata. But to be honest, something like this has |
1349 | | // to be allowed somehow? I just don't know what the right process is. |
1350 | | // |
1351 | | // There are perhaps other choices as well. Why did I stop at these 4? Because |
1352 | | // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA |
1353 | | // approach eventually, as the benefits of the DFA approach are somewhat |
1354 | | // compelling. The 'boundary-words-holmes' benchmark tests this. (Note that |
1355 | | // the commands below no longer work. If necessary, we should recreate
1356 | | // the benchmark from whole cloth in rebar.) |
1357 | | // |
1358 | | // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv |
1359 | | // |
1360 | | // Then I changed the code below so that the util/unicode_data/perl_word table |
1361 | | // was used and re-ran the benchmark: |
1362 | | // |
1363 | | // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv |
1364 | | // |
1365 | | // And compared them: |
1366 | | // |
1367 | | // $ regex-cli bench diff dfa.csv table.csv |
1368 | | // benchmark engine dfa table |
1369 | | // --------- ------ --- ----- |
1370 | | // internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s |
1371 | | // |
1372 | | // Which is a nice improvement. |
1373 | | // |
1374 | | // UPDATE: It turns out that it takes approximately 22ms to build the reverse |
1375 | | // DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in |
1376 | | // the grand scheme of things, but that is a significant latency cost. So I'm not
1377 | | // sure that's a good idea. I then tried using a lazy DFA instead, and that |
1378 | | // eliminated the overhead, but since the lazy DFA requires mutable working |
1379 | | // memory, that requires introducing a 'Cache' for every simultaneous call. |
1380 | | // |
1381 | | // I ended up deciding for now to just keep the "UTF-8 decode and check the |
1382 | | // table." The DFA and lazy DFA approaches are still below, but commented out. |
1383 | | // |
1384 | | // [1]: https://github.com/BurntSushi/ucd-generate/issues/11 |
1385 | | |
1386 | | /* |
1387 | | /// A module that looks for word codepoints using lazy DFAs. |
1388 | | #[cfg(all( |
1389 | | feature = "unicode-word-boundary", |
1390 | | feature = "syntax", |
1391 | | feature = "unicode-perl", |
1392 | | feature = "hybrid" |
1393 | | ))] |
1394 | | mod is_word_char { |
1395 | | use alloc::vec::Vec; |
1396 | | |
1397 | | use crate::{ |
1398 | | hybrid::dfa::{Cache, DFA}, |
1399 | | nfa::thompson::NFA, |
1400 | | util::{lazy::Lazy, pool::Pool, primitives::StateID}, |
1401 | | Anchored, Input, |
1402 | | }; |
1403 | | |
1404 | | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
1405 | | Ok(()) |
1406 | | } |
1407 | | |
1408 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1409 | | pub(super) fn fwd( |
1410 | | haystack: &[u8], |
1411 | | mut at: usize, |
1412 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1413 | | static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap()); |
1414 | | static CACHE: Lazy<Pool<Cache>> = |
1415 | | Lazy::new(|| Pool::new(|| WORD.create_cache())); |
1416 | | let dfa = Lazy::get(&WORD); |
1417 | | let mut cache = Lazy::get(&CACHE).get(); |
1418 | | let mut sid = dfa |
1419 | | .start_state_forward( |
1420 | | &mut cache, |
1421 | | &Input::new("").anchored(Anchored::Yes), |
1422 | | ) |
1423 | | .unwrap(); |
1424 | | while at < haystack.len() { |
1425 | | let byte = haystack[at]; |
1426 | | sid = dfa.next_state(&mut cache, sid, byte).unwrap(); |
1427 | | at += 1; |
1428 | | if sid.is_tagged() { |
1429 | | if sid.is_match() { |
1430 | | return Ok(true); |
1431 | | } else if sid.is_dead() { |
1432 | | return Ok(false); |
1433 | | } |
1434 | | } |
1435 | | } |
1436 | | Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) |
1437 | | } |
1438 | | |
1439 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1440 | | pub(super) fn rev( |
1441 | | haystack: &[u8], |
1442 | | mut at: usize, |
1443 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1444 | | static WORD: Lazy<DFA> = Lazy::new(|| { |
1445 | | DFA::builder() |
1446 | | .thompson(NFA::config().reverse(true)) |
1447 | | .build(r"\w") |
1448 | | .unwrap() |
1449 | | }); |
1450 | | static CACHE: Lazy<Pool<Cache>> = |
1451 | | Lazy::new(|| Pool::new(|| WORD.create_cache())); |
1452 | | let dfa = Lazy::get(&WORD); |
1453 | | let mut cache = Lazy::get(&CACHE).get(); |
1454 | | let mut sid = dfa |
1455 | | .start_state_reverse( |
1456 | | &mut cache, |
1457 | | &Input::new("").anchored(Anchored::Yes), |
1458 | | ) |
1459 | | .unwrap(); |
1460 | | while at > 0 { |
1461 | | at -= 1; |
1462 | | let byte = haystack[at]; |
1463 | | sid = dfa.next_state(&mut cache, sid, byte).unwrap(); |
1464 | | if sid.is_tagged() { |
1465 | | if sid.is_match() { |
1466 | | return Ok(true); |
1467 | | } else if sid.is_dead() { |
1468 | | return Ok(false); |
1469 | | } |
1470 | | } |
1471 | | } |
1472 | | Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) |
1473 | | } |
1474 | | } |
1475 | | */ |
1476 | | |
1477 | | /* |
1478 | | /// A module that looks for word codepoints using fully compiled DFAs. |
1479 | | #[cfg(all( |
1480 | | feature = "unicode-word-boundary", |
1481 | | feature = "syntax", |
1482 | | feature = "unicode-perl", |
1483 | | feature = "dfa-build" |
1484 | | ))] |
1485 | | mod is_word_char { |
1486 | | use alloc::vec::Vec; |
1487 | | |
1488 | | use crate::{ |
1489 | | dfa::{dense::DFA, Automaton, StartKind}, |
1490 | | nfa::thompson::NFA, |
1491 | | util::{lazy::Lazy, primitives::StateID}, |
1492 | | Anchored, Input, |
1493 | | }; |
1494 | | |
1495 | | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
1496 | | Ok(()) |
1497 | | } |
1498 | | |
1499 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1500 | | pub(super) fn fwd( |
1501 | | haystack: &[u8], |
1502 | | mut at: usize, |
1503 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1504 | | static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { |
1505 | | let dfa = DFA::builder() |
1506 | | .configure(DFA::config().start_kind(StartKind::Anchored)) |
1507 | | .build(r"\w") |
1508 | | .unwrap(); |
1509 | | // OK because our regex has no look-around. |
1510 | | let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); |
1511 | | (dfa, start_id) |
1512 | | }); |
1513 | | let &(ref dfa, mut sid) = Lazy::get(&WORD); |
1514 | | while at < haystack.len() { |
1515 | | let byte = haystack[at]; |
1516 | | sid = dfa.next_state(sid, byte); |
1517 | | at += 1; |
1518 | | if dfa.is_special_state(sid) { |
1519 | | if dfa.is_match_state(sid) { |
1520 | | return Ok(true); |
1521 | | } else if dfa.is_dead_state(sid) { |
1522 | | return Ok(false); |
1523 | | } |
1524 | | } |
1525 | | } |
1526 | | Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) |
1527 | | } |
1528 | | |
1529 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1530 | | pub(super) fn rev( |
1531 | | haystack: &[u8], |
1532 | | mut at: usize, |
1533 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1534 | | static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { |
1535 | | let dfa = DFA::builder() |
1536 | | .configure(DFA::config().start_kind(StartKind::Anchored)) |
1537 | | // From ad hoc measurements, it looks like setting |
1538 | | // shrink==false is slightly faster than shrink==true. I kind |
1539 | | // of feel like this indicates that shrinking is probably a |
1540 | | // failure, although it can help in some cases. Sigh. |
1541 | | .thompson(NFA::config().reverse(true).shrink(false)) |
1542 | | .build(r"\w") |
1543 | | .unwrap(); |
1544 | | // OK because our regex has no look-around. |
1545 | | let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); |
1546 | | (dfa, start_id) |
1547 | | }); |
1548 | | let &(ref dfa, mut sid) = Lazy::get(&WORD); |
1549 | | while at > 0 { |
1550 | | at -= 1; |
1551 | | let byte = haystack[at]; |
1552 | | sid = dfa.next_state(sid, byte); |
1553 | | if dfa.is_special_state(sid) { |
1554 | | if dfa.is_match_state(sid) { |
1555 | | return Ok(true); |
1556 | | } else if dfa.is_dead_state(sid) { |
1557 | | return Ok(false); |
1558 | | } |
1559 | | } |
1560 | | } |
1561 | | Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) |
1562 | | } |
1563 | | } |
1564 | | */ |
1565 | | |
1566 | | /// A module that looks for word codepoints using regex-syntax's data tables. |
1567 | | #[cfg(all( |
1568 | | feature = "unicode-word-boundary", |
1569 | | feature = "syntax", |
1570 | | feature = "unicode-perl", |
1571 | | ))] |
1572 | | mod is_word_char { |
1573 | | use regex_syntax::try_is_word_character; |
1574 | | |
1575 | | use crate::util::utf8; |
1576 | | |
1577 | | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
1578 | | Ok(()) |
1579 | | } |
1580 | | |
1581 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1582 | | pub(super) fn fwd( |
1583 | | haystack: &[u8], |
1584 | | at: usize, |
1585 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1586 | | Ok(match utf8::decode(&haystack[at..]) { |
1587 | | None | Some(Err(_)) => false, |
1588 | | Some(Ok(ch)) => try_is_word_character(ch).expect( |
1589 | | "since unicode-word-boundary, syntax and unicode-perl \ |
1590 | | are all enabled, it is expected that \ |
1591 | | try_is_word_character succeeds", |
1592 | | ), |
1593 | | }) |
1594 | | } |
1595 | | |
1596 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1597 | | pub(super) fn rev( |
1598 | | haystack: &[u8], |
1599 | | at: usize, |
1600 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1601 | | Ok(match utf8::decode_last(&haystack[..at]) { |
1602 | | None | Some(Err(_)) => false, |
1603 | | Some(Ok(ch)) => try_is_word_character(ch).expect( |
1604 | | "since unicode-word-boundary, syntax and unicode-perl \ |
1605 | | are all enabled, it is expected that \ |
1606 | | try_is_word_character succeeds", |
1607 | | ), |
1608 | | }) |
1609 | | } |
1610 | | } |
1611 | | |
1612 | | /// A module that looks for word codepoints using regex-automata's data tables |
1613 | | /// (which are only compiled when regex-syntax's tables aren't available). |
1614 | | /// |
1615 | | /// Note that the cfg should match the one in src/util/unicode_data/mod.rs for |
1616 | | /// perl_word. |
1617 | | #[cfg(all( |
1618 | | feature = "unicode-word-boundary", |
1619 | | not(all(feature = "syntax", feature = "unicode-perl")), |
1620 | | ))] |
1621 | | mod is_word_char { |
1622 | | use crate::util::utf8; |
1623 | | |
1624 | | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
1625 | | Ok(()) |
1626 | | } |
1627 | | |
1628 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1629 | | pub(super) fn fwd( |
1630 | | haystack: &[u8], |
1631 | | at: usize, |
1632 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1633 | | Ok(match utf8::decode(&haystack[at..]) { |
1634 | | None | Some(Err(_)) => false, |
1635 | | Some(Ok(ch)) => is_word_character(ch), |
1636 | | }) |
1637 | | } |
1638 | | |
1639 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1640 | | pub(super) fn rev( |
1641 | | haystack: &[u8], |
1642 | | at: usize, |
1643 | | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1644 | | Ok(match utf8::decode_last(&haystack[..at]) { |
1645 | | None | Some(Err(_)) => false, |
1646 | | Some(Ok(ch)) => is_word_character(ch), |
1647 | | }) |
1648 | | } |
1649 | | |
1650 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1651 | | fn is_word_character(c: char) -> bool { |
1652 | | use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; |
1653 | | |
1654 | | if u8::try_from(c).map_or(false, utf8::is_word_byte) { |
1655 | | return true; |
1656 | | } |
1657 | | PERL_WORD |
1658 | | .binary_search_by(|&(start, end)| { |
1659 | | use core::cmp::Ordering; |
1660 | | |
1661 | | if start <= c && c <= end { |
1662 | | Ordering::Equal |
1663 | | } else if start > c { |
1664 | | Ordering::Greater |
1665 | | } else { |
1666 | | Ordering::Less |
1667 | | } |
1668 | | }) |
1669 | | .is_ok() |
1670 | | } |
1671 | | } |
1672 | | |
1673 | | /// A module that always returns an error, used when the
1674 | | /// `unicode-word-boundary` feature is disabled. In that case, regex-automata
1675 | | /// will not include its own data tables even if regex-syntax is disabled.
1676 | | #[cfg(not(feature = "unicode-word-boundary"))] |
1677 | | mod is_word_char { |
1678 | 0 | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
1679 | 0 | Err(super::UnicodeWordBoundaryError::new()) |
1680 | 0 | } |
1681 | | |
1682 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1683 | 0 | pub(super) fn fwd( |
1684 | 0 | _bytes: &[u8], |
1685 | 0 | _at: usize, |
1686 | 0 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1687 | 0 | Err(super::UnicodeWordBoundaryError::new()) |
1688 | 0 | } |
1689 | | |
1690 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1691 | 0 | pub(super) fn rev( |
1692 | 0 | _bytes: &[u8], |
1693 | 0 | _at: usize, |
1694 | 0 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
1695 | 0 | Err(super::UnicodeWordBoundaryError::new()) |
1696 | 0 | } |
1697 | | } |
1698 | | |
1699 | | #[cfg(test)] |
1700 | | mod tests { |
1701 | | use super::*; |
1702 | | |
1703 | | macro_rules! testlook { |
1704 | | ($look:expr, $haystack:expr, $at:expr) => { |
1705 | | LookMatcher::default().matches($look, $haystack.as_bytes(), $at) |
1706 | | }; |
1707 | | } |
1708 | | |
1709 | | #[test] |
1710 | | fn look_matches_start_line() { |
1711 | | let look = Look::StartLF; |
1712 | | |
1713 | | assert!(testlook!(look, "", 0)); |
1714 | | assert!(testlook!(look, "\n", 0)); |
1715 | | assert!(testlook!(look, "\n", 1)); |
1716 | | assert!(testlook!(look, "a", 0)); |
1717 | | assert!(testlook!(look, "\na", 1)); |
1718 | | |
1719 | | assert!(!testlook!(look, "a", 1)); |
1720 | | assert!(!testlook!(look, "a\na", 1)); |
1721 | | } |
1722 | | |
1723 | | #[test] |
1724 | | fn look_matches_end_line() { |
1725 | | let look = Look::EndLF; |
1726 | | |
1727 | | assert!(testlook!(look, "", 0)); |
1728 | | assert!(testlook!(look, "\n", 1)); |
1729 | | assert!(testlook!(look, "\na", 0)); |
1730 | | assert!(testlook!(look, "\na", 2)); |
1731 | | assert!(testlook!(look, "a\na", 1)); |
1732 | | |
1733 | | assert!(!testlook!(look, "a", 0)); |
1734 | | assert!(!testlook!(look, "\na", 1)); |
1735 | | assert!(!testlook!(look, "a\na", 0)); |
1736 | | assert!(!testlook!(look, "a\na", 2)); |
1737 | | } |
1738 | | |
1739 | | #[test] |
1740 | | fn look_matches_start_text() { |
1741 | | let look = Look::Start; |
1742 | | |
1743 | | assert!(testlook!(look, "", 0)); |
1744 | | assert!(testlook!(look, "\n", 0)); |
1745 | | assert!(testlook!(look, "a", 0)); |
1746 | | |
1747 | | assert!(!testlook!(look, "\n", 1)); |
1748 | | assert!(!testlook!(look, "\na", 1)); |
1749 | | assert!(!testlook!(look, "a", 1)); |
1750 | | assert!(!testlook!(look, "a\na", 1)); |
1751 | | } |
1752 | | |
1753 | | #[test] |
1754 | | fn look_matches_end_text() { |
1755 | | let look = Look::End; |
1756 | | |
1757 | | assert!(testlook!(look, "", 0)); |
1758 | | assert!(testlook!(look, "\n", 1)); |
1759 | | assert!(testlook!(look, "\na", 2)); |
1760 | | |
1761 | | assert!(!testlook!(look, "\na", 0)); |
1762 | | assert!(!testlook!(look, "a\na", 1)); |
1763 | | assert!(!testlook!(look, "a", 0)); |
1764 | | assert!(!testlook!(look, "\na", 1)); |
1765 | | assert!(!testlook!(look, "a\na", 0)); |
1766 | | assert!(!testlook!(look, "a\na", 2)); |
1767 | | } |
1768 | | |
1769 | | #[test] |
1770 | | #[cfg(all(not(miri), feature = "unicode-word-boundary"))] |
1771 | | fn look_matches_word_unicode() { |
1772 | | let look = Look::WordUnicode; |
1773 | | |
1774 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
1775 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
1776 | | |
1777 | | // Simple ASCII word boundaries. |
1778 | | assert!(testlook!(look, "a", 0)); |
1779 | | assert!(testlook!(look, "a", 1)); |
1780 | | assert!(testlook!(look, "a ", 1)); |
1781 | | assert!(testlook!(look, " a ", 1)); |
1782 | | assert!(testlook!(look, " a ", 2)); |
1783 | | |
1784 | | // Unicode word boundaries with a non-ASCII codepoint. |
1785 | | assert!(testlook!(look, "𝛃", 0)); |
1786 | | assert!(testlook!(look, "𝛃", 4)); |
1787 | | assert!(testlook!(look, "𝛃 ", 4)); |
1788 | | assert!(testlook!(look, " 𝛃 ", 1)); |
1789 | | assert!(testlook!(look, " 𝛃 ", 5)); |
1790 | | |
1791 | | // Unicode word boundaries between non-ASCII codepoints. |
1792 | | assert!(testlook!(look, "𝛃𐆀", 0)); |
1793 | | assert!(testlook!(look, "𝛃𐆀", 4)); |
1794 | | |
1795 | | // Non word boundaries for ASCII. |
1796 | | assert!(!testlook!(look, "", 0)); |
1797 | | assert!(!testlook!(look, "ab", 1)); |
1798 | | assert!(!testlook!(look, "a ", 2)); |
1799 | | assert!(!testlook!(look, " a ", 0)); |
1800 | | assert!(!testlook!(look, " a ", 3)); |
1801 | | |
1802 | | // Non word boundaries with a non-ASCII codepoint. |
1803 | | assert!(!testlook!(look, "𝛃b", 4)); |
1804 | | assert!(!testlook!(look, "𝛃 ", 5)); |
1805 | | assert!(!testlook!(look, " 𝛃 ", 0)); |
1806 | | assert!(!testlook!(look, " 𝛃 ", 6)); |
1807 | | assert!(!testlook!(look, "𝛃", 1)); |
1808 | | assert!(!testlook!(look, "𝛃", 2)); |
1809 | | assert!(!testlook!(look, "𝛃", 3)); |
1810 | | |
1811 | | // Non word boundaries with non-ASCII codepoints. |
1812 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
1813 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
1814 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
1815 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
1816 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
1817 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
1818 | | assert!(!testlook!(look, "𝛃𐆀", 8)); |
1819 | | } |
1820 | | |
1821 | | #[test] |
1822 | | fn look_matches_word_ascii() { |
1823 | | let look = Look::WordAscii; |
1824 | | |
1825 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
1826 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
1827 | | |
1828 | | // Simple ASCII word boundaries. |
1829 | | assert!(testlook!(look, "a", 0)); |
1830 | | assert!(testlook!(look, "a", 1)); |
1831 | | assert!(testlook!(look, "a ", 1)); |
1832 | | assert!(testlook!(look, " a ", 1)); |
1833 | | assert!(testlook!(look, " a ", 2)); |
1834 | | |
1835 | | // Unicode word boundaries with a non-ASCII codepoint. Since this is |
1836 | | // an ASCII word boundary, none of these match. |
1837 | | assert!(!testlook!(look, "𝛃", 0)); |
1838 | | assert!(!testlook!(look, "𝛃", 4)); |
1839 | | assert!(!testlook!(look, "𝛃 ", 4)); |
1840 | | assert!(!testlook!(look, " 𝛃 ", 1)); |
1841 | | assert!(!testlook!(look, " 𝛃 ", 5)); |
1842 | | |
1843 | | // Unicode word boundaries between non-ASCII codepoints. Again, since |
1844 | | // this is an ASCII word boundary, none of these match. |
1845 | | assert!(!testlook!(look, "𝛃𐆀", 0)); |
1846 | | assert!(!testlook!(look, "𝛃𐆀", 4)); |
1847 | | |
1848 | | // Non word boundaries for ASCII. |
1849 | | assert!(!testlook!(look, "", 0)); |
1850 | | assert!(!testlook!(look, "ab", 1)); |
1851 | | assert!(!testlook!(look, "a ", 2)); |
1852 | | assert!(!testlook!(look, " a ", 0)); |
1853 | | assert!(!testlook!(look, " a ", 3)); |
1854 | | |
1855 | | // Non word boundaries with a non-ASCII codepoint. |
1856 | | assert!(testlook!(look, "𝛃b", 4)); |
1857 | | assert!(!testlook!(look, "𝛃 ", 5)); |
1858 | | assert!(!testlook!(look, " 𝛃 ", 0)); |
1859 | | assert!(!testlook!(look, " 𝛃 ", 6)); |
1860 | | assert!(!testlook!(look, "𝛃", 1)); |
1861 | | assert!(!testlook!(look, "𝛃", 2)); |
1862 | | assert!(!testlook!(look, "𝛃", 3)); |
1863 | | |
1864 | | // Non word boundaries with non-ASCII codepoints. |
1865 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
1866 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
1867 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
1868 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
1869 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
1870 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
1871 | | assert!(!testlook!(look, "𝛃𐆀", 8)); |
1872 | | } |
1873 | | |
1874 | | #[test] |
1875 | | #[cfg(all(not(miri), feature = "unicode-word-boundary"))] |
1876 | | fn look_matches_word_unicode_negate() { |
1877 | | let look = Look::WordUnicodeNegate; |
1878 | | |
1879 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
1880 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
1881 | | |
1882 | | // Simple ASCII word boundaries. |
1883 | | assert!(!testlook!(look, "a", 0)); |
1884 | | assert!(!testlook!(look, "a", 1)); |
1885 | | assert!(!testlook!(look, "a ", 1)); |
1886 | | assert!(!testlook!(look, " a ", 1)); |
1887 | | assert!(!testlook!(look, " a ", 2)); |
1888 | | |
1889 | | // Unicode word boundaries with a non-ASCII codepoint. |
1890 | | assert!(!testlook!(look, "𝛃", 0)); |
1891 | | assert!(!testlook!(look, "𝛃", 4)); |
1892 | | assert!(!testlook!(look, "𝛃 ", 4)); |
1893 | | assert!(!testlook!(look, " 𝛃 ", 1)); |
1894 | | assert!(!testlook!(look, " 𝛃 ", 5)); |
1895 | | |
1896 | | // Unicode word boundaries between non-ASCII codepoints. |
1897 | | assert!(!testlook!(look, "𝛃𐆀", 0)); |
1898 | | assert!(!testlook!(look, "𝛃𐆀", 4)); |
1899 | | |
1900 | | // Non word boundaries for ASCII. |
1901 | | assert!(testlook!(look, "", 0)); |
1902 | | assert!(testlook!(look, "ab", 1)); |
1903 | | assert!(testlook!(look, "a ", 2)); |
1904 | | assert!(testlook!(look, " a ", 0)); |
1905 | | assert!(testlook!(look, " a ", 3)); |
1906 | | |
1907 | | // Non word boundaries with a non-ASCII codepoint. |
1908 | | assert!(testlook!(look, "𝛃b", 4)); |
1909 | | assert!(testlook!(look, "𝛃 ", 5)); |
1910 | | assert!(testlook!(look, " 𝛃 ", 0)); |
1911 | | assert!(testlook!(look, " 𝛃 ", 6)); |
1912 | | // These don't match because they could otherwise return an offset that |
1913 | | // splits the UTF-8 encoding of a codepoint. |
1914 | | assert!(!testlook!(look, "𝛃", 1)); |
1915 | | assert!(!testlook!(look, "𝛃", 2)); |
1916 | | assert!(!testlook!(look, "𝛃", 3)); |
1917 | | |
1918 | | // Non word boundaries with non-ASCII codepoints. These also don't |
1919 | | // match because they could otherwise return an offset that splits the |
1920 | | // UTF-8 encoding of a codepoint. |
1921 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
1922 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
1923 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
1924 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
1925 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
1926 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
1927 | | // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end |
1928 | | // of the haystack. So the "end" of the haystack isn't a word and 𐆀 |
1929 | | // isn't a word, thus, \B matches. |
1930 | | assert!(testlook!(look, "𝛃𐆀", 8)); |
1931 | | } |
1932 | | |
1933 | | #[test] |
1934 | | fn look_matches_word_ascii_negate() { |
1935 | | let look = Look::WordAsciiNegate; |
1936 | | |
1937 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
1938 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
1939 | | |
1940 | | // Simple ASCII word boundaries. |
1941 | | assert!(!testlook!(look, "a", 0)); |
1942 | | assert!(!testlook!(look, "a", 1)); |
1943 | | assert!(!testlook!(look, "a ", 1)); |
1944 | | assert!(!testlook!(look, " a ", 1)); |
1945 | | assert!(!testlook!(look, " a ", 2)); |
1946 | | |
1947 | | // Unicode word boundaries with a non-ASCII codepoint. Since this is an
1948 | | // ASCII word boundary, none of these are boundaries, so \B matches here.
1949 | | assert!(testlook!(look, "𝛃", 0)); |
1950 | | assert!(testlook!(look, "𝛃", 4)); |
1951 | | assert!(testlook!(look, "𝛃 ", 4)); |
1952 | | assert!(testlook!(look, " 𝛃 ", 1)); |
1953 | | assert!(testlook!(look, " 𝛃 ", 5)); |
1954 | | |
1955 | | // Unicode word boundaries between non-ASCII codepoints. Again, since this
1956 | | // is an ASCII word boundary, these aren't boundaries, so \B matches here.
1957 | | assert!(testlook!(look, "𝛃𐆀", 0)); |
1958 | | assert!(testlook!(look, "𝛃𐆀", 4)); |
1959 | | |
1960 | | // Non word boundaries for ASCII. |
1961 | | assert!(testlook!(look, "", 0)); |
1962 | | assert!(testlook!(look, "ab", 1)); |
1963 | | assert!(testlook!(look, "a ", 2)); |
1964 | | assert!(testlook!(look, " a ", 0)); |
1965 | | assert!(testlook!(look, " a ", 3)); |
1966 | | |
1967 | | // Non word boundaries with a non-ASCII codepoint. |
1968 | | assert!(!testlook!(look, "𝛃b", 4)); |
1969 | | assert!(testlook!(look, "𝛃 ", 5)); |
1970 | | assert!(testlook!(look, " 𝛃 ", 0)); |
1971 | | assert!(testlook!(look, " 𝛃 ", 6)); |
1972 | | assert!(testlook!(look, "𝛃", 1)); |
1973 | | assert!(testlook!(look, "𝛃", 2)); |
1974 | | assert!(testlook!(look, "𝛃", 3)); |
1975 | | |
1976 | | // Non word boundaries with non-ASCII codepoints. |
1977 | | assert!(testlook!(look, "𝛃𐆀", 1)); |
1978 | | assert!(testlook!(look, "𝛃𐆀", 2)); |
1979 | | assert!(testlook!(look, "𝛃𐆀", 3)); |
1980 | | assert!(testlook!(look, "𝛃𐆀", 5)); |
1981 | | assert!(testlook!(look, "𝛃𐆀", 6)); |
1982 | | assert!(testlook!(look, "𝛃𐆀", 7)); |
1983 | | assert!(testlook!(look, "𝛃𐆀", 8)); |
1984 | | } |
1985 | | |
1986 | | #[test] |
1987 | | fn look_matches_word_start_ascii() { |
1988 | | let look = Look::WordStartAscii; |
1989 | | |
1990 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
1991 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
1992 | | |
1993 | | // Simple ASCII word boundaries. |
1994 | | assert!(testlook!(look, "a", 0)); |
1995 | | assert!(!testlook!(look, "a", 1)); |
1996 | | assert!(!testlook!(look, "a ", 1)); |
1997 | | assert!(testlook!(look, " a ", 1)); |
1998 | | assert!(!testlook!(look, " a ", 2)); |
1999 | | |
2000 | | // Unicode word boundaries with a non-ASCII codepoint. Since this is |
2001 | | // an ASCII word boundary, none of these match. |
2002 | | assert!(!testlook!(look, "𝛃", 0)); |
2003 | | assert!(!testlook!(look, "𝛃", 4)); |
2004 | | assert!(!testlook!(look, "𝛃 ", 4)); |
2005 | | assert!(!testlook!(look, " 𝛃 ", 1)); |
2006 | | assert!(!testlook!(look, " 𝛃 ", 5)); |
2007 | | |
2008 | | // Unicode word boundaries between non-ASCII codepoints. Again, since |
2009 | | // this is an ASCII word boundary, none of these match. |
2010 | | assert!(!testlook!(look, "𝛃𐆀", 0)); |
2011 | | assert!(!testlook!(look, "𝛃𐆀", 4)); |
2012 | | |
2013 | | // Non word boundaries for ASCII. |
2014 | | assert!(!testlook!(look, "", 0)); |
2015 | | assert!(!testlook!(look, "ab", 1)); |
2016 | | assert!(!testlook!(look, "a ", 2)); |
2017 | | assert!(!testlook!(look, " a ", 0)); |
2018 | | assert!(!testlook!(look, " a ", 3)); |
2019 | | |
2020 | | // Non word boundaries with a non-ASCII codepoint. |
2021 | | assert!(testlook!(look, "𝛃b", 4)); |
2022 | | assert!(!testlook!(look, "b𝛃", 1)); |
2023 | | assert!(!testlook!(look, "𝛃 ", 5)); |
2024 | | assert!(!testlook!(look, " 𝛃 ", 0)); |
2025 | | assert!(!testlook!(look, " 𝛃 ", 6)); |
2026 | | assert!(!testlook!(look, "𝛃", 1)); |
2027 | | assert!(!testlook!(look, "𝛃", 2)); |
2028 | | assert!(!testlook!(look, "𝛃", 3)); |
2029 | | |
2030 | | // Non word boundaries with non-ASCII codepoints. |
2031 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
2032 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
2033 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
2034 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
2035 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
2036 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
2037 | | assert!(!testlook!(look, "𝛃𐆀", 8)); |
2038 | | } |
2039 | | |
2040 | | #[test] |
2041 | | fn look_matches_word_end_ascii() { |
2042 | | let look = Look::WordEndAscii; |
2043 | | |
2044 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
2045 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
2046 | | |
2047 | | // Simple ASCII word boundaries. |
2048 | | assert!(!testlook!(look, "a", 0)); |
2049 | | assert!(testlook!(look, "a", 1)); |
2050 | | assert!(testlook!(look, "a ", 1)); |
2051 | | assert!(!testlook!(look, " a ", 1)); |
2052 | | assert!(testlook!(look, " a ", 2)); |
2053 | | |
2054 | | // Unicode word boundaries with a non-ASCII codepoint. Since this is |
2055 | | // an ASCII word boundary, none of these match. |
2056 | | assert!(!testlook!(look, "𝛃", 0)); |
2057 | | assert!(!testlook!(look, "𝛃", 4)); |
2058 | | assert!(!testlook!(look, "𝛃 ", 4)); |
2059 | | assert!(!testlook!(look, " 𝛃 ", 1)); |
2060 | | assert!(!testlook!(look, " 𝛃 ", 5)); |
2061 | | |
2062 | | // Unicode word boundaries between non-ASCII codepoints. Again, since |
2063 | | // this is an ASCII word boundary, none of these match. |
2064 | | assert!(!testlook!(look, "𝛃𐆀", 0)); |
2065 | | assert!(!testlook!(look, "𝛃𐆀", 4)); |
2066 | | |
2067 | | // Non word boundaries for ASCII. |
2068 | | assert!(!testlook!(look, "", 0)); |
2069 | | assert!(!testlook!(look, "ab", 1)); |
2070 | | assert!(!testlook!(look, "a ", 2)); |
2071 | | assert!(!testlook!(look, " a ", 0)); |
2072 | | assert!(!testlook!(look, " a ", 3)); |
2073 | | |
2074 | | // Non word boundaries with a non-ASCII codepoint. |
2075 | | assert!(!testlook!(look, "𝛃b", 4)); |
2076 | | assert!(testlook!(look, "b𝛃", 1)); |
2077 | | assert!(!testlook!(look, "𝛃 ", 5)); |
2078 | | assert!(!testlook!(look, " 𝛃 ", 0)); |
2079 | | assert!(!testlook!(look, " 𝛃 ", 6)); |
2080 | | assert!(!testlook!(look, "𝛃", 1)); |
2081 | | assert!(!testlook!(look, "𝛃", 2)); |
2082 | | assert!(!testlook!(look, "𝛃", 3)); |
2083 | | |
2084 | | // Non word boundaries with non-ASCII codepoints. |
2085 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
2086 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
2087 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
2088 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
2089 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
2090 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
2091 | | assert!(!testlook!(look, "𝛃𐆀", 8)); |
2092 | | } |
2093 | | |
2094 | | #[test] |
2095 | | #[cfg(all(not(miri), feature = "unicode-word-boundary"))] |
2096 | | fn look_matches_word_start_unicode() { |
2097 | | let look = Look::WordStartUnicode; |
2098 | | |
2099 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
2100 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
2101 | | |
2102 | | // Simple ASCII word boundaries. |
2103 | | assert!(testlook!(look, "a", 0)); |
2104 | | assert!(!testlook!(look, "a", 1)); |
2105 | | assert!(!testlook!(look, "a ", 1)); |
2106 | | assert!(testlook!(look, " a ", 1)); |
2107 | | assert!(!testlook!(look, " a ", 2)); |
2108 | | |
2109 | | // Unicode word boundaries with a non-ASCII codepoint. |
2110 | | assert!(testlook!(look, "𝛃", 0)); |
2111 | | assert!(!testlook!(look, "𝛃", 4)); |
2112 | | assert!(!testlook!(look, "𝛃 ", 4)); |
2113 | | assert!(testlook!(look, " 𝛃 ", 1)); |
2114 | | assert!(!testlook!(look, " 𝛃 ", 5)); |
2115 | | |
2116 | | // Unicode word boundaries between non-ASCII codepoints. |
2117 | | assert!(testlook!(look, "𝛃𐆀", 0)); |
2118 | | assert!(!testlook!(look, "𝛃𐆀", 4)); |
2119 | | |
2120 | | // Non word boundaries for ASCII. |
2121 | | assert!(!testlook!(look, "", 0)); |
2122 | | assert!(!testlook!(look, "ab", 1)); |
2123 | | assert!(!testlook!(look, "a ", 2)); |
2124 | | assert!(!testlook!(look, " a ", 0)); |
2125 | | assert!(!testlook!(look, " a ", 3)); |
2126 | | |
2127 | | // Non word boundaries with a non-ASCII codepoint. |
2128 | | assert!(!testlook!(look, "𝛃b", 4)); |
2129 | | assert!(!testlook!(look, "b𝛃", 1)); |
2130 | | assert!(!testlook!(look, "𝛃 ", 5)); |
2131 | | assert!(!testlook!(look, " 𝛃 ", 0)); |
2132 | | assert!(!testlook!(look, " 𝛃 ", 6)); |
2133 | | assert!(!testlook!(look, "𝛃", 1)); |
2134 | | assert!(!testlook!(look, "𝛃", 2)); |
2135 | | assert!(!testlook!(look, "𝛃", 3)); |
2136 | | |
2137 | | // Non word boundaries with non-ASCII codepoints. |
2138 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
2139 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
2140 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
2141 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
2142 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
2143 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
2144 | | assert!(!testlook!(look, "𝛃𐆀", 8)); |
2145 | | } |
2146 | | |
2147 | | #[test] |
2148 | | #[cfg(all(not(miri), feature = "unicode-word-boundary"))] |
2149 | | fn look_matches_word_end_unicode() { |
2150 | | let look = Look::WordEndUnicode; |
2151 | | |
2152 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
2153 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
2154 | | |
2155 | | // Simple ASCII word boundaries. |
2156 | | assert!(!testlook!(look, "a", 0)); |
2157 | | assert!(testlook!(look, "a", 1)); |
2158 | | assert!(testlook!(look, "a ", 1)); |
2159 | | assert!(!testlook!(look, " a ", 1)); |
2160 | | assert!(testlook!(look, " a ", 2)); |
2161 | | |
2162 | | // Unicode word boundaries with a non-ASCII codepoint. |
2163 | | assert!(!testlook!(look, "𝛃", 0)); |
2164 | | assert!(testlook!(look, "𝛃", 4)); |
2165 | | assert!(testlook!(look, "𝛃 ", 4)); |
2166 | | assert!(!testlook!(look, " 𝛃 ", 1)); |
2167 | | assert!(testlook!(look, " 𝛃 ", 5)); |
2168 | | |
2169 | | // Unicode word boundaries between non-ASCII codepoints. |
2170 | | assert!(!testlook!(look, "𝛃𐆀", 0)); |
2171 | | assert!(testlook!(look, "𝛃𐆀", 4)); |
2172 | | |
2173 | | // Non word boundaries for ASCII. |
2174 | | assert!(!testlook!(look, "", 0)); |
2175 | | assert!(!testlook!(look, "ab", 1)); |
2176 | | assert!(!testlook!(look, "a ", 2)); |
2177 | | assert!(!testlook!(look, " a ", 0)); |
2178 | | assert!(!testlook!(look, " a ", 3)); |
2179 | | |
2180 | | // Non word boundaries with a non-ASCII codepoint. |
2181 | | assert!(!testlook!(look, "𝛃b", 4)); |
2182 | | assert!(!testlook!(look, "b𝛃", 1)); |
2183 | | assert!(!testlook!(look, "𝛃 ", 5)); |
2184 | | assert!(!testlook!(look, " 𝛃 ", 0)); |
2185 | | assert!(!testlook!(look, " 𝛃 ", 6)); |
2186 | | assert!(!testlook!(look, "𝛃", 1)); |
2187 | | assert!(!testlook!(look, "𝛃", 2)); |
2188 | | assert!(!testlook!(look, "𝛃", 3)); |
2189 | | |
2190 | | // Non word boundaries with non-ASCII codepoints. |
2191 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
2192 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
2193 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
2194 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
2195 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
2196 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
2197 | | assert!(!testlook!(look, "𝛃𐆀", 8)); |
2198 | | } |
2199 | | |
2200 | | #[test] |
2201 | | fn look_matches_word_start_half_ascii() { |
2202 | | let look = Look::WordStartHalfAscii; |
2203 | | |
2204 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
2205 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
2206 | | |
2207 | | // Simple ASCII word boundaries. |
2208 | | assert!(testlook!(look, "a", 0)); |
2209 | | assert!(!testlook!(look, "a", 1)); |
2210 | | assert!(!testlook!(look, "a ", 1)); |
2211 | | assert!(testlook!(look, " a ", 1)); |
2212 | | assert!(!testlook!(look, " a ", 2)); |
2213 | | |
2214 | | // Unicode word boundaries with a non-ASCII codepoint. Since 𝛃 has no
2215 | | // ASCII word bytes, the half assertion matches at all of these positions.
2216 | | assert!(testlook!(look, "𝛃", 0)); |
2217 | | assert!(testlook!(look, "𝛃", 4)); |
2218 | | assert!(testlook!(look, "𝛃 ", 4)); |
2219 | | assert!(testlook!(look, " 𝛃 ", 1)); |
2220 | | assert!(testlook!(look, " 𝛃 ", 5)); |
2221 | | |
2222 | | // Unicode word boundaries between non-ASCII codepoints. Again, since
2223 | | // neither codepoint has ASCII word bytes, all of these match.
2224 | | assert!(testlook!(look, "𝛃𐆀", 0)); |
2225 | | assert!(testlook!(look, "𝛃𐆀", 4)); |
2226 | | |
2227 | | // Non word boundaries for ASCII. |
2228 | | assert!(testlook!(look, "", 0)); |
2229 | | assert!(!testlook!(look, "ab", 1)); |
2230 | | assert!(testlook!(look, "a ", 2)); |
2231 | | assert!(testlook!(look, " a ", 0)); |
2232 | | assert!(testlook!(look, " a ", 3)); |
2233 | | |
2234 | | // Non word boundaries with a non-ASCII codepoint. |
2235 | | assert!(testlook!(look, "𝛃b", 4)); |
2236 | | assert!(!testlook!(look, "b𝛃", 1)); |
2237 | | assert!(testlook!(look, "𝛃 ", 5)); |
2238 | | assert!(testlook!(look, " 𝛃 ", 0)); |
2239 | | assert!(testlook!(look, " 𝛃 ", 6)); |
2240 | | assert!(testlook!(look, "𝛃", 1)); |
2241 | | assert!(testlook!(look, "𝛃", 2)); |
2242 | | assert!(testlook!(look, "𝛃", 3)); |
2243 | | |
2244 | | // Non word boundaries with non-ASCII codepoints. |
2245 | | assert!(testlook!(look, "𝛃𐆀", 1)); |
2246 | | assert!(testlook!(look, "𝛃𐆀", 2)); |
2247 | | assert!(testlook!(look, "𝛃𐆀", 3)); |
2248 | | assert!(testlook!(look, "𝛃𐆀", 5)); |
2249 | | assert!(testlook!(look, "𝛃𐆀", 6)); |
2250 | | assert!(testlook!(look, "𝛃𐆀", 7)); |
2251 | | assert!(testlook!(look, "𝛃𐆀", 8)); |
2252 | | } |
2253 | | |
2254 | | #[test] |
2255 | | fn look_matches_word_end_half_ascii() { |
2256 | | let look = Look::WordEndHalfAscii; |
2257 | | |
2258 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
2259 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
2260 | | |
2261 | | // Simple ASCII word boundaries. |
2262 | | assert!(!testlook!(look, "a", 0)); |
2263 | | assert!(testlook!(look, "a", 1)); |
2264 | | assert!(testlook!(look, "a ", 1)); |
2265 | | assert!(!testlook!(look, " a ", 1)); |
2266 | | assert!(testlook!(look, " a ", 2)); |
2267 | | |
2268 | | // Unicode word boundaries with a non-ASCII codepoint. Since 𝛃 has no
2269 | | // ASCII word bytes, the half assertion matches at all of these positions.
2270 | | assert!(testlook!(look, "𝛃", 0)); |
2271 | | assert!(testlook!(look, "𝛃", 4)); |
2272 | | assert!(testlook!(look, "𝛃 ", 4)); |
2273 | | assert!(testlook!(look, " 𝛃 ", 1)); |
2274 | | assert!(testlook!(look, " 𝛃 ", 5)); |
2275 | | |
2276 | | // Unicode word boundaries between non-ASCII codepoints. Again, since
2277 | | // neither codepoint has ASCII word bytes, all of these match.
2278 | | assert!(testlook!(look, "𝛃𐆀", 0)); |
2279 | | assert!(testlook!(look, "𝛃𐆀", 4)); |
2280 | | |
2281 | | // Non word boundaries for ASCII. |
2282 | | assert!(testlook!(look, "", 0)); |
2283 | | assert!(!testlook!(look, "ab", 1)); |
2284 | | assert!(testlook!(look, "a ", 2)); |
2285 | | assert!(testlook!(look, " a ", 0)); |
2286 | | assert!(testlook!(look, " a ", 3)); |
2287 | | |
2288 | | // Non word boundaries with a non-ASCII codepoint. |
2289 | | assert!(!testlook!(look, "𝛃b", 4)); |
2290 | | assert!(testlook!(look, "b𝛃", 1)); |
2291 | | assert!(testlook!(look, "𝛃 ", 5)); |
2292 | | assert!(testlook!(look, " 𝛃 ", 0)); |
2293 | | assert!(testlook!(look, " 𝛃 ", 6)); |
2294 | | assert!(testlook!(look, "𝛃", 1)); |
2295 | | assert!(testlook!(look, "𝛃", 2)); |
2296 | | assert!(testlook!(look, "𝛃", 3)); |
2297 | | |
2298 | | // Non word boundaries with non-ASCII codepoints. |
2299 | | assert!(testlook!(look, "𝛃𐆀", 1)); |
2300 | | assert!(testlook!(look, "𝛃𐆀", 2)); |
2301 | | assert!(testlook!(look, "𝛃𐆀", 3)); |
2302 | | assert!(testlook!(look, "𝛃𐆀", 5)); |
2303 | | assert!(testlook!(look, "𝛃𐆀", 6)); |
2304 | | assert!(testlook!(look, "𝛃𐆀", 7)); |
2305 | | assert!(testlook!(look, "𝛃𐆀", 8)); |
2306 | | } |
2307 | | |
2308 | | #[test] |
2309 | | #[cfg(all(not(miri), feature = "unicode-word-boundary"))] |
2310 | | fn look_matches_word_start_half_unicode() { |
2311 | | let look = Look::WordStartHalfUnicode; |
2312 | | |
2313 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
2314 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
2315 | | |
2316 | | // Simple ASCII word boundaries. |
2317 | | assert!(testlook!(look, "a", 0)); |
2318 | | assert!(!testlook!(look, "a", 1)); |
2319 | | assert!(!testlook!(look, "a ", 1)); |
2320 | | assert!(testlook!(look, " a ", 1)); |
2321 | | assert!(!testlook!(look, " a ", 2)); |
2322 | | |
2323 | | // Unicode word boundaries with a non-ASCII codepoint. |
2324 | | assert!(testlook!(look, "𝛃", 0)); |
2325 | | assert!(!testlook!(look, "𝛃", 4)); |
2326 | | assert!(!testlook!(look, "𝛃 ", 4)); |
2327 | | assert!(testlook!(look, " 𝛃 ", 1)); |
2328 | | assert!(!testlook!(look, " 𝛃 ", 5)); |
2329 | | |
2330 | | // Unicode word boundaries between non-ASCII codepoints. |
2331 | | assert!(testlook!(look, "𝛃𐆀", 0)); |
2332 | | assert!(!testlook!(look, "𝛃𐆀", 4)); |
2333 | | |
2334 | | // Non word boundaries for ASCII. |
2335 | | assert!(testlook!(look, "", 0)); |
2336 | | assert!(!testlook!(look, "ab", 1)); |
2337 | | assert!(testlook!(look, "a ", 2)); |
2338 | | assert!(testlook!(look, " a ", 0)); |
2339 | | assert!(testlook!(look, " a ", 3)); |
2340 | | |
2341 | | // Non word boundaries with a non-ASCII codepoint. |
2342 | | assert!(!testlook!(look, "𝛃b", 4)); |
2343 | | assert!(!testlook!(look, "b𝛃", 1)); |
2344 | | assert!(testlook!(look, "𝛃 ", 5)); |
2345 | | assert!(testlook!(look, " 𝛃 ", 0)); |
2346 | | assert!(testlook!(look, " 𝛃 ", 6)); |
2347 | | assert!(!testlook!(look, "𝛃", 1)); |
2348 | | assert!(!testlook!(look, "𝛃", 2)); |
2349 | | assert!(!testlook!(look, "𝛃", 3)); |
2350 | | |
2351 | | // Non word boundaries with non-ASCII codepoints. |
2352 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
2353 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
2354 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
2355 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
2356 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
2357 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
2358 | | assert!(testlook!(look, "𝛃𐆀", 8)); |
2359 | | } |
2360 | | |
2361 | | #[test] |
2362 | | #[cfg(all(not(miri), feature = "unicode-word-boundary"))] |
2363 | | fn look_matches_word_end_half_unicode() { |
2364 | | let look = Look::WordEndHalfUnicode; |
2365 | | |
2366 | | // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
2367 | | // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
2368 | | |
2369 | | // Simple ASCII word boundaries. |
2370 | | assert!(!testlook!(look, "a", 0)); |
2371 | | assert!(testlook!(look, "a", 1)); |
2372 | | assert!(testlook!(look, "a ", 1)); |
2373 | | assert!(!testlook!(look, " a ", 1)); |
2374 | | assert!(testlook!(look, " a ", 2)); |
2375 | | |
2376 | | // Unicode word boundaries with a non-ASCII codepoint. |
2377 | | assert!(!testlook!(look, "𝛃", 0)); |
2378 | | assert!(testlook!(look, "𝛃", 4)); |
2379 | | assert!(testlook!(look, "𝛃 ", 4)); |
2380 | | assert!(!testlook!(look, " 𝛃 ", 1)); |
2381 | | assert!(testlook!(look, " 𝛃 ", 5)); |
2382 | | |
2383 | | // Unicode word boundaries between non-ASCII codepoints. |
2384 | | assert!(!testlook!(look, "𝛃𐆀", 0)); |
2385 | | assert!(testlook!(look, "𝛃𐆀", 4)); |
2386 | | |
2387 | | // Non word boundaries for ASCII. |
2388 | | assert!(testlook!(look, "", 0)); |
2389 | | assert!(!testlook!(look, "ab", 1)); |
2390 | | assert!(testlook!(look, "a ", 2)); |
2391 | | assert!(testlook!(look, " a ", 0)); |
2392 | | assert!(testlook!(look, " a ", 3)); |
2393 | | |
2394 | | // Non word boundaries with a non-ASCII codepoint. |
2395 | | assert!(!testlook!(look, "𝛃b", 4)); |
2396 | | assert!(!testlook!(look, "b𝛃", 1)); |
2397 | | assert!(testlook!(look, "𝛃 ", 5)); |
2398 | | assert!(testlook!(look, " 𝛃 ", 0)); |
2399 | | assert!(testlook!(look, " 𝛃 ", 6)); |
2400 | | assert!(!testlook!(look, "𝛃", 1)); |
2401 | | assert!(!testlook!(look, "𝛃", 2)); |
2402 | | assert!(!testlook!(look, "𝛃", 3)); |
2403 | | |
2404 | | // Non word boundaries with non-ASCII codepoints. |
2405 | | assert!(!testlook!(look, "𝛃𐆀", 1)); |
2406 | | assert!(!testlook!(look, "𝛃𐆀", 2)); |
2407 | | assert!(!testlook!(look, "𝛃𐆀", 3)); |
2408 | | assert!(!testlook!(look, "𝛃𐆀", 5)); |
2409 | | assert!(!testlook!(look, "𝛃𐆀", 6)); |
2410 | | assert!(!testlook!(look, "𝛃𐆀", 7)); |
2411 | | assert!(testlook!(look, "𝛃𐆀", 8)); |
2412 | | } |
2413 | | |
2414 | | #[test] |
2415 | | fn look_set() { |
2416 | | let mut f = LookSet::default(); |
2417 | | assert!(!f.contains(Look::Start)); |
2418 | | assert!(!f.contains(Look::End)); |
2419 | | assert!(!f.contains(Look::StartLF)); |
2420 | | assert!(!f.contains(Look::EndLF)); |
2421 | | assert!(!f.contains(Look::WordUnicode)); |
2422 | | assert!(!f.contains(Look::WordUnicodeNegate)); |
2423 | | assert!(!f.contains(Look::WordAscii)); |
2424 | | assert!(!f.contains(Look::WordAsciiNegate)); |
2425 | | |
2426 | | f = f.insert(Look::Start); |
2427 | | assert!(f.contains(Look::Start)); |
2428 | | f = f.remove(Look::Start); |
2429 | | assert!(!f.contains(Look::Start)); |
2430 | | |
2431 | | f = f.insert(Look::End); |
2432 | | assert!(f.contains(Look::End)); |
2433 | | f = f.remove(Look::End); |
2434 | | assert!(!f.contains(Look::End)); |
2435 | | |
2436 | | f = f.insert(Look::StartLF); |
2437 | | assert!(f.contains(Look::StartLF)); |
2438 | | f = f.remove(Look::StartLF); |
2439 | | assert!(!f.contains(Look::StartLF)); |
2440 | | |
2441 | | f = f.insert(Look::EndLF); |
2442 | | assert!(f.contains(Look::EndLF)); |
2443 | | f = f.remove(Look::EndLF); |
2444 | | assert!(!f.contains(Look::EndLF)); |
2445 | | |
2446 | | f = f.insert(Look::StartCRLF); |
2447 | | assert!(f.contains(Look::StartCRLF)); |
2448 | | f = f.remove(Look::StartCRLF); |
2449 | | assert!(!f.contains(Look::StartCRLF)); |
2450 | | |
2451 | | f = f.insert(Look::EndCRLF); |
2452 | | assert!(f.contains(Look::EndCRLF)); |
2453 | | f = f.remove(Look::EndCRLF); |
2454 | | assert!(!f.contains(Look::EndCRLF)); |
2455 | | |
2456 | | f = f.insert(Look::WordUnicode); |
2457 | | assert!(f.contains(Look::WordUnicode)); |
2458 | | f = f.remove(Look::WordUnicode); |
2459 | | assert!(!f.contains(Look::WordUnicode)); |
2460 | | |
2461 | | f = f.insert(Look::WordUnicodeNegate); |
2462 | | assert!(f.contains(Look::WordUnicodeNegate)); |
2463 | | f = f.remove(Look::WordUnicodeNegate); |
2464 | | assert!(!f.contains(Look::WordUnicodeNegate)); |
2465 | | |
2466 | | f = f.insert(Look::WordAscii); |
2467 | | assert!(f.contains(Look::WordAscii)); |
2468 | | f = f.remove(Look::WordAscii); |
2469 | | assert!(!f.contains(Look::WordAscii)); |
2470 | | |
2471 | | f = f.insert(Look::WordAsciiNegate); |
2472 | | assert!(f.contains(Look::WordAsciiNegate)); |
2473 | | f = f.remove(Look::WordAsciiNegate); |
2474 | | assert!(!f.contains(Look::WordAsciiNegate)); |
2475 | | |
2476 | | f = f.insert(Look::WordStartAscii); |
2477 | | assert!(f.contains(Look::WordStartAscii)); |
2478 | | f = f.remove(Look::WordStartAscii); |
2479 | | assert!(!f.contains(Look::WordStartAscii)); |
2480 | | |
2481 | | f = f.insert(Look::WordEndAscii); |
2482 | | assert!(f.contains(Look::WordEndAscii)); |
2483 | | f = f.remove(Look::WordEndAscii); |
2484 | | assert!(!f.contains(Look::WordEndAscii)); |
2485 | | |
2486 | | f = f.insert(Look::WordStartUnicode); |
2487 | | assert!(f.contains(Look::WordStartUnicode)); |
2488 | | f = f.remove(Look::WordStartUnicode); |
2489 | | assert!(!f.contains(Look::WordStartUnicode)); |
2490 | | |
2491 | | f = f.insert(Look::WordEndUnicode); |
2492 | | assert!(f.contains(Look::WordEndUnicode)); |
2493 | | f = f.remove(Look::WordEndUnicode); |
2494 | | assert!(!f.contains(Look::WordEndUnicode)); |
2495 | | |
2496 | | f = f.insert(Look::WordStartHalfAscii); |
2497 | | assert!(f.contains(Look::WordStartHalfAscii)); |
2498 | | f = f.remove(Look::WordStartHalfAscii); |
2499 | | assert!(!f.contains(Look::WordStartHalfAscii)); |
2500 | | |
2501 | | f = f.insert(Look::WordEndHalfAscii); |
2502 | | assert!(f.contains(Look::WordEndHalfAscii)); |
2503 | | f = f.remove(Look::WordEndHalfAscii); |
2504 | | assert!(!f.contains(Look::WordEndHalfAscii)); |
2505 | | |
2506 | | f = f.insert(Look::WordStartHalfUnicode); |
2507 | | assert!(f.contains(Look::WordStartHalfUnicode)); |
2508 | | f = f.remove(Look::WordStartHalfUnicode); |
2509 | | assert!(!f.contains(Look::WordStartHalfUnicode)); |
2510 | | |
2511 | | f = f.insert(Look::WordEndHalfUnicode); |
2512 | | assert!(f.contains(Look::WordEndHalfUnicode)); |
2513 | | f = f.remove(Look::WordEndHalfUnicode); |
2514 | | assert!(!f.contains(Look::WordEndHalfUnicode)); |
2515 | | } |
2516 | | |
2517 | | #[test] |
2518 | | fn look_set_iter() { |
2519 | | let set = LookSet::empty(); |
2520 | | assert_eq!(0, set.iter().count()); |
2521 | | |
2522 | | let set = LookSet::full(); |
2523 | | assert_eq!(18, set.iter().count()); |
2524 | | |
2525 | | let set = |
2526 | | LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); |
2527 | | assert_eq!(2, set.iter().count()); |
2528 | | |
2529 | | let set = LookSet::empty().insert(Look::StartLF); |
2530 | | assert_eq!(1, set.iter().count()); |
2531 | | |
2532 | | let set = LookSet::empty().insert(Look::WordAsciiNegate); |
2533 | | assert_eq!(1, set.iter().count()); |
2534 | | |
2535 | | let set = LookSet::empty().insert(Look::WordEndHalfUnicode); |
2536 | | assert_eq!(1, set.iter().count()); |
2537 | | } |
2538 | | |
2539 | | #[test] |
2540 | | #[cfg(feature = "alloc")] |
2541 | | fn look_set_debug() { |
2542 | | let res = alloc::format!("{:?}", LookSet::empty()); |
2543 | | assert_eq!("∅", res); |
2544 | | let res = alloc::format!("{:?}", LookSet::full()); |
2545 | | assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); |
2546 | | } |
2547 | | } |