/src/unicode-segmentation/src/word.rs
Line | Count | Source |
1 | | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | use core::cmp; |
12 | | use core::iter::Filter; |
13 | | |
14 | | use crate::tables::word::WordCat; |
15 | | |
16 | | /// An iterator over the substrings of a string which, after splitting the string on |
17 | | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
18 | | /// contain any characters with the |
19 | | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
20 | | /// property, or with |
21 | | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
22 | | /// |
23 | | /// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See |
24 | | /// its documentation for more. |
25 | | /// |
26 | | /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words |
27 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
28 | | #[derive(Debug)] |
29 | | pub struct UnicodeWords<'a> { |
30 | | inner: WordsIter<'a>, |
31 | | } |
32 | | |
33 | | impl<'a> Iterator for UnicodeWords<'a> { |
34 | | type Item = &'a str; |
35 | | #[inline] |
36 | 2.04M | fn next(&mut self) -> Option<Self::Item> { |
37 | 2.04M | match &mut self.inner { |
38 | 693k | WordsIter::Ascii(i) => i.next(), |
39 | 1.35M | WordsIter::Unicode(i) => i.next(), |
40 | | } |
41 | 2.04M | } <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next Line | Count | Source | 36 | 2.04M | fn next(&mut self) -> Option<Self::Item> { | 37 | 2.04M | match &mut self.inner { | 38 | 693k | WordsIter::Ascii(i) => i.next(), | 39 | 1.35M | WordsIter::Unicode(i) => i.next(), | 40 | | } | 41 | 2.04M | } |
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next |
42 | | #[inline] |
43 | 4.39k | fn size_hint(&self) -> (usize, Option<usize>) { |
44 | 4.39k | match &self.inner { |
45 | 1.49k | WordsIter::Ascii(i) => i.size_hint(), |
46 | 2.90k | WordsIter::Unicode(i) => i.size_hint(), |
47 | | } |
48 | 4.39k | } <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint Line | Count | Source | 43 | 4.39k | fn size_hint(&self) -> (usize, Option<usize>) { | 44 | 4.39k | match &self.inner { | 45 | 1.49k | WordsIter::Ascii(i) => i.size_hint(), | 46 | 2.90k | WordsIter::Unicode(i) => i.size_hint(), | 47 | | } | 48 | 4.39k | } |
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint |
49 | | } |
50 | | impl<'a> DoubleEndedIterator for UnicodeWords<'a> { |
51 | | #[inline] |
52 | 0 | fn next_back(&mut self) -> Option<Self::Item> { |
53 | 0 | match &mut self.inner { |
54 | 0 | WordsIter::Ascii(i) => i.next_back(), |
55 | 0 | WordsIter::Unicode(i) => i.next_back(), |
56 | | } |
57 | 0 | } |
58 | | } |
59 | | |
60 | | /// An iterator over the substrings of a string which, after splitting the string on |
61 | | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
62 | | /// contain any characters with the |
63 | | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
64 | | /// property, or with |
65 | | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
66 | | /// This iterator also provides the byte offsets for each substring. |
67 | | /// |
68 | | /// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See |
69 | | /// its documentation for more. |
70 | | /// |
71 | | /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices |
72 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
73 | | #[derive(Debug)] |
74 | | pub struct UnicodeWordIndices<'a> { |
75 | | inner: IndicesIter<'a>, |
76 | | } |
77 | | |
78 | | impl<'a> Iterator for UnicodeWordIndices<'a> { |
79 | | type Item = (usize, &'a str); |
80 | | #[inline] |
81 | 0 | fn next(&mut self) -> Option<Self::Item> { |
82 | 0 | match &mut self.inner { |
83 | 0 | IndicesIter::Ascii(i) => i.next(), |
84 | 0 | IndicesIter::Unicode(i) => i.next(), |
85 | | } |
86 | 0 | } |
87 | | #[inline] |
88 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
89 | 0 | match &self.inner { |
90 | 0 | IndicesIter::Ascii(i) => i.size_hint(), |
91 | 0 | IndicesIter::Unicode(i) => i.size_hint(), |
92 | | } |
93 | 0 | } |
94 | | } |
95 | | impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { |
96 | | #[inline] |
97 | 0 | fn next_back(&mut self) -> Option<Self::Item> { |
98 | 0 | match &mut self.inner { |
99 | 0 | IndicesIter::Ascii(i) => i.next_back(), |
100 | 0 | IndicesIter::Unicode(i) => i.next_back(), |
101 | | } |
102 | 0 | } |
103 | | } |
104 | | |
105 | | /// External iterator for a string's |
106 | | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). |
107 | | /// |
108 | | /// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`] |
109 | | /// trait. See its documentation for more. |
110 | | /// |
111 | | /// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds |
112 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
113 | | #[derive(Debug, Clone)] |
114 | | pub struct UWordBounds<'a> { |
115 | | string: &'a str, |
116 | | cat: Option<WordCat>, |
117 | | catb: Option<WordCat>, |
118 | | } |
119 | | |
120 | | /// External iterator for word boundaries and byte offsets. |
121 | | /// |
122 | | /// This struct is created by the [`split_word_bound_indices`] method on the |
123 | | /// [`UnicodeSegmentation`] trait. See its documentation for more. |
124 | | /// |
125 | | /// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices |
126 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
127 | | #[derive(Debug, Clone)] |
128 | | pub struct UWordBoundIndices<'a> { |
129 | | start_offset: usize, |
130 | | iter: UWordBounds<'a>, |
131 | | } |
132 | | |
133 | | impl<'a> UWordBoundIndices<'a> { |
134 | | #[inline] |
135 | | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
136 | | /// |
137 | | /// ```rust |
138 | | /// # use unicode_segmentation::UnicodeSegmentation; |
139 | | /// let mut iter = "Hello world".split_word_bound_indices(); |
140 | | /// assert_eq!(iter.as_str(), "Hello world"); |
141 | | /// iter.next(); |
142 | | /// assert_eq!(iter.as_str(), " world"); |
143 | | /// iter.next(); |
144 | | /// assert_eq!(iter.as_str(), "world"); |
145 | | /// ``` |
146 | 0 | pub fn as_str(&self) -> &'a str { |
147 | 0 | self.iter.as_str() |
148 | 0 | } |
149 | | } |
150 | | |
151 | | impl<'a> Iterator for UWordBoundIndices<'a> { |
152 | | type Item = (usize, &'a str); |
153 | | |
154 | | #[inline] |
155 | 0 | fn next(&mut self) -> Option<(usize, &'a str)> { |
156 | 0 | self.iter |
157 | 0 | .next() |
158 | 0 | .map(|s| (s.as_ptr() as usize - self.start_offset, s)) |
159 | 0 | } |
160 | | |
161 | | #[inline] |
162 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
163 | 0 | self.iter.size_hint() |
164 | 0 | } |
165 | | } |
166 | | |
167 | | impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> { |
168 | | #[inline] |
169 | 0 | fn next_back(&mut self) -> Option<(usize, &'a str)> { |
170 | 0 | self.iter |
171 | 0 | .next_back() |
172 | 0 | .map(|s| (s.as_ptr() as usize - self.start_offset, s)) |
173 | 0 | } |
174 | | } |
175 | | |
176 | | // state machine for word boundary rules |
177 | | #[derive(Clone, Copy, PartialEq, Eq, Debug)] |
178 | | enum UWordBoundsState { |
179 | | Start, |
180 | | Letter, |
181 | | HLetter, |
182 | | Numeric, |
183 | | Katakana, |
184 | | ExtendNumLet, |
185 | | Regional(RegionalState), |
186 | | FormatExtend(FormatExtendType), |
187 | | Zwj, |
188 | | Emoji, |
189 | | WSegSpace, |
190 | | } |
191 | | |
192 | | // subtypes for FormatExtend state in UWordBoundsState |
193 | | #[derive(Clone, Copy, PartialEq, Eq, Debug)] |
194 | | enum FormatExtendType { |
195 | | AcceptAny, |
196 | | AcceptNone, |
197 | | RequireLetter, |
198 | | RequireHLetter, |
199 | | AcceptQLetter, |
200 | | RequireNumeric, |
201 | | } |
202 | | |
203 | | #[derive(Clone, Copy, PartialEq, Eq, Debug)] |
204 | | enum RegionalState { |
205 | | Half, |
206 | | Full, |
207 | | Unknown, |
208 | | } |
209 | | |
210 | 50.1k | fn is_emoji(ch: char) -> bool { |
211 | | use crate::tables::emoji; |
212 | 50.1k | emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic |
213 | 50.1k | } |
214 | | |
215 | | impl<'a> Iterator for UWordBounds<'a> { |
216 | | type Item = &'a str; |
217 | | |
218 | | #[inline] |
219 | 10.1k | fn size_hint(&self) -> (usize, Option<usize>) { |
220 | 10.1k | let slen = self.string.len(); |
221 | 10.1k | (cmp::min(slen, 1), Some(slen)) |
222 | 10.1k | } <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint Line | Count | Source | 219 | 10.1k | fn size_hint(&self) -> (usize, Option<usize>) { | 220 | 10.1k | let slen = self.string.len(); | 221 | 10.1k | (cmp::min(slen, 1), Some(slen)) | 222 | 10.1k | } |
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint |
223 | | |
224 | | #[inline] |
225 | 24.8M | fn next(&mut self) -> Option<&'a str> { |
226 | | use self::FormatExtendType::*; |
227 | | use self::UWordBoundsState::*; |
228 | | use crate::tables::word as wd; |
229 | 24.8M | if self.string.is_empty() { |
230 | 5.67k | return None; |
231 | 24.8M | } |
232 | | |
233 | 24.8M | let mut take_curr = true; |
234 | 24.8M | let mut take_cat = true; |
235 | 24.8M | let mut idx = 0; |
236 | 24.8M | let mut saveidx = 0; |
237 | 24.8M | let mut state = Start; |
238 | 24.8M | let mut cat = wd::WC_Any; |
239 | 24.8M | let mut savecat = wd::WC_Any; |
240 | | |
241 | | // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 |
242 | 24.8M | let mut skipped_format_extend = false; |
243 | 58.1M | for (curr, ch) in self.string.char_indices() { |
244 | 58.1M | idx = curr; |
245 | | // Whether or not the previous category was ZWJ |
246 | | // ZWJs get collapsed, so this handles precedence of WB3c over WB4 |
247 | 58.1M | let prev_zwj = cat == wd::WC_ZWJ; |
248 | | // if there's a category cached, grab it |
249 | 58.1M | cat = match self.cat { |
250 | 55.3M | None => wd::word_category(ch).2, |
251 | 2.78M | _ => self.cat.take().unwrap(), |
252 | | }; |
253 | 58.1M | take_cat = true; |
254 | | |
255 | | // handle rule WB4 |
256 | | // just skip all format, extend, and zwj chars |
257 | | // note that Start is a special case: if there's a bunch of Format | Extend |
258 | | // characters at the beginning of a block of text, dump them out as one unit. |
259 | | // |
260 | | // (This is not obvious from the wording of UAX#29, but if you look at the |
261 | | // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt |
262 | | // then the "correct" interpretation of WB4 becomes apparent.) |
263 | 58.1M | if state != Start { |
264 | 33.2M | match cat { |
265 | | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { |
266 | 108k | skipped_format_extend = true; |
267 | 108k | continue; |
268 | | } |
269 | 33.1M | _ => {} |
270 | | } |
271 | 24.8M | } |
272 | | |
273 | | // rule WB3c |
274 | | // WB4 makes all ZWJs collapse into the previous state |
275 | | // but you can still be in a Zwj state if you started with Zwj |
276 | | // |
277 | | // This means that an EP + Zwj will collapse into EP, which is wrong, |
278 | | // since EP+EP is not a boundary but EP+ZWJ+EP is |
279 | | // |
280 | | // Thus, we separately keep track of whether or not the last character |
281 | | // was a ZWJ. This is an additional bit of state tracked outside of the |
282 | | // state enum; the state enum represents the last non-zwj state encountered. |
283 | | // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, |
284 | | // however we are in the previous state for the purposes of all other rules. |
285 | 57.9M | if prev_zwj && is_emoji(ch) { |
286 | 31.5k | state = Emoji; |
287 | 31.5k | continue; |
288 | 57.9M | } |
289 | | // Don't use `continue` in this match without updating `cat` |
290 | 24.8M | state = match state { |
291 | 24.8M | Start if cat == wd::WC_CR => { |
292 | 5.87M | idx += match self.get_next_cat(idx) { |
293 | 17.3k | Some(wd::WC_LF) => 1, // rule WB3 |
294 | 5.85M | _ => 0, |
295 | | }; |
296 | 5.87M | break; // rule WB3a |
297 | | } |
298 | 18.9M | Start => match cat { |
299 | 1.88M | wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a |
300 | 21.9k | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a |
301 | 402k | wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a |
302 | 1.57k | wd::WC_Katakana => Katakana, // rule WB13, WB13a |
303 | 57.4k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b |
304 | 5.05k | wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c |
305 | 6.13M | wd::WC_LF | wd::WC_Newline => break, // rule WB3a |
306 | 5.00k | wd::WC_ZWJ => Zwj, // rule WB3c |
307 | 389k | wd::WC_WSegSpace => WSegSpace, // rule WB3d |
308 | | _ => { |
309 | 10.0M | if let Some(ncat) = self.get_next_cat(idx) { |
310 | | // rule WB4 |
311 | 10.0M | if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ |
312 | | { |
313 | 31.3k | state = FormatExtend(AcceptNone); |
314 | 31.3k | self.cat = Some(ncat); |
315 | 31.3k | continue; |
316 | 10.0M | } |
317 | 1.92k | } |
318 | 10.0M | break; // rule WB999 |
319 | | } |
320 | | }, |
321 | 994k | WSegSpace => match cat { |
322 | 994k | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
323 | | _ => { |
324 | 388k | take_curr = false; |
325 | 388k | break; |
326 | | } |
327 | | }, |
328 | | Zwj => { |
329 | | // We already handle WB3c above. |
330 | 944 | take_curr = false; |
331 | 944 | break; |
332 | | } |
333 | 65.3k | Letter | HLetter => match cat { |
334 | 15.9M | wd::WC_ALetter => Letter, // rule WB5 |
335 | 3.77k | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
336 | 713k | wd::WC_Numeric => Numeric, // rule WB9 |
337 | 19.9k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
338 | 65.3k | wd::WC_Double_Quote if state == HLetter => { |
339 | 4.29k | savecat = cat; |
340 | 4.29k | saveidx = idx; |
341 | 4.29k | FormatExtend(RequireHLetter) // rule WB7b |
342 | | } |
343 | 55.2k | wd::WC_Single_Quote if state == HLetter => { |
344 | 26.0k | FormatExtend(AcceptQLetter) // rule WB7a |
345 | | } |
346 | | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
347 | 716k | savecat = cat; |
348 | 716k | saveidx = idx; |
349 | 716k | FormatExtend(RequireLetter) // rule WB6 |
350 | | } |
351 | | _ => { |
352 | 1.76M | take_curr = false; |
353 | 1.76M | break; |
354 | | } |
355 | | }, |
356 | 10.8M | Numeric => match cat { |
357 | 9.21M | wd::WC_Numeric => Numeric, // rule WB8 |
358 | 676k | wd::WC_ALetter => Letter, // rule WB10 |
359 | 2.44k | wd::WC_Hebrew_Letter => HLetter, // rule WB10 |
360 | 11.0k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
361 | | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
362 | 619k | savecat = cat; |
363 | 619k | saveidx = idx; |
364 | 619k | FormatExtend(RequireNumeric) // rule WB12 |
365 | | } |
366 | | _ => { |
367 | 345k | take_curr = false; |
368 | 345k | break; |
369 | | } |
370 | | }, |
371 | 3.35k | Katakana => match cat { |
372 | 1.03k | wd::WC_Katakana => Katakana, // rule WB13 |
373 | 968 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
374 | | _ => { |
375 | 1.35k | take_curr = false; |
376 | 1.35k | break; |
377 | | } |
378 | | }, |
379 | 220k | ExtendNumLet => match cat { |
380 | 131k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
381 | 25.4k | wd::WC_ALetter => Letter, // rule WB13b |
382 | 2.60k | wd::WC_Hebrew_Letter => HLetter, // rule WB13b |
383 | 9.19k | wd::WC_Numeric => Numeric, // rule WB13b |
384 | 820 | wd::WC_Katakana => Katakana, // rule WB13b |
385 | | _ => { |
386 | 51.0k | take_curr = false; |
387 | 51.0k | break; |
388 | | } |
389 | | }, |
390 | | Regional(RegionalState::Full) => { |
391 | | // if it reaches here we've gone too far, |
392 | | // a full flag can only compose with ZWJ/Extend/Format |
393 | | // proceeding it. |
394 | 3.44k | take_curr = false; |
395 | 3.44k | break; |
396 | | } |
397 | 5.00k | Regional(RegionalState::Half) => match cat { |
398 | 3.49k | wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c |
399 | | _ => { |
400 | 1.50k | take_curr = false; |
401 | 1.50k | break; |
402 | | } |
403 | | }, |
404 | | Regional(_) => { |
405 | 0 | unreachable!("RegionalState::Unknown should not occur on forward iteration") |
406 | | } |
407 | | Emoji => { |
408 | | // We already handle WB3c above. If you've reached this point, the emoji sequence is over. |
409 | 12.8k | take_curr = false; |
410 | 12.8k | break; |
411 | | } |
412 | 1.38M | FormatExtend(t) => match t { |
413 | | // handle FormatExtends depending on what type |
414 | 619k | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 |
415 | 716k | RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 |
416 | 96.3k | RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a |
417 | 4.26k | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
418 | | AcceptNone | AcceptQLetter => { |
419 | 39.8k | take_curr = false; // emit all the Format|Extend characters |
420 | 39.8k | take_cat = false; |
421 | 39.8k | break; |
422 | | } |
423 | 186k | _ => break, // rewind (in if statement below) |
424 | | }, |
425 | | } |
426 | | } |
427 | | |
428 | 24.8M | if let FormatExtend(t) = state { |
429 | | // we were looking for something and didn't find it; we have to back up |
430 | 226k | if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { |
431 | 186k | idx = saveidx; |
432 | 186k | cat = savecat; |
433 | 186k | take_curr = false; |
434 | 186k | } |
435 | 24.6M | } |
436 | | |
437 | 24.8M | self.cat = if take_curr { |
438 | 22.0M | idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
439 | 22.0M | None |
440 | 2.79M | } else if take_cat { |
441 | 2.75M | Some(cat) |
442 | | } else { |
443 | 39.8k | None |
444 | | }; |
445 | | |
446 | 24.8M | let retstr = &self.string[..idx]; |
447 | 24.8M | self.string = &self.string[idx..]; |
448 | 24.8M | Some(retstr) |
449 | 24.8M | } <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next Line | Count | Source | 225 | 24.8M | fn next(&mut self) -> Option<&'a str> { | 226 | | use self::FormatExtendType::*; | 227 | | use self::UWordBoundsState::*; | 228 | | use crate::tables::word as wd; | 229 | 24.8M | if self.string.is_empty() { | 230 | 5.67k | return None; | 231 | 24.8M | } | 232 | | | 233 | 24.8M | let mut take_curr = true; | 234 | 24.8M | let mut take_cat = true; | 235 | 24.8M | let mut idx = 0; | 236 | 24.8M | let mut saveidx = 0; | 237 | 24.8M | let mut state = Start; | 238 | 24.8M | let mut cat = wd::WC_Any; | 239 | 24.8M | let mut savecat = wd::WC_Any; | 240 | | | 241 | | // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 | 242 | 24.8M | let mut skipped_format_extend = false; | 243 | 58.1M | for (curr, ch) in self.string.char_indices() { | 244 | 58.1M | idx = curr; | 245 | | // Whether or not the previous category was ZWJ | 246 | | // ZWJs get collapsed, so this handles precedence of WB3c over WB4 | 247 | 58.1M | let prev_zwj = cat == wd::WC_ZWJ; | 248 | | // if there's a category cached, grab it | 249 | 58.1M | cat = match self.cat { | 250 | 55.3M | None => wd::word_category(ch).2, | 251 | 2.78M | _ => self.cat.take().unwrap(), | 252 | | }; | 253 | 58.1M | take_cat = true; | 254 | | | 255 | | // handle rule WB4 | 256 | | // just skip all format, extend, and zwj chars | 257 | | // note that Start is a special case: if there's a bunch of Format | Extend | 258 | | // characters at the beginning of a block of text, dump them out as one unit. | 259 | | // | 260 | | // (This is not obvious from the wording of UAX#29, but if you look at the | 261 | | // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt | 262 | | // then the "correct" interpretation of WB4 becomes apparent.) 
| 263 | 58.1M | if state != Start { | 264 | 33.2M | match cat { | 265 | | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { | 266 | 108k | skipped_format_extend = true; | 267 | 108k | continue; | 268 | | } | 269 | 33.1M | _ => {} | 270 | | } | 271 | 24.8M | } | 272 | | | 273 | | // rule WB3c | 274 | | // WB4 makes all ZWJs collapse into the previous state | 275 | | // but you can still be in a Zwj state if you started with Zwj | 276 | | // | 277 | | // This means that an EP + Zwj will collapse into EP, which is wrong, | 278 | | // since EP+EP is not a boundary but EP+ZWJ+EP is | 279 | | // | 280 | | // Thus, we separately keep track of whether or not the last character | 281 | | // was a ZWJ. This is an additional bit of state tracked outside of the | 282 | | // state enum; the state enum represents the last non-zwj state encountered. | 283 | | // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, | 284 | | // however we are in the previous state for the purposes of all other rules. 
| 285 | 57.9M | if prev_zwj && is_emoji(ch) { | 286 | 31.5k | state = Emoji; | 287 | 31.5k | continue; | 288 | 57.9M | } | 289 | | // Don't use `continue` in this match without updating `cat` | 290 | 24.8M | state = match state { | 291 | 24.8M | Start if cat == wd::WC_CR => { | 292 | 5.87M | idx += match self.get_next_cat(idx) { | 293 | 17.3k | Some(wd::WC_LF) => 1, // rule WB3 | 294 | 5.85M | _ => 0, | 295 | | }; | 296 | 5.87M | break; // rule WB3a | 297 | | } | 298 | 18.9M | Start => match cat { | 299 | 1.88M | wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a | 300 | 21.9k | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a | 301 | 402k | wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a | 302 | 1.57k | wd::WC_Katakana => Katakana, // rule WB13, WB13a | 303 | 57.4k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b | 304 | 5.05k | wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c | 305 | 6.13M | wd::WC_LF | wd::WC_Newline => break, // rule WB3a | 306 | 5.00k | wd::WC_ZWJ => Zwj, // rule WB3c | 307 | 389k | wd::WC_WSegSpace => WSegSpace, // rule WB3d | 308 | | _ => { | 309 | 10.0M | if let Some(ncat) = self.get_next_cat(idx) { | 310 | | // rule WB4 | 311 | 10.0M | if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ | 312 | | { | 313 | 31.3k | state = FormatExtend(AcceptNone); | 314 | 31.3k | self.cat = Some(ncat); | 315 | 31.3k | continue; | 316 | 10.0M | } | 317 | 1.92k | } | 318 | 10.0M | break; // rule WB999 | 319 | | } | 320 | | }, | 321 | 994k | WSegSpace => match cat { | 322 | 994k | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, | 323 | | _ => { | 324 | 388k | take_curr = false; | 325 | 388k | break; | 326 | | } | 327 | | }, | 328 | | Zwj => { | 329 | | // We already handle WB3c above. 
| 330 | 944 | take_curr = false; | 331 | 944 | break; | 332 | | } | 333 | 65.3k | Letter | HLetter => match cat { | 334 | 15.9M | wd::WC_ALetter => Letter, // rule WB5 | 335 | 3.77k | wd::WC_Hebrew_Letter => HLetter, // rule WB5 | 336 | 713k | wd::WC_Numeric => Numeric, // rule WB9 | 337 | 19.9k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 338 | 65.3k | wd::WC_Double_Quote if state == HLetter => { | 339 | 4.29k | savecat = cat; | 340 | 4.29k | saveidx = idx; | 341 | 4.29k | FormatExtend(RequireHLetter) // rule WB7b | 342 | | } | 343 | 55.2k | wd::WC_Single_Quote if state == HLetter => { | 344 | 26.0k | FormatExtend(AcceptQLetter) // rule WB7a | 345 | | } | 346 | | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { | 347 | 716k | savecat = cat; | 348 | 716k | saveidx = idx; | 349 | 716k | FormatExtend(RequireLetter) // rule WB6 | 350 | | } | 351 | | _ => { | 352 | 1.76M | take_curr = false; | 353 | 1.76M | break; | 354 | | } | 355 | | }, | 356 | 10.8M | Numeric => match cat { | 357 | 9.21M | wd::WC_Numeric => Numeric, // rule WB8 | 358 | 676k | wd::WC_ALetter => Letter, // rule WB10 | 359 | 2.44k | wd::WC_Hebrew_Letter => HLetter, // rule WB10 | 360 | 11.0k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 361 | | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { | 362 | 619k | savecat = cat; | 363 | 619k | saveidx = idx; | 364 | 619k | FormatExtend(RequireNumeric) // rule WB12 | 365 | | } | 366 | | _ => { | 367 | 345k | take_curr = false; | 368 | 345k | break; | 369 | | } | 370 | | }, | 371 | 3.35k | Katakana => match cat { | 372 | 1.03k | wd::WC_Katakana => Katakana, // rule WB13 | 373 | 968 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 374 | | _ => { | 375 | 1.35k | take_curr = false; | 376 | 1.35k | break; | 377 | | } | 378 | | }, | 379 | 220k | ExtendNumLet => match cat { | 380 | 131k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 381 | 25.4k | wd::WC_ALetter => Letter, // rule WB13b | 382 | 2.60k | 
wd::WC_Hebrew_Letter => HLetter, // rule WB13b | 383 | 9.19k | wd::WC_Numeric => Numeric, // rule WB13b | 384 | 820 | wd::WC_Katakana => Katakana, // rule WB13b | 385 | | _ => { | 386 | 51.0k | take_curr = false; | 387 | 51.0k | break; | 388 | | } | 389 | | }, | 390 | | Regional(RegionalState::Full) => { | 391 | | // if it reaches here we've gone too far, | 392 | | // a full flag can only compose with ZWJ/Extend/Format | 393 | | // proceeding it. | 394 | 3.44k | take_curr = false; | 395 | 3.44k | break; | 396 | | } | 397 | 5.00k | Regional(RegionalState::Half) => match cat { | 398 | 3.49k | wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c | 399 | | _ => { | 400 | 1.50k | take_curr = false; | 401 | 1.50k | break; | 402 | | } | 403 | | }, | 404 | | Regional(_) => { | 405 | 0 | unreachable!("RegionalState::Unknown should not occur on forward iteration") | 406 | | } | 407 | | Emoji => { | 408 | | // We already handle WB3c above. If you've reached this point, the emoji sequence is over. 
| 409 | 12.8k | take_curr = false; | 410 | 12.8k | break; | 411 | | } | 412 | 1.38M | FormatExtend(t) => match t { | 413 | | // handle FormatExtends depending on what type | 414 | 619k | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 | 415 | 716k | RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 | 416 | 96.3k | RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a | 417 | 4.26k | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b | 418 | | AcceptNone | AcceptQLetter => { | 419 | 39.8k | take_curr = false; // emit all the Format|Extend characters | 420 | 39.8k | take_cat = false; | 421 | 39.8k | break; | 422 | | } | 423 | 186k | _ => break, // rewind (in if statement below) | 424 | | }, | 425 | | } | 426 | | } | 427 | | | 428 | 24.8M | if let FormatExtend(t) = state { | 429 | | // we were looking for something and didn't find it; we have to back up | 430 | 226k | if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { | 431 | 186k | idx = saveidx; | 432 | 186k | cat = savecat; | 433 | 186k | take_curr = false; | 434 | 186k | } | 435 | 24.6M | } | 436 | | | 437 | 24.8M | self.cat = if take_curr { | 438 | 22.0M | idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); | 439 | 22.0M | None | 440 | 2.79M | } else if take_cat { | 441 | 2.75M | Some(cat) | 442 | | } else { | 443 | 39.8k | None | 444 | | }; | 445 | | | 446 | 24.8M | let retstr = &self.string[..idx]; | 447 | 24.8M | self.string = &self.string[idx..]; | 448 | 24.8M | Some(retstr) | 449 | 24.8M | } |
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next |
450 | | } |
451 | | |
452 | | impl<'a> DoubleEndedIterator for UWordBounds<'a> { |
453 | | #[inline] |
454 | 0 | fn next_back(&mut self) -> Option<&'a str> { |
455 | | use self::FormatExtendType::*; |
456 | | use self::UWordBoundsState::*; |
457 | | use crate::tables::word as wd; |
458 | 0 | if self.string.is_empty() { |
459 | 0 | return None; |
460 | 0 | } |
461 | | |
462 | 0 | let mut take_curr = true; |
463 | 0 | let mut take_cat = true; |
464 | 0 | let mut idx = self.string.len(); |
465 | 0 | idx -= self.string.chars().next_back().unwrap().len_utf8(); |
466 | 0 | let mut previdx = idx; |
467 | 0 | let mut saveidx = idx; |
468 | 0 | let mut state = Start; |
469 | 0 | let mut savestate = Start; |
470 | 0 | let mut cat = wd::WC_Any; |
471 | | |
472 | 0 | let mut skipped_format_extend = false; |
473 | | |
474 | 0 | for (curr, ch) in self.string.char_indices().rev() { |
475 | 0 | previdx = idx; |
476 | 0 | idx = curr; |
477 | | |
478 | | // if there's a category cached, grab it |
479 | 0 | cat = match self.catb { |
480 | 0 | None => wd::word_category(ch).2, |
481 | 0 | _ => self.catb.take().unwrap(), |
482 | | }; |
483 | 0 | take_cat = true; |
484 | | |
485 | | // backward iterator over word boundaries. Mostly the same as the forward |
486 | | // iterator, with two weirdnesses: |
487 | | // (1) If we encounter a single quote in the Start state, we have to check for a |
488 | | // Hebrew Letter immediately before it. |
489 | | // (2) Format and Extend char handling takes some gymnastics. |
490 | | |
491 | 0 | if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) { |
492 | | // WB3c has more priority so we should not |
493 | | // fold in that case |
494 | 0 | if !matches!(state, FormatExtend(_) | Start) { |
495 | 0 | saveidx = previdx; |
496 | 0 | savestate = state; |
497 | 0 | state = FormatExtend(AcceptNone); |
498 | 0 | } |
499 | | |
500 | 0 | if state != Start { |
501 | 0 | continue; |
502 | 0 | } |
503 | 0 | } else if state == FormatExtend(AcceptNone) { |
504 | 0 | // finished a scan of some Format|Extend chars, restore previous state |
505 | 0 | state = savestate; |
506 | 0 | previdx = saveidx; |
507 | 0 | take_cat = false; |
508 | 0 | skipped_format_extend = true; |
509 | 0 | } |
510 | | |
511 | | // Don't use `continue` in this match without updating `catb` |
512 | 0 | state = match state { |
513 | 0 | Start | FormatExtend(AcceptAny) => match cat { |
514 | 0 | _ if is_emoji(ch) => Zwj, |
515 | 0 | wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b |
516 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b |
517 | 0 | wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b |
518 | 0 | wd::WC_Katakana => Katakana, // rule WB13, WB13b |
519 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
520 | 0 | wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c |
521 | | // rule WB4: |
522 | 0 | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny), |
523 | | wd::WC_Single_Quote => { |
524 | 0 | saveidx = idx; |
525 | 0 | FormatExtend(AcceptQLetter) // rule WB7a |
526 | | } |
527 | 0 | wd::WC_WSegSpace => WSegSpace, |
528 | | wd::WC_CR | wd::WC_LF | wd::WC_Newline => { |
529 | 0 | if state == Start { |
530 | 0 | if cat == wd::WC_LF { |
531 | 0 | idx -= match self.get_prev_cat(idx) { |
532 | 0 | Some(wd::WC_CR) => 1, // rule WB3 |
533 | 0 | _ => 0, |
534 | | }; |
535 | 0 | } |
536 | 0 | } else { |
537 | 0 | take_curr = false; |
538 | 0 | } |
539 | 0 | break; // rule WB3a |
540 | | } |
541 | 0 | _ => break, // rule WB999 |
542 | | }, |
543 | 0 | Zwj => match cat { |
544 | | // rule WB3c |
545 | 0 | wd::WC_ZWJ => FormatExtend(AcceptAny), |
546 | | _ => { |
547 | 0 | take_curr = false; |
548 | 0 | break; |
549 | | } |
550 | | }, |
551 | 0 | WSegSpace => match cat { |
552 | | // rule WB3d |
553 | 0 | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
554 | | _ => { |
555 | 0 | take_curr = false; |
556 | 0 | break; |
557 | | } |
558 | | }, |
559 | 0 | Letter | HLetter => match cat { |
560 | 0 | wd::WC_ALetter => Letter, // rule WB5 |
561 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
562 | 0 | wd::WC_Numeric => Numeric, // rule WB10 |
563 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
564 | 0 | wd::WC_Double_Quote if state == HLetter => { |
565 | 0 | saveidx = previdx; |
566 | 0 | FormatExtend(RequireHLetter) // rule WB7c |
567 | | } |
568 | | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
569 | 0 | saveidx = previdx; |
570 | 0 | FormatExtend(RequireLetter) // rule WB7 |
571 | | } |
572 | | _ => { |
573 | 0 | take_curr = false; |
574 | 0 | break; |
575 | | } |
576 | | }, |
577 | 0 | Numeric => match cat { |
578 | 0 | wd::WC_Numeric => Numeric, // rule WB8 |
579 | 0 | wd::WC_ALetter => Letter, // rule WB9 |
580 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB9 |
581 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
582 | | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
583 | 0 | saveidx = previdx; |
584 | 0 | FormatExtend(RequireNumeric) // rule WB11 |
585 | | } |
586 | | _ => { |
587 | 0 | take_curr = false; |
588 | 0 | break; |
589 | | } |
590 | | }, |
591 | 0 | Katakana => match cat { |
592 | 0 | wd::WC_Katakana => Katakana, // rule WB13 |
593 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
594 | | _ => { |
595 | 0 | take_curr = false; |
596 | 0 | break; |
597 | | } |
598 | | }, |
599 | 0 | ExtendNumLet => match cat { |
600 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
601 | 0 | wd::WC_ALetter => Letter, // rule WB13a |
602 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB13a |
603 | 0 | wd::WC_Numeric => Numeric, // rule WB13a |
604 | 0 | wd::WC_Katakana => Katakana, // rule WB13a |
605 | | _ => { |
606 | 0 | take_curr = false; |
607 | 0 | break; |
608 | | } |
609 | | }, |
610 | 0 | Regional(mut regional_state) => match cat { |
611 | | // rule WB13c |
612 | | wd::WC_Regional_Indicator => { |
613 | 0 | if regional_state == RegionalState::Unknown { |
614 | 0 | let count = self.string[..previdx] |
615 | 0 | .chars() |
616 | 0 | .rev() |
617 | 0 | .map(|c| wd::word_category(c).2) |
618 | 0 | .filter(|&c| { |
619 | 0 | !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format) |
620 | 0 | }) |
621 | 0 | .take_while(|&c| c == wd::WC_Regional_Indicator) |
622 | 0 | .count(); |
623 | 0 | regional_state = if count % 2 == 0 { |
624 | 0 | RegionalState::Full |
625 | | } else { |
626 | 0 | RegionalState::Half |
627 | | }; |
628 | 0 | } |
629 | 0 | if regional_state == RegionalState::Full { |
630 | 0 | take_curr = false; |
631 | 0 | break; |
632 | | } else { |
633 | 0 | Regional(RegionalState::Full) |
634 | | } |
635 | | } |
636 | | _ => { |
637 | 0 | take_curr = false; |
638 | 0 | break; |
639 | | } |
640 | | }, |
641 | | Emoji => { |
642 | 0 | if is_emoji(ch) { |
643 | | // rule WB3c |
644 | 0 | Zwj |
645 | | } else { |
646 | 0 | take_curr = false; |
647 | 0 | break; |
648 | | } |
649 | | } |
650 | 0 | FormatExtend(t) => match t { |
651 | 0 | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12 |
652 | 0 | RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6 |
653 | 0 | RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6 |
654 | 0 | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a |
655 | 0 | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
656 | 0 | _ => break, // backtrack will happens |
657 | | }, |
658 | | } |
659 | | } |
660 | | |
661 | 0 | if let FormatExtend(t) = state { |
662 | | // if we required something but didn't find it, backtrack |
663 | 0 | if t == RequireLetter |
664 | 0 | || t == RequireHLetter |
665 | 0 | || t == RequireNumeric |
666 | 0 | || t == AcceptNone |
667 | 0 | || t == AcceptQLetter |
668 | 0 | { |
669 | 0 | previdx = saveidx; |
670 | 0 | take_cat = false; |
671 | 0 | take_curr = false; |
672 | 0 | } |
673 | 0 | } |
674 | | |
675 | 0 | self.catb = if take_curr { |
676 | 0 | None |
677 | | } else { |
678 | 0 | idx = previdx; |
679 | 0 | if take_cat { |
680 | 0 | Some(cat) |
681 | | } else { |
682 | 0 | None |
683 | | } |
684 | | }; |
685 | | |
686 | 0 | let retstr = &self.string[idx..]; |
687 | 0 | self.string = &self.string[..idx]; |
688 | 0 | Some(retstr) |
689 | 0 | } |
690 | | } |
691 | | |
692 | | impl<'a> UWordBounds<'a> { |
693 | | #[inline] |
694 | | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
695 | | /// |
696 | | /// ```rust |
697 | | /// # use unicode_segmentation::UnicodeSegmentation; |
698 | | /// let mut iter = "Hello world".split_word_bounds(); |
699 | | /// assert_eq!(iter.as_str(), "Hello world"); |
700 | | /// iter.next(); |
701 | | /// assert_eq!(iter.as_str(), " world"); |
702 | | /// iter.next(); |
703 | | /// assert_eq!(iter.as_str(), "world"); |
704 | | /// ``` |
705 | 0 | pub fn as_str(&self) -> &'a str { |
706 | 0 | self.string |
707 | 0 | } |
708 | | |
709 | | #[inline] |
710 | 15.9M | fn get_next_cat(&self, idx: usize) -> Option<WordCat> { |
711 | | use crate::tables::word as wd; |
712 | 15.9M | let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
713 | 15.9M | if nidx < self.string.len() { |
714 | 15.9M | let nch = self.string[nidx..].chars().next().unwrap(); |
715 | 15.9M | Some(wd::word_category(nch).2) |
716 | | } else { |
717 | 1.96k | None |
718 | | } |
719 | 15.9M | } <unicode_segmentation::word::UWordBounds>::get_next_cat Line | Count | Source | 710 | 15.9M | fn get_next_cat(&self, idx: usize) -> Option<WordCat> { | 711 | | use crate::tables::word as wd; | 712 | 15.9M | let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); | 713 | 15.9M | if nidx < self.string.len() { | 714 | 15.9M | let nch = self.string[nidx..].chars().next().unwrap(); | 715 | 15.9M | Some(wd::word_category(nch).2) | 716 | | } else { | 717 | 1.96k | None | 718 | | } | 719 | 15.9M | } |
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds>::get_next_cat |
720 | | |
721 | | #[inline] |
722 | 0 | fn get_prev_cat(&self, idx: usize) -> Option<WordCat> { |
723 | | use crate::tables::word as wd; |
724 | 0 | if idx > 0 { |
725 | 0 | let nch = self.string[..idx].chars().next_back().unwrap(); |
726 | 0 | Some(wd::word_category(nch).2) |
727 | | } else { |
728 | 0 | None |
729 | | } |
730 | 0 | } |
731 | | } |
732 | | |
/// ASCII fast-path word-boundary iterator for strings that contain only ASCII characters.
///
/// Since we handle only ASCII characters, we can use a much simpler set of
/// word break values than the full Unicode algorithm.
/// <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
///
/// | Word_Break value | ASCII code points that belong to it                             |
/// | -----------------| --------------------------------------------------------------- |
/// | CR               | U+000D (CR)                                                     |
/// | LF               | U+000A (LF)                                                     |
/// | Newline          | U+000B (VT), U+000C (FF)                                        |
/// | Single_Quote     | U+0027 (')                                                      |
/// | Double_Quote     | U+0022 (")                                                      |
/// | MidNumLet        | U+002E (.) FULL STOP                                            |
/// | MidLetter        | U+003A (:) COLON                                                |
/// | MidNum           | U+002C (,), U+003B (;)                                          |
/// | Numeric          | U+0030 – U+0039 (0 … 9)                                         |
/// | ALetter          | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z)                |
/// | ExtendNumLet     | U+005F (_) underscore                                           |
/// | WSegSpace        | U+0020 (SPACE)                                                  |
///
/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (')
/// AHLetter is the same as ALetter, so we don't need to distinguish it.
///
/// Any other single ASCII byte is its own boundary (the default WB999).
///
/// Items are `(byte offset, segment)` pairs, where the offset is relative to the
/// string the iterator was created from.
#[derive(Debug)]
struct AsciiWordBoundIter<'a> {
    /// The part of the input that has not been yielded from either end yet.
    rest: &'a str,
    /// Byte offset of the start of `rest` within the original input.
    offset: usize,
}

impl<'a> AsciiWordBoundIter<'a> {
    /// Creates an iterator over the segments of `s`, with offsets starting at 0.
    pub fn new(s: &'a str) -> Self {
        AsciiWordBoundIter { rest: s, offset: 0 }
    }

    /// Whether `b` is a letter, a digit or an underscore, i.e. a byte that extends a
    /// word-like run on its own.
    #[inline]
    fn is_core(b: u8) -> bool {
        b.is_ascii_alphanumeric() || b == b'_'
    }

    /// Whether the punctuation byte `b`, sitting directly between `prev` and `next`,
    /// keeps the run together instead of breaking it.
    #[inline]
    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
        // Numeric separators such as "1,000" or "3.14" (WB11/WB12):
        //   "Numeric (MidNum | MidNumLetQ) Numeric"
        let between_digits = prev.is_ascii_digit() && next.is_ascii_digit();
        // Dot, apostrophe or colon inside an alphabetic word ("e.g", "can't") (WB6/WB7):
        //   "AHLetter (MidLetter | MidNumLetQ) AHLetter"
        let between_letters = prev.is_ascii_alphabetic() && next.is_ascii_alphabetic();

        match b {
            // MidNumLetQ: valid in both contexts
            b'.' | b'\'' => between_digits || between_letters,
            // MidNum
            b',' | b';' => between_digits,
            // MidLetter
            b':' => between_letters,
            _ => false,
        }
    }

    /// Splits the first `n` bytes off the front of `rest` and returns them together
    /// with their offset in the original input.
    #[inline]
    fn take_front(&mut self, n: usize) -> (usize, &'a str) {
        let rest: &'a str = self.rest;
        let (word, tail) = rest.split_at(n);
        let pos = self.offset;
        self.rest = tail;
        self.offset += n;
        (pos, word)
    }

    /// Splits `rest[start..]` off the back of `rest` and returns it together with its
    /// offset in the original input. `offset` is untouched because the front of
    /// `rest` does not move.
    #[inline]
    fn take_back(&mut self, start: usize) -> (usize, &'a str) {
        let rest: &'a str = self.rest;
        let (head, word) = rest.split_at(start);
        self.rest = head;
        (self.offset + start, word)
    }
}

impl<'a> Iterator for AsciiWordBoundIter<'a> {
    type Item = (usize, &'a str);

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let rest: &'a str = self.rest;
        let bytes = rest.as_bytes();
        let len = bytes.len();
        let first = *bytes.first()?;

        // Length in bytes of the segment at the front of `rest`.
        let seg_len = match first {
            // 1) Keep horizontal whitespace together.
            //    Spec: WB3d joins adjacent *WSegSpace* into a single segment.
            b' ' => 1 + bytes[1..].iter().take_while(|&&b| b == b' ').count(),

            // 2) Core-run (letters/digits/underscore + infix)
            //    Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
            b if Self::is_core(b) => (1..len)
                .find(|&i| {
                    let b = bytes[i];
                    // An infix byte needs a successor, so it can never end the input.
                    let joins = Self::is_core(b)
                        || (i + 1 < len && Self::is_infix(b, bytes[i - 1], bytes[i + 1]));
                    !joins
                })
                .unwrap_or(len),

            // 3) Do not break within CRLF.
            //    Spec: WB3 treats CR+LF as a single non-breaking pair.
            b'\r' if bytes.get(1) == Some(&b'\n') => 2,

            // 4) Otherwise, break everywhere
            //    Spec: the catch-all rule WB999.
            _ => 1,
        };

        Some(self.take_front(seg_len))
    }
}

impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        let rest: &'a str = self.rest;
        let bytes = rest.as_bytes();
        let len = bytes.len();
        let (&last, head) = bytes.split_last()?;

        // Byte index within `rest` at which the last segment starts.
        let start = match last {
            // 1) Group runs of spaces
            //    Spec: WB3d joins adjacent *WSegSpace* into a single segment.
            b' ' => head.iter().rposition(|&b| b != b' ').map_or(0, |p| p + 1),

            // 2) Trailing Core-run (letters/digits/underscore + infix)
            //    Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
            b if Self::is_core(b) => (1..len)
                .rev()
                .find(|&s| {
                    // `s` is the current start of the run; test the byte before it.
                    let b = bytes[s - 1];
                    // At the very first byte there is no predecessor; reusing `b`
                    // (a non-alphanumeric byte whenever it matters) makes `is_infix` fail.
                    let prev = if s >= 2 { bytes[s - 2] } else { b };
                    let next = bytes[s];
                    !(Self::is_core(b) || Self::is_infix(b, prev, next))
                })
                .unwrap_or(0),

            // 3) Non-core: CR+LF as one token
            //    Spec: WB3 treats CR+LF as a single non-breaking pair.
            b'\n' if head.last() == Some(&b'\r') => len - 2,

            // 4) Fallback – every other byte is its own segment
            //    Spec: the catch-all rule WB999.
            _ => len - 1,
        };

        Some(self.take_back(start))
    }
}
923 | | |
924 | | #[inline] |
925 | 0 | fn ascii_word_ok(t: &(usize, &str)) -> bool { |
926 | 0 | has_ascii_alphanumeric(&t.1) |
927 | 0 | } |
928 | | #[inline] |
929 | 0 | fn unicode_word_ok(t: &(usize, &str)) -> bool { |
930 | 0 | has_alphanumeric(&t.1) |
931 | 0 | } |
932 | | |
// The predicates and the mapper are stored as plain `fn` pointers so that these
// iterator types can be named in the enums below.

/// Word iterator for ASCII-only input: `AsciiWordBoundIter` items with the offset
/// mapped away, then filtered by a `&&str` predicate.
type AsciiWordsIter<'a> = Filter<
    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
    fn(&&'a str) -> bool,
>;
/// Word iterator for arbitrary input: `UWordBounds` segments filtered by a `&&str` predicate.
type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
/// `(offset, word)` iterator for ASCII-only input: filtered `AsciiWordBoundIter` items.
type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
/// `(offset, word)` iterator for arbitrary input: filtered `UWordBoundIndices` items.
type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
940 | | |
/// Backing iterator of `UnicodeWords`: either the ASCII fast path or the general
/// Unicode implementation.
#[derive(Debug)]
enum WordsIter<'a> {
    /// Fast path, selected when the whole input is ASCII.
    Ascii(AsciiWordsIter<'a>),
    /// General path for input containing non-ASCII characters.
    Unicode(UnicodeWordsIter<'a>),
}
946 | | |
/// Backing iterator of `UnicodeWordIndices`: either the ASCII fast path or the general
/// Unicode implementation, both yielding `(offset, word)` pairs.
#[derive(Debug)]
enum IndicesIter<'a> {
    /// Fast path, selected when the whole input is ASCII.
    Ascii(AsciiIndicesIter<'a>),
    /// General path for input containing non-ASCII characters.
    Unicode(UnicodeIndicesIter<'a>),
}
952 | | |
953 | | #[inline] |
954 | 3.24k | pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { |
955 | 3.24k | let inner = if s.is_ascii() { |
956 | 815 | WordsIter::Ascii(new_unicode_words_ascii(s)) |
957 | | } else { |
958 | 2.43k | WordsIter::Unicode(new_unicode_words_general(s)) |
959 | | }; |
960 | 3.24k | UnicodeWords { inner } |
961 | 3.24k | } unicode_segmentation::word::new_unicode_words Line | Count | Source | 954 | 3.24k | pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { | 955 | 3.24k | let inner = if s.is_ascii() { | 956 | 815 | WordsIter::Ascii(new_unicode_words_ascii(s)) | 957 | | } else { | 958 | 2.43k | WordsIter::Unicode(new_unicode_words_general(s)) | 959 | | }; | 960 | 3.24k | UnicodeWords { inner } | 961 | 3.24k | } |
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words |
962 | | |
963 | | #[inline] |
964 | 0 | pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { |
965 | 0 | let inner = if s.is_ascii() { |
966 | 0 | IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok)) |
967 | | } else { |
968 | 0 | IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok)) |
969 | | }; |
970 | 0 | UnicodeWordIndices { inner } |
971 | 0 | } |
972 | | |
/// Creates a `UWordBounds` iterator over the whole of `s`, with both cached
/// categories (`cat` and `catb`) initially unset.
#[inline]
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
    UWordBounds {
        string: s,
        cat: None,
        catb: None,
    }
}
981 | | |
982 | | #[inline] |
983 | 0 | pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { |
984 | 0 | UWordBoundIndices { |
985 | 0 | start_offset: s.as_ptr() as usize, |
986 | 0 | iter: new_word_bounds(s), |
987 | 0 | } |
988 | 0 | } |
989 | | |
/// Creates the ASCII fast-path boundary iterator over `s`, yielding
/// `(byte offset, segment)` pairs. Intended for input that is entirely ASCII.
#[inline]
fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
    AsciiWordBoundIter::new(s)
}
994 | | |
995 | | #[inline] |
996 | 6.05M | fn has_alphanumeric(s: &&str) -> bool { |
997 | | use crate::tables::util::is_alphanumeric; |
998 | | |
999 | 6.05M | s.chars().any(is_alphanumeric) |
1000 | 6.05M | } unicode_segmentation::word::has_alphanumeric Line | Count | Source | 996 | 6.05M | fn has_alphanumeric(s: &&str) -> bool { | 997 | | use crate::tables::util::is_alphanumeric; | 998 | | | 999 | 6.05M | s.chars().any(is_alphanumeric) | 1000 | 6.05M | } |
Unexecuted instantiation: unicode_segmentation::word::has_alphanumeric |
1001 | | |
/// Returns `true` if `s` contains at least one ASCII letter or ASCII digit.
///
/// Takes `&&str` so it can be used directly as a `Filter` predicate over `&str` items.
#[inline]
fn has_ascii_alphanumeric(s: &&str) -> bool {
    // Scan raw bytes instead of decoding chars: every byte of a multi-byte UTF-8
    // sequence is >= 0x80 and therefore never ASCII-alphanumeric, so the answer is
    // identical while the UTF-8 decoding work is skipped.
    s.bytes().any(|b| b.is_ascii_alphanumeric())
}
1006 | | |
/// Drops the offset from an `(offset, segment)` pair and returns only the segment.
#[inline(always)]
fn strip_pos(pair: (usize, &str)) -> &str {
    pair.1
}
1011 | | |
1012 | | #[inline] |
1013 | 815 | fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> { |
1014 | 815 | new_ascii_word_bound_indices(s) |
1015 | 815 | .map(strip_pos as fn(_) -> _) |
1016 | 815 | .filter(has_ascii_alphanumeric) |
1017 | 815 | } unicode_segmentation::word::new_unicode_words_ascii Line | Count | Source | 1013 | 815 | fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> { | 1014 | 815 | new_ascii_word_bound_indices(s) | 1015 | 815 | .map(strip_pos as fn(_) -> _) | 1016 | 815 | .filter(has_ascii_alphanumeric) | 1017 | 815 | } |
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_ascii |
1018 | | |
1019 | | #[inline] |
1020 | 2.43k | fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> { |
1021 | 2.43k | new_word_bounds(s).filter(has_alphanumeric) |
1022 | 2.43k | } unicode_segmentation::word::new_unicode_words_general Line | Count | Source | 1020 | 2.43k | fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> { | 1021 | 2.43k | new_word_bounds(s).filter(has_alphanumeric) | 1022 | 2.43k | } |
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_general |
1023 | | |
#[cfg(test)]
mod tests {
    use crate::word::{
        new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices,
    };
    use std::string::String;
    use std::vec::Vec;
    use std::{format, vec};

    use proptest::prelude::*;

    /// U+070F (SYRIAC ABBREVIATION MARK) must be categorised as ALetter.
    #[test]
    fn test_syriac_abbr_mark() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{70f}');
        assert_eq!(cat, wd::WC_ALetter);
    }

    /// U+06DD (ARABIC END OF AYAH) must be categorised as Numeric.
    #[test]
    fn test_end_of_ayah_cat() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{6dd}');
        assert_eq!(cat, wd::WC_Numeric);
    }

    /// The ASCII boundary iterator yields every segment, punctuation and spaces
    /// included, together with its byte offset.
    #[test]
    fn test_ascii_word_bound_indices_various_cases() {
        let s = "Hello, world!";
        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
        let expected = vec![
            (0, "Hello"), // simple letters
            (5, ","),     // the comma is a segment of its own
            (6, " "),     // space after comma
            (7, "world"), // letters, stops at '!'
            (12, "!"),    // punctuation at the end
        ];
        assert_eq!(words, expected);
    }

    /// The ASCII word iterator drops segments without any letter or digit and keeps
    /// infix punctuation inside words and numbers.
    #[test]
    fn test_ascii_word_indices_various_cases() {
        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
        let expected = vec![
            ("Hello"), // simple letters
            ("world"), // skip comma+space, stop at '!'
            ("can't"), // apostrophe joins letters
            ("e.g"),   // inner dot joins letters; the trailing dot is not followed by a letter
            ("var1"),
            ("123,456"), // digits+comma+digits
            ("foo_bar"),
            ("example.com"),
            ("127.0.0.1"),
            ("9090"), // port number
        ];
        assert_eq!(words, expected);
    }

    /// Strategy that yields every code-point from NUL (0) to DEL (127).
    fn ascii_char() -> impl Strategy<Value = char> {
        (0u8..=127).prop_map(|b| b as char)
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10000))]
        /// Fast path must equal general path for any ASCII input.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
            let uni:  Vec<(usize, &str)> = new_word_bound_indices(&s).collect();

            prop_assert_eq!(fast, uni);
        }

        /// Fast path must equal general path for any ASCII input, forwards and backwards.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices_rev(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
            let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
            prop_assert_eq!(fast_rev, uni_rev);
        }
    }
}