/src/unicode-segmentation/src/word.rs
Line | Count | Source |
1 | | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | use core::cmp; |
12 | | use core::iter::Filter; |
13 | | |
14 | | use crate::tables::word::WordCat; |
15 | | |
16 | | /// An iterator over the substrings of a string which, after splitting the string on |
17 | | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
18 | | /// contain any characters with the |
19 | | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
20 | | /// property, or with |
21 | | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
22 | | /// |
23 | | /// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See |
24 | | /// its documentation for more. |
25 | | /// |
26 | | /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words |
27 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
28 | | #[derive(Debug)] |
29 | | pub struct UnicodeWords<'a> { |
30 | | inner: WordsIter<'a>, |
31 | | } |
32 | | |
33 | | impl<'a> Iterator for UnicodeWords<'a> { |
34 | | type Item = &'a str; |
35 | | #[inline] |
36 | 2.39M | fn next(&mut self) -> Option<Self::Item> { |
37 | 2.39M | match &mut self.inner { |
38 | 859k | WordsIter::Ascii(i) => i.next(), |
39 | 1.53M | WordsIter::Unicode(i) => i.next(), |
40 | | } |
41 | 2.39M | } <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next Line | Count | Source | 36 | 2.39M | fn next(&mut self) -> Option<Self::Item> { | 37 | 2.39M | match &mut self.inner { | 38 | 859k | WordsIter::Ascii(i) => i.next(), | 39 | 1.53M | WordsIter::Unicode(i) => i.next(), | 40 | | } | 41 | 2.39M | } |
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next |
42 | | #[inline] |
43 | 4.80k | fn size_hint(&self) -> (usize, Option<usize>) { |
44 | 4.80k | match &self.inner { |
45 | 1.65k | WordsIter::Ascii(i) => i.size_hint(), |
46 | 3.14k | WordsIter::Unicode(i) => i.size_hint(), |
47 | | } |
48 | 4.80k | } <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint Line | Count | Source | 43 | 4.80k | fn size_hint(&self) -> (usize, Option<usize>) { | 44 | 4.80k | match &self.inner { | 45 | 1.65k | WordsIter::Ascii(i) => i.size_hint(), | 46 | 3.14k | WordsIter::Unicode(i) => i.size_hint(), | 47 | | } | 48 | 4.80k | } |
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint |
49 | | } |
50 | | impl<'a> DoubleEndedIterator for UnicodeWords<'a> { |
51 | | #[inline] |
52 | 0 | fn next_back(&mut self) -> Option<Self::Item> { |
53 | 0 | match &mut self.inner { |
54 | 0 | WordsIter::Ascii(i) => i.next_back(), |
55 | 0 | WordsIter::Unicode(i) => i.next_back(), |
56 | | } |
57 | 0 | } |
58 | | } |
59 | | |
60 | | /// An iterator over the substrings of a string which, after splitting the string on |
61 | | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
62 | | /// contain any characters with the |
63 | | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
64 | | /// property, or with |
65 | | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
66 | | /// This iterator also provides the byte offsets for each substring. |
67 | | /// |
68 | | /// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See |
69 | | /// its documentation for more. |
70 | | /// |
71 | | /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices |
72 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
73 | | #[derive(Debug)] |
74 | | pub struct UnicodeWordIndices<'a> { |
75 | | inner: IndicesIter<'a>, |
76 | | } |
77 | | |
78 | | impl<'a> Iterator for UnicodeWordIndices<'a> { |
79 | | type Item = (usize, &'a str); |
80 | | #[inline] |
81 | 0 | fn next(&mut self) -> Option<Self::Item> { |
82 | 0 | match &mut self.inner { |
83 | 0 | IndicesIter::Ascii(i) => i.next(), |
84 | 0 | IndicesIter::Unicode(i) => i.next(), |
85 | | } |
86 | 0 | } |
87 | | #[inline] |
88 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
89 | 0 | match &self.inner { |
90 | 0 | IndicesIter::Ascii(i) => i.size_hint(), |
91 | 0 | IndicesIter::Unicode(i) => i.size_hint(), |
92 | | } |
93 | 0 | } |
94 | | } |
95 | | impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { |
96 | | #[inline] |
97 | 0 | fn next_back(&mut self) -> Option<Self::Item> { |
98 | 0 | match &mut self.inner { |
99 | 0 | IndicesIter::Ascii(i) => i.next_back(), |
100 | 0 | IndicesIter::Unicode(i) => i.next_back(), |
101 | | } |
102 | 0 | } |
103 | | } |
104 | | |
105 | | /// External iterator for a string's |
106 | | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). |
107 | | /// |
108 | | /// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`] |
109 | | /// trait. See its documentation for more. |
110 | | /// |
111 | | /// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds |
112 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
113 | | #[derive(Debug, Clone)] |
114 | | pub struct UWordBounds<'a> { |
115 | | string: &'a str, |
116 | | cat: Option<WordCat>, |
117 | | catb: Option<WordCat>, |
118 | | } |
119 | | |
120 | | /// External iterator for word boundaries and byte offsets. |
121 | | /// |
122 | | /// This struct is created by the [`split_word_bound_indices`] method on the |
123 | | /// [`UnicodeSegmentation`] trait. See its documentation for more. |
124 | | /// |
125 | | /// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices |
126 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
127 | | #[derive(Debug, Clone)] |
128 | | pub struct UWordBoundIndices<'a> { |
129 | | start_offset: usize, |
130 | | iter: UWordBounds<'a>, |
131 | | } |
132 | | |
133 | | impl<'a> UWordBoundIndices<'a> { |
134 | | #[inline] |
135 | | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
136 | | /// |
137 | | /// ```rust |
138 | | /// # use unicode_segmentation::UnicodeSegmentation; |
139 | | /// let mut iter = "Hello world".split_word_bound_indices(); |
140 | | /// assert_eq!(iter.as_str(), "Hello world"); |
141 | | /// iter.next(); |
142 | | /// assert_eq!(iter.as_str(), " world"); |
143 | | /// iter.next(); |
144 | | /// assert_eq!(iter.as_str(), "world"); |
145 | | /// ``` |
146 | 0 | pub fn as_str(&self) -> &'a str { |
147 | 0 | self.iter.as_str() |
148 | 0 | } |
149 | | } |
150 | | |
151 | | impl<'a> Iterator for UWordBoundIndices<'a> { |
152 | | type Item = (usize, &'a str); |
153 | | |
154 | | #[inline] |
155 | 0 | fn next(&mut self) -> Option<(usize, &'a str)> { |
156 | 0 | self.iter |
157 | 0 | .next() |
158 | 0 | .map(|s| (s.as_ptr() as usize - self.start_offset, s)) |
159 | 0 | } |
160 | | |
161 | | #[inline] |
162 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
163 | 0 | self.iter.size_hint() |
164 | 0 | } |
165 | | } |
166 | | |
167 | | impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> { |
168 | | #[inline] |
169 | 0 | fn next_back(&mut self) -> Option<(usize, &'a str)> { |
170 | 0 | self.iter |
171 | 0 | .next_back() |
172 | 0 | .map(|s| (s.as_ptr() as usize - self.start_offset, s)) |
173 | 0 | } |
174 | | } |
175 | | |
176 | | // state machine for word boundary rules |
177 | | #[derive(Clone, Copy, PartialEq, Eq, Debug)] |
178 | | enum UWordBoundsState { |
179 | | Start, |
180 | | Letter, |
181 | | HLetter, |
182 | | Numeric, |
183 | | Katakana, |
184 | | ExtendNumLet, |
185 | | Regional(RegionalState), |
186 | | FormatExtend(FormatExtendType), |
187 | | Zwj, |
188 | | Emoji, |
189 | | WSegSpace, |
190 | | } |
191 | | |
192 | | // subtypes for FormatExtend state in UWordBoundsState |
193 | | #[derive(Clone, Copy, PartialEq, Eq, Debug)] |
194 | | enum FormatExtendType { |
195 | | AcceptAny, |
196 | | AcceptNone, |
197 | | RequireLetter, |
198 | | RequireHLetter, |
199 | | AcceptQLetter, |
200 | | RequireNumeric, |
201 | | } |
202 | | |
203 | | #[derive(Clone, Copy, PartialEq, Eq, Debug)] |
204 | | enum RegionalState { |
205 | | Half, |
206 | | Full, |
207 | | Unknown, |
208 | | } |
209 | | |
210 | 63.1k | fn is_emoji(ch: char) -> bool { |
211 | | use crate::tables::emoji; |
212 | 63.1k | emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic |
213 | 63.1k | } |
214 | | |
215 | | impl<'a> Iterator for UWordBounds<'a> { |
216 | | type Item = &'a str; |
217 | | |
218 | | #[inline] |
219 | 10.9k | fn size_hint(&self) -> (usize, Option<usize>) { |
220 | 10.9k | let slen = self.string.len(); |
221 | 10.9k | (cmp::min(slen, 1), Some(slen)) |
222 | 10.9k | } <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint Line | Count | Source | 219 | 10.9k | fn size_hint(&self) -> (usize, Option<usize>) { | 220 | 10.9k | let slen = self.string.len(); | 221 | 10.9k | (cmp::min(slen, 1), Some(slen)) | 222 | 10.9k | } |
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint |
223 | | |
224 | | #[inline] |
225 | 26.2M | fn next(&mut self) -> Option<&'a str> { |
226 | | use self::FormatExtendType::*; |
227 | | use self::UWordBoundsState::*; |
228 | | use crate::tables::word as wd; |
229 | 26.2M | if self.string.is_empty() { |
230 | 6.21k | return None; |
231 | 26.2M | } |
232 | | |
233 | 26.2M | let mut take_curr = true; |
234 | 26.2M | let mut take_cat = true; |
235 | 26.2M | let mut idx = 0; |
236 | 26.2M | let mut saveidx = 0; |
237 | 26.2M | let mut state = Start; |
238 | 26.2M | let mut cat = wd::WC_Any; |
239 | 26.2M | let mut savecat = wd::WC_Any; |
240 | | |
241 | | // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 |
242 | 26.2M | let mut skipped_format_extend = false; |
243 | 69.9M | for (curr, ch) in self.string.char_indices() { |
244 | 69.9M | idx = curr; |
245 | | // Whether or not the previous category was ZWJ |
246 | | // ZWJs get collapsed, so this handles precedence of WB3c over WB4 |
247 | 69.9M | let prev_zwj = cat == wd::WC_ZWJ; |
248 | | // if there's a category cached, grab it |
249 | 69.9M | cat = match self.cat { |
250 | 66.4M | None => wd::word_category(ch).2, |
251 | 3.53M | _ => self.cat.take().unwrap(), |
252 | | }; |
253 | 69.9M | take_cat = true; |
254 | | |
255 | | // handle rule WB4 |
256 | | // just skip all format, extend, and zwj chars |
257 | | // note that Start is a special case: if there's a bunch of Format | Extend |
258 | | // characters at the beginning of a block of text, dump them out as one unit. |
259 | | // |
260 | | // (This is not obvious from the wording of UAX#29, but if you look at the |
261 | | // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt |
262 | | // then the "correct" interpretation of WB4 becomes apparent.) |
263 | 69.9M | if state != Start { |
264 | 43.7M | match cat { |
265 | | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { |
266 | 253k | skipped_format_extend = true; |
267 | 253k | continue; |
268 | | } |
269 | 43.5M | _ => {} |
270 | | } |
271 | 26.2M | } |
272 | | |
273 | | // rule WB3c |
274 | | // WB4 makes all ZWJs collapse into the previous state |
275 | | // but you can still be in a Zwj state if you started with Zwj |
276 | | // |
277 | | // This means that an EP + Zwj will collapse into EP, which is wrong, |
278 | | // since EP+EP is not a boundary but EP+ZWJ+EP is |
279 | | // |
280 | | // Thus, we separately keep track of whether or not the last character |
281 | | // was a ZWJ. This is an additional bit of state tracked outside of the |
282 | | // state enum; the state enum represents the last non-zwj state encountered. |
283 | | // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, |
284 | | // however we are in the previous state for the purposes of all other rules. |
285 | 69.7M | if prev_zwj && is_emoji(ch) { |
286 | 41.1k | state = Emoji; |
287 | 41.1k | continue; |
288 | 69.7M | } |
289 | | // Don't use `continue` in this match without updating `cat` |
290 | 26.2M | state = match state { |
291 | 26.2M | Start if cat == wd::WC_CR => { |
292 | 6.60M | idx += match self.get_next_cat(idx) { |
293 | 23.1k | Some(wd::WC_LF) => 1, // rule WB3 |
294 | 6.58M | _ => 0, |
295 | | }; |
296 | 6.60M | break; // rule WB3a |
297 | | } |
298 | 19.5M | Start => match cat { |
299 | 2.34M | wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a |
300 | 22.6k | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a |
301 | 445k | wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a |
302 | 2.02k | wd::WC_Katakana => Katakana, // rule WB13, WB13a |
303 | 99.5k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b |
304 | 5.69k | wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c |
305 | 5.97M | wd::WC_LF | wd::WC_Newline => break, // rule WB3a |
306 | 5.15k | wd::WC_ZWJ => Zwj, // rule WB3c |
307 | 573k | wd::WC_WSegSpace => WSegSpace, // rule WB3d |
308 | | _ => { |
309 | 10.1M | if let Some(ncat) = self.get_next_cat(idx) { |
310 | | // rule WB4 |
311 | 10.1M | if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ |
312 | | { |
313 | 40.8k | state = FormatExtend(AcceptNone); |
314 | 40.8k | self.cat = Some(ncat); |
315 | 40.8k | continue; |
316 | 10.0M | } |
317 | 2.07k | } |
318 | 10.0M | break; // rule WB999 |
319 | | } |
320 | | }, |
321 | 3.13M | WSegSpace => match cat { |
322 | 3.13M | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
323 | | _ => { |
324 | 572k | take_curr = false; |
325 | 572k | break; |
326 | | } |
327 | | }, |
328 | | Zwj => { |
329 | | // We already handle WB3c above. |
330 | 852 | take_curr = false; |
331 | 852 | break; |
332 | | } |
333 | 83.0k | Letter | HLetter => match cat { |
334 | 21.7M | wd::WC_ALetter => Letter, // rule WB5 |
335 | 3.83k | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
336 | 933k | wd::WC_Numeric => Numeric, // rule WB9 |
337 | 32.9k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
338 | 83.0k | wd::WC_Double_Quote if state == HLetter => { |
339 | 4.66k | savecat = cat; |
340 | 4.66k | saveidx = idx; |
341 | 4.66k | FormatExtend(RequireHLetter) // rule WB7b |
342 | | } |
343 | 90.2k | wd::WC_Single_Quote if state == HLetter => { |
344 | 27.1k | FormatExtend(AcceptQLetter) // rule WB7a |
345 | | } |
346 | | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
347 | 740k | savecat = cat; |
348 | 740k | saveidx = idx; |
349 | 740k | FormatExtend(RequireLetter) // rule WB6 |
350 | | } |
351 | | _ => { |
352 | 2.05M | take_curr = false; |
353 | 2.05M | break; |
354 | | } |
355 | | }, |
356 | 12.4M | Numeric => match cat { |
357 | 10.5M | wd::WC_Numeric => Numeric, // rule WB8 |
358 | 876k | wd::WC_ALetter => Letter, // rule WB10 |
359 | 2.55k | wd::WC_Hebrew_Letter => HLetter, // rule WB10 |
360 | 13.6k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
361 | | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
362 | 596k | savecat = cat; |
363 | 596k | saveidx = idx; |
364 | 596k | FormatExtend(RequireNumeric) // rule WB12 |
365 | | } |
366 | | _ => { |
367 | 381k | take_curr = false; |
368 | 381k | break; |
369 | | } |
370 | | }, |
371 | 3.49k | Katakana => match cat { |
372 | 640 | wd::WC_Katakana => Katakana, // rule WB13 |
373 | 1.04k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
374 | | _ => { |
375 | 1.81k | take_curr = false; |
376 | 1.81k | break; |
377 | | } |
378 | | }, |
379 | 316k | ExtendNumLet => match cat { |
380 | 169k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
381 | 48.2k | wd::WC_ALetter => Letter, // rule WB13b |
382 | 2.66k | wd::WC_Hebrew_Letter => HLetter, // rule WB13b |
383 | 10.5k | wd::WC_Numeric => Numeric, // rule WB13b |
384 | 906 | wd::WC_Katakana => Katakana, // rule WB13b |
385 | | _ => { |
386 | 84.6k | take_curr = false; |
387 | 84.6k | break; |
388 | | } |
389 | | }, |
390 | | Regional(RegionalState::Full) => { |
391 | | // if it reaches here we've gone too far, |
392 | | // a full flag can only compose with ZWJ/Extend/Format |
393 | | // proceeding it. |
394 | 4.06k | take_curr = false; |
395 | 4.06k | break; |
396 | | } |
397 | 5.64k | Regional(RegionalState::Half) => match cat { |
398 | 4.11k | wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c |
399 | | _ => { |
400 | 1.53k | take_curr = false; |
401 | 1.53k | break; |
402 | | } |
403 | | }, |
404 | | Regional(_) => { |
405 | 0 | unreachable!("RegionalState::Unknown should not occur on forward iteration") |
406 | | } |
407 | | Emoji => { |
408 | | // We already handle WB3c above. If you've reached this point, the emoji sequence is over. |
409 | 14.5k | take_curr = false; |
410 | 14.5k | break; |
411 | | } |
412 | 1.39M | FormatExtend(t) => match t { |
413 | | // handle FormatExtends depending on what type |
414 | 596k | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 |
415 | 740k | RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 |
416 | 252k | RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a |
417 | 4.64k | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
418 | | AcceptNone | AcceptQLetter => { |
419 | 48.5k | take_curr = false; // emit all the Format|Extend characters |
420 | 48.5k | take_cat = false; |
421 | 48.5k | break; |
422 | | } |
423 | 367k | _ => break, // rewind (in if statement below) |
424 | | }, |
425 | | } |
426 | | } |
427 | | |
428 | 26.2M | if let FormatExtend(t) = state { |
429 | | // we were looking for something and didn't find it; we have to back up |
430 | 416k | if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { |
431 | 367k | idx = saveidx; |
432 | 367k | cat = savecat; |
433 | 367k | take_curr = false; |
434 | 367k | } |
435 | 25.7M | } |
436 | | |
437 | 26.2M | self.cat = if take_curr { |
438 | 22.6M | idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
439 | 22.6M | None |
440 | 3.53M | } else if take_cat { |
441 | 3.48M | Some(cat) |
442 | | } else { |
443 | 48.5k | None |
444 | | }; |
445 | | |
446 | 26.2M | let retstr = &self.string[..idx]; |
447 | 26.2M | self.string = &self.string[idx..]; |
448 | 26.2M | Some(retstr) |
449 | 26.2M | } <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next Line | Count | Source | 225 | 26.2M | fn next(&mut self) -> Option<&'a str> { | 226 | | use self::FormatExtendType::*; | 227 | | use self::UWordBoundsState::*; | 228 | | use crate::tables::word as wd; | 229 | 26.2M | if self.string.is_empty() { | 230 | 6.21k | return None; | 231 | 26.2M | } | 232 | | | 233 | 26.2M | let mut take_curr = true; | 234 | 26.2M | let mut take_cat = true; | 235 | 26.2M | let mut idx = 0; | 236 | 26.2M | let mut saveidx = 0; | 237 | 26.2M | let mut state = Start; | 238 | 26.2M | let mut cat = wd::WC_Any; | 239 | 26.2M | let mut savecat = wd::WC_Any; | 240 | | | 241 | | // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 | 242 | 26.2M | let mut skipped_format_extend = false; | 243 | 69.9M | for (curr, ch) in self.string.char_indices() { | 244 | 69.9M | idx = curr; | 245 | | // Whether or not the previous category was ZWJ | 246 | | // ZWJs get collapsed, so this handles precedence of WB3c over WB4 | 247 | 69.9M | let prev_zwj = cat == wd::WC_ZWJ; | 248 | | // if there's a category cached, grab it | 249 | 69.9M | cat = match self.cat { | 250 | 66.4M | None => wd::word_category(ch).2, | 251 | 3.53M | _ => self.cat.take().unwrap(), | 252 | | }; | 253 | 69.9M | take_cat = true; | 254 | | | 255 | | // handle rule WB4 | 256 | | // just skip all format, extend, and zwj chars | 257 | | // note that Start is a special case: if there's a bunch of Format | Extend | 258 | | // characters at the beginning of a block of text, dump them out as one unit. | 259 | | // | 260 | | // (This is not obvious from the wording of UAX#29, but if you look at the | 261 | | // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt | 262 | | // then the "correct" interpretation of WB4 becomes apparent.) 
| 263 | 69.9M | if state != Start { | 264 | 43.7M | match cat { | 265 | | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { | 266 | 253k | skipped_format_extend = true; | 267 | 253k | continue; | 268 | | } | 269 | 43.5M | _ => {} | 270 | | } | 271 | 26.2M | } | 272 | | | 273 | | // rule WB3c | 274 | | // WB4 makes all ZWJs collapse into the previous state | 275 | | // but you can still be in a Zwj state if you started with Zwj | 276 | | // | 277 | | // This means that an EP + Zwj will collapse into EP, which is wrong, | 278 | | // since EP+EP is not a boundary but EP+ZWJ+EP is | 279 | | // | 280 | | // Thus, we separately keep track of whether or not the last character | 281 | | // was a ZWJ. This is an additional bit of state tracked outside of the | 282 | | // state enum; the state enum represents the last non-zwj state encountered. | 283 | | // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, | 284 | | // however we are in the previous state for the purposes of all other rules. 
| 285 | 69.7M | if prev_zwj && is_emoji(ch) { | 286 | 41.1k | state = Emoji; | 287 | 41.1k | continue; | 288 | 69.7M | } | 289 | | // Don't use `continue` in this match without updating `cat` | 290 | 26.2M | state = match state { | 291 | 26.2M | Start if cat == wd::WC_CR => { | 292 | 6.60M | idx += match self.get_next_cat(idx) { | 293 | 23.1k | Some(wd::WC_LF) => 1, // rule WB3 | 294 | 6.58M | _ => 0, | 295 | | }; | 296 | 6.60M | break; // rule WB3a | 297 | | } | 298 | 19.5M | Start => match cat { | 299 | 2.34M | wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a | 300 | 22.6k | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a | 301 | 445k | wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a | 302 | 2.02k | wd::WC_Katakana => Katakana, // rule WB13, WB13a | 303 | 99.5k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b | 304 | 5.69k | wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c | 305 | 5.97M | wd::WC_LF | wd::WC_Newline => break, // rule WB3a | 306 | 5.15k | wd::WC_ZWJ => Zwj, // rule WB3c | 307 | 573k | wd::WC_WSegSpace => WSegSpace, // rule WB3d | 308 | | _ => { | 309 | 10.1M | if let Some(ncat) = self.get_next_cat(idx) { | 310 | | // rule WB4 | 311 | 10.1M | if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ | 312 | | { | 313 | 40.8k | state = FormatExtend(AcceptNone); | 314 | 40.8k | self.cat = Some(ncat); | 315 | 40.8k | continue; | 316 | 10.0M | } | 317 | 2.07k | } | 318 | 10.0M | break; // rule WB999 | 319 | | } | 320 | | }, | 321 | 3.13M | WSegSpace => match cat { | 322 | 3.13M | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, | 323 | | _ => { | 324 | 572k | take_curr = false; | 325 | 572k | break; | 326 | | } | 327 | | }, | 328 | | Zwj => { | 329 | | // We already handle WB3c above. 
| 330 | 852 | take_curr = false; | 331 | 852 | break; | 332 | | } | 333 | 83.0k | Letter | HLetter => match cat { | 334 | 21.7M | wd::WC_ALetter => Letter, // rule WB5 | 335 | 3.83k | wd::WC_Hebrew_Letter => HLetter, // rule WB5 | 336 | 933k | wd::WC_Numeric => Numeric, // rule WB9 | 337 | 32.9k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 338 | 83.0k | wd::WC_Double_Quote if state == HLetter => { | 339 | 4.66k | savecat = cat; | 340 | 4.66k | saveidx = idx; | 341 | 4.66k | FormatExtend(RequireHLetter) // rule WB7b | 342 | | } | 343 | 90.2k | wd::WC_Single_Quote if state == HLetter => { | 344 | 27.1k | FormatExtend(AcceptQLetter) // rule WB7a | 345 | | } | 346 | | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { | 347 | 740k | savecat = cat; | 348 | 740k | saveidx = idx; | 349 | 740k | FormatExtend(RequireLetter) // rule WB6 | 350 | | } | 351 | | _ => { | 352 | 2.05M | take_curr = false; | 353 | 2.05M | break; | 354 | | } | 355 | | }, | 356 | 12.4M | Numeric => match cat { | 357 | 10.5M | wd::WC_Numeric => Numeric, // rule WB8 | 358 | 876k | wd::WC_ALetter => Letter, // rule WB10 | 359 | 2.55k | wd::WC_Hebrew_Letter => HLetter, // rule WB10 | 360 | 13.6k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 361 | | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { | 362 | 596k | savecat = cat; | 363 | 596k | saveidx = idx; | 364 | 596k | FormatExtend(RequireNumeric) // rule WB12 | 365 | | } | 366 | | _ => { | 367 | 381k | take_curr = false; | 368 | 381k | break; | 369 | | } | 370 | | }, | 371 | 3.49k | Katakana => match cat { | 372 | 640 | wd::WC_Katakana => Katakana, // rule WB13 | 373 | 1.04k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 374 | | _ => { | 375 | 1.81k | take_curr = false; | 376 | 1.81k | break; | 377 | | } | 378 | | }, | 379 | 316k | ExtendNumLet => match cat { | 380 | 169k | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a | 381 | 48.2k | wd::WC_ALetter => Letter, // rule WB13b | 382 | 2.66k | 
wd::WC_Hebrew_Letter => HLetter, // rule WB13b | 383 | 10.5k | wd::WC_Numeric => Numeric, // rule WB13b | 384 | 906 | wd::WC_Katakana => Katakana, // rule WB13b | 385 | | _ => { | 386 | 84.6k | take_curr = false; | 387 | 84.6k | break; | 388 | | } | 389 | | }, | 390 | | Regional(RegionalState::Full) => { | 391 | | // if it reaches here we've gone too far, | 392 | | // a full flag can only compose with ZWJ/Extend/Format | 393 | | // proceeding it. | 394 | 4.06k | take_curr = false; | 395 | 4.06k | break; | 396 | | } | 397 | 5.64k | Regional(RegionalState::Half) => match cat { | 398 | 4.11k | wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c | 399 | | _ => { | 400 | 1.53k | take_curr = false; | 401 | 1.53k | break; | 402 | | } | 403 | | }, | 404 | | Regional(_) => { | 405 | 0 | unreachable!("RegionalState::Unknown should not occur on forward iteration") | 406 | | } | 407 | | Emoji => { | 408 | | // We already handle WB3c above. If you've reached this point, the emoji sequence is over. 
| 409 | 14.5k | take_curr = false; | 410 | 14.5k | break; | 411 | | } | 412 | 1.39M | FormatExtend(t) => match t { | 413 | | // handle FormatExtends depending on what type | 414 | 596k | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 | 415 | 740k | RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 | 416 | 252k | RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a | 417 | 4.64k | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b | 418 | | AcceptNone | AcceptQLetter => { | 419 | 48.5k | take_curr = false; // emit all the Format|Extend characters | 420 | 48.5k | take_cat = false; | 421 | 48.5k | break; | 422 | | } | 423 | 367k | _ => break, // rewind (in if statement below) | 424 | | }, | 425 | | } | 426 | | } | 427 | | | 428 | 26.2M | if let FormatExtend(t) = state { | 429 | | // we were looking for something and didn't find it; we have to back up | 430 | 416k | if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { | 431 | 367k | idx = saveidx; | 432 | 367k | cat = savecat; | 433 | 367k | take_curr = false; | 434 | 367k | } | 435 | 25.7M | } | 436 | | | 437 | 26.2M | self.cat = if take_curr { | 438 | 22.6M | idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); | 439 | 22.6M | None | 440 | 3.53M | } else if take_cat { | 441 | 3.48M | Some(cat) | 442 | | } else { | 443 | 48.5k | None | 444 | | }; | 445 | | | 446 | 26.2M | let retstr = &self.string[..idx]; | 447 | 26.2M | self.string = &self.string[idx..]; | 448 | 26.2M | Some(retstr) | 449 | 26.2M | } |
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next |
450 | | } |
451 | | |
452 | | impl<'a> DoubleEndedIterator for UWordBounds<'a> { |
453 | | #[inline] |
454 | 0 | fn next_back(&mut self) -> Option<&'a str> { |
455 | | use self::FormatExtendType::*; |
456 | | use self::UWordBoundsState::*; |
457 | | use crate::tables::word as wd; |
458 | 0 | if self.string.is_empty() { |
459 | 0 | return None; |
460 | 0 | } |
461 | | |
462 | 0 | let mut take_curr = true; |
463 | 0 | let mut take_cat = true; |
464 | 0 | let mut idx = self.string.len(); |
465 | 0 | idx -= self.string.chars().next_back().unwrap().len_utf8(); |
466 | 0 | let mut previdx = idx; |
467 | 0 | let mut saveidx = idx; |
468 | 0 | let mut state = Start; |
469 | 0 | let mut savestate = Start; |
470 | 0 | let mut cat = wd::WC_Any; |
471 | | |
472 | | // WB3c is context-sensitive (ZWJ + Extended_Pictographic), |
473 | | // while WB4 collapses Extend/Format and would otherwise hide that context. |
474 | | // We therefore keep this context outside the main state machine: |
475 | | // whether the nearest non-(Extend|Format) char to the right is emoji. |
476 | 0 | let mut right_significant_is_emoji: bool = false; |
477 | | |
478 | 0 | let mut skipped_format_extend = false; |
479 | | |
480 | 0 | for (curr, ch) in self.string.char_indices().rev() { |
481 | 0 | previdx = idx; |
482 | 0 | idx = curr; |
483 | | |
484 | | // if there's a category cached, grab it |
485 | 0 | cat = match self.catb { |
486 | 0 | None => wd::word_category(ch).2, |
487 | 0 | _ => self.catb.take().unwrap(), |
488 | | }; |
489 | 0 | take_cat = true; |
490 | | |
491 | | // backward iterator over word boundaries. Mostly the same as the forward |
492 | | // iterator, with two weirdnesses: |
493 | | // (1) If we encounter a single quote in the Start state, we have to check for a |
494 | | // Hebrew Letter immediately before it. |
495 | | // (2) Format and Extend char handling takes some gymnastics. |
496 | | |
497 | | // Reverse-direction WB3c check: when we encounter ZWJ and the nearest |
498 | | // significant right-side char is emoji, do not break here. |
499 | 0 | if cat == wd::WC_ZWJ && state != Zwj && right_significant_is_emoji { |
500 | 0 | continue; |
501 | 0 | } |
502 | | |
503 | | // Keep the right-side WB3c context up to date as we move left. |
504 | | // Ignore Extend/Format here to mirror WB4 collapsing behavior. |
505 | 0 | if cat != wd::WC_Extend && cat != wd::WC_Format { |
506 | 0 | right_significant_is_emoji = is_emoji(ch); |
507 | 0 | } |
508 | | |
509 | 0 | if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) { |
510 | | // WB3c has more priority so we should not |
511 | | // fold in that case |
512 | 0 | if !matches!(state, FormatExtend(_) | Start) { |
513 | 0 | saveidx = previdx; |
514 | 0 | savestate = state; |
515 | 0 | state = FormatExtend(AcceptNone); |
516 | 0 | } |
517 | | |
518 | 0 | if state != Start { |
519 | 0 | continue; |
520 | 0 | } |
521 | 0 | } else if state == FormatExtend(AcceptNone) { |
522 | 0 | // finished a scan of some Format|Extend chars, restore previous state |
523 | 0 | state = savestate; |
524 | 0 | previdx = saveidx; |
525 | 0 | take_cat = false; |
526 | 0 | skipped_format_extend = true; |
527 | 0 | } |
528 | | |
529 | | // Don't use `continue` in this match without updating `catb` |
530 | 0 | state = match state { |
531 | 0 | Start | FormatExtend(AcceptAny) => match cat { |
532 | 0 | wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b |
533 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b |
534 | 0 | wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b |
535 | 0 | wd::WC_Katakana => Katakana, // rule WB13, WB13b |
536 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
537 | 0 | wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c |
538 | | // rule WB4: |
539 | 0 | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny), |
540 | | wd::WC_Single_Quote => { |
541 | 0 | saveidx = idx; |
542 | 0 | FormatExtend(AcceptQLetter) // rule WB7a |
543 | | } |
544 | 0 | wd::WC_WSegSpace => WSegSpace, |
545 | | wd::WC_CR | wd::WC_LF | wd::WC_Newline => { |
546 | 0 | if state == Start { |
547 | 0 | if cat == wd::WC_LF { |
548 | 0 | idx -= match self.get_prev_cat(idx) { |
549 | 0 | Some(wd::WC_CR) => 1, // rule WB3 |
550 | 0 | _ => 0, |
551 | | }; |
552 | 0 | } |
553 | 0 | } else { |
554 | 0 | take_curr = false; |
555 | 0 | } |
556 | 0 | break; // rule WB3a |
557 | | } |
558 | 0 | _ if is_emoji(ch) => Zwj, |
559 | 0 | _ => break, // rule WB999 |
560 | | }, |
561 | 0 | Zwj => match cat { |
562 | | // rule WB3c |
563 | 0 | wd::WC_ZWJ => FormatExtend(AcceptAny), |
564 | | _ => { |
565 | 0 | take_curr = false; |
566 | 0 | break; |
567 | | } |
568 | | }, |
569 | 0 | WSegSpace => match cat { |
570 | | // rule WB3d |
571 | 0 | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
572 | | _ => { |
573 | 0 | take_curr = false; |
574 | 0 | break; |
575 | | } |
576 | | }, |
577 | 0 | Letter | HLetter => match cat { |
578 | 0 | wd::WC_ALetter => Letter, // rule WB5 |
579 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
580 | 0 | wd::WC_Numeric => Numeric, // rule WB10 |
581 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
582 | 0 | wd::WC_Double_Quote if state == HLetter => { |
583 | 0 | saveidx = previdx; |
584 | 0 | FormatExtend(RequireHLetter) // rule WB7c |
585 | | } |
586 | | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
587 | 0 | saveidx = previdx; |
588 | 0 | FormatExtend(RequireLetter) // rule WB7 |
589 | | } |
590 | | _ => { |
591 | 0 | take_curr = false; |
592 | 0 | break; |
593 | | } |
594 | | }, |
595 | 0 | Numeric => match cat { |
596 | 0 | wd::WC_Numeric => Numeric, // rule WB8 |
597 | 0 | wd::WC_ALetter => Letter, // rule WB9 |
598 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB9 |
599 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
600 | | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
601 | 0 | saveidx = previdx; |
602 | 0 | FormatExtend(RequireNumeric) // rule WB11 |
603 | | } |
604 | | _ => { |
605 | 0 | take_curr = false; |
606 | 0 | break; |
607 | | } |
608 | | }, |
609 | 0 | Katakana => match cat { |
610 | 0 | wd::WC_Katakana => Katakana, // rule WB13 |
611 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
612 | | _ => { |
613 | 0 | take_curr = false; |
614 | 0 | break; |
615 | | } |
616 | | }, |
617 | 0 | ExtendNumLet => match cat { |
618 | 0 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
619 | 0 | wd::WC_ALetter => Letter, // rule WB13a |
620 | 0 | wd::WC_Hebrew_Letter => HLetter, // rule WB13a |
621 | 0 | wd::WC_Numeric => Numeric, // rule WB13a |
622 | 0 | wd::WC_Katakana => Katakana, // rule WB13a |
623 | | _ => { |
624 | 0 | take_curr = false; |
625 | 0 | break; |
626 | | } |
627 | | }, |
628 | 0 | Regional(mut regional_state) => match cat { |
629 | | // rule WB13c |
630 | | wd::WC_Regional_Indicator => { |
631 | 0 | if regional_state == RegionalState::Unknown { |
632 | 0 | let count = self.string[..previdx] |
633 | 0 | .chars() |
634 | 0 | .rev() |
635 | 0 | .map(|c| wd::word_category(c).2) |
636 | 0 | .filter(|&c| { |
637 | 0 | !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format) |
638 | 0 | }) |
639 | 0 | .take_while(|&c| c == wd::WC_Regional_Indicator) |
640 | 0 | .count(); |
641 | 0 | regional_state = if count % 2 == 0 { |
642 | 0 | RegionalState::Full |
643 | | } else { |
644 | 0 | RegionalState::Half |
645 | | }; |
646 | 0 | } |
647 | 0 | if regional_state == RegionalState::Full { |
648 | 0 | take_curr = false; |
649 | 0 | break; |
650 | | } else { |
651 | 0 | Regional(RegionalState::Full) |
652 | | } |
653 | | } |
654 | | _ => { |
655 | 0 | take_curr = false; |
656 | 0 | break; |
657 | | } |
658 | | }, |
659 | | Emoji => { |
660 | 0 | if is_emoji(ch) { |
661 | | // rule WB3c |
662 | 0 | Zwj |
663 | | } else { |
664 | 0 | take_curr = false; |
665 | 0 | break; |
666 | | } |
667 | | } |
668 | 0 | FormatExtend(t) => match t { |
669 | 0 | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12 |
670 | 0 | RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6 |
671 | 0 | RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6 |
672 | 0 | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a |
673 | 0 | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
674 | 0 | _ => break, // backtrack will happens |
675 | | }, |
676 | | } |
677 | | } |
678 | | |
679 | 0 | if let FormatExtend(t) = state { |
680 | | // if we required something but didn't find it, backtrack |
681 | 0 | if t == RequireLetter |
682 | 0 | || t == RequireHLetter |
683 | 0 | || t == RequireNumeric |
684 | 0 | || t == AcceptNone |
685 | 0 | || t == AcceptQLetter |
686 | 0 | { |
687 | 0 | previdx = saveidx; |
688 | 0 | take_cat = false; |
689 | 0 | take_curr = false; |
690 | 0 | } |
691 | 0 | } |
692 | | |
693 | 0 | self.catb = if take_curr { |
694 | 0 | None |
695 | | } else { |
696 | 0 | idx = previdx; |
697 | 0 | if take_cat { |
698 | 0 | Some(cat) |
699 | | } else { |
700 | 0 | None |
701 | | } |
702 | | }; |
703 | | |
704 | 0 | let retstr = &self.string[idx..]; |
705 | 0 | self.string = &self.string[..idx]; |
706 | 0 | Some(retstr) |
707 | 0 | } |
708 | | } |
709 | | |
710 | | impl<'a> UWordBounds<'a> { |
711 | | #[inline] |
712 | | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
713 | | /// |
714 | | /// ```rust |
715 | | /// # use unicode_segmentation::UnicodeSegmentation; |
716 | | /// let mut iter = "Hello world".split_word_bounds(); |
717 | | /// assert_eq!(iter.as_str(), "Hello world"); |
718 | | /// iter.next(); |
719 | | /// assert_eq!(iter.as_str(), " world"); |
720 | | /// iter.next(); |
721 | | /// assert_eq!(iter.as_str(), "world"); |
722 | | /// ``` |
723 | 0 | pub fn as_str(&self) -> &'a str { |
724 | 0 | self.string |
725 | 0 | } |
726 | | |
727 | | #[inline] |
728 | 16.7M | fn get_next_cat(&self, idx: usize) -> Option<WordCat> { |
729 | | use crate::tables::word as wd; |
730 | 16.7M | let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
731 | 16.7M | if nidx < self.string.len() { |
732 | 16.7M | let nch = self.string[nidx..].chars().next().unwrap(); |
733 | 16.7M | Some(wd::word_category(nch).2) |
734 | | } else { |
735 | 2.13k | None |
736 | | } |
737 | 16.7M | } <unicode_segmentation::word::UWordBounds>::get_next_cat Line | Count | Source | 728 | 16.7M | fn get_next_cat(&self, idx: usize) -> Option<WordCat> { | 729 | | use crate::tables::word as wd; | 730 | 16.7M | let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); | 731 | 16.7M | if nidx < self.string.len() { | 732 | 16.7M | let nch = self.string[nidx..].chars().next().unwrap(); | 733 | 16.7M | Some(wd::word_category(nch).2) | 734 | | } else { | 735 | 2.13k | None | 736 | | } | 737 | 16.7M | } |
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds>::get_next_cat |
738 | | |
739 | | #[inline] |
740 | 0 | fn get_prev_cat(&self, idx: usize) -> Option<WordCat> { |
741 | | use crate::tables::word as wd; |
742 | 0 | if idx > 0 { |
743 | 0 | let nch = self.string[..idx].chars().next_back().unwrap(); |
744 | 0 | Some(wd::word_category(nch).2) |
745 | | } else { |
746 | 0 | None |
747 | | } |
748 | 0 | } |
749 | | } |
750 | | |
/// Word-boundary iterator specialised for strings that consist only of ASCII characters.
///
/// With nothing but ASCII to handle, a much smaller set of word-break values
/// than the full Unicode algorithm is sufficient; see
/// <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>.
///
/// | Word_Break value | ASCII code points that belong to it                             |
/// | -----------------| --------------------------------------------------------------- |
/// | CR               | U+000D (CR)                                                     |
/// | LF               | U+000A (LF)                                                     |
/// | Newline          | U+000B (VT), U+000C (FF)                                        |
/// | Single_Quote     | U+0027 (')                                                      |
/// | Double_Quote     | U+0022 (")                                                      |
/// | MidNumLet        | U+002E (.) FULL STOP                                            |
/// | MidLetter        | U+003A (:) COLON                                                |
/// | MidNum           | U+002C (,), U+003B (;)                                          |
/// | Numeric          | U+0030 – U+0039 (0 … 9)                                         |
/// | ALetter          | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z)                |
/// | ExtendNumLet     | U+005F (_) underscore                                           |
/// | WSegSpace        | U+0020 (SPACE)                                                  |
///
/// The macro MidNumLetQ reduces to U+002E (.) FULL STOP and U+0027 (').
/// AHLetter coincides with ALetter, so the two need not be told apart.
///
/// Every other single ASCII byte forms a segment of its own (default rule WB999).
///
/// Items are `(byte offset within the original string, segment)` pairs.
#[derive(Debug)]
struct AsciiWordBoundIter<'a> {
    /// Portion of the input not yet yielded from either end.
    rest: &'a str,
    /// Byte offset of the start of `rest` within the string given to `new`.
    offset: usize,
}

impl<'a> AsciiWordBoundIter<'a> {
    /// Creates an iterator positioned at the start of `s`.
    pub fn new(s: &'a str) -> Self {
        Self { offset: 0, rest: s }
    }

    /// Bytes that extend a word run unconditionally: ASCII letters, digits and `_`.
    #[inline]
    fn is_core(b: u8) -> bool {
        matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
    }

    /// Whether `b` glues its neighbours `prev` and `next` into a single word.
    ///
    /// * WB11/WB12, "Numeric (MidNum | MidNumLetQ) Numeric": `.`, `,`, `;` and `'`
    ///   join two digits ("1,000", "3.14").
    /// * WB6/WB7, "AHLetter (MidLetter | MidNumLetQ) AHLetter": `'`, `.` and `:`
    ///   join two letters ("can't", "e.g").
    #[inline]
    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
        let between_digits = || prev.is_ascii_digit() && next.is_ascii_digit();
        let between_letters = || prev.is_ascii_alphabetic() && next.is_ascii_alphabetic();
        match b {
            // MidNumLetQ: valid in both contexts.
            b'.' | b'\'' => between_digits() || between_letters(),
            // MidNum: digits only.
            b',' | b';' => between_digits(),
            // MidLetter: letters only.
            b':' => between_letters(),
            _ => false,
        }
    }
}

impl<'a> Iterator for AsciiWordBoundIter<'a> {
    type Item = (usize, &'a str);

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let rest = self.rest;
        let bytes = rest.as_bytes();
        // An empty remainder means the iteration is over.
        let &first = bytes.first()?;

        // Length in bytes of the segment at the front of `rest`.
        let seg_len = if first == b' ' {
            // WB3d: a run of adjacent WSegSpace stays together.
            bytes.iter().take_while(|&&b| b == b' ').count()
        } else if Self::is_core(first) {
            // WB5–WB13b: letters/digits/underscore, plus infix punctuation
            // that is flanked by matching characters on both sides.
            let mut end = 1;
            while end < bytes.len() {
                let b = bytes[end];
                let joins = Self::is_core(b)
                    || match bytes.get(end + 1) {
                        Some(&after) => Self::is_infix(b, bytes[end - 1], after),
                        // Nothing follows, so `b` cannot be an infix.
                        None => false,
                    };
                if !joins {
                    break;
                }
                end += 1;
            }
            end
        } else if first == b'\r' && bytes.get(1) == Some(&b'\n') {
            // WB3: never break inside CR LF.
            2
        } else {
            // WB999: any other byte stands alone.
            1
        };

        let (word, tail) = rest.split_at(seg_len);
        let pos = self.offset;
        self.rest = tail;
        self.offset = pos + seg_len;
        Some((pos, word))
    }
}

impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        let rest = self.rest;
        let bytes = rest.as_bytes();
        // An empty remainder means the iteration is over.
        let &last = bytes.last()?;
        let len = bytes.len();

        // Byte index within `rest` at which the final segment begins.
        let start = if last == b' ' {
            // WB3d: take the whole trailing run of spaces.
            len - bytes.iter().rev().take_while(|&&b| b == b' ').count()
        } else if Self::is_core(last) {
            // WB5–WB13b: walk left while the byte is core or a valid infix.
            let mut cut = len - 1;
            while cut > 0 {
                let candidate = bytes[cut - 1];
                // With nothing further left, the candidate doubles as its own
                // left neighbour; an infix byte is neither digit nor letter,
                // so `is_infix` cannot succeed in that case.
                let left = match cut.checked_sub(2) {
                    Some(i) => bytes[i],
                    None => candidate,
                };
                // The byte most recently accepted into the segment.
                let right = bytes[cut];
                if !(Self::is_core(candidate) || Self::is_infix(candidate, left, right)) {
                    break;
                }
                cut -= 1;
            }
            cut
        } else if last == b'\n' && len >= 2 && bytes[len - 2] == b'\r' {
            // WB3: never break inside CR LF.
            len - 2
        } else {
            // WB999: any other byte stands alone.
            len - 1
        };

        let (head, word) = rest.split_at(start);
        self.rest = head;
        // `offset` tracks the front of `rest`, which did not move.
        Some((self.offset + start, word))
    }
}
941 | | |
942 | | #[inline] |
943 | 0 | fn ascii_word_ok(t: &(usize, &str)) -> bool { |
944 | 0 | has_ascii_alphanumeric(&t.1) |
945 | 0 | } |
946 | | #[inline] |
947 | 0 | fn unicode_word_ok(t: &(usize, &str)) -> bool { |
948 | 0 | has_alphanumeric(&t.1) |
949 | 0 | } |
950 | | |
// Concrete iterator types behind the two `WordsIter` / `IndicesIter` variants.
// The predicates are plain `fn` pointers so that the types can be named.

/// ASCII path for words: boundary segments with the offset dropped, then filtered.
type AsciiWordsIter<'a> = Filter<
    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
    fn(&&'a str) -> bool,
>;
/// General path for words: filtered `UWordBounds` segments.
type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
/// ASCII path for `(offset, word)` pairs.
type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
/// General path for `(offset, word)` pairs.
type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
958 | | |
/// Inner iterator of `UnicodeWords`: one variant per segmentation path.
#[derive(Debug)]
enum WordsIter<'a> {
    /// Fast path, selected by `new_unicode_words` when the input is all ASCII.
    Ascii(AsciiWordsIter<'a>),
    /// General Unicode path, used for every other input.
    Unicode(UnicodeWordsIter<'a>),
}
964 | | |
/// Inner iterator of `UnicodeWordIndices`: one variant per segmentation path.
#[derive(Debug)]
enum IndicesIter<'a> {
    /// Fast path, selected by `new_unicode_word_indices` when the input is all ASCII.
    Ascii(AsciiIndicesIter<'a>),
    /// General Unicode path, used for every other input.
    Unicode(UnicodeIndicesIter<'a>),
}
970 | | |
971 | | #[inline] |
972 | 3.55k | pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { |
973 | 3.55k | let inner = if s.is_ascii() { |
974 | 900 | WordsIter::Ascii(new_unicode_words_ascii(s)) |
975 | | } else { |
976 | 2.65k | WordsIter::Unicode(new_unicode_words_general(s)) |
977 | | }; |
978 | 3.55k | UnicodeWords { inner } |
979 | 3.55k | } unicode_segmentation::word::new_unicode_words Line | Count | Source | 972 | 3.55k | pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { | 973 | 3.55k | let inner = if s.is_ascii() { | 974 | 900 | WordsIter::Ascii(new_unicode_words_ascii(s)) | 975 | | } else { | 976 | 2.65k | WordsIter::Unicode(new_unicode_words_general(s)) | 977 | | }; | 978 | 3.55k | UnicodeWords { inner } | 979 | 3.55k | } |
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words |
980 | | |
981 | | #[inline] |
982 | 0 | pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { |
983 | 0 | let inner = if s.is_ascii() { |
984 | 0 | IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok)) |
985 | | } else { |
986 | 0 | IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok)) |
987 | | }; |
988 | 0 | UnicodeWordIndices { inner } |
989 | 0 | } |
990 | | |
991 | | #[inline] |
992 | 6.21k | pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { |
993 | 6.21k | UWordBounds { |
994 | 6.21k | string: s, |
995 | 6.21k | cat: None, |
996 | 6.21k | catb: None, |
997 | 6.21k | } |
998 | 6.21k | } unicode_segmentation::word::new_word_bounds Line | Count | Source | 992 | 6.21k | pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { | 993 | 6.21k | UWordBounds { | 994 | 6.21k | string: s, | 995 | 6.21k | cat: None, | 996 | 6.21k | catb: None, | 997 | 6.21k | } | 998 | 6.21k | } |
Unexecuted instantiation: unicode_segmentation::word::new_word_bounds |
999 | | |
1000 | | #[inline] |
1001 | 0 | pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { |
1002 | 0 | UWordBoundIndices { |
1003 | 0 | start_offset: s.as_ptr() as usize, |
1004 | 0 | iter: new_word_bounds(s), |
1005 | 0 | } |
1006 | 0 | } |
1007 | | |
/// Creates the ASCII fast-path boundary iterator over `s`, yielding
/// `(byte offset, segment)` pairs.
///
/// The callers in this file reach it only after checking `s.is_ascii()`.
#[inline]
fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
    AsciiWordBoundIter::new(s)
}
1012 | | |
1013 | | #[inline] |
1014 | 7.34M | fn has_alphanumeric(s: &&str) -> bool { |
1015 | | use crate::tables::util::is_alphanumeric; |
1016 | | |
1017 | 7.34M | s.chars().any(is_alphanumeric) |
1018 | 7.34M | } unicode_segmentation::word::has_alphanumeric Line | Count | Source | 1014 | 7.34M | fn has_alphanumeric(s: &&str) -> bool { | 1015 | | use crate::tables::util::is_alphanumeric; | 1016 | | | 1017 | 7.34M | s.chars().any(is_alphanumeric) | 1018 | 7.34M | } |
Unexecuted instantiation: unicode_segmentation::word::has_alphanumeric |
1019 | | |
/// Returns `true` when `s` contains at least one ASCII letter or digit.
#[inline]
fn has_ascii_alphanumeric(s: &&str) -> bool {
    // Bytes of multi-byte UTF-8 sequences are all >= 0x80 and therefore never
    // ASCII alphanumeric, so scanning bytes gives the same answer as scanning chars.
    s.bytes().any(|b| b.is_ascii_alphanumeric())
}
1024 | | |
/// Discards the byte offset of an `(offset, segment)` pair and returns the segment.
#[inline(always)]
fn strip_pos(pair: (usize, &str)) -> &str {
    pair.1
}
1029 | | |
1030 | | #[inline] |
1031 | 900 | fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> { |
1032 | 900 | new_ascii_word_bound_indices(s) |
1033 | 900 | .map(strip_pos as fn(_) -> _) |
1034 | 900 | .filter(has_ascii_alphanumeric) |
1035 | 900 | } unicode_segmentation::word::new_unicode_words_ascii Line | Count | Source | 1031 | 900 | fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> { | 1032 | 900 | new_ascii_word_bound_indices(s) | 1033 | 900 | .map(strip_pos as fn(_) -> _) | 1034 | 900 | .filter(has_ascii_alphanumeric) | 1035 | 900 | } |
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_ascii |
1036 | | |
1037 | | #[inline] |
1038 | 2.65k | fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> { |
1039 | 2.65k | new_word_bounds(s).filter(has_alphanumeric) |
1040 | 2.65k | } unicode_segmentation::word::new_unicode_words_general Line | Count | Source | 1038 | 2.65k | fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> { | 1039 | 2.65k | new_word_bounds(s).filter(has_alphanumeric) | 1040 | 2.65k | } |
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_general |
1041 | | |
// Unit and property tests: two word-category checks, two fixed-input checks of
// the ASCII fast path, and property tests comparing the fast path with the
// general iterator.
#[cfg(test)]
mod tests {
    use crate::word::{
        new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices,
    };
    use std::string::String;
    use std::vec;
    use std::vec::Vec;

    use proptest::prelude::*;

    // U+070F (SYRIAC ABBREVIATION MARK) must be categorised as ALetter.
    #[test]
    fn test_syriac_abbr_mark() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{70f}');
        assert_eq!(cat, wd::WC_ALetter);
    }

    // U+06DD (ARABIC END OF AYAH) must be categorised as Numeric.
    #[test]
    fn test_end_of_ayah_cat() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{6dd}');
        assert_eq!(cat, wd::WC_Numeric);
    }

    // The boundary iterator yields every segment, punctuation and spaces
    // included, together with its byte offset.
    #[test]
    fn test_ascii_word_bound_indices_various_cases() {
        let s = "Hello, world!";
        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
        let expected = vec![
            (0, "Hello"), // simple letters
            (5, ","),     // a comma not between digits is its own segment
            (6, " "),     // space after comma
            (7, "world"), // letter run, ends before '!'
            (12, "!"),    // punctuation at the end
        ];
        assert_eq!(words, expected);
    }

    // The word iterator keeps only the segments that contain an ASCII letter or digit.
    #[test]
    fn test_ascii_word_indices_various_cases() {
        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
        let expected = vec![
            ("Hello"), // simple letters
            ("world"), // the ", " before it and the '!' after it are filtered out
            ("can't"), // apostrophe joins letters
            ("e.g"),   // trailing '.' is followed by a space, so it is not an infix
            ("var1"),  // letters and digits form one run
            ("123,456"), // digits+comma+digits
            ("foo_bar"), // underscore extends the run
            ("example.com"), // '.' between letters
            ("127.0.0.1"),   // '.' between digits; ':' does not join digits
            ("9090"), // port number
        ];
        assert_eq!(words, expected);
    }

    /// Strategy that yields every code-point from NUL (0) to DEL (127).
    fn ascii_char() -> impl Strategy<Value = char> {
        (0u8..=127).prop_map(|b| b as char)
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10000))]
        /// Fast path must equal general path for any ASCII input.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
            let uni: Vec<(usize, &str)> = new_word_bound_indices(&s).collect();

            prop_assert_eq!(fast, uni);
        }

        /// Fast path must equal general path for any ASCII input, forwards and backwards.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices_rev(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
            let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
            prop_assert_eq!(fast_rev, uni_rev);
        }
    }
}