Coverage Report

Created: 2025-11-24 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/unicode-segmentation/src/word.rs
Line
Count
Source
1
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2
// file at the top-level directory of this distribution and at
3
// http://rust-lang.org/COPYRIGHT.
4
//
5
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8
// option. This file may not be copied, modified, or distributed
9
// except according to those terms.
10
11
use core::cmp;
12
use core::iter::Filter;
13
14
use crate::tables::word::WordCat;
15
16
/// An iterator over the substrings of a string which, after splitting the string on
17
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18
/// contain any characters with the
19
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20
/// property, or with
21
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22
///
23
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24
/// its documentation for more.
25
///
26
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
28
#[derive(Debug)]
29
pub struct UnicodeWords<'a> {
30
    inner: WordsIter<'a>,
31
}
32
33
impl<'a> Iterator for UnicodeWords<'a> {
34
    type Item = &'a str;
35
    #[inline]
36
2.04M
    fn next(&mut self) -> Option<Self::Item> {
37
2.04M
        match &mut self.inner {
38
693k
            WordsIter::Ascii(i) => i.next(),
39
1.35M
            WordsIter::Unicode(i) => i.next(),
40
        }
41
2.04M
    }
<unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next
Line
Count
Source
36
2.04M
    fn next(&mut self) -> Option<Self::Item> {
37
2.04M
        match &mut self.inner {
38
693k
            WordsIter::Ascii(i) => i.next(),
39
1.35M
            WordsIter::Unicode(i) => i.next(),
40
        }
41
2.04M
    }
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next
42
    #[inline]
43
4.39k
    fn size_hint(&self) -> (usize, Option<usize>) {
44
4.39k
        match &self.inner {
45
1.49k
            WordsIter::Ascii(i) => i.size_hint(),
46
2.90k
            WordsIter::Unicode(i) => i.size_hint(),
47
        }
48
4.39k
    }
<unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint
Line
Count
Source
43
4.39k
    fn size_hint(&self) -> (usize, Option<usize>) {
44
4.39k
        match &self.inner {
45
1.49k
            WordsIter::Ascii(i) => i.size_hint(),
46
2.90k
            WordsIter::Unicode(i) => i.size_hint(),
47
        }
48
4.39k
    }
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint
49
}
50
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
51
    #[inline]
52
0
    fn next_back(&mut self) -> Option<Self::Item> {
53
0
        match &mut self.inner {
54
0
            WordsIter::Ascii(i) => i.next_back(),
55
0
            WordsIter::Unicode(i) => i.next_back(),
56
        }
57
0
    }
58
}
59
60
/// An iterator over the substrings of a string which, after splitting the string on
61
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
62
/// contain any characters with the
63
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
64
/// property, or with
65
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
66
/// This iterator also provides the byte offsets for each substring.
67
///
68
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
69
/// its documentation for more.
70
///
71
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
72
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
73
#[derive(Debug)]
74
pub struct UnicodeWordIndices<'a> {
75
    inner: IndicesIter<'a>,
76
}
77
78
impl<'a> Iterator for UnicodeWordIndices<'a> {
79
    type Item = (usize, &'a str);
80
    #[inline]
81
0
    fn next(&mut self) -> Option<Self::Item> {
82
0
        match &mut self.inner {
83
0
            IndicesIter::Ascii(i) => i.next(),
84
0
            IndicesIter::Unicode(i) => i.next(),
85
        }
86
0
    }
87
    #[inline]
88
0
    fn size_hint(&self) -> (usize, Option<usize>) {
89
0
        match &self.inner {
90
0
            IndicesIter::Ascii(i) => i.size_hint(),
91
0
            IndicesIter::Unicode(i) => i.size_hint(),
92
        }
93
0
    }
94
}
95
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
96
    #[inline]
97
0
    fn next_back(&mut self) -> Option<Self::Item> {
98
0
        match &mut self.inner {
99
0
            IndicesIter::Ascii(i) => i.next_back(),
100
0
            IndicesIter::Unicode(i) => i.next_back(),
101
        }
102
0
    }
103
}
104
105
/// External iterator for a string's
106
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
107
///
108
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
109
/// trait. See its documentation for more.
110
///
111
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
112
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
113
#[derive(Debug, Clone)]
114
pub struct UWordBounds<'a> {
115
    string: &'a str,
116
    cat: Option<WordCat>,
117
    catb: Option<WordCat>,
118
}
119
120
/// External iterator for word boundaries and byte offsets.
121
///
122
/// This struct is created by the [`split_word_bound_indices`] method on the
123
/// [`UnicodeSegmentation`] trait. See its documentation for more.
124
///
125
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
126
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
127
#[derive(Debug, Clone)]
128
pub struct UWordBoundIndices<'a> {
129
    start_offset: usize,
130
    iter: UWordBounds<'a>,
131
}
132
133
impl<'a> UWordBoundIndices<'a> {
134
    #[inline]
135
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
136
    ///
137
    /// ```rust
138
    /// # use unicode_segmentation::UnicodeSegmentation;
139
    /// let mut iter = "Hello world".split_word_bound_indices();
140
    /// assert_eq!(iter.as_str(), "Hello world");
141
    /// iter.next();
142
    /// assert_eq!(iter.as_str(), " world");
143
    /// iter.next();
144
    /// assert_eq!(iter.as_str(), "world");
145
    /// ```
146
0
    pub fn as_str(&self) -> &'a str {
147
0
        self.iter.as_str()
148
0
    }
149
}
150
151
impl<'a> Iterator for UWordBoundIndices<'a> {
152
    type Item = (usize, &'a str);
153
154
    #[inline]
155
0
    fn next(&mut self) -> Option<(usize, &'a str)> {
156
0
        self.iter
157
0
            .next()
158
0
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
159
0
    }
160
161
    #[inline]
162
0
    fn size_hint(&self) -> (usize, Option<usize>) {
163
0
        self.iter.size_hint()
164
0
    }
165
}
166
167
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
168
    #[inline]
169
0
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
170
0
        self.iter
171
0
            .next_back()
172
0
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
173
0
    }
174
}
175
176
// state machine for word boundary rules
177
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
178
enum UWordBoundsState {
179
    Start,
180
    Letter,
181
    HLetter,
182
    Numeric,
183
    Katakana,
184
    ExtendNumLet,
185
    Regional(RegionalState),
186
    FormatExtend(FormatExtendType),
187
    Zwj,
188
    Emoji,
189
    WSegSpace,
190
}
191
192
// subtypes for FormatExtend state in UWordBoundsState
193
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
194
enum FormatExtendType {
195
    AcceptAny,
196
    AcceptNone,
197
    RequireLetter,
198
    RequireHLetter,
199
    AcceptQLetter,
200
    RequireNumeric,
201
}
202
203
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
204
enum RegionalState {
205
    Half,
206
    Full,
207
    Unknown,
208
}
209
210
50.1k
fn is_emoji(ch: char) -> bool {
211
    use crate::tables::emoji;
212
50.1k
    emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
213
50.1k
}
214
215
impl<'a> Iterator for UWordBounds<'a> {
216
    type Item = &'a str;
217
218
    #[inline]
219
10.1k
    fn size_hint(&self) -> (usize, Option<usize>) {
220
10.1k
        let slen = self.string.len();
221
10.1k
        (cmp::min(slen, 1), Some(slen))
222
10.1k
    }
<unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint
Line
Count
Source
219
10.1k
    fn size_hint(&self) -> (usize, Option<usize>) {
220
10.1k
        let slen = self.string.len();
221
10.1k
        (cmp::min(slen, 1), Some(slen))
222
10.1k
    }
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint
223
224
    #[inline]
225
24.8M
    fn next(&mut self) -> Option<&'a str> {
226
        use self::FormatExtendType::*;
227
        use self::UWordBoundsState::*;
228
        use crate::tables::word as wd;
229
24.8M
        if self.string.is_empty() {
230
5.67k
            return None;
231
24.8M
        }
232
233
24.8M
        let mut take_curr = true;
234
24.8M
        let mut take_cat = true;
235
24.8M
        let mut idx = 0;
236
24.8M
        let mut saveidx = 0;
237
24.8M
        let mut state = Start;
238
24.8M
        let mut cat = wd::WC_Any;
239
24.8M
        let mut savecat = wd::WC_Any;
240
241
        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
242
24.8M
        let mut skipped_format_extend = false;
243
58.1M
        for (curr, ch) in self.string.char_indices() {
244
58.1M
            idx = curr;
245
            // Whether or not the previous category was ZWJ
246
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
247
58.1M
            let prev_zwj = cat == wd::WC_ZWJ;
248
            // if there's a category cached, grab it
249
58.1M
            cat = match self.cat {
250
55.3M
                None => wd::word_category(ch).2,
251
2.78M
                _ => self.cat.take().unwrap(),
252
            };
253
58.1M
            take_cat = true;
254
255
            // handle rule WB4
256
            // just skip all format, extend, and zwj chars
257
            // note that Start is a special case: if there's a bunch of Format | Extend
258
            // characters at the beginning of a block of text, dump them out as one unit.
259
            //
260
            // (This is not obvious from the wording of UAX#29, but if you look at the
261
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
262
            // then the "correct" interpretation of WB4 becomes apparent.)
263
58.1M
            if state != Start {
264
33.2M
                match cat {
265
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
266
108k
                        skipped_format_extend = true;
267
108k
                        continue;
268
                    }
269
33.1M
                    _ => {}
270
                }
271
24.8M
            }
272
273
            // rule WB3c
274
            // WB4 makes all ZWJs collapse into the previous state
275
            // but you can still be in a Zwj state if you started with Zwj
276
            //
277
            // This means that an EP + Zwj will collapse into EP, which is wrong,
278
            // since EP+EP is not a boundary but EP+ZWJ+EP is
279
            //
280
            // Thus, we separately keep track of whether or not the last character
281
            // was a ZWJ. This is an additional bit of state tracked outside of the
282
            // state enum; the state enum represents the last non-zwj state encountered.
283
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
284
            // however we are in the previous state for the purposes of all other rules.
285
57.9M
            if prev_zwj && is_emoji(ch) {
286
31.5k
                state = Emoji;
287
31.5k
                continue;
288
57.9M
            }
289
            // Don't use `continue` in this match without updating `cat`
290
24.8M
            state = match state {
291
24.8M
                Start if cat == wd::WC_CR => {
292
5.87M
                    idx += match self.get_next_cat(idx) {
293
17.3k
                        Some(wd::WC_LF) => 1, // rule WB3
294
5.85M
                        _ => 0,
295
                    };
296
5.87M
                    break; // rule WB3a
297
                }
298
18.9M
                Start => match cat {
299
1.88M
                    wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
300
21.9k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
301
402k
                    wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
302
1.57k
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13a
303
57.4k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
304
5.05k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
305
6.13M
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
306
5.00k
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
307
389k
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
308
                    _ => {
309
10.0M
                        if let Some(ncat) = self.get_next_cat(idx) {
310
                            // rule WB4
311
10.0M
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
312
                            {
313
31.3k
                                state = FormatExtend(AcceptNone);
314
31.3k
                                self.cat = Some(ncat);
315
31.3k
                                continue;
316
10.0M
                            }
317
1.92k
                        }
318
10.0M
                        break; // rule WB999
319
                    }
320
                },
321
994k
                WSegSpace => match cat {
322
994k
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
323
                    _ => {
324
388k
                        take_curr = false;
325
388k
                        break;
326
                    }
327
                },
328
                Zwj => {
329
                    // We already handle WB3c above.
330
944
                    take_curr = false;
331
944
                    break;
332
                }
333
65.3k
                Letter | HLetter => match cat {
334
15.9M
                    wd::WC_ALetter => Letter,            // rule WB5
335
3.77k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
336
713k
                    wd::WC_Numeric => Numeric,           // rule WB9
337
19.9k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
338
65.3k
                    wd::WC_Double_Quote if state == HLetter => {
339
4.29k
                        savecat = cat;
340
4.29k
                        saveidx = idx;
341
4.29k
                        FormatExtend(RequireHLetter) // rule WB7b
342
                    }
343
55.2k
                    wd::WC_Single_Quote if state == HLetter => {
344
26.0k
                        FormatExtend(AcceptQLetter) // rule WB7a
345
                    }
346
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
347
716k
                        savecat = cat;
348
716k
                        saveidx = idx;
349
716k
                        FormatExtend(RequireLetter) // rule WB6
350
                    }
351
                    _ => {
352
1.76M
                        take_curr = false;
353
1.76M
                        break;
354
                    }
355
                },
356
10.8M
                Numeric => match cat {
357
9.21M
                    wd::WC_Numeric => Numeric,           // rule WB8
358
676k
                    wd::WC_ALetter => Letter,            // rule WB10
359
2.44k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
360
11.0k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
361
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
362
619k
                        savecat = cat;
363
619k
                        saveidx = idx;
364
619k
                        FormatExtend(RequireNumeric) // rule WB12
365
                    }
366
                    _ => {
367
345k
                        take_curr = false;
368
345k
                        break;
369
                    }
370
                },
371
3.35k
                Katakana => match cat {
372
1.03k
                    wd::WC_Katakana => Katakana,         // rule WB13
373
968
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
374
                    _ => {
375
1.35k
                        take_curr = false;
376
1.35k
                        break;
377
                    }
378
                },
379
220k
                ExtendNumLet => match cat {
380
131k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
381
25.4k
                    wd::WC_ALetter => Letter,            // rule WB13b
382
2.60k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
383
9.19k
                    wd::WC_Numeric => Numeric,           // rule WB13b
384
820
                    wd::WC_Katakana => Katakana,         // rule WB13b
385
                    _ => {
386
51.0k
                        take_curr = false;
387
51.0k
                        break;
388
                    }
389
                },
390
                Regional(RegionalState::Full) => {
391
                    // if it reaches here we've gone too far,
392
                    // a full flag can only compose with ZWJ/Extend/Format
393
                    // proceeding it.
394
3.44k
                    take_curr = false;
395
3.44k
                    break;
396
                }
397
5.00k
                Regional(RegionalState::Half) => match cat {
398
3.49k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
399
                    _ => {
400
1.50k
                        take_curr = false;
401
1.50k
                        break;
402
                    }
403
                },
404
                Regional(_) => {
405
0
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
406
                }
407
                Emoji => {
408
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
409
12.8k
                    take_curr = false;
410
12.8k
                    break;
411
                }
412
1.38M
                FormatExtend(t) => match t {
413
                    // handle FormatExtends depending on what type
414
619k
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
415
716k
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
416
96.3k
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
417
4.26k
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
418
                    AcceptNone | AcceptQLetter => {
419
39.8k
                        take_curr = false; // emit all the Format|Extend characters
420
39.8k
                        take_cat = false;
421
39.8k
                        break;
422
                    }
423
186k
                    _ => break, // rewind (in if statement below)
424
                },
425
            }
426
        }
427
428
24.8M
        if let FormatExtend(t) = state {
429
            // we were looking for something and didn't find it; we have to back up
430
226k
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
431
186k
                idx = saveidx;
432
186k
                cat = savecat;
433
186k
                take_curr = false;
434
186k
            }
435
24.6M
        }
436
437
24.8M
        self.cat = if take_curr {
438
22.0M
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
439
22.0M
            None
440
2.79M
        } else if take_cat {
441
2.75M
            Some(cat)
442
        } else {
443
39.8k
            None
444
        };
445
446
24.8M
        let retstr = &self.string[..idx];
447
24.8M
        self.string = &self.string[idx..];
448
24.8M
        Some(retstr)
449
24.8M
    }
<unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next
Line
Count
Source
225
24.8M
    fn next(&mut self) -> Option<&'a str> {
226
        use self::FormatExtendType::*;
227
        use self::UWordBoundsState::*;
228
        use crate::tables::word as wd;
229
24.8M
        if self.string.is_empty() {
230
5.67k
            return None;
231
24.8M
        }
232
233
24.8M
        let mut take_curr = true;
234
24.8M
        let mut take_cat = true;
235
24.8M
        let mut idx = 0;
236
24.8M
        let mut saveidx = 0;
237
24.8M
        let mut state = Start;
238
24.8M
        let mut cat = wd::WC_Any;
239
24.8M
        let mut savecat = wd::WC_Any;
240
241
        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
242
24.8M
        let mut skipped_format_extend = false;
243
58.1M
        for (curr, ch) in self.string.char_indices() {
244
58.1M
            idx = curr;
245
            // Whether or not the previous category was ZWJ
246
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
247
58.1M
            let prev_zwj = cat == wd::WC_ZWJ;
248
            // if there's a category cached, grab it
249
58.1M
            cat = match self.cat {
250
55.3M
                None => wd::word_category(ch).2,
251
2.78M
                _ => self.cat.take().unwrap(),
252
            };
253
58.1M
            take_cat = true;
254
255
            // handle rule WB4
256
            // just skip all format, extend, and zwj chars
257
            // note that Start is a special case: if there's a bunch of Format | Extend
258
            // characters at the beginning of a block of text, dump them out as one unit.
259
            //
260
            // (This is not obvious from the wording of UAX#29, but if you look at the
261
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
262
            // then the "correct" interpretation of WB4 becomes apparent.)
263
58.1M
            if state != Start {
264
33.2M
                match cat {
265
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
266
108k
                        skipped_format_extend = true;
267
108k
                        continue;
268
                    }
269
33.1M
                    _ => {}
270
                }
271
24.8M
            }
272
273
            // rule WB3c
274
            // WB4 makes all ZWJs collapse into the previous state
275
            // but you can still be in a Zwj state if you started with Zwj
276
            //
277
            // This means that an EP + Zwj will collapse into EP, which is wrong,
278
            // since EP+EP is not a boundary but EP+ZWJ+EP is
279
            //
280
            // Thus, we separately keep track of whether or not the last character
281
            // was a ZWJ. This is an additional bit of state tracked outside of the
282
            // state enum; the state enum represents the last non-zwj state encountered.
283
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
284
            // however we are in the previous state for the purposes of all other rules.
285
57.9M
            if prev_zwj && is_emoji(ch) {
286
31.5k
                state = Emoji;
287
31.5k
                continue;
288
57.9M
            }
289
            // Don't use `continue` in this match without updating `cat`
290
24.8M
            state = match state {
291
24.8M
                Start if cat == wd::WC_CR => {
292
5.87M
                    idx += match self.get_next_cat(idx) {
293
17.3k
                        Some(wd::WC_LF) => 1, // rule WB3
294
5.85M
                        _ => 0,
295
                    };
296
5.87M
                    break; // rule WB3a
297
                }
298
18.9M
                Start => match cat {
299
1.88M
                    wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
300
21.9k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
301
402k
                    wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
302
1.57k
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13a
303
57.4k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
304
5.05k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
305
6.13M
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
306
5.00k
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
307
389k
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
308
                    _ => {
309
10.0M
                        if let Some(ncat) = self.get_next_cat(idx) {
310
                            // rule WB4
311
10.0M
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
312
                            {
313
31.3k
                                state = FormatExtend(AcceptNone);
314
31.3k
                                self.cat = Some(ncat);
315
31.3k
                                continue;
316
10.0M
                            }
317
1.92k
                        }
318
10.0M
                        break; // rule WB999
319
                    }
320
                },
321
994k
                WSegSpace => match cat {
322
994k
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
323
                    _ => {
324
388k
                        take_curr = false;
325
388k
                        break;
326
                    }
327
                },
328
                Zwj => {
329
                    // We already handle WB3c above.
330
944
                    take_curr = false;
331
944
                    break;
332
                }
333
65.3k
                Letter | HLetter => match cat {
334
15.9M
                    wd::WC_ALetter => Letter,            // rule WB5
335
3.77k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
336
713k
                    wd::WC_Numeric => Numeric,           // rule WB9
337
19.9k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
338
65.3k
                    wd::WC_Double_Quote if state == HLetter => {
339
4.29k
                        savecat = cat;
340
4.29k
                        saveidx = idx;
341
4.29k
                        FormatExtend(RequireHLetter) // rule WB7b
342
                    }
343
55.2k
                    wd::WC_Single_Quote if state == HLetter => {
344
26.0k
                        FormatExtend(AcceptQLetter) // rule WB7a
345
                    }
346
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
347
716k
                        savecat = cat;
348
716k
                        saveidx = idx;
349
716k
                        FormatExtend(RequireLetter) // rule WB6
350
                    }
351
                    _ => {
352
1.76M
                        take_curr = false;
353
1.76M
                        break;
354
                    }
355
                },
356
10.8M
                Numeric => match cat {
357
9.21M
                    wd::WC_Numeric => Numeric,           // rule WB8
358
676k
                    wd::WC_ALetter => Letter,            // rule WB10
359
2.44k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
360
11.0k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
361
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
362
619k
                        savecat = cat;
363
619k
                        saveidx = idx;
364
619k
                        FormatExtend(RequireNumeric) // rule WB12
365
                    }
366
                    _ => {
367
345k
                        take_curr = false;
368
345k
                        break;
369
                    }
370
                },
371
3.35k
                Katakana => match cat {
372
1.03k
                    wd::WC_Katakana => Katakana,         // rule WB13
373
968
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
374
                    _ => {
375
1.35k
                        take_curr = false;
376
1.35k
                        break;
377
                    }
378
                },
379
220k
                ExtendNumLet => match cat {
380
131k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
381
25.4k
                    wd::WC_ALetter => Letter,            // rule WB13b
382
2.60k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
383
9.19k
                    wd::WC_Numeric => Numeric,           // rule WB13b
384
820
                    wd::WC_Katakana => Katakana,         // rule WB13b
385
                    _ => {
386
51.0k
                        take_curr = false;
387
51.0k
                        break;
388
                    }
389
                },
390
                Regional(RegionalState::Full) => {
391
                    // if it reaches here we've gone too far,
392
                    // a full flag can only compose with ZWJ/Extend/Format
393
                    // proceeding it.
394
3.44k
                    take_curr = false;
395
3.44k
                    break;
396
                }
397
5.00k
                Regional(RegionalState::Half) => match cat {
398
3.49k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
399
                    _ => {
400
1.50k
                        take_curr = false;
401
1.50k
                        break;
402
                    }
403
                },
404
                Regional(_) => {
405
0
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
406
                }
407
                Emoji => {
408
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
409
12.8k
                    take_curr = false;
410
12.8k
                    break;
411
                }
412
1.38M
                FormatExtend(t) => match t {
413
                    // handle FormatExtends depending on what type
414
619k
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
415
716k
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
416
96.3k
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
417
4.26k
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
418
                    AcceptNone | AcceptQLetter => {
419
39.8k
                        take_curr = false; // emit all the Format|Extend characters
420
39.8k
                        take_cat = false;
421
39.8k
                        break;
422
                    }
423
186k
                    _ => break, // rewind (in if statement below)
424
                },
425
            }
426
        }
427
428
24.8M
        if let FormatExtend(t) = state {
429
            // we were looking for something and didn't find it; we have to back up
430
226k
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
431
186k
                idx = saveidx;
432
186k
                cat = savecat;
433
186k
                take_curr = false;
434
186k
            }
435
24.6M
        }
436
437
24.8M
        self.cat = if take_curr {
438
22.0M
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
439
22.0M
            None
440
2.79M
        } else if take_cat {
441
2.75M
            Some(cat)
442
        } else {
443
39.8k
            None
444
        };
445
446
24.8M
        let retstr = &self.string[..idx];
447
24.8M
        self.string = &self.string[idx..];
448
24.8M
        Some(retstr)
449
24.8M
    }
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next
450
}
451
452
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
453
    #[inline]
454
0
    fn next_back(&mut self) -> Option<&'a str> {
455
        use self::FormatExtendType::*;
456
        use self::UWordBoundsState::*;
457
        use crate::tables::word as wd;
458
0
        if self.string.is_empty() {
459
0
            return None;
460
0
        }
461
462
0
        let mut take_curr = true;
463
0
        let mut take_cat = true;
464
0
        let mut idx = self.string.len();
465
0
        idx -= self.string.chars().next_back().unwrap().len_utf8();
466
0
        let mut previdx = idx;
467
0
        let mut saveidx = idx;
468
0
        let mut state = Start;
469
0
        let mut savestate = Start;
470
0
        let mut cat = wd::WC_Any;
471
472
0
        let mut skipped_format_extend = false;
473
474
0
        for (curr, ch) in self.string.char_indices().rev() {
475
0
            previdx = idx;
476
0
            idx = curr;
477
478
            // if there's a category cached, grab it
479
0
            cat = match self.catb {
480
0
                None => wd::word_category(ch).2,
481
0
                _ => self.catb.take().unwrap(),
482
            };
483
0
            take_cat = true;
484
485
            // backward iterator over word boundaries. Mostly the same as the forward
486
            // iterator, with two weirdnesses:
487
            // (1) If we encounter a single quote in the Start state, we have to check for a
488
            //     Hebrew Letter immediately before it.
489
            // (2) Format and Extend char handling takes some gymnastics.
490
491
0
            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
492
                // WB3c has more priority so we should not
493
                // fold in that case
494
0
                if !matches!(state, FormatExtend(_) | Start) {
495
0
                    saveidx = previdx;
496
0
                    savestate = state;
497
0
                    state = FormatExtend(AcceptNone);
498
0
                }
499
500
0
                if state != Start {
501
0
                    continue;
502
0
                }
503
0
            } else if state == FormatExtend(AcceptNone) {
504
0
                // finished a scan of some Format|Extend chars, restore previous state
505
0
                state = savestate;
506
0
                previdx = saveidx;
507
0
                take_cat = false;
508
0
                skipped_format_extend = true;
509
0
            }
510
511
            // Don't use `continue` in this match without updating `catb`
512
0
            state = match state {
513
0
                Start | FormatExtend(AcceptAny) => match cat {
514
0
                    _ if is_emoji(ch) => Zwj,
515
0
                    wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
516
0
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
517
0
                    wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
518
0
                    wd::WC_Katakana => Katakana, // rule WB13, WB13b
519
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
520
0
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
521
                    // rule WB4:
522
0
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
523
                    wd::WC_Single_Quote => {
524
0
                        saveidx = idx;
525
0
                        FormatExtend(AcceptQLetter) // rule WB7a
526
                    }
527
0
                    wd::WC_WSegSpace => WSegSpace,
528
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
529
0
                        if state == Start {
530
0
                            if cat == wd::WC_LF {
531
0
                                idx -= match self.get_prev_cat(idx) {
532
0
                                    Some(wd::WC_CR) => 1, // rule WB3
533
0
                                    _ => 0,
534
                                };
535
0
                            }
536
0
                        } else {
537
0
                            take_curr = false;
538
0
                        }
539
0
                        break; // rule WB3a
540
                    }
541
0
                    _ => break, // rule WB999
542
                },
543
0
                Zwj => match cat {
544
                    // rule WB3c
545
0
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
546
                    _ => {
547
0
                        take_curr = false;
548
0
                        break;
549
                    }
550
                },
551
0
                WSegSpace => match cat {
552
                    // rule WB3d
553
0
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
554
                    _ => {
555
0
                        take_curr = false;
556
0
                        break;
557
                    }
558
                },
559
0
                Letter | HLetter => match cat {
560
0
                    wd::WC_ALetter => Letter,            // rule WB5
561
0
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
562
0
                    wd::WC_Numeric => Numeric,           // rule WB10
563
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
564
0
                    wd::WC_Double_Quote if state == HLetter => {
565
0
                        saveidx = previdx;
566
0
                        FormatExtend(RequireHLetter) // rule WB7c
567
                    }
568
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
569
0
                        saveidx = previdx;
570
0
                        FormatExtend(RequireLetter) // rule WB7
571
                    }
572
                    _ => {
573
0
                        take_curr = false;
574
0
                        break;
575
                    }
576
                },
577
0
                Numeric => match cat {
578
0
                    wd::WC_Numeric => Numeric,           // rule WB8
579
0
                    wd::WC_ALetter => Letter,            // rule WB9
580
0
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB9
581
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
582
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
583
0
                        saveidx = previdx;
584
0
                        FormatExtend(RequireNumeric) // rule WB11
585
                    }
586
                    _ => {
587
0
                        take_curr = false;
588
0
                        break;
589
                    }
590
                },
591
0
                Katakana => match cat {
592
0
                    wd::WC_Katakana => Katakana,         // rule WB13
593
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
594
                    _ => {
595
0
                        take_curr = false;
596
0
                        break;
597
                    }
598
                },
599
0
                ExtendNumLet => match cat {
600
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
601
0
                    wd::WC_ALetter => Letter,            // rule WB13a
602
0
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
603
0
                    wd::WC_Numeric => Numeric,           // rule WB13a
604
0
                    wd::WC_Katakana => Katakana,         // rule WB13a
605
                    _ => {
606
0
                        take_curr = false;
607
0
                        break;
608
                    }
609
                },
610
0
                Regional(mut regional_state) => match cat {
611
                    // rule WB13c
612
                    wd::WC_Regional_Indicator => {
613
0
                        if regional_state == RegionalState::Unknown {
614
0
                            let count = self.string[..previdx]
615
0
                                .chars()
616
0
                                .rev()
617
0
                                .map(|c| wd::word_category(c).2)
618
0
                                .filter(|&c| {
619
0
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
620
0
                                })
621
0
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
622
0
                                .count();
623
0
                            regional_state = if count % 2 == 0 {
624
0
                                RegionalState::Full
625
                            } else {
626
0
                                RegionalState::Half
627
                            };
628
0
                        }
629
0
                        if regional_state == RegionalState::Full {
630
0
                            take_curr = false;
631
0
                            break;
632
                        } else {
633
0
                            Regional(RegionalState::Full)
634
                        }
635
                    }
636
                    _ => {
637
0
                        take_curr = false;
638
0
                        break;
639
                    }
640
                },
641
                Emoji => {
642
0
                    if is_emoji(ch) {
643
                        // rule WB3c
644
0
                        Zwj
645
                    } else {
646
0
                        take_curr = false;
647
0
                        break;
648
                    }
649
                }
650
0
                FormatExtend(t) => match t {
651
0
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
652
0
                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
653
0
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
654
0
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
655
0
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
656
0
                    _ => break,                                         // backtrack will happens
657
                },
658
            }
659
        }
660
661
0
        if let FormatExtend(t) = state {
662
            // if we required something but didn't find it, backtrack
663
0
            if t == RequireLetter
664
0
                || t == RequireHLetter
665
0
                || t == RequireNumeric
666
0
                || t == AcceptNone
667
0
                || t == AcceptQLetter
668
0
            {
669
0
                previdx = saveidx;
670
0
                take_cat = false;
671
0
                take_curr = false;
672
0
            }
673
0
        }
674
675
0
        self.catb = if take_curr {
676
0
            None
677
        } else {
678
0
            idx = previdx;
679
0
            if take_cat {
680
0
                Some(cat)
681
            } else {
682
0
                None
683
            }
684
        };
685
686
0
        let retstr = &self.string[idx..];
687
0
        self.string = &self.string[..idx];
688
0
        Some(retstr)
689
0
    }
690
}
691
692
impl<'a> UWordBounds<'a> {
693
    #[inline]
694
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
695
    ///
696
    /// ```rust
697
    /// # use unicode_segmentation::UnicodeSegmentation;
698
    /// let mut iter = "Hello world".split_word_bounds();
699
    /// assert_eq!(iter.as_str(), "Hello world");
700
    /// iter.next();
701
    /// assert_eq!(iter.as_str(), " world");
702
    /// iter.next();
703
    /// assert_eq!(iter.as_str(), "world");
704
    /// ```
705
0
    pub fn as_str(&self) -> &'a str {
706
0
        self.string
707
0
    }
708
709
    #[inline]
710
15.9M
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
711
        use crate::tables::word as wd;
712
15.9M
        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
713
15.9M
        if nidx < self.string.len() {
714
15.9M
            let nch = self.string[nidx..].chars().next().unwrap();
715
15.9M
            Some(wd::word_category(nch).2)
716
        } else {
717
1.96k
            None
718
        }
719
15.9M
    }
<unicode_segmentation::word::UWordBounds>::get_next_cat
Line
Count
Source
710
15.9M
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
711
        use crate::tables::word as wd;
712
15.9M
        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
713
15.9M
        if nidx < self.string.len() {
714
15.9M
            let nch = self.string[nidx..].chars().next().unwrap();
715
15.9M
            Some(wd::word_category(nch).2)
716
        } else {
717
1.96k
            None
718
        }
719
15.9M
    }
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds>::get_next_cat
720
721
    #[inline]
722
0
    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
723
        use crate::tables::word as wd;
724
0
        if idx > 0 {
725
0
            let nch = self.string[..idx].chars().next_back().unwrap();
726
0
            Some(wd::word_category(nch).2)
727
        } else {
728
0
            None
729
        }
730
0
    }
731
}
732
733
/// Word-boundary iterator specialised for strings made of ASCII characters only.
///
/// Restricting the input to ASCII leaves just a handful of relevant
/// Word_Break values, so the full Unicode state machine is not needed.
/// See <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>.
///
/// | Word_Break value | ASCII code points that belong to it                             |
/// | -----------------| --------------------------------------------------------------- |
/// | CR               | U+000D (CR)                                                     |
/// | LF               | U+000A (LF)                                                     |
/// | Newline          | U+000B (VT), U+000C (FF)                                        |
/// | Single_Quote     | U+0027 (')                                                      |
/// | Double_Quote     | U+0022 (")                                                      |
/// | MidNumLet        | U+002E (.) FULL STOP                                            |
/// | MidLetter        | U+003A (:) COLON                                                |
/// | MidNum           | U+002C (,), U+003B (;)                                          |
/// | Numeric          | U+0030 - U+0039 (0 ... 9)                                       |
/// | ALetter          | U+0041 - U+005A (A ... Z), U+0061 - U+007A (a ... z)            |
/// | ExtendNumLet     | U+005F (_) underscore                                           |
/// | WSegSpace        | U+0020 (SPACE)                                                  |
///
/// MidNumLetQ therefore reduces to U+002E (.) and U+0027 ('), and AHLetter
/// is the same set as ALetter.
///
/// Every other ASCII byte forms a segment of its own (rule WB999).
///
/// Items are `(offset, segment)` pairs, where `offset` is the byte position
/// of the segment in the string the iterator was created from.
#[derive(Debug)]
struct AsciiWordBoundIter<'a> {
    rest: &'a str,
    offset: usize,
}

impl<'a> AsciiWordBoundIter<'a> {
    /// Creates an iterator over the segments of `s`, with offsets counted from 0.
    pub fn new(s: &'a str) -> Self {
        Self { rest: s, offset: 0 }
    }

    /// `true` for bytes that extend a word on their own: ASCII letters,
    /// ASCII digits and the underscore.
    #[inline]
    fn is_core(b: u8) -> bool {
        matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
    }

    /// `true` when `b`, sitting between `prev` and `next`, glues the two
    /// neighbours into one word.
    ///
    /// * WB11/WB12, "Numeric (MidNum | MidNumLetQ) Numeric":
    ///   `.`, `,`, `;` and `'` between two digits ("1,000", "3.14").
    /// * WB6/WB7, "AHLetter (MidLetter | MidNumLetQ) AHLetter":
    ///   `.`, `'` and `:` between two letters ("e.g", "can't").
    #[inline]
    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
        match b {
            // MidNumLetQ: valid between digits as well as between letters
            b'.' | b'\'' => {
                (prev.is_ascii_digit() && next.is_ascii_digit())
                    || (prev.is_ascii_alphabetic() && next.is_ascii_alphabetic())
            }
            // MidNum: digits only
            b',' | b';' => prev.is_ascii_digit() && next.is_ascii_digit(),
            // MidLetter: letters only
            b':' => prev.is_ascii_alphabetic() && next.is_ascii_alphabetic(),
            _ => false,
        }
    }

    /// Removes the first `n` bytes from the remaining text and returns them
    /// together with their offset, advancing `offset` past them.
    #[inline]
    fn split_front(&mut self, n: usize) -> (usize, &'a str) {
        let remaining = self.rest;
        let (segment, tail) = remaining.split_at(n);
        let at = self.offset;
        self.rest = tail;
        self.offset = at + n;
        (at, segment)
    }

    /// Removes everything from byte `start` onwards from the remaining text
    /// and returns it together with its offset. `offset` is left untouched
    /// because the front of the remaining text does not move.
    #[inline]
    fn split_back(&mut self, start: usize) -> (usize, &'a str) {
        let remaining = self.rest;
        let (head, segment) = remaining.split_at(start);
        self.rest = head;
        (self.offset + start, segment)
    }
}

impl<'a> Iterator for AsciiWordBoundIter<'a> {
    type Item = (usize, &'a str);

    /// Returns the next segment from the front, or `None` when nothing is left.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let bytes = self.rest.as_bytes();
        let (&first, tail) = bytes.split_first()?;

        let segment_len = match first {
            // WB3d: a run of spaces (the only ASCII WSegSpace) stays together.
            b' ' => 1 + tail.iter().take_while(|&&b| b == b' ').count(),

            // WB5-WB13b: letters, digits and underscores chain up; an infix
            // byte joins in only when a suitable byte follows it.
            lead if Self::is_core(lead) => {
                let mut end = 1;
                while let Some(&b) = bytes.get(end) {
                    let joins = Self::is_core(b)
                        || bytes
                            .get(end + 1)
                            .map_or(false, |&after| Self::is_infix(b, bytes[end - 1], after));
                    if !joins {
                        break;
                    }
                    end += 1;
                }
                end
            }

            // WB3: CR LF is never split.
            b'\r' if tail.first() == Some(&b'\n') => 2,

            // WB999: anything else stands alone.
            _ => 1,
        };

        Some(self.split_front(segment_len))
    }
}

impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
    /// Returns the next segment from the back, or `None` when nothing is left.
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        let bytes = self.rest.as_bytes();
        let (&last, init) = bytes.split_last()?;

        let segment_start = match last {
            // WB3d: the trailing run of spaces begins right after the last
            // non-space byte (or at 0 when there is none).
            b' ' => init
                .iter()
                .rposition(|&b| b != b' ')
                .map_or(0, |last_non_space| last_non_space + 1),

            // WB5-WB13b: walk left while the byte before the current start is
            // a core byte or an infix between two suitable neighbours.
            trail if Self::is_core(trail) => {
                let mut start = init.len();
                while start > 0 {
                    let b = bytes[start - 1];
                    // At the very first byte there is no left neighbour; the
                    // byte itself stands in for it.
                    let before = if start >= 2 { bytes[start - 2] } else { b };
                    let after = bytes[start]; // first byte already in the segment
                    if !(Self::is_core(b) || Self::is_infix(b, before, after)) {
                        break;
                    }
                    start -= 1;
                }
                start
            }

            // WB3: CR LF is never split.
            b'\n' if init.last() == Some(&b'\r') => init.len() - 1,

            // WB999: anything else stands alone.
            _ => init.len(),
        };

        Some(self.split_back(segment_start))
    }
}
923
924
#[inline]
925
0
fn ascii_word_ok(t: &(usize, &str)) -> bool {
926
0
    has_ascii_alphanumeric(&t.1)
927
0
}
928
#[inline]
929
0
fn unicode_word_ok(t: &(usize, &str)) -> bool {
930
0
    has_alphanumeric(&t.1)
931
0
}
932
933
// Concrete iterator types behind the word iterators. Plain `fn` pointers are
// used for the map/filter callbacks so the adapter types can be written out.

// ASCII path, words only: boundary pairs mapped to their segment, then filtered.
type AsciiWordsIter<'a> = Filter<
934
    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
935
    fn(&&'a str) -> bool,
936
>;
937
// General path, words only: filtered `UWordBounds` segments.
type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
938
// ASCII path, with offsets: filtered `(offset, segment)` pairs.
type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
939
// General path, with offsets: filtered `(offset, segment)` pairs.
type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
940
941
// Iterator stored inside `UnicodeWords`; `new_unicode_words` picks the
// variant depending on whether the input string is entirely ASCII.
#[derive(Debug)]
942
enum WordsIter<'a> {
943
    // Input was all ASCII.
    Ascii(AsciiWordsIter<'a>),
944
    // Input contained at least one non-ASCII character.
    Unicode(UnicodeWordsIter<'a>),
945
}
946
947
// Iterator stored inside `UnicodeWordIndices`; `new_unicode_word_indices`
// picks the variant depending on whether the input string is entirely ASCII.
#[derive(Debug)]
948
enum IndicesIter<'a> {
949
    // Input was all ASCII.
    Ascii(AsciiIndicesIter<'a>),
950
    // Input contained at least one non-ASCII character.
    Unicode(UnicodeIndicesIter<'a>),
951
}
952
953
#[inline]
954
3.24k
pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
955
3.24k
    let inner = if s.is_ascii() {
956
815
        WordsIter::Ascii(new_unicode_words_ascii(s))
957
    } else {
958
2.43k
        WordsIter::Unicode(new_unicode_words_general(s))
959
    };
960
3.24k
    UnicodeWords { inner }
961
3.24k
}
unicode_segmentation::word::new_unicode_words
Line
Count
Source
954
3.24k
pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
955
3.24k
    let inner = if s.is_ascii() {
956
815
        WordsIter::Ascii(new_unicode_words_ascii(s))
957
    } else {
958
2.43k
        WordsIter::Unicode(new_unicode_words_general(s))
959
    };
960
3.24k
    UnicodeWords { inner }
961
3.24k
}
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words
962
963
#[inline]
964
0
pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
965
0
    let inner = if s.is_ascii() {
966
0
        IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok))
967
    } else {
968
0
        IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok))
969
    };
970
0
    UnicodeWordIndices { inner }
971
0
}
972
973
#[inline]
974
5.67k
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
975
5.67k
    UWordBounds {
976
5.67k
        string: s,
977
5.67k
        cat: None,
978
5.67k
        catb: None,
979
5.67k
    }
980
5.67k
}
unicode_segmentation::word::new_word_bounds
Line
Count
Source
974
5.67k
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
975
5.67k
    UWordBounds {
976
5.67k
        string: s,
977
5.67k
        cat: None,
978
5.67k
        catb: None,
979
5.67k
    }
980
5.67k
}
Unexecuted instantiation: unicode_segmentation::word::new_word_bounds
981
982
#[inline]
983
0
pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
984
0
    UWordBoundIndices {
985
0
        start_offset: s.as_ptr() as usize,
986
0
        iter: new_word_bounds(s),
987
0
    }
988
0
}
989
990
// Creates the ASCII boundary iterator over `s`; its items are
// `(offset, segment)` pairs with offsets counted from the start of `s`.
#[inline]
991
815
fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
992
815
    AsciiWordBoundIter::new(s)
993
815
}
unicode_segmentation::word::new_ascii_word_bound_indices
Line
Count
Source
991
815
fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
992
815
    AsciiWordBoundIter::new(s)
993
815
}
Unexecuted instantiation: unicode_segmentation::word::new_ascii_word_bound_indices
994
995
#[inline]
996
6.05M
fn has_alphanumeric(s: &&str) -> bool {
997
    use crate::tables::util::is_alphanumeric;
998
999
6.05M
    s.chars().any(is_alphanumeric)
1000
6.05M
}
unicode_segmentation::word::has_alphanumeric
Line
Count
Source
996
6.05M
fn has_alphanumeric(s: &&str) -> bool {
997
    use crate::tables::util::is_alphanumeric;
998
999
6.05M
    s.chars().any(is_alphanumeric)
1000
6.05M
}
Unexecuted instantiation: unicode_segmentation::word::has_alphanumeric
1001
1002
/// Returns `true` when `s` contains at least one ASCII letter or ASCII digit.
#[inline]
fn has_ascii_alphanumeric(s: &&str) -> bool {
    // Bytes of multi-byte UTF-8 sequences are all >= 0x80, so checking bytes
    // gives the same answer as checking characters.
    s.bytes().any(|b| b.is_ascii_alphanumeric())
}
unicode_segmentation::word::has_ascii_alphanumeric
Line
Count
Source
1003
12.7M
fn has_ascii_alphanumeric(s: &&str) -> bool {
1004
12.7M
    s.chars().any(|c| c.is_ascii_alphanumeric())
1005
12.7M
}
Unexecuted instantiation: unicode_segmentation::word::has_ascii_alphanumeric
1006
1007
/// Drops the offset from an `(offset, segment)` pair and returns the segment.
#[inline(always)]
fn strip_pos(pair: (usize, &str)) -> &str {
    pair.1
}
1011
1012
#[inline]
1013
815
fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
1014
815
    new_ascii_word_bound_indices(s)
1015
815
        .map(strip_pos as fn(_) -> _)
1016
815
        .filter(has_ascii_alphanumeric)
1017
815
}
unicode_segmentation::word::new_unicode_words_ascii
Line
Count
Source
1013
815
fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
1014
815
    new_ascii_word_bound_indices(s)
1015
815
        .map(strip_pos as fn(_) -> _)
1016
815
        .filter(has_ascii_alphanumeric)
1017
815
}
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_ascii
1018
1019
#[inline]
1020
2.43k
fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
1021
2.43k
    new_word_bounds(s).filter(has_alphanumeric)
1022
2.43k
}
unicode_segmentation::word::new_unicode_words_general
Line
Count
Source
1020
2.43k
fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
1021
2.43k
    new_word_bounds(s).filter(has_alphanumeric)
1022
2.43k
}
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_general
1023
1024
#[cfg(test)]
mod tests {
    use crate::word::{
        new_ascii_word_bound_indices, new_unicode_words_ascii, new_unicode_words_general,
        new_word_bound_indices,
    };
    use std::string::String;
    use std::vec::Vec;
    use std::{format, vec};

    use proptest::prelude::*;

    /// U+070F must be categorised as `ALetter`.
    #[test]
    fn test_syriac_abbr_mark() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{70f}');
        assert_eq!(cat, wd::WC_ALetter);
    }

    /// U+06DD must be categorised as `Numeric`.
    #[test]
    fn test_end_of_ayah_cat() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{6dd}');
        assert_eq!(cat, wd::WC_Numeric);
    }

    /// The ASCII boundary iterator yields every segment, punctuation and
    /// whitespace included, paired with its byte offset.
    #[test]
    fn test_ascii_word_bound_indices_various_cases() {
        let s = "Hello, world!";
        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
        let expected = vec![
            (0, "Hello"), // simple letters
            (5, ","),     // punctuation is its own segment
            (6, " "),     // space after comma
            (7, "world"), // letters up to, but not including, '!'
            (12, "!"),    // punctuation at the end
        ];
        assert_eq!(words, expected);
    }

    /// The ASCII word iterator keeps only segments that contain an ASCII
    /// letter or digit.
    #[test]
    fn test_ascii_word_indices_various_cases() {
        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
        let expected = vec![
            "Hello", // simple letters
            "world", // surrounding ", " and "!" are dropped
            "can't", // apostrophe joins letters
            "e.g",   // trailing '.' is not part of the word
            "var1",
            "123,456", // digits+comma+digits
            "foo_bar",
            "example.com",
            "127.0.0.1", // ':' separates the address from the port
            "9090",      // port number
        ];
        assert_eq!(words, expected);
    }

    /// Strategy that yields every code-point from NUL (0) to DEL (127).
    fn ascii_char() -> impl Strategy<Value = char> {
        (0u8..=127).prop_map(|b| b as char)
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10000))]
        /// Fast path must equal general path for any ASCII input.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                   .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
            let uni:  Vec<(usize, &str)> = new_word_bound_indices(&s).collect();

            prop_assert_eq!(fast, uni);
        }

        /// Fast path must equal general path for any ASCII input when both
        /// are iterated in reverse.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices_rev(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                   .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
            let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
            prop_assert_eq!(fast_rev, uni_rev);
        }

        /// The ASCII word filter must keep exactly the words that the
        /// general filter keeps for any ASCII input.
        #[test]
        fn proptest_ascii_words_match_unicode_words(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                   .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast: Vec<&str> = new_unicode_words_ascii(&s).collect();
            let uni:  Vec<&str> = new_unicode_words_general(&s).collect();
            prop_assert_eq!(fast, uni);
        }
    }
}