Coverage Report

Created: 2026-04-12 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/unicode-segmentation/src/word.rs
Line
Count
Source
1
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2
// file at the top-level directory of this distribution and at
3
// http://rust-lang.org/COPYRIGHT.
4
//
5
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8
// option. This file may not be copied, modified, or distributed
9
// except according to those terms.
10
11
use core::cmp;
12
use core::iter::Filter;
13
14
use crate::tables::word::WordCat;
15
16
/// An iterator over the substrings of a string which, after splitting the string on
17
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18
/// contain any characters with the
19
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20
/// property, or with
21
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22
///
23
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24
/// its documentation for more.
25
///
26
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
28
#[derive(Debug)]
29
pub struct UnicodeWords<'a> {
30
    inner: WordsIter<'a>,
31
}
32
33
impl<'a> Iterator for UnicodeWords<'a> {
34
    type Item = &'a str;
35
    #[inline]
36
2.39M
    fn next(&mut self) -> Option<Self::Item> {
37
2.39M
        match &mut self.inner {
38
859k
            WordsIter::Ascii(i) => i.next(),
39
1.53M
            WordsIter::Unicode(i) => i.next(),
40
        }
41
2.39M
    }
<unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next
Line
Count
Source
36
2.39M
    fn next(&mut self) -> Option<Self::Item> {
37
2.39M
        match &mut self.inner {
38
859k
            WordsIter::Ascii(i) => i.next(),
39
1.53M
            WordsIter::Unicode(i) => i.next(),
40
        }
41
2.39M
    }
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::next
42
    #[inline]
43
4.80k
    fn size_hint(&self) -> (usize, Option<usize>) {
44
4.80k
        match &self.inner {
45
1.65k
            WordsIter::Ascii(i) => i.size_hint(),
46
3.14k
            WordsIter::Unicode(i) => i.size_hint(),
47
        }
48
4.80k
    }
<unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint
Line
Count
Source
43
4.80k
    fn size_hint(&self) -> (usize, Option<usize>) {
44
4.80k
        match &self.inner {
45
1.65k
            WordsIter::Ascii(i) => i.size_hint(),
46
3.14k
            WordsIter::Unicode(i) => i.size_hint(),
47
        }
48
4.80k
    }
Unexecuted instantiation: <unicode_segmentation::word::UnicodeWords as core::iter::traits::iterator::Iterator>::size_hint
49
}
50
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
51
    #[inline]
52
0
    fn next_back(&mut self) -> Option<Self::Item> {
53
0
        match &mut self.inner {
54
0
            WordsIter::Ascii(i) => i.next_back(),
55
0
            WordsIter::Unicode(i) => i.next_back(),
56
        }
57
0
    }
58
}
59
60
/// An iterator over the substrings of a string which, after splitting the string on
61
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
62
/// contain any characters with the
63
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
64
/// property, or with
65
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
66
/// This iterator also provides the byte offsets for each substring.
67
///
68
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
69
/// its documentation for more.
70
///
71
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
72
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
73
#[derive(Debug)]
74
pub struct UnicodeWordIndices<'a> {
75
    inner: IndicesIter<'a>,
76
}
77
78
impl<'a> Iterator for UnicodeWordIndices<'a> {
79
    type Item = (usize, &'a str);
80
    #[inline]
81
0
    fn next(&mut self) -> Option<Self::Item> {
82
0
        match &mut self.inner {
83
0
            IndicesIter::Ascii(i) => i.next(),
84
0
            IndicesIter::Unicode(i) => i.next(),
85
        }
86
0
    }
87
    #[inline]
88
0
    fn size_hint(&self) -> (usize, Option<usize>) {
89
0
        match &self.inner {
90
0
            IndicesIter::Ascii(i) => i.size_hint(),
91
0
            IndicesIter::Unicode(i) => i.size_hint(),
92
        }
93
0
    }
94
}
95
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
96
    #[inline]
97
0
    fn next_back(&mut self) -> Option<Self::Item> {
98
0
        match &mut self.inner {
99
0
            IndicesIter::Ascii(i) => i.next_back(),
100
0
            IndicesIter::Unicode(i) => i.next_back(),
101
        }
102
0
    }
103
}
104
105
/// External iterator for a string's
106
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
107
///
108
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
109
/// trait. See its documentation for more.
110
///
111
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
112
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
113
#[derive(Debug, Clone)]
114
pub struct UWordBounds<'a> {
115
    string: &'a str,
116
    cat: Option<WordCat>,
117
    catb: Option<WordCat>,
118
}
119
120
/// External iterator for word boundaries and byte offsets.
121
///
122
/// This struct is created by the [`split_word_bound_indices`] method on the
123
/// [`UnicodeSegmentation`] trait. See its documentation for more.
124
///
125
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
126
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
127
#[derive(Debug, Clone)]
128
pub struct UWordBoundIndices<'a> {
129
    start_offset: usize,
130
    iter: UWordBounds<'a>,
131
}
132
133
impl<'a> UWordBoundIndices<'a> {
134
    #[inline]
135
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
136
    ///
137
    /// ```rust
138
    /// # use unicode_segmentation::UnicodeSegmentation;
139
    /// let mut iter = "Hello world".split_word_bound_indices();
140
    /// assert_eq!(iter.as_str(), "Hello world");
141
    /// iter.next();
142
    /// assert_eq!(iter.as_str(), " world");
143
    /// iter.next();
144
    /// assert_eq!(iter.as_str(), "world");
145
    /// ```
146
0
    pub fn as_str(&self) -> &'a str {
147
0
        self.iter.as_str()
148
0
    }
149
}
150
151
impl<'a> Iterator for UWordBoundIndices<'a> {
152
    type Item = (usize, &'a str);
153
154
    #[inline]
155
0
    fn next(&mut self) -> Option<(usize, &'a str)> {
156
0
        self.iter
157
0
            .next()
158
0
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
159
0
    }
160
161
    #[inline]
162
0
    fn size_hint(&self) -> (usize, Option<usize>) {
163
0
        self.iter.size_hint()
164
0
    }
165
}
166
167
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
168
    #[inline]
169
0
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
170
0
        self.iter
171
0
            .next_back()
172
0
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
173
0
    }
174
}
175
176
// state machine for word boundary rules
177
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
178
enum UWordBoundsState {
179
    Start,
180
    Letter,
181
    HLetter,
182
    Numeric,
183
    Katakana,
184
    ExtendNumLet,
185
    Regional(RegionalState),
186
    FormatExtend(FormatExtendType),
187
    Zwj,
188
    Emoji,
189
    WSegSpace,
190
}
191
192
// subtypes for FormatExtend state in UWordBoundsState
193
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
194
enum FormatExtendType {
195
    AcceptAny,
196
    AcceptNone,
197
    RequireLetter,
198
    RequireHLetter,
199
    AcceptQLetter,
200
    RequireNumeric,
201
}
202
203
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
204
enum RegionalState {
205
    Half,
206
    Full,
207
    Unknown,
208
}
209
210
63.1k
fn is_emoji(ch: char) -> bool {
211
    use crate::tables::emoji;
212
63.1k
    emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
213
63.1k
}
214
215
impl<'a> Iterator for UWordBounds<'a> {
216
    type Item = &'a str;
217
218
    #[inline]
219
10.9k
    fn size_hint(&self) -> (usize, Option<usize>) {
220
10.9k
        let slen = self.string.len();
221
10.9k
        (cmp::min(slen, 1), Some(slen))
222
10.9k
    }
<unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint
Line
Count
Source
219
10.9k
    fn size_hint(&self) -> (usize, Option<usize>) {
220
10.9k
        let slen = self.string.len();
221
10.9k
        (cmp::min(slen, 1), Some(slen))
222
10.9k
    }
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::size_hint
223
224
    #[inline]
225
26.2M
    fn next(&mut self) -> Option<&'a str> {
226
        use self::FormatExtendType::*;
227
        use self::UWordBoundsState::*;
228
        use crate::tables::word as wd;
229
26.2M
        if self.string.is_empty() {
230
6.21k
            return None;
231
26.2M
        }
232
233
26.2M
        let mut take_curr = true;
234
26.2M
        let mut take_cat = true;
235
26.2M
        let mut idx = 0;
236
26.2M
        let mut saveidx = 0;
237
26.2M
        let mut state = Start;
238
26.2M
        let mut cat = wd::WC_Any;
239
26.2M
        let mut savecat = wd::WC_Any;
240
241
        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
242
26.2M
        let mut skipped_format_extend = false;
243
69.9M
        for (curr, ch) in self.string.char_indices() {
244
69.9M
            idx = curr;
245
            // Whether or not the previous category was ZWJ
246
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
247
69.9M
            let prev_zwj = cat == wd::WC_ZWJ;
248
            // if there's a category cached, grab it
249
69.9M
            cat = match self.cat {
250
66.4M
                None => wd::word_category(ch).2,
251
3.53M
                _ => self.cat.take().unwrap(),
252
            };
253
69.9M
            take_cat = true;
254
255
            // handle rule WB4
256
            // just skip all format, extend, and zwj chars
257
            // note that Start is a special case: if there's a bunch of Format | Extend
258
            // characters at the beginning of a block of text, dump them out as one unit.
259
            //
260
            // (This is not obvious from the wording of UAX#29, but if you look at the
261
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
262
            // then the "correct" interpretation of WB4 becomes apparent.)
263
69.9M
            if state != Start {
264
43.7M
                match cat {
265
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
266
253k
                        skipped_format_extend = true;
267
253k
                        continue;
268
                    }
269
43.5M
                    _ => {}
270
                }
271
26.2M
            }
272
273
            // rule WB3c
274
            // WB4 makes all ZWJs collapse into the previous state
275
            // but you can still be in a Zwj state if you started with Zwj
276
            //
277
            // This means that an EP + Zwj will collapse into EP, which is wrong,
278
            // since EP+EP is not a boundary but EP+ZWJ+EP is
279
            //
280
            // Thus, we separately keep track of whether or not the last character
281
            // was a ZWJ. This is an additional bit of state tracked outside of the
282
            // state enum; the state enum represents the last non-zwj state encountered.
283
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
284
            // however we are in the previous state for the purposes of all other rules.
285
69.7M
            if prev_zwj && is_emoji(ch) {
286
41.1k
                state = Emoji;
287
41.1k
                continue;
288
69.7M
            }
289
            // Don't use `continue` in this match without updating `cat`
290
26.2M
            state = match state {
291
26.2M
                Start if cat == wd::WC_CR => {
292
6.60M
                    idx += match self.get_next_cat(idx) {
293
23.1k
                        Some(wd::WC_LF) => 1, // rule WB3
294
6.58M
                        _ => 0,
295
                    };
296
6.60M
                    break; // rule WB3a
297
                }
298
19.5M
                Start => match cat {
299
2.34M
                    wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
300
22.6k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
301
445k
                    wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
302
2.02k
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13a
303
99.5k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
304
5.69k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
305
5.97M
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
306
5.15k
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
307
573k
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
308
                    _ => {
309
10.1M
                        if let Some(ncat) = self.get_next_cat(idx) {
310
                            // rule WB4
311
10.1M
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
312
                            {
313
40.8k
                                state = FormatExtend(AcceptNone);
314
40.8k
                                self.cat = Some(ncat);
315
40.8k
                                continue;
316
10.0M
                            }
317
2.07k
                        }
318
10.0M
                        break; // rule WB999
319
                    }
320
                },
321
3.13M
                WSegSpace => match cat {
322
3.13M
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
323
                    _ => {
324
572k
                        take_curr = false;
325
572k
                        break;
326
                    }
327
                },
328
                Zwj => {
329
                    // We already handle WB3c above.
330
852
                    take_curr = false;
331
852
                    break;
332
                }
333
83.0k
                Letter | HLetter => match cat {
334
21.7M
                    wd::WC_ALetter => Letter,            // rule WB5
335
3.83k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
336
933k
                    wd::WC_Numeric => Numeric,           // rule WB9
337
32.9k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
338
83.0k
                    wd::WC_Double_Quote if state == HLetter => {
339
4.66k
                        savecat = cat;
340
4.66k
                        saveidx = idx;
341
4.66k
                        FormatExtend(RequireHLetter) // rule WB7b
342
                    }
343
90.2k
                    wd::WC_Single_Quote if state == HLetter => {
344
27.1k
                        FormatExtend(AcceptQLetter) // rule WB7a
345
                    }
346
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
347
740k
                        savecat = cat;
348
740k
                        saveidx = idx;
349
740k
                        FormatExtend(RequireLetter) // rule WB6
350
                    }
351
                    _ => {
352
2.05M
                        take_curr = false;
353
2.05M
                        break;
354
                    }
355
                },
356
12.4M
                Numeric => match cat {
357
10.5M
                    wd::WC_Numeric => Numeric,           // rule WB8
358
876k
                    wd::WC_ALetter => Letter,            // rule WB10
359
2.55k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
360
13.6k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
361
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
362
596k
                        savecat = cat;
363
596k
                        saveidx = idx;
364
596k
                        FormatExtend(RequireNumeric) // rule WB12
365
                    }
366
                    _ => {
367
381k
                        take_curr = false;
368
381k
                        break;
369
                    }
370
                },
371
3.49k
                Katakana => match cat {
372
640
                    wd::WC_Katakana => Katakana,         // rule WB13
373
1.04k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
374
                    _ => {
375
1.81k
                        take_curr = false;
376
1.81k
                        break;
377
                    }
378
                },
379
316k
                ExtendNumLet => match cat {
380
169k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
381
48.2k
                    wd::WC_ALetter => Letter,            // rule WB13b
382
2.66k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
383
10.5k
                    wd::WC_Numeric => Numeric,           // rule WB13b
384
906
                    wd::WC_Katakana => Katakana,         // rule WB13b
385
                    _ => {
386
84.6k
                        take_curr = false;
387
84.6k
                        break;
388
                    }
389
                },
390
                Regional(RegionalState::Full) => {
391
                    // if it reaches here we've gone too far,
392
                    // a full flag can only compose with ZWJ/Extend/Format
393
                    // proceeding it.
394
4.06k
                    take_curr = false;
395
4.06k
                    break;
396
                }
397
5.64k
                Regional(RegionalState::Half) => match cat {
398
4.11k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
399
                    _ => {
400
1.53k
                        take_curr = false;
401
1.53k
                        break;
402
                    }
403
                },
404
                Regional(_) => {
405
0
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
406
                }
407
                Emoji => {
408
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
409
14.5k
                    take_curr = false;
410
14.5k
                    break;
411
                }
412
1.39M
                FormatExtend(t) => match t {
413
                    // handle FormatExtends depending on what type
414
596k
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
415
740k
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
416
252k
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
417
4.64k
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
418
                    AcceptNone | AcceptQLetter => {
419
48.5k
                        take_curr = false; // emit all the Format|Extend characters
420
48.5k
                        take_cat = false;
421
48.5k
                        break;
422
                    }
423
367k
                    _ => break, // rewind (in if statement below)
424
                },
425
            }
426
        }
427
428
26.2M
        if let FormatExtend(t) = state {
429
            // we were looking for something and didn't find it; we have to back up
430
416k
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
431
367k
                idx = saveidx;
432
367k
                cat = savecat;
433
367k
                take_curr = false;
434
367k
            }
435
25.7M
        }
436
437
26.2M
        self.cat = if take_curr {
438
22.6M
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
439
22.6M
            None
440
3.53M
        } else if take_cat {
441
3.48M
            Some(cat)
442
        } else {
443
48.5k
            None
444
        };
445
446
26.2M
        let retstr = &self.string[..idx];
447
26.2M
        self.string = &self.string[idx..];
448
26.2M
        Some(retstr)
449
26.2M
    }
<unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next
Line
Count
Source
225
26.2M
    fn next(&mut self) -> Option<&'a str> {
226
        use self::FormatExtendType::*;
227
        use self::UWordBoundsState::*;
228
        use crate::tables::word as wd;
229
26.2M
        if self.string.is_empty() {
230
6.21k
            return None;
231
26.2M
        }
232
233
26.2M
        let mut take_curr = true;
234
26.2M
        let mut take_cat = true;
235
26.2M
        let mut idx = 0;
236
26.2M
        let mut saveidx = 0;
237
26.2M
        let mut state = Start;
238
26.2M
        let mut cat = wd::WC_Any;
239
26.2M
        let mut savecat = wd::WC_Any;
240
241
        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
242
26.2M
        let mut skipped_format_extend = false;
243
69.9M
        for (curr, ch) in self.string.char_indices() {
244
69.9M
            idx = curr;
245
            // Whether or not the previous category was ZWJ
246
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
247
69.9M
            let prev_zwj = cat == wd::WC_ZWJ;
248
            // if there's a category cached, grab it
249
69.9M
            cat = match self.cat {
250
66.4M
                None => wd::word_category(ch).2,
251
3.53M
                _ => self.cat.take().unwrap(),
252
            };
253
69.9M
            take_cat = true;
254
255
            // handle rule WB4
256
            // just skip all format, extend, and zwj chars
257
            // note that Start is a special case: if there's a bunch of Format | Extend
258
            // characters at the beginning of a block of text, dump them out as one unit.
259
            //
260
            // (This is not obvious from the wording of UAX#29, but if you look at the
261
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
262
            // then the "correct" interpretation of WB4 becomes apparent.)
263
69.9M
            if state != Start {
264
43.7M
                match cat {
265
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
266
253k
                        skipped_format_extend = true;
267
253k
                        continue;
268
                    }
269
43.5M
                    _ => {}
270
                }
271
26.2M
            }
272
273
            // rule WB3c
274
            // WB4 makes all ZWJs collapse into the previous state
275
            // but you can still be in a Zwj state if you started with Zwj
276
            //
277
            // This means that an EP + Zwj will collapse into EP, which is wrong,
278
            // since EP+EP is not a boundary but EP+ZWJ+EP is
279
            //
280
            // Thus, we separately keep track of whether or not the last character
281
            // was a ZWJ. This is an additional bit of state tracked outside of the
282
            // state enum; the state enum represents the last non-zwj state encountered.
283
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
284
            // however we are in the previous state for the purposes of all other rules.
285
69.7M
            if prev_zwj && is_emoji(ch) {
286
41.1k
                state = Emoji;
287
41.1k
                continue;
288
69.7M
            }
289
            // Don't use `continue` in this match without updating `cat`
290
26.2M
            state = match state {
291
26.2M
                Start if cat == wd::WC_CR => {
292
6.60M
                    idx += match self.get_next_cat(idx) {
293
23.1k
                        Some(wd::WC_LF) => 1, // rule WB3
294
6.58M
                        _ => 0,
295
                    };
296
6.60M
                    break; // rule WB3a
297
                }
298
19.5M
                Start => match cat {
299
2.34M
                    wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
300
22.6k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
301
445k
                    wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
302
2.02k
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13a
303
99.5k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
304
5.69k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
305
5.97M
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
306
5.15k
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
307
573k
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
308
                    _ => {
309
10.1M
                        if let Some(ncat) = self.get_next_cat(idx) {
310
                            // rule WB4
311
10.1M
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
312
                            {
313
40.8k
                                state = FormatExtend(AcceptNone);
314
40.8k
                                self.cat = Some(ncat);
315
40.8k
                                continue;
316
10.0M
                            }
317
2.07k
                        }
318
10.0M
                        break; // rule WB999
319
                    }
320
                },
321
3.13M
                WSegSpace => match cat {
322
3.13M
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
323
                    _ => {
324
572k
                        take_curr = false;
325
572k
                        break;
326
                    }
327
                },
328
                Zwj => {
329
                    // We already handle WB3c above.
330
852
                    take_curr = false;
331
852
                    break;
332
                }
333
83.0k
                Letter | HLetter => match cat {
334
21.7M
                    wd::WC_ALetter => Letter,            // rule WB5
335
3.83k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
336
933k
                    wd::WC_Numeric => Numeric,           // rule WB9
337
32.9k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
338
83.0k
                    wd::WC_Double_Quote if state == HLetter => {
339
4.66k
                        savecat = cat;
340
4.66k
                        saveidx = idx;
341
4.66k
                        FormatExtend(RequireHLetter) // rule WB7b
342
                    }
343
90.2k
                    wd::WC_Single_Quote if state == HLetter => {
344
27.1k
                        FormatExtend(AcceptQLetter) // rule WB7a
345
                    }
346
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
347
740k
                        savecat = cat;
348
740k
                        saveidx = idx;
349
740k
                        FormatExtend(RequireLetter) // rule WB6
350
                    }
351
                    _ => {
352
2.05M
                        take_curr = false;
353
2.05M
                        break;
354
                    }
355
                },
356
12.4M
                Numeric => match cat {
357
10.5M
                    wd::WC_Numeric => Numeric,           // rule WB8
358
876k
                    wd::WC_ALetter => Letter,            // rule WB10
359
2.55k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
360
13.6k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
361
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
362
596k
                        savecat = cat;
363
596k
                        saveidx = idx;
364
596k
                        FormatExtend(RequireNumeric) // rule WB12
365
                    }
366
                    _ => {
367
381k
                        take_curr = false;
368
381k
                        break;
369
                    }
370
                },
371
3.49k
                Katakana => match cat {
372
640
                    wd::WC_Katakana => Katakana,         // rule WB13
373
1.04k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
374
                    _ => {
375
1.81k
                        take_curr = false;
376
1.81k
                        break;
377
                    }
378
                },
379
316k
                ExtendNumLet => match cat {
380
169k
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
381
48.2k
                    wd::WC_ALetter => Letter,            // rule WB13b
382
2.66k
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
383
10.5k
                    wd::WC_Numeric => Numeric,           // rule WB13b
384
906
                    wd::WC_Katakana => Katakana,         // rule WB13b
385
                    _ => {
386
84.6k
                        take_curr = false;
387
84.6k
                        break;
388
                    }
389
                },
390
                Regional(RegionalState::Full) => {
391
                    // if it reaches here we've gone too far,
392
                    // a full flag can only compose with ZWJ/Extend/Format
393
                    // proceeding it.
394
4.06k
                    take_curr = false;
395
4.06k
                    break;
396
                }
397
5.64k
                Regional(RegionalState::Half) => match cat {
398
4.11k
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
399
                    _ => {
400
1.53k
                        take_curr = false;
401
1.53k
                        break;
402
                    }
403
                },
404
                Regional(_) => {
405
0
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
406
                }
407
                Emoji => {
408
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
409
14.5k
                    take_curr = false;
410
14.5k
                    break;
411
                }
412
1.39M
                FormatExtend(t) => match t {
413
                    // handle FormatExtends depending on what type
414
596k
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
415
740k
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
416
252k
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
417
4.64k
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
418
                    AcceptNone | AcceptQLetter => {
419
48.5k
                        take_curr = false; // emit all the Format|Extend characters
420
48.5k
                        take_cat = false;
421
48.5k
                        break;
422
                    }
423
367k
                    _ => break, // rewind (in if statement below)
424
                },
425
            }
426
        }
427
428
26.2M
        if let FormatExtend(t) = state {
429
            // we were looking for something and didn't find it; we have to back up
430
416k
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
431
367k
                idx = saveidx;
432
367k
                cat = savecat;
433
367k
                take_curr = false;
434
367k
            }
435
25.7M
        }
436
437
26.2M
        self.cat = if take_curr {
438
22.6M
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
439
22.6M
            None
440
3.53M
        } else if take_cat {
441
3.48M
            Some(cat)
442
        } else {
443
48.5k
            None
444
        };
445
446
26.2M
        let retstr = &self.string[..idx];
447
26.2M
        self.string = &self.string[idx..];
448
26.2M
        Some(retstr)
449
26.2M
    }
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds as core::iter::traits::iterator::Iterator>::next
450
}
451
452
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
453
    #[inline]
454
0
    fn next_back(&mut self) -> Option<&'a str> {
455
        use self::FormatExtendType::*;
456
        use self::UWordBoundsState::*;
457
        use crate::tables::word as wd;
458
0
        if self.string.is_empty() {
459
0
            return None;
460
0
        }
461
462
0
        let mut take_curr = true;
463
0
        let mut take_cat = true;
464
0
        let mut idx = self.string.len();
465
0
        idx -= self.string.chars().next_back().unwrap().len_utf8();
466
0
        let mut previdx = idx;
467
0
        let mut saveidx = idx;
468
0
        let mut state = Start;
469
0
        let mut savestate = Start;
470
0
        let mut cat = wd::WC_Any;
471
472
        // WB3c is context-sensitive (ZWJ + Extended_Pictographic),
473
        // while WB4 collapses Extend/Format and would otherwise hide that context.
474
        // We therefore keep this context outside the main state machine:
475
        // whether the nearest non-(Extend|Format) char to the right is emoji.
476
0
        let mut right_significant_is_emoji: bool = false;
477
478
0
        let mut skipped_format_extend = false;
479
480
0
        for (curr, ch) in self.string.char_indices().rev() {
481
0
            previdx = idx;
482
0
            idx = curr;
483
484
            // if there's a category cached, grab it
485
0
            cat = match self.catb {
486
0
                None => wd::word_category(ch).2,
487
0
                _ => self.catb.take().unwrap(),
488
            };
489
0
            take_cat = true;
490
491
            // backward iterator over word boundaries. Mostly the same as the forward
492
            // iterator, with two weirdnesses:
493
            // (1) If we encounter a single quote in the Start state, we have to check for a
494
            //     Hebrew Letter immediately before it.
495
            // (2) Format and Extend char handling takes some gymnastics.
496
497
            // Reverse-direction WB3c check: when we encounter ZWJ and the nearest
498
            // significant right-side char is emoji, do not break here.
499
0
            if cat == wd::WC_ZWJ && state != Zwj && right_significant_is_emoji {
500
0
                continue;
501
0
            }
502
503
            // Keep the right-side WB3c context up to date as we move left.
504
            // Ignore Extend/Format here to mirror WB4 collapsing behavior.
505
0
            if cat != wd::WC_Extend && cat != wd::WC_Format {
506
0
                right_significant_is_emoji = is_emoji(ch);
507
0
            }
508
509
0
            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
510
                // WB3c has more priority so we should not
511
                // fold in that case
512
0
                if !matches!(state, FormatExtend(_) | Start) {
513
0
                    saveidx = previdx;
514
0
                    savestate = state;
515
0
                    state = FormatExtend(AcceptNone);
516
0
                }
517
518
0
                if state != Start {
519
0
                    continue;
520
0
                }
521
0
            } else if state == FormatExtend(AcceptNone) {
522
0
                // finished a scan of some Format|Extend chars, restore previous state
523
0
                state = savestate;
524
0
                previdx = saveidx;
525
0
                take_cat = false;
526
0
                skipped_format_extend = true;
527
0
            }
528
529
            // Don't use `continue` in this match without updating `catb`
530
0
            state = match state {
531
0
                Start | FormatExtend(AcceptAny) => match cat {
532
0
                    wd::WC_ALetter => Letter,            // rule WB5, WB7, WB10, WB13b
533
0
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB7, WB7c, WB10, WB13b
534
0
                    wd::WC_Numeric => Numeric,           // rule WB8, WB9, WB11, WB13b
535
0
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13b
536
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
537
0
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
538
                    // rule WB4:
539
0
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
540
                    wd::WC_Single_Quote => {
541
0
                        saveidx = idx;
542
0
                        FormatExtend(AcceptQLetter) // rule WB7a
543
                    }
544
0
                    wd::WC_WSegSpace => WSegSpace,
545
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
546
0
                        if state == Start {
547
0
                            if cat == wd::WC_LF {
548
0
                                idx -= match self.get_prev_cat(idx) {
549
0
                                    Some(wd::WC_CR) => 1, // rule WB3
550
0
                                    _ => 0,
551
                                };
552
0
                            }
553
0
                        } else {
554
0
                            take_curr = false;
555
0
                        }
556
0
                        break; // rule WB3a
557
                    }
558
0
                    _ if is_emoji(ch) => Zwj,
559
0
                    _ => break, // rule WB999
560
                },
561
0
                Zwj => match cat {
562
                    // rule WB3c
563
0
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
564
                    _ => {
565
0
                        take_curr = false;
566
0
                        break;
567
                    }
568
                },
569
0
                WSegSpace => match cat {
570
                    // rule WB3d
571
0
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
572
                    _ => {
573
0
                        take_curr = false;
574
0
                        break;
575
                    }
576
                },
577
0
                Letter | HLetter => match cat {
578
0
                    wd::WC_ALetter => Letter,            // rule WB5
579
0
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
580
0
                    wd::WC_Numeric => Numeric,           // rule WB10
581
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
582
0
                    wd::WC_Double_Quote if state == HLetter => {
583
0
                        saveidx = previdx;
584
0
                        FormatExtend(RequireHLetter) // rule WB7c
585
                    }
586
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
587
0
                        saveidx = previdx;
588
0
                        FormatExtend(RequireLetter) // rule WB7
589
                    }
590
                    _ => {
591
0
                        take_curr = false;
592
0
                        break;
593
                    }
594
                },
595
0
                Numeric => match cat {
596
0
                    wd::WC_Numeric => Numeric,           // rule WB8
597
0
                    wd::WC_ALetter => Letter,            // rule WB9
598
0
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB9
599
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
600
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
601
0
                        saveidx = previdx;
602
0
                        FormatExtend(RequireNumeric) // rule WB11
603
                    }
604
                    _ => {
605
0
                        take_curr = false;
606
0
                        break;
607
                    }
608
                },
609
0
                Katakana => match cat {
610
0
                    wd::WC_Katakana => Katakana,         // rule WB13
611
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
612
                    _ => {
613
0
                        take_curr = false;
614
0
                        break;
615
                    }
616
                },
617
0
                ExtendNumLet => match cat {
618
0
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
619
0
                    wd::WC_ALetter => Letter,            // rule WB13a
620
0
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
621
0
                    wd::WC_Numeric => Numeric,           // rule WB13a
622
0
                    wd::WC_Katakana => Katakana,         // rule WB13a
623
                    _ => {
624
0
                        take_curr = false;
625
0
                        break;
626
                    }
627
                },
628
0
                Regional(mut regional_state) => match cat {
629
                    // rule WB13c
630
                    wd::WC_Regional_Indicator => {
631
0
                        if regional_state == RegionalState::Unknown {
632
0
                            let count = self.string[..previdx]
633
0
                                .chars()
634
0
                                .rev()
635
0
                                .map(|c| wd::word_category(c).2)
636
0
                                .filter(|&c| {
637
0
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
638
0
                                })
639
0
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
640
0
                                .count();
641
0
                            regional_state = if count % 2 == 0 {
642
0
                                RegionalState::Full
643
                            } else {
644
0
                                RegionalState::Half
645
                            };
646
0
                        }
647
0
                        if regional_state == RegionalState::Full {
648
0
                            take_curr = false;
649
0
                            break;
650
                        } else {
651
0
                            Regional(RegionalState::Full)
652
                        }
653
                    }
654
                    _ => {
655
0
                        take_curr = false;
656
0
                        break;
657
                    }
658
                },
659
                Emoji => {
660
0
                    if is_emoji(ch) {
661
                        // rule WB3c
662
0
                        Zwj
663
                    } else {
664
0
                        take_curr = false;
665
0
                        break;
666
                    }
667
                }
668
0
                FormatExtend(t) => match t {
669
0
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
670
0
                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
671
0
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
672
0
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
673
0
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
674
0
                    _ => break,                                         // backtrack will happens
675
                },
676
            }
677
        }
678
679
0
        if let FormatExtend(t) = state {
680
            // if we required something but didn't find it, backtrack
681
0
            if t == RequireLetter
682
0
                || t == RequireHLetter
683
0
                || t == RequireNumeric
684
0
                || t == AcceptNone
685
0
                || t == AcceptQLetter
686
0
            {
687
0
                previdx = saveidx;
688
0
                take_cat = false;
689
0
                take_curr = false;
690
0
            }
691
0
        }
692
693
0
        self.catb = if take_curr {
694
0
            None
695
        } else {
696
0
            idx = previdx;
697
0
            if take_cat {
698
0
                Some(cat)
699
            } else {
700
0
                None
701
            }
702
        };
703
704
0
        let retstr = &self.string[idx..];
705
0
        self.string = &self.string[..idx];
706
0
        Some(retstr)
707
0
    }
708
}
709
710
impl<'a> UWordBounds<'a> {
711
    #[inline]
712
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
713
    ///
714
    /// ```rust
715
    /// # use unicode_segmentation::UnicodeSegmentation;
716
    /// let mut iter = "Hello world".split_word_bounds();
717
    /// assert_eq!(iter.as_str(), "Hello world");
718
    /// iter.next();
719
    /// assert_eq!(iter.as_str(), " world");
720
    /// iter.next();
721
    /// assert_eq!(iter.as_str(), "world");
722
    /// ```
723
0
    pub fn as_str(&self) -> &'a str {
724
0
        self.string
725
0
    }
726
727
    #[inline]
728
16.7M
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
729
        use crate::tables::word as wd;
730
16.7M
        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
731
16.7M
        if nidx < self.string.len() {
732
16.7M
            let nch = self.string[nidx..].chars().next().unwrap();
733
16.7M
            Some(wd::word_category(nch).2)
734
        } else {
735
2.13k
            None
736
        }
737
16.7M
    }
<unicode_segmentation::word::UWordBounds>::get_next_cat
Line
Count
Source
728
16.7M
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
729
        use crate::tables::word as wd;
730
16.7M
        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
731
16.7M
        if nidx < self.string.len() {
732
16.7M
            let nch = self.string[nidx..].chars().next().unwrap();
733
16.7M
            Some(wd::word_category(nch).2)
734
        } else {
735
2.13k
            None
736
        }
737
16.7M
    }
Unexecuted instantiation: <unicode_segmentation::word::UWordBounds>::get_next_cat
738
739
    #[inline]
740
0
    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
741
        use crate::tables::word as wd;
742
0
        if idx > 0 {
743
0
            let nch = self.string[..idx].chars().next_back().unwrap();
744
0
            Some(wd::word_category(nch).2)
745
        } else {
746
0
            None
747
        }
748
0
    }
749
}
750
751
/// ASCII‑fast‑path word‑boundary iterator for strings that contain only ASCII characters.
752
///
753
/// Since we handle only ASCII characters, we can use a much simpler set of
754
/// word break values than the full Unicode algorithm.
755
/// https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values
756
///
757
/// | Word_Break value | ASCII code points that belong to it                             |
758
/// | -----------------| --------------------------------------------------------------- |
759
/// | CR               | U+000D (CR)                                                     |
760
/// | LF               | U+000A (LF)                                                     |
761
/// | Newline          | U+000B (VT), U+000C (FF)                                        |
762
/// | Single_Quote     | U+0027 (')                                                      |
763
/// | Double_Quote     | U+0022 (")                                                      |
764
/// | MidNumLet        | U+002E (.) FULL STOP                                            |
765
/// | MidLetter        | U+003A (:) COLON                                                |
766
/// | MidNum           | U+002C (,), U+003B (;)                                          |
767
/// | Numeric          | U+0030 – U+0039 (0 … 9)                                         |
768
/// | ALetter          | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z)                |
769
/// | ExtendNumLet     | U+005F (_) underscore                                           |
770
/// | WSegSpace        | U+0020 (SPACE)                                                  |
771
///
772
/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (')
773
/// AHLetter is the same as ALetter, so we don't need to distinguish it.
774
///
775
/// Any other single ASCII byte is its own boundary (the default WB999).
776
#[derive(Debug)]
777
struct AsciiWordBoundIter<'a> {
778
    rest: &'a str,
779
    offset: usize,
780
}
781
782
impl<'a> AsciiWordBoundIter<'a> {
783
900
    pub fn new(s: &'a str) -> Self {
784
900
        AsciiWordBoundIter { rest: s, offset: 0 }
785
900
    }
786
787
    #[inline]
788
22.8M
    fn is_core(b: u8) -> bool {
789
22.8M
        b.is_ascii_alphanumeric() || b == b'_'
790
22.8M
    }
<unicode_segmentation::word::AsciiWordBoundIter>::is_core
Line
Count
Source
788
22.8M
    fn is_core(b: u8) -> bool {
789
22.8M
        b.is_ascii_alphanumeric() || b == b'_'
790
22.8M
    }
Unexecuted instantiation: <unicode_segmentation::word::AsciiWordBoundIter>::is_core
791
792
    #[inline]
793
1.45M
    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
794
366k
        match b {
795
            // Numeric separators such as "1,000" or "3.14" (WB11/WB12)
796
            //
797
            // "Numeric (MidNum | MidNumLetQ) Numeric"
798
652k
            b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true,
799
800
            // Dot or colon inside an alphabetic word ("e.g.", "http://") (WB6/WB7)
801
            //
802
            // "(MidLetter | MidNumLetQ) AHLetter (MidLetter | MidNumLetQ)"
803
            // MidLetter  = b':'
804
            // MidNumLetQ = b'.' | b'\''
805
366k
            b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
806
872k
            _ => false,
807
        }
808
1.45M
    }
<unicode_segmentation::word::AsciiWordBoundIter>::is_infix
Line
Count
Source
793
1.45M
    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
794
366k
        match b {
795
            // Numeric separators such as "1,000" or "3.14" (WB11/WB12)
796
            //
797
            // "Numeric (MidNum | MidNumLetQ) Numeric"
798
652k
            b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true,
799
800
            // Dot or colon inside an alphabetic word ("e.g.", "http://") (WB6/WB7)
801
            //
802
            // "(MidLetter | MidNumLetQ) AHLetter (MidLetter | MidNumLetQ)"
803
            // MidLetter  = b':'
804
            // MidNumLetQ = b'.' | b'\''
805
366k
            b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
806
872k
            _ => false,
807
        }
808
1.45M
    }
Unexecuted instantiation: <unicode_segmentation::word::AsciiWordBoundIter>::is_infix
809
}
810
811
impl<'a> Iterator for AsciiWordBoundIter<'a> {
812
    type Item = (usize, &'a str);
813
814
    #[inline]
815
11.5M
    fn next(&mut self) -> Option<Self::Item> {
816
11.5M
        if self.rest.is_empty() {
817
900
            return None;
818
11.5M
        }
819
820
11.5M
        let bytes = self.rest.as_bytes();
821
11.5M
        let len = bytes.len();
822
823
        // 1) Keep horizontal whitespace together.
824
        // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
825
11.5M
        if bytes[0] == b' ' {
826
347k
            let mut i = 1;
827
1.78M
            while i < len && bytes[i] == b' ' {
828
1.43M
                i += 1;
829
1.43M
            }
830
347k
            let word = &self.rest[..i];
831
347k
            let pos = self.offset;
832
347k
            self.rest = &self.rest[i..];
833
347k
            self.offset += i;
834
347k
            return Some((pos, word));
835
11.1M
        }
836
837
        // 2) Core-run (letters/digits/underscore + infix)
838
        // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
839
11.1M
        if Self::is_core(bytes[0]) {
840
872k
            let mut i = 1;
841
11.7M
            while i < len {
842
11.7M
                let b = bytes[i];
843
11.7M
                if Self::is_core(b)
844
1.45M
                    || (i + 1 < len && Self::is_infix(b, bytes[i - 1], bytes[i + 1]))
845
10.8M
                {
846
10.8M
                    i += 1;
847
10.8M
                } else {
848
872k
                    break;
849
                }
850
            }
851
872k
            let word = &self.rest[..i];
852
872k
            let pos = self.offset;
853
872k
            self.rest = &self.rest[i..];
854
872k
            self.offset += i;
855
872k
            return Some((pos, word));
856
10.2M
        }
857
858
        // 3) Do not break within CRLF.
859
        // Spec: WB3 treats CR+LF as a single non‑breaking pair.
860
10.2M
        if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' {
861
4.38k
            let word = &self.rest[..2];
862
4.38k
            let pos = self.offset;
863
4.38k
            self.rest = &self.rest[2..];
864
4.38k
            self.offset += 2;
865
4.38k
            Some((pos, word))
866
        } else {
867
            // 4) Otherwise, break everywhere
868
            // Spec: the catch‑all rule WB999.
869
10.2M
            let word = &self.rest[..1];
870
10.2M
            let pos = self.offset;
871
10.2M
            self.rest = &self.rest[1..];
872
10.2M
            self.offset += 1;
873
10.2M
            Some((pos, word))
874
        }
875
11.5M
    }
<unicode_segmentation::word::AsciiWordBoundIter as core::iter::traits::iterator::Iterator>::next
Line
Count
Source
815
11.5M
    fn next(&mut self) -> Option<Self::Item> {
816
11.5M
        if self.rest.is_empty() {
817
900
            return None;
818
11.5M
        }
819
820
11.5M
        let bytes = self.rest.as_bytes();
821
11.5M
        let len = bytes.len();
822
823
        // 1) Keep horizontal whitespace together.
824
        // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
825
11.5M
        if bytes[0] == b' ' {
826
347k
            let mut i = 1;
827
1.78M
            while i < len && bytes[i] == b' ' {
828
1.43M
                i += 1;
829
1.43M
            }
830
347k
            let word = &self.rest[..i];
831
347k
            let pos = self.offset;
832
347k
            self.rest = &self.rest[i..];
833
347k
            self.offset += i;
834
347k
            return Some((pos, word));
835
11.1M
        }
836
837
        // 2) Core-run (letters/digits/underscore + infix)
838
        // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
839
11.1M
        if Self::is_core(bytes[0]) {
840
872k
            let mut i = 1;
841
11.7M
            while i < len {
842
11.7M
                let b = bytes[i];
843
11.7M
                if Self::is_core(b)
844
1.45M
                    || (i + 1 < len && Self::is_infix(b, bytes[i - 1], bytes[i + 1]))
845
10.8M
                {
846
10.8M
                    i += 1;
847
10.8M
                } else {
848
872k
                    break;
849
                }
850
            }
851
872k
            let word = &self.rest[..i];
852
872k
            let pos = self.offset;
853
872k
            self.rest = &self.rest[i..];
854
872k
            self.offset += i;
855
872k
            return Some((pos, word));
856
10.2M
        }
857
858
        // 3) Do not break within CRLF.
859
        // Spec: WB3 treats CR+LF as a single non‑breaking pair.
860
10.2M
        if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' {
861
4.38k
            let word = &self.rest[..2];
862
4.38k
            let pos = self.offset;
863
4.38k
            self.rest = &self.rest[2..];
864
4.38k
            self.offset += 2;
865
4.38k
            Some((pos, word))
866
        } else {
867
            // 4) Otherwise, break everywhere
868
            // Spec: the catch‑all rule WB999.
869
10.2M
            let word = &self.rest[..1];
870
10.2M
            let pos = self.offset;
871
10.2M
            self.rest = &self.rest[1..];
872
10.2M
            self.offset += 1;
873
10.2M
            Some((pos, word))
874
        }
875
11.5M
    }
Unexecuted instantiation: <unicode_segmentation::word::AsciiWordBoundIter as core::iter::traits::iterator::Iterator>::next
876
}
877
878
impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
879
0
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
880
0
        let rest = self.rest;
881
0
        if rest.is_empty() {
882
0
            return None;
883
0
        }
884
0
        let bytes = rest.as_bytes();
885
0
        let len = bytes.len();
886
887
        // 1) Group runs of spaces
888
        // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
889
0
        if bytes[len - 1] == b' ' {
890
            // find start of this last run of spaces
891
0
            let mut start = len - 1;
892
0
            while start > 0 && bytes[start - 1] == b' ' {
893
0
                start -= 1;
894
0
            }
895
0
            let word = &rest[start..];
896
0
            let pos = self.offset + start;
897
0
            self.rest = &rest[..start];
898
0
            return Some((pos, word));
899
0
        }
900
901
        // 2) Trailing Core-run (letters/digits/underscore + infix)
902
        // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
903
0
        if Self::is_core(bytes[len - 1]) {
904
            // scan backwards as long as we see `is_core` or an `is_infix`
905
0
            let mut start = len - 1;
906
0
            while start > 0 {
907
0
                let b = bytes[start - 1];
908
0
                let prev = if start >= 2 { bytes[start - 2] } else { b };
909
0
                let next = bytes[start]; // the byte we just included
910
0
                if Self::is_core(b) || Self::is_infix(b, prev, next) {
911
0
                    start -= 1;
912
0
                } else {
913
0
                    break;
914
                }
915
            }
916
0
            let word = &rest[start..];
917
0
            let pos = self.offset + start;
918
0
            self.rest = &rest[..start];
919
0
            return Some((pos, word));
920
0
        }
921
922
        // 3) Non-core: CR+LF as one token, otherwise single char
923
        // Spec: WB3 treats CR+LF as a single non‑breaking pair.
924
0
        if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
925
0
            let start = len - 2;
926
0
            let word = &rest[start..];
927
0
            let pos = self.offset + start;
928
0
            self.rest = &rest[..start];
929
0
            return Some((pos, word));
930
0
        }
931
932
        // 4) Fallback – every other byte is its own segment
933
        // Spec: the catch‑all rule WB999.
934
0
        let start = len - 1;
935
0
        let word = &rest[start..];
936
0
        let pos = self.offset + start;
937
0
        self.rest = &rest[..start];
938
0
        Some((pos, word))
939
0
    }
940
}
941
942
#[inline]
943
0
fn ascii_word_ok(t: &(usize, &str)) -> bool {
944
0
    has_ascii_alphanumeric(&t.1)
945
0
}
946
#[inline]
947
0
fn unicode_word_ok(t: &(usize, &str)) -> bool {
948
0
    has_alphanumeric(&t.1)
949
0
}
950
951
type AsciiWordsIter<'a> = Filter<
952
    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
953
    fn(&&'a str) -> bool,
954
>;
955
type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
956
type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
957
type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
958
959
#[derive(Debug)]
960
enum WordsIter<'a> {
961
    Ascii(AsciiWordsIter<'a>),
962
    Unicode(UnicodeWordsIter<'a>),
963
}
964
965
#[derive(Debug)]
966
enum IndicesIter<'a> {
967
    Ascii(AsciiIndicesIter<'a>),
968
    Unicode(UnicodeIndicesIter<'a>),
969
}
970
971
#[inline]
972
3.55k
pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
973
3.55k
    let inner = if s.is_ascii() {
974
900
        WordsIter::Ascii(new_unicode_words_ascii(s))
975
    } else {
976
2.65k
        WordsIter::Unicode(new_unicode_words_general(s))
977
    };
978
3.55k
    UnicodeWords { inner }
979
3.55k
}
unicode_segmentation::word::new_unicode_words
Line
Count
Source
972
3.55k
pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
973
3.55k
    let inner = if s.is_ascii() {
974
900
        WordsIter::Ascii(new_unicode_words_ascii(s))
975
    } else {
976
2.65k
        WordsIter::Unicode(new_unicode_words_general(s))
977
    };
978
3.55k
    UnicodeWords { inner }
979
3.55k
}
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words
980
981
#[inline]
982
0
pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
983
0
    let inner = if s.is_ascii() {
984
0
        IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok))
985
    } else {
986
0
        IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok))
987
    };
988
0
    UnicodeWordIndices { inner }
989
0
}
990
991
#[inline]
992
6.21k
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
993
6.21k
    UWordBounds {
994
6.21k
        string: s,
995
6.21k
        cat: None,
996
6.21k
        catb: None,
997
6.21k
    }
998
6.21k
}
unicode_segmentation::word::new_word_bounds
Line
Count
Source
992
6.21k
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
993
6.21k
    UWordBounds {
994
6.21k
        string: s,
995
6.21k
        cat: None,
996
6.21k
        catb: None,
997
6.21k
    }
998
6.21k
}
Unexecuted instantiation: unicode_segmentation::word::new_word_bounds
999
1000
#[inline]
1001
0
pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
1002
0
    UWordBoundIndices {
1003
0
        start_offset: s.as_ptr() as usize,
1004
0
        iter: new_word_bounds(s),
1005
0
    }
1006
0
}
1007
1008
#[inline]
1009
900
fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
1010
900
    AsciiWordBoundIter::new(s)
1011
900
}
unicode_segmentation::word::new_ascii_word_bound_indices
Line
Count
Source
1009
900
fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
1010
900
    AsciiWordBoundIter::new(s)
1011
900
}
Unexecuted instantiation: unicode_segmentation::word::new_ascii_word_bound_indices
1012
1013
#[inline]
1014
7.34M
fn has_alphanumeric(s: &&str) -> bool {
1015
    use crate::tables::util::is_alphanumeric;
1016
1017
7.34M
    s.chars().any(is_alphanumeric)
1018
7.34M
}
unicode_segmentation::word::has_alphanumeric
Line
Count
Source
1014
7.34M
fn has_alphanumeric(s: &&str) -> bool {
1015
    use crate::tables::util::is_alphanumeric;
1016
1017
7.34M
    s.chars().any(is_alphanumeric)
1018
7.34M
}
Unexecuted instantiation: unicode_segmentation::word::has_alphanumeric
1019
1020
#[inline]
1021
11.5M
fn has_ascii_alphanumeric(s: &&str) -> bool {
1022
12.9M
    s.chars().any(|c| c.is_ascii_alphanumeric())
unicode_segmentation::word::has_ascii_alphanumeric::{closure#0}
Line
Count
Source
1022
12.9M
    s.chars().any(|c| c.is_ascii_alphanumeric())
Unexecuted instantiation: unicode_segmentation::word::has_ascii_alphanumeric::{closure#0}
1023
11.5M
}
unicode_segmentation::word::has_ascii_alphanumeric
Line
Count
Source
1021
11.5M
fn has_ascii_alphanumeric(s: &&str) -> bool {
1022
11.5M
    s.chars().any(|c| c.is_ascii_alphanumeric())
1023
11.5M
}
Unexecuted instantiation: unicode_segmentation::word::has_ascii_alphanumeric
1024
1025
#[inline(always)]
1026
11.5M
fn strip_pos((_, w): (usize, &str)) -> &str {
1027
11.5M
    w
1028
11.5M
}
1029
1030
#[inline]
1031
900
fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
1032
900
    new_ascii_word_bound_indices(s)
1033
900
        .map(strip_pos as fn(_) -> _)
1034
900
        .filter(has_ascii_alphanumeric)
1035
900
}
unicode_segmentation::word::new_unicode_words_ascii
Line
Count
Source
1031
900
fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
1032
900
    new_ascii_word_bound_indices(s)
1033
900
        .map(strip_pos as fn(_) -> _)
1034
900
        .filter(has_ascii_alphanumeric)
1035
900
}
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_ascii
1036
1037
#[inline]
1038
2.65k
fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
1039
2.65k
    new_word_bounds(s).filter(has_alphanumeric)
1040
2.65k
}
unicode_segmentation::word::new_unicode_words_general
Line
Count
Source
1038
2.65k
fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
1039
2.65k
    new_word_bounds(s).filter(has_alphanumeric)
1040
2.65k
}
Unexecuted instantiation: unicode_segmentation::word::new_unicode_words_general
1041
1042
#[cfg(test)]
1043
mod tests {
1044
    use crate::word::{
1045
        new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices,
1046
    };
1047
    use std::string::String;
1048
    use std::vec;
1049
    use std::vec::Vec;
1050
1051
    use proptest::prelude::*;
1052
1053
    #[test]
1054
    fn test_syriac_abbr_mark() {
1055
        use crate::tables::word as wd;
1056
        let (_, _, cat) = wd::word_category('\u{70f}');
1057
        assert_eq!(cat, wd::WC_ALetter);
1058
    }
1059
1060
    #[test]
1061
    fn test_end_of_ayah_cat() {
1062
        use crate::tables::word as wd;
1063
        let (_, _, cat) = wd::word_category('\u{6dd}');
1064
        assert_eq!(cat, wd::WC_Numeric);
1065
    }
1066
1067
    #[test]
1068
    fn test_ascii_word_bound_indices_various_cases() {
1069
        let s = "Hello, world!";
1070
        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
1071
        let expected = vec![
1072
            (0, "Hello"), // simple letters
1073
            (5, ","),
1074
            (6, " "),     // space after comma
1075
            (7, "world"), // skip comma+space, stop at '!'
1076
            (12, "!"),    // punctuation at the end
1077
        ];
1078
        assert_eq!(words, expected);
1079
    }
1080
1081
    #[test]
1082
    fn test_ascii_word_indices_various_cases() {
1083
        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
1084
        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
1085
        let expected = vec![
1086
            ("Hello"), // simple letters
1087
            ("world"), // skip comma+space, stop at '!'
1088
            ("can't"), // apostrophe joins letters
1089
            ("e.g"),
1090
            ("var1"),
1091
            ("123,456"), // digits+comma+digits
1092
            ("foo_bar"),
1093
            ("example.com"),
1094
            ("127.0.0.1"),
1095
            ("9090"), // port number
1096
        ];
1097
        assert_eq!(words, expected);
1098
    }
1099
1100
    /// Strategy that yields every code-point from NUL (0) to DEL (127).
1101
    fn ascii_char() -> impl Strategy<Value = char> {
1102
        (0u8..=127).prop_map(|b| b as char)
1103
    }
1104
1105
    proptest! {
1106
        #![proptest_config(ProptestConfig::with_cases(10000))]
1107
        /// Fast path must equal general path for any ASCII input.
1108
        #[test]
1109
        fn proptest_ascii_matches_unicode_word_indices(
1110
            // Vec<char> → String, length 0‒99
1111
            s in proptest::collection::vec(ascii_char(), 0..100)
1112
                   .prop_map(|v| v.into_iter().collect::<String>())
1113
        ) {
1114
            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
1115
            let uni:  Vec<(usize, &str)> = new_word_bound_indices(&s).collect();
1116
1117
            prop_assert_eq!(fast, uni);
1118
        }
1119
1120
        /// Fast path must equal general path for any ASCII input, forwards and backwards.
1121
        #[test]
1122
        fn proptest_ascii_matches_unicode_word_indices_rev(
1123
            // Vec<char> → String, length 0‒99
1124
            s in proptest::collection::vec(ascii_char(), 0..100)
1125
                   .prop_map(|v| v.into_iter().collect::<String>())
1126
        ) {
1127
            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
1128
            let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
1129
            prop_assert_eq!(fast_rev, uni_rev);
1130
        }
1131
    }
1132
}