Coverage Report

Created: 2025-07-11 07:04

/src/unicode-segmentation/src/sentence.rs
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2
// file at the top-level directory of this distribution and at
3
// http://rust-lang.org/COPYRIGHT.
4
//
5
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8
// option. This file may not be copied, modified, or distributed
9
// except according to those terms.
10
11
use core::cmp;
12
use core::iter::Filter;
13
14
// All of the logic for forward iteration over sentences
15
mod fwd {
16
    use crate::tables::sentence::SentenceCat;
17
    use core::cmp;
18
19
    // Describe a parsed part of source string as described in this table:
20
    // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
22
    enum StatePart {
23
        Sot,
24
        Eot,
25
        Other,
26
        CR,
27
        LF,
28
        Sep,
29
        ATerm,
30
        UpperLower,
31
        ClosePlus,
32
        SpPlus,
33
        STerm,
34
    }
35
36
    #[derive(Debug, Clone, PartialEq, Eq)]
37
    struct SentenceBreaksState(pub [StatePart; 4]);
38
39
    const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40
        StatePart::Sot,
41
        StatePart::Sot,
42
        StatePart::Sot,
43
        StatePart::Sot,
44
    ]);
45
46
    #[derive(Debug, Clone)]
47
    pub struct SentenceBreaks<'a> {
48
        pub string: &'a str,
49
        pos: usize,
50
        state: SentenceBreaksState,
51
    }
52
53
    impl SentenceBreaksState {
54
        // Attempt to advance the internal state by one part
55
        // Whitespace and some punctutation will be collapsed
56
        fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57
            let &SentenceBreaksState(parts) = self;
58
            let parts = match (parts[3], cat) {
59
                (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60
                (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61
                _ => [
62
                    parts[1],
63
                    parts[2],
64
                    parts[3],
65
                    match cat {
66
                        SentenceCat::SC_CR => StatePart::CR,
67
                        SentenceCat::SC_LF => StatePart::LF,
68
                        SentenceCat::SC_Sep => StatePart::Sep,
69
                        SentenceCat::SC_ATerm => StatePart::ATerm,
70
                        SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
71
                        SentenceCat::SC_Close => StatePart::ClosePlus,
72
                        SentenceCat::SC_Sp => StatePart::SpPlus,
73
                        SentenceCat::SC_STerm => StatePart::STerm,
74
                        _ => StatePart::Other,
75
                    },
76
                ],
77
            };
78
            SentenceBreaksState(parts)
79
        }
80
81
        fn end(&self) -> SentenceBreaksState {
82
            let &SentenceBreaksState(parts) = self;
83
            SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
84
        }
85
86
        // Helper function to check if state head matches a single `StatePart`
87
        fn match1(&self, part: StatePart) -> bool {
88
            let &SentenceBreaksState(parts) = self;
89
            part == parts[3]
90
        }
91
92
        // Helper function to check if first two `StateParts` in state match
93
        // the given two
94
        fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
95
            let &SentenceBreaksState(parts) = self;
96
            part1 == parts[2] && part2 == parts[3]
97
        }
98
    }
99
100
    // https://unicode.org/reports/tr29/#SB8
101
    // TODO cache this, it is currently quadratic
102
    fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
103
        let &SentenceBreaksState(parts) = state;
104
        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
105
        if parts[idx] == StatePart::ClosePlus {
106
            idx -= 1
107
        }
108
109
        if parts[idx] == StatePart::ATerm {
110
            use crate::tables::sentence as se;
111
112
            for next_char in ahead.chars() {
113
                //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
114
                match se::sentence_category(next_char).2 {
115
                    se::SC_Lower => return true,
116
                    se::SC_OLetter
117
                    | se::SC_Upper
118
                    | se::SC_Sep
119
                    | se::SC_CR
120
                    | se::SC_LF
121
                    | se::SC_STerm
122
                    | se::SC_ATerm => return false,
123
                    _ => continue,
124
                }
125
            }
126
        }
127
128
        false
129
    }
130
131
    // https://unicode.org/reports/tr29/#SB8a
132
    fn match_sb8a(state: &SentenceBreaksState) -> bool {
133
        // SATerm Close* Sp*
134
        let &SentenceBreaksState(parts) = state;
135
        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
136
        if parts[idx] == StatePart::ClosePlus {
137
            idx -= 1
138
        }
139
        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
140
    }
141
142
    // https://unicode.org/reports/tr29/#SB9
143
    fn match_sb9(state: &SentenceBreaksState) -> bool {
144
        // SATerm Close*
145
        let &SentenceBreaksState(parts) = state;
146
        let idx = if parts[3] == StatePart::ClosePlus {
147
            2
148
        } else {
149
            3
150
        };
151
        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
152
    }
153
154
    // https://unicode.org/reports/tr29/#SB11
155
    fn match_sb11(state: &SentenceBreaksState) -> bool {
156
        // SATerm Close* Sp* ParaSep?
157
        let &SentenceBreaksState(parts) = state;
158
        let mut idx = match parts[3] {
159
            StatePart::Sep | StatePart::CR | StatePart::LF => 2,
160
            _ => 3,
161
        };
162
163
        if parts[idx] == StatePart::SpPlus {
164
            idx -= 1
165
        }
166
        if parts[idx] == StatePart::ClosePlus {
167
            idx -= 1
168
        }
169
170
        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
171
    }
172
173
    impl<'a> Iterator for SentenceBreaks<'a> {
174
        // Returns the index of the character which follows a break
175
        type Item = usize;
176
177
        #[inline]
178
0
        fn size_hint(&self) -> (usize, Option<usize>) {
179
0
            let slen = self.string.len();
180
0
            // A sentence could be one character
181
0
            (cmp::min(slen, 2), Some(slen + 1))
182
0
        }
183
184
        #[inline]
185
0
        fn next(&mut self) -> Option<usize> {
186
            use crate::tables::sentence as se;
187
188
0
            for next_char in self.string[self.pos..].chars() {
189
0
                let position_before = self.pos;
190
0
                let state_before = self.state.clone();
191
0
192
0
                let next_cat = se::sentence_category(next_char).2;
193
0
194
0
                self.pos += next_char.len_utf8();
195
0
                self.state = self.state.next(next_cat);
196
197
0
                match next_cat {
198
                    // SB1 https://unicode.org/reports/tr29/#SB1
199
0
                    _ if state_before.match1(StatePart::Sot) => return Some(position_before),
200
201
                    // SB2 is handled when inner iterator (chars) is finished
202
203
                    // SB3 https://unicode.org/reports/tr29/#SB3
204
0
                    SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
205
206
                    // SB4 https://unicode.org/reports/tr29/#SB4
207
0
                    _ if state_before.match1(StatePart::Sep)
208
0
                        || state_before.match1(StatePart::CR)
209
0
                        || state_before.match1(StatePart::LF) =>
210
0
                    {
211
0
                        return Some(position_before)
212
                    }
213
214
                    // SB5 https://unicode.org/reports/tr29/#SB5
215
0
                    SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
216
217
                    // SB6 https://unicode.org/reports/tr29/#SB6
218
0
                    SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
219
220
                    // SB7 https://unicode.org/reports/tr29/#SB7
221
                    SentenceCat::SC_Upper
222
0
                        if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
223
0
                    {
224
0
                        continue
225
                    }
226
227
                    // SB8 https://unicode.org/reports/tr29/#SB8
228
0
                    _ if match_sb8(&state_before, &self.string[position_before..]) => continue,
229
230
                    // SB8a https://unicode.org/reports/tr29/#SB8a
231
                    SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
232
0
                        if match_sb8a(&state_before) =>
233
                    {
234
0
                        continue
235
                    }
236
237
                    // SB9 https://unicode.org/reports/tr29/#SB9
238
                    SentenceCat::SC_Close
239
                    | SentenceCat::SC_Sp
240
                    | SentenceCat::SC_Sep
241
                    | SentenceCat::SC_CR
242
                    | SentenceCat::SC_LF
243
0
                        if match_sb9(&state_before) =>
244
                    {
245
0
                        continue
246
                    }
247
248
                    // SB10 https://unicode.org/reports/tr29/#SB10
249
                    SentenceCat::SC_Sp
250
                    | SentenceCat::SC_Sep
251
                    | SentenceCat::SC_CR
252
                    | SentenceCat::SC_LF
253
0
                        if match_sb8a(&state_before) =>
254
                    {
255
0
                        continue
256
                    }
257
258
                    // SB11 https://unicode.org/reports/tr29/#SB11
259
0
                    _ if match_sb11(&state_before) => return Some(position_before),
260
261
                    // SB998 https://unicode.org/reports/tr29/#SB998
262
0
                    _ => continue,
263
                }
264
            }
265
266
            // SB2 https://unicode.org/reports/tr29/#SB2
267
0
            if self.state.match1(StatePart::Sot) || self.state.match1(StatePart::Eot) {
268
0
                None
269
            } else {
270
0
                self.state = self.state.end();
271
0
                Some(self.pos)
272
            }
273
0
        }
274
    }
275
276
    pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> {
277
        SentenceBreaks {
278
            string: source,
279
            pos: 0,
280
            state: INITIAL_STATE,
281
        }
282
    }
283
}
284
285
/// An iterator over the substrings of a string which, after splitting the string on
286
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
287
/// contain any characters with the
288
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
289
/// property, or with
290
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
291
///
292
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
293
/// trait. See its documentation for more.
294
///
295
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
296
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
297
#[derive(Debug, Clone)]
298
pub struct UnicodeSentences<'a> {
299
    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
300
}
301
302
/// External iterator for a string's
303
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
304
///
305
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
306
/// trait. See its documentation for more.
307
///
308
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
309
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
310
#[derive(Debug, Clone)]
311
pub struct USentenceBounds<'a> {
312
    iter: fwd::SentenceBreaks<'a>,
313
    sentence_start: Option<usize>,
314
}
315
316
/// External iterator for sentence boundaries and byte offsets.
317
///
318
/// This struct is created by the [`split_sentence_bound_indices`] method on the
319
/// [`UnicodeSegmentation`] trait. See its documentation for more.
320
///
321
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
322
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
323
#[derive(Debug, Clone)]
324
pub struct USentenceBoundIndices<'a> {
325
    start_offset: usize,
326
    iter: USentenceBounds<'a>,
327
}
328
329
#[inline]
330
0
pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> {
331
0
    USentenceBounds {
332
0
        iter: fwd::new_sentence_breaks(source),
333
0
        sentence_start: None,
334
0
    }
335
0
}
336
337
#[inline]
338
0
pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
339
0
    USentenceBoundIndices {
340
0
        start_offset: source.as_ptr() as usize,
341
0
        iter: new_sentence_bounds(source),
342
0
    }
343
0
}
344
345
#[inline]
346
0
pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
347
    use super::UnicodeSegmentation;
348
    use crate::tables::util::is_alphanumeric;
349
350
    fn has_alphanumeric(s: &&str) -> bool {
351
        s.chars().any(is_alphanumeric)
352
    }
353
0
    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
354
0
355
0
    UnicodeSentences {
356
0
        inner: s.split_sentence_bounds().filter(has_alphanumeric),
357
0
    }
358
0
}
359
360
impl<'a> Iterator for UnicodeSentences<'a> {
361
    type Item = &'a str;
362
363
    #[inline]
364
0
    fn next(&mut self) -> Option<&'a str> {
365
0
        self.inner.next()
366
0
    }
367
368
    #[inline]
369
0
    fn size_hint(&self) -> (usize, Option<usize>) {
370
0
        self.inner.size_hint()
371
0
    }
372
}
373
374
impl<'a> Iterator for USentenceBounds<'a> {
375
    type Item = &'a str;
376
377
    #[inline]
378
0
    fn size_hint(&self) -> (usize, Option<usize>) {
379
0
        let (lower, upper) = self.iter.size_hint();
380
0
        (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
381
0
    }
382
383
    #[inline]
384
0
    fn next(&mut self) -> Option<&'a str> {
385
0
        if self.sentence_start.is_none() {
386
0
            if let Some(start_pos) = self.iter.next() {
387
0
                self.sentence_start = Some(start_pos)
388
            } else {
389
0
                return None;
390
            }
391
0
        }
392
393
0
        if let Some(break_pos) = self.iter.next() {
394
0
            let start_pos = self.sentence_start.unwrap();
395
0
            let sentence = &self.iter.string[start_pos..break_pos];
396
0
            self.sentence_start = Some(break_pos);
397
0
            Some(sentence)
398
        } else {
399
0
            None
400
        }
401
0
    }
402
}
403
404
impl<'a> Iterator for USentenceBoundIndices<'a> {
405
    type Item = (usize, &'a str);
406
407
    #[inline]
408
0
    fn next(&mut self) -> Option<(usize, &'a str)> {
409
0
        self.iter
410
0
            .next()
411
0
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
412
0
    }
413
414
    #[inline]
415
0
    fn size_hint(&self) -> (usize, Option<usize>) {
416
0
        self.iter.size_hint()
417
0
    }
418
}