/src/unicode-segmentation/src/sentence.rs

Source (jump to first uncovered line)
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use core::cmp;
use core::iter::Filter;

// All of the logic for forward iteration over sentences
mod fwd {
    use crate::tables::sentence::SentenceCat;
    use core::cmp;

    // Describe a parsed part of source string as described in this table:
    // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
    //
    // A `StatePart` is one slot of the history window held by
    // `SentenceBreaksState`; the break rules inspect these slots instead of
    // re-reading the text that has already been consumed.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum StatePart {
        // Start of text: fills every slot of the initial state.
        Sot,
        // End of text: appended by `SentenceBreaksState::end`.
        Eot,
        // Any category that has no dedicated variant below.
        Other,
        // Recorded for `SentenceCat::SC_CR`.
        CR,
        // Recorded for `SentenceCat::SC_LF`.
        LF,
        // Recorded for `SentenceCat::SC_Sep`.
        Sep,
        // Recorded for `SentenceCat::SC_ATerm`.
        ATerm,
        // Recorded for either `SentenceCat::SC_Upper` or `SentenceCat::SC_Lower`.
        UpperLower,
        // One or more consecutive `SentenceCat::SC_Close` characters; a run
        // occupies a single slot.
        ClosePlus,
        // One or more consecutive `SentenceCat::SC_Sp` characters; a run
        // occupies a single slot.
        SpPlus,
        // Recorded for `SentenceCat::SC_STerm`.
        STerm,
    }

    // Sliding window over the four most recently recorded `StatePart`s.
    // Index 3 is the newest entry and index 0 the oldest: `next` and `end`
    // drop index 0, shift the rest down and write the new part at index 3.
    #[derive(Debug, Clone, PartialEq, Eq)]
    struct SentenceBreaksState(pub [StatePart; 4]);

    // State before any character has been consumed: all four slots hold
    // "start of text".
    const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([StatePart::Sot; 4]);

    // Forward iterator yielding the byte offsets at which sentence breaks
    // occur in `string`.
    #[derive(Debug, Clone)]
    pub struct SentenceBreaks<'a> {
        // The text being segmented; callers slice it with the yielded offsets.
        pub string: &'a str,
        // Byte offset of the next character to examine.
        pos: usize,
        // History of the parts consumed so far, consulted by the break rules.
        state: SentenceBreaksState,
    }

    impl SentenceBreaksState {
        // Attempt to advance the internal state by one part
        // Whitespace and some punctutation will be collapsed
        fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
            let &SentenceBreaksState(parts) = self;
            let parts = match (parts[3], cat) {
                (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
                (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
                _ => [
                    parts[1],
                    parts[2],
                    parts[3],
                    match cat {
                        SentenceCat::SC_CR => StatePart::CR,
                        SentenceCat::SC_LF => StatePart::LF,
                        SentenceCat::SC_Sep => StatePart::Sep,
                        SentenceCat::SC_ATerm => StatePart::ATerm,
                        SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
                        SentenceCat::SC_Close => StatePart::ClosePlus,
                        SentenceCat::SC_Sp => StatePart::SpPlus,
                        SentenceCat::SC_STerm => StatePart::STerm,
                        _ => StatePart::Other,
                    },
                ],
            };
            SentenceBreaksState(parts)
        }

        fn end(&self) -> SentenceBreaksState {
            let &SentenceBreaksState(parts) = self;
            SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
        }

        // Helper function to check if state head matches a single `StatePart`
        fn match1(&self, part: StatePart) -> bool {
            let &SentenceBreaksState(parts) = self;
            part == parts[3]
        }

        // Helper function to check if first two `StateParts` in state match
        // the given two
        fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
            let &SentenceBreaksState(parts) = self;
            part1 == parts[2] && part2 == parts[3]
        }
    }

    // https://unicode.org/reports/tr29/#SB8
    //
    // Left-hand side: `ATerm Close* Sp*` must be the tail of `state`.
    // Right-hand side: `ahead` is scanned until a character decides the rule.
    //
    // TODO cache this, it is currently quadratic
    fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
        use crate::tables::sentence as se;

        let history = &state.0;

        // Step back over an optional Sp+ slot, then over an optional Close+ slot.
        let after_sp = if history[3] == StatePart::SpPlus { 2 } else { 3 };
        let term_idx = if history[after_sp] == StatePart::ClosePlus {
            after_sp - 1
        } else {
            after_sp
        };
        if history[term_idx] != StatePart::ATerm {
            return false;
        }

        // ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
        // The first character that is not skippable settles the answer; running
        // out of input without finding a Lower means the rule does not apply.
        ahead
            .chars()
            .find_map(|ch| match se::sentence_category(ch).2 {
                se::SC_Lower => Some(true),
                se::SC_OLetter
                | se::SC_Upper
                | se::SC_Sep
                | se::SC_CR
                | se::SC_LF
                | se::SC_STerm
                | se::SC_ATerm => Some(false),
                _ => None,
            })
            .unwrap_or(false)
    }

    // https://unicode.org/reports/tr29/#SB8a
    //
    // Checks that the tail of `state` has the shape `SATerm Close* Sp*`.
    fn match_sb8a(state: &SentenceBreaksState) -> bool {
        let history = &state.0;

        // Step back over an optional Sp+ slot, then over an optional Close+ slot.
        let after_sp = if history[3] == StatePart::SpPlus { 2 } else { 3 };
        let term_idx = if history[after_sp] == StatePart::ClosePlus {
            after_sp - 1
        } else {
            after_sp
        };

        match history[term_idx] {
            StatePart::STerm | StatePart::ATerm => true,
            _ => false,
        }
    }

    // https://unicode.org/reports/tr29/#SB9
    //
    // Checks that the tail of `state` has the shape `SATerm Close*`.
    fn match_sb9(state: &SentenceBreaksState) -> bool {
        let history = &state.0;

        // A Close+ slot at the head is optional; look just behind it if present.
        let term_idx = match history[3] {
            StatePart::ClosePlus => 2,
            _ => 3,
        };

        match history[term_idx] {
            StatePart::STerm | StatePart::ATerm => true,
            _ => false,
        }
    }

    // https://unicode.org/reports/tr29/#SB11
    //
    // Checks that the tail of `state` has the shape
    // `SATerm Close* Sp* ParaSep?`.
    fn match_sb11(state: &SentenceBreaksState) -> bool {
        let history = &state.0;

        // Optional ParaSep at the head of the state.
        let mut cursor = match history[3] {
            StatePart::Sep | StatePart::CR | StatePart::LF => 2,
            _ => 3,
        };

        // Then, in this order, an optional Sp+ slot and an optional Close+ slot.
        for skippable in [StatePart::SpPlus, StatePart::ClosePlus].iter() {
            if history[cursor] == *skippable {
                cursor -= 1;
            }
        }

        match history[cursor] {
            StatePart::STerm | StatePart::ATerm => true,
            _ => false,
        }
    }

    // Walks `string` one character at a time and yields the byte offset of
    // every sentence break. The rules are tried in the order of the match arms
    // below; the first arm that applies decides whether the position before
    // the current character is a break.
    impl<'a> Iterator for SentenceBreaks<'a> {
        // Returns the index of the character which follows a break
        type Item = usize;

        // Bounds derived from the total length of `string`.
        // NOTE(review): `self.pos` is not taken into account, so the values
        // describe the whole input rather than what is left to iterate.
        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            let slen = self.string.len();
            // A sentence could be one character
            (cmp::min(slen, 2), Some(slen + 1))
        }

        #[inline]
        fn next(&mut self) -> Option<usize> {
            use crate::tables::sentence as se;

            for next_char in self.string[self.pos..].chars() {
                // Snapshot taken before the character is consumed: the rules
                // are evaluated against the state to the left of `next_char`,
                // and a break is reported at `position_before`.
                let position_before = self.pos;
                let state_before = self.state.clone();

                let next_cat = se::sentence_category(next_char).2;

                // Consume the character up front so that `self` is already
                // positioned for the following call when an arm returns.
                self.pos += next_char.len_utf8();
                self.state = self.state.next(next_cat);

                match next_cat {
                    // SB1 https://unicode.org/reports/tr29/#SB1
                    _ if state_before.match1(StatePart::Sot) => return Some(position_before),

                    // SB2 is handled when inner iterator (chars) is finished

                    // SB3 https://unicode.org/reports/tr29/#SB3
                    SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,

                    // SB4 https://unicode.org/reports/tr29/#SB4
                    _ if state_before.match1(StatePart::Sep)
                        || state_before.match1(StatePart::CR)
                        || state_before.match1(StatePart::LF) =>
                    {
                        return Some(position_before)
                    }

                    // SB5 https://unicode.org/reports/tr29/#SB5
                    // Extend/Format characters leave no trace in the history:
                    // the state update made above is rolled back.
                    SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,

                    // SB6 https://unicode.org/reports/tr29/#SB6
                    SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,

                    // SB7 https://unicode.org/reports/tr29/#SB7
                    SentenceCat::SC_Upper
                        if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
                    {
                        continue
                    }

                    // SB8 https://unicode.org/reports/tr29/#SB8
                    // The lookahead starts at `next_char` itself.
                    _ if match_sb8(&state_before, &self.string[position_before..]) => continue,

                    // SB8a https://unicode.org/reports/tr29/#SB8a
                    SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
                        if match_sb8a(&state_before) =>
                    {
                        continue
                    }

                    // SB9 https://unicode.org/reports/tr29/#SB9
                    SentenceCat::SC_Close
                    | SentenceCat::SC_Sp
                    | SentenceCat::SC_Sep
                    | SentenceCat::SC_CR
                    | SentenceCat::SC_LF
                        if match_sb9(&state_before) =>
                    {
                        continue
                    }

                    // SB10 https://unicode.org/reports/tr29/#SB10
                    // Shares its left-hand side (`SATerm Close* Sp*`) with SB8a.
                    SentenceCat::SC_Sp
                    | SentenceCat::SC_Sep
                    | SentenceCat::SC_CR
                    | SentenceCat::SC_LF
                        if match_sb8a(&state_before) =>
                    {
                        continue
                    }

                    // SB11 https://unicode.org/reports/tr29/#SB11
                    _ if match_sb11(&state_before) => return Some(position_before),

                    // SB998 https://unicode.org/reports/tr29/#SB998
                    _ => continue,
                }
            }

            // SB2 https://unicode.org/reports/tr29/#SB2
            // `Sot` at the head means no character was ever consumed (empty
            // input); `Eot` means the final break was already reported. In
            // both cases there is nothing left to yield.
            if self.state.match1(StatePart::Sot) || self.state.match1(StatePart::Eot) {
                None
            } else {
                self.state = self.state.end();
                Some(self.pos)
            }
        }
    }

    // Create a break iterator positioned at the start of `source`, with a
    // history made up entirely of "start of text" parts.
    pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> {
        let (pos, state) = (0, INITIAL_STATE);
        SentenceBreaks {
            string: source,
            pos,
            state,
        }
    }
}

/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UnicodeSentences<'a> {
    // Sentence-bound iterator wrapped in a filter. The predicate is stored as
    // a plain fn pointer so that the complete iterator type can be written
    // out in this field.
    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}

/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Each item is the slice of the original string lying between two
/// consecutive break positions.
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct USentenceBounds<'a> {
    // Source of break positions (byte offsets into the string).
    iter: fwd::SentenceBreaks<'a>,
    // Byte offset at which the next sentence starts; `None` until the first
    // break position has been pulled from `iter`.
    sentence_start: Option<usize>,
}

/// External iterator for sentence boundaries and byte offsets.
///
/// Each item pairs a sentence slice with its byte offset from the start of
/// the string the iterator was created from.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct USentenceBoundIndices<'a> {
    // Address of the first byte of the source string; subtracted from the
    // address of each yielded slice to obtain its offset.
    start_offset: usize,
    // Underlying iterator producing the sentence slices.
    iter: USentenceBounds<'a>,
}

/// Creates a [`USentenceBounds`] over `source` that has not yet looked at any
/// break position.
#[inline]
pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> {
    let iter = fwd::new_sentence_breaks(source);
    USentenceBounds {
        iter,
        sentence_start: None,
    }
}

/// Creates a [`USentenceBoundIndices`] over `source`.
///
/// The address of `source` is recorded so that the offset of every yielded
/// slice can later be computed by pointer subtraction.
#[inline]
pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
    let iter = new_sentence_bounds(source);
    let start_offset = source.as_ptr() as usize;
    USentenceBoundIndices { start_offset, iter }
}

/// Creates a [`UnicodeSentences`] over `s`: the sentence-bound pieces of `s`,
/// restricted to those containing at least one character accepted by
/// `is_alphanumeric`.
#[inline]
pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
    use super::UnicodeSegmentation;
    use crate::tables::util::is_alphanumeric;

    // Predicate applied to every piece produced by `split_sentence_bounds`.
    fn contains_alphanumeric(piece: &&str) -> bool {
        piece.chars().any(is_alphanumeric)
    }

    // The struct field names the predicate type as a fn pointer, so the
    // function item is coerced here before being handed to `filter`.
    let keep: fn(&&str) -> bool = contains_alphanumeric;
    let inner = s.split_sentence_bounds().filter(keep);

    UnicodeSentences { inner }
}

// Pure delegation: all of the work is done by the wrapped filter iterator.
impl<'a> Iterator for UnicodeSentences<'a> {
    type Item = &'a str;

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        Iterator::next(&mut self.inner)
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        Iterator::size_hint(&self.inner)
    }
}

impl<'a> Iterator for USentenceBounds<'a> {
    type Item = &'a str;

    /// Bounds on the number of sentences that are still to be yielded.
    ///
    /// The bounds are computed from the bytes that have not been handed out
    /// yet rather than by subtracting one from the break iterator's hint:
    /// that subtraction underflowed `usize` for an empty string, and the
    /// break iterator's hint does not shrink as the input is consumed.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Everything before `sentence_start` has already been yielded.
        let consumed = self.sentence_start.unwrap_or(0);
        let remaining = self.iter.string.len().saturating_sub(consumed);

        // Break positions are strictly increasing, so every sentence covers
        // at least one byte (upper bound), and the break iterator always
        // reports the end of a non-empty remainder (lower bound).
        (cmp::min(remaining, 1), Some(remaining))
    }

    /// Yields the slice between the previous break position and the next one,
    /// or `None` once the break iterator is exhausted.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        // The very first break position only marks where the first sentence
        // begins; it is fetched lazily on the first call.
        let start_pos = match self.sentence_start {
            Some(pos) => pos,
            None => {
                let first = self.iter.next()?;
                self.sentence_start = Some(first);
                first
            }
        };

        let break_pos = self.iter.next()?;
        // The end of this sentence is the start of the following one.
        self.sentence_start = Some(break_pos);
        Some(&self.iter.string[start_pos..break_pos])
    }
}

impl<'a> Iterator for USentenceBoundIndices<'a> {
    type Item = (usize, &'a str);

    /// Yields the next sentence together with its byte offset, obtained by
    /// subtracting the recorded start address from the slice's own address.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        let sentence = self.iter.next()?;
        let offset = sentence.as_ptr() as usize - self.start_offset;
        Some((offset, sentence))
    }

    /// Same bounds as the wrapped sentence iterator: one item per sentence.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let inner = &self.iter;
        inner.size_hint()
    }
}

Coverage Report

Created: 2025-07-11 07:04

Line	Count	Source (jump to first uncovered line)
1		// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2		// file at the top-level directory of this distribution and at
3		// http://rust-lang.org/COPYRIGHT.
4		//
5		// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6		// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7		// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8		// option. This file may not be copied, modified, or distributed
9		// except according to those terms.
10
11		use core::cmp;
12		use core::iter::Filter;
13
14		// All of the logic for forward iteration over sentences
15		mod fwd {
16		use crate::tables::sentence::SentenceCat;
17		use core::cmp;
18
19		// Describe a parsed part of source string as described in this table:
20		// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21		#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22		enum StatePart {
23		Sot,
24		Eot,
25		Other,
26		CR,
27		LF,
28		Sep,
29		ATerm,
30		UpperLower,
31		ClosePlus,
32		SpPlus,
33		STerm,
34		}
35
36		#[derive(Debug, Clone, PartialEq, Eq)]
37		struct SentenceBreaksState(pub [StatePart; 4]);
38
39		const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40		StatePart::Sot,
41		StatePart::Sot,
42		StatePart::Sot,
43		StatePart::Sot,
44		]);
45
46		#[derive(Debug, Clone)]
47		pub struct SentenceBreaks<'a> {
48		pub string: &'a str,
49		pos: usize,
50		state: SentenceBreaksState,
51		}
52
53		impl SentenceBreaksState {
54		// Attempt to advance the internal state by one part
55		// Whitespace and some punctutation will be collapsed
56		fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57		let &SentenceBreaksState(parts) = self;
58		let parts = match (parts[3], cat) {
59		(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60		(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61		_ => [
62		parts[1],
63		parts[2],
64		parts[3],
65		match cat {
66		SentenceCat::SC_CR => StatePart::CR,
67		SentenceCat::SC_LF => StatePart::LF,
68		SentenceCat::SC_Sep => StatePart::Sep,
69		SentenceCat::SC_ATerm => StatePart::ATerm,
70		SentenceCat::SC_Upper \| SentenceCat::SC_Lower => StatePart::UpperLower,
71		SentenceCat::SC_Close => StatePart::ClosePlus,
72		SentenceCat::SC_Sp => StatePart::SpPlus,
73		SentenceCat::SC_STerm => StatePart::STerm,
74		_ => StatePart::Other,
75		},
76		],
77		};
78		SentenceBreaksState(parts)
79		}
80
81		fn end(&self) -> SentenceBreaksState {
82		let &SentenceBreaksState(parts) = self;
83		SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
84		}
85
86		// Helper function to check if state head matches a single `StatePart`
87		fn match1(&self, part: StatePart) -> bool {
88		let &SentenceBreaksState(parts) = self;
89		part == parts[3]
90		}
91
92		// Helper function to check if first two `StateParts` in state match
93		// the given two
94		fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
95		let &SentenceBreaksState(parts) = self;
96		part1 == parts[2] && part2 == parts[3]
97		}
98		}
99
100		// https://unicode.org/reports/tr29/#SB8
101		// TODO cache this, it is currently quadratic
102		fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
103		let &SentenceBreaksState(parts) = state;
104		let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
105		if parts[idx] == StatePart::ClosePlus {
106		idx -= 1
107		}
108
109		if parts[idx] == StatePart::ATerm {
110		use crate::tables::sentence as se;
111
112		for next_char in ahead.chars() {
113		//( ¬(OLetter \| Upper \| Lower \| ParaSep \| SATerm) )* Lower
114		match se::sentence_category(next_char).2 {
115		se::SC_Lower => return true,
116		se::SC_OLetter
117		\| se::SC_Upper
118		\| se::SC_Sep
119		\| se::SC_CR
120		\| se::SC_LF
121		\| se::SC_STerm
122		\| se::SC_ATerm => return false,
123		_ => continue,
124		}
125		}
126		}
127
128		false
129		}
130
131		// https://unicode.org/reports/tr29/#SB8a
132		fn match_sb8a(state: &SentenceBreaksState) -> bool {
133		// SATerm Close* Sp*
134		let &SentenceBreaksState(parts) = state;
135		let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
136		if parts[idx] == StatePart::ClosePlus {
137		idx -= 1
138		}
139		parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
140		}
141
142		// https://unicode.org/reports/tr29/#SB9
143		fn match_sb9(state: &SentenceBreaksState) -> bool {
144		// SATerm Close*
145		let &SentenceBreaksState(parts) = state;
146		let idx = if parts[3] == StatePart::ClosePlus {
147		2
148		} else {
149		3
150		};
151		parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
152		}
153
154		// https://unicode.org/reports/tr29/#SB11
155		fn match_sb11(state: &SentenceBreaksState) -> bool {
156		// SATerm Close* Sp* ParaSep?
157		let &SentenceBreaksState(parts) = state;
158		let mut idx = match parts[3] {
159		StatePart::Sep \| StatePart::CR \| StatePart::LF => 2,
160		_ => 3,
161		};
162
163		if parts[idx] == StatePart::SpPlus {
164		idx -= 1
165		}
166		if parts[idx] == StatePart::ClosePlus {
167		idx -= 1
168		}
169
170		parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
171		}
172
173		impl<'a> Iterator for SentenceBreaks<'a> {
174		// Returns the index of the character which follows a break
175		type Item = usize;
176
177		#[inline]
178	0	fn size_hint(&self) -> (usize, Option<usize>) {
179	0	let slen = self.string.len();
180	0	// A sentence could be one character
181	0	(cmp::min(slen, 2), Some(slen + 1))
182	0	}
183
184		#[inline]
185	0	fn next(&mut self) -> Option<usize> {
186		use crate::tables::sentence as se;
187
188	0	for next_char in self.string[self.pos..].chars() {
189	0	let position_before = self.pos;
190	0	let state_before = self.state.clone();
191	0
192	0	let next_cat = se::sentence_category(next_char).2;
193	0
194	0	self.pos += next_char.len_utf8();
195	0	self.state = self.state.next(next_cat);
196
197	0	match next_cat {
198		// SB1 https://unicode.org/reports/tr29/#SB1
199	0	_ if state_before.match1(StatePart::Sot) => return Some(position_before),
200
201		// SB2 is handled when inner iterator (chars) is finished
202
203		// SB3 https://unicode.org/reports/tr29/#SB3
204	0	SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
205
206		// SB4 https://unicode.org/reports/tr29/#SB4
207	0	_ if state_before.match1(StatePart::Sep)
208	0	\|\| state_before.match1(StatePart::CR)
209	0	\|\| state_before.match1(StatePart::LF) =>
210	0	{
211	0	return Some(position_before)
212		}
213
214		// SB5 https://unicode.org/reports/tr29/#SB5
215	0	SentenceCat::SC_Extend \| SentenceCat::SC_Format => self.state = state_before,
216
217		// SB6 https://unicode.org/reports/tr29/#SB6
218	0	SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
219
220		// SB7 https://unicode.org/reports/tr29/#SB7
221		SentenceCat::SC_Upper
222	0	if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
223	0	{
224	0	continue
225		}
226
227		// SB8 https://unicode.org/reports/tr29/#SB8
228	0	_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
229
230		// SB8a https://unicode.org/reports/tr29/#SB8a
231		SentenceCat::SC_SContinue \| SentenceCat::SC_STerm \| SentenceCat::SC_ATerm
232	0	if match_sb8a(&state_before) =>
233		{
234	0	continue
235		}
236
237		// SB9 https://unicode.org/reports/tr29/#SB9
238		SentenceCat::SC_Close
239		\| SentenceCat::SC_Sp
240		\| SentenceCat::SC_Sep
241		\| SentenceCat::SC_CR
242		\| SentenceCat::SC_LF
243	0	if match_sb9(&state_before) =>
244		{
245	0	continue
246		}
247
248		// SB10 https://unicode.org/reports/tr29/#SB10
249		SentenceCat::SC_Sp
250		\| SentenceCat::SC_Sep
251		\| SentenceCat::SC_CR
252		\| SentenceCat::SC_LF
253	0	if match_sb8a(&state_before) =>
254		{
255	0	continue
256		}
257
258		// SB11 https://unicode.org/reports/tr29/#SB11
259	0	_ if match_sb11(&state_before) => return Some(position_before),
260
261		// SB998 https://unicode.org/reports/tr29/#SB998
262	0	_ => continue,
263		}
264		}
265
266		// SB2 https://unicode.org/reports/tr29/#SB2
267	0	if self.state.match1(StatePart::Sot) \|\| self.state.match1(StatePart::Eot) {
268	0	None
269		} else {
270	0	self.state = self.state.end();
271	0	Some(self.pos)
272		}
273	0	}
274		}
275
276		pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> {
277		SentenceBreaks {
278		string: source,
279		pos: 0,
280		state: INITIAL_STATE,
281		}
282		}
283		}
284
285		/// An iterator over the substrings of a string which, after splitting the string on
286		/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
287		/// contain any characters with the
288		/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
289		/// property, or with
290		/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
291		///
292		/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
293		/// trait. See its documentation for more.
294		///
295		/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
296		/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
297		#[derive(Debug, Clone)]
298		pub struct UnicodeSentences<'a> {
299		inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
300		}
301
302		/// External iterator for a string's
303		/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
304		///
305		/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
306		/// trait. See its documentation for more.
307		///
308		/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
309		/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
310		#[derive(Debug, Clone)]
311		pub struct USentenceBounds<'a> {
312		iter: fwd::SentenceBreaks<'a>,
313		sentence_start: Option<usize>,
314		}
315
316		/// External iterator for sentence boundaries and byte offsets.
317		///
318		/// This struct is created by the [`split_sentence_bound_indices`] method on the
319		/// [`UnicodeSegmentation`] trait. See its documentation for more.
320		///
321		/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
322		/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
323		#[derive(Debug, Clone)]
324		pub struct USentenceBoundIndices<'a> {
325		start_offset: usize,
326		iter: USentenceBounds<'a>,
327		}
328
329		#[inline]
330	0	pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> {
331	0	USentenceBounds {
332	0	iter: fwd::new_sentence_breaks(source),
333	0	sentence_start: None,
334	0	}
335	0	}
336
337		#[inline]
338	0	pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
339	0	USentenceBoundIndices {
340	0	start_offset: source.as_ptr() as usize,
341	0	iter: new_sentence_bounds(source),
342	0	}
343	0	}
344
345		#[inline]
346	0	pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
347		use super::UnicodeSegmentation;
348		use crate::tables::util::is_alphanumeric;
349
350		fn has_alphanumeric(s: &&str) -> bool {
351		s.chars().any(is_alphanumeric)
352		}
353	0	let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
354	0
355	0	UnicodeSentences {
356	0	inner: s.split_sentence_bounds().filter(has_alphanumeric),
357	0	}
358	0	}
359
360		impl<'a> Iterator for UnicodeSentences<'a> {
361		type Item = &'a str;
362
363		#[inline]
364	0	fn next(&mut self) -> Option<&'a str> {
365	0	self.inner.next()
366	0	}
367
368		#[inline]
369	0	fn size_hint(&self) -> (usize, Option<usize>) {
370	0	self.inner.size_hint()
371	0	}
372		}
373
374		impl<'a> Iterator for USentenceBounds<'a> {
375		type Item = &'a str;
376
377		#[inline]
378	0	fn size_hint(&self) -> (usize, Option<usize>) {
379	0	let (lower, upper) = self.iter.size_hint();
380	0	(cmp::max(0, lower - 1), upper.map(\|u\| cmp::max(0, u - 1)))
381	0	}
382
383		#[inline]
384	0	fn next(&mut self) -> Option<&'a str> {
385	0	if self.sentence_start.is_none() {
386	0	if let Some(start_pos) = self.iter.next() {
387	0	self.sentence_start = Some(start_pos)
388		} else {
389	0	return None;
390		}
391	0	}
392
393	0	if let Some(break_pos) = self.iter.next() {
394	0	let start_pos = self.sentence_start.unwrap();
395	0	let sentence = &self.iter.string[start_pos..break_pos];
396	0	self.sentence_start = Some(break_pos);
397	0	Some(sentence)
398		} else {
399	0	None
400		}
401	0	}
402		}
403
404		impl<'a> Iterator for USentenceBoundIndices<'a> {
405		type Item = (usize, &'a str);
406
407		#[inline]
408	0	fn next(&mut self) -> Option<(usize, &'a str)> {
409	0	self.iter
410	0	.next()
411	0	.map(\|s\| (s.as_ptr() as usize - self.start_offset, s))
412	0	}
413
414		#[inline]
415	0	fn size_hint(&self) -> (usize, Option<usize>) {
416	0	self.iter.size_hint()
417	0	}
418		}