/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_segmenter-1.5.0/src/sentence.rs

Source
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use alloc::vec::Vec;
use icu_provider::prelude::*;

use crate::indices::{Latin1Indices, Utf16Indices};
use crate::iterator_helpers::derive_usize_iterator_with_type;
use crate::rule_segmenter::*;
use crate::{provider::*, SegmenterError};
use utf8_iter::Utf8CharIndices;

/// An [`Iterator`] over the sentence boundaries of a given string.
///
/// Lifetime parameters:
///
/// - `'l`: lifetime of the segmenter object that created this iterator
/// - `'s`: lifetime of the string being segmented
///
/// Each [`Iterator::Item`] is a [`usize`] giving the index of the code unit
/// _after_ the boundary. A boundary at the end of the text is reported as the
/// length of the [`str`] or array of code units.
///
/// For examples of use, see [`SentenceSegmenter`].
#[derive(Debug)]
pub struct SentenceBreakIterator<'l, 's, Y>(RuleBreakIterator<'l, 's, Y>)
where
    Y: RuleBreakType<'l, 's> + ?Sized;

// Generates the `Iterator` implementation for `SentenceBreakIterator`; per that type's
// docs the items are `usize` boundary indices.
// NOTE(review): presumably forwards to the wrapped `RuleBreakIterator` — confirm in
// `crate::iterator_helpers`.
derive_usize_iterator_with_type!(SentenceBreakIterator);

/// Sentence break iterator for a [`str`] (a UTF-8 string).
///
/// This is the type returned by [`SentenceSegmenter::segment_str`].
///
/// For examples of use, see [`SentenceSegmenter`].
pub type SentenceBreakIteratorUtf8<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf8>;

/// Sentence break iterator for a potentially invalid UTF-8 string.
///
/// This is the type returned by [`SentenceSegmenter::segment_utf8`].
///
/// For examples of use, see [`SentenceSegmenter`].
pub type SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
    SentenceBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;

/// Sentence break iterator for a Latin-1 (8-bit) string.
///
/// This is the type returned by [`SentenceSegmenter::segment_latin1`].
///
/// For examples of use, see [`SentenceSegmenter`].
pub type SentenceBreakIteratorLatin1<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeLatin1>;

/// Sentence break iterator for a UTF-16 string.
///
/// This is the type returned by [`SentenceSegmenter::segment_utf16`].
///
/// For examples of use, see [`SentenceSegmenter`].
pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf16>;

/// Supports loading sentence break data, and creating sentence break iterators for different
/// string encodings (UTF-8, potentially ill-formed UTF-8, Latin-1, and UTF-16).
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::SentenceSegmenter;
/// let segmenter = SentenceSegmenter::new();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 11]);
/// ```
///
/// Segment a Latin-1 byte string:
///
/// ```rust
/// use icu::segmenter::SentenceSegmenter;
/// let segmenter = SentenceSegmenter::new();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the sentences.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::SentenceSegmenter;
/// # let segmenter = SentenceSegmenter::new();
/// use itertools::Itertools;
/// let text = "Ceci tuera cela. Le livre tuera l’édifice.";
/// let sentences: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(
///     &sentences,
///     &["Ceci tuera cela. ", "Le livre tuera l’édifice."]
/// );
/// ```
#[derive(Debug)]
pub struct SentenceSegmenter {
    // Sentence break data; each `segment_*` method passes a borrow of it
    // (`self.payload.get()`) to the iterator it creates.
    payload: DataPayload<SentenceBreakDataV1Marker>,
}

#[cfg(feature = "compiled_data")]
impl Default for SentenceSegmenter {
    /// Same as calling [`SentenceSegmenter::new`].
    fn default() -> SentenceSegmenter {
        SentenceSegmenter::new()
    }
}

impl SentenceSegmenter {
    /// Builds a [`SentenceSegmenter`] that uses compiled data and an invariant locale.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self {
        let payload: DataPayload<SentenceBreakDataV1Marker> =
            DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1);
        Self { payload }
    }

    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new,
            try_new_with_any_provider,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]
    );

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
    where
        D: DataProvider<SentenceBreakDataV1Marker> + ?Sized,
    {
        // Errors from loading, or from taking the payload, are propagated via `?`.
        let response = provider.load(Default::default())?;
        let payload = response.take_payload()?;
        Ok(Self { payload })
    }

    /// Returns an iterator over the sentence breaks of a `str` (a UTF-8 string).
    ///
    /// Breakpoints are always reported at 0 and at the string length; for the empty
    /// string there is only the one at 0.
    pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> {
        let data = self.payload.get();
        let len = input.len();
        let iter = input.char_indices();
        SentenceBreakIterator(RuleBreakIterator {
            iter,
            len,
            data,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
        })
    }

    /// Returns an iterator over the sentence breaks of a potentially ill-formed UTF-8 string.
    ///
    /// Invalid characters are treated as REPLACEMENT CHARACTER.
    ///
    /// Breakpoints are always reported at 0 and at the string length; for the empty
    /// string there is only the one at 0.
    pub fn segment_utf8<'l, 's>(
        &'l self,
        input: &'s [u8],
    ) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
        let data = self.payload.get();
        let len = input.len();
        let iter = Utf8CharIndices::new(input);
        SentenceBreakIterator(RuleBreakIterator {
            iter,
            len,
            data,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
        })
    }

    /// Returns an iterator over the sentence breaks of a Latin-1 (8-bit) string.
    ///
    /// Breakpoints are always reported at 0 and at the string length; for the empty
    /// string there is only the one at 0.
    pub fn segment_latin1<'l, 's>(
        &'l self,
        input: &'s [u8],
    ) -> SentenceBreakIteratorLatin1<'l, 's> {
        let data = self.payload.get();
        let len = input.len();
        let iter = Latin1Indices::new(input);
        SentenceBreakIterator(RuleBreakIterator {
            iter,
            len,
            data,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
        })
    }

    /// Returns an iterator over the sentence breaks of a UTF-16 string.
    ///
    /// Breakpoints are always reported at 0 and at the string length; for the empty
    /// string there is only the one at 0.
    pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> {
        let data = self.payload.get();
        let len = input.len();
        let iter = Utf16Indices::new(input);
        SentenceBreakIterator(RuleBreakIterator {
            iter,
            len,
            data,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
        })
    }
}

// `SentenceSegmenter::new` is only compiled with the `compiled_data` feature, so the
// test is gated on that feature. Nothing in the test touches `serde`.
#[cfg(all(test, feature = "compiled_data"))]
#[test]
fn empty_string() {
    // Segmenting the empty string must yield exactly one boundary, at index 0.
    let segmenter = SentenceSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}

Coverage Report

Created: 2025-11-24 06:32

Line	Count	Source
1		// This file is part of ICU4X. For terms of use, please see the file
2		// called LICENSE at the top level of the ICU4X source tree
3		// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5		use alloc::vec::Vec;
6		use icu_provider::prelude::*;
7
8		use crate::indices::{Latin1Indices, Utf16Indices};
9		use crate::iterator_helpers::derive_usize_iterator_with_type;
10		use crate::rule_segmenter::*;
11		use crate::{provider::*, SegmenterError};
12		use utf8_iter::Utf8CharIndices;
13
14		/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
15		///
16		/// Lifetimes:
17		///
18		/// - `'l` = lifetime of the segmenter object from which this iterator was created
19		/// - `'s` = lifetime of the string being segmented
20		///
21		/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
22		/// _after_ the boundary (for a boundary at the end of text, this index is the length
23		/// of the [`str`] or array of code units).
24		///
25		/// For examples of use, see [`SentenceSegmenter`].
26		#[derive(Debug)]
27		pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28		RuleBreakIterator<'l, 's, Y>,
29		);
30
31		derive_usize_iterator_with_type!(SentenceBreakIterator);
32
33		/// Sentence break iterator for an `str` (a UTF-8 string).
34		///
35		/// For examples of use, see [`SentenceSegmenter`].
36		pub type SentenceBreakIteratorUtf8<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf8>;
37
38		/// Sentence break iterator for a potentially invalid UTF-8 string.
39		///
40		/// For examples of use, see [`SentenceSegmenter`].
41		pub type SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
42		SentenceBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
43
44		/// Sentence break iterator for a Latin-1 (8-bit) string.
45		///
46		/// For examples of use, see [`SentenceSegmenter`].
47		pub type SentenceBreakIteratorLatin1<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeLatin1>;
48
49		/// Sentence break iterator for a UTF-16 string.
50		///
51		/// For examples of use, see [`SentenceSegmenter`].
52		pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf16>;
53
54		/// Supports loading sentence break data, and creating sentence break iterators for different string
55		/// encodings.
56		///
57		/// # Examples
58		///
59		/// Segment a string:
60		///
61		/// ```rust
62		/// use icu::segmenter::SentenceSegmenter;
63		/// let segmenter = SentenceSegmenter::new();
64		///
65		/// let breakpoints: Vec<usize> =
66		/// segmenter.segment_str("Hello World").collect();
67		/// assert_eq!(&breakpoints, &[0, 11]);
68		/// ```
69		///
70		/// Segment a Latin1 byte string:
71		///
72		/// ```rust
73		/// use icu::segmenter::SentenceSegmenter;
74		/// let segmenter = SentenceSegmenter::new();
75		///
76		/// let breakpoints: Vec<usize> =
77		/// segmenter.segment_latin1(b"Hello World").collect();
78		/// assert_eq!(&breakpoints, &[0, 11]);
79		/// ```
80		///
81		/// Successive boundaries can be used to retrieve the sentences.
82		/// In particular, the first boundary is always 0, and the last one is the
83		/// length of the segmented text in code units.
84		///
85		/// ```rust
86		/// # use icu::segmenter::SentenceSegmenter;
87		/// # let segmenter = SentenceSegmenter::new();
88		/// use itertools::Itertools;
89		/// let text = "Ceci tuera cela. Le livre tuera l’édifice.";
90		/// let sentences: Vec<&str> = segmenter
91		/// .segment_str(text)
92		/// .tuple_windows()
93		/// .map(|(i, j)| &text[i..j])
94		/// .collect();
95		/// assert_eq!(
96		/// &sentences,
97		/// &["Ceci tuera cela. ", "Le livre tuera l’édifice."]
98		/// );
99		/// ```
100		#[derive(Debug)]
101		pub struct SentenceSegmenter {
102		payload: DataPayload<SentenceBreakDataV1Marker>,
103		}
104
105		#[cfg(feature = "compiled_data")]
106		impl Default for SentenceSegmenter {
107	0	fn default() -> Self {
108	0	Self::new()
109	0	}
110		}
111
112		impl SentenceSegmenter {
113		/// Constructs a [`SentenceSegmenter`] with an invariant locale and compiled data.
114		///
115		/// ✨ Enabled with the `compiled_data` Cargo feature.
116		///
117		/// [📚 Help choosing a constructor](icu_provider::constructors)
118		#[cfg(feature = "compiled_data")]
119	0	pub fn new() -> Self {
120	0	Self {
121	0	payload: DataPayload::from_static_ref(
122	0	crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1,
123	0	),
124	0	}
125	0	}
126
127		icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
128		#[cfg(skip)]
129		functions: [
130		new,
131		try_new_with_any_provider,
132		try_new_with_buffer_provider,
133		try_new_unstable,
134		Self,
135		]
136		);
137
138		#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
139	0	pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
140	0	where
141	0	D: DataProvider<SentenceBreakDataV1Marker> + ?Sized,
142		{
143	0	let payload = provider.load(Default::default())?.take_payload()?;
144	0	Ok(Self { payload })
145	0	}
		Unexecuted instantiation: <icu_segmenter::sentence::SentenceSegmenter>::try_new_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
		Unexecuted instantiation: <icu_segmenter::sentence::SentenceSegmenter>::try_new_unstable::<_>
146
147		/// Creates a sentence break iterator for an `str` (a UTF-8 string).
148		///
149		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
150	0	pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> {
151	0	SentenceBreakIterator(RuleBreakIterator {
152	0	iter: input.char_indices(),
153	0	len: input.len(),
154	0	current_pos_data: None,
155	0	result_cache: Vec::new(),
156	0	data: self.payload.get(),
157	0	complex: None,
158	0	boundary_property: 0,
159	0	})
160	0	}
161		/// Creates a sentence break iterator for a potentially ill-formed UTF8 string
162		///
163		/// Invalid characters are treated as REPLACEMENT CHARACTER
164		///
165		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
166	0	pub fn segment_utf8<'l, 's>(
167	0	&'l self,
168	0	input: &'s [u8],
169	0	) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
170	0	SentenceBreakIterator(RuleBreakIterator {
171	0	iter: Utf8CharIndices::new(input),
172	0	len: input.len(),
173	0	current_pos_data: None,
174	0	result_cache: Vec::new(),
175	0	data: self.payload.get(),
176	0	complex: None,
177	0	boundary_property: 0,
178	0	})
179	0	}
180		/// Creates a sentence break iterator for a Latin-1 (8-bit) string.
181		///
182		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
183	0	pub fn segment_latin1<'l, 's>(
184	0	&'l self,
185	0	input: &'s [u8],
186	0	) -> SentenceBreakIteratorLatin1<'l, 's> {
187	0	SentenceBreakIterator(RuleBreakIterator {
188	0	iter: Latin1Indices::new(input),
189	0	len: input.len(),
190	0	current_pos_data: None,
191	0	result_cache: Vec::new(),
192	0	data: self.payload.get(),
193	0	complex: None,
194	0	boundary_property: 0,
195	0	})
196	0	}
197
198		/// Creates a sentence break iterator for a UTF-16 string.
199		///
200		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
201	0	pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> {
202	0	SentenceBreakIterator(RuleBreakIterator {
203	0	iter: Utf16Indices::new(input),
204	0	len: input.len(),
205	0	current_pos_data: None,
206	0	result_cache: Vec::new(),
207	0	data: self.payload.get(),
208	0	complex: None,
209	0	boundary_property: 0,
210	0	})
211	0	}
212		}
213
214		#[cfg(all(test, feature = "serde"))]
215		#[test]
216		fn empty_string() {
217		let segmenter = SentenceSegmenter::new();
218		let breaks: Vec<usize> = segmenter.segment_str("").collect();
219		assert_eq!(breaks, [0]);
220		}