/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_segmenter-1.5.0/src/grapheme.rs

Source
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use alloc::vec::Vec;
use icu_provider::prelude::*;

use crate::indices::{Latin1Indices, Utf16Indices};
use crate::iterator_helpers::derive_usize_iterator_with_type;
use crate::rule_segmenter::*;
use crate::{provider::*, SegmenterError};
use utf8_iter::Utf8CharIndices;

/// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string.
///
/// This is a thin newtype around the crate's generic [`RuleBreakIterator`]; the type
/// parameter `Y` selects the encoding of the text being segmented.
///
/// Lifetimes:
///
/// - `'l` = lifetime of the segmenter object from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing the index of the code unit
/// _after_ the boundary (for a boundary at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
#[derive(Debug)]
pub struct GraphemeClusterBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
    RuleBreakIterator<'l, 's, Y>,
);

// NOTE(review): presumably this macro supplies the `Iterator<Item = usize>` impl by
// forwarding to the wrapped iterator — confirm in `crate::iterator_helpers`.
derive_usize_iterator_with_type!(GraphemeClusterBreakIterator);

/// Grapheme cluster break iterator for a `str` (a UTF-8 string).
///
/// Returned by [`GraphemeClusterSegmenter::segment_str`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf8>;

/// Grapheme cluster break iterator for a potentially invalid UTF-8 string.
///
/// Returned by [`GraphemeClusterSegmenter::segment_utf8`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;

/// Grapheme cluster break iterator for a Latin-1 (8-bit) string.
///
/// Returned by [`GraphemeClusterSegmenter::segment_latin1`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorLatin1<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeLatin1>;

/// Grapheme cluster break iterator for a UTF-16 string.
///
/// Returned by [`GraphemeClusterSegmenter::segment_utf16`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf16<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf16>;

/// Segments a string into grapheme clusters.
///
/// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for
/// different string encodings.
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
/// // World Map (U+1F5FA) is encoded in four bytes in UTF-8.
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);
/// ```
///
/// Segment a Latin-1 byte string:
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the grapheme clusters.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::GraphemeClusterSegmenter;
/// # let segmenter =
/// #     GraphemeClusterSegmenter::new();
/// use itertools::Itertools;
/// let text = "मांजर";
/// let grapheme_clusters: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);
/// ```
///
/// This segmenter applies all rules provided to the constructor.
/// Thus, if the data supplied by the provider comprises all
/// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29,
/// _Unicode Text Segmentation_, which is the case of default data
/// (both test data and data produced by `icu_datagen`), the `segment_*`
/// functions return extended grapheme cluster boundaries, as opposed to
/// legacy grapheme cluster boundaries. See [_Section 3, Grapheme Cluster
/// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC],
/// in Unicode Standard Annex #29, _Unicode Text Segmentation_.
///
/// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
/// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
/// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter =
///     GraphemeClusterSegmenter::new();
///
/// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
/// // but not a legacy grapheme cluster.
/// let ni = "நி";
/// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
/// assert_eq!(&egc_boundaries, &[0, ni.len()]);
/// ```
#[derive(Debug)]
pub struct GraphemeClusterSegmenter {
    // Grapheme cluster break rule data; every `segment_*` method hands a borrow of it
    // (via `payload.get()`) to the iterator it creates.
    payload: DataPayload<GraphemeClusterBreakDataV1Marker>,
}

#[cfg(feature = "compiled_data")]
impl Default for GraphemeClusterSegmenter {
    /// Same as [`GraphemeClusterSegmenter::new`]: a segmenter backed by compiled data.
    fn default() -> Self {
        GraphemeClusterSegmenter::new()
    }
}

impl GraphemeClusterSegmenter {
    /// Builds a [`GraphemeClusterSegmenter`] with an invariant locale, backed by compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self {
        // The baked singleton is a static reference, so it is wrapped rather than copied.
        let payload =
            DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1);
        Self { payload }
    }

    // Invokes the `icu_provider` constructor-generation macro with locale and options
    // skipped and `SegmenterError` as the error type.
    // NOTE(review): presumably this emits `try_new_with_any_provider` and
    // `try_new_with_buffer_provider` on top of `try_new_unstable` — confirm against the
    // macro's documentation.
    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new,
            try_new_with_any_provider,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
    ]);

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
    where
        D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,
    {
        // Loading and payload extraction can each fail; `?` propagates either failure
        // as a `SegmenterError`.
        let response = provider.load(Default::default())?;
        Ok(Self {
            payload: response.take_payload()?,
        })
    }

    /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string).
    pub fn segment_str<'l, 's>(
        &'l self,
        input: &'s str,
    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
        GraphemeClusterSegmenter::new_and_segment_str(input, self.payload.get())
    }

    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub(crate) fn new_and_segment_str<'l, 's>(
        input: &'s str,
        payload: &'l RuleBreakDataV1<'l>,
    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: input.char_indices(),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: payload,
            complex: None,
            boundary_property: 0,
        })
    }

    /// Creates a grapheme cluster break iterator for a potentially ill-formed UTF8 string
    ///
    /// Invalid characters are treated as REPLACEMENT CHARACTER
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_utf8<'l, 's>(
        &'l self,
        input: &'s [u8],
    ) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: Utf8CharIndices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            complex: None,
            boundary_property: 0,
        })
    }
    /// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_latin1<'l, 's>(
        &'l self,
        input: &'s [u8],
    ) -> GraphemeClusterBreakIteratorLatin1<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: Latin1Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            complex: None,
            boundary_property: 0,
        })
    }

    /// Creates a grapheme cluster break iterator for a UTF-16 string.
    ///
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
    pub fn segment_utf16<'l, 's>(
        &'l self,
        input: &'s [u16],
    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
        GraphemeClusterSegmenter::new_and_segment_utf16(input, self.payload.get())
    }

    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
    pub(crate) fn new_and_segment_utf16<'l, 's>(
        input: &'s [u16],
        payload: &'l RuleBreakDataV1<'l>,
    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
        GraphemeClusterBreakIterator(RuleBreakIterator {
            iter: Utf16Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data: payload,
            complex: None,
            boundary_property: 0,
        })
    }
}

// `GraphemeClusterSegmenter::new` only exists when the `compiled_data` feature is on,
// so the test is gated on the same feature to keep the test target compiling without it.
#[cfg(feature = "compiled_data")]
#[test]
fn empty_string() {
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    // Even with no text there is one boundary, at index 0.
    assert_eq!(breaks, [0]);
}

// `GraphemeClusterSegmenter::new` only exists when the `compiled_data` feature is on,
// so the test is gated on the same feature to keep the test target compiling without it.
#[cfg(feature = "compiled_data")]
#[test]
fn emoji_flags() {
    // https://github.com/unicode-org/icu4x/issues/4780
    let segmenter = GraphemeClusterSegmenter::new();
    // Two flags: a regional-indicator pair (2 x 4 bytes), then a black flag followed by
    // invisible tag characters (7 x 4 bytes). Each flag must stay a single cluster.
    let breaks: Vec<usize> = segmenter.segment_str("🇺🇸🏴󠁧󠁢󠁥󠁮󠁧󠁿").collect();
    assert_eq!(breaks, [0, 8, 36]);
}

Coverage Report

Created: 2025-11-11 06:52

Line	Count	Source
1		// This file is part of ICU4X. For terms of use, please see the file
2		// called LICENSE at the top level of the ICU4X source tree
3		// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5		use alloc::vec::Vec;
6		use icu_provider::prelude::*;
7
8		use crate::indices::{Latin1Indices, Utf16Indices};
9		use crate::iterator_helpers::derive_usize_iterator_with_type;
10		use crate::rule_segmenter::*;
11		use crate::{provider::*, SegmenterError};
12		use utf8_iter::Utf8CharIndices;
13
14		/// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string.
15		///
16		/// Lifetimes:
17		///
18		/// - `'l` = lifetime of the segmenter object from which this iterator was created
19		/// - `'s` = lifetime of the string being segmented
20		///
21		/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
22		/// _after_ the boundary (for a boundary at the end of text, this index is the length
23		/// of the [`str`] or array of code units).
24		///
25		/// For examples of use, see [`GraphemeClusterSegmenter`].
26		#[derive(Debug)]
27		pub struct GraphemeClusterBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28		RuleBreakIterator<'l, 's, Y>,
29		);
30
31		derive_usize_iterator_with_type!(GraphemeClusterBreakIterator);
32
33		/// Grapheme cluster break iterator for an `str` (a UTF-8 string).
34		///
35		/// For examples of use, see [`GraphemeClusterSegmenter`].
36		pub type GraphemeClusterBreakIteratorUtf8<'l, 's> =
37		GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf8>;
38
39		/// Grapheme cluster break iterator for a potentially invalid UTF-8 string.
40		///
41		/// For examples of use, see [`GraphemeClusterSegmenter`].
42		pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
43		GraphemeClusterBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
44
45		/// Grapheme cluster break iterator for a Latin-1 (8-bit) string.
46		///
47		/// For examples of use, see [`GraphemeClusterSegmenter`].
48		pub type GraphemeClusterBreakIteratorLatin1<'l, 's> =
49		GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeLatin1>;
50
51		/// Grapheme cluster break iterator for a UTF-16 string.
52		///
53		/// For examples of use, see [`GraphemeClusterSegmenter`].
54		pub type GraphemeClusterBreakIteratorUtf16<'l, 's> =
55		GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf16>;
56
57		/// Segments a string into grapheme clusters.
58		///
59		/// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for
60		/// different string encodings.
61		///
62		/// # Examples
63		///
64		/// Segment a string:
65		///
66		/// ```rust
67		/// use icu::segmenter::GraphemeClusterSegmenter;
68		/// let segmenter = GraphemeClusterSegmenter::new();
69		///
70		/// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
71		/// // World Map (U+1F5FA) is encoded in four bytes in UTF-8.
72		/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);
73		/// ```
74		///
75		/// Segment a Latin1 byte string:
76		///
77		/// ```rust
78		/// use icu::segmenter::GraphemeClusterSegmenter;
79		/// let segmenter = GraphemeClusterSegmenter::new();
80		///
81		/// let breakpoints: Vec<usize> =
82		/// segmenter.segment_latin1(b"Hello World").collect();
83		/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
84		/// ```
85		///
86		/// Successive boundaries can be used to retrieve the grapheme clusters.
87		/// In particular, the first boundary is always 0, and the last one is the
88		/// length of the segmented text in code units.
89		///
90		/// ```rust
91		/// # use icu::segmenter::GraphemeClusterSegmenter;
92		/// # let segmenter =
93		/// # GraphemeClusterSegmenter::new();
94		/// use itertools::Itertools;
95		/// let text = "मांजर";
96		/// let grapheme_clusters: Vec<&str> = segmenter
97		/// .segment_str(text)
98		/// .tuple_windows()
99		/// .map(|(i, j)| &text[i..j])
100		/// .collect();
101		/// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);
102		/// ```
103		///
104		/// This segmenter applies all rules provided to the constructor.
105		/// Thus, if the data supplied by the provider comprises all
106		/// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29,
107		/// _Unicode Text Segmentation_, which is the case of default data
108		/// (both test data and data produced by `icu_datagen`), the `segment_*`
109		/// functions return extended grapheme cluster boundaries, as opposed to
110		/// legacy grapheme cluster boundaries. See [_Section 3, Grapheme Cluster
111		/// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC],
112		/// in Unicode Standard Annex #29, _Unicode Text Segmentation_.
113		///
114		/// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
115		/// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
116		/// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
117		///
118		/// ```rust
119		/// use icu::segmenter::GraphemeClusterSegmenter;
120		/// let segmenter =
121		/// GraphemeClusterSegmenter::new();
122		///
123		/// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
124		/// // but not a legacy grapheme cluster.
125		/// let ni = "நி";
126		/// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
127		/// assert_eq!(&egc_boundaries, &[0, ni.len()]);
128		/// ```
129		#[derive(Debug)]
130		pub struct GraphemeClusterSegmenter {
131		payload: DataPayload<GraphemeClusterBreakDataV1Marker>,
132		}
133
134		#[cfg(feature = "compiled_data")]
135		impl Default for GraphemeClusterSegmenter {
136	0	fn default() -> Self {
137	0	Self::new()
138	0	}
139		}
140
141		impl GraphemeClusterSegmenter {
142		/// Constructs a [`GraphemeClusterSegmenter`] with an invariant locale from compiled data.
143		///
144		/// ✨ Enabled with the `compiled_data` Cargo feature.
145		///
146		/// [📚 Help choosing a constructor](icu_provider::constructors)
147		#[cfg(feature = "compiled_data")]
148	0	pub fn new() -> Self {
149	0	Self {
150	0	payload: DataPayload::from_static_ref(
151	0	crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
152	0	),
153	0	}
154	0	}
155
156		icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
157		#[cfg(skip)]
158		functions: [
159		new,
160		try_new_with_any_provider,
161		try_new_with_buffer_provider,
162		try_new_unstable,
163		Self,
164		]);
165
166		#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
167	0	pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
168	0	where
169	0	D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,
170		{
171	0	let payload = provider.load(Default::default())?.take_payload()?;
172	0	Ok(Self { payload })
173	0	} Unexecuted instantiation: <icu_segmenter::grapheme::GraphemeClusterSegmenter>::try_new_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::grapheme::GraphemeClusterSegmenter>::try_new_unstable::<_>
174
175		/// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string).
176	0	pub fn segment_str<'l, 's>(
177	0	&'l self,
178	0	input: &'s str,
179	0	) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
180	0	GraphemeClusterSegmenter::new_and_segment_str(input, self.payload.get())
181	0	}
182
183		/// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
184		///
185		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
186	0	pub(crate) fn new_and_segment_str<'l, 's>(
187	0	input: &'s str,
188	0	payload: &'l RuleBreakDataV1<'l>,
189	0	) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
190	0	GraphemeClusterBreakIterator(RuleBreakIterator {
191	0	iter: input.char_indices(),
192	0	len: input.len(),
193	0	current_pos_data: None,
194	0	result_cache: Vec::new(),
195	0	data: payload,
196	0	complex: None,
197	0	boundary_property: 0,
198	0	})
199	0	}
200
201		/// Creates a grapheme cluster break iterator for a potentially ill-formed UTF8 string
202		///
203		/// Invalid characters are treated as REPLACEMENT CHARACTER
204		///
205		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
206	0	pub fn segment_utf8<'l, 's>(
207	0	&'l self,
208	0	input: &'s [u8],
209	0	) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
210	0	GraphemeClusterBreakIterator(RuleBreakIterator {
211	0	iter: Utf8CharIndices::new(input),
212	0	len: input.len(),
213	0	current_pos_data: None,
214	0	result_cache: Vec::new(),
215	0	data: self.payload.get(),
216	0	complex: None,
217	0	boundary_property: 0,
218	0	})
219	0	}
220		/// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
221		///
222		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
223	0	pub fn segment_latin1<'l, 's>(
224	0	&'l self,
225	0	input: &'s [u8],
226	0	) -> GraphemeClusterBreakIteratorLatin1<'l, 's> {
227	0	GraphemeClusterBreakIterator(RuleBreakIterator {
228	0	iter: Latin1Indices::new(input),
229	0	len: input.len(),
230	0	current_pos_data: None,
231	0	result_cache: Vec::new(),
232	0	data: self.payload.get(),
233	0	complex: None,
234	0	boundary_property: 0,
235	0	})
236	0	}
237
238		/// Creates a grapheme cluster break iterator for a UTF-16 string.
239		///
240		/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
241	0	pub fn segment_utf16<'l, 's>(
242	0	&'l self,
243	0	input: &'s [u16],
244	0	) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
245	0	GraphemeClusterSegmenter::new_and_segment_utf16(input, self.payload.get())
246	0	}
247
248		/// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
249	0	pub(crate) fn new_and_segment_utf16<'l, 's>(
250	0	input: &'s [u16],
251	0	payload: &'l RuleBreakDataV1<'l>,
252	0	) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
253	0	GraphemeClusterBreakIterator(RuleBreakIterator {
254	0	iter: Utf16Indices::new(input),
255	0	len: input.len(),
256	0	current_pos_data: None,
257	0	result_cache: Vec::new(),
258	0	data: payload,
259	0	complex: None,
260	0	boundary_property: 0,
261	0	})
262	0	}
263		}
264
265		#[test]
266		fn empty_string() {
267		let segmenter = GraphemeClusterSegmenter::new();
268		let breaks: Vec<usize> = segmenter.segment_str("").collect();
269		assert_eq!(breaks, [0]);
270		}
271
272		#[test]
273		fn emoji_flags() {
274		// https://github.com/unicode-org/icu4x/issues/4780
275		let segmenter = GraphemeClusterSegmenter::new();
276		let breaks: Vec<usize> = segmenter.segment_str("🇺🇸🏴󠁧󠁢󠁥󠁮󠁧󠁿").collect();
277		assert_eq!(breaks, [0, 8, 36]);
278		}