Coverage Report

Created: 2025-11-11 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_segmenter-1.5.0/src/grapheme.rs
Line
Count
Source
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5
use alloc::vec::Vec;
6
use icu_provider::prelude::*;
7
8
use crate::indices::{Latin1Indices, Utf16Indices};
9
use crate::iterator_helpers::derive_usize_iterator_with_type;
10
use crate::rule_segmenter::*;
11
use crate::{provider::*, SegmenterError};
12
use utf8_iter::Utf8CharIndices;
13
14
/// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string.
15
///
16
/// Lifetimes:
17
///
18
/// - `'l` = lifetime of the segmenter object from which this iterator was created
19
/// - `'s` = lifetime of the string being segmented
20
///
21
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
22
/// _after_ the boundary (for a boundary at the end of text, this index is the length
23
/// of the [`str`] or array of code units).
24
///
25
/// For examples of use, see [`GraphemeClusterSegmenter`].
26
#[derive(Debug)]
27
pub struct GraphemeClusterBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28
    RuleBreakIterator<'l, 's, Y>,
29
);
30
31
derive_usize_iterator_with_type!(GraphemeClusterBreakIterator);
32
33
/// Grapheme cluster break iterator for an `str` (a UTF-8 string).
34
///
35
/// For examples of use, see [`GraphemeClusterSegmenter`].
36
pub type GraphemeClusterBreakIteratorUtf8<'l, 's> =
37
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf8>;
38
39
/// Grapheme cluster break iterator for a potentially invalid UTF-8 string.
40
///
41
/// For examples of use, see [`GraphemeClusterSegmenter`].
42
pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
43
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
44
45
/// Grapheme cluster break iterator for a Latin-1 (8-bit) string.
46
///
47
/// For examples of use, see [`GraphemeClusterSegmenter`].
48
pub type GraphemeClusterBreakIteratorLatin1<'l, 's> =
49
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeLatin1>;
50
51
/// Grapheme cluster break iterator for a UTF-16 string.
52
///
53
/// For examples of use, see [`GraphemeClusterSegmenter`].
54
pub type GraphemeClusterBreakIteratorUtf16<'l, 's> =
55
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf16>;
56
57
/// Segments a string into grapheme clusters.
58
///
59
/// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for
60
/// different string encodings.
61
///
62
/// # Examples
63
///
64
/// Segment a string:
65
///
66
/// ```rust
67
/// use icu::segmenter::GraphemeClusterSegmenter;
68
/// let segmenter = GraphemeClusterSegmenter::new();
69
///
70
/// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
71
/// // World Map (U+1F5FA) is encoded in four bytes in UTF-8.
72
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);
73
/// ```
74
///
75
/// Segment a Latin-1 byte string:
76
///
77
/// ```rust
78
/// use icu::segmenter::GraphemeClusterSegmenter;
79
/// let segmenter = GraphemeClusterSegmenter::new();
80
///
81
/// let breakpoints: Vec<usize> =
82
///     segmenter.segment_latin1(b"Hello World").collect();
83
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
84
/// ```
85
///
86
/// Successive boundaries can be used to retrieve the grapheme clusters.
87
/// In particular, the first boundary is always 0, and the last one is the
88
/// length of the segmented text in code units.
89
///
90
/// ```rust
91
/// # use icu::segmenter::GraphemeClusterSegmenter;
92
/// # let segmenter =
93
/// #     GraphemeClusterSegmenter::new();
94
/// use itertools::Itertools;
95
/// let text = "मांजर";
96
/// let grapheme_clusters: Vec<&str> = segmenter
97
///     .segment_str(text)
98
///     .tuple_windows()
99
///     .map(|(i, j)| &text[i..j])
100
///     .collect();
101
/// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);
102
/// ```
103
///
104
/// This segmenter applies all rules provided to the constructor.
105
/// Thus, if the data supplied by the provider comprises all
106
/// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29,
107
/// _Unicode Text Segmentation_, which is the case for the default data
108
/// (both test data and data produced by `icu_datagen`), the `segment_*`
109
/// functions return extended grapheme cluster boundaries, as opposed to
110
/// legacy grapheme cluster boundaries.  See [_Section 3, Grapheme Cluster
111
/// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC],
112
/// in Unicode Standard Annex #29, _Unicode Text Segmentation_.
113
///
114
/// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
115
/// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
116
/// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
117
///
118
/// ```rust
119
/// use icu::segmenter::GraphemeClusterSegmenter;
120
/// let segmenter =
121
///     GraphemeClusterSegmenter::new();
122
///
123
/// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
124
/// // but not a legacy grapheme cluster.
125
/// let ni = "நி";
126
/// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
127
/// assert_eq!(&egc_boundaries, &[0, ni.len()]);
128
/// ```
129
#[derive(Debug)]
130
pub struct GraphemeClusterSegmenter {
131
    payload: DataPayload<GraphemeClusterBreakDataV1Marker>,
132
}
133
134
#[cfg(feature = "compiled_data")]
135
impl Default for GraphemeClusterSegmenter {
136
0
    fn default() -> Self {
137
0
        Self::new()
138
0
    }
139
}
140
141
impl GraphemeClusterSegmenter {
142
    /// Constructs a [`GraphemeClusterSegmenter`] with an invariant locale from compiled data.
143
    ///
144
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
145
    ///
146
    /// [📚 Help choosing a constructor](icu_provider::constructors)
147
    #[cfg(feature = "compiled_data")]
148
0
    pub fn new() -> Self {
149
0
        Self {
150
0
            payload: DataPayload::from_static_ref(
151
0
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
152
0
            ),
153
0
        }
154
0
    }
155
156
    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
157
        #[cfg(skip)]
158
        functions: [
159
            new,
160
            try_new_with_any_provider,
161
            try_new_with_buffer_provider,
162
            try_new_unstable,
163
            Self,
164
    ]);
165
166
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
167
0
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
168
0
    where
169
0
        D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,
170
    {
171
0
        let payload = provider.load(Default::default())?.take_payload()?;
172
0
        Ok(Self { payload })
173
0
    }
Unexecuted instantiation: <icu_segmenter::grapheme::GraphemeClusterSegmenter>::try_new_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
Unexecuted instantiation: <icu_segmenter::grapheme::GraphemeClusterSegmenter>::try_new_unstable::<_>
174
175
    /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string).
176
0
    pub fn segment_str<'l, 's>(
177
0
        &'l self,
178
0
        input: &'s str,
179
0
    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
180
0
        GraphemeClusterSegmenter::new_and_segment_str(input, self.payload.get())
181
0
    }
182
183
    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
184
    ///
185
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
186
0
    pub(crate) fn new_and_segment_str<'l, 's>(
187
0
        input: &'s str,
188
0
        payload: &'l RuleBreakDataV1<'l>,
189
0
    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
190
0
        GraphemeClusterBreakIterator(RuleBreakIterator {
191
0
            iter: input.char_indices(),
192
0
            len: input.len(),
193
0
            current_pos_data: None,
194
0
            result_cache: Vec::new(),
195
0
            data: payload,
196
0
            complex: None,
197
0
            boundary_property: 0,
198
0
        })
199
0
    }
200
201
    /// Creates a grapheme cluster break iterator for a potentially ill-formed UTF-8 string.
202
    ///
203
    /// Invalid characters are treated as the REPLACEMENT CHARACTER (U+FFFD).
204
    ///
205
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
206
0
    pub fn segment_utf8<'l, 's>(
207
0
        &'l self,
208
0
        input: &'s [u8],
209
0
    ) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
210
0
        GraphemeClusterBreakIterator(RuleBreakIterator {
211
0
            iter: Utf8CharIndices::new(input),
212
0
            len: input.len(),
213
0
            current_pos_data: None,
214
0
            result_cache: Vec::new(),
215
0
            data: self.payload.get(),
216
0
            complex: None,
217
0
            boundary_property: 0,
218
0
        })
219
0
    }
220
    /// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
221
    ///
222
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
223
0
    pub fn segment_latin1<'l, 's>(
224
0
        &'l self,
225
0
        input: &'s [u8],
226
0
    ) -> GraphemeClusterBreakIteratorLatin1<'l, 's> {
227
0
        GraphemeClusterBreakIterator(RuleBreakIterator {
228
0
            iter: Latin1Indices::new(input),
229
0
            len: input.len(),
230
0
            current_pos_data: None,
231
0
            result_cache: Vec::new(),
232
0
            data: self.payload.get(),
233
0
            complex: None,
234
0
            boundary_property: 0,
235
0
        })
236
0
    }
237
238
    /// Creates a grapheme cluster break iterator for a UTF-16 string.
239
    ///
240
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
241
0
    pub fn segment_utf16<'l, 's>(
242
0
        &'l self,
243
0
        input: &'s [u16],
244
0
    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
245
0
        GraphemeClusterSegmenter::new_and_segment_utf16(input, self.payload.get())
246
0
    }
247
248
    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
249
0
    pub(crate) fn new_and_segment_utf16<'l, 's>(
250
0
        input: &'s [u16],
251
0
        payload: &'l RuleBreakDataV1<'l>,
252
0
    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
253
0
        GraphemeClusterBreakIterator(RuleBreakIterator {
254
0
            iter: Utf16Indices::new(input),
255
0
            len: input.len(),
256
0
            current_pos_data: None,
257
0
            result_cache: Vec::new(),
258
0
            data: payload,
259
0
            complex: None,
260
0
            boundary_property: 0,
261
0
        })
262
0
    }
263
}
264
265
#[test]
266
fn empty_string() {
267
    let segmenter = GraphemeClusterSegmenter::new();
268
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
269
    assert_eq!(breaks, [0]);
270
}
271
272
#[test]
273
fn emoji_flags() {
274
    // https://github.com/unicode-org/icu4x/issues/4780
275
    let segmenter = GraphemeClusterSegmenter::new();
276
    let breaks: Vec<usize> = segmenter.segment_str("🇺🇸🏴󠁧󠁢󠁥󠁮󠁧󠁿").collect();
277
    assert_eq!(breaks, [0, 8, 36]);
278
}