/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_segmenter-1.5.0/src/grapheme.rs
Line | Count | Source |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | use alloc::vec::Vec; |
6 | | use icu_provider::prelude::*; |
7 | | |
8 | | use crate::indices::{Latin1Indices, Utf16Indices}; |
9 | | use crate::iterator_helpers::derive_usize_iterator_with_type; |
10 | | use crate::rule_segmenter::*; |
11 | | use crate::{provider::*, SegmenterError}; |
12 | | use utf8_iter::Utf8CharIndices; |
13 | | |
14 | | /// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string. |
15 | | /// |
16 | | /// Lifetimes: |
17 | | /// |
18 | | /// - `'l` = lifetime of the segmenter object from which this iterator was created |
19 | | /// - `'s` = lifetime of the string being segmented |
20 | | /// |
21 | | /// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit |
22 | | /// _after_ the boundary (for a boundary at the end of text, this index is the length |
23 | | /// of the [`str`] or array of code units). |
24 | | /// |
25 | | /// For examples of use, see [`GraphemeClusterSegmenter`]. |
26 | | #[derive(Debug)] |
27 | | pub struct GraphemeClusterBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>( |
28 | | RuleBreakIterator<'l, 's, Y>, |
29 | | ); |
30 | | |
31 | | derive_usize_iterator_with_type!(GraphemeClusterBreakIterator); |
32 | | |
33 | | /// Grapheme cluster break iterator for an `str` (a UTF-8 string). |
34 | | /// |
35 | | /// For examples of use, see [`GraphemeClusterSegmenter`]. |
36 | | pub type GraphemeClusterBreakIteratorUtf8<'l, 's> = |
37 | | GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf8>; |
38 | | |
39 | | /// Grapheme cluster break iterator for a potentially invalid UTF-8 string. |
40 | | /// |
41 | | /// For examples of use, see [`GraphemeClusterSegmenter`]. |
42 | | pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> = |
43 | | GraphemeClusterBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>; |
44 | | |
45 | | /// Grapheme cluster break iterator for a Latin-1 (8-bit) string. |
46 | | /// |
47 | | /// For examples of use, see [`GraphemeClusterSegmenter`]. |
48 | | pub type GraphemeClusterBreakIteratorLatin1<'l, 's> = |
49 | | GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeLatin1>; |
50 | | |
51 | | /// Grapheme cluster break iterator for a UTF-16 string. |
52 | | /// |
53 | | /// For examples of use, see [`GraphemeClusterSegmenter`]. |
54 | | pub type GraphemeClusterBreakIteratorUtf16<'l, 's> = |
55 | | GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf16>; |
56 | | |
57 | | /// Segments a string into grapheme clusters. |
58 | | /// |
59 | | /// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for |
60 | | /// different string encodings. |
61 | | /// |
62 | | /// # Examples |
63 | | /// |
64 | | /// Segment a string: |
65 | | /// |
66 | | /// ```rust |
67 | | /// use icu::segmenter::GraphemeClusterSegmenter; |
68 | | /// let segmenter = GraphemeClusterSegmenter::new(); |
69 | | /// |
70 | | /// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect(); |
71 | | /// // World Map (U+1F5FA) is encoded in four bytes in UTF-8. |
72 | | /// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]); |
73 | | /// ``` |
74 | | /// |
75 | | /// Segment a Latin1 byte string: |
76 | | /// |
77 | | /// ```rust |
78 | | /// use icu::segmenter::GraphemeClusterSegmenter; |
79 | | /// let segmenter = GraphemeClusterSegmenter::new(); |
80 | | /// |
81 | | /// let breakpoints: Vec<usize> = |
82 | | /// segmenter.segment_latin1(b"Hello World").collect(); |
83 | | /// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); |
84 | | /// ``` |
85 | | /// |
86 | | /// Successive boundaries can be used to retrieve the grapheme clusters. |
87 | | /// In particular, the first boundary is always 0, and the last one is the |
88 | | /// length of the segmented text in code units. |
89 | | /// |
90 | | /// ```rust |
91 | | /// # use icu::segmenter::GraphemeClusterSegmenter; |
92 | | /// # let segmenter = |
93 | | /// # GraphemeClusterSegmenter::new(); |
94 | | /// use itertools::Itertools; |
95 | | /// let text = "मांजर"; |
96 | | /// let grapheme_clusters: Vec<&str> = segmenter |
97 | | /// .segment_str(text) |
98 | | /// .tuple_windows() |
99 | | /// .map(|(i, j)| &text[i..j]) |
100 | | /// .collect(); |
101 | | /// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]); |
102 | | /// ``` |
103 | | /// |
104 | | /// This segmenter applies all rules provided to the constructor. |
105 | | /// Thus, if the data supplied by the provider comprises all |
106 | | /// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29, |
107 | | /// _Unicode Text Segmentation_, which is the case for the default data |
108 | | /// (both test data and data produced by `icu_datagen`), the `segment_*` |
109 | | /// functions return extended grapheme cluster boundaries, as opposed to |
110 | | /// legacy grapheme cluster boundaries. See [_Section 3, Grapheme Cluster |
111 | | /// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC], |
112 | | /// in Unicode Standard Annex #29, _Unicode Text Segmentation_. |
113 | | /// |
114 | | /// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules |
115 | | /// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries |
116 | | /// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters |
117 | | /// |
118 | | /// ```rust |
119 | | /// use icu::segmenter::GraphemeClusterSegmenter; |
120 | | /// let segmenter = |
121 | | /// GraphemeClusterSegmenter::new(); |
122 | | /// |
123 | | /// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster, |
124 | | /// // but not a legacy grapheme cluster. |
125 | | /// let ni = "நி"; |
126 | | /// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect(); |
127 | | /// assert_eq!(&egc_boundaries, &[0, ni.len()]); |
128 | | /// ``` |
129 | | #[derive(Debug)] |
130 | | pub struct GraphemeClusterSegmenter { |
131 | | payload: DataPayload<GraphemeClusterBreakDataV1Marker>, |
132 | | } |
133 | | |
134 | | #[cfg(feature = "compiled_data")] |
135 | | impl Default for GraphemeClusterSegmenter { |
136 | 0 | fn default() -> Self { |
137 | 0 | Self::new() |
138 | 0 | } |
139 | | } |
140 | | |
141 | | impl GraphemeClusterSegmenter { |
142 | | /// Constructs a [`GraphemeClusterSegmenter`] with an invariant locale from compiled data. |
143 | | /// |
144 | | /// ✨ *Enabled with the `compiled_data` Cargo feature.* |
145 | | /// |
146 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
147 | | #[cfg(feature = "compiled_data")] |
148 | 0 | pub fn new() -> Self { |
149 | 0 | Self { |
150 | 0 | payload: DataPayload::from_static_ref( |
151 | 0 | crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, |
152 | 0 | ), |
153 | 0 | } |
154 | 0 | } |
155 | | |
156 | | icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError, |
157 | | #[cfg(skip)] |
158 | | functions: [ |
159 | | new, |
160 | | try_new_with_any_provider, |
161 | | try_new_with_buffer_provider, |
162 | | try_new_unstable, |
163 | | Self, |
164 | | ]); |
165 | | |
166 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] |
167 | 0 | pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError> |
168 | 0 | where |
169 | 0 | D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized, |
170 | | { |
171 | 0 | let payload = provider.load(Default::default())?.take_payload()?; |
172 | 0 | Ok(Self { payload }) |
173 | 0 | } Unexecuted instantiation: <icu_segmenter::grapheme::GraphemeClusterSegmenter>::try_new_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::grapheme::GraphemeClusterSegmenter>::try_new_unstable::<_> |
174 | | |
175 | | /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string). |
176 | 0 | pub fn segment_str<'l, 's>( |
177 | 0 | &'l self, |
178 | 0 | input: &'s str, |
179 | 0 | ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> { |
180 | 0 | GraphemeClusterSegmenter::new_and_segment_str(input, self.payload.get()) |
181 | 0 | } |
182 | | |
183 | | /// Creates a grapheme cluster break iterator from a grapheme cluster rule payload. |
184 | | /// |
185 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
186 | 0 | pub(crate) fn new_and_segment_str<'l, 's>( |
187 | 0 | input: &'s str, |
188 | 0 | payload: &'l RuleBreakDataV1<'l>, |
189 | 0 | ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> { |
190 | 0 | GraphemeClusterBreakIterator(RuleBreakIterator { |
191 | 0 | iter: input.char_indices(), |
192 | 0 | len: input.len(), |
193 | 0 | current_pos_data: None, |
194 | 0 | result_cache: Vec::new(), |
195 | 0 | data: payload, |
196 | 0 | complex: None, |
197 | 0 | boundary_property: 0, |
198 | 0 | }) |
199 | 0 | } |
200 | | |
201 | | /// Creates a grapheme cluster break iterator for a potentially ill-formed UTF-8 string. |
202 | | /// |
203 | | /// Invalid characters are treated as U+FFFD REPLACEMENT CHARACTER. |
204 | | /// |
205 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
206 | 0 | pub fn segment_utf8<'l, 's>( |
207 | 0 | &'l self, |
208 | 0 | input: &'s [u8], |
209 | 0 | ) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> { |
210 | 0 | GraphemeClusterBreakIterator(RuleBreakIterator { |
211 | 0 | iter: Utf8CharIndices::new(input), |
212 | 0 | len: input.len(), |
213 | 0 | current_pos_data: None, |
214 | 0 | result_cache: Vec::new(), |
215 | 0 | data: self.payload.get(), |
216 | 0 | complex: None, |
217 | 0 | boundary_property: 0, |
218 | 0 | }) |
219 | 0 | } |
220 | | /// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string. |
221 | | /// |
222 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
223 | 0 | pub fn segment_latin1<'l, 's>( |
224 | 0 | &'l self, |
225 | 0 | input: &'s [u8], |
226 | 0 | ) -> GraphemeClusterBreakIteratorLatin1<'l, 's> { |
227 | 0 | GraphemeClusterBreakIterator(RuleBreakIterator { |
228 | 0 | iter: Latin1Indices::new(input), |
229 | 0 | len: input.len(), |
230 | 0 | current_pos_data: None, |
231 | 0 | result_cache: Vec::new(), |
232 | 0 | data: self.payload.get(), |
233 | 0 | complex: None, |
234 | 0 | boundary_property: 0, |
235 | 0 | }) |
236 | 0 | } |
237 | | |
238 | | /// Creates a grapheme cluster break iterator for a UTF-16 string. |
239 | | /// |
240 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
241 | 0 | pub fn segment_utf16<'l, 's>( |
242 | 0 | &'l self, |
243 | 0 | input: &'s [u16], |
244 | 0 | ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> { |
245 | 0 | GraphemeClusterSegmenter::new_and_segment_utf16(input, self.payload.get()) |
246 | 0 | } |
247 | | |
248 | | /// Creates a grapheme cluster break iterator from a grapheme cluster rule payload. |
249 | 0 | pub(crate) fn new_and_segment_utf16<'l, 's>( |
250 | 0 | input: &'s [u16], |
251 | 0 | payload: &'l RuleBreakDataV1<'l>, |
252 | 0 | ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> { |
253 | 0 | GraphemeClusterBreakIterator(RuleBreakIterator { |
254 | 0 | iter: Utf16Indices::new(input), |
255 | 0 | len: input.len(), |
256 | 0 | current_pos_data: None, |
257 | 0 | result_cache: Vec::new(), |
258 | 0 | data: payload, |
259 | 0 | complex: None, |
260 | 0 | boundary_property: 0, |
261 | 0 | }) |
262 | 0 | } |
263 | | } |
264 | | |
265 | | #[test] |
266 | | fn empty_string() { |
267 | | let segmenter = GraphemeClusterSegmenter::new(); |
268 | | let breaks: Vec<usize> = segmenter.segment_str("").collect(); |
269 | | assert_eq!(breaks, [0]); |
270 | | } |
271 | | |
272 | | #[test] |
273 | | fn emoji_flags() { |
274 | | // https://github.com/unicode-org/icu4x/issues/4780 |
275 | | let segmenter = GraphemeClusterSegmenter::new(); |
276 | | let breaks: Vec<usize> = segmenter.segment_str("🇺🇸🏴").collect(); |
277 | | assert_eq!(breaks, [0, 8, 36]); |
278 | | } |