/src/unicode-segmentation/src/grapheme.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | use core::cmp; |
12 | | |
13 | | use crate::tables::grapheme::GraphemeCat; |
14 | | |
15 | | /// External iterator for grapheme clusters and byte offsets. |
16 | | /// |
17 | | /// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`] |
18 | | /// trait. See its documentation for more. |
19 | | /// |
20 | | /// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices |
21 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
22 | | #[derive(Debug, Clone)] |
23 | | pub struct GraphemeIndices<'a> { |
24 | | start_offset: usize, |
25 | | iter: Graphemes<'a>, |
26 | | } |
27 | | |
28 | | impl<'a> GraphemeIndices<'a> { |
29 | | #[inline] |
30 | | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
31 | | /// |
32 | | /// ```rust |
33 | | /// # use unicode_segmentation::UnicodeSegmentation; |
34 | | /// let mut iter = "abc".grapheme_indices(true); |
35 | | /// assert_eq!(iter.as_str(), "abc"); |
36 | | /// iter.next(); |
37 | | /// assert_eq!(iter.as_str(), "bc"); |
38 | | /// iter.next(); |
39 | | /// iter.next(); |
40 | | /// assert_eq!(iter.as_str(), ""); |
41 | | /// ``` |
42 | 0 | pub fn as_str(&self) -> &'a str { |
43 | 0 | self.iter.as_str() |
44 | 0 | } |
45 | | } |
46 | | |
47 | | impl<'a> Iterator for GraphemeIndices<'a> { |
48 | | type Item = (usize, &'a str); |
49 | | |
50 | | #[inline] |
51 | 0 | fn next(&mut self) -> Option<(usize, &'a str)> { |
52 | 0 | self.iter |
53 | 0 | .next() |
54 | 0 | .map(|s| (s.as_ptr() as usize - self.start_offset, s)) |
55 | 0 | } |
56 | | |
57 | | #[inline] |
58 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
59 | 0 | self.iter.size_hint() |
60 | 0 | } |
61 | | } |
62 | | |
63 | | impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { |
64 | | #[inline] |
65 | 0 | fn next_back(&mut self) -> Option<(usize, &'a str)> { |
66 | 0 | self.iter |
67 | 0 | .next_back() |
68 | 0 | .map(|s| (s.as_ptr() as usize - self.start_offset, s)) |
69 | 0 | } |
70 | | } |
71 | | |
72 | | /// External iterator for a string's |
73 | | /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries). |
74 | | /// |
75 | | /// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its |
76 | | /// documentation for more. |
77 | | /// |
78 | | /// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes |
79 | | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
80 | | #[derive(Clone, Debug)] |
81 | | pub struct Graphemes<'a> { |
82 | | string: &'a str, |
83 | | cursor: GraphemeCursor, |
84 | | cursor_back: GraphemeCursor, |
85 | | } |
86 | | |
87 | | impl<'a> Graphemes<'a> { |
88 | | #[inline] |
89 | | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
90 | | /// |
91 | | /// ```rust |
92 | | /// # use unicode_segmentation::UnicodeSegmentation; |
93 | | /// let mut iter = "abc".graphemes(true); |
94 | | /// assert_eq!(iter.as_str(), "abc"); |
95 | | /// iter.next(); |
96 | | /// assert_eq!(iter.as_str(), "bc"); |
97 | | /// iter.next(); |
98 | | /// iter.next(); |
99 | | /// assert_eq!(iter.as_str(), ""); |
100 | | /// ``` |
101 | 0 | pub fn as_str(&self) -> &'a str { |
102 | 0 | &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()] |
103 | 0 | } |
104 | | } |
105 | | |
106 | | impl<'a> Iterator for Graphemes<'a> { |
107 | | type Item = &'a str; |
108 | | |
109 | | #[inline] |
110 | 6.30k | fn size_hint(&self) -> (usize, Option<usize>) { |
111 | 6.30k | let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor(); |
112 | 6.30k | (cmp::min(slen, 1), Some(slen)) |
113 | 6.30k | } <unicode_segmentation::grapheme::Graphemes as core::iter::traits::iterator::Iterator>::size_hint Line | Count | Source | 110 | 6.30k | fn size_hint(&self) -> (usize, Option<usize>) { | 111 | 6.30k | let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor(); | 112 | 6.30k | (cmp::min(slen, 1), Some(slen)) | 113 | 6.30k | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::Graphemes as core::iter::traits::iterator::Iterator>::size_hint |
114 | | |
115 | | #[inline] |
116 | 24.0M | fn next(&mut self) -> Option<&'a str> { |
117 | 24.0M | let start = self.cursor.cur_cursor(); |
118 | 24.0M | if start == self.cursor_back.cur_cursor() { |
119 | 2.11k | return None; |
120 | 24.0M | } |
121 | 24.0M | let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap(); |
122 | 24.0M | Some(&self.string[start..next]) |
123 | 24.0M | } <unicode_segmentation::grapheme::Graphemes as core::iter::traits::iterator::Iterator>::next Line | Count | Source | 116 | 24.0M | fn next(&mut self) -> Option<&'a str> { | 117 | 24.0M | let start = self.cursor.cur_cursor(); | 118 | 24.0M | if start == self.cursor_back.cur_cursor() { | 119 | 2.11k | return None; | 120 | 24.0M | } | 121 | 24.0M | let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap(); | 122 | 24.0M | Some(&self.string[start..next]) | 123 | 24.0M | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::Graphemes as core::iter::traits::iterator::Iterator>::next |
124 | | } |
125 | | |
126 | | impl<'a> DoubleEndedIterator for Graphemes<'a> { |
127 | | #[inline] |
128 | 0 | fn next_back(&mut self) -> Option<&'a str> { |
129 | 0 | let end = self.cursor_back.cur_cursor(); |
130 | 0 | if end == self.cursor.cur_cursor() { |
131 | 0 | return None; |
132 | 0 | } |
133 | 0 | let prev = self |
134 | 0 | .cursor_back |
135 | 0 | .prev_boundary(self.string, 0) |
136 | 0 | .unwrap() |
137 | 0 | .unwrap(); |
138 | 0 | Some(&self.string[prev..end]) |
139 | 0 | } |
140 | | } |
141 | | |
142 | | #[inline] |
143 | 2.11k | pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> { |
144 | 2.11k | let len = s.len(); |
145 | 2.11k | Graphemes { |
146 | 2.11k | string: s, |
147 | 2.11k | cursor: GraphemeCursor::new(0, len, is_extended), |
148 | 2.11k | cursor_back: GraphemeCursor::new(len, len, is_extended), |
149 | 2.11k | } |
150 | 2.11k | } unicode_segmentation::grapheme::new_graphemes Line | Count | Source | 143 | 2.11k | pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> { | 144 | 2.11k | let len = s.len(); | 145 | 2.11k | Graphemes { | 146 | 2.11k | string: s, | 147 | 2.11k | cursor: GraphemeCursor::new(0, len, is_extended), | 148 | 2.11k | cursor_back: GraphemeCursor::new(len, len, is_extended), | 149 | 2.11k | } | 150 | 2.11k | } |
Unexecuted instantiation: unicode_segmentation::grapheme::new_graphemes |
151 | | |
152 | | #[inline] |
153 | 0 | pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> { |
154 | 0 | GraphemeIndices { |
155 | 0 | start_offset: s.as_ptr() as usize, |
156 | 0 | iter: new_graphemes(s, is_extended), |
157 | 0 | } |
158 | 0 | } |
159 | | |
160 | | /// maybe unify with PairResult? |
161 | | /// An enum describing information about a potential boundary. |
162 | | #[derive(PartialEq, Eq, Clone, Debug)] |
163 | | enum GraphemeState { |
164 | | /// No information is known. |
165 | | Unknown, |
166 | | /// It is known to not be a boundary. |
167 | | NotBreak, |
168 | | /// It is known to be a boundary. |
169 | | Break, |
170 | | /// The codepoint after it has Indic_Conjunct_Break=Consonant, |
171 | | /// so there is a boundary unless (in extended mode) it is preceded by another |
172 | | /// InCB=Consonant followed by a sequence consisting of one or more InCB=Linker |
173 | | /// and zero or more InCB = Extend (in any order). |
174 | | InCbConsonant, |
175 | | /// The codepoint after is a Regional Indicator Symbol, so a boundary iff |
176 | | /// it is preceded by an even number of RIS codepoints. (GB12, GB13) |
177 | | Regional, |
178 | | /// The codepoint after is Extended_Pictographic, |
179 | | /// so whether it's a boundary depends on pre-context according to GB11. |
180 | | Emoji, |
181 | | } |
182 | | |
183 | | /// Cursor-based segmenter for grapheme clusters. |
184 | | /// |
185 | | /// This allows working with ropes and other datastructures where the string is not contiguous or |
186 | | /// fully known at initialization time. |
187 | | #[derive(Clone, Debug)] |
188 | | pub struct GraphemeCursor { |
189 | | /// Current cursor position. |
190 | | offset: usize, |
191 | | /// Total length of the string. |
192 | | len: usize, |
193 | | /// A config flag indicating whether this cursor computes legacy or extended |
194 | | /// grapheme cluster boundaries (enables GB9a, GB9b and GB9c if set). |
195 | | is_extended: bool, |
196 | | /// Information about the potential boundary at `offset` |
197 | | state: GraphemeState, |
198 | | /// Category of codepoint immediately preceding cursor, if known. |
199 | | cat_before: Option<GraphemeCat>, |
200 | | /// Category of codepoint immediately after cursor, if known. |
201 | | cat_after: Option<GraphemeCat>, |
202 | | /// If set, at least one more codepoint immediately preceding this offset |
203 | | /// is needed to resolve whether there's a boundary at `offset`. |
204 | | pre_context_offset: Option<usize>, |
205 | | /// The number of `InCB=Linker` codepoints preceding `offset` |
206 | | /// (potentially intermingled with `InCB=Extend`). |
207 | | incb_linker_count: Option<usize>, |
208 | | /// The number of RIS codepoints preceding `offset`. If `pre_context_offset` |
209 | | /// is set, then counts the number of RIS between that and `offset`, otherwise |
210 | | /// is an accurate count relative to the string. |
211 | | ris_count: Option<usize>, |
212 | | /// Set if a call to `prev_boundary` or `next_boundary` was suspended due |
213 | | /// to needing more input. |
214 | | resuming: bool, |
215 | | /// Cached grapheme category and associated scalar value range. |
216 | | grapheme_cat_cache: (u32, u32, GraphemeCat), |
217 | | } |
218 | | |
219 | | /// An error return indicating that not enough content was available in the |
220 | | /// provided chunk to satisfy the query, and that more content must be provided. |
221 | | #[derive(PartialEq, Eq, Debug)] |
222 | | pub enum GraphemeIncomplete { |
223 | | /// More pre-context is needed. The caller should call `provide_context` |
224 | | /// with a chunk ending at the offset given, then retry the query. This |
225 | | /// will only be returned if the `chunk_start` parameter is nonzero. |
226 | | PreContext(usize), |
227 | | |
228 | | /// When requesting `prev_boundary`, the cursor is moving past the beginning |
229 | | /// of the current chunk, so the chunk before that is requested. This will |
230 | | /// only be returned if the `chunk_start` parameter is nonzero. |
231 | | PrevChunk, |
232 | | |
233 | | /// When requesting `next_boundary`, the cursor is moving past the end of the |
234 | | /// current chunk, so the chunk after that is requested. This will only be |
235 | | /// returned if the chunk ends before the `len` parameter provided on |
236 | | /// creation of the cursor. |
237 | | NextChunk, // requesting chunk following the one given |
238 | | |
239 | | /// An error returned when the chunk given does not contain the cursor position. |
240 | | InvalidOffset, |
241 | | } |
242 | | |
243 | | // An enum describing the result from lookup of a pair of categories. |
244 | | #[derive(PartialEq, Eq)] |
245 | | enum PairResult { |
246 | | /// definitely not a break |
247 | | NotBreak, |
248 | | /// definitely a break |
249 | | Break, |
250 | | /// a break iff not in extended mode |
251 | | Extended, |
252 | | /// a break unless in extended mode and preceded by |
253 | | /// a sequence of 0 or more InCB=Extend and one or more |
254 | | /// InCB = Linker (in any order), |
255 | | /// preceded by another InCB=Consonant |
256 | | InCbConsonant, |
257 | | /// a break if preceded by an even number of RIS |
258 | | Regional, |
259 | | /// a break unless the ZWJ is preceded by Extended_Pictographic and (Extend)* |
260 | | Emoji, |
261 | | } |
262 | | |
263 | | #[inline] |
264 | 24.1M | fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { |
265 | | use self::PairResult::*; |
266 | | use crate::tables::grapheme::GraphemeCat::*; |
267 | 24.1M | match (before, after) { |
268 | 9.48k | (GC_CR, GC_LF) => NotBreak, // GB3 |
269 | 10.6M | (GC_Control | GC_CR | GC_LF, _) => Break, // GB4 |
270 | 273k | (_, GC_Control | GC_CR | GC_LF) => Break, // GB5 |
271 | 1.11k | (GC_L, GC_L | GC_V | GC_LV | GC_LVT) => NotBreak, // GB6 |
272 | 1.58k | (GC_LV | GC_V, GC_V | GC_T) => NotBreak, // GB7 |
273 | 530 | (GC_LVT | GC_T, GC_T) => NotBreak, // GB8 |
274 | 53.8k | (_, GC_Extend | GC_ZWJ) => NotBreak, // GB9 |
275 | 1.47k | (_, GC_SpacingMark) => Extended, // GB9a |
276 | 2.40k | (GC_Prepend, _) => Extended, // GB9b |
277 | 8.83k | (_, GC_InCB_Consonant) => InCbConsonant, // GB9c |
278 | 10.2k | (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11 |
279 | 1.87k | (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13 |
280 | 13.0M | (_, _) => Break, // GB999 |
281 | | } |
282 | 24.0M | } unicode_segmentation::grapheme::check_pair Line | Count | Source | 264 | 24.1M | fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { | 265 | | use self::PairResult::*; | 266 | | use crate::tables::grapheme::GraphemeCat::*; | 267 | 24.1M | match (before, after) { | 268 | 9.48k | (GC_CR, GC_LF) => NotBreak, // GB3 | 269 | 10.6M | (GC_Control | GC_CR | GC_LF, _) => Break, // GB4 | 270 | 273k | (_, GC_Control | GC_CR | GC_LF) => Break, // GB5 | 271 | 1.11k | (GC_L, GC_L | GC_V | GC_LV | GC_LVT) => NotBreak, // GB6 | 272 | 1.58k | (GC_LV | GC_V, GC_V | GC_T) => NotBreak, // GB7 | 273 | 530 | (GC_LVT | GC_T, GC_T) => NotBreak, // GB8 | 274 | 53.8k | (_, GC_Extend | GC_ZWJ) => NotBreak, // GB9 | 275 | 1.47k | (_, GC_SpacingMark) => Extended, // GB9a | 276 | 2.40k | (GC_Prepend, _) => Extended, // GB9b | 277 | 8.83k | (_, GC_InCB_Consonant) => InCbConsonant, // GB9c | 278 | 10.2k | (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11 | 279 | 1.87k | (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13 | 280 | 13.0M | (_, _) => Break, // GB999 | 281 | | } | 282 | 24.0M | } |
Unexecuted instantiation: unicode_segmentation::grapheme::check_pair |
283 | | |
284 | | impl GraphemeCursor { |
285 | | /// Create a new cursor. The string and initial offset are given at creation |
286 | | /// time, but the contents of the string are not. The `is_extended` parameter |
287 | | /// controls whether extended grapheme clusters are selected. |
288 | | /// |
289 | | /// The `offset` parameter must be on a codepoint boundary. |
290 | | /// |
291 | | /// ```rust |
292 | | /// # use unicode_segmentation::GraphemeCursor; |
293 | | /// let s = "हिन्दी"; |
294 | | /// let mut legacy = GraphemeCursor::new(0, s.len(), false); |
295 | | /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len()))); |
296 | | /// let mut extended = GraphemeCursor::new(0, s.len(), true); |
297 | | /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len()))); |
298 | | /// ``` |
299 | 4.23k | pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor { |
300 | 4.23k | let state = if offset == 0 || offset == len { |
301 | 4.23k | GraphemeState::Break |
302 | | } else { |
303 | 0 | GraphemeState::Unknown |
304 | | }; |
305 | 4.23k | GraphemeCursor { |
306 | 4.23k | offset, |
307 | 4.23k | len, |
308 | 4.23k | state, |
309 | 4.23k | is_extended, |
310 | 4.23k | cat_before: None, |
311 | 4.23k | cat_after: None, |
312 | 4.23k | pre_context_offset: None, |
313 | 4.23k | incb_linker_count: None, |
314 | 4.23k | ris_count: None, |
315 | 4.23k | resuming: false, |
316 | 4.23k | grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control), |
317 | 4.23k | } |
318 | 4.23k | } |
319 | | |
320 | 24.1M | fn grapheme_category(&mut self, ch: char) -> GraphemeCat { |
321 | | use crate::tables::grapheme as gr; |
322 | | use crate::tables::grapheme::GraphemeCat::*; |
323 | | |
324 | 24.1M | if ch <= '\u{7e}' { |
325 | | // Special-case optimization for ascii, except U+007F. This |
326 | | // improves performance even for many primarily non-ascii texts, |
327 | | // due to use of punctuation and white space characters from the |
328 | | // ascii range. |
329 | 23.8M | if ch >= '\u{20}' { |
330 | 13.1M | GC_Any |
331 | 10.7M | } else if ch == '\n' { |
332 | 97.8k | GC_LF |
333 | 10.6M | } else if ch == '\r' { |
334 | 1.60M | GC_CR |
335 | | } else { |
336 | 9.02M | GC_Control |
337 | | } |
338 | | } else { |
339 | | // If this char isn't within the cached range, update the cache to the |
340 | | // range that includes it. |
341 | 332k | if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 { |
342 | 130k | self.grapheme_cat_cache = gr::grapheme_category(ch); |
343 | 201k | } |
344 | 332k | self.grapheme_cat_cache.2 |
345 | | } |
346 | 24.1M | } |
347 | | |
348 | | // Not sure I'm gonna keep this, the advantage over new() seems thin. |
349 | | |
350 | | /// Set the cursor to a new location in the same string. |
351 | | /// |
352 | | /// ```rust |
353 | | /// # use unicode_segmentation::GraphemeCursor; |
354 | | /// let s = "abcd"; |
355 | | /// let mut cursor = GraphemeCursor::new(0, s.len(), false); |
356 | | /// assert_eq!(cursor.cur_cursor(), 0); |
357 | | /// cursor.set_cursor(2); |
358 | | /// assert_eq!(cursor.cur_cursor(), 2); |
359 | | /// ``` |
360 | 0 | pub fn set_cursor(&mut self, offset: usize) { |
361 | 0 | if offset != self.offset { |
362 | 0 | self.offset = offset; |
363 | 0 | self.state = if offset == 0 || offset == self.len { |
364 | 0 | GraphemeState::Break |
365 | | } else { |
366 | 0 | GraphemeState::Unknown |
367 | | }; |
368 | | // reset state derived from text around cursor |
369 | 0 | self.cat_before = None; |
370 | 0 | self.cat_after = None; |
371 | 0 | self.incb_linker_count = None; |
372 | 0 | self.ris_count = None; |
373 | 0 | } |
374 | 0 | } |
375 | | |
376 | | #[inline] |
377 | | /// The current offset of the cursor. Equal to the last value provided to |
378 | | /// `new()` or `set_cursor()`, or returned from `next_boundary()` or |
379 | | /// `prev_boundary()`. |
380 | | /// |
381 | | /// ```rust |
382 | | /// # use unicode_segmentation::GraphemeCursor; |
383 | | /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes. |
384 | | /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; |
385 | | /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); |
386 | | /// assert_eq!(cursor.cur_cursor(), 4); |
387 | | /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); |
388 | | /// assert_eq!(cursor.cur_cursor(), 8); |
389 | | /// ``` |
390 | 48.1M | pub fn cur_cursor(&self) -> usize { |
391 | 48.1M | self.offset |
392 | 48.1M | } <unicode_segmentation::grapheme::GraphemeCursor>::cur_cursor Line | Count | Source | 390 | 48.1M | pub fn cur_cursor(&self) -> usize { | 391 | 48.1M | self.offset | 392 | 48.1M | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::cur_cursor |
393 | | |
394 | | /// Provide additional pre-context when it is needed to decide a boundary. |
395 | | /// The end of the chunk must coincide with the value given in the |
396 | | /// `GraphemeIncomplete::PreContext` request. |
397 | | /// |
398 | | /// ```rust |
399 | | /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; |
400 | | /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; |
401 | | /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); |
402 | | /// // Not enough pre-context to decide if there's a boundary between the two flags. |
403 | | /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8))); |
404 | | /// // Provide one more Regional Indicator Symbol of pre-context |
405 | | /// cursor.provide_context(&flags[4..8], 4); |
406 | | /// // Still not enough context to decide. |
407 | | /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4))); |
408 | | /// // Provide additional requested context. |
409 | | /// cursor.provide_context(&flags[0..4], 0); |
410 | | /// // That's enough to decide (it always is when context goes to the start of the string) |
411 | | /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true)); |
412 | | /// ``` |
413 | 0 | pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { |
414 | | use crate::tables::grapheme as gr; |
415 | 0 | assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap()); |
416 | 0 | self.pre_context_offset = None; |
417 | 0 | if self.is_extended && chunk_start + chunk.len() == self.offset { |
418 | 0 | let ch = chunk.chars().next_back().unwrap(); |
419 | 0 | if self.grapheme_category(ch) == gr::GC_Prepend { |
420 | 0 | self.decide(false); // GB9b |
421 | 0 | return; |
422 | 0 | } |
423 | 0 | } |
424 | 0 | match self.state { |
425 | 0 | GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start), |
426 | 0 | GraphemeState::Regional => self.handle_regional(chunk, chunk_start), |
427 | 0 | GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start), |
428 | | _ => { |
429 | 0 | if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start { |
430 | 0 | let ch = chunk.chars().next_back().unwrap(); |
431 | 0 | self.cat_before = Some(self.grapheme_category(ch)); |
432 | 0 | } |
433 | | } |
434 | | } |
435 | 0 | } |
436 | | |
437 | | #[inline] |
438 | 24.1M | fn decide(&mut self, is_break: bool) { |
439 | 24.1M | self.state = if is_break { |
440 | 24.0M | GraphemeState::Break |
441 | | } else { |
442 | 75.4k | GraphemeState::NotBreak |
443 | | }; |
444 | 24.1M | } <unicode_segmentation::grapheme::GraphemeCursor>::decide Line | Count | Source | 438 | 24.1M | fn decide(&mut self, is_break: bool) { | 439 | 24.1M | self.state = if is_break { | 440 | 24.0M | GraphemeState::Break | 441 | | } else { | 442 | 75.4k | GraphemeState::NotBreak | 443 | | }; | 444 | 24.1M | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::decide |
445 | | |
446 | | #[inline] |
447 | 24.1M | fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> { |
448 | 24.1M | self.decide(is_break); |
449 | 24.1M | Ok(is_break) |
450 | 24.1M | } <unicode_segmentation::grapheme::GraphemeCursor>::decision Line | Count | Source | 447 | 24.1M | fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> { | 448 | 24.1M | self.decide(is_break); | 449 | 24.1M | Ok(is_break) | 450 | 24.1M | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::decision |
451 | | |
452 | | #[inline] |
453 | 19.1k | fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> { |
454 | 19.1k | if self.state == GraphemeState::Break { |
455 | 15.4k | Ok(true) |
456 | 3.69k | } else if self.state == GraphemeState::NotBreak { |
457 | 3.69k | Ok(false) |
458 | 0 | } else if let Some(pre_context_offset) = self.pre_context_offset { |
459 | 0 | Err(GraphemeIncomplete::PreContext(pre_context_offset)) |
460 | | } else { |
461 | 0 | unreachable!("inconsistent state"); |
462 | | } |
463 | 19.1k | } <unicode_segmentation::grapheme::GraphemeCursor>::is_boundary_result Line | Count | Source | 453 | 19.1k | fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> { | 454 | 19.1k | if self.state == GraphemeState::Break { | 455 | 15.4k | Ok(true) | 456 | 3.69k | } else if self.state == GraphemeState::NotBreak { | 457 | 3.69k | Ok(false) | 458 | 0 | } else if let Some(pre_context_offset) = self.pre_context_offset { | 459 | 0 | Err(GraphemeIncomplete::PreContext(pre_context_offset)) | 460 | | } else { | 461 | 0 | unreachable!("inconsistent state"); | 462 | | } | 463 | 19.1k | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::is_boundary_result |
464 | | |
465 | | /// For handling rule GB9c: |
466 | | /// |
467 | | /// There's an `InCB=Consonant` after this, and we need to look back |
468 | | /// to verify whether there should be a break. |
469 | | /// |
470 | | /// Seek backward to find an `InCB=Linker` preceded by an `InCB=Consonant` |
471 | | /// (potentially separated by some number of `InCB=Linker` or `InCB=Extend`). |
472 | | /// If we find the consonant in question, then there's no break; if we find a consonant |
473 | | /// with no linker, or a non-linker non-extend non-consonant, or the start of text, there's a break; |
474 | | /// otherwise we need more context |
475 | | #[inline] |
476 | 8.83k | fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) { |
477 | | use crate::tables::{self, grapheme as gr}; |
478 | | |
479 | | // GB9c only applies to extended grapheme clusters |
480 | 8.83k | if !self.is_extended { |
481 | 0 | self.decide(true); |
482 | 0 | return; |
483 | 8.83k | } |
484 | 8.83k | |
485 | 8.83k | let mut incb_linker_count = self.incb_linker_count.unwrap_or(0); |
486 | | |
487 | 16.1k | for ch in chunk.chars().rev() { |
488 | 16.1k | if tables::is_incb_linker(ch) { |
489 | 3.75k | // We found an InCB linker |
490 | 3.75k | incb_linker_count += 1; |
491 | 3.75k | self.incb_linker_count = Some(incb_linker_count); |
492 | 12.3k | } else if tables::derived_property::InCB_Extend(ch) { |
493 | 3.60k | // We ignore InCB extends, continue |
494 | 3.60k | } else { |
495 | | // Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant |
496 | 8.76k | let result = !(self.incb_linker_count.unwrap_or(0) > 0 |
497 | 2.09k | && self.grapheme_category(ch) == gr::GC_InCB_Consonant); |
498 | 8.76k | self.decide(result); |
499 | 8.76k | return; |
500 | | } |
501 | | } |
502 | | |
503 | 63 | if chunk_start == 0 { |
504 | 63 | // Start of text and we still haven't found a consonant, so break |
505 | 63 | self.decide(true); |
506 | 63 | } else { |
507 | 0 | // We need more context |
508 | 0 | self.pre_context_offset = Some(chunk_start); |
509 | 0 | self.state = GraphemeState::InCbConsonant; |
510 | 0 | } |
511 | 8.83k | } <unicode_segmentation::grapheme::GraphemeCursor>::handle_incb_consonant Line | Count | Source | 476 | 8.83k | fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) { | 477 | | use crate::tables::{self, grapheme as gr}; | 478 | | | 479 | | // GB9c only applies to extended grapheme clusters | 480 | 8.83k | if !self.is_extended { | 481 | 0 | self.decide(true); | 482 | 0 | return; | 483 | 8.83k | } | 484 | 8.83k | | 485 | 8.83k | let mut incb_linker_count = self.incb_linker_count.unwrap_or(0); | 486 | | | 487 | 16.1k | for ch in chunk.chars().rev() { | 488 | 16.1k | if tables::is_incb_linker(ch) { | 489 | 3.75k | // We found an InCB linker | 490 | 3.75k | incb_linker_count += 1; | 491 | 3.75k | self.incb_linker_count = Some(incb_linker_count); | 492 | 12.3k | } else if tables::derived_property::InCB_Extend(ch) { | 493 | 3.60k | // We ignore InCB extends, continue | 494 | 3.60k | } else { | 495 | | // Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant | 496 | 8.76k | let result = !(self.incb_linker_count.unwrap_or(0) > 0 | 497 | 2.09k | && self.grapheme_category(ch) == gr::GC_InCB_Consonant); | 498 | 8.76k | self.decide(result); | 499 | 8.76k | return; | 500 | | } | 501 | | } | 502 | | | 503 | 63 | if chunk_start == 0 { | 504 | 63 | // Start of text and we still haven't found a consonant, so break | 505 | 63 | self.decide(true); | 506 | 63 | } else { | 507 | 0 | // We need more context | 508 | 0 | self.pre_context_offset = Some(chunk_start); | 509 | 0 | self.state = GraphemeState::InCbConsonant; | 510 | 0 | } | 511 | 8.83k | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::handle_incb_consonant |
512 | | |
513 | | #[inline] |
514 | 28 | fn handle_regional(&mut self, chunk: &str, chunk_start: usize) { |
515 | | use crate::tables::grapheme as gr; |
516 | 28 | let mut ris_count = self.ris_count.unwrap_or(0); |
517 | 28 | for ch in chunk.chars().rev() { |
518 | 28 | if self.grapheme_category(ch) != gr::GC_Regional_Indicator { |
519 | 0 | self.ris_count = Some(ris_count); |
520 | 0 | self.decide((ris_count % 2) == 0); |
521 | 0 | return; |
522 | 28 | } |
523 | 28 | ris_count += 1; |
524 | | } |
525 | 28 | self.ris_count = Some(ris_count); |
526 | 28 | if chunk_start == 0 { |
527 | 28 | self.decide((ris_count % 2) == 0); |
528 | 28 | } else { |
529 | 0 | self.pre_context_offset = Some(chunk_start); |
530 | 0 | self.state = GraphemeState::Regional; |
531 | 0 | } |
532 | 28 | } <unicode_segmentation::grapheme::GraphemeCursor>::handle_regional Line | Count | Source | 514 | 28 | fn handle_regional(&mut self, chunk: &str, chunk_start: usize) { | 515 | | use crate::tables::grapheme as gr; | 516 | 28 | let mut ris_count = self.ris_count.unwrap_or(0); | 517 | 28 | for ch in chunk.chars().rev() { | 518 | 28 | if self.grapheme_category(ch) != gr::GC_Regional_Indicator { | 519 | 0 | self.ris_count = Some(ris_count); | 520 | 0 | self.decide((ris_count % 2) == 0); | 521 | 0 | return; | 522 | 28 | } | 523 | 28 | ris_count += 1; | 524 | | } | 525 | 28 | self.ris_count = Some(ris_count); | 526 | 28 | if chunk_start == 0 { | 527 | 28 | self.decide((ris_count % 2) == 0); | 528 | 28 | } else { | 529 | 0 | self.pre_context_offset = Some(chunk_start); | 530 | 0 | self.state = GraphemeState::Regional; | 531 | 0 | } | 532 | 28 | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::handle_regional |
533 | | |
534 | | #[inline] |
535 | 10.2k | fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { |
536 | | use crate::tables::grapheme as gr; |
537 | 10.2k | let mut iter = chunk.chars().rev(); |
538 | 10.2k | if let Some(ch) = iter.next() { |
539 | 10.2k | if self.grapheme_category(ch) != gr::GC_ZWJ { |
540 | 0 | self.decide(true); |
541 | 0 | return; |
542 | 10.2k | } |
543 | 0 | } |
544 | 11.5k | for ch in iter { |
545 | 11.5k | match self.grapheme_category(ch) { |
546 | 1.33k | gr::GC_Extend => (), |
547 | | gr::GC_Extended_Pictographic => { |
548 | 3.04k | self.decide(false); |
549 | 3.04k | return; |
550 | | } |
551 | | _ => { |
552 | 7.15k | self.decide(true); |
553 | 7.15k | return; |
554 | | } |
555 | | } |
556 | | } |
557 | 59 | if chunk_start == 0 { |
558 | 59 | self.decide(true); |
559 | 59 | } else { |
560 | 0 | self.pre_context_offset = Some(chunk_start); |
561 | 0 | self.state = GraphemeState::Emoji; |
562 | 0 | } |
563 | 10.2k | } <unicode_segmentation::grapheme::GraphemeCursor>::handle_emoji Line | Count | Source | 535 | 10.2k | fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { | 536 | | use crate::tables::grapheme as gr; | 537 | 10.2k | let mut iter = chunk.chars().rev(); | 538 | 10.2k | if let Some(ch) = iter.next() { | 539 | 10.2k | if self.grapheme_category(ch) != gr::GC_ZWJ { | 540 | 0 | self.decide(true); | 541 | 0 | return; | 542 | 10.2k | } | 543 | 0 | } | 544 | 11.5k | for ch in iter { | 545 | 11.5k | match self.grapheme_category(ch) { | 546 | 1.33k | gr::GC_Extend => (), | 547 | | gr::GC_Extended_Pictographic => { | 548 | 3.04k | self.decide(false); | 549 | 3.04k | return; | 550 | | } | 551 | | _ => { | 552 | 7.15k | self.decide(true); | 553 | 7.15k | return; | 554 | | } | 555 | | } | 556 | | } | 557 | 59 | if chunk_start == 0 { | 558 | 59 | self.decide(true); | 559 | 59 | } else { | 560 | 0 | self.pre_context_offset = Some(chunk_start); | 561 | 0 | self.state = GraphemeState::Emoji; | 562 | 0 | } | 563 | 10.2k | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::handle_emoji |
564 | | |
565 | | #[inline] |
566 | | /// Determine whether the current cursor location is a grapheme cluster boundary. |
567 | | /// Only a part of the string need be supplied. If `chunk_start` is nonzero or |
568 | | /// the length of `chunk` is not equal to `len` on creation, then this method |
569 | | /// may return `GraphemeIncomplete::PreContext`. The caller should then |
570 | | /// call `provide_context` with the requested chunk, then retry calling this |
571 | | /// method. |
572 | | /// |
573 | | /// For partial chunks, if the cursor is not at the beginning or end of the |
574 | | /// string, the chunk should contain at least the codepoint following the cursor. |
575 | | /// If the string is nonempty, the chunk must be nonempty. |
576 | | /// |
577 | | /// All calls should have consistent chunk contents (ie, if a chunk provides |
578 | | /// content for a given slice, all further chunks covering that slice must have |
579 | | /// the same content for it). |
580 | | /// |
581 | | /// ```rust |
582 | | /// # use unicode_segmentation::GraphemeCursor; |
583 | | /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; |
584 | | /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); |
585 | | /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true)); |
586 | | /// cursor.set_cursor(12); |
587 | | /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false)); |
588 | | /// ``` |
589 | 24.1M | pub fn is_boundary( |
590 | 24.1M | &mut self, |
591 | 24.1M | chunk: &str, |
592 | 24.1M | chunk_start: usize, |
593 | 24.1M | ) -> Result<bool, GraphemeIncomplete> { |
594 | | use crate::tables::grapheme as gr; |
595 | 24.1M | if self.state == GraphemeState::Break { |
596 | 2.11k | return Ok(true); |
597 | 24.1M | } |
598 | 24.1M | if self.state == GraphemeState::NotBreak { |
599 | 0 | return Ok(false); |
600 | 24.1M | } |
601 | 24.1M | if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len())) |
602 | 0 | && (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none()) |
603 | | { |
604 | 0 | return Err(GraphemeIncomplete::InvalidOffset); |
605 | 24.1M | } |
606 | 24.1M | if let Some(pre_context_offset) = self.pre_context_offset { |
607 | 0 | return Err(GraphemeIncomplete::PreContext(pre_context_offset)); |
608 | 24.1M | } |
609 | 24.1M | let offset_in_chunk = self.offset.saturating_sub(chunk_start); |
610 | 24.1M | if self.cat_after.is_none() { |
611 | 0 | let ch = chunk[offset_in_chunk..].chars().next().unwrap(); |
612 | 0 | self.cat_after = Some(self.grapheme_category(ch)); |
613 | 24.1M | } |
614 | 24.1M | if self.offset == chunk_start { |
615 | 0 | let mut need_pre_context = true; |
616 | 0 | match self.cat_after.unwrap() { |
617 | 0 | gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant, |
618 | 0 | gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, |
619 | 0 | gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji, |
620 | 0 | _ => need_pre_context = self.cat_before.is_none(), |
621 | | } |
622 | 0 | if need_pre_context { |
623 | 0 | self.pre_context_offset = Some(chunk_start); |
624 | 0 | return Err(GraphemeIncomplete::PreContext(chunk_start)); |
625 | 0 | } |
626 | 24.1M | } |
627 | 24.1M | if self.cat_before.is_none() { |
628 | 0 | let ch = chunk[..offset_in_chunk].chars().next_back().unwrap(); |
629 | 0 | self.cat_before = Some(self.grapheme_category(ch)); |
630 | 24.1M | } |
631 | 24.1M | match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) { |
632 | 66.5k | PairResult::NotBreak => self.decision(false), |
633 | 24.0M | PairResult::Break => self.decision(true), |
634 | | PairResult::Extended => { |
635 | 3.87k | let is_extended = self.is_extended; |
636 | 3.87k | self.decision(!is_extended) |
637 | | } |
638 | | PairResult::InCbConsonant => { |
639 | 8.83k | self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start); |
640 | 8.83k | self.is_boundary_result() |
641 | | } |
642 | | PairResult::Regional => { |
643 | 1.87k | if let Some(ris_count) = self.ris_count { |
644 | 1.85k | return self.decision((ris_count % 2) == 0); |
645 | 28 | } |
646 | 28 | self.handle_regional(&chunk[..offset_in_chunk], chunk_start); |
647 | 28 | self.is_boundary_result() |
648 | | } |
649 | | PairResult::Emoji => { |
650 | 10.2k | self.handle_emoji(&chunk[..offset_in_chunk], chunk_start); |
651 | 10.2k | self.is_boundary_result() |
652 | | } |
653 | | } |
654 | 24.1M | } <unicode_segmentation::grapheme::GraphemeCursor>::is_boundary Line | Count | Source | 589 | 24.1M | pub fn is_boundary( | 590 | 24.1M | &mut self, | 591 | 24.1M | chunk: &str, | 592 | 24.1M | chunk_start: usize, | 593 | 24.1M | ) -> Result<bool, GraphemeIncomplete> { | 594 | | use crate::tables::grapheme as gr; | 595 | 24.1M | if self.state == GraphemeState::Break { | 596 | 2.11k | return Ok(true); | 597 | 24.1M | } | 598 | 24.1M | if self.state == GraphemeState::NotBreak { | 599 | 0 | return Ok(false); | 600 | 24.1M | } | 601 | 24.1M | if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len())) | 602 | 0 | && (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none()) | 603 | | { | 604 | 0 | return Err(GraphemeIncomplete::InvalidOffset); | 605 | 24.1M | } | 606 | 24.1M | if let Some(pre_context_offset) = self.pre_context_offset { | 607 | 0 | return Err(GraphemeIncomplete::PreContext(pre_context_offset)); | 608 | 24.1M | } | 609 | 24.1M | let offset_in_chunk = self.offset.saturating_sub(chunk_start); | 610 | 24.1M | if self.cat_after.is_none() { | 611 | 0 | let ch = chunk[offset_in_chunk..].chars().next().unwrap(); | 612 | 0 | self.cat_after = Some(self.grapheme_category(ch)); | 613 | 24.1M | } | 614 | 24.1M | if self.offset == chunk_start { | 615 | 0 | let mut need_pre_context = true; | 616 | 0 | match self.cat_after.unwrap() { | 617 | 0 | gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant, | 618 | 0 | gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, | 619 | 0 | gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji, | 620 | 0 | _ => need_pre_context = self.cat_before.is_none(), | 621 | | } | 622 | 0 | if need_pre_context { | 623 | 0 | self.pre_context_offset = Some(chunk_start); | 624 | 0 | return Err(GraphemeIncomplete::PreContext(chunk_start)); | 625 | 0 | } | 626 | 24.1M | } | 627 | 24.1M | if self.cat_before.is_none() { | 628 | 0 | let ch = 
chunk[..offset_in_chunk].chars().next_back().unwrap(); | 629 | 0 | self.cat_before = Some(self.grapheme_category(ch)); | 630 | 24.1M | } | 631 | 24.1M | match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) { | 632 | 66.5k | PairResult::NotBreak => self.decision(false), | 633 | 24.0M | PairResult::Break => self.decision(true), | 634 | | PairResult::Extended => { | 635 | 3.87k | let is_extended = self.is_extended; | 636 | 3.87k | self.decision(!is_extended) | 637 | | } | 638 | | PairResult::InCbConsonant => { | 639 | 8.83k | self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start); | 640 | 8.83k | self.is_boundary_result() | 641 | | } | 642 | | PairResult::Regional => { | 643 | 1.87k | if let Some(ris_count) = self.ris_count { | 644 | 1.85k | return self.decision((ris_count % 2) == 0); | 645 | 28 | } | 646 | 28 | self.handle_regional(&chunk[..offset_in_chunk], chunk_start); | 647 | 28 | self.is_boundary_result() | 648 | | } | 649 | | PairResult::Emoji => { | 650 | 10.2k | self.handle_emoji(&chunk[..offset_in_chunk], chunk_start); | 651 | 10.2k | self.is_boundary_result() | 652 | | } | 653 | | } | 654 | 24.1M | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::is_boundary |
655 | | |
656 | | #[inline] |
657 | | /// Find the next boundary after the current cursor position. Only a part of |
658 | | /// the string need be supplied. If the chunk is incomplete, then this |
659 | | /// method might return `GraphemeIncomplete::PreContext` or |
660 | | /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should |
661 | | /// call `provide_context` with the requested chunk, then retry. In the |
662 | | /// latter case, the caller should provide the chunk following the one |
663 | | /// given, then retry. |
664 | | /// |
665 | | /// See `is_boundary` for expectations on the provided chunk. |
666 | | /// |
667 | | /// ```rust |
668 | | /// # use unicode_segmentation::GraphemeCursor; |
669 | | /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; |
670 | | /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); |
671 | | /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); |
672 | | /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16))); |
673 | | /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None)); |
674 | | /// ``` |
675 | | /// |
676 | | /// And an example that uses partial strings: |
677 | | /// |
678 | | /// ```rust |
679 | | /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; |
680 | | /// let s = "abcd"; |
681 | | /// let mut cursor = GraphemeCursor::new(0, s.len(), false); |
682 | | /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1))); |
683 | | /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk)); |
684 | | /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2))); |
685 | | /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3))); |
686 | | /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4))); |
687 | | /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None)); |
688 | | /// ``` |
689 | 24.0M | pub fn next_boundary( |
690 | 24.0M | &mut self, |
691 | 24.0M | chunk: &str, |
692 | 24.0M | chunk_start: usize, |
693 | 24.0M | ) -> Result<Option<usize>, GraphemeIncomplete> { |
694 | 24.0M | if self.offset == self.len { |
695 | 0 | return Ok(None); |
696 | 24.0M | } |
697 | 24.0M | let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars(); |
698 | 24.0M | let mut ch = match iter.next() { |
699 | 24.0M | Some(ch) => ch, |
700 | 0 | None => return Err(GraphemeIncomplete::NextChunk), |
701 | | }; |
702 | | loop { |
703 | 24.1M | if self.resuming { |
704 | 0 | if self.cat_after.is_none() { |
705 | 0 | self.cat_after = Some(self.grapheme_category(ch)); |
706 | 0 | } |
707 | | } else { |
708 | 24.1M | self.offset = self.offset.saturating_add(ch.len_utf8()); |
709 | 24.1M | self.state = GraphemeState::Unknown; |
710 | 24.1M | self.cat_before = self.cat_after.take(); |
711 | 24.1M | if self.cat_before.is_none() { |
712 | 2.11k | self.cat_before = Some(self.grapheme_category(ch)); |
713 | 24.1M | } |
714 | 24.1M | if crate::tables::is_incb_linker(ch) { |
715 | 7.38k | self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1)); <unicode_segmentation::grapheme::GraphemeCursor>::next_boundary::{closure#0} Line | Count | Source | 715 | 7.28k | self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1)); |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::next_boundary::{closure#0} |
716 | 24.1M | } else if !crate::tables::derived_property::InCB_Extend(ch) { |
717 | 24.0M | self.incb_linker_count = Some(0); |
718 | 24.0M | } |
719 | 24.1M | if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator { |
720 | 4.01k | self.ris_count = self.ris_count.map(|c| c + 1); <unicode_segmentation::grapheme::GraphemeCursor>::next_boundary::{closure#1} Line | Count | Source | 720 | 3.95k | self.ris_count = self.ris_count.map(|c| c + 1); |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::next_boundary::{closure#1} |
721 | 24.1M | } else { |
722 | 24.1M | self.ris_count = Some(0); |
723 | 24.1M | } |
724 | 24.1M | if let Some(next_ch) = iter.next() { |
725 | 24.1M | ch = next_ch; |
726 | 24.1M | self.cat_after = Some(self.grapheme_category(ch)); |
727 | 24.1M | } else if self.offset == self.len { |
728 | 2.11k | self.decide(true); |
729 | 2.11k | } else { |
730 | 0 | self.resuming = true; |
731 | 0 | return Err(GraphemeIncomplete::NextChunk); |
732 | | } |
733 | | } |
734 | 24.1M | self.resuming = true; |
735 | 24.1M | if self.is_boundary(chunk, chunk_start)? { |
736 | 24.0M | self.resuming = false; |
737 | 24.0M | return Ok(Some(self.offset)); |
738 | 75.4k | } |
739 | 75.4k | self.resuming = false; |
740 | | } |
741 | 24.0M | } <unicode_segmentation::grapheme::GraphemeCursor>::next_boundary Line | Count | Source | 689 | 24.0M | pub fn next_boundary( | 690 | 24.0M | &mut self, | 691 | 24.0M | chunk: &str, | 692 | 24.0M | chunk_start: usize, | 693 | 24.0M | ) -> Result<Option<usize>, GraphemeIncomplete> { | 694 | 24.0M | if self.offset == self.len { | 695 | 0 | return Ok(None); | 696 | 24.0M | } | 697 | 24.0M | let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars(); | 698 | 24.0M | let mut ch = match iter.next() { | 699 | 24.0M | Some(ch) => ch, | 700 | 0 | None => return Err(GraphemeIncomplete::NextChunk), | 701 | | }; | 702 | | loop { | 703 | 24.1M | if self.resuming { | 704 | 0 | if self.cat_after.is_none() { | 705 | 0 | self.cat_after = Some(self.grapheme_category(ch)); | 706 | 0 | } | 707 | | } else { | 708 | 24.1M | self.offset = self.offset.saturating_add(ch.len_utf8()); | 709 | 24.1M | self.state = GraphemeState::Unknown; | 710 | 24.1M | self.cat_before = self.cat_after.take(); | 711 | 24.1M | if self.cat_before.is_none() { | 712 | 2.11k | self.cat_before = Some(self.grapheme_category(ch)); | 713 | 24.1M | } | 714 | 24.1M | if crate::tables::is_incb_linker(ch) { | 715 | 7.38k | self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1)); | 716 | 24.1M | } else if !crate::tables::derived_property::InCB_Extend(ch) { | 717 | 24.0M | self.incb_linker_count = Some(0); | 718 | 24.0M | } | 719 | 24.1M | if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator { | 720 | 4.01k | self.ris_count = self.ris_count.map(|c| c + 1); | 721 | 24.1M | } else { | 722 | 24.1M | self.ris_count = Some(0); | 723 | 24.1M | } | 724 | 24.1M | if let Some(next_ch) = iter.next() { | 725 | 24.1M | ch = next_ch; | 726 | 24.1M | self.cat_after = Some(self.grapheme_category(ch)); | 727 | 24.1M | } else if self.offset == self.len { | 728 | 2.11k | self.decide(true); | 729 | 2.11k | } else { | 730 | 0 | self.resuming = true; | 731 | 0 | return 
Err(GraphemeIncomplete::NextChunk); | 732 | | } | 733 | | } | 734 | 24.1M | self.resuming = true; | 735 | 24.1M | if self.is_boundary(chunk, chunk_start)? { | 736 | 24.0M | self.resuming = false; | 737 | 24.0M | return Ok(Some(self.offset)); | 738 | 75.4k | } | 739 | 75.4k | self.resuming = false; | 740 | | } | 741 | 24.0M | } |
Unexecuted instantiation: <unicode_segmentation::grapheme::GraphemeCursor>::next_boundary |
742 | | |
743 | | /// Find the previous boundary after the current cursor position. Only a part |
744 | | /// of the string need be supplied. If the chunk is incomplete, then this |
745 | | /// method might return `GraphemeIncomplete::PreContext` or |
746 | | /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should |
747 | | /// call `provide_context` with the requested chunk, then retry. In the |
748 | | /// latter case, the caller should provide the chunk preceding the one |
749 | | /// given, then retry. |
750 | | /// |
751 | | /// See `is_boundary` for expectations on the provided chunk. |
752 | | /// |
753 | | /// ```rust |
754 | | /// # use unicode_segmentation::GraphemeCursor; |
755 | | /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; |
756 | | /// let mut cursor = GraphemeCursor::new(12, flags.len(), false); |
757 | | /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8))); |
758 | | /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0))); |
759 | | /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None)); |
760 | | /// ``` |
761 | | /// |
762 | | /// And an example that uses partial strings (note the exact return is not |
763 | | /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily): |
764 | | /// |
765 | | /// ```rust |
766 | | /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; |
767 | | /// let s = "abcd"; |
768 | | /// let mut cursor = GraphemeCursor::new(4, s.len(), false); |
769 | | /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3))); |
770 | | /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk)); |
771 | | /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2))); |
772 | | /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1))); |
773 | | /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0))); |
774 | | /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None)); |
775 | | /// ``` |
776 | 0 | pub fn prev_boundary( |
777 | 0 | &mut self, |
778 | 0 | chunk: &str, |
779 | 0 | chunk_start: usize, |
780 | 0 | ) -> Result<Option<usize>, GraphemeIncomplete> { |
781 | 0 | if self.offset == 0 { |
782 | 0 | return Ok(None); |
783 | 0 | } |
784 | 0 | if self.offset == chunk_start { |
785 | 0 | return Err(GraphemeIncomplete::PrevChunk); |
786 | 0 | } |
787 | 0 | let mut iter = chunk[..self.offset.saturating_sub(chunk_start)] |
788 | 0 | .chars() |
789 | 0 | .rev(); |
790 | 0 | let mut ch = iter.next().unwrap(); |
791 | | loop { |
792 | 0 | if self.offset == chunk_start { |
793 | 0 | self.resuming = true; |
794 | 0 | return Err(GraphemeIncomplete::PrevChunk); |
795 | 0 | } |
796 | 0 | if self.resuming { |
797 | 0 | self.cat_before = Some(self.grapheme_category(ch)); |
798 | 0 | } else { |
799 | 0 | self.offset -= ch.len_utf8(); |
800 | 0 | self.cat_after = self.cat_before.take(); |
801 | 0 | self.state = GraphemeState::Unknown; |
802 | 0 | if let Some(incb_linker_count) = self.incb_linker_count { |
803 | 0 | self.ris_count = if incb_linker_count > 0 && crate::tables::is_incb_linker(ch) { |
804 | 0 | Some(incb_linker_count - 1) |
805 | 0 | } else if crate::tables::derived_property::InCB_Extend(ch) { |
806 | 0 | Some(incb_linker_count) |
807 | | } else { |
808 | 0 | None |
809 | | }; |
810 | 0 | } |
811 | 0 | if let Some(ris_count) = self.ris_count { |
812 | 0 | self.ris_count = if ris_count > 0 { |
813 | 0 | Some(ris_count - 1) |
814 | | } else { |
815 | 0 | None |
816 | | }; |
817 | 0 | } |
818 | 0 | if let Some(prev_ch) = iter.next() { |
819 | 0 | ch = prev_ch; |
820 | 0 | self.cat_before = Some(self.grapheme_category(ch)); |
821 | 0 | } else if self.offset == 0 { |
822 | 0 | self.decide(true); |
823 | 0 | } else { |
824 | 0 | self.resuming = true; |
825 | 0 | self.cat_after = Some(self.grapheme_category(ch)); |
826 | 0 | return Err(GraphemeIncomplete::PrevChunk); |
827 | | } |
828 | | } |
829 | 0 | self.resuming = true; |
830 | 0 | if self.is_boundary(chunk, chunk_start)? { |
831 | 0 | self.resuming = false; |
832 | 0 | return Ok(Some(self.offset)); |
833 | 0 | } |
834 | 0 | self.resuming = false; |
835 | | } |
836 | 0 | } |
837 | | } |
838 | | |
/// A boundary query between regional indicators at the start of a chunk must
/// first ask for the preceding text, then answer once it has been provided.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    // Three "US" flags: six regional indicators of four bytes each.
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let (head, tail) = flags.split_at(4);
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);

    let first_try = cursor.is_boundary(tail, 4);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(head, 0);
    let second_try = cursor.is_boundary(tail, 4);
    assert_eq!(second_try, Ok(true));
}
850 | | |
/// With the cursor at the very start of the supplied chunk, the preceding
/// text has to be requested before CR/LF can be reported as unbroken.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let (cr, lf) = crlf.split_at(1);
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);

    let first_try = cursor.is_boundary(lf, 1);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(cr, 0);
    let second_try = cursor.is_boundary(lf, 1);
    assert_eq!(second_try, Ok(false));
}
862 | | |
/// Stepping backwards from inside the second chunk asks for the previous
/// chunk and then reports the boundary at the seam between the two.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let (front, back) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let from_back = cursor.prev_boundary(back, 2);
    assert_eq!(from_back, Err(GraphemeIncomplete::PrevChunk));

    let from_front = cursor.prev_boundary(front, 0);
    assert_eq!(from_front, Ok(Some(2)));
}
873 | | |
/// Stepping backwards from a cursor that sits exactly on the chunk start asks
/// for the previous chunk and then finds the boundary inside it.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let (front, back) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(2, text.len(), true);

    let from_back = cursor.prev_boundary(back, 2);
    assert_eq!(from_back, Err(GraphemeIncomplete::PrevChunk));

    let from_front = cursor.prev_boundary(front, 0);
    assert_eq!(from_front, Ok(Some(1)));
}