/rust/registry/src/index.crates.io-6f17d22bba15001f/bstr-1.10.0/src/unicode/grapheme.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use regex_automata::{dfa::Automaton, Anchored, Input}; |
2 | | |
3 | | use crate::{ |
4 | | ext_slice::ByteSlice, |
5 | | unicode::fsm::{ |
6 | | grapheme_break_fwd::GRAPHEME_BREAK_FWD, |
7 | | grapheme_break_rev::GRAPHEME_BREAK_REV, |
8 | | regional_indicator_rev::REGIONAL_INDICATOR_REV, |
9 | | }, |
10 | | utf8, |
11 | | }; |
12 | | |
13 | | /// An iterator over grapheme clusters in a byte string. |
14 | | /// |
15 | | /// This iterator is typically constructed by |
16 | | /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes). |
17 | | /// |
18 | | /// Unicode defines a grapheme cluster as an *approximation* to a single user |
19 | | /// visible character. A grapheme cluster, or just "grapheme," is made up of |
20 | | /// one or more codepoints. For end user oriented tasks, one should generally |
21 | | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which |
22 | | /// always yields one codepoint at a time. |
23 | | /// |
24 | | /// Since graphemes are made up of one or more codepoints, this iterator yields |
25 | | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints |
26 | | /// are [substituted](index.html#handling-of-invalid-utf-8). |
27 | | /// |
28 | | /// This iterator can be used in reverse. When reversed, exactly the same |
29 | | /// set of grapheme clusters are yielded, but in reverse order. |
30 | | /// |
31 | | /// This iterator only yields *extended* grapheme clusters, in accordance with |
32 | | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). |
33 | | #[derive(Clone, Debug)] |
34 | | pub struct Graphemes<'a> { |
35 | | bs: &'a [u8], |
36 | | } |
37 | | |
38 | | impl<'a> Graphemes<'a> { |
39 | 0 | pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> { |
40 | 0 | Graphemes { bs } |
41 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes>::new Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes>::new Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes>::new |
42 | | |
43 | | /// View the underlying data as a subslice of the original data. |
44 | | /// |
45 | | /// The slice returned has the same lifetime as the original slice, and so |
46 | | /// the iterator can continue to be used while this exists. |
47 | | /// |
48 | | /// # Examples |
49 | | /// |
50 | | /// ``` |
51 | | /// use bstr::ByteSlice; |
52 | | /// |
53 | | /// let mut it = b"abc".graphemes(); |
54 | | /// |
55 | | /// assert_eq!(b"abc", it.as_bytes()); |
56 | | /// it.next(); |
57 | | /// assert_eq!(b"bc", it.as_bytes()); |
58 | | /// it.next(); |
59 | | /// it.next(); |
60 | | /// assert_eq!(b"", it.as_bytes()); |
61 | | /// ``` |
62 | | #[inline] |
63 | 0 | pub fn as_bytes(&self) -> &'a [u8] { |
64 | 0 | self.bs |
65 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes>::as_bytes Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes>::as_bytes Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes>::as_bytes |
66 | | } |
67 | | |
68 | | impl<'a> Iterator for Graphemes<'a> { |
69 | | type Item = &'a str; |
70 | | |
71 | | #[inline] |
72 | 0 | fn next(&mut self) -> Option<&'a str> { |
73 | 0 | let (grapheme, size) = decode_grapheme(self.bs); |
74 | 0 | if size == 0 { |
75 | 0 | return None; |
76 | 0 | } |
77 | 0 | self.bs = &self.bs[size..]; |
78 | 0 | Some(grapheme) |
79 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes as core::iter::traits::iterator::Iterator>::next |
80 | | } |
81 | | |
82 | | impl<'a> DoubleEndedIterator for Graphemes<'a> { |
83 | | #[inline] |
84 | 0 | fn next_back(&mut self) -> Option<&'a str> { |
85 | 0 | let (grapheme, size) = decode_last_grapheme(self.bs); |
86 | 0 | if size == 0 { |
87 | 0 | return None; |
88 | 0 | } |
89 | 0 | self.bs = &self.bs[..self.bs.len() - size]; |
90 | 0 | Some(grapheme) |
91 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::unicode::grapheme::Graphemes as core::iter::traits::double_ended::DoubleEndedIterator>::next_back |
92 | | } |
93 | | |
94 | | /// An iterator over grapheme clusters in a byte string and their byte index |
95 | | /// positions. |
96 | | /// |
97 | | /// This iterator is typically constructed by |
98 | | /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices). |
99 | | /// |
100 | | /// Unicode defines a grapheme cluster as an *approximation* to a single user |
101 | | /// visible character. A grapheme cluster, or just "grapheme," is made up of |
102 | | /// one or more codepoints. For end user oriented tasks, one should generally |
103 | | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which |
104 | | /// always yields one codepoint at a time. |
105 | | /// |
106 | | /// Since graphemes are made up of one or more codepoints, this iterator |
107 | | /// yields `&str` elements (along with their start and end byte offsets). |
108 | | /// When invalid UTF-8 is encountered, replacement codepoints are |
109 | | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the |
110 | | /// indices yielded by this iterator may not correspond to the length of the |
111 | | /// grapheme cluster yielded with those indices. For example, when this |
112 | | /// iterator encounters `\xFF` in the byte string, then it will yield a pair |
113 | | /// of indices ranging over a single byte, but will provide an `&str` |
114 | | /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when |
115 | | /// given only valid UTF-8, then all indices are in exact correspondence with |
116 | | /// their paired grapheme cluster. |
117 | | /// |
118 | | /// This iterator can be used in reverse. When reversed, exactly the same |
119 | | /// set of grapheme clusters are yielded, but in reverse order. |
120 | | /// |
121 | | /// This iterator only yields *extended* grapheme clusters, in accordance with |
122 | | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). |
123 | | #[derive(Clone, Debug)] |
124 | | pub struct GraphemeIndices<'a> { |
125 | | bs: &'a [u8], |
126 | | forward_index: usize, |
127 | | reverse_index: usize, |
128 | | } |
129 | | |
130 | | impl<'a> GraphemeIndices<'a> { |
131 | 0 | pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { |
132 | 0 | GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } |
133 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices>::new Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices>::new Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices>::new |
134 | | |
135 | | /// View the underlying data as a subslice of the original data. |
136 | | /// |
137 | | /// The slice returned has the same lifetime as the original slice, and so |
138 | | /// the iterator can continue to be used while this exists. |
139 | | /// |
140 | | /// # Examples |
141 | | /// |
142 | | /// ``` |
143 | | /// use bstr::ByteSlice; |
144 | | /// |
145 | | /// let mut it = b"abc".grapheme_indices(); |
146 | | /// |
147 | | /// assert_eq!(b"abc", it.as_bytes()); |
148 | | /// it.next(); |
149 | | /// assert_eq!(b"bc", it.as_bytes()); |
150 | | /// it.next(); |
151 | | /// it.next(); |
152 | | /// assert_eq!(b"", it.as_bytes()); |
153 | | /// ``` |
154 | | #[inline] |
155 | 0 | pub fn as_bytes(&self) -> &'a [u8] { |
156 | 0 | self.bs |
157 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices>::as_bytes Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices>::as_bytes Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices>::as_bytes |
158 | | } |
159 | | |
160 | | impl<'a> Iterator for GraphemeIndices<'a> { |
161 | | type Item = (usize, usize, &'a str); |
162 | | |
163 | | #[inline] |
164 | 0 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { |
165 | 0 | let index = self.forward_index; |
166 | 0 | let (grapheme, size) = decode_grapheme(self.bs); |
167 | 0 | if size == 0 { |
168 | 0 | return None; |
169 | 0 | } |
170 | 0 | self.bs = &self.bs[size..]; |
171 | 0 | self.forward_index += size; |
172 | 0 | Some((index, index + size, grapheme)) |
173 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices as core::iter::traits::iterator::Iterator>::next |
174 | | } |
175 | | |
176 | | impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { |
177 | | #[inline] |
178 | 0 | fn next_back(&mut self) -> Option<(usize, usize, &'a str)> { |
179 | 0 | let (grapheme, size) = decode_last_grapheme(self.bs); |
180 | 0 | if size == 0 { |
181 | 0 | return None; |
182 | 0 | } |
183 | 0 | self.bs = &self.bs[..self.bs.len() - size]; |
184 | 0 | self.reverse_index -= size; |
185 | 0 | Some((self.reverse_index, self.reverse_index + size, grapheme)) |
186 | 0 | } Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::unicode::grapheme::GraphemeIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back |
187 | | } |
188 | | |
189 | | /// Decode a grapheme from the given byte string. |
190 | | /// |
191 | | /// This returns the resulting grapheme (which may be a Unicode replacement |
192 | | /// codepoint if invalid UTF-8 was found), along with the number of bytes |
193 | | /// decoded in the byte string. The number of bytes decoded may not be the |
194 | | /// same as the length of grapheme in the case where invalid UTF-8 is found. |
195 | 0 | pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { |
196 | 0 | if bs.is_empty() { |
197 | 0 | ("", 0) |
198 | 0 | } else if bs.len() >= 2 |
199 | 0 | && bs[0].is_ascii() |
200 | 0 | && bs[1].is_ascii() |
201 | 0 | && !bs[0].is_ascii_whitespace() |
202 | | { |
203 | | // FIXME: It is somewhat sad that we have to special case this, but it |
204 | | // leads to a significant speed up in predominantly ASCII text. The |
205 | | // issue here is that the DFA has a bit of overhead, and running it for |
206 | | // every byte in mostly ASCII text results in a bit slowdown. We should |
207 | | // re-litigate this once regex-automata 0.3 is out, but it might be |
208 | | // hard to avoid the special case. A DFA is always going to at least |
209 | | // require some memory access. |
210 | | |
211 | | // Safe because all ASCII bytes are valid UTF-8. |
212 | 0 | let grapheme = unsafe { bs[..1].to_str_unchecked() }; |
213 | 0 | (grapheme, 1) |
214 | 0 | } else if let Some(hm) = { |
215 | 0 | let input = Input::new(bs).anchored(Anchored::Yes); |
216 | 0 | GRAPHEME_BREAK_FWD.try_search_fwd(&input).unwrap() |
217 | 0 | } { |
218 | | // Safe because a match can only occur for valid UTF-8. |
219 | 0 | let grapheme = unsafe { bs[..hm.offset()].to_str_unchecked() }; |
220 | 0 | (grapheme, grapheme.len()) |
221 | | } else { |
222 | | const INVALID: &'static str = "\u{FFFD}"; |
223 | | // No match on non-empty bytes implies we found invalid UTF-8. |
224 | 0 | let (_, size) = utf8::decode_lossy(bs); |
225 | 0 | (INVALID, size) |
226 | | } |
227 | 0 | } Unexecuted instantiation: bstr::unicode::grapheme::decode_grapheme Unexecuted instantiation: bstr::unicode::grapheme::decode_grapheme Unexecuted instantiation: bstr::unicode::grapheme::decode_grapheme |
228 | | |
229 | 0 | fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) { |
230 | 0 | if bs.is_empty() { |
231 | 0 | ("", 0) |
232 | 0 | } else if let Some(hm) = { |
233 | 0 | let input = Input::new(bs).anchored(Anchored::Yes); |
234 | 0 | GRAPHEME_BREAK_REV.try_search_rev(&input).unwrap() |
235 | 0 | } { |
236 | 0 | let start = adjust_rev_for_regional_indicator(bs, hm.offset()); |
237 | 0 | // Safe because a match can only occur for valid UTF-8. |
238 | 0 | let grapheme = unsafe { bs[start..].to_str_unchecked() }; |
239 | 0 | (grapheme, grapheme.len()) |
240 | | } else { |
241 | | const INVALID: &'static str = "\u{FFFD}"; |
242 | | // No match on non-empty bytes implies we found invalid UTF-8. |
243 | 0 | let (_, size) = utf8::decode_last_lossy(bs); |
244 | 0 | (INVALID, size) |
245 | | } |
246 | 0 | } Unexecuted instantiation: bstr::unicode::grapheme::decode_last_grapheme Unexecuted instantiation: bstr::unicode::grapheme::decode_last_grapheme Unexecuted instantiation: bstr::unicode::grapheme::decode_last_grapheme |
247 | | |
248 | | /// Return the correct offset for the next grapheme decoded at the end of the |
249 | | /// given byte string, where `i` is the initial guess. In particular, |
250 | | /// `&bs[i..]` represents the candidate grapheme. |
251 | | /// |
252 | | /// `i` is returned by this function in all cases except when `&bs[i..]` is |
253 | | /// a pair of regional indicator codepoints. In that case, if an odd number of |
254 | | /// additional regional indicator codepoints precedes `i`, then `i` is |
255 | | /// adjusted such that it points to only a single regional indicator. |
256 | | /// |
257 | | /// This "fixing" is necessary to handle the requirement that a break cannot |
258 | | /// occur between regional indicators where it would cause an odd number of |
259 | | /// regional indicators to exist before the break from the *start* of the |
260 | | /// string. A reverse regex cannot detect this case easily without look-around. |
261 | 0 | fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { |
262 | 0 | // All regional indicators use a 4 byte encoding, and we only care about |
263 | 0 | // the case where we found a pair of regional indicators. |
264 | 0 | if bs.len() - i != 8 { |
265 | 0 | return i; |
266 | 0 | } |
267 | 0 | // Count all contiguous occurrences of regional indicators. If there's an |
268 | 0 | // even number of them, then we can accept the pair we found. Otherwise, |
269 | 0 | // we can only take one of them. |
270 | 0 | // |
271 | 0 | // FIXME: This is quadratic in the worst case, e.g., a string of just |
272 | 0 | // regional indicator codepoints. A fix probably requires refactoring this |
273 | 0 | // code a bit such that we don't rescan regional indicators. |
274 | 0 | let mut count = 0; |
275 | 0 | while let Some(hm) = { |
276 | 0 | let input = Input::new(bs).anchored(Anchored::Yes); |
277 | 0 | REGIONAL_INDICATOR_REV.try_search_rev(&input).unwrap() |
278 | 0 | } { |
279 | 0 | bs = &bs[..hm.offset()]; |
280 | 0 | count += 1; |
281 | 0 | } |
282 | 0 | if count % 2 == 0 { |
283 | 0 | i |
284 | | } else { |
285 | 0 | i + 4 |
286 | | } |
287 | 0 | } Unexecuted instantiation: bstr::unicode::grapheme::adjust_rev_for_regional_indicator Unexecuted instantiation: bstr::unicode::grapheme::adjust_rev_for_regional_indicator Unexecuted instantiation: bstr::unicode::grapheme::adjust_rev_for_regional_indicator |
288 | | |
289 | | #[cfg(all(test, feature = "std"))] |
290 | | mod tests { |
291 | | use alloc::{ |
292 | | string::{String, ToString}, |
293 | | vec, |
294 | | vec::Vec, |
295 | | }; |
296 | | |
297 | | #[cfg(not(miri))] |
298 | | use ucd_parse::GraphemeClusterBreakTest; |
299 | | |
300 | | use crate::tests::LOSSY_TESTS; |
301 | | |
302 | | use super::*; |
303 | | |
304 | | #[test] |
305 | | #[cfg(not(miri))] |
306 | | fn forward_ucd() { |
307 | | for (i, test) in ucdtests().into_iter().enumerate() { |
308 | | let given = test.grapheme_clusters.concat(); |
309 | | let got: Vec<String> = Graphemes::new(given.as_bytes()) |
310 | | .map(|cluster| cluster.to_string()) |
311 | | .collect(); |
312 | | assert_eq!( |
313 | | test.grapheme_clusters, |
314 | | got, |
315 | | "\ngrapheme forward break test {} failed:\n\ |
316 | | given: {:?}\n\ |
317 | | expected: {:?}\n\ |
318 | | got: {:?}\n", |
319 | | i, |
320 | | uniescape(&given), |
321 | | uniescape_vec(&test.grapheme_clusters), |
322 | | uniescape_vec(&got), |
323 | | ); |
324 | | } |
325 | | } |
326 | | |
327 | | #[test] |
328 | | #[cfg(not(miri))] |
329 | | fn reverse_ucd() { |
330 | | for (i, test) in ucdtests().into_iter().enumerate() { |
331 | | let given = test.grapheme_clusters.concat(); |
332 | | let mut got: Vec<String> = Graphemes::new(given.as_bytes()) |
333 | | .rev() |
334 | | .map(|cluster| cluster.to_string()) |
335 | | .collect(); |
336 | | got.reverse(); |
337 | | assert_eq!( |
338 | | test.grapheme_clusters, |
339 | | got, |
340 | | "\n\ngrapheme reverse break test {} failed:\n\ |
341 | | given: {:?}\n\ |
342 | | expected: {:?}\n\ |
343 | | got: {:?}\n", |
344 | | i, |
345 | | uniescape(&given), |
346 | | uniescape_vec(&test.grapheme_clusters), |
347 | | uniescape_vec(&got), |
348 | | ); |
349 | | } |
350 | | } |
351 | | |
352 | | #[test] |
353 | | fn forward_lossy() { |
354 | | for &(expected, input) in LOSSY_TESTS { |
355 | | let got = Graphemes::new(input.as_bytes()).collect::<String>(); |
356 | | assert_eq!(expected, got); |
357 | | } |
358 | | } |
359 | | |
360 | | #[test] |
361 | | fn reverse_lossy() { |
362 | | for &(expected, input) in LOSSY_TESTS { |
363 | | let expected: String = expected.chars().rev().collect(); |
364 | | let got = |
365 | | Graphemes::new(input.as_bytes()).rev().collect::<String>(); |
366 | | assert_eq!(expected, got); |
367 | | } |
368 | | } |
369 | | |
370 | | #[cfg(not(miri))] |
371 | | fn uniescape(s: &str) -> String { |
372 | | s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() |
373 | | } |
374 | | |
375 | | #[cfg(not(miri))] |
376 | | fn uniescape_vec(strs: &[String]) -> Vec<String> { |
377 | | strs.iter().map(|s| uniescape(s)).collect() |
378 | | } |
379 | | |
380 | | /// Return all of the UCD for grapheme breaks. |
381 | | #[cfg(not(miri))] |
382 | | fn ucdtests() -> Vec<GraphemeClusterBreakTest> { |
383 | | const TESTDATA: &'static str = |
384 | | include_str!("data/GraphemeBreakTest.txt"); |
385 | | |
386 | | let mut tests = vec![]; |
387 | | for mut line in TESTDATA.lines() { |
388 | | line = line.trim(); |
389 | | if line.starts_with("#") || line.contains("surrogate") { |
390 | | continue; |
391 | | } |
392 | | tests.push(line.parse().unwrap()); |
393 | | } |
394 | | tests |
395 | | } |
396 | | } |