/src/unicode-normalization/src/stream_safe.rs
Line | Count | Source |
1 | | use core::iter::FusedIterator; |
2 | | |
3 | | use crate::lookups::{ |
4 | | canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed, |
5 | | stream_safe_trailing_nonstarters, |
6 | | }; |
7 | | use crate::normalize::{hangul_decomposition_length, is_hangul_syllable}; |
8 | | use crate::tables::stream_safe_leading_nonstarters; |
9 | | |
/// Maximum run of nonstarters permitted by the Stream-Safe Text Format
/// (UAX #15): once exceeded, a CGJ is inserted to break the run.
pub(crate) const MAX_NONSTARTERS: usize = 30;
/// U+034F COMBINING GRAPHEME JOINER, the character inserted to break
/// over-long nonstarter sequences.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
12 | | |
/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
///
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
pub struct StreamSafe<I> {
    // The wrapped character iterator.
    iter: I,
    // Running count of contiguous nonstarters (in NFKD) since the last starter.
    nonstarter_count: usize,
    // A character pushed back after a CGJ was emitted in its place; it is
    // re-examined on the next call to `next()` before pulling from `iter`.
    buffer: Option<char>,
}
23 | | |
24 | | impl<I: Iterator<Item = char>> StreamSafe<I> { |
25 | | /// Create a new stream safe iterator. |
26 | | /// |
27 | | /// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe) |
28 | | /// on the iterator. |
29 | | #[inline] |
30 | 25.9k | pub fn new(iter: I) -> Self { |
31 | 25.9k | Self { |
32 | 25.9k | iter, |
33 | 25.9k | nonstarter_count: 0, |
34 | 25.9k | buffer: None, |
35 | 25.9k | } |
36 | 25.9k | } |
37 | | } |
38 | | |
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        // A pushed-back character (left behind when a CGJ was emitted) takes
        // priority over pulling a fresh one from the inner iterator.
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
            self.nonstarter_count = 0;
            self.buffer = Some(next_ch);
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
        // nonstarters in NFKD.
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}
70 | | |
// `next()` only returns `None` when the (fused) inner iterator does, so
// `StreamSafe` is itself fused.
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
72 | | |
/// Summary of a character's full (compatibility) decomposition as needed by
/// the stream-safe algorithm.
#[derive(Debug)]
pub(crate) struct Decomposition {
    // Number of nonstarters at the start of the decomposition.
    pub(crate) leading_nonstarters: usize,
    // Number of nonstarters at the end of the decomposition.
    pub(crate) trailing_nonstarters: usize,
    // Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
79 | | |
/// Classify `c` by its full compatibility decomposition: how many nonstarters
/// the decomposition begins and ends with, and its total length.
#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter)
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        };
    }
    // Next, special case Hangul, since it's not handled by our tables.
    if is_hangul_syllable(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    // Prefer the compatibility decomposition; fall back to the canonical one.
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => Decomposition {
            leading_nonstarters: stream_safe_leading_nonstarters(c),
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
            decomposition_len: decomp.len(),
        },
        None => {
            // No decomposition: the character decomposes to itself, and is a
            // nonstarter iff its canonical combining class is nonzero.
            let is_nonstarter = canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}
116 | | |
#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use alloc::{string::String, vec::Vec};

    use core::char;

    // Collect the stream-safe form of `s` into an owned `String`.
    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        // Exactly 30 combining marks after "Da": at the limit, passed through unchanged.
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        // A 31st mark exceeds the limit, so a CGJ (U+034F) is inserted before it.
        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
        assert_eq!(stream_safe(too_much), fixed_it);

        // Two over-long runs: a CGJ is inserted into each run independently.
        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
        assert_eq!(stream_safe(woah_nelly), its_cool);
    }

    #[test]
    fn test_all_nonstarters() {
        // 40 consecutive nonstarters with no starter at all: a CGJ is still
        // inserted after the 30th.
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
        assert_eq!(stream_safe(s), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Exhaustively cross-check `classify_nonstarters` against an actual
        // compatibility decomposition of every character in table range.
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
}