Coverage Report

Created: 2025-07-11 07:04

/src/unicode-normalization/src/stream_safe.rs
Line
Count
Source
1
use core::iter::FusedIterator;
2
3
use crate::lookups::{
4
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
5
    stream_safe_trailing_nonstarters,
6
};
7
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
8
use crate::tables::stream_safe_leading_nonstarters;
9
10
pub(crate) const MAX_NONSTARTERS: usize = 30;
11
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
12
13
/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
14
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
15
/// (U+034F) if the count exceeds 30.
16
///
17
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
18
pub struct StreamSafe<I> {
19
    iter: I,
20
    nonstarter_count: usize,
21
    buffer: Option<char>,
22
}
23
24
impl<I: Iterator<Item = char>> StreamSafe<I> {
25
    /// Create a new stream safe iterator.
26
    ///
27
    /// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
28
    /// on the iterator.
29
    #[inline]
30
25.9k
    pub fn new(iter: I) -> Self {
31
25.9k
        Self {
32
25.9k
            iter,
33
25.9k
            nonstarter_count: 0,
34
25.9k
            buffer: None,
35
25.9k
        }
36
25.9k
    }
37
}
38
39
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
40
    type Item = char;
41
42
    #[inline]
43
75.2M
    fn next(&mut self) -> Option<char> {
44
75.2M
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
45
31.9k
            None => return None,
46
75.2M
            Some(c) => c,
47
75.2M
        };
48
75.2M
        let d = classify_nonstarters(next_ch);
49
75.2M
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
50
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
51
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
52
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
53
1.07M
            self.nonstarter_count = 0;
54
1.07M
            self.buffer = Some(next_ch);
55
1.07M
            return Some(COMBINING_GRAPHEME_JOINER);
56
74.1M
        }
57
74.1M
58
74.1M
        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
59
74.1M
        // nonstarters in NFKD.
60
74.1M
        if d.leading_nonstarters == d.decomposition_len {
61
38.1M
            self.nonstarter_count += d.decomposition_len;
62
38.1M
        }
63
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
64
36.0M
        else {
65
36.0M
            self.nonstarter_count = d.trailing_nonstarters;
66
36.0M
        }
67
74.1M
        Some(next_ch)
68
75.2M
    }
69
}
70
71
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
72
73
#[derive(Debug)]
74
pub(crate) struct Decomposition {
75
    pub(crate) leading_nonstarters: usize,
76
    pub(crate) trailing_nonstarters: usize,
77
    pub(crate) decomposition_len: usize,
78
}
79
80
#[inline]
81
143M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
82
143M
    // As usual, fast path for ASCII (which is always a starter)
83
143M
    if c <= '\x7f' {
84
29.4M
        return Decomposition {
85
29.4M
            leading_nonstarters: 0,
86
29.4M
            trailing_nonstarters: 0,
87
29.4M
            decomposition_len: 1,
88
29.4M
        };
89
114M
    }
90
114M
    // Next, special case Hangul, since it's not handled by our tables.
91
114M
    if is_hangul_syllable(c) {
92
30.6k
        return Decomposition {
93
30.6k
            leading_nonstarters: 0,
94
30.6k
            trailing_nonstarters: 0,
95
30.6k
            decomposition_len: hangul_decomposition_length(c),
96
30.6k
        };
97
114M
    }
98
114M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
99
114M
    match decomp {
100
23.0M
        Some(decomp) => Decomposition {
101
23.0M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
102
23.0M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
103
23.0M
            decomposition_len: decomp.len(),
104
23.0M
        },
105
        None => {
106
91.4M
            let is_nonstarter = canonical_combining_class(c) != 0;
107
91.4M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
108
91.4M
            Decomposition {
109
91.4M
                leading_nonstarters: nonstarter,
110
91.4M
                trailing_nonstarters: nonstarter,
111
91.4M
                decomposition_len: 1,
112
91.4M
            }
113
        }
114
    }
115
143M
}
unicode_normalization::stream_safe::classify_nonstarters
Line
Count
Source
81
68.6M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
82
68.6M
    // As usual, fast path for ASCII (which is always a starter)
83
68.6M
    if c <= '\x7f' {
84
0
        return Decomposition {
85
0
            leading_nonstarters: 0,
86
0
            trailing_nonstarters: 0,
87
0
            decomposition_len: 1,
88
0
        };
89
68.6M
    }
90
68.6M
    // Next, special case Hangul, since it's not handled by our tables.
91
68.6M
    if is_hangul_syllable(c) {
92
18.7k
        return Decomposition {
93
18.7k
            leading_nonstarters: 0,
94
18.7k
            trailing_nonstarters: 0,
95
18.7k
            decomposition_len: hangul_decomposition_length(c),
96
18.7k
        };
97
68.6M
    }
98
68.6M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
99
68.6M
    match decomp {
100
7.46M
        Some(decomp) => Decomposition {
101
7.46M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
102
7.46M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
103
7.46M
            decomposition_len: decomp.len(),
104
7.46M
        },
105
        None => {
106
61.2M
            let is_nonstarter = canonical_combining_class(c) != 0;
107
61.2M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
108
61.2M
            Decomposition {
109
61.2M
                leading_nonstarters: nonstarter,
110
61.2M
                trailing_nonstarters: nonstarter,
111
61.2M
                decomposition_len: 1,
112
61.2M
            }
113
        }
114
    }
115
68.6M
}
unicode_normalization::stream_safe::classify_nonstarters
Line
Count
Source
81
75.2M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
82
75.2M
    // As usual, fast path for ASCII (which is always a starter)
83
75.2M
    if c <= '\x7f' {
84
29.4M
        return Decomposition {
85
29.4M
            leading_nonstarters: 0,
86
29.4M
            trailing_nonstarters: 0,
87
29.4M
            decomposition_len: 1,
88
29.4M
        };
89
45.7M
    }
90
45.7M
    // Next, special case Hangul, since it's not handled by our tables.
91
45.7M
    if is_hangul_syllable(c) {
92
11.9k
        return Decomposition {
93
11.9k
            leading_nonstarters: 0,
94
11.9k
            trailing_nonstarters: 0,
95
11.9k
            decomposition_len: hangul_decomposition_length(c),
96
11.9k
        };
97
45.7M
    }
98
45.7M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
99
45.7M
    match decomp {
100
15.5M
        Some(decomp) => Decomposition {
101
15.5M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
102
15.5M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
103
15.5M
            decomposition_len: decomp.len(),
104
15.5M
        },
105
        None => {
106
30.2M
            let is_nonstarter = canonical_combining_class(c) != 0;
107
30.2M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
108
30.2M
            Decomposition {
109
30.2M
                leading_nonstarters: nonstarter,
110
30.2M
                trailing_nonstarters: nonstarter,
111
30.2M
                decomposition_len: 1,
112
30.2M
            }
113
        }
114
    }
115
75.2M
}
116
117
#[cfg(test)]
118
mod tests {
119
    use super::{classify_nonstarters, StreamSafe};
120
    use crate::lookups::canonical_combining_class;
121
    use crate::normalize::decompose_compatible;
122
123
    #[cfg(not(feature = "std"))]
124
    use alloc::{string::String, vec::Vec};
125
126
    use core::char;
127
128
    fn stream_safe(s: &str) -> String {
129
        StreamSafe::new(s.chars()).collect()
130
    }
131
132
    #[test]
133
    fn test_simple() {
134
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
135
        assert_eq!(stream_safe(technically_okay), technically_okay);
136
137
        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
138
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
139
        assert_eq!(stream_safe(too_much), fixed_it);
140
141
        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
142
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
143
        assert_eq!(stream_safe(woah_nelly), its_cool);
144
    }
145
146
    #[test]
147
    fn test_all_nonstarters() {
148
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
149
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
150
        assert_eq!(stream_safe(s), expected);
151
    }
152
153
    #[test]
154
    fn test_classify_nonstarters() {
155
        // Highest character in the `compat_fully_decomp` table is 2FA1D
156
        for ch in 0..0x2FA1E {
157
            let ch = match char::from_u32(ch) {
158
                Some(c) => c,
159
                None => continue,
160
            };
161
            let c = classify_nonstarters(ch);
162
            let mut s = Vec::new();
163
            decompose_compatible(ch, |c| s.push(c));
164
165
            assert_eq!(s.len(), c.decomposition_len);
166
167
            let num_leading = s
168
                .iter()
169
                .take_while(|&c| canonical_combining_class(*c) != 0)
170
                .count();
171
            let num_trailing = s
172
                .iter()
173
                .rev()
174
                .take_while(|&c| canonical_combining_class(*c) != 0)
175
                .count();
176
177
            assert_eq!(num_leading, c.leading_nonstarters);
178
            assert_eq!(num_trailing, c.trailing_nonstarters);
179
        }
180
    }
181
}