Coverage Report

Created: 2025-12-14 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/unicode-normalization/src/stream_safe.rs
Line
Count
Source
1
use core::iter::FusedIterator;
2
3
use crate::lookups::{
4
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
5
    stream_safe_trailing_nonstarters,
6
};
7
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
8
use crate::tables::stream_safe_leading_nonstarters;
9
10
pub(crate) const MAX_NONSTARTERS: usize = 30;
11
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
12
13
/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
14
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
15
/// (U+034F) if the count exceeds 30.
16
///
17
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
18
pub struct StreamSafe<I> {
19
    iter: I,
20
    nonstarter_count: usize,
21
    buffer: Option<char>,
22
}
23
24
impl<I: Iterator<Item = char>> StreamSafe<I> {
25
    /// Create a new stream safe iterator.
26
    ///
27
    /// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
28
    /// on the iterator.
29
    #[inline]
30
30.2k
    pub fn new(iter: I) -> Self {
31
30.2k
        Self {
32
30.2k
            iter,
33
30.2k
            nonstarter_count: 0,
34
30.2k
            buffer: None,
35
30.2k
        }
36
30.2k
    }
37
}
38
39
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
40
    type Item = char;
41
42
    #[inline]
43
184M
    fn next(&mut self) -> Option<char> {
44
184M
        let next_ch = self.buffer.take().or_else(|| self.iter.next())?;
45
184M
        let d = classify_nonstarters(next_ch);
46
184M
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
47
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
48
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
49
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
50
3.39M
            self.nonstarter_count = 0;
51
3.39M
            self.buffer = Some(next_ch);
52
3.39M
            return Some(COMBINING_GRAPHEME_JOINER);
53
180M
        }
54
55
        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
56
        // nonstarters in NFKD.
57
180M
        if d.leading_nonstarters == d.decomposition_len {
58
118M
            self.nonstarter_count += d.decomposition_len;
59
118M
        }
60
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
61
62.4M
        else {
62
62.4M
            self.nonstarter_count = d.trailing_nonstarters;
63
62.4M
        }
64
180M
        Some(next_ch)
65
184M
    }
66
}
67
68
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
69
70
#[derive(Debug)]
71
pub(crate) struct Decomposition {
72
    pub(crate) leading_nonstarters: usize,
73
    pub(crate) trailing_nonstarters: usize,
74
    pub(crate) decomposition_len: usize,
75
}
76
77
#[inline]
78
384M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
79
    // As usual, fast path for ASCII (which is always a starter)
80
384M
    if c <= '\x7f' {
81
51.5M
        return Decomposition {
82
51.5M
            leading_nonstarters: 0,
83
51.5M
            trailing_nonstarters: 0,
84
51.5M
            decomposition_len: 1,
85
51.5M
        };
86
333M
    }
87
    // Next, special case Hangul, since it's not handled by our tables.
88
333M
    if is_hangul_syllable(c) {
89
16.2k
        return Decomposition {
90
16.2k
            leading_nonstarters: 0,
91
16.2k
            trailing_nonstarters: 0,
92
16.2k
            decomposition_len: hangul_decomposition_length(c),
93
16.2k
        };
94
333M
    }
95
333M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
96
333M
    match decomp {
97
50.8M
        Some(decomp) => Decomposition {
98
50.8M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
99
50.8M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
100
50.8M
            decomposition_len: decomp.len(),
101
50.8M
        },
102
        None => {
103
282M
            let is_nonstarter = canonical_combining_class(c) != 0;
104
282M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
105
282M
            Decomposition {
106
282M
                leading_nonstarters: nonstarter,
107
282M
                trailing_nonstarters: nonstarter,
108
282M
                decomposition_len: 1,
109
282M
            }
110
        }
111
    }
112
384M
}
unicode_normalization::stream_safe::classify_nonstarters
Line
Count
Source
78
200M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
79
    // As usual, fast path for ASCII (which is always a starter)
80
200M
    if c <= '\x7f' {
81
0
        return Decomposition {
82
0
            leading_nonstarters: 0,
83
0
            trailing_nonstarters: 0,
84
0
            decomposition_len: 1,
85
0
        };
86
200M
    }
87
    // Next, special case Hangul, since it's not handled by our tables.
88
200M
    if is_hangul_syllable(c) {
89
9.05k
        return Decomposition {
90
9.05k
            leading_nonstarters: 0,
91
9.05k
            trailing_nonstarters: 0,
92
9.05k
            decomposition_len: hangul_decomposition_length(c),
93
9.05k
        };
94
200M
    }
95
200M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
96
200M
    match decomp {
97
11.4M
        Some(decomp) => Decomposition {
98
11.4M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
99
11.4M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
100
11.4M
            decomposition_len: decomp.len(),
101
11.4M
        },
102
        None => {
103
189M
            let is_nonstarter = canonical_combining_class(c) != 0;
104
189M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
105
189M
            Decomposition {
106
189M
                leading_nonstarters: nonstarter,
107
189M
                trailing_nonstarters: nonstarter,
108
189M
                decomposition_len: 1,
109
189M
            }
110
        }
111
    }
112
200M
}
unicode_normalization::stream_safe::classify_nonstarters
Line
Count
Source
78
184M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
79
    // As usual, fast path for ASCII (which is always a starter)
80
184M
    if c <= '\x7f' {
81
51.5M
        return Decomposition {
82
51.5M
            leading_nonstarters: 0,
83
51.5M
            trailing_nonstarters: 0,
84
51.5M
            decomposition_len: 1,
85
51.5M
        };
86
132M
    }
87
    // Next, special case Hangul, since it's not handled by our tables.
88
132M
    if is_hangul_syllable(c) {
89
7.20k
        return Decomposition {
90
7.20k
            leading_nonstarters: 0,
91
7.20k
            trailing_nonstarters: 0,
92
7.20k
            decomposition_len: hangul_decomposition_length(c),
93
7.20k
        };
94
132M
    }
95
132M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
96
132M
    match decomp {
97
39.4M
        Some(decomp) => Decomposition {
98
39.4M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
99
39.4M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
100
39.4M
            decomposition_len: decomp.len(),
101
39.4M
        },
102
        None => {
103
93.1M
            let is_nonstarter = canonical_combining_class(c) != 0;
104
93.1M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
105
93.1M
            Decomposition {
106
93.1M
                leading_nonstarters: nonstarter,
107
93.1M
                trailing_nonstarters: nonstarter,
108
93.1M
                decomposition_len: 1,
109
93.1M
            }
110
        }
111
    }
112
184M
}
113
114
#[cfg(test)]
115
mod tests {
116
    use super::{classify_nonstarters, StreamSafe};
117
    use crate::lookups::canonical_combining_class;
118
    use crate::normalize::decompose_compatible;
119
120
    #[cfg(not(feature = "std"))]
121
    use alloc::{string::String, vec::Vec};
122
123
    use core::char;
124
125
    fn stream_safe(s: &str) -> String {
126
        StreamSafe::new(s.chars()).collect()
127
    }
128
129
    #[test]
130
    fn test_simple() {
131
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
132
        assert_eq!(stream_safe(technically_okay), technically_okay);
133
134
        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
135
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
136
        assert_eq!(stream_safe(too_much), fixed_it);
137
138
        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
139
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
140
        assert_eq!(stream_safe(woah_nelly), its_cool);
141
    }
142
143
    #[test]
144
    fn test_all_nonstarters() {
145
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
146
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
147
        assert_eq!(stream_safe(s), expected);
148
    }
149
150
    #[test]
151
    fn test_classify_nonstarters() {
152
        // Highest character in the `compat_fully_decomp` table is 2FA1D
153
        for ch in 0..0x2FA1E {
154
            let ch = match char::from_u32(ch) {
155
                Some(c) => c,
156
                None => continue,
157
            };
158
            let c = classify_nonstarters(ch);
159
            let mut s = Vec::new();
160
            decompose_compatible(ch, |c| s.push(c));
161
162
            assert_eq!(s.len(), c.decomposition_len);
163
164
            let num_leading = s
165
                .iter()
166
                .take_while(|&c| canonical_combining_class(*c) != 0)
167
                .count();
168
            let num_trailing = s
169
                .iter()
170
                .rev()
171
                .take_while(|&c| canonical_combining_class(*c) != 0)
172
                .count();
173
174
            assert_eq!(num_leading, c.leading_nonstarters);
175
            assert_eq!(num_trailing, c.trailing_nonstarters);
176
        }
177
    }
178
}