Coverage Report

Created: 2025-11-24 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/unicode-normalization/src/stream_safe.rs
Line
Count
Source
1
use core::iter::FusedIterator;
2
3
use crate::lookups::{
4
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
5
    stream_safe_trailing_nonstarters,
6
};
7
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
8
use crate::tables::stream_safe_leading_nonstarters;
9
10
pub(crate) const MAX_NONSTARTERS: usize = 30;
11
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
12
13
/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
14
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
15
/// (U+034F) if the count exceeds 30.
16
///
17
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
18
pub struct StreamSafe<I> {
19
    iter: I,
20
    nonstarter_count: usize,
21
    buffer: Option<char>,
22
}
23
24
impl<I: Iterator<Item = char>> StreamSafe<I> {
25
    /// Create a new stream safe iterator.
26
    ///
27
    /// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
28
    /// on the iterator.
29
    #[inline]
30
31.1k
    pub fn new(iter: I) -> Self {
31
31.1k
        Self {
32
31.1k
            iter,
33
31.1k
            nonstarter_count: 0,
34
31.1k
            buffer: None,
35
31.1k
        }
36
31.1k
    }
37
}
38
39
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
40
    type Item = char;
41
42
    #[inline]
43
185M
    fn next(&mut self) -> Option<char> {
44
185M
        let next_ch = self.buffer.take().or_else(|| self.iter.next())?;
45
185M
        let d = classify_nonstarters(next_ch);
46
185M
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
47
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
48
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
49
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
50
2.75M
            self.nonstarter_count = 0;
51
2.75M
            self.buffer = Some(next_ch);
52
2.75M
            return Some(COMBINING_GRAPHEME_JOINER);
53
182M
        }
54
55
        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
56
        // nonstarters in NFKD.
57
182M
        if d.leading_nonstarters == d.decomposition_len {
58
95.0M
            self.nonstarter_count += d.decomposition_len;
59
95.0M
        }
60
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
61
87.4M
        else {
62
87.4M
            self.nonstarter_count = d.trailing_nonstarters;
63
87.4M
        }
64
182M
        Some(next_ch)
65
185M
    }
66
}
67
68
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
69
70
#[derive(Debug)]
71
pub(crate) struct Decomposition {
72
    pub(crate) leading_nonstarters: usize,
73
    pub(crate) trailing_nonstarters: usize,
74
    pub(crate) decomposition_len: usize,
75
}
76
77
#[inline]
78
352M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
79
    // As usual, fast path for ASCII (which is always a starter)
80
352M
    if c <= '\x7f' {
81
75.2M
        return Decomposition {
82
75.2M
            leading_nonstarters: 0,
83
75.2M
            trailing_nonstarters: 0,
84
75.2M
            decomposition_len: 1,
85
75.2M
        };
86
277M
    }
87
    // Next, special case Hangul, since it's not handled by our tables.
88
277M
    if is_hangul_syllable(c) {
89
18.3k
        return Decomposition {
90
18.3k
            leading_nonstarters: 0,
91
18.3k
            trailing_nonstarters: 0,
92
18.3k
            decomposition_len: hangul_decomposition_length(c),
93
18.3k
        };
94
277M
    }
95
277M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
96
277M
    match decomp {
97
49.1M
        Some(decomp) => Decomposition {
98
49.1M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
99
49.1M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
100
49.1M
            decomposition_len: decomp.len(),
101
49.1M
        },
102
        None => {
103
228M
            let is_nonstarter = canonical_combining_class(c) != 0;
104
228M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
105
228M
            Decomposition {
106
228M
                leading_nonstarters: nonstarter,
107
228M
                trailing_nonstarters: nonstarter,
108
228M
                decomposition_len: 1,
109
228M
            }
110
        }
111
    }
112
352M
}
unicode_normalization::stream_safe::classify_nonstarters
Line
Count
Source
78
167M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
79
    // As usual, fast path for ASCII (which is always a starter)
80
167M
    if c <= '\x7f' {
81
0
        return Decomposition {
82
0
            leading_nonstarters: 0,
83
0
            trailing_nonstarters: 0,
84
0
            decomposition_len: 1,
85
0
        };
86
167M
    }
87
    // Next, special case Hangul, since it's not handled by our tables.
88
167M
    if is_hangul_syllable(c) {
89
10.1k
        return Decomposition {
90
10.1k
            leading_nonstarters: 0,
91
10.1k
            trailing_nonstarters: 0,
92
10.1k
            decomposition_len: hangul_decomposition_length(c),
93
10.1k
        };
94
167M
    }
95
167M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
96
167M
    match decomp {
97
13.7M
        Some(decomp) => Decomposition {
98
13.7M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
99
13.7M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
100
13.7M
            decomposition_len: decomp.len(),
101
13.7M
        },
102
        None => {
103
153M
            let is_nonstarter = canonical_combining_class(c) != 0;
104
153M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
105
153M
            Decomposition {
106
153M
                leading_nonstarters: nonstarter,
107
153M
                trailing_nonstarters: nonstarter,
108
153M
                decomposition_len: 1,
109
153M
            }
110
        }
111
    }
112
167M
}
unicode_normalization::stream_safe::classify_nonstarters
Line
Count
Source
78
185M
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
79
    // As usual, fast path for ASCII (which is always a starter)
80
185M
    if c <= '\x7f' {
81
75.2M
        return Decomposition {
82
75.2M
            leading_nonstarters: 0,
83
75.2M
            trailing_nonstarters: 0,
84
75.2M
            decomposition_len: 1,
85
75.2M
        };
86
109M
    }
87
    // Next, special case Hangul, since it's not handled by our tables.
88
109M
    if is_hangul_syllable(c) {
89
8.22k
        return Decomposition {
90
8.22k
            leading_nonstarters: 0,
91
8.22k
            trailing_nonstarters: 0,
92
8.22k
            decomposition_len: hangul_decomposition_length(c),
93
8.22k
        };
94
109M
    }
95
109M
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
96
109M
    match decomp {
97
35.4M
        Some(decomp) => Decomposition {
98
35.4M
            leading_nonstarters: stream_safe_leading_nonstarters(c),
99
35.4M
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
100
35.4M
            decomposition_len: decomp.len(),
101
35.4M
        },
102
        None => {
103
74.5M
            let is_nonstarter = canonical_combining_class(c) != 0;
104
74.5M
            let nonstarter = if is_nonstarter { 1 } else { 0 };
105
74.5M
            Decomposition {
106
74.5M
                leading_nonstarters: nonstarter,
107
74.5M
                trailing_nonstarters: nonstarter,
108
74.5M
                decomposition_len: 1,
109
74.5M
            }
110
        }
111
    }
112
185M
}
113
114
#[cfg(test)]
115
mod tests {
116
    use super::{classify_nonstarters, StreamSafe};
117
    use crate::lookups::canonical_combining_class;
118
    use crate::normalize::decompose_compatible;
119
120
    #[cfg(not(feature = "std"))]
121
    use alloc::{string::String, vec::Vec};
122
123
    use core::char;
124
125
    fn stream_safe(s: &str) -> String {
126
        StreamSafe::new(s.chars()).collect()
127
    }
128
129
    #[test]
130
    fn test_simple() {
131
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
132
        assert_eq!(stream_safe(technically_okay), technically_okay);
133
134
        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
135
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
136
        assert_eq!(stream_safe(too_much), fixed_it);
137
138
        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
139
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
140
        assert_eq!(stream_safe(woah_nelly), its_cool);
141
    }
142
143
    #[test]
144
    fn test_all_nonstarters() {
145
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
146
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
147
        assert_eq!(stream_safe(s), expected);
148
    }
149
150
    #[test]
151
    fn test_classify_nonstarters() {
152
        // Highest character in the `compat_fully_decomp` table is 2FA1D
153
        for ch in 0..0x2FA1E {
154
            let ch = match char::from_u32(ch) {
155
                Some(c) => c,
156
                None => continue,
157
            };
158
            let c = classify_nonstarters(ch);
159
            let mut s = Vec::new();
160
            decompose_compatible(ch, |c| s.push(c));
161
162
            assert_eq!(s.len(), c.decomposition_len);
163
164
            let num_leading = s
165
                .iter()
166
                .take_while(|&c| canonical_combining_class(*c) != 0)
167
                .count();
168
            let num_trailing = s
169
                .iter()
170
                .rev()
171
                .take_while(|&c| canonical_combining_class(*c) != 0)
172
                .count();
173
174
            assert_eq!(num_leading, c.leading_nonstarters);
175
            assert_eq!(num_trailing, c.trailing_nonstarters);
176
        }
177
    }
178
}