/src/unicode-normalization/src/quick_check.rs
Line | Count | Source |
1 | | use crate::lookups::canonical_combining_class; |
2 | | use crate::stream_safe; |
3 | | use crate::tables; |
4 | | use crate::UnicodeNormalization; |
5 | | |
/// The outcome of a normalization quick check: `Yes`, `No`, or `Maybe`.
///
/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
///
/// The type is a plain fieldless enum, so it is `Copy`: a verdict can be
/// stored, compared and passed around by value without being consumed.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The text is definitely normalized.
    Yes,
    /// The text is definitely not normalized.
    No,
    /// The text may be normalized.
    Maybe,
}
21 | | |
22 | | // https://unicode.org/reports/tr15/#Detecting_Normalization_Forms |
23 | | #[inline] |
24 | 0 | fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized |
25 | 0 | where |
26 | 0 | I: Iterator<Item = char>, |
27 | 0 | F: Fn(char) -> IsNormalized, |
28 | | { |
29 | 0 | let mut last_cc = 0u8; |
30 | 0 | let mut nonstarter_count = 0; |
31 | 0 | let mut result = IsNormalized::Yes; |
32 | 0 | for ch in s { |
33 | | // For ASCII we know it's always allowed and a starter |
34 | 0 | if ch <= '\x7f' { |
35 | 0 | last_cc = 0; |
36 | 0 | nonstarter_count = 0; |
37 | 0 | continue; |
38 | 0 | } |
39 | | |
40 | | // Otherwise, lookup the combining class and QC property |
41 | 0 | let cc = canonical_combining_class(ch); |
42 | 0 | if last_cc > cc && cc != 0 { |
43 | 0 | return IsNormalized::No; |
44 | 0 | } |
45 | 0 | match is_allowed(ch) { |
46 | 0 | IsNormalized::Yes => (), |
47 | 0 | IsNormalized::No => return IsNormalized::No, |
48 | 0 | IsNormalized::Maybe => { |
49 | 0 | result = IsNormalized::Maybe; |
50 | 0 | } |
51 | | } |
52 | 0 | if stream_safe { |
53 | 0 | let decomp = stream_safe::classify_nonstarters(ch); |
54 | | |
55 | | // If we're above `MAX_NONSTARTERS`, we're definitely *not* |
56 | | // stream-safe normalized. |
57 | 0 | if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS { |
58 | 0 | return IsNormalized::No; |
59 | 0 | } |
60 | 0 | if decomp.leading_nonstarters == decomp.decomposition_len { |
61 | 0 | nonstarter_count += decomp.decomposition_len; |
62 | 0 | } else { |
63 | 0 | nonstarter_count = decomp.trailing_nonstarters; |
64 | 0 | } |
65 | 0 | } |
66 | 0 | last_cc = cc; |
67 | | } |
68 | 0 | result |
69 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::quick_check::<unicode_normalization::tables::qc_nfc, core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::quick_check::<unicode_normalization::tables::qc_nfd, core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::quick_check::<unicode_normalization::tables::qc_nfkc, core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::quick_check::<unicode_normalization::tables::qc_nfkd, core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::quick_check::<_, _> |
70 | | |
71 | | /// Quickly check if a string is in NFC, potentially returning |
72 | | /// `IsNormalized::Maybe` if further checks are necessary. In this case a check |
73 | | /// like `s.chars().nfc().eq(s.chars())` should suffice. |
74 | | #[inline] |
75 | 0 | pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
76 | 0 | quick_check(s, tables::qc_nfc, false) |
77 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfc_quick::<core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::is_nfc_quick::<_> |
78 | | |
79 | | /// Quickly check if a string is in NFKC. |
80 | | #[inline] |
81 | 0 | pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
82 | 0 | quick_check(s, tables::qc_nfkc, false) |
83 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfkc_quick::<core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::is_nfkc_quick::<_> |
84 | | |
85 | | /// Quickly check if a string is in NFD. |
86 | | #[inline] |
87 | 0 | pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
88 | 0 | quick_check(s, tables::qc_nfd, false) |
89 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfd_quick::<core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::is_nfd_quick::<_> |
90 | | |
91 | | /// Quickly check if a string is in NFKD. |
92 | | #[inline] |
93 | 0 | pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
94 | 0 | quick_check(s, tables::qc_nfkd, false) |
95 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfkd_quick::<core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::is_nfkd_quick::<_> |
96 | | |
97 | | /// Quickly check if a string is Stream-Safe NFC. |
98 | | #[inline] |
99 | 0 | pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
100 | 0 | quick_check(s, tables::qc_nfc, true) |
101 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfc_stream_safe_quick::<core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::is_nfc_stream_safe_quick::<_> |
102 | | |
103 | | /// Quickly check if a string is Stream-Safe NFD. |
104 | | #[inline] |
105 | 0 | pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
106 | 0 | quick_check(s, tables::qc_nfd, true) |
107 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfd_stream_safe_quick::<core::str::iter::Chars> Unexecuted instantiation: unicode_normalization::quick_check::is_nfd_stream_safe_quick::<_> |
108 | | |
109 | | /// Authoritatively check if a string is in NFC. |
110 | | #[inline] |
111 | 0 | pub fn is_nfc(s: &str) -> bool { |
112 | 0 | match is_nfc_quick(s.chars()) { |
113 | 0 | IsNormalized::Yes => true, |
114 | 0 | IsNormalized::No => false, |
115 | 0 | IsNormalized::Maybe => s.chars().eq(s.chars().nfc()), |
116 | | } |
117 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfc Unexecuted instantiation: unicode_normalization::quick_check::is_nfc |
118 | | |
119 | | /// Authoritatively check if a string is in NFKC. |
120 | | #[inline] |
121 | 0 | pub fn is_nfkc(s: &str) -> bool { |
122 | 0 | match is_nfkc_quick(s.chars()) { |
123 | 0 | IsNormalized::Yes => true, |
124 | 0 | IsNormalized::No => false, |
125 | 0 | IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()), |
126 | | } |
127 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfkc Unexecuted instantiation: unicode_normalization::quick_check::is_nfkc |
128 | | |
129 | | /// Authoritatively check if a string is in NFD. |
130 | | #[inline] |
131 | 0 | pub fn is_nfd(s: &str) -> bool { |
132 | 0 | match is_nfd_quick(s.chars()) { |
133 | 0 | IsNormalized::Yes => true, |
134 | 0 | IsNormalized::No => false, |
135 | 0 | IsNormalized::Maybe => s.chars().eq(s.chars().nfd()), |
136 | | } |
137 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfd Unexecuted instantiation: unicode_normalization::quick_check::is_nfd |
138 | | |
139 | | /// Authoritatively check if a string is in NFKD. |
140 | | #[inline] |
141 | 0 | pub fn is_nfkd(s: &str) -> bool { |
142 | 0 | match is_nfkd_quick(s.chars()) { |
143 | 0 | IsNormalized::Yes => true, |
144 | 0 | IsNormalized::No => false, |
145 | 0 | IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()), |
146 | | } |
147 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfkd Unexecuted instantiation: unicode_normalization::quick_check::is_nfkd |
148 | | |
149 | | /// Authoritatively check if a string is Stream-Safe NFC. |
150 | | #[inline] |
151 | 0 | pub fn is_nfc_stream_safe(s: &str) -> bool { |
152 | 0 | match is_nfc_stream_safe_quick(s.chars()) { |
153 | 0 | IsNormalized::Yes => true, |
154 | 0 | IsNormalized::No => false, |
155 | 0 | IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()), |
156 | | } |
157 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfc_stream_safe Unexecuted instantiation: unicode_normalization::quick_check::is_nfc_stream_safe |
158 | | |
159 | | /// Authoritatively check if a string is Stream-Safe NFD. |
160 | | #[inline] |
161 | 0 | pub fn is_nfd_stream_safe(s: &str) -> bool { |
162 | 0 | match is_nfd_stream_safe_quick(s.chars()) { |
163 | 0 | IsNormalized::Yes => true, |
164 | 0 | IsNormalized::No => false, |
165 | 0 | IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()), |
166 | | } |
167 | 0 | } Unexecuted instantiation: unicode_normalization::quick_check::is_nfd_stream_safe Unexecuted instantiation: unicode_normalization::quick_check::is_nfd_stream_safe |
168 | | |
#[cfg(test)]
mod tests {
    use super::IsNormalized;
    use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick};

    /// Stream-Safe NFD quick check on two inputs that differ only by one
    /// extra combining mark (U+031E) in the second.
    #[test]
    fn test_stream_safe_nfd() {
        let within_limit = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        let verdict = is_nfd_stream_safe_quick(within_limit.chars());
        assert_eq!(verdict, IsNormalized::Yes);

        let over_limit = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        let verdict = is_nfd_stream_safe_quick(over_limit.chars());
        assert_eq!(verdict, IsNormalized::No);
    }

    /// Stream-Safe NFC quick check on two inputs whose mark runs differ only
    /// by one extra combining mark (U+031E) in the second.
    #[test]
    fn test_stream_safe_nfc() {
        let within_limit = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        let verdict = is_nfc_stream_safe_quick(within_limit.chars());
        assert_eq!(verdict, IsNormalized::Maybe);

        let over_limit = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        let verdict = is_nfc_stream_safe_quick(over_limit.chars());
        assert_eq!(verdict, IsNormalized::No);
    }
}