/src/unicode-normalization/src/lib.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | //! Unicode character composition and decomposition utilities |
12 | | //! as described in |
13 | | //! [Unicode Standard Annex #15](https://www.unicode.org/reports/tr15/). |
14 | | //! |
15 | | //! ```rust |
16 | | //! extern crate unicode_normalization; |
17 | | //! |
18 | | //! use unicode_normalization::char::compose; |
19 | | //! use unicode_normalization::UnicodeNormalization; |
20 | | //! |
21 | | //! fn main() { |
22 | | //! assert_eq!(compose('A','\u{30a}'), Some('Å')); |
23 | | //! |
24 | | //! let s = "ÅΩ"; |
25 | | //! let c = s.nfc().collect::<String>(); |
26 | | //! assert_eq!(c, "ÅΩ"); |
27 | | //! } |
28 | | //! ``` |
29 | | //! |
30 | | //! # crates.io |
31 | | //! |
32 | | //! You can use this package in your project by adding the following |
33 | | //! to your `Cargo.toml`: |
34 | | //! |
35 | | //! ```toml |
36 | | //! [dependencies] |
37 | | //! unicode-normalization = "0.1.20" |
38 | | //! ``` |
39 | | |
40 | | #![deny(missing_docs, unsafe_code)] |
41 | | #![doc( |
42 | | html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", |
43 | | html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" |
44 | | )] |
45 | | #![cfg_attr(not(feature = "std"), no_std)] |
46 | | |
47 | | #[cfg(not(feature = "std"))] |
48 | | extern crate alloc; |
49 | | |
50 | | #[cfg(feature = "std")] |
51 | | extern crate core; |
52 | | |
53 | | extern crate tinyvec; |
54 | | |
55 | | pub use crate::decompose::Decompositions; |
56 | | pub use crate::quick_check::{ |
57 | | is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick, |
58 | | is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick, |
59 | | IsNormalized, |
60 | | }; |
61 | | pub use crate::recompose::Recompositions; |
62 | | pub use crate::replace::Replacements; |
63 | | pub use crate::stream_safe::StreamSafe; |
64 | | pub use crate::tables::UNICODE_VERSION; |
65 | | use core::{option, str::Chars}; |
66 | | |
67 | | mod decompose; |
68 | | mod lookups; |
69 | | mod normalize; |
70 | | mod perfect_hash; |
71 | | mod quick_check; |
72 | | mod recompose; |
73 | | mod replace; |
74 | | mod stream_safe; |
75 | | mod tables; |
76 | | |
77 | | #[doc(hidden)] |
78 | | pub mod __test_api; |
79 | | #[cfg(test)] |
80 | | mod test; |
81 | | |
82 | | /// Functions for composing and decomposing individual characters, plus the character property lookups re-exported from `lookups` and `tables`. |
83 | | pub mod char { |
84 | | pub use crate::normalize::{ |
85 | | compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible, |
86 | | }; |
87 | | |
88 | | pub use crate::lookups::{canonical_combining_class, is_combining_mark}; |
89 | | |
90 | | /// Returns whether the given character is assigned (`General_Category` != `Unassigned`) |
91 | | /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version |
92 | | /// of Unicode. |
93 | | pub use crate::tables::is_public_assigned; |
94 | | } |
95 | | |
96 | | /// Methods for iterating over strings while applying Unicode normalizations |
97 | | /// as described in |
98 | | /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/). |
99 | | pub trait UnicodeNormalization<I: Iterator<Item = char>> { |
100 | | /// Returns an iterator over the string in Unicode Normalization Form D |
101 | | /// (canonical decomposition). |
102 | | fn nfd(self) -> Decompositions<I>; |
103 | | |
104 | | /// Returns an iterator over the string in Unicode Normalization Form KD |
105 | | /// (compatibility decomposition). |
106 | | fn nfkd(self) -> Decompositions<I>; |
107 | | |
108 | | /// An Iterator over the string in Unicode Normalization Form C |
109 | | /// (canonical decomposition followed by canonical composition). |
110 | | fn nfc(self) -> Recompositions<I>; |
111 | | |
112 | | /// An Iterator over the string in Unicode Normalization Form KC |
113 | | /// (compatibility decomposition followed by canonical composition). |
114 | | fn nfkc(self) -> Recompositions<I>; |
115 | | |
116 | | /// A transformation which replaces [CJK Compatibility Ideograph] codepoints |
117 | | /// with normal forms using [Standardized Variation Sequences]. This is not |
118 | | /// part of the canonical or compatibility decomposition algorithms, but |
119 | | /// performing it before those algorithms produces normalized output which |
120 | | /// better preserves the intent of the original text. |
121 | | /// |
122 | | /// Note that many systems today ignore variation selectors, so these |
123 | | /// may not immediately help text display as intended, but they at |
124 | | /// least preserve the information in a standardized form, giving |
125 | | /// implementations the option to recognize them. |
126 | | /// |
127 | | /// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph |
128 | | /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence |
129 | | fn cjk_compat_variants(self) -> Replacements<I>; |
130 | | |
131 | | /// An Iterator over the string with Conjoining Grapheme Joiner characters |
132 | | /// inserted according to the Stream-Safe Text Process ([UAX15-D4]). |
133 | | /// |
134 | | /// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4 |
135 | | fn stream_safe(self) -> StreamSafe<I>; |
136 | | } |
137 | | |
/// Normalization entry points for string slices: every method below passes
/// `self.chars()` to the matching adaptor constructor, so the returned adaptor
/// is parameterized over `Chars<'a>` and tied to the lifetime of the input slice.
138 | | impl<'a> UnicodeNormalization<Chars<'a>> for &'a str { |
139 | | #[inline] |
140 | 146 | fn nfd(self) -> Decompositions<Chars<'a>> { |
141 | 146 | Decompositions::new_canonical(self.chars()) |
142 | 146 | } Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfd <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfd Line | Count | Source | 140 | 146 | fn nfd(self) -> Decompositions<Chars<'a>> { | 141 | 146 | Decompositions::new_canonical(self.chars()) | 142 | 146 | } |
|
143 | | |
144 | | #[inline] |
145 | 323 | fn nfkd(self) -> Decompositions<Chars<'a>> { |
146 | 323 | Decompositions::new_compatible(self.chars()) |
147 | 323 | } Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkd <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkd Line | Count | Source | 145 | 323 | fn nfkd(self) -> Decompositions<Chars<'a>> { | 146 | 323 | Decompositions::new_compatible(self.chars()) | 147 | 323 | } |
|
148 | | |
149 | | #[inline] |
150 | 922 | fn nfc(self) -> Recompositions<Chars<'a>> { |
151 | 922 | Recompositions::new_canonical(self.chars()) |
152 | 922 | } Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfc <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfc Line | Count | Source | 150 | 922 | fn nfc(self) -> Recompositions<Chars<'a>> { | 151 | 922 | Recompositions::new_canonical(self.chars()) | 152 | 922 | } |
|
153 | | |
154 | | #[inline] |
155 | 1.44k | fn nfkc(self) -> Recompositions<Chars<'a>> { |
156 | 1.44k | Recompositions::new_compatible(self.chars()) |
157 | 1.44k | } Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkc <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkc Line | Count | Source | 155 | 1.44k | fn nfkc(self) -> Recompositions<Chars<'a>> { | 156 | 1.44k | Recompositions::new_compatible(self.chars()) | 157 | 1.44k | } |
|
158 | | |
159 | | #[inline] |
160 | 0 | fn cjk_compat_variants(self) -> Replacements<Chars<'a>> { |
161 | 0 | Replacements::new_cjk_compat_variants(self.chars()) |
162 | 0 | } |
163 | | |
164 | | #[inline] |
165 | 273 | fn stream_safe(self) -> StreamSafe<Chars<'a>> { |
166 | 273 | StreamSafe::new(self.chars()) |
167 | 273 | } Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe Line | Count | Source | 165 | 273 | fn stream_safe(self) -> StreamSafe<Chars<'a>> { | 166 | 273 | StreamSafe::new(self.chars()) | 167 | 273 | } |
|
168 | | } |
169 | | |
/// Normalization entry points for a single `char`: every method below wraps
/// the character in a one-element iterator (`Some(self).into_iter()`) and
/// passes it to the matching adaptor constructor.
170 | | impl UnicodeNormalization<option::IntoIter<char>> for char { |
171 | | #[inline] |
172 | 0 | fn nfd(self) -> Decompositions<option::IntoIter<char>> { |
173 | 0 | Decompositions::new_canonical(Some(self).into_iter()) |
174 | 0 | } |
175 | | |
176 | | #[inline] |
177 | 0 | fn nfkd(self) -> Decompositions<option::IntoIter<char>> { |
178 | 0 | Decompositions::new_compatible(Some(self).into_iter()) |
179 | 0 | } |
180 | | |
181 | | #[inline] |
182 | 0 | fn nfc(self) -> Recompositions<option::IntoIter<char>> { |
183 | 0 | Recompositions::new_canonical(Some(self).into_iter()) |
184 | 0 | } |
185 | | |
186 | | #[inline] |
187 | 0 | fn nfkc(self) -> Recompositions<option::IntoIter<char>> { |
188 | 0 | Recompositions::new_compatible(Some(self).into_iter()) |
189 | 0 | } |
190 | | |
191 | | #[inline] |
192 | 0 | fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> { |
193 | 0 | Replacements::new_cjk_compat_variants(Some(self).into_iter()) |
194 | 0 | } |
195 | | |
196 | | #[inline] |
197 | 0 | fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> { |
198 | 0 | StreamSafe::new(Some(self).into_iter()) |
199 | 0 | } |
200 | | } |
201 | | |
/// Blanket implementation for every iterator yielding `char`: each method
/// consumes `self` and passes it unchanged to the matching adaptor constructor.
/// The instantiations recorded below include `StreamSafe<Chars>`, i.e. an
/// adaptor returned by one method being normalized again by another.
202 | | impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I { |
203 | | #[inline] |
204 | 10.8k | fn nfd(self) -> Decompositions<I> { |
205 | 10.8k | Decompositions::new_canonical(self) |
206 | 10.8k | } <core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfd Line | Count | Source | 204 | 5.40k | fn nfd(self) -> Decompositions<I> { | 205 | 5.40k | Decompositions::new_canonical(self) | 206 | 5.40k | } |
<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars> as unicode_normalization::UnicodeNormalization<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>>>::nfd Line | Count | Source | 204 | 5.40k | fn nfd(self) -> Decompositions<I> { | 205 | 5.40k | Decompositions::new_canonical(self) | 206 | 5.40k | } |
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfd |
207 | | |
208 | | #[inline] |
209 | 5.40k | fn nfkd(self) -> Decompositions<I> { |
210 | 5.40k | Decompositions::new_compatible(self) |
211 | 5.40k | } <core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkd Line | Count | Source | 209 | 5.40k | fn nfkd(self) -> Decompositions<I> { | 210 | 5.40k | Decompositions::new_compatible(self) | 211 | 5.40k | } |
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfkd |
212 | | |
213 | | #[inline] |
214 | 22.4k | fn nfc(self) -> Recompositions<I> { |
215 | 22.4k | Recompositions::new_canonical(self) |
216 | 22.4k | } <core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfc Line | Count | Source | 214 | 8.94k | fn nfc(self) -> Recompositions<I> { | 215 | 8.94k | Recompositions::new_canonical(self) | 216 | 8.94k | } |
<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars> as unicode_normalization::UnicodeNormalization<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>>>::nfc Line | Count | Source | 214 | 11.1k | fn nfc(self) -> Recompositions<I> { | 215 | 11.1k | Recompositions::new_canonical(self) | 216 | 11.1k | } |
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfc <streaming::Counter as unicode_normalization::UnicodeNormalization<streaming::Counter>>::nfc Line | Count | Source | 214 | 2.36k | fn nfc(self) -> Recompositions<I> { | 215 | 2.36k | Recompositions::new_canonical(self) | 216 | 2.36k | } |
|
217 | | |
218 | | #[inline] |
219 | 8.78k | fn nfkc(self) -> Recompositions<I> { |
220 | 8.78k | Recompositions::new_compatible(self) |
221 | 8.78k | } <core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkc Line | Count | Source | 219 | 8.78k | fn nfkc(self) -> Recompositions<I> { | 220 | 8.78k | Recompositions::new_compatible(self) | 221 | 8.78k | } |
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfkc |
222 | | |
223 | | #[inline] |
224 | 0 | fn cjk_compat_variants(self) -> Replacements<I> { |
225 | 0 | Replacements::new_cjk_compat_variants(self) |
226 | 0 | } |
227 | | |
228 | | #[inline] |
229 | 29.7k | fn stream_safe(self) -> StreamSafe<I> { |
230 | 29.7k | StreamSafe::new(self) |
231 | 29.7k | } <core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe Line | Count | Source | 229 | 27.3k | fn stream_safe(self) -> StreamSafe<I> { | 230 | 27.3k | StreamSafe::new(self) | 231 | 27.3k | } |
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::stream_safe <core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe Line | Count | Source | 229 | 2.36k | fn stream_safe(self) -> StreamSafe<I> { | 230 | 2.36k | StreamSafe::new(self) | 231 | 2.36k | } |
|
232 | | } |