Coverage Report

Created: 2025-08-29 06:50

/src/unicode-normalization/src/lib.rs
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2
// file at the top-level directory of this distribution and at
3
// http://rust-lang.org/COPYRIGHT.
4
//
5
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8
// option. This file may not be copied, modified, or distributed
9
// except according to those terms.
10
11
//! Unicode character composition and decomposition utilities
12
//! as described in
13
//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
14
//!
15
//! ```rust
16
//! extern crate unicode_normalization;
17
//!
18
//! use unicode_normalization::char::compose;
19
//! use unicode_normalization::UnicodeNormalization;
20
//!
21
//! fn main() {
22
//!     assert_eq!(compose('A','\u{30a}'), Some('Å'));
23
//!
24
//!     let s = "ÅΩ";
25
//!     let c = s.nfc().collect::<String>();
26
//!     assert_eq!(c, "ÅΩ");
27
//! }
28
//! ```
29
//!
30
//! # crates.io
31
//!
32
//! You can use this package in your project by adding the following
33
//! to your `Cargo.toml`:
34
//!
35
//! ```toml
36
//! [dependencies]
37
//! unicode-normalization = "0.1.20"
38
//! ```
39
40
#![deny(missing_docs, unsafe_code)]
41
#![doc(
42
    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
43
    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
44
)]
45
#![cfg_attr(not(feature = "std"), no_std)]
46
47
#[cfg(not(feature = "std"))]
48
extern crate alloc;
49
50
#[cfg(feature = "std")]
51
extern crate core;
52
53
extern crate tinyvec;
54
55
pub use crate::decompose::Decompositions;
56
pub use crate::quick_check::{
57
    is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
58
    is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
59
    IsNormalized,
60
};
61
pub use crate::recompose::Recompositions;
62
pub use crate::replace::Replacements;
63
pub use crate::stream_safe::StreamSafe;
64
pub use crate::tables::UNICODE_VERSION;
65
use core::{option, str::Chars};
66
67
mod decompose;
68
mod lookups;
69
mod normalize;
70
mod perfect_hash;
71
mod quick_check;
72
mod recompose;
73
mod replace;
74
mod stream_safe;
75
mod tables;
76
77
#[doc(hidden)]
78
pub mod __test_api;
79
#[cfg(test)]
80
mod test;
81
82
/// Methods for composing and decomposing characters.
83
pub mod char {
84
    pub use crate::normalize::{
85
        compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
86
    };
87
88
    pub use crate::lookups::{canonical_combining_class, is_combining_mark};
89
90
    /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
91
    /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
92
    /// of Unicode.
93
    pub use crate::tables::is_public_assigned;
94
}
95
96
/// Methods for iterating over strings while applying Unicode normalizations
97
/// as described in
98
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
99
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
100
    /// Returns an iterator over the string in Unicode Normalization Form D
101
    /// (canonical decomposition).
102
    fn nfd(self) -> Decompositions<I>;
103
104
    /// Returns an iterator over the string in Unicode Normalization Form KD
105
    /// (compatibility decomposition).
106
    fn nfkd(self) -> Decompositions<I>;
107
108
    /// An Iterator over the string in Unicode Normalization Form C
109
    /// (canonical decomposition followed by canonical composition).
110
    fn nfc(self) -> Recompositions<I>;
111
112
    /// An Iterator over the string in Unicode Normalization Form KC
113
    /// (compatibility decomposition followed by canonical composition).
114
    fn nfkc(self) -> Recompositions<I>;
115
116
    /// A transformation which replaces [CJK Compatibility Ideograph] codepoints
117
    /// with normal forms using [Standardized Variation Sequences]. This is not
118
    /// part of the canonical or compatibility decomposition algorithms, but
119
    /// performing it before those algorithms produces normalized output which
120
    /// better preserves the intent of the original text.
121
    ///
122
    /// Note that many systems today ignore variation selectors, so these
123
    /// may not immediately help text display as intended, but they at
124
    /// least preserve the information in a standardized form, giving
125
    /// implementations the option to recognize them.
126
    ///
127
    /// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
128
    /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
129
    fn cjk_compat_variants(self) -> Replacements<I>;
130
131
    /// An Iterator over the string with Conjoining Grapheme Joiner characters
132
    /// inserted according to the Stream-Safe Text Process ([UAX15-D4]).
133
    ///
134
    /// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
135
    fn stream_safe(self) -> StreamSafe<I>;
136
}
137
138
impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
139
    #[inline]
140
146
    fn nfd(self) -> Decompositions<Chars<'a>> {
141
146
        Decompositions::new_canonical(self.chars())
142
146
    }
Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfd
<&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfd
Line
Count
Source
140
146
    fn nfd(self) -> Decompositions<Chars<'a>> {
141
146
        Decompositions::new_canonical(self.chars())
142
146
    }
143
144
    #[inline]
145
323
    fn nfkd(self) -> Decompositions<Chars<'a>> {
146
323
        Decompositions::new_compatible(self.chars())
147
323
    }
Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkd
<&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkd
Line
Count
Source
145
323
    fn nfkd(self) -> Decompositions<Chars<'a>> {
146
323
        Decompositions::new_compatible(self.chars())
147
323
    }
148
149
    #[inline]
150
922
    fn nfc(self) -> Recompositions<Chars<'a>> {
151
922
        Recompositions::new_canonical(self.chars())
152
922
    }
Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfc
<&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfc
Line
Count
Source
150
922
    fn nfc(self) -> Recompositions<Chars<'a>> {
151
922
        Recompositions::new_canonical(self.chars())
152
922
    }
153
154
    #[inline]
155
1.44k
    fn nfkc(self) -> Recompositions<Chars<'a>> {
156
1.44k
        Recompositions::new_compatible(self.chars())
157
1.44k
    }
Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkc
<&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkc
Line
Count
Source
155
1.44k
    fn nfkc(self) -> Recompositions<Chars<'a>> {
156
1.44k
        Recompositions::new_compatible(self.chars())
157
1.44k
    }
158
159
    #[inline]
160
0
    fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
161
0
        Replacements::new_cjk_compat_variants(self.chars())
162
0
    }
163
164
    #[inline]
165
273
    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
166
273
        StreamSafe::new(self.chars())
167
273
    }
Unexecuted instantiation: <&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe
<&str as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe
Line
Count
Source
165
273
    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
166
273
        StreamSafe::new(self.chars())
167
273
    }
168
}
169
170
impl UnicodeNormalization<option::IntoIter<char>> for char {
171
    #[inline]
172
0
    fn nfd(self) -> Decompositions<option::IntoIter<char>> {
173
0
        Decompositions::new_canonical(Some(self).into_iter())
174
0
    }
175
176
    #[inline]
177
0
    fn nfkd(self) -> Decompositions<option::IntoIter<char>> {
178
0
        Decompositions::new_compatible(Some(self).into_iter())
179
0
    }
180
181
    #[inline]
182
0
    fn nfc(self) -> Recompositions<option::IntoIter<char>> {
183
0
        Recompositions::new_canonical(Some(self).into_iter())
184
0
    }
185
186
    #[inline]
187
0
    fn nfkc(self) -> Recompositions<option::IntoIter<char>> {
188
0
        Recompositions::new_compatible(Some(self).into_iter())
189
0
    }
190
191
    #[inline]
192
0
    fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
193
0
        Replacements::new_cjk_compat_variants(Some(self).into_iter())
194
0
    }
195
196
    #[inline]
197
0
    fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
198
0
        StreamSafe::new(Some(self).into_iter())
199
0
    }
200
}
201
202
impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
203
    #[inline]
204
10.8k
    fn nfd(self) -> Decompositions<I> {
205
10.8k
        Decompositions::new_canonical(self)
206
10.8k
    }
<core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfd
Line
Count
Source
204
5.40k
    fn nfd(self) -> Decompositions<I> {
205
5.40k
        Decompositions::new_canonical(self)
206
5.40k
    }
<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars> as unicode_normalization::UnicodeNormalization<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>>>::nfd
Line
Count
Source
204
5.40k
    fn nfd(self) -> Decompositions<I> {
205
5.40k
        Decompositions::new_canonical(self)
206
5.40k
    }
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfd
207
208
    #[inline]
209
5.40k
    fn nfkd(self) -> Decompositions<I> {
210
5.40k
        Decompositions::new_compatible(self)
211
5.40k
    }
<core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkd
Line
Count
Source
209
5.40k
    fn nfkd(self) -> Decompositions<I> {
210
5.40k
        Decompositions::new_compatible(self)
211
5.40k
    }
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfkd
212
213
    #[inline]
214
22.4k
    fn nfc(self) -> Recompositions<I> {
215
22.4k
        Recompositions::new_canonical(self)
216
22.4k
    }
<core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfc
Line
Count
Source
214
8.94k
    fn nfc(self) -> Recompositions<I> {
215
8.94k
        Recompositions::new_canonical(self)
216
8.94k
    }
<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars> as unicode_normalization::UnicodeNormalization<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>>>::nfc
Line
Count
Source
214
11.1k
    fn nfc(self) -> Recompositions<I> {
215
11.1k
        Recompositions::new_canonical(self)
216
11.1k
    }
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfc
<streaming::Counter as unicode_normalization::UnicodeNormalization<streaming::Counter>>::nfc
Line
Count
Source
214
2.36k
    fn nfc(self) -> Recompositions<I> {
215
2.36k
        Recompositions::new_canonical(self)
216
2.36k
    }
217
218
    #[inline]
219
8.78k
    fn nfkc(self) -> Recompositions<I> {
220
8.78k
        Recompositions::new_compatible(self)
221
8.78k
    }
<core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::nfkc
Line
Count
Source
219
8.78k
    fn nfkc(self) -> Recompositions<I> {
220
8.78k
        Recompositions::new_compatible(self)
221
8.78k
    }
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::nfkc
222
223
    #[inline]
224
0
    fn cjk_compat_variants(self) -> Replacements<I> {
225
0
        Replacements::new_cjk_compat_variants(self)
226
0
    }
227
228
    #[inline]
229
29.7k
    fn stream_safe(self) -> StreamSafe<I> {
230
29.7k
        StreamSafe::new(self)
231
29.7k
    }
<core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe
Line
Count
Source
229
27.3k
    fn stream_safe(self) -> StreamSafe<I> {
230
27.3k
        StreamSafe::new(self)
231
27.3k
    }
Unexecuted instantiation: <_ as unicode_normalization::UnicodeNormalization<_>>::stream_safe
<core::str::iter::Chars as unicode_normalization::UnicodeNormalization<core::str::iter::Chars>>::stream_safe
Line
Count
Source
229
2.36k
    fn stream_safe(self) -> StreamSafe<I> {
230
2.36k
        StreamSafe::new(self)
231
2.36k
    }
232
}