/src/unicode-normalization/src/normalize.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | //! Functions for computing canonical and compatible decompositions for Unicode characters. |
12 | | use crate::lookups::{ |
13 | | canonical_fully_decomposed, cjk_compat_variants_fully_decomposed, |
14 | | compatibility_fully_decomposed, composition_table, |
15 | | }; |
16 | | |
17 | | use core::char; |
18 | | |
19 | | /// Compute canonical Unicode decomposition for character. |
20 | | /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/) |
21 | | /// for more information. |
22 | | #[inline] |
23 | 104M | pub fn decompose_canonical<F>(c: char, emit_char: F) |
24 | 104M | where |
25 | 104M | F: FnMut(char), |
26 | 104M | { |
27 | 104M | decompose(c, canonical_fully_decomposed, emit_char) |
28 | 104M | } unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 45.5M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 45.5M | where | 25 | 45.5M | F: FnMut(char), | 26 | 45.5M | { | 27 | 45.5M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 45.5M | } |
unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 31.0M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 31.0M | where | 25 | 31.0M | F: FnMut(char), | 26 | 31.0M | { | 27 | 31.0M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 31.0M | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_canonical::<_> unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 10.0M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 10.0M | where | 25 | 10.0M | F: FnMut(char), | 26 | 10.0M | { | 27 | 10.0M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 10.0M | } |
unicode_normalization::normalize::decompose_canonical::<process::rust_fuzzer_test_input::{closure#0}> Line | Count | Source | 23 | 33 | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 33 | where | 25 | 33 | F: FnMut(char), | 26 | 33 | { | 27 | 33 | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 33 | } |
unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 17.3M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 17.3M | where | 25 | 17.3M | F: FnMut(char), | 26 | 17.3M | { | 27 | 17.3M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 17.3M | } |
|
29 | | |
30 | | /// Compute canonical or compatible Unicode decomposition for character. |
31 | | /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/) |
32 | | /// for more information. |
33 | | #[inline] |
34 | 62.1M | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { |
35 | 62.1M | let decompose_char = |
36 | 51.2M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} Line | Count | Source | 36 | 31.5M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<_>::{closure#0} Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} unicode_normalization::normalize::decompose_compatible::<process::rust_fuzzer_test_input::{closure#1}>::{closure#0} Line | Count | Source | 36 | 52 | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} Line | Count | Source | 36 | 19.7M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} Line | Count | Source | 36 | 30.3M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<_>::{closure#0}::{closure#0} Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} unicode_normalization::normalize::decompose_compatible::<process::rust_fuzzer_test_input::{closure#1}>::{closure#0}::{closure#0} Line | Count | Source | 36 | 51 | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} Line | Count | Source | 36 | 17.8M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
|
37 | 62.1M | decompose(c, decompose_char, emit_char) |
38 | 62.1M | } Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 34 | 39.6M | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { | 35 | 39.6M | let decompose_char = | 36 | | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); | 37 | 39.6M | decompose(c, decompose_char, emit_char) | 38 | 39.6M | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<_> Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_compatible::<process::rust_fuzzer_test_input::{closure#1}> Line | Count | Source | 34 | 72 | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { | 35 | 72 | let decompose_char = | 36 | | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); | 37 | 72 | decompose(c, decompose_char, emit_char) | 38 | 72 | } |
unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 34 | 22.4M | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { | 35 | 22.4M | let decompose_char = | 36 | | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); | 37 | 22.4M | decompose(c, decompose_char, emit_char) | 38 | 22.4M | } |
|
39 | | |
40 | | /// Compute standard-variation decomposition for character. |
41 | | /// |
42 | | /// [Standardized Variation Sequences] are used instead of the standard canonical |
43 | | /// decompositions, notably for CJK codepoints with singleton canonical decompositions, |
44 | | /// to avoid losing information. See the [Unicode Variation Sequence FAQ] and the |
45 | | /// "Other Enhancements" section of the [Unicode 6.3 Release Summary] for more information. |
46 | | /// |
47 | | /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence |
48 | | /// [Unicode Variation Sequence FAQ]: http://unicode.org/faq/vs.html |
49 | | /// [Unicode 6.3 Release Summary]: https://www.unicode.org/versions/Unicode6.3.0/#Summary |
50 | | #[inline] |
51 | 0 | pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F) |
52 | 0 | where |
53 | 0 | F: FnMut(char), |
54 | 0 | { |
55 | 0 | // 7-bit ASCII never decomposes |
56 | 0 | if c <= '\x7f' { |
57 | 0 | emit_char(c); |
58 | 0 | return; |
59 | 0 | } |
60 | | |
61 | | // Don't perform decomposition for Hangul |
62 | | |
63 | 0 | if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) { |
64 | 0 | for &d in decomposed { |
65 | 0 | emit_char(d); |
66 | 0 | } |
67 | 0 | return; |
68 | 0 | } |
69 | 0 |
|
70 | 0 | // Finally bottom out. |
71 | 0 | emit_char(c); |
72 | 0 | } |
73 | | |
74 | | #[inline] |
75 | | #[allow(unsafe_code)] |
76 | 166M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) |
77 | 166M | where |
78 | 166M | D: Fn(char) -> Option<&'static [char]>, |
79 | 166M | F: FnMut(char), |
80 | 166M | { |
81 | 166M | // 7-bit ASCII never decomposes |
82 | 166M | if c <= '\x7f' { |
83 | 36.0M | emit_char(c); |
84 | 36.0M | return; |
85 | 130M | } |
86 | 130M | |
87 | 130M | // Perform decomposition for Hangul |
88 | 130M | if is_hangul_syllable(c) { |
89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above |
90 | 144k | unsafe { |
91 | 144k | decompose_hangul(c, emit_char); |
92 | 144k | } |
93 | 144k | return; |
94 | 129M | } |
95 | | |
96 | 129M | if let Some(decomposed) = decompose_char(c) { |
97 | 264M | for &d in decomposed { |
98 | 192M | emit_char(d); |
99 | 192M | } |
100 | 71.6M | return; |
101 | 58.3M | } |
102 | 58.3M | |
103 | 58.3M | // Finally bottom out. |
104 | 58.3M | emit_char(c); |
105 | 166M | } Unexecuted instantiation: unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 76 | 39.6M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 39.6M | where | 78 | 39.6M | D: Fn(char) -> Option<&'static [char]>, | 79 | 39.6M | F: FnMut(char), | 80 | 39.6M | { | 81 | 39.6M | // 7-bit ASCII never decomposes | 82 | 39.6M | if c <= '\x7f' { | 83 | 8.02M | emit_char(c); | 84 | 8.02M | return; | 85 | 31.6M | } | 86 | 31.6M | | 87 | 31.6M | // Perform decomposition for Hangul | 88 | 31.6M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 111k | unsafe { | 91 | 111k | decompose_hangul(c, emit_char); | 92 | 111k | } | 93 | 111k | return; | 94 | 31.5M | } | 95 | | | 96 | 31.5M | if let Some(decomposed) = decompose_char(c) { | 97 | 56.8M | for &d in decomposed { | 98 | 44.0M | emit_char(d); | 99 | 44.0M | } | 100 | 12.8M | return; | 101 | 18.6M | } | 102 | 18.6M | | 103 | 18.6M | // Finally bottom out. | 104 | 18.6M | emit_char(c); | 105 | 39.6M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 45.5M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 45.5M | where | 78 | 45.5M | D: Fn(char) -> Option<&'static [char]>, | 79 | 45.5M | F: FnMut(char), | 80 | 45.5M | { | 81 | 45.5M | // 7-bit ASCII never decomposes | 82 | 45.5M | if c <= '\x7f' { | 83 | 7.69M | emit_char(c); | 84 | 7.69M | return; | 85 | 37.8M | } | 86 | 37.8M | | 87 | 37.8M | // Perform decomposition for Hangul | 88 | 37.8M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 7.67k | unsafe { | 91 | 7.67k | decompose_hangul(c, emit_char); | 92 | 7.67k | } | 93 | 7.67k | return; | 94 | 37.8M | } | 95 | | | 96 | 37.8M | if let Some(decomposed) = decompose_char(c) { | 97 | 36.8M | for &d in decomposed { | 98 | 24.9M | emit_char(d); | 99 | 24.9M | } | 100 | 11.9M | return; | 101 | 25.8M | } | 102 | 25.8M | | 103 | 25.8M | // Finally bottom out. | 104 | 25.8M | emit_char(c); | 105 | 45.5M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 31.0M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 31.0M | where | 78 | 31.0M | D: Fn(char) -> Option<&'static [char]>, | 79 | 31.0M | F: FnMut(char), | 80 | 31.0M | { | 81 | 31.0M | // 7-bit ASCII never decomposes | 82 | 31.0M | if c <= '\x7f' { | 83 | 6.43M | emit_char(c); | 84 | 6.43M | return; | 85 | 24.6M | } | 86 | 24.6M | | 87 | 24.6M | // Perform decomposition for Hangul | 88 | 24.6M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 7.25k | unsafe { | 91 | 7.25k | decompose_hangul(c, emit_char); | 92 | 7.25k | } | 93 | 7.25k | return; | 94 | 24.6M | } | 95 | | | 96 | 24.6M | if let Some(decomposed) = decompose_char(c) { | 97 | 35.8M | for &d in decomposed { | 98 | 24.1M | emit_char(d); | 99 | 24.1M | } | 100 | 11.6M | return; | 101 | 12.9M | } | 102 | 12.9M | | 103 | 12.9M | // Finally bottom out. | 104 | 12.9M | emit_char(c); | 105 | 31.0M | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose::<_, _> Unexecuted instantiation: unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 10.0M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 10.0M | where | 78 | 10.0M | D: Fn(char) -> Option<&'static [char]>, | 79 | 10.0M | F: FnMut(char), | 80 | 10.0M | { | 81 | 10.0M | // 7-bit ASCII never decomposes | 82 | 10.0M | if c <= '\x7f' { | 83 | 7.41M | emit_char(c); | 84 | 7.41M | return; | 85 | 2.64M | } | 86 | 2.64M | | 87 | 2.64M | // Perform decomposition for Hangul | 88 | 2.64M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 1.35k | unsafe { | 91 | 1.35k | decompose_hangul(c, emit_char); | 92 | 1.35k | } | 93 | 1.35k | return; | 94 | 2.64M | } | 95 | | | 96 | 2.64M | if let Some(decomposed) = decompose_char(c) { | 97 | 7.26M | for &d in decomposed { | 98 | 4.84M | emit_char(d); | 99 | 4.84M | } | 100 | 2.42M | return; | 101 | 223k | } | 102 | 223k | | 103 | 223k | // Finally bottom out. | 104 | 223k | emit_char(c); | 105 | 10.0M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<process::rust_fuzzer_test_input::{closure#1}>::{closure#0}, process::rust_fuzzer_test_input::{closure#1}> Line | Count | Source | 76 | 72 | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 72 | where | 78 | 72 | D: Fn(char) -> Option<&'static [char]>, | 79 | 72 | F: FnMut(char), | 80 | 72 | { | 81 | 72 | // 7-bit ASCII never decomposes | 82 | 72 | if c <= '\x7f' { | 83 | 7 | emit_char(c); | 84 | 7 | return; | 85 | 65 | } | 86 | 65 | | 87 | 65 | // Perform decomposition for Hangul | 88 | 65 | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 13 | unsafe { | 91 | 13 | decompose_hangul(c, emit_char); | 92 | 13 | } | 93 | 13 | return; | 94 | 52 | } | 95 | | | 96 | 52 | if let Some(decomposed) = decompose_char(c) { | 97 | 3 | for &d in decomposed { | 98 | 2 | emit_char(d); | 99 | 2 | } | 100 | 1 | return; | 101 | 51 | } | 102 | 51 | | 103 | 51 | // Finally bottom out. | 104 | 51 | emit_char(c); | 105 | 72 | } |
unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 76 | 22.4M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 22.4M | where | 78 | 22.4M | D: Fn(char) -> Option<&'static [char]>, | 79 | 22.4M | F: FnMut(char), | 80 | 22.4M | { | 81 | 22.4M | // 7-bit ASCII never decomposes | 82 | 22.4M | if c <= '\x7f' { | 83 | 2.72M | emit_char(c); | 84 | 2.72M | return; | 85 | 19.7M | } | 86 | 19.7M | | 87 | 19.7M | // Perform decomposition for Hangul | 88 | 19.7M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 743 | unsafe { | 91 | 743 | decompose_hangul(c, emit_char); | 92 | 743 | } | 93 | 743 | return; | 94 | 19.7M | } | 95 | | | 96 | 19.7M | if let Some(decomposed) = decompose_char(c) { | 97 | 88.1M | for &d in decomposed { | 98 | 68.5M | emit_char(d); | 99 | 68.5M | } | 100 | 19.5M | return; | 101 | 153k | } | 102 | 153k | | 103 | 153k | // Finally bottom out. | 104 | 153k | emit_char(c); | 105 | 22.4M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, process::rust_fuzzer_test_input::{closure#0}> Line | Count | Source | 76 | 33 | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 33 | where | 78 | 33 | D: Fn(char) -> Option<&'static [char]>, | 79 | 33 | F: FnMut(char), | 80 | 33 | { | 81 | 33 | // 7-bit ASCII never decomposes | 82 | 33 | if c <= '\x7f' { | 83 | 5 | emit_char(c); | 84 | 5 | return; | 85 | 28 | } | 86 | 28 | | 87 | 28 | // Perform decomposition for Hangul | 88 | 28 | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 1 | unsafe { | 91 | 1 | decompose_hangul(c, emit_char); | 92 | 1 | } | 93 | 1 | return; | 94 | 27 | } | 95 | | | 96 | 27 | if let Some(decomposed) = decompose_char(c) { | 97 | 5 | for &d in decomposed { | 98 | 3 | emit_char(d); | 99 | 3 | } | 100 | 2 | return; | 101 | 25 | } | 102 | 25 | | 103 | 25 | // Finally bottom out. | 104 | 25 | emit_char(c); | 105 | 33 | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 17.3M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 17.3M | where | 78 | 17.3M | D: Fn(char) -> Option<&'static [char]>, | 79 | 17.3M | F: FnMut(char), | 80 | 17.3M | { | 81 | 17.3M | // 7-bit ASCII never decomposes | 82 | 17.3M | if c <= '\x7f' { | 83 | 3.78M | emit_char(c); | 84 | 3.78M | return; | 85 | 13.5M | } | 86 | 13.5M | | 87 | 13.5M | // Perform decomposition for Hangul | 88 | 13.5M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 15.5k | unsafe { | 91 | 15.5k | decompose_hangul(c, emit_char); | 92 | 15.5k | } | 93 | 15.5k | return; | 94 | 13.5M | } | 95 | | | 96 | 13.5M | if let Some(decomposed) = decompose_char(c) { | 97 | 39.6M | for &d in decomposed { | 98 | 26.4M | emit_char(d); | 99 | 26.4M | } | 100 | 13.2M | return; | 101 | 380k | } | 102 | 380k | | 103 | 380k | // Finally bottom out. | 104 | 380k | emit_char(c); | 105 | 17.3M | } |
|
106 | | |
107 | | /// Compose two characters into a single character, if possible. |
108 | | /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/) |
109 | | /// for more information. |
110 | 68.3M | pub fn compose(a: char, b: char) -> Option<char> { |
111 | 68.3M | compose_hangul(a, b).or_else(|| composition_table(a, b)) |
112 | 68.3M | } |
113 | | |
114 | | // Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior |
115 | | // http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior |
116 | | const S_BASE: u32 = 0xAC00; |
117 | | const L_BASE: u32 = 0x1100; |
118 | | const V_BASE: u32 = 0x1161; |
119 | | const T_BASE: u32 = 0x11A7; |
120 | | const L_COUNT: u32 = 19; |
121 | | const V_COUNT: u32 = 21; |
122 | | const T_COUNT: u32 = 28; |
123 | | const N_COUNT: u32 = V_COUNT * T_COUNT; |
124 | | const S_COUNT: u32 = L_COUNT * N_COUNT; |
125 | | |
126 | | const S_LAST: u32 = S_BASE + S_COUNT - 1; |
127 | | const L_LAST: u32 = L_BASE + L_COUNT - 1; |
128 | | const V_LAST: u32 = V_BASE + V_COUNT - 1; |
129 | | const T_LAST: u32 = T_BASE + T_COUNT - 1; |
130 | | |
131 | | // Composition only occurs for `TPart`s in `U+11A8 ..= U+11C2`, |
132 | | // i.e. `T_BASE + 1 ..= T_LAST`. |
133 | | const T_FIRST: u32 = T_BASE + 1; |
134 | | |
135 | | // Safety-usable invariant: This ensures that c is a valid Hangul Syllable character (U+AC00..U+D7AF) |
136 | 295M | pub(crate) fn is_hangul_syllable(c: char) -> bool { |
137 | 295M | // Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant |
138 | 295M | (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) |
139 | 295M | } |
140 | | |
141 | | // Decompose a precomposed Hangul syllable |
142 | | // Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF |
143 | | #[allow(unsafe_code, unused_unsafe)] |
144 | | #[inline(always)] |
145 | 144k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) |
146 | 144k | where |
147 | 144k | F: FnMut(char), |
148 | 144k | { |
149 | 144k | // This will be at most 0x2baf, the size of the Hangul Syllables block |
150 | 144k | let s_index = s as u32 - S_BASE; |
151 | 144k | // This will be at most 0x2baf / (21 * 28), 19 |
152 | 144k | let l_index = s_index / N_COUNT; |
153 | 144k | unsafe { |
154 | 144k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) |
155 | 144k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); |
156 | 144k | |
157 | 144k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 |
158 | 144k | let v_index = (s_index % N_COUNT) / T_COUNT; |
159 | 144k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) |
160 | 144k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); |
161 | 144k | |
162 | 144k | // Safety: This will be at most T_COUNT - 1 (27) |
163 | 144k | let t_index = s_index % T_COUNT; |
164 | 144k | if t_index > 0 { |
165 | 89.7k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) |
166 | 89.7k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); |
167 | 89.7k | } |
168 | | } |
169 | 144k | } unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 7.67k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 7.67k | where | 147 | 7.67k | F: FnMut(char), | 148 | 7.67k | { | 149 | 7.67k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 7.67k | let s_index = s as u32 - S_BASE; | 151 | 7.67k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 7.67k | let l_index = s_index / N_COUNT; | 153 | 7.67k | unsafe { | 154 | 7.67k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 7.67k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 7.67k | | 157 | 7.67k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 7.67k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 7.67k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 7.67k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 7.67k | | 162 | 7.67k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 7.67k | let t_index = s_index % T_COUNT; | 164 | 7.67k | if t_index > 0 { | 165 | 7.00k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 7.00k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 7.00k | } | 168 | | } | 169 | 7.67k | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 7.25k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 7.25k | where | 147 | 7.25k | F: FnMut(char), | 148 | 7.25k | { | 149 | 7.25k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 7.25k | let s_index = s as u32 - S_BASE; | 151 | 7.25k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 7.25k | let l_index = s_index / N_COUNT; | 153 | 7.25k | unsafe { | 154 | 7.25k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 7.25k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 7.25k | | 157 | 7.25k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 7.25k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 7.25k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 7.25k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 7.25k | | 162 | 7.25k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 7.25k | let t_index = s_index % T_COUNT; | 164 | 7.25k | if t_index > 0 { | 165 | 6.72k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 6.72k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 6.72k | } | 168 | | } | 169 | 7.25k | } |
unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 145 | 111k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 111k | where | 147 | 111k | F: FnMut(char), | 148 | 111k | { | 149 | 111k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 111k | let s_index = s as u32 - S_BASE; | 151 | 111k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 111k | let l_index = s_index / N_COUNT; | 153 | 111k | unsafe { | 154 | 111k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 111k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 111k | | 157 | 111k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 111k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 111k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 111k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 111k | | 162 | 111k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 111k | let t_index = s_index % T_COUNT; | 164 | 111k | if t_index > 0 { | 165 | 58.9k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 58.9k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 58.9k | } | 168 | | } | 169 | 111k | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_hangul::<_> unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 1.35k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 1.35k | where | 147 | 1.35k | F: FnMut(char), | 148 | 1.35k | { | 149 | 1.35k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 1.35k | let s_index = s as u32 - S_BASE; | 151 | 1.35k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 1.35k | let l_index = s_index / N_COUNT; | 153 | 1.35k | unsafe { | 154 | 1.35k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 1.35k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 1.35k | | 157 | 1.35k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 1.35k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 1.35k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 1.35k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 1.35k | | 162 | 1.35k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 1.35k | let t_index = s_index % T_COUNT; | 164 | 1.35k | if t_index > 0 { | 165 | 1.15k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 1.15k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 1.15k | } | 168 | | } | 169 | 1.35k | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_hangul::<process::rust_fuzzer_test_input::{closure#0}> Line | Count | Source | 145 | 1 | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 1 | where | 147 | 1 | F: FnMut(char), | 148 | 1 | { | 149 | 1 | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 1 | let s_index = s as u32 - S_BASE; | 151 | 1 | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 1 | let l_index = s_index / N_COUNT; | 153 | 1 | unsafe { | 154 | 1 | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 1 | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 1 | | 157 | 1 | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 1 | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 1 | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 1 | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 1 | | 162 | 1 | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 1 | let t_index = s_index % T_COUNT; | 164 | 1 | if t_index > 0 { | 165 | 1 | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 1 | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 1 | } | 168 | | } | 169 | 1 | } |
unicode_normalization::normalize::decompose_hangul::<process::rust_fuzzer_test_input::{closure#1}> Line | Count | Source | 145 | 13 | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 13 | where | 147 | 13 | F: FnMut(char), | 148 | 13 | { | 149 | 13 | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 13 | let s_index = s as u32 - S_BASE; | 151 | 13 | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 13 | let l_index = s_index / N_COUNT; | 153 | 13 | unsafe { | 154 | 13 | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 13 | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 13 | | 157 | 13 | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 13 | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 13 | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 13 | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 13 | | 162 | 13 | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 13 | let t_index = s_index % T_COUNT; | 164 | 13 | if t_index > 0 { | 165 | 12 | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 12 | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 12 | } | 168 | | } | 169 | 13 | } |
unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 15.5k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 15.5k | where | 147 | 15.5k | F: FnMut(char), | 148 | 15.5k | { | 149 | 15.5k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 15.5k | let s_index = s as u32 - S_BASE; | 151 | 15.5k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 15.5k | let l_index = s_index / N_COUNT; | 153 | 15.5k | unsafe { | 154 | 15.5k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 15.5k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 15.5k | | 157 | 15.5k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 15.5k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 15.5k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 15.5k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 15.5k | | 162 | 15.5k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 15.5k | let t_index = s_index % T_COUNT; | 164 | 15.5k | if t_index > 0 { | 165 | 15.3k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 15.3k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 15.3k | } | 168 | | } | 169 | 15.5k | } |
unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 145 | 743 | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 743 | where | 147 | 743 | F: FnMut(char), | 148 | 743 | { | 149 | 743 | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 743 | let s_index = s as u32 - S_BASE; | 151 | 743 | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 743 | let l_index = s_index / N_COUNT; | 153 | 743 | unsafe { | 154 | 743 | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 743 | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 743 | | 157 | 743 | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 743 | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 743 | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 743 | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 743 | | 162 | 743 | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 743 | let t_index = s_index % T_COUNT; | 164 | 743 | if t_index > 0 { | 165 | 548 | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 548 | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 548 | } | 168 | | } | 169 | 743 | } |
|
170 | | |
171 | | #[inline] |
172 | 31.9k | pub(crate) fn hangul_decomposition_length(s: char) -> usize { |
173 | 31.9k | let si = s as u32 - S_BASE; |
174 | 31.9k | let ti = si % T_COUNT; |
175 | 31.9k | if ti > 0 { |
176 | 29.9k | 3 |
177 | | } else { |
178 | 2.03k | 2 |
179 | | } |
180 | 31.9k | } unicode_normalization::normalize::hangul_decomposition_length Line | Count | Source | 172 | 19.4k | pub(crate) fn hangul_decomposition_length(s: char) -> usize { | 173 | 19.4k | let si = s as u32 - S_BASE; | 174 | 19.4k | let ti = si % T_COUNT; | 175 | 19.4k | if ti > 0 { | 176 | 18.6k | 3 | 177 | | } else { | 178 | 856 | 2 | 179 | | } | 180 | 19.4k | } |
unicode_normalization::normalize::hangul_decomposition_length Line | Count | Source | 172 | 12.5k | pub(crate) fn hangul_decomposition_length(s: char) -> usize { | 173 | 12.5k | let si = s as u32 - S_BASE; | 174 | 12.5k | let ti = si % T_COUNT; | 175 | 12.5k | if ti > 0 { | 176 | 11.3k | 3 | 177 | | } else { | 178 | 1.17k | 2 | 179 | | } | 180 | 12.5k | } |
|
181 | | |
182 | | // Compose a pair of Hangul code points: a leading consonant + vowel into an LV_Syllable, or an LV_Syllable + trailing consonant into an LVT_Syllable; any other pair yields None |
183 | | #[allow(unsafe_code)] |
184 | | #[inline(always)] |
185 | | #[allow(ellipsis_inclusive_range_patterns)] |
186 | 68.3M | fn compose_hangul(a: char, b: char) -> Option<char> { |
187 | 68.3M | let (a, b) = (a as u32, b as u32); |
188 | 68.3M | match (a, b) { |
189 | | // Compose a leading consonant and a vowel together into an LV_Syllable |
190 | 3.58M | (L_BASE..=L_LAST, V_BASE..=V_LAST) => { |
191 | | // Safety: based on the above bounds, l_index is at most L_LAST - L_BASE and v_index is at most V_LAST - V_BASE. |
192 | | // NOTE(review): assuming L_LAST = L_BASE + L_COUNT - 1 and V_LAST = V_BASE + V_COUNT - 1 (defined outside this excerpt), that is l_index <= 18 and v_index <= 20 |
193 | 224k | let l_index = a - L_BASE; |
194 | 224k | let v_index = b - V_BASE; |
195 | 224k | // Safety: This will be <= 18 * N_COUNT + 20 * T_COUNT = 18 * (21 * 28) + 20 * 28, which is 11144 (0x2B88). |
196 | 224k | let lv_index = l_index * N_COUNT + v_index * T_COUNT; |
197 | 224k | // Safety: This is between 0xAC00 and 0xD788, which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range |
198 | 224k | // for BMP unicode, below the surrogates that start at 0xD800 |
199 | 224k | let s = S_BASE + lv_index; |
200 | 224k | // Safety: We've verified this is in-range |
201 | 224k | Some(unsafe { char::from_u32_unchecked(s) }) |
202 | | } |
203 | | // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable; the guard rejects syllables that already carry a trailing consonant |
204 | 3.03M | (S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => { |
205 | 117k | // Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28) = 0xD7A4. b - T_BASE is between 1 and T_COUNT - 1 (27), because b is in T_FIRST..=T_LAST |
206 | 117k | // (presumably T_BASE + 1 ..= T_BASE + 27; defined outside this excerpt). Adding at most 27 to a number that is at largest 0xD7A4 gives at most 0xD7BF, which will not go out of bounds to 0xD800 (where the |
207 | 117k | // surrogates start), so this is safe. |
208 | 117k | Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) }) |
209 | | } |
210 | 67.9M | _ => None, |
211 | | } |
212 | 68.3M | } |
213 | | |
214 | | #[cfg(test)] |
215 | | mod tests { |
216 | | use super::compose_hangul; |
217 | | |
218 | | // Regression test for a fixed bug in which an LV_Syllable was composed with |
219 | | // T_BASE (U+11A7) itself. Only a character in the range `T_BASE + 1 ..= T_LAST` |
220 | | // may combine with an LV_Syllable, so U+C8E0 followed by U+11A7 must yield `None`. |
221 | | #[test] |
222 | | fn test_hangul_composition() { |
223 | | assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None); |
224 | | } |
225 | | } |