/src/unicode-normalization/src/normalize.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | //! Functions for computing canonical and compatible decompositions for Unicode characters. |
12 | | use crate::lookups::{ |
13 | | canonical_fully_decomposed, cjk_compat_variants_fully_decomposed, |
14 | | compatibility_fully_decomposed, composition_table, |
15 | | }; |
16 | | |
17 | | use core::char; |
18 | | |
19 | | /// Compute canonical Unicode decomposition for character. |
20 | | /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/) |
21 | | /// for more information. |
22 | | #[inline] |
23 | 107M | pub fn decompose_canonical<F>(c: char, emit_char: F) |
24 | 107M | where |
25 | 107M | F: FnMut(char), |
26 | 107M | { |
27 | 107M | decompose(c, canonical_fully_decomposed, emit_char) |
28 | 107M | } unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 40.1M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 40.1M | where | 25 | 40.1M | F: FnMut(char), | 26 | 40.1M | { | 27 | 40.1M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 40.1M | } |
unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 28.0M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 28.0M | where | 25 | 28.0M | F: FnMut(char), | 26 | 28.0M | { | 27 | 28.0M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 28.0M | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_canonical::<_> unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 10.2M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 10.2M | where | 25 | 10.2M | F: FnMut(char), | 26 | 10.2M | { | 27 | 10.2M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 10.2M | } |
unicode_normalization::normalize::decompose_canonical::<process::rust_fuzzer_test_input::{closure#0}> Line | Count | Source | 23 | 76 | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 76 | where | 25 | 76 | F: FnMut(char), | 26 | 76 | { | 27 | 76 | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 76 | } |
unicode_normalization::normalize::decompose_canonical::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 23 | 28.7M | pub fn decompose_canonical<F>(c: char, emit_char: F) | 24 | 28.7M | where | 25 | 28.7M | F: FnMut(char), | 26 | 28.7M | { | 27 | 28.7M | decompose(c, canonical_fully_decomposed, emit_char) | 28 | 28.7M | } |
|
29 | | |
30 | | /// Compute canonical or compatible Unicode decomposition for character. |
31 | | /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/) |
32 | | /// for more information. |
33 | | #[inline] |
34 | 46.1M | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { |
35 | 46.1M | let decompose_char = |
36 | 30.8M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} Line | Count | Source | 36 | 22.0M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<_>::{closure#0} Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} unicode_normalization::normalize::decompose_compatible::<process::rust_fuzzer_test_input::{closure#1}>::{closure#0} Line | Count | Source | 36 | 49 | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0} Line | Count | Source | 36 | 8.76M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} Line | Count | Source | 36 | 20.7M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<_>::{closure#0}::{closure#0} Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} unicode_normalization::normalize::decompose_compatible::<process::rust_fuzzer_test_input::{closure#1}>::{closure#0}::{closure#0} Line | Count | Source | 36 | 42 | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}::{closure#0} Line | Count | Source | 36 | 6.58M | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
|
37 | 46.1M | decompose(c, decompose_char, emit_char) |
38 | 46.1M | } Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 34 | 35.1M | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { | 35 | 35.1M | let decompose_char = | 36 | | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); | 37 | 35.1M | decompose(c, decompose_char, emit_char) | 38 | 35.1M | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<_> Unexecuted instantiation: unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_compatible::<process::rust_fuzzer_test_input::{closure#1}> Line | Count | Source | 34 | 71 | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { | 35 | 71 | let decompose_char = | 36 | | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); | 37 | 71 | decompose(c, decompose_char, emit_char) | 38 | 71 | } |
unicode_normalization::normalize::decompose_compatible::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 34 | 11.0M | pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) { | 35 | 11.0M | let decompose_char = | 36 | | |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); | 37 | 11.0M | decompose(c, decompose_char, emit_char) | 38 | 11.0M | } |
|
39 | | |
40 | | /// Compute standard-variation decomposition for character. |
41 | | /// |
42 | | /// [Standardized Variation Sequences] are used instead of the standard canonical |
43 | | /// decompositions, notably for CJK codepoints with singleton canonical decompositions, |
44 | | /// to avoid losing information. See the [Unicode Variation Sequence FAQ] and the |
45 | | /// "Other Enhancements" section of the [Unicode 6.3 Release Summary] for more information. |
46 | | /// |
47 | | /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence |
48 | | /// [Unicode Variation Sequence FAQ]: http://unicode.org/faq/vs.html |
49 | | /// [Unicode 6.3 Release Summary]: https://www.unicode.org/versions/Unicode6.3.0/#Summary |
50 | | #[inline] |
51 | 0 | pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F) |
52 | 0 | where |
53 | 0 | F: FnMut(char), |
54 | 0 | { |
55 | 0 | // 7-bit ASCII never decomposes |
56 | 0 | if c <= '\x7f' { |
57 | 0 | emit_char(c); |
58 | 0 | return; |
59 | 0 | } |
60 | | |
61 | | // Don't perform decomposition for Hangul |
62 | | |
63 | 0 | if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) { |
64 | 0 | for &d in decomposed { |
65 | 0 | emit_char(d); |
66 | 0 | } |
67 | 0 | return; |
68 | 0 | } |
69 | 0 |
|
70 | 0 | // Finally bottom out. |
71 | 0 | emit_char(c); |
72 | 0 | } |
73 | | |
74 | | #[inline] |
75 | | #[allow(unsafe_code)] |
76 | 153M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) |
77 | 153M | where |
78 | 153M | D: Fn(char) -> Option<&'static [char]>, |
79 | 153M | F: FnMut(char), |
80 | 153M | { |
81 | 153M | // 7-bit ASCII never decomposes |
82 | 153M | if c <= '\x7f' { |
83 | 53.1M | emit_char(c); |
84 | 53.1M | return; |
85 | 100M | } |
86 | 100M | |
87 | 100M | // Perform decomposition for Hangul |
88 | 100M | if is_hangul_syllable(c) { |
89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above |
90 | 588k | unsafe { |
91 | 588k | decompose_hangul(c, emit_char); |
92 | 588k | } |
93 | 588k | return; |
94 | 99.6M | } |
95 | | |
96 | 99.6M | if let Some(decomposed) = decompose_char(c) { |
97 | 226M | for &d in decomposed { |
98 | 168M | emit_char(d); |
99 | 168M | } |
100 | 57.6M | return; |
101 | 41.9M | } |
102 | 41.9M | |
103 | 41.9M | // Finally bottom out. |
104 | 41.9M | emit_char(c); |
105 | 153M | } Unexecuted instantiation: unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 76 | 35.1M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 35.1M | where | 78 | 35.1M | D: Fn(char) -> Option<&'static [char]>, | 79 | 35.1M | F: FnMut(char), | 80 | 35.1M | { | 81 | 35.1M | // 7-bit ASCII never decomposes | 82 | 35.1M | if c <= '\x7f' { | 83 | 12.5M | emit_char(c); | 84 | 12.5M | return; | 85 | 22.6M | } | 86 | 22.6M | | 87 | 22.6M | // Perform decomposition for Hangul | 88 | 22.6M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 546k | unsafe { | 91 | 546k | decompose_hangul(c, emit_char); | 92 | 546k | } | 93 | 546k | return; | 94 | 22.0M | } | 95 | | | 96 | 22.0M | if let Some(decomposed) = decompose_char(c) { | 97 | 43.9M | for &d in decomposed { | 98 | 34.8M | emit_char(d); | 99 | 34.8M | } | 100 | 9.06M | return; | 101 | 13.0M | } | 102 | 13.0M | | 103 | 13.0M | // Finally bottom out. | 104 | 13.0M | emit_char(c); | 105 | 35.1M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 40.1M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 40.1M | where | 78 | 40.1M | D: Fn(char) -> Option<&'static [char]>, | 79 | 40.1M | F: FnMut(char), | 80 | 40.1M | { | 81 | 40.1M | // 7-bit ASCII never decomposes | 82 | 40.1M | if c <= '\x7f' { | 83 | 14.1M | emit_char(c); | 84 | 14.1M | return; | 85 | 25.9M | } | 86 | 25.9M | | 87 | 25.9M | // Perform decomposition for Hangul | 88 | 25.9M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 7.45k | unsafe { | 91 | 7.45k | decompose_hangul(c, emit_char); | 92 | 7.45k | } | 93 | 7.45k | return; | 94 | 25.9M | } | 95 | | | 96 | 25.9M | if let Some(decomposed) = decompose_char(c) { | 97 | 25.0M | for &d in decomposed { | 98 | 17.0M | emit_char(d); | 99 | 17.0M | } | 100 | 7.97M | return; | 101 | 18.0M | } | 102 | 18.0M | | 103 | 18.0M | // Finally bottom out. | 104 | 18.0M | emit_char(c); | 105 | 40.1M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 28.0M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 28.0M | where | 78 | 28.0M | D: Fn(char) -> Option<&'static [char]>, | 79 | 28.0M | F: FnMut(char), | 80 | 28.0M | { | 81 | 28.0M | // 7-bit ASCII never decomposes | 82 | 28.0M | if c <= '\x7f' { | 83 | 10.8M | emit_char(c); | 84 | 10.8M | return; | 85 | 17.1M | } | 86 | 17.1M | | 87 | 17.1M | // Perform decomposition for Hangul | 88 | 17.1M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 7.02k | unsafe { | 91 | 7.02k | decompose_hangul(c, emit_char); | 92 | 7.02k | } | 93 | 7.02k | return; | 94 | 17.1M | } | 95 | | | 96 | 17.1M | if let Some(decomposed) = decompose_char(c) { | 97 | 23.9M | for &d in decomposed { | 98 | 16.2M | emit_char(d); | 99 | 16.2M | } | 100 | 7.71M | return; | 101 | 9.46M | } | 102 | 9.46M | | 103 | 9.46M | // Finally bottom out. | 104 | 9.46M | emit_char(c); | 105 | 28.0M | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose::<_, _> Unexecuted instantiation: unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 10.2M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 10.2M | where | 78 | 10.2M | D: Fn(char) -> Option<&'static [char]>, | 79 | 10.2M | F: FnMut(char), | 80 | 10.2M | { | 81 | 10.2M | // 7-bit ASCII never decomposes | 82 | 10.2M | if c <= '\x7f' { | 83 | 7.73M | emit_char(c); | 84 | 7.73M | return; | 85 | 2.49M | } | 86 | 2.49M | | 87 | 2.49M | // Perform decomposition for Hangul | 88 | 2.49M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 1.12k | unsafe { | 91 | 1.12k | decompose_hangul(c, emit_char); | 92 | 1.12k | } | 93 | 1.12k | return; | 94 | 2.49M | } | 95 | | | 96 | 2.49M | if let Some(decomposed) = decompose_char(c) { | 97 | 6.27M | for &d in decomposed { | 98 | 4.18M | emit_char(d); | 99 | 4.18M | } | 100 | 2.09M | return; | 101 | 403k | } | 102 | 403k | | 103 | 403k | // Finally bottom out. | 104 | 403k | emit_char(c); | 105 | 10.2M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<process::rust_fuzzer_test_input::{closure#1}>::{closure#0}, process::rust_fuzzer_test_input::{closure#1}> Line | Count | Source | 76 | 71 | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 71 | where | 78 | 71 | D: Fn(char) -> Option<&'static [char]>, | 79 | 71 | F: FnMut(char), | 80 | 71 | { | 81 | 71 | // 7-bit ASCII never decomposes | 82 | 71 | if c <= '\x7f' { | 83 | 8 | emit_char(c); | 84 | 8 | return; | 85 | 63 | } | 86 | 63 | | 87 | 63 | // Perform decomposition for Hangul | 88 | 63 | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 14 | unsafe { | 91 | 14 | decompose_hangul(c, emit_char); | 92 | 14 | } | 93 | 14 | return; | 94 | 49 | } | 95 | | | 96 | 49 | if let Some(decomposed) = decompose_char(c) { | 97 | 23 | for &d in decomposed { | 98 | 15 | emit_char(d); | 99 | 15 | } | 100 | 8 | return; | 101 | 41 | } | 102 | 41 | | 103 | 41 | // Finally bottom out. | 104 | 41 | emit_char(c); | 105 | 71 | } |
unicode_normalization::normalize::decompose::<unicode_normalization::normalize::decompose_compatible<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}>::{closure#0}, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 76 | 11.0M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 11.0M | where | 78 | 11.0M | D: Fn(char) -> Option<&'static [char]>, | 79 | 11.0M | F: FnMut(char), | 80 | 11.0M | { | 81 | 11.0M | // 7-bit ASCII never decomposes | 82 | 11.0M | if c <= '\x7f' { | 83 | 2.22M | emit_char(c); | 84 | 2.22M | return; | 85 | 8.77M | } | 86 | 8.77M | | 87 | 8.77M | // Perform decomposition for Hangul | 88 | 8.77M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 4.14k | unsafe { | 91 | 4.14k | decompose_hangul(c, emit_char); | 92 | 4.14k | } | 93 | 4.14k | return; | 94 | 8.76M | } | 95 | | | 96 | 8.76M | if let Some(decomposed) = decompose_char(c) { | 97 | 60.1M | for &d in decomposed { | 98 | 51.7M | emit_char(d); | 99 | 51.7M | } | 100 | 8.43M | return; | 101 | 335k | } | 102 | 335k | | 103 | 335k | // Finally bottom out. | 104 | 335k | emit_char(c); | 105 | 11.0M | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, process::rust_fuzzer_test_input::{closure#0}> Line | Count | Source | 76 | 76 | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 76 | where | 78 | 76 | D: Fn(char) -> Option<&'static [char]>, | 79 | 76 | F: FnMut(char), | 80 | 76 | { | 81 | 76 | // 7-bit ASCII never decomposes | 82 | 76 | if c <= '\x7f' { | 83 | 9 | emit_char(c); | 84 | 9 | return; | 85 | 67 | } | 86 | 67 | | 87 | 67 | // Perform decomposition for Hangul | 88 | 67 | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 14 | unsafe { | 91 | 14 | decompose_hangul(c, emit_char); | 92 | 14 | } | 93 | 14 | return; | 94 | 53 | } | 95 | | | 96 | 53 | if let Some(decomposed) = decompose_char(c) { | 97 | 7 | for &d in decomposed { | 98 | 4 | emit_char(d); | 99 | 4 | } | 100 | 3 | return; | 101 | 50 | } | 102 | 50 | | 103 | 50 | // Finally bottom out. | 104 | 50 | emit_char(c); | 105 | 76 | } |
unicode_normalization::normalize::decompose::<unicode_normalization::lookups::canonical_fully_decomposed, <unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 76 | 28.7M | fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F) | 77 | 28.7M | where | 78 | 28.7M | D: Fn(char) -> Option<&'static [char]>, | 79 | 28.7M | F: FnMut(char), | 80 | 28.7M | { | 81 | 28.7M | // 7-bit ASCII never decomposes | 82 | 28.7M | if c <= '\x7f' { | 83 | 5.60M | emit_char(c); | 84 | 5.60M | return; | 85 | 23.1M | } | 86 | 23.1M | | 87 | 23.1M | // Perform decomposition for Hangul | 88 | 23.1M | if is_hangul_syllable(c) { | 89 | | // Safety: Hangul Syllables invariant checked by is_hangul_syllable above | 90 | 22.5k | unsafe { | 91 | 22.5k | decompose_hangul(c, emit_char); | 92 | 22.5k | } | 93 | 22.5k | return; | 94 | 23.1M | } | 95 | | | 96 | 23.1M | if let Some(decomposed) = decompose_char(c) { | 97 | 67.1M | for &d in decomposed { | 98 | 44.7M | emit_char(d); | 99 | 44.7M | } | 100 | 22.3M | return; | 101 | 733k | } | 102 | 733k | | 103 | 733k | // Finally bottom out. | 104 | 733k | emit_char(c); | 105 | 28.7M | } |
|
106 | | |
107 | | /// Compose two characters into a single character, if possible. |
108 | | /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/) |
109 | | /// for more information. |
110 | 84.4M | pub fn compose(a: char, b: char) -> Option<char> { |
111 | 84.4M | compose_hangul(a, b).or_else(|| composition_table(a, b)) |
112 | 84.4M | } |
113 | | |
114 | | // Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior |
115 | | // http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior |
116 | | const S_BASE: u32 = 0xAC00; |
117 | | const L_BASE: u32 = 0x1100; |
118 | | const V_BASE: u32 = 0x1161; |
119 | | const T_BASE: u32 = 0x11A7; |
120 | | const L_COUNT: u32 = 19; |
121 | | const V_COUNT: u32 = 21; |
122 | | const T_COUNT: u32 = 28; |
123 | | const N_COUNT: u32 = V_COUNT * T_COUNT; |
124 | | const S_COUNT: u32 = L_COUNT * N_COUNT; |
125 | | |
126 | | const S_LAST: u32 = S_BASE + S_COUNT - 1; |
127 | | const L_LAST: u32 = L_BASE + L_COUNT - 1; |
128 | | const V_LAST: u32 = V_BASE + V_COUNT - 1; |
129 | | const T_LAST: u32 = T_BASE + T_COUNT - 1; |
130 | | |
131 | | // Composition only occurs for `TPart`s in `U+11A8 ..= U+11C2`, |
132 | | // i.e. `T_BASE + 1 ..= T_LAST`. |
133 | | const T_FIRST: u32 = T_BASE + 1; |
134 | | |
135 | | // Safety-usable invariant: This ensures that c is a valid Hangul Syllable character (U+AC00..U+D7AF) |
136 | 214M | pub(crate) fn is_hangul_syllable(c: char) -> bool { |
137 | 214M | // Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant |
138 | 214M | (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) |
139 | 214M | } |
140 | | |
141 | | // Decompose a precomposed Hangul syllable |
142 | | // Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF |
143 | | #[allow(unsafe_code, unused_unsafe)] |
144 | | #[inline(always)] |
145 | 588k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) |
146 | 588k | where |
147 | 588k | F: FnMut(char), |
148 | 588k | { |
149 | 588k | // This will be at most 0x2baf, the size of the Hangul Syllables block |
150 | 588k | let s_index = s as u32 - S_BASE; |
151 | 588k | // This will be at most 0x2baf / (21 * 28), 19 |
152 | 588k | let l_index = s_index / N_COUNT; |
153 | 588k | unsafe { |
154 | 588k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) |
155 | 588k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); |
156 | 588k | |
157 | 588k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 |
158 | 588k | let v_index = (s_index % N_COUNT) / T_COUNT; |
159 | 588k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) |
160 | 588k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); |
161 | 588k | |
162 | 588k | // Safety: This will be at most T_COUNT - 1 (27) |
163 | 588k | let t_index = s_index % T_COUNT; |
164 | 588k | if t_index > 0 { |
165 | 311k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) |
166 | 311k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); |
167 | 311k | } |
168 | | } |
169 | 588k | } unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 7.45k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 7.45k | where | 147 | 7.45k | F: FnMut(char), | 148 | 7.45k | { | 149 | 7.45k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 7.45k | let s_index = s as u32 - S_BASE; | 151 | 7.45k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 7.45k | let l_index = s_index / N_COUNT; | 153 | 7.45k | unsafe { | 154 | 7.45k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 7.45k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 7.45k | | 157 | 7.45k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 7.45k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 7.45k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 7.45k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 7.45k | | 162 | 7.45k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 7.45k | let t_index = s_index % T_COUNT; | 164 | 7.45k | if t_index > 0 { | 165 | 6.68k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 6.68k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 6.68k | } | 168 | | } | 169 | 7.45k | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<unicode_normalization::stream_safe::StreamSafe<core::str::iter::Chars>> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 7.02k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 7.02k | where | 147 | 7.02k | F: FnMut(char), | 148 | 7.02k | { | 149 | 7.02k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 7.02k | let s_index = s as u32 - S_BASE; | 151 | 7.02k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 7.02k | let l_index = s_index / N_COUNT; | 153 | 7.02k | unsafe { | 154 | 7.02k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 7.02k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 7.02k | | 157 | 7.02k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 7.02k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 7.02k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 7.02k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 7.02k | | 162 | 7.02k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 7.02k | let t_index = s_index % T_COUNT; | 164 | 7.02k | if t_index > 0 { | 165 | 6.40k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 6.40k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 6.40k | } | 168 | | } | 169 | 7.02k | } |
unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 145 | 546k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 546k | where | 147 | 546k | F: FnMut(char), | 148 | 546k | { | 149 | 546k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 546k | let s_index = s as u32 - S_BASE; | 151 | 546k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 546k | let l_index = s_index / N_COUNT; | 153 | 546k | unsafe { | 154 | 546k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 546k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 546k | | 157 | 546k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 546k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 546k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 546k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 546k | | 162 | 546k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 546k | let t_index = s_index % T_COUNT; | 164 | 546k | if t_index > 0 { | 165 | 275k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 275k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 275k | } | 168 | | } | 169 | 546k | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_hangul::<_> unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 1.12k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 1.12k | where | 147 | 1.12k | F: FnMut(char), | 148 | 1.12k | { | 149 | 1.12k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 1.12k | let s_index = s as u32 - S_BASE; | 151 | 1.12k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 1.12k | let l_index = s_index / N_COUNT; | 153 | 1.12k | unsafe { | 154 | 1.12k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 1.12k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 1.12k | | 157 | 1.12k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 1.12k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 1.12k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 1.12k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 1.12k | | 162 | 1.12k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 1.12k | let t_index = s_index % T_COUNT; | 164 | 1.12k | if t_index > 0 { | 165 | 900 | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 900 | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 900 | } | 168 | | } | 169 | 1.12k | } |
Unexecuted instantiation: unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<streaming::Counter> as core::iter::traits::iterator::Iterator>::next::{closure#1}> unicode_normalization::normalize::decompose_hangul::<process::rust_fuzzer_test_input::{closure#0}> Line | Count | Source | 145 | 14 | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 14 | where | 147 | 14 | F: FnMut(char), | 148 | 14 | { | 149 | 14 | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 14 | let s_index = s as u32 - S_BASE; | 151 | 14 | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 14 | let l_index = s_index / N_COUNT; | 153 | 14 | unsafe { | 154 | 14 | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 14 | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 14 | | 157 | 14 | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 14 | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 14 | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 14 | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 14 | | 162 | 14 | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 14 | let t_index = s_index % T_COUNT; | 164 | 14 | if t_index > 0 { | 165 | 13 | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 13 | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 13 | } | 168 | | } | 169 | 14 | } |
unicode_normalization::normalize::decompose_hangul::<process::rust_fuzzer_test_input::{closure#1}> Line | Count | Source | 145 | 14 | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 14 | where | 147 | 14 | F: FnMut(char), | 148 | 14 | { | 149 | 14 | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 14 | let s_index = s as u32 - S_BASE; | 151 | 14 | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 14 | let l_index = s_index / N_COUNT; | 153 | 14 | unsafe { | 154 | 14 | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 14 | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 14 | | 157 | 14 | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 14 | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 14 | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 14 | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 14 | | 162 | 14 | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 14 | let t_index = s_index % T_COUNT; | 164 | 14 | if t_index > 0 { | 165 | 14 | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 14 | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 14 | } | 168 | | } | 169 | 14 | } |
unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#0}> Line | Count | Source | 145 | 22.5k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 22.5k | where | 147 | 22.5k | F: FnMut(char), | 148 | 22.5k | { | 149 | 22.5k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 22.5k | let s_index = s as u32 - S_BASE; | 151 | 22.5k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 22.5k | let l_index = s_index / N_COUNT; | 153 | 22.5k | unsafe { | 154 | 22.5k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 22.5k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 22.5k | | 157 | 22.5k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 22.5k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 22.5k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 22.5k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 22.5k | | 162 | 22.5k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 22.5k | let t_index = s_index % T_COUNT; | 164 | 22.5k | if t_index > 0 { | 165 | 20.8k | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 20.8k | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 20.8k | } | 168 | | } | 169 | 22.5k | } |
unicode_normalization::normalize::decompose_hangul::<<unicode_normalization::decompose::Decompositions<core::str::iter::Chars> as core::iter::traits::iterator::Iterator>::next::{closure#1}> Line | Count | Source | 145 | 4.14k | unsafe fn decompose_hangul<F>(s: char, mut emit_char: F) | 146 | 4.14k | where | 147 | 4.14k | F: FnMut(char), | 148 | 4.14k | { | 149 | 4.14k | // This will be at most 0x2baf, the size of the Hangul Syllables block | 150 | 4.14k | let s_index = s as u32 - S_BASE; | 151 | 4.14k | // This will be at most 0x2baf / (21 * 28), 19 | 152 | 4.14k | let l_index = s_index / N_COUNT; | 153 | 4.14k | unsafe { | 154 | 4.14k | // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 155 | 4.14k | emit_char(char::from_u32_unchecked(L_BASE + l_index)); | 156 | 4.14k | | 157 | 4.14k | // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 | 158 | 4.14k | let v_index = (s_index % N_COUNT) / T_COUNT; | 159 | 4.14k | // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 160 | 4.14k | emit_char(char::from_u32_unchecked(V_BASE + v_index)); | 161 | 4.14k | | 162 | 4.14k | // Safety: This will be at most T_COUNT - 1 (27) | 163 | 4.14k | let t_index = s_index % T_COUNT; | 164 | 4.14k | if t_index > 0 { | 165 | 919 | // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) | 166 | 919 | emit_char(char::from_u32_unchecked(T_BASE + t_index)); | 167 | 3.22k | } | 168 | | } | 169 | 4.14k | } |
|
170 | | |
171 | | #[inline] |
172 | 30.6k | pub(crate) fn hangul_decomposition_length(s: char) -> usize { |
173 | 30.6k | let si = s as u32 - S_BASE; |
174 | 30.6k | let ti = si % T_COUNT; |
175 | 30.6k | if ti > 0 { |
176 | 28.4k | 3 |
177 | | } else { |
178 | 2.26k | 2 |
179 | | } |
180 | 30.6k | } unicode_normalization::normalize::hangul_decomposition_length Line | Count | Source | 172 | 18.7k | pub(crate) fn hangul_decomposition_length(s: char) -> usize { | 173 | 18.7k | let si = s as u32 - S_BASE; | 174 | 18.7k | let ti = si % T_COUNT; | 175 | 18.7k | if ti > 0 { | 176 | 17.7k | 3 | 177 | | } else { | 178 | 986 | 2 | 179 | | } | 180 | 18.7k | } |
unicode_normalization::normalize::hangul_decomposition_length Line | Count | Source | 172 | 11.9k | pub(crate) fn hangul_decomposition_length(s: char) -> usize { | 173 | 11.9k | let si = s as u32 - S_BASE; | 174 | 11.9k | let ti = si % T_COUNT; | 175 | 11.9k | if ti > 0 { | 176 | 10.6k | 3 | 177 | | } else { | 178 | 1.28k | 2 | 179 | | } | 180 | 11.9k | } |
|
181 | | |
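182 | | // Compose a pair of Hangul code points: a leading consonant and a vowel (L + V), or an LV_Syllable and a trailing consonant (LV + T) |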
183 | | #[allow(unsafe_code)] |
184 | | #[inline(always)] |
185 | | #[allow(ellipsis_inclusive_range_patterns)] |
186 | 84.4M | fn compose_hangul(a: char, b: char) -> Option<char> { |
187 | 84.4M | let (a, b) = (a as u32, b as u32); |
188 | 84.4M | match (a, b) { |
189 | | // Compose a leading consonant and a vowel together into an LV_Syllable |
190 | 6.71M | (L_BASE..=L_LAST, V_BASE..=V_LAST) => { |
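191 | | // Safety: based on the above bounds, l_index will be at most L_LAST - L_BASE (strictly less than L_COUNT = 19, assuming L_LAST = L_BASE + L_COUNT - 1) |
192 | | // and v_index will be at most V_LAST - V_BASE (strictly less than V_COUNT = 21, assuming V_LAST = V_BASE + V_COUNT - 1) |
193 | 1.09M | let l_index = a - L_BASE; |
194 | 1.09M | let v_index = b - V_BASE; |
195 | 1.09M | // Safety: With N_COUNT = 21 * 28 = 588 and T_COUNT = 28, this will be <= 18 * 588 + 20 * 28, which is 11144 (given l_index <= 18 and v_index <= 20). |
196 | 1.09M | let lv_index = l_index * N_COUNT + v_index * T_COUNT; |
197 | 1.09M | // Safety: This is between 0xAC00 and 0xD788 (0xAC00 + 11144), which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range |
198 | 1.09M | // for BMP Unicode (below the surrogates, which start at 0xD800) |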
199 | 1.09M | let s = S_BASE + lv_index; |
200 | 1.09M | // Safety: We've verified this is in-range |
201 | 1.09M | Some(unsafe { char::from_u32_unchecked(s) }) |
202 | | } |
203 | | // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable |
204 | 3.76M | (S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => { |
205 | 552k | // Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28). b - T_BASE is between 1 and 27 (T_COUNT - 1), since b is matched against T_FIRST..=T_LAST. |
206 | 552k | // Adding a number 1 to 27 to a number that is at largest 0xD7A4 gives at most 0xD7BF, which will not go out of bounds to 0xD800 (where the |
207 | 552k | // surrogates start), so this is safe. |
208 | 552k | Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) }) |
209 | | } |
210 | 82.7M | _ => None, |
211 | | } |
212 | 84.4M | } |
213 | | |
214 | | #[cfg(test)] |
215 | | mod tests { |
216 | | use super::compose_hangul; |
217 | | |
218 | | // Regression test from a bugfix where we were composing an LV_Syllable with |
219 | | // T_BASE directly. (We should only compose an LV_Syllable with a character |
220 | | // in the range `T_BASE + 1 ..= T_LAST`.) |
221 | | #[test] |
222 | | fn test_hangul_composition() { |
223 | | assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None); |
224 | | } |
225 | | } |