/src/unicode-normalization/src/lookups.rs
Line | Count | Source |
1 | | // Copyright 2019 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | //! Lookups of unicode properties using minimal perfect hashing. |
12 | | |
13 | | use crate::perfect_hash::mph_lookup; |
14 | | use crate::tables::*; |
15 | | |
16 | | /// Look up the canonical combining class for a codepoint. |
17 | | /// |
18 | | /// The value returned is as defined in the Unicode Character Database. |
19 | 1.33G | pub fn canonical_combining_class(c: char) -> u8 { |
20 | 1.33G | mph_lookup( |
21 | 1.33G | c.into(), |
22 | 1.33G | CANONICAL_COMBINING_CLASS_SALT, |
23 | 1.33G | CANONICAL_COMBINING_CLASS_KV, |
24 | | u8_lookup_fk, |
25 | | u8_lookup_fv, |
26 | | 0, |
27 | | ) |
28 | 1.33G | } |
29 | | |
30 | 133M | pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> { |
31 | 133M | if c1 < '\u{10000}' && c2 < '\u{10000}' { |
32 | 132M | mph_lookup( |
33 | 132M | (c1 as u32) << 16 | (c2 as u32), |
34 | 132M | COMPOSITION_TABLE_SALT, |
35 | 132M | COMPOSITION_TABLE_KV, |
36 | | pair_lookup_fk, |
37 | | pair_lookup_fv_opt, |
38 | 132M | None, |
39 | | ) |
40 | | } else { |
41 | 124k | composition_table_astral(c1, c2) |
42 | | } |
43 | 133M | } |
44 | | |
45 | 428M | pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> { |
46 | 428M | mph_lookup( |
47 | 428M | c.into(), |
48 | 428M | CANONICAL_DECOMPOSED_SALT, |
49 | 428M | CANONICAL_DECOMPOSED_KV, |
50 | | pair_lookup_fk, |
51 | | pair_lookup_fv_opt, |
52 | 428M | None, |
53 | | ) |
54 | 428M | .map(|(start, len)| &CANONICAL_DECOMPOSED_CHARS[start as usize..][..len as usize]) |
55 | 428M | } |
56 | | |
57 | 332M | pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> { |
58 | 332M | mph_lookup( |
59 | 332M | c.into(), |
60 | 332M | COMPATIBILITY_DECOMPOSED_SALT, |
61 | 332M | COMPATIBILITY_DECOMPOSED_KV, |
62 | | pair_lookup_fk, |
63 | | pair_lookup_fv_opt, |
64 | 332M | None, |
65 | | ) |
66 | 332M | .map(|(start, len)| &COMPATIBILITY_DECOMPOSED_CHARS[start as usize..][..len as usize]) |
67 | 332M | } |
68 | | |
69 | 0 | pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> { |
70 | 0 | mph_lookup( |
71 | 0 | c.into(), |
72 | 0 | CJK_COMPAT_VARIANTS_DECOMPOSED_SALT, |
73 | 0 | CJK_COMPAT_VARIANTS_DECOMPOSED_KV, |
74 | | pair_lookup_fk, |
75 | | pair_lookup_fv_opt, |
76 | 0 | None, |
77 | | ) |
78 | 0 | .map(|(start, len)| &CJK_COMPAT_VARIANTS_DECOMPOSED_CHARS[start as usize..][..len as usize]) |
79 | 0 | } |
80 | | |
81 | | /// Return whether the given character is a combining mark (`General_Category=Mark`) |
82 | 51 | pub fn is_combining_mark(c: char) -> bool { |
83 | 51 | mph_lookup( |
84 | 51 | c.into(), |
85 | 51 | COMBINING_MARK_SALT, |
86 | 51 | COMBINING_MARK_KV, |
87 | | bool_lookup_fk, |
88 | | bool_lookup_fv, |
89 | | false, |
90 | | ) |
91 | 51 | } |
92 | | |
93 | 46.8M | pub fn stream_safe_trailing_nonstarters(c: char) -> usize { |
94 | 46.8M | mph_lookup( |
95 | 46.8M | c.into(), |
96 | 46.8M | TRAILING_NONSTARTERS_SALT, |
97 | 46.8M | TRAILING_NONSTARTERS_KV, |
98 | 46.8M | u8_lookup_fk, |
99 | 46.8M | u8_lookup_fv, |
100 | 46.8M | 0, |
101 | 46.8M | ) as usize |
102 | 46.8M | } |
103 | | |
/// Extract the 24 bit key from a `u32` entry that packs a 24 bit key (high
/// bits) together with an 8 bit value (low byte).
#[inline]
fn u8_lookup_fk(kv: u32) -> u32 {
    // Width of the value field that sits below the key.
    const VALUE_BITS: u32 = 8;
    kv >> VALUE_BITS
}
109 | | |
/// Extract the 8 bit value from a `u32` entry that packs a 24 bit key (high
/// bits) together with an 8 bit value (low byte).
#[inline]
fn u8_lookup_fv(kv: u32) -> u8 {
    // Selects only the low byte, so the narrowing cast below is lossless.
    const VALUE_MASK: u32 = 0xff;
    (kv & VALUE_MASK) as u8
}
115 | | |
/// Key extractor for boolean (set-membership) lookups.
///
/// Entries of such a table carry no payload, so the entry itself is the key
/// and is returned unchanged.
#[inline]
fn bool_lookup_fk(kv: u32) -> u32 {
    kv
}
121 | | |
/// Value extractor for boolean (set-membership) lookups.
///
/// The entry is ignored: every entry maps to `true`.
#[inline]
fn bool_lookup_fv(_kv: u32) -> bool {
    true
}
127 | | |
/// Extract the key (first element) of a `(key, value)` table entry.
#[inline]
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
    let (key, _) = kv;
    key
}
|
133 | | |
/// Extract the value (second element) of a `(key, value)` table entry,
/// wrapped in `Some`.
#[inline]
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
    let (_, value) = kv;
    Some(value)
}
|