/rust/registry/src/index.crates.io-6f17d22bba15001f/icu_properties-1.5.1/src/runtime.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | //! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you |
6 | | //! have a use case for this! |
7 | | //! |
8 | | //! This module contains utilities for working with properties where the specific property in use |
9 | | //! is not known at compile time. |
10 | | //! |
11 | | //! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working |
12 | | //! with properties at runtime tailored for the use case of ECMA262-compatible regex engines. |
13 | | |
14 | | #[cfg(doc)] |
15 | | use super::{maps, script, GeneralCategory, GeneralCategoryGroup, Script}; |
16 | | |
/// A runtime handle that can name any Unicode property.
///
/// Use this when the property to query is only discovered while the program is
/// running (a regex engine parsing `\p{...}` is the typical case), so it cannot
/// be selected through the type system at compile time.
///
/// The wrapped integer is intended to carry the same value as the matching
/// entry of ICU4C's `UProperty` enum.
#[allow(clippy::exhaustive_structs)] // newtype
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct UnicodeProperty(pub u32);
26 | | |
27 | | #[allow(non_upper_case_globals)] |
28 | | #[allow(unused)] // experimental, may be made public later |
29 | | impl UnicodeProperty { |
30 | | /// Binary property `Alphabetic` |
31 | | pub const Alphabetic: Self = UnicodeProperty(0); |
32 | | /// Binary property `ASCII_Hex_Digit` |
33 | | pub const AsciiHexDigit: Self = UnicodeProperty(1); |
34 | | /// Binary property `Bidi_Control` |
35 | | pub const BidiControl: Self = UnicodeProperty(2); |
36 | | /// Binary property `Bidi_Mirrored` |
37 | | pub const BidiMirrored: Self = UnicodeProperty(3); |
38 | | /// Binary property `Dash` |
39 | | pub const Dash: Self = UnicodeProperty(4); |
40 | | /// Binary property `Default_Ignorable_Code_Point` |
41 | | pub const DefaultIgnorableCodePoint: Self = UnicodeProperty(5); |
42 | | /// Binary property `Deprecated` |
43 | | pub const Deprecated: Self = UnicodeProperty(6); |
44 | | /// Binary property `Diacritic` |
45 | | pub const Diacritic: Self = UnicodeProperty(7); |
46 | | /// Binary property `Extender` |
47 | | pub const Extender: Self = UnicodeProperty(8); |
48 | | /// Binary property `Full_Composition_Exclusion` |
49 | | pub const FullCompositionExclusion: Self = UnicodeProperty(9); |
50 | | /// Binary property `Grapheme_Base` |
51 | | pub const GraphemeBase: Self = UnicodeProperty(10); |
52 | | /// Binary property `Grapheme_Extend` |
53 | | pub const GraphemeExtend: Self = UnicodeProperty(11); |
54 | | /// Binary property `Grapheme_Link` |
55 | | pub const GraphemeLink: Self = UnicodeProperty(12); |
56 | | /// Binary property `Hex_Digit` |
57 | | pub const HexDigit: Self = UnicodeProperty(13); |
58 | | /// Binary property `Hyphen` |
59 | | pub const Hyphen: Self = UnicodeProperty(14); |
60 | | /// Binary property `ID_Continue` |
61 | | pub const IdContinue: Self = UnicodeProperty(15); |
62 | | /// Binary property `ID_Start` |
63 | | pub const IdStart: Self = UnicodeProperty(16); |
64 | | /// Binary property `Ideographic` |
65 | | pub const Ideographic: Self = UnicodeProperty(17); |
66 | | /// Binary property `IDS_Binary_Operator` |
67 | | pub const IdsBinaryOperator: Self = UnicodeProperty(18); |
68 | | /// Binary property `IDS_Trinary_Operator` |
69 | | pub const IdsTrinaryOperator: Self = UnicodeProperty(19); |
70 | | /// Binary property `Join_Control` |
71 | | pub const JoinControl: Self = UnicodeProperty(20); |
72 | | /// Binary property `Logical_Order_Exception` |
73 | | pub const LogicalOrderException: Self = UnicodeProperty(21); |
74 | | /// Binary property `Lowercase` |
75 | | pub const Lowercase: Self = UnicodeProperty(22); |
76 | | /// Binary property `Math` |
77 | | pub const Math: Self = UnicodeProperty(23); |
78 | | /// Binary property `Noncharacter_Code_Point` |
79 | | pub const NoncharacterCodePoint: Self = UnicodeProperty(24); |
80 | | /// Binary property `Quotation_Mark` |
81 | | pub const QuotationMark: Self = UnicodeProperty(25); |
82 | | /// Binary property `Radical` |
83 | | pub const Radical: Self = UnicodeProperty(26); |
84 | | /// Binary property `Soft_Dotted` |
85 | | pub const SoftDotted: Self = UnicodeProperty(27); |
86 | | /// Binary property `Terminal_Punctuation` |
87 | | pub const TerminalPunctuation: Self = UnicodeProperty(28); |
88 | | /// Binary property `Unified_Ideograph` |
89 | | pub const UnifiedIdeograph: Self = UnicodeProperty(29); |
90 | | /// Binary property `Uppercase` |
91 | | pub const Uppercase: Self = UnicodeProperty(30); |
92 | | /// Binary property `White_Space` |
93 | | pub const WhiteSpace: Self = UnicodeProperty(31); |
94 | | /// Binary property `XID_Continue` |
95 | | pub const XidContinue: Self = UnicodeProperty(32); |
96 | | /// Binary property `XID_Start` |
97 | | pub const XidStart: Self = UnicodeProperty(33); |
98 | | /// Binary property `Case_Sensitive` |
99 | | pub const CaseSensitive: Self = UnicodeProperty(34); |
100 | | /// Binary property `Sentence_Terminal` |
101 | | pub const SentenceTerminal: Self = UnicodeProperty(35); |
102 | | /// Binary property `Variation_Selector` |
103 | | pub const VariationSelector: Self = UnicodeProperty(36); |
104 | | /// Binary property `NFD_Inert` |
105 | | pub const NfdInert: Self = UnicodeProperty(37); |
106 | | /// Binary property `NFKD_Inert` |
107 | | pub const NfkdInert: Self = UnicodeProperty(38); |
108 | | /// Binary property `NFC_Inert` |
109 | | pub const NfcInert: Self = UnicodeProperty(39); |
110 | | /// Binary property `NFKC_Inert` |
111 | | pub const NfkcInert: Self = UnicodeProperty(40); |
112 | | /// Binary property `Segment_Starter` |
113 | | pub const SegmentStarter: Self = UnicodeProperty(41); |
114 | | /// Binary property `Pattern_Syntax` |
115 | | pub const PatternSyntax: Self = UnicodeProperty(42); |
116 | | /// Binary property `Pattern_White_Space` |
117 | | pub const PatternWhiteSpace: Self = UnicodeProperty(43); |
118 | | /// Binary property `alnum` |
119 | | pub const Alnum: Self = UnicodeProperty(44); |
120 | | /// Binary property `blank` |
121 | | pub const Blank: Self = UnicodeProperty(45); |
122 | | /// Binary property `graph` |
123 | | pub const Graph: Self = UnicodeProperty(46); |
124 | | /// Binary property `print` |
125 | | pub const Print: Self = UnicodeProperty(47); |
126 | | /// Binary property `xdigit` |
127 | | pub const XDigit: Self = UnicodeProperty(48); |
128 | | /// Binary property `Cased` |
129 | | pub const Cased: Self = UnicodeProperty(49); |
130 | | /// Binary property `Case_Ignorable` |
131 | | pub const CaseIgnorable: Self = UnicodeProperty(50); |
132 | | /// Binary property `Changes_When_Lowercased` |
133 | | pub const ChangesWhenLowercased: Self = UnicodeProperty(51); |
134 | | /// Binary property `Changes_When_Uppercased` |
135 | | pub const ChangesWhenUppercased: Self = UnicodeProperty(52); |
136 | | /// Binary property `Changes_When_Titlecased` |
137 | | pub const ChangesWhenTitlecased: Self = UnicodeProperty(53); |
138 | | /// Binary property `Changes_When_Casefolded` |
139 | | pub const ChangesWhenCasefolded: Self = UnicodeProperty(54); |
140 | | /// Binary property `Changes_When_Casemapped` |
141 | | pub const ChangesWhenCasemapped: Self = UnicodeProperty(55); |
142 | | /// Binary property `Changes_When_NFKC_Casefolded` |
143 | | pub const ChangesWhenNfkcCasefolded: Self = UnicodeProperty(56); |
144 | | /// Binary property `Emoji` |
145 | | pub const Emoji: Self = UnicodeProperty(57); |
146 | | /// Binary property `Emoji_Presentation` |
147 | | pub const EmojiPresentation: Self = UnicodeProperty(58); |
148 | | /// Binary property `Emoji_Modifier` |
149 | | pub const EmojiModifier: Self = UnicodeProperty(59); |
150 | | /// Binary property `Emoji_Modifier_Base` |
151 | | pub const EmojiModifierBase: Self = UnicodeProperty(60); |
152 | | /// Binary property `Emoji_Component` |
153 | | pub const EmojiComponent: Self = UnicodeProperty(61); |
154 | | /// Binary property `Regional_Indicator` |
155 | | pub const RegionalIndicator: Self = UnicodeProperty(62); |
156 | | /// Binary property `Prepended_Concatenation_Mark` |
157 | | pub const PrependedConcatenationMark: Self = UnicodeProperty(63); |
158 | | /// Binary property `Extended_Pictographic` |
159 | | pub const ExtendedPictographic: Self = UnicodeProperty(64); |
160 | | /// Binary property `Basic_Emoji` |
161 | | pub const BasicEmoji: Self = UnicodeProperty(65); |
162 | | /// Binary property `Emoji_Keycap_Sequence` |
163 | | pub const EmojiKeycapSequence: Self = UnicodeProperty(66); |
164 | | /// Binary property `RGI_Emoji_Modifier_Sequence` |
165 | | pub const RgiEmojiModifierSequence: Self = UnicodeProperty(67); |
166 | | /// Binary property `RGI_Emoji_Flag_Sequence` |
167 | | pub const RgiEmojiFlagSequence: Self = UnicodeProperty(68); |
168 | | /// Binary property `RGI_Emoji_Tag_Sequence` |
169 | | pub const RgiEmojiTagSequence: Self = UnicodeProperty(69); |
170 | | /// Binary property `RGI_Emoji_ZWJ_Sequence` |
171 | | pub const RgiEmojiZWJSequence: Self = UnicodeProperty(70); |
172 | | /// Binary property `RGI_Emoji` |
173 | | pub const RgiEmoji: Self = UnicodeProperty(71); |
174 | | |
175 | | const BINARY_MAX: Self = Self::RgiEmoji; |
176 | | |
177 | | /// Enumerated property `Bidi_Class` |
178 | | pub const BidiClass: Self = UnicodeProperty(0x1000); |
179 | | /// Enumerated property `Block` |
180 | | pub const Block: Self = UnicodeProperty(0x1001); |
181 | | /// Enumerated property `Canonical_Combining_Class` |
182 | | pub const CombiningClass: Self = UnicodeProperty(0x1002); |
183 | | /// Enumerated property `Decomposition_Type` |
184 | | pub const DecompositionType: Self = UnicodeProperty(0x1003); |
185 | | /// Enumerated property `East_Asian_Width` |
186 | | pub const EastAsianWidth: Self = UnicodeProperty(0x1004); |
187 | | /// Enumerated property `General_Category` |
188 | | pub const GeneralCategory: Self = UnicodeProperty(0x1005); |
189 | | /// Enumerated property `Joining_Group` |
190 | | pub const JoiningGroup: Self = UnicodeProperty(0x1006); |
191 | | /// Enumerated property `Joining_Type` |
192 | | pub const JoiningType: Self = UnicodeProperty(0x1007); |
193 | | /// Enumerated property `Line_Break` |
194 | | pub const LineBreak: Self = UnicodeProperty(0x1008); |
195 | | /// Enumerated property `Numeric_Type` |
196 | | pub const NumericType: Self = UnicodeProperty(0x1009); |
197 | | /// Enumerated property `Script` |
198 | | pub const Script: Self = UnicodeProperty(0x100A); |
199 | | /// Enumerated property `Hangul_Syllable_Type` |
200 | | pub const HangulSyllableType: Self = UnicodeProperty(0x100B); |
201 | | /// Enumerated property `NFD_Quick_Check` |
202 | | pub const NFDQuickCheck: Self = UnicodeProperty(0x100C); |
203 | | /// Enumerated property `NFKD_Quick_Check` |
204 | | pub const NFKDQuickCheck: Self = UnicodeProperty(0x100D); |
205 | | /// Enumerated property `NFC_Quick_Check` |
206 | | pub const NFCQuickCheck: Self = UnicodeProperty(0x100E); |
207 | | /// Enumerated property `NFKC_Quick_Check` |
208 | | pub const NFKCQuickCheck: Self = UnicodeProperty(0x100F); |
209 | | /// Enumerated property `Lead_Canonical_Combining_Class` |
210 | | pub const LeadCanonicalCombiningClass: Self = UnicodeProperty(0x1010); |
211 | | /// Enumerated property `Trail_Canonical_Combining_Class` |
212 | | pub const TrailCanonicalCombiningClass: Self = UnicodeProperty(0x1011); |
213 | | /// Enumerated property `Grapheme_Cluster_Break` |
214 | | pub const GraphemeClusterBreak: Self = UnicodeProperty(0x1012); |
215 | | /// Enumerated property `Sentence_Break` |
216 | | pub const SentenceBreak: Self = UnicodeProperty(0x1013); |
217 | | /// Enumerated property `Word_Break` |
218 | | pub const WordBreak: Self = UnicodeProperty(0x1014); |
219 | | /// Enumerated property `Bidi_Paired_Bracket_Type` |
220 | | pub const BidiPairedBracketType: Self = UnicodeProperty(0x1015); |
221 | | /// Enumerated property `Indic_Positional_Category` |
222 | | pub const IndicPositionalCategory: Self = UnicodeProperty(0x1016); |
223 | | /// Enumerated property `Indic_Syllabic_Category` |
224 | | pub const IndicSyllabicCategory: Self = UnicodeProperty(0x1017); |
225 | | /// Enumerated property `Vertical_Orientation` |
226 | | pub const VerticalOrientation: Self = UnicodeProperty(0x1018); |
227 | | |
228 | | const ENUMERATED_MAX: Self = Self::VerticalOrientation; |
229 | | |
230 | | /// Mask property `General_Category_Mask` |
231 | | pub const GeneralCategoryMask: Self = UnicodeProperty(0x2000); |
232 | | |
233 | | /// Double property `Numeric_Value` |
234 | | pub const NumericValue: Self = UnicodeProperty(0x3000); |
235 | | |
236 | | /// String property `Age` |
237 | | pub const Age: Self = UnicodeProperty(0x4000); |
238 | | /// String property `Bidi_Mirroring_Glyph` |
239 | | pub const BidiMirroringGlyph: Self = UnicodeProperty(0x4001); |
240 | | /// String property `Case_Folding` |
241 | | pub const CaseFolding: Self = UnicodeProperty(0x4002); |
242 | | /// String property `ISO_Comment` |
243 | | pub const ISOComment: Self = UnicodeProperty(0x4003); |
244 | | /// String property `Lowercase_Mapping` |
245 | | pub const LowercaseMapping: Self = UnicodeProperty(0x4004); |
246 | | /// String property `Name` |
247 | | pub const Name: Self = UnicodeProperty(0x4005); |
248 | | /// String property `Simple_Case_Folding` |
249 | | pub const SimpleCaseFolding: Self = UnicodeProperty(0x4006); |
250 | | /// String property `Simple_Lowercase_Mapping` |
251 | | pub const SimpleLowercaseMapping: Self = UnicodeProperty(0x4007); |
252 | | /// String property `Simple_Titlecase_Mapping` |
253 | | pub const SimpleTitlecaseMapping: Self = UnicodeProperty(0x4008); |
254 | | /// String property `Simple_Uppercase_Mapping` |
255 | | pub const SimpleUppercaseMapping: Self = UnicodeProperty(0x4009); |
256 | | /// String property `Titlecase_Mapping` |
257 | | pub const TitlecaseMapping: Self = UnicodeProperty(0x400A); |
258 | | /// String property `Unicode_1_Name` |
259 | | pub const Unicode1_Name: Self = UnicodeProperty(0x400B); |
260 | | /// String property `Uppercase_Mapping` |
261 | | pub const UppercaseMapping: Self = UnicodeProperty(0x400C); |
262 | | /// String property `Bidi_Paired_Bracket` |
263 | | pub const BidiPairedBracket: Self = UnicodeProperty(0x400D); |
264 | | |
265 | | const STRING_MAX: Self = Self::BidiPairedBracket; |
266 | | |
267 | | /// Misc property `Script_Extensions` |
268 | | pub const ScriptExtensions: Self = UnicodeProperty(0x7000); |
269 | | } |
270 | | |
271 | | #[allow(unused)] // experimental, may be made public later |
272 | | impl UnicodeProperty { |
273 | | /// Given a property name (long, short, or alias), returns the corresponding [`UnicodeProperty`] |
274 | | /// value for it provided it belongs to the [subset relevant for ECMA262 regexes][subset] |
275 | | /// |
276 | | /// Returns none if the name does not match any of the names in this subset. Performs |
277 | | /// strict matching of names. |
278 | | /// |
279 | | /// If using this to implement an ECMA262-compliant regex engine, please note these caveats: |
280 | | /// |
281 | | /// - This only returns binary and enumerated properties, as well as [`Self::ScriptExtensions`]. |
282 | | /// Lookup can be performed sufficiently with [`Self::load_ecma262_binary_property_unstable()`], |
283 | | /// [`maps::load_general_category()`], [`maps::load_script()`] and [`script::load_script_with_extensions_unstable()`]. |
284 | | /// - This does not handle the `Any`, `Assigned`, or `ASCII` pseudoproperties, since they are not |
285 | | /// defined as properties. |
286 | | /// - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]` |
287 | | /// - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`). |
288 | | /// - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]` |
289 | | /// - ECMA262 regexes transparently allow `General_Category_Mask` values for `GeneralCategory`. |
290 | | /// This method does not return [`Self::GeneralCategoryMask`], and instead relies on the caller to use mask-related lookup |
291 | | /// functions where necessary. |
292 | | /// - ECMA262 regexes allow treating `General_Category` (and `gcm`) values as binary properties, |
293 | | /// e.g. you can do things like `\p{Lu}` as shortform for `\p{gc=Lu}`. This method does not do so |
294 | | /// since these are property values, not properties, but you can use |
295 | | /// [`GeneralCategory::get_name_to_enum_mapper()`] or [`GeneralCategoryGroup::get_name_to_enum_mapper()`] |
296 | | /// to handle this. |
297 | | /// |
298 | | /// |
299 | | /// [subset]: https://tc39.es/ecma262/#table-nonbinary-unicode-properties |
300 | 0 | pub fn parse_ecma262_name(name: &str) -> Option<Self> { |
301 | 0 | let prop = match name { |
302 | 0 | "General_Category" | "gc" => Self::GeneralCategory, |
303 | 0 | "Script" | "sc" => Self::Script, |
304 | 0 | "Script_Extensions" | "scx" => Self::ScriptExtensions, |
305 | 0 | "ASCII_Hex_Digit" | "AHex" => Self::AsciiHexDigit, |
306 | 0 | "Alphabetic" | "Alpha" => Self::Alphabetic, |
307 | 0 | "Bidi_Control" | "Bidi_C" => Self::BidiControl, |
308 | 0 | "Bidi_Mirrored" | "Bidi_M" => Self::BidiMirrored, |
309 | 0 | "Case_Ignorable" | "CI" => Self::CaseIgnorable, |
310 | 0 | "Cased" => Self::Cased, |
311 | 0 | "Changes_When_Casefolded" | "CWCF" => Self::ChangesWhenCasefolded, |
312 | 0 | "Changes_When_Casemapped" | "CWCM" => Self::ChangesWhenCasemapped, |
313 | 0 | "Changes_When_Lowercased" | "CWL" => Self::ChangesWhenLowercased, |
314 | 0 | "Changes_When_NFKC_Casefolded" | "CWKCF" => Self::ChangesWhenNfkcCasefolded, |
315 | 0 | "Changes_When_Titlecased" | "CWT" => Self::ChangesWhenTitlecased, |
316 | 0 | "Changes_When_Uppercased" | "CWU" => Self::ChangesWhenUppercased, |
317 | 0 | "Dash" => Self::Dash, |
318 | 0 | "Default_Ignorable_Code_Point" | "DI" => Self::DefaultIgnorableCodePoint, |
319 | 0 | "Deprecated" | "Dep" => Self::Deprecated, |
320 | 0 | "Diacritic" | "Dia" => Self::Diacritic, |
321 | 0 | "Emoji" => Self::Emoji, |
322 | 0 | "Emoji_Component" | "EComp" => Self::EmojiComponent, |
323 | 0 | "Emoji_Modifier" | "EMod" => Self::EmojiModifier, |
324 | 0 | "Emoji_Modifier_Base" | "EBase" => Self::EmojiModifierBase, |
325 | 0 | "Emoji_Presentation" | "EPres" => Self::EmojiPresentation, |
326 | 0 | "Extended_Pictographic" | "ExtPict" => Self::ExtendedPictographic, |
327 | 0 | "Extender" | "Ext" => Self::Extender, |
328 | 0 | "Grapheme_Base" | "Gr_Base" => Self::GraphemeBase, |
329 | 0 | "Grapheme_Extend" | "Gr_Ext" => Self::GraphemeExtend, |
330 | 0 | "Hex_Digit" | "Hex" => Self::HexDigit, |
331 | 0 | "IDS_Binary_Operator" | "IDSB" => Self::IdsBinaryOperator, |
332 | 0 | "IDS_Trinary_Operator" | "IDST" => Self::IdsTrinaryOperator, |
333 | 0 | "ID_Continue" | "IDC" => Self::IdContinue, |
334 | 0 | "ID_Start" | "IDS" => Self::IdStart, |
335 | 0 | "Ideographic" | "Ideo" => Self::Ideographic, |
336 | 0 | "Join_Control" | "Join_C" => Self::JoinControl, |
337 | 0 | "Logical_Order_Exception" | "LOE" => Self::LogicalOrderException, |
338 | 0 | "Lowercase" | "Lower" => Self::Lowercase, |
339 | 0 | "Math" => Self::Math, |
340 | 0 | "Noncharacter_Code_Point" | "NChar" => Self::NoncharacterCodePoint, |
341 | 0 | "Pattern_Syntax" | "Pat_Syn" => Self::PatternSyntax, |
342 | 0 | "Pattern_White_Space" | "Pat_WS" => Self::PatternWhiteSpace, |
343 | 0 | "Quotation_Mark" | "QMark" => Self::QuotationMark, |
344 | 0 | "Radical" => Self::Radical, |
345 | 0 | "Regional_Indicator" | "RI" => Self::RegionalIndicator, |
346 | 0 | "Sentence_Terminal" | "STerm" => Self::SentenceTerminal, |
347 | 0 | "Soft_Dotted" | "SD" => Self::SoftDotted, |
348 | 0 | "Terminal_Punctuation" | "Term" => Self::TerminalPunctuation, |
349 | 0 | "Unified_Ideograph" | "UIdeo" => Self::UnifiedIdeograph, |
350 | 0 | "Uppercase" | "Upper" => Self::Uppercase, |
351 | 0 | "Variation_Selector" | "VS" => Self::VariationSelector, |
352 | 0 | "White_Space" | "space" => Self::WhiteSpace, |
353 | 0 | "XID_Continue" | "XIDC" => Self::XidContinue, |
354 | 0 | "XID_Start" | "XIDS" => Self::XidStart, |
355 | 0 | _ => return None, |
356 | | }; |
357 | | |
358 | 0 | Some(prop) |
359 | 0 | } |
360 | | } |