Coverage Report

Created: 2025-05-07 06:59

/rust/registry/src/index.crates.io-6f17d22bba15001f/icu_properties-1.5.1/src/runtime.rs
Line
Count
Source (jump to first uncovered line)
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5
//! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you
6
//! have a use case for this!
7
//!
8
//! This module contains utilities for working with properties where the specific property in use
9
//! is not known at compile time.
10
//!
11
//! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working
12
//! with properties at runtime tailored for the use case of ECMA262-compatible regex engines.
13
14
#[cfg(doc)]
15
use super::{maps, script, GeneralCategory, GeneralCategoryGroup, Script};
16
17
/// This type can represent any Unicode property.
18
///
19
/// This is intended to be used in situations where the exact Unicode property needed is
20
/// only known at runtime, for example in regex engines.
21
///
22
/// The values are intended to be identical to those of ICU4C's `UProperty` enum.
23
#[allow(clippy::exhaustive_structs)] // newtype
24
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
25
pub struct UnicodeProperty(pub u32);
26
27
#[allow(non_upper_case_globals)]
28
#[allow(unused)] // experimental, may be made public later
29
impl UnicodeProperty {
30
    /// Binary property `Alphabetic`
31
    pub const Alphabetic: Self = UnicodeProperty(0);
32
    /// Binary property `ASCII_Hex_Digit`
33
    pub const AsciiHexDigit: Self = UnicodeProperty(1);
34
    /// Binary property `Bidi_Control`
35
    pub const BidiControl: Self = UnicodeProperty(2);
36
    /// Binary property `Bidi_Mirrored`
37
    pub const BidiMirrored: Self = UnicodeProperty(3);
38
    /// Binary property `Dash`
39
    pub const Dash: Self = UnicodeProperty(4);
40
    /// Binary property `Default_Ignorable_Code_Point`
41
    pub const DefaultIgnorableCodePoint: Self = UnicodeProperty(5);
42
    /// Binary property `Deprecated`
43
    pub const Deprecated: Self = UnicodeProperty(6);
44
    /// Binary property `Diacritic`
45
    pub const Diacritic: Self = UnicodeProperty(7);
46
    /// Binary property `Extender`
47
    pub const Extender: Self = UnicodeProperty(8);
48
    /// Binary property `Full_Composition_Exclusion`
49
    pub const FullCompositionExclusion: Self = UnicodeProperty(9);
50
    /// Binary property `Grapheme_Base`
51
    pub const GraphemeBase: Self = UnicodeProperty(10);
52
    /// Binary property `Grapheme_Extend`
53
    pub const GraphemeExtend: Self = UnicodeProperty(11);
54
    /// Binary property `Grapheme_Link`
55
    pub const GraphemeLink: Self = UnicodeProperty(12);
56
    /// Binary property `Hex_Digit`
57
    pub const HexDigit: Self = UnicodeProperty(13);
58
    /// Binary property `Hyphen`
59
    pub const Hyphen: Self = UnicodeProperty(14);
60
    /// Binary property `ID_Continue`
61
    pub const IdContinue: Self = UnicodeProperty(15);
62
    /// Binary property `ID_Start`
63
    pub const IdStart: Self = UnicodeProperty(16);
64
    /// Binary property `Ideographic`
65
    pub const Ideographic: Self = UnicodeProperty(17);
66
    /// Binary property `IDS_Binary_Operator`
67
    pub const IdsBinaryOperator: Self = UnicodeProperty(18);
68
    /// Binary property `IDS_Trinary_Operator`
69
    pub const IdsTrinaryOperator: Self = UnicodeProperty(19);
70
    /// Binary property `Join_Control`
71
    pub const JoinControl: Self = UnicodeProperty(20);
72
    /// Binary property `Logical_Order_Exception`
73
    pub const LogicalOrderException: Self = UnicodeProperty(21);
74
    /// Binary property `Lowercase`
75
    pub const Lowercase: Self = UnicodeProperty(22);
76
    /// Binary property `Math`
77
    pub const Math: Self = UnicodeProperty(23);
78
    /// Binary property `Noncharacter_Code_Point`
79
    pub const NoncharacterCodePoint: Self = UnicodeProperty(24);
80
    /// Binary property `Quotation_Mark`
81
    pub const QuotationMark: Self = UnicodeProperty(25);
82
    /// Binary property `Radical`
83
    pub const Radical: Self = UnicodeProperty(26);
84
    /// Binary property `Soft_Dotted`
85
    pub const SoftDotted: Self = UnicodeProperty(27);
86
    /// Binary property `Terminal_Punctuation`
87
    pub const TerminalPunctuation: Self = UnicodeProperty(28);
88
    /// Binary property `Unified_Ideograph`
89
    pub const UnifiedIdeograph: Self = UnicodeProperty(29);
90
    /// Binary property `Uppercase`
91
    pub const Uppercase: Self = UnicodeProperty(30);
92
    /// Binary property `White_Space`
93
    pub const WhiteSpace: Self = UnicodeProperty(31);
94
    /// Binary property `XID_Continue`
95
    pub const XidContinue: Self = UnicodeProperty(32);
96
    /// Binary property `XID_Start`
97
    pub const XidStart: Self = UnicodeProperty(33);
98
    /// Binary property `Case_Sensitive`
99
    pub const CaseSensitive: Self = UnicodeProperty(34);
100
    /// Binary property `Sentence_Terminal`
101
    pub const SentenceTerminal: Self = UnicodeProperty(35);
102
    /// Binary property `Variation_Selector`
103
    pub const VariationSelector: Self = UnicodeProperty(36);
104
    /// Binary property `NFD_Inert`
105
    pub const NfdInert: Self = UnicodeProperty(37);
106
    /// Binary property `NFKD_Inert`
107
    pub const NfkdInert: Self = UnicodeProperty(38);
108
    /// Binary property `NFC_Inert`
109
    pub const NfcInert: Self = UnicodeProperty(39);
110
    /// Binary property `NFKC_Inert`
111
    pub const NfkcInert: Self = UnicodeProperty(40);
112
    /// Binary property `Segment_Starter`
113
    pub const SegmentStarter: Self = UnicodeProperty(41);
114
    /// Binary property `Pattern_Syntax`
115
    pub const PatternSyntax: Self = UnicodeProperty(42);
116
    /// Binary property `Pattern_White_Space`
117
    pub const PatternWhiteSpace: Self = UnicodeProperty(43);
118
    /// Binary property `alnum`
119
    pub const Alnum: Self = UnicodeProperty(44);
120
    /// Binary property `blank`
121
    pub const Blank: Self = UnicodeProperty(45);
122
    /// Binary property `graph`
123
    pub const Graph: Self = UnicodeProperty(46);
124
    /// Binary property `print`
125
    pub const Print: Self = UnicodeProperty(47);
126
    /// Binary property `xdigit`
127
    pub const XDigit: Self = UnicodeProperty(48);
128
    /// Binary property `Cased`
129
    pub const Cased: Self = UnicodeProperty(49);
130
    /// Binary property `Case_Ignorable`
131
    pub const CaseIgnorable: Self = UnicodeProperty(50);
132
    /// Binary property `Changes_When_Lowercased`
133
    pub const ChangesWhenLowercased: Self = UnicodeProperty(51);
134
    /// Binary property `Changes_When_Uppercased`
135
    pub const ChangesWhenUppercased: Self = UnicodeProperty(52);
136
    /// Binary property `Changes_When_Titlecased`
137
    pub const ChangesWhenTitlecased: Self = UnicodeProperty(53);
138
    /// Binary property `Changes_When_Casefolded`
139
    pub const ChangesWhenCasefolded: Self = UnicodeProperty(54);
140
    /// Binary property `Changes_When_Casemapped`
141
    pub const ChangesWhenCasemapped: Self = UnicodeProperty(55);
142
    /// Binary property `Changes_When_NFKC_Casefolded`
143
    pub const ChangesWhenNfkcCasefolded: Self = UnicodeProperty(56);
144
    /// Binary property `Emoji`
145
    pub const Emoji: Self = UnicodeProperty(57);
146
    /// Binary property `Emoji_Presentation`
147
    pub const EmojiPresentation: Self = UnicodeProperty(58);
148
    /// Binary property `Emoji_Modifier`
149
    pub const EmojiModifier: Self = UnicodeProperty(59);
150
    /// Binary property `Emoji_Modifier_Base`
151
    pub const EmojiModifierBase: Self = UnicodeProperty(60);
152
    /// Binary property `Emoji_Component`
153
    pub const EmojiComponent: Self = UnicodeProperty(61);
154
    /// Binary property `Regional_Indicator`
155
    pub const RegionalIndicator: Self = UnicodeProperty(62);
156
    /// Binary property `Prepended_Concatenation_Mark`
157
    pub const PrependedConcatenationMark: Self = UnicodeProperty(63);
158
    /// Binary property `Extended_Pictographic`
159
    pub const ExtendedPictographic: Self = UnicodeProperty(64);
160
    /// Binary property `Basic_Emoji`
161
    pub const BasicEmoji: Self = UnicodeProperty(65);
162
    /// Binary property `Emoji_Keycap_Sequence`
163
    pub const EmojiKeycapSequence: Self = UnicodeProperty(66);
164
    /// Binary property `RGI_Emoji_Modifier_Sequence`
165
    pub const RgiEmojiModifierSequence: Self = UnicodeProperty(67);
166
    /// Binary property `RGI_Emoji_Flag_Sequence`
167
    pub const RgiEmojiFlagSequence: Self = UnicodeProperty(68);
168
    /// Binary property `RGI_Emoji_Tag_Sequence`
169
    pub const RgiEmojiTagSequence: Self = UnicodeProperty(69);
170
    /// Binary property `RGI_Emoji_ZWJ_Sequence`
171
    pub const RgiEmojiZWJSequence: Self = UnicodeProperty(70);
172
    /// Binary property `RGI_Emoji`
173
    pub const RgiEmoji: Self = UnicodeProperty(71);
174
175
    const BINARY_MAX: Self = Self::RgiEmoji;
176
177
    /// Enumerated property `Bidi_Class`
178
    pub const BidiClass: Self = UnicodeProperty(0x1000);
179
    /// Enumerated property `Block`
180
    pub const Block: Self = UnicodeProperty(0x1001);
181
    /// Enumerated property `Canonical_Combining_Class`
182
    pub const CombiningClass: Self = UnicodeProperty(0x1002);
183
    /// Enumerated property `Decomposition_Type`
184
    pub const DecompositionType: Self = UnicodeProperty(0x1003);
185
    /// Enumerated property `East_Asian_Width`
186
    pub const EastAsianWidth: Self = UnicodeProperty(0x1004);
187
    /// Enumerated property `General_Category`
188
    pub const GeneralCategory: Self = UnicodeProperty(0x1005);
189
    /// Enumerated property `Joining_Group`
190
    pub const JoiningGroup: Self = UnicodeProperty(0x1006);
191
    /// Enumerated property `Joining_Type`
192
    pub const JoiningType: Self = UnicodeProperty(0x1007);
193
    /// Enumerated property `Line_Break`
194
    pub const LineBreak: Self = UnicodeProperty(0x1008);
195
    /// Enumerated property `Numeric_Type`
196
    pub const NumericType: Self = UnicodeProperty(0x1009);
197
    /// Enumerated property `Script`
198
    pub const Script: Self = UnicodeProperty(0x100A);
199
    /// Enumerated property `Hangul_Syllable_Type`
200
    pub const HangulSyllableType: Self = UnicodeProperty(0x100B);
201
    /// Enumerated property `NFD_Quick_Check`
202
    pub const NFDQuickCheck: Self = UnicodeProperty(0x100C);
203
    /// Enumerated property `NFKD_Quick_Check`
204
    pub const NFKDQuickCheck: Self = UnicodeProperty(0x100D);
205
    /// Enumerated property `NFC_Quick_Check`
206
    pub const NFCQuickCheck: Self = UnicodeProperty(0x100E);
207
    /// Enumerated property `NFKC_Quick_Check`
208
    pub const NFKCQuickCheck: Self = UnicodeProperty(0x100F);
209
    /// Enumerated property `Lead_Canonical_Combining_Class`
210
    pub const LeadCanonicalCombiningClass: Self = UnicodeProperty(0x1010);
211
    /// Enumerated property `Trail_Canonical_Combining_Class`
212
    pub const TrailCanonicalCombiningClass: Self = UnicodeProperty(0x1011);
213
    /// Enumerated property `Grapheme_Cluster_Break`
214
    pub const GraphemeClusterBreak: Self = UnicodeProperty(0x1012);
215
    /// Enumerated property `Sentence_Break`
216
    pub const SentenceBreak: Self = UnicodeProperty(0x1013);
217
    /// Enumerated property `Word_Break`
218
    pub const WordBreak: Self = UnicodeProperty(0x1014);
219
    /// Enumerated property `Bidi_Paired_Bracket_Type`
220
    pub const BidiPairedBracketType: Self = UnicodeProperty(0x1015);
221
    /// Enumerated property `Indic_Positional_Category`
222
    pub const IndicPositionalCategory: Self = UnicodeProperty(0x1016);
223
    /// Enumerated property `Indic_Syllabic_Category`
224
    pub const IndicSyllabicCategory: Self = UnicodeProperty(0x1017);
225
    /// Enumerated property `Vertical_Orientation`
226
    pub const VerticalOrientation: Self = UnicodeProperty(0x1018);
227
228
    const ENUMERATED_MAX: Self = Self::VerticalOrientation;
229
230
    /// Mask property `General_Category_Mask`
231
    pub const GeneralCategoryMask: Self = UnicodeProperty(0x2000);
232
233
    /// Double property `Numeric_Value`
234
    pub const NumericValue: Self = UnicodeProperty(0x3000);
235
236
    /// String property `Age`
237
    pub const Age: Self = UnicodeProperty(0x4000);
238
    /// String property `Bidi_Mirroring_Glyph`
239
    pub const BidiMirroringGlyph: Self = UnicodeProperty(0x4001);
240
    /// String property `Case_Folding`
241
    pub const CaseFolding: Self = UnicodeProperty(0x4002);
242
    /// String property `ISO_Comment`
243
    pub const ISOComment: Self = UnicodeProperty(0x4003);
244
    /// String property `Lowercase_Mapping`
245
    pub const LowercaseMapping: Self = UnicodeProperty(0x4004);
246
    /// String property `Name`
247
    pub const Name: Self = UnicodeProperty(0x4005);
248
    /// String property `Simple_Case_Folding`
249
    pub const SimpleCaseFolding: Self = UnicodeProperty(0x4006);
250
    /// String property `Simple_Lowercase_Mapping`
251
    pub const SimpleLowercaseMapping: Self = UnicodeProperty(0x4007);
252
    /// String property `Simple_Titlecase_Mapping`
253
    pub const SimpleTitlecaseMapping: Self = UnicodeProperty(0x4008);
254
    /// String property `Simple_Uppercase_Mapping`
255
    pub const SimpleUppercaseMapping: Self = UnicodeProperty(0x4009);
256
    /// String property `Titlecase_Mapping`
257
    pub const TitlecaseMapping: Self = UnicodeProperty(0x400A);
258
    /// String property `Unicode_1_Name`
259
    pub const Unicode1_Name: Self = UnicodeProperty(0x400B);
260
    /// String property `Uppercase_Mapping`
261
    pub const UppercaseMapping: Self = UnicodeProperty(0x400C);
262
    /// String property `Bidi_Paired_Bracket`
263
    pub const BidiPairedBracket: Self = UnicodeProperty(0x400D);
264
265
    const STRING_MAX: Self = Self::BidiPairedBracket;
266
267
    /// Misc property `Script_Extensions`
268
    pub const ScriptExtensions: Self = UnicodeProperty(0x7000);
269
}
270
271
#[allow(unused)] // experimental, may be made public later
272
impl UnicodeProperty {
273
    /// Given a property name (long, short, or alias), returns the corresponding [`UnicodeProperty`]
274
    /// value for it provided it belongs to the [subset relevant for ECMA262 regexes][subset]
275
    ///
276
    /// Returns `None` if the name does not match any of the names in this subset. Performs
277
    /// strict matching of names.
278
    ///
279
    /// If using this to implement an ECMA262-compliant regex engine, please note these caveats:
280
    ///
281
    /// - This only returns binary and enumerated properties, as well as [`Self::ScriptExtensions`].
282
    ///   Lookup can be performed sufficiently with [`Self::load_ecma262_binary_property_unstable()`],
283
    ///   [`maps::load_general_category()`], [`maps::load_script()`] and [`script::load_script_with_extensions_unstable()`].
284
    /// - This does not handle the `Any`, `Assigned`, or `ASCII` pseudoproperties, since they are not
285
    ///   defined as properties.
286
    ///    - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]`
287
    ///    - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`).
288
    ///    - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]`
289
    /// - ECMA262 regexes transparently allow `General_Category_Mask` values for `GeneralCategory`.
290
    ///   This method does not return [`Self::GeneralCategoryMask`], and instead relies on the caller to use mask-related lookup
291
    ///   functions where necessary.
292
    /// - ECMA262 regexes allow treating `General_Category` (and `gcm`) values as binary properties,
293
    ///   e.g. you can do things like `\p{Lu}` as shorthand for `\p{gc=Lu}`. This method does not do so
294
    ///   since these are property values, not properties, but you can use
295
    ///   [`GeneralCategory::get_name_to_enum_mapper()`] or [`GeneralCategoryGroup::get_name_to_enum_mapper()`]
296
    ///   to handle this.
297
    ///
298
    ///
299
    /// [subset]: https://tc39.es/ecma262/#table-nonbinary-unicode-properties
300
0
    pub fn parse_ecma262_name(name: &str) -> Option<Self> {
301
0
        let prop = match name {
302
0
            "General_Category" | "gc" => Self::GeneralCategory,
303
0
            "Script" | "sc" => Self::Script,
304
0
            "Script_Extensions" | "scx" => Self::ScriptExtensions,
305
0
            "ASCII_Hex_Digit" | "AHex" => Self::AsciiHexDigit,
306
0
            "Alphabetic" | "Alpha" => Self::Alphabetic,
307
0
            "Bidi_Control" | "Bidi_C" => Self::BidiControl,
308
0
            "Bidi_Mirrored" | "Bidi_M" => Self::BidiMirrored,
309
0
            "Case_Ignorable" | "CI" => Self::CaseIgnorable,
310
0
            "Cased" => Self::Cased,
311
0
            "Changes_When_Casefolded" | "CWCF" => Self::ChangesWhenCasefolded,
312
0
            "Changes_When_Casemapped" | "CWCM" => Self::ChangesWhenCasemapped,
313
0
            "Changes_When_Lowercased" | "CWL" => Self::ChangesWhenLowercased,
314
0
            "Changes_When_NFKC_Casefolded" | "CWKCF" => Self::ChangesWhenNfkcCasefolded,
315
0
            "Changes_When_Titlecased" | "CWT" => Self::ChangesWhenTitlecased,
316
0
            "Changes_When_Uppercased" | "CWU" => Self::ChangesWhenUppercased,
317
0
            "Dash" => Self::Dash,
318
0
            "Default_Ignorable_Code_Point" | "DI" => Self::DefaultIgnorableCodePoint,
319
0
            "Deprecated" | "Dep" => Self::Deprecated,
320
0
            "Diacritic" | "Dia" => Self::Diacritic,
321
0
            "Emoji" => Self::Emoji,
322
0
            "Emoji_Component" | "EComp" => Self::EmojiComponent,
323
0
            "Emoji_Modifier" | "EMod" => Self::EmojiModifier,
324
0
            "Emoji_Modifier_Base" | "EBase" => Self::EmojiModifierBase,
325
0
            "Emoji_Presentation" | "EPres" => Self::EmojiPresentation,
326
0
            "Extended_Pictographic" | "ExtPict" => Self::ExtendedPictographic,
327
0
            "Extender" | "Ext" => Self::Extender,
328
0
            "Grapheme_Base" | "Gr_Base" => Self::GraphemeBase,
329
0
            "Grapheme_Extend" | "Gr_Ext" => Self::GraphemeExtend,
330
0
            "Hex_Digit" | "Hex" => Self::HexDigit,
331
0
            "IDS_Binary_Operator" | "IDSB" => Self::IdsBinaryOperator,
332
0
            "IDS_Trinary_Operator" | "IDST" => Self::IdsTrinaryOperator,
333
0
            "ID_Continue" | "IDC" => Self::IdContinue,
334
0
            "ID_Start" | "IDS" => Self::IdStart,
335
0
            "Ideographic" | "Ideo" => Self::Ideographic,
336
0
            "Join_Control" | "Join_C" => Self::JoinControl,
337
0
            "Logical_Order_Exception" | "LOE" => Self::LogicalOrderException,
338
0
            "Lowercase" | "Lower" => Self::Lowercase,
339
0
            "Math" => Self::Math,
340
0
            "Noncharacter_Code_Point" | "NChar" => Self::NoncharacterCodePoint,
341
0
            "Pattern_Syntax" | "Pat_Syn" => Self::PatternSyntax,
342
0
            "Pattern_White_Space" | "Pat_WS" => Self::PatternWhiteSpace,
343
0
            "Quotation_Mark" | "QMark" => Self::QuotationMark,
344
0
            "Radical" => Self::Radical,
345
0
            "Regional_Indicator" | "RI" => Self::RegionalIndicator,
346
0
            "Sentence_Terminal" | "STerm" => Self::SentenceTerminal,
347
0
            "Soft_Dotted" | "SD" => Self::SoftDotted,
348
0
            "Terminal_Punctuation" | "Term" => Self::TerminalPunctuation,
349
0
            "Unified_Ideograph" | "UIdeo" => Self::UnifiedIdeograph,
350
0
            "Uppercase" | "Upper" => Self::Uppercase,
351
0
            "Variation_Selector" | "VS" => Self::VariationSelector,
352
0
            "White_Space" | "space" => Self::WhiteSpace,
353
0
            "XID_Continue" | "XIDC" => Self::XidContinue,
354
0
            "XID_Start" | "XIDS" => Self::XidStart,
355
0
            _ => return None,
356
        };
357
358
0
        Some(prop)
359
0
    }
360
}