Coverage Report

Created: 2021-03-22 08:29

/rust/registry/src/github.com-1ecc6299db9ec823/regex-syntax-0.6.23/src/unicode.rs
Line
Count
Source (jump to first uncovered line)
1
use std::error;
2
use std::fmt;
3
use std::result;
4
5
use hir;
6
7
/// Shorthand for results whose failure type is this module's `Error`.
pub type Result<T> = result::Result<T, Error>;

/// A table of inclusive codepoint ranges. The tables come from generated
/// files, which is why the slice carries a static lifetime.
type Range = &'static [(char, char)];

/// The ways in which a Unicode class lookup can fail.
///
/// The `Error` trait is intentionally not implemented for this type: it is
/// not exported and always gets converted into some other public error.
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved.
    PropertyNotFound,
    /// The requested property value could not be resolved.
    PropertyValueNotFound,
    /// The data backing a Perl character class is unavailable.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26
27
/// Shorthand for results whose failure type is `CaseFoldError`.
pub type FoldResult<T> = result::Result<T, CaseFoldError>;

/// The error reported when Unicode-aware simple case folding cannot be
/// carried out.
///
/// It is produced when the case mapping tables required for Unicode aware
/// case folding are unavailable, which only happens when the `unicode-case`
/// feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}

impl error::Error for CaseFoldError {}
49
50
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// It is produced when the data tables required for the Unicode aware Perl
/// character class `\w` are unavailable, which only happens when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}

impl error::Error for UnicodeWordError {}
69
70
/// Return an iterator over the equivalence class of simple case mappings
71
/// for the given codepoint. The equivalence class does not include the
72
/// given codepoint.
73
///
74
/// If the equivalence class is empty, then this returns the next scalar
75
/// value that has a non-empty equivalence class, if it exists. If no such
76
/// scalar value exists, then `None` is returned. The point of this behavior
77
/// is to permit callers to avoid calling `simple_fold` more than they need
78
/// to, since there is some cost to fetching the equivalence class.
79
///
80
/// This returns an error if the Unicode case folding tables are not available.
81
pub fn simple_fold(
82
    c: char,
83
) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
84
    #[cfg(not(feature = "unicode-case"))]
85
    fn imp(
86
        _: char,
87
    ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
88
    {
89
        use std::option::IntoIter;
90
        Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
91
    }
92
93
    #[cfg(feature = "unicode-case")]
94
    fn imp(
95
        c: char,
96
    ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
97
    {
98
        use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
99
100
        Ok(CASE_FOLDING_SIMPLE
101
            .binary_search_by_key(&c, |&(c1, _)| c1)
102
            .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c))
103
            .map_err(|i| {
104
                if i >= CASE_FOLDING_SIMPLE.len() {
105
                    None
106
                } else {
107
                    Some(CASE_FOLDING_SIMPLE[i].0)
108
                }
109
            }))
110
    }
111
112
    imp(c)
113
}
114
115
/// Returns true if and only if the given (inclusive) range contains at least
116
/// one Unicode scalar value that has a non-empty non-trivial simple case
117
/// mapping.
118
///
119
/// This function panics if `end < start`.
120
///
121
/// This returns an error if the Unicode case folding tables are not available.
122
pub fn contains_simple_case_mapping(
123
    start: char,
124
    end: char,
125
) -> FoldResult<bool> {
126
    #[cfg(not(feature = "unicode-case"))]
127
    fn imp(_: char, _: char) -> FoldResult<bool> {
128
        Err(CaseFoldError(()))
129
    }
130
131
    #[cfg(feature = "unicode-case")]
132
    fn imp(start: char, end: char) -> FoldResult<bool> {
133
        use std::cmp::Ordering;
134
        use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
135
136
        assert!(start <= end);
137
        Ok(CASE_FOLDING_SIMPLE
138
            .binary_search_by(|&(c, _)| {
139
                if start <= c && c <= end {
140
                    Ordering::Equal
141
                } else if c > end {
142
                    Ordering::Greater
143
                } else {
144
                    Ordering::Less
145
                }
146
            })
147
            .is_ok())
148
    }
149
150
    imp(start, end)
151
}
152
153
/// Describes which Unicode-defined character class to look up.
///
/// A class can be requested through a property name alone or through a
/// property name paired with a property value. The name-only form generally
/// refers to Binary properties (see UTS#44, Table 8). As a special
/// exception (see UTS#18, Section 1.2), general categories (an enumeration)
/// and scripts (a catalog) are also accepted there, as if each of their
/// possible values were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// A class for a Unicode binary property that is named by a single
    /// letter.
    OneLetter(char),
    /// A class for a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are permitted here as if they were binary
    /// properties.
    Binary(&'a str),
    /// A class of all codepoints whose property (given by `property_name`)
    /// has the given value (given by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
186
187
impl<'a> ClassQuery<'a> {
188
0
    fn canonicalize(&self) -> Result<CanonicalClassQuery> {
189
0
        match *self {
190
0
            ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
191
0
            ClassQuery::Binary(name) => self.canonical_binary(name),
192
0
            ClassQuery::ByValue { property_name, property_value } => {
193
0
                let property_name = symbolic_name_normalize(property_name);
194
0
                let property_value = symbolic_name_normalize(property_value);
195
196
0
                let canon_name = match canonical_prop(&property_name)? {
197
0
                    None => return Err(Error::PropertyNotFound),
198
0
                    Some(canon_name) => canon_name,
199
                };
200
0
                Ok(match canon_name {
201
0
                    "General_Category" => {
202
0
                        let canon = match canonical_gencat(&property_value)? {
203
0
                            None => return Err(Error::PropertyValueNotFound),
204
0
                            Some(canon) => canon,
205
0
                        };
206
0
                        CanonicalClassQuery::GeneralCategory(canon)
207
                    }
208
0
                    "Script" => {
209
0
                        let canon = match canonical_script(&property_value)? {
210
0
                            None => return Err(Error::PropertyValueNotFound),
211
0
                            Some(canon) => canon,
212
0
                        };
213
0
                        CanonicalClassQuery::Script(canon)
214
                    }
215
                    _ => {
216
0
                        let vals = match property_values(canon_name)? {
217
0
                            None => return Err(Error::PropertyValueNotFound),
218
0
                            Some(vals) => vals,
219
                        };
220
0
                        let canon_val =
221
0
                            match canonical_value(vals, &property_value) {
222
0
                                None => {
223
0
                                    return Err(Error::PropertyValueNotFound)
224
                                }
225
0
                                Some(canon_val) => canon_val,
226
0
                            };
227
0
                        CanonicalClassQuery::ByValue {
228
0
                            property_name: canon_name,
229
0
                            property_value: canon_val,
230
0
                        }
231
                    }
232
                })
233
            }
234
        }
235
0
    }
236
237
0
    fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
238
0
        let norm = symbolic_name_normalize(name);
239
0
240
0
        // This is a special case where 'cf' refers to the 'Format' general
241
0
        // category, but where the 'cf' abbreviation is also an abbreviation
242
0
        // for the 'Case_Folding' property. But we want to treat it as
243
0
        // a general category. (Currently, we don't even support the
244
0
        // 'Case_Folding' property. But if we do in the future, users will be
245
0
        // required to spell it out.)
246
0
        if norm != "cf" {
247
0
            if let Some(canon) = canonical_prop(&norm)? {
248
0
                return Ok(CanonicalClassQuery::Binary(canon));
249
0
            }
250
0
        }
251
0
        if let Some(canon) = canonical_gencat(&norm)? {
252
0
            return Ok(CanonicalClassQuery::GeneralCategory(canon));
253
0
        }
254
0
        if let Some(canon) = canonical_script(&norm)? {
255
0
            return Ok(CanonicalClassQuery::Script(canon));
256
0
        }
257
0
        Err(Error::PropertyNotFound)
258
0
    }
259
}
260
261
/// The canonicalized counterpart of `ClassQuery`. Unlike `ClassQuery`, it
/// keeps binary properties apart from flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, given by its canonical name.
    Binary(&'static str),
    /// A general category, given by its canonical name.
    GeneralCategory(&'static str),
    /// A script, given by its canonical name.
    Script(&'static str),
    /// An arbitrary pairing of a property with one of its values, both in
    /// canonical form.
    ///
    /// By construction, the property name of ByValue is never
    /// General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The property's canonical name.
        property_name: &'static str,
        /// The property value's canonical name.
        property_value: &'static str,
    },
}
285
286
/// Looks up a Unicode class given a query. If one doesn't exist, then
287
/// `None` is returned.
288
pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> {
289
    use self::CanonicalClassQuery::*;
290
291
0
    match query.canonicalize()? {
292
0
        Binary(name) => bool_property(name),
293
0
        GeneralCategory(name) => gencat(name),
294
0
        Script(name) => script(name),
295
0
        ByValue { property_name: "Age", property_value } => {
296
0
            let mut class = hir::ClassUnicode::empty();
297
0
            for set in ages(property_value)? {
298
0
                class.union(&hir_class(set));
299
0
            }
300
0
            Ok(class)
301
        }
302
0
        ByValue { property_name: "Script_Extensions", property_value } => {
303
0
            script_extension(property_value)
304
        }
305
        ByValue {
306
0
            property_name: "Grapheme_Cluster_Break",
307
0
            property_value,
308
0
        } => gcb(property_value),
309
0
        ByValue { property_name: "Sentence_Break", property_value } => {
310
0
            sb(property_value)
311
        }
312
0
        ByValue { property_name: "Word_Break", property_value } => {
313
0
            wb(property_value)
314
        }
315
        _ => {
316
            // What else should we support?
317
0
            Err(Error::PropertyNotFound)
318
        }
319
    }
320
0
}
321
322
/// Returns a Unicode aware class for \w.
323
///
324
/// This returns an error if the data is not available for \w.
325
pub fn perl_word() -> Result<hir::ClassUnicode> {
326
    #[cfg(not(feature = "unicode-perl"))]
327
    fn imp() -> Result<hir::ClassUnicode> {
328
        Err(Error::PerlClassNotFound)
329
    }
330
331
    #[cfg(feature = "unicode-perl")]
332
    fn imp() -> Result<hir::ClassUnicode> {
333
        use unicode_tables::perl_word::PERL_WORD;
334
        Ok(hir_class(PERL_WORD))
335
    }
336
337
    imp()
338
}
339
340
/// Returns a Unicode aware class for \s.
341
///
342
/// This returns an error if the data is not available for \s.
343
pub fn perl_space() -> Result<hir::ClassUnicode> {
344
    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
345
    fn imp() -> Result<hir::ClassUnicode> {
346
        Err(Error::PerlClassNotFound)
347
    }
348
349
    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
350
    fn imp() -> Result<hir::ClassUnicode> {
351
        use unicode_tables::perl_space::WHITE_SPACE;
352
        Ok(hir_class(WHITE_SPACE))
353
    }
354
355
    #[cfg(feature = "unicode-bool")]
356
    fn imp() -> Result<hir::ClassUnicode> {
357
        use unicode_tables::property_bool::WHITE_SPACE;
358
        Ok(hir_class(WHITE_SPACE))
359
    }
360
361
    imp()
362
}
363
364
/// Returns a Unicode aware class for \d.
365
///
366
/// This returns an error if the data is not available for \d.
367
pub fn perl_digit() -> Result<hir::ClassUnicode> {
368
    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
369
    fn imp() -> Result<hir::ClassUnicode> {
370
        Err(Error::PerlClassNotFound)
371
    }
372
373
    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
374
    fn imp() -> Result<hir::ClassUnicode> {
375
        use unicode_tables::perl_decimal::DECIMAL_NUMBER;
376
        Ok(hir_class(DECIMAL_NUMBER))
377
    }
378
379
    #[cfg(feature = "unicode-gencat")]
380
    fn imp() -> Result<hir::ClassUnicode> {
381
        use unicode_tables::general_category::DECIMAL_NUMBER;
382
        Ok(hir_class(DECIMAL_NUMBER))
383
    }
384
385
    imp()
386
}
387
388
/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
389
0
pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
390
0
    let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
391
0
        .iter()
392
0
        .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
393
0
        .collect();
394
0
    hir::ClassUnicode::new(hir_ranges)
395
0
}
396
397
/// Returns true only if the given codepoint is in the `\w` character class.
398
///
399
/// If the `unicode-perl` feature is not enabled, then this returns an error.
400
pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
401
    #[cfg(not(feature = "unicode-perl"))]
402
    fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
403
        Err(UnicodeWordError(()))
404
    }
405
406
    #[cfg(feature = "unicode-perl")]
407
    fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
408
        use is_word_byte;
409
        use std::cmp::Ordering;
410
        use unicode_tables::perl_word::PERL_WORD;
411
412
        if c <= 0x7F as char && is_word_byte(c as u8) {
413
            return Ok(true);
414
        }
415
        Ok(PERL_WORD
416
            .binary_search_by(|&(start, end)| {
417
                if start <= c && c <= end {
418
                    Ordering::Equal
419
                } else if start > c {
420
                    Ordering::Greater
421
                } else {
422
                    Ordering::Less
423
                }
424
            })
425
            .is_ok())
426
    }
427
428
    imp(c)
429
}
430
431
/// The table of values belonging to one specific property.
///
/// Every entry is a pair: a normalized property value first, followed by
/// the canonical property value it corresponds to.
type PropertyValues = &'static [(&'static str, &'static str)];
437
438
0
fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
439
0
    Ok(match normalized_value {
440
0
        "any" => Some("Any"),
441
0
        "assigned" => Some("Assigned"),
442
0
        "ascii" => Some("ASCII"),
443
        _ => {
444
0
            let gencats = property_values("General_Category")?.unwrap();
445
0
            canonical_value(gencats, normalized_value)
446
        }
447
    })
448
0
}
449
450
0
fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
451
0
    let scripts = property_values("Script")?.unwrap();
452
0
    Ok(canonical_value(scripts, normalized_value))
453
0
}
454
455
/// Find the canonical property name for the given normalized property name.
456
///
457
/// If no such property exists, then `None` is returned.
458
///
459
/// The normalized property name must have been normalized according to
460
/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
461
///
462
/// If the property names data is not available, then an error is returned.
463
fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
464
    #[cfg(not(any(
465
        feature = "unicode-age",
466
        feature = "unicode-bool",
467
        feature = "unicode-gencat",
468
        feature = "unicode-perl",
469
        feature = "unicode-script",
470
        feature = "unicode-segment",
471
    )))]
472
    fn imp(_: &str) -> Result<Option<&'static str>> {
473
        Err(Error::PropertyNotFound)
474
    }
475
476
    #[cfg(any(
477
        feature = "unicode-age",
478
        feature = "unicode-bool",
479
        feature = "unicode-gencat",
480
        feature = "unicode-perl",
481
        feature = "unicode-script",
482
        feature = "unicode-segment",
483
    ))]
484
    fn imp(name: &str) -> Result<Option<&'static str>> {
485
        use unicode_tables::property_names::PROPERTY_NAMES;
486
487
        Ok(PROPERTY_NAMES
488
            .binary_search_by_key(&name, |&(n, _)| n)
489
            .ok()
490
            .map(|i| PROPERTY_NAMES[i].1))
491
    }
492
493
    imp(normalized_name)
494
}
495
496
/// Find the canonical property value for the given normalized property
497
/// value.
498
///
499
/// The given property values should correspond to the values for the property
500
/// under question, which can be found using `property_values`.
501
///
502
/// If no such property value exists, then `None` is returned.
503
///
504
/// The normalized property value must have been normalized according to
505
/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
506
0
fn canonical_value(
507
0
    vals: PropertyValues,
508
0
    normalized_value: &str,
509
0
) -> Option<&'static str> {
510
0
    vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
511
0
        .ok()
512
0
        .map(|i| vals[i].1)
513
0
}
514
515
/// Return the table of property values for the given property name.
516
///
517
/// If the property values data is not available, then an error is returned.
518
fn property_values(
519
    canonical_property_name: &'static str,
520
) -> Result<Option<PropertyValues>> {
521
    #[cfg(not(any(
522
        feature = "unicode-age",
523
        feature = "unicode-bool",
524
        feature = "unicode-gencat",
525
        feature = "unicode-perl",
526
        feature = "unicode-script",
527
        feature = "unicode-segment",
528
    )))]
529
    fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
530
        Err(Error::PropertyValueNotFound)
531
    }
532
533
    #[cfg(any(
534
        feature = "unicode-age",
535
        feature = "unicode-bool",
536
        feature = "unicode-gencat",
537
        feature = "unicode-perl",
538
        feature = "unicode-script",
539
        feature = "unicode-segment",
540
    ))]
541
    fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
542
        use unicode_tables::property_values::PROPERTY_VALUES;
543
544
        Ok(PROPERTY_VALUES
545
            .binary_search_by_key(&name, |&(n, _)| n)
546
            .ok()
547
            .map(|i| PROPERTY_VALUES[i].1))
548
    }
549
550
    imp(canonical_property_name)
551
}
552
553
// This is only used in some cases, but small enough to just let it be dead
554
// instead of figuring out (and maintaining) the right set of features.
555
#[allow(dead_code)]
556
0
fn property_set(
557
0
    name_map: &'static [(&'static str, Range)],
558
0
    canonical: &'static str,
559
0
) -> Option<Range> {
560
0
    name_map
561
0
        .binary_search_by_key(&canonical, |x| x.0)
562
0
        .ok()
563
0
        .map(|i| name_map[i].1)
564
0
}
565
566
/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
567
/// of codepoints that were added in a particular revision of Unicode. The
568
/// iterator yields items in chronological order.
569
///
570
/// If the given age value isn't valid or if the data isn't available, then an
571
/// error is returned instead.
572
fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
573
    #[cfg(not(feature = "unicode-age"))]
574
    fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
575
        use std::option::IntoIter;
576
        Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
577
    }
578
579
    #[cfg(feature = "unicode-age")]
580
    fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
581
        use unicode_tables::age;
582
583
        const AGES: &'static [(&'static str, Range)] = &[
584
            ("V1_1", age::V1_1),
585
            ("V2_0", age::V2_0),
586
            ("V2_1", age::V2_1),
587
            ("V3_0", age::V3_0),
588
            ("V3_1", age::V3_1),
589
            ("V3_2", age::V3_2),
590
            ("V4_0", age::V4_0),
591
            ("V4_1", age::V4_1),
592
            ("V5_0", age::V5_0),
593
            ("V5_1", age::V5_1),
594
            ("V5_2", age::V5_2),
595
            ("V6_0", age::V6_0),
596
            ("V6_1", age::V6_1),
597
            ("V6_2", age::V6_2),
598
            ("V6_3", age::V6_3),
599
            ("V7_0", age::V7_0),
600
            ("V8_0", age::V8_0),
601
            ("V9_0", age::V9_0),
602
            ("V10_0", age::V10_0),
603
            ("V11_0", age::V11_0),
604
            ("V12_0", age::V12_0),
605
            ("V12_1", age::V12_1),
606
            ("V13_0", age::V13_0),
607
        ];
608
        assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
609
610
        let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
611
        match pos {
612
            None => Err(Error::PropertyValueNotFound),
613
            Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)),
614
        }
615
    }
616
617
    imp(canonical_age)
618
}
619
620
/// Returns the Unicode HIR class corresponding to the given general category.
621
///
622
/// Name canonicalization is assumed to be performed by the caller.
623
///
624
/// If the given general category could not be found, or if the general
625
/// category data is not available, then an error is returned.
626
0
fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
627
0
    #[cfg(not(feature = "unicode-gencat"))]
628
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
629
0
        Err(Error::PropertyNotFound)
630
0
    }
631
0
632
0
    #[cfg(feature = "unicode-gencat")]
633
0
    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
634
0
        use unicode_tables::general_category::BY_NAME;
635
0
        match name {
636
0
            "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
637
0
            "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
638
0
            "Assigned" => {
639
0
                let mut cls = gencat("Unassigned")?;
640
0
                cls.negate();
641
0
                Ok(cls)
642
0
            }
643
0
            name => property_set(BY_NAME, name)
644
0
                .map(hir_class)
645
0
                .ok_or(Error::PropertyValueNotFound),
646
0
        }
647
0
    }
648
0
649
0
    match canonical_name {
650
0
        "Decimal_Number" => perl_digit(),
651
0
        name => imp(name),
652
    }
653
0
}
654
655
/// Returns the Unicode HIR class corresponding to the given script.
656
///
657
/// Name canonicalization is assumed to be performed by the caller.
658
///
659
/// If the given script could not be found, or if the script data is not
660
/// available, then an error is returned.
661
fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
662
    #[cfg(not(feature = "unicode-script"))]
663
    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
664
        Err(Error::PropertyNotFound)
665
    }
666
667
    #[cfg(feature = "unicode-script")]
668
    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
669
        use unicode_tables::script::BY_NAME;
670
        property_set(BY_NAME, name)
671
            .map(hir_class)
672
            .ok_or(Error::PropertyValueNotFound)
673
    }
674
675
    imp(canonical_name)
676
}
677
678
/// Returns the Unicode HIR class corresponding to the given script extension.
679
///
680
/// Name canonicalization is assumed to be performed by the caller.
681
///
682
/// If the given script extension could not be found, or if the script data is
683
/// not available, then an error is returned.
684
fn script_extension(
685
    canonical_name: &'static str,
686
) -> Result<hir::ClassUnicode> {
687
    #[cfg(not(feature = "unicode-script"))]
688
    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
689
        Err(Error::PropertyNotFound)
690
    }
691
692
    #[cfg(feature = "unicode-script")]
693
    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
694
        use unicode_tables::script_extension::BY_NAME;
695
        property_set(BY_NAME, name)
696
            .map(hir_class)
697
            .ok_or(Error::PropertyValueNotFound)
698
    }
699
700
    imp(canonical_name)
701
}
702
703
/// Returns the Unicode HIR class corresponding to the given Unicode boolean
704
/// property.
705
///
706
/// Name canonicalization is assumed to be performed by the caller.
707
///
708
/// If the given boolean property could not be found, or if the boolean
709
/// property data is not available, then an error is returned.
710
0
fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
711
0
    #[cfg(not(feature = "unicode-bool"))]
712
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
713
0
        Err(Error::PropertyNotFound)
714
0
    }
715
0
716
0
    #[cfg(feature = "unicode-bool")]
717
0
    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
718
0
        use unicode_tables::property_bool::BY_NAME;
719
0
        property_set(BY_NAME, name)
720
0
            .map(hir_class)
721
0
            .ok_or(Error::PropertyNotFound)
722
0
    }
723
0
724
0
    match canonical_name {
725
0
        "Decimal_Number" => perl_digit(),
726
0
        "White_Space" => perl_space(),
727
0
        name => imp(name),
728
    }
729
0
}
730
731
/// Returns the Unicode HIR class corresponding to the given grapheme cluster
732
/// break property.
733
///
734
/// Name canonicalization is assumed to be performed by the caller.
735
///
736
/// If the given property could not be found, or if the corresponding data is
737
/// not available, then an error is returned.
738
fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
739
    #[cfg(not(feature = "unicode-segment"))]
740
    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
741
        Err(Error::PropertyNotFound)
742
    }
743
744
    #[cfg(feature = "unicode-segment")]
745
    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
746
        use unicode_tables::grapheme_cluster_break::BY_NAME;
747
        property_set(BY_NAME, name)
748
            .map(hir_class)
749
            .ok_or(Error::PropertyValueNotFound)
750
    }
751
752
    imp(canonical_name)
753
}
754
755
/// Returns the Unicode HIR class corresponding to the given word break
756
/// property.
757
///
758
/// Name canonicalization is assumed to be performed by the caller.
759
///
760
/// If the given property could not be found, or if the corresponding data is
761
/// not available, then an error is returned.
762
fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
763
    #[cfg(not(feature = "unicode-segment"))]
764
    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
765
        Err(Error::PropertyNotFound)
766
    }
767
768
    #[cfg(feature = "unicode-segment")]
769
    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
770
        use unicode_tables::word_break::BY_NAME;
771
        property_set(BY_NAME, name)
772
            .map(hir_class)
773
            .ok_or(Error::PropertyValueNotFound)
774
    }
775
776
    imp(canonical_name)
777
}
778
779
/// Returns the Unicode HIR class corresponding to the given sentence
780
/// break property.
781
///
782
/// Name canonicalization is assumed to be performed by the caller.
783
///
784
/// If the given property could not be found, or if the corresponding data is
785
/// not available, then an error is returned.
786
fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
787
    #[cfg(not(feature = "unicode-segment"))]
788
    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
789
        Err(Error::PropertyNotFound)
790
    }
791
792
    #[cfg(feature = "unicode-segment")]
793
    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
794
        use unicode_tables::sentence_break::BY_NAME;
795
        property_set(BY_NAME, name)
796
            .map(hir_class)
797
            .ok_or(Error::PropertyValueNotFound)
798
    }
799
800
    imp(canonical_name)
801
}
802
803
/// Like symbolic_name_normalize_bytes, but operates on a string.
804
fn symbolic_name_normalize(x: &str) -> String {
805
    let mut tmp = x.as_bytes().to_vec();
806
    let len = symbolic_name_normalize_bytes(&mut tmp).len();
807
    tmp.truncate(len);
808
    // This should always succeed because `symbolic_name_normalize_bytes`
809
    // guarantees that `&tmp[..len]` is always valid UTF-8.
810
    //
811
    // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
812
    // to be worth skipping the additional safety check. A benchmark must
813
    // justify it first.
814
    String::from_utf8(tmp).unwrap()
815
}
816
817
/// Normalizes a symbolic name in place following UAX44-LM3.
///
/// "Symbolic names" are things like property names and property value
/// aliases. This must not be applied to property string values.
///
/// The normalized name is compacted to the front of `slice` and returned as
/// a sub-slice. For every possible input, the returned bytes are valid
/// UTF-8 (in fact, pure ASCII).
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // The standard does not appear to spell out a structure for property
    // names/aliases (unlike character names), so we assume ASCII only and
    // discard everything else.

    // A leading "is" (in any ASCII case) is ignored.
    let has_is_prefix =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first_read = if has_is_prefix { 2 } else { 0 };

    let mut write_at = 0;
    for read_at in first_read..slice.len() {
        // VALIDITY ARGUMENT: only ASCII bytes are ever written back, and
        // every non-ASCII byte is dropped, so the compacted prefix is always
        // valid UTF-8.
        match slice[read_at] {
            // Separators are skipped entirely.
            b' ' | b'_' | b'-' => {}
            // Lowercasing only affects 'A'..='Z'; other ASCII is unchanged.
            byte if byte.is_ascii() => {
                slice[write_at] = byte.to_ascii_lowercase();
                write_at += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }

    // Special case: ISO_Comment is abbreviated 'isc'. Because 'is' prefixes
    // are stripped, 'isc' would collapse to 'c' and thereby alias 'c' to
    // 'ISO_Comment', even though 'c' is really an alias for the 'Other'
    // general category. Restore the full 'isc' in that situation. The slice
    // is at least three bytes long here: two for the prefix plus the byte
    // that produced the 'c'.
    if has_is_prefix && write_at == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        write_at = 3;
    }
    &mut slice[..write_at]
}
871
872
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Unwraps both layers of `simple_fold`, yielding the fold iterator.
    /// Panics if the tables are unavailable or `c` has no case mapping.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Unwraps the outer layer of `simple_fold` and returns the value carried
    /// by the inner `Err`. Panics if an iterator is returned instead.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwrapping shorthand for `contains_simple_case_mapping`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN looks just like ASCII 'K' and is easily lost to
        // compatibility normalization when this file is copied around, so it
        // is spelled as an escape to keep the test unambiguous.
        const KELVIN_SIGN: char = '\u{212A}';

        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN_SIGN).collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        // 'isc' must survive the "is" prefix stripping.
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // A stray non-ASCII byte must be dropped so the output stays valid
        // UTF-8.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}