Coverage Report

Created: 2025-10-31 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-syntax-0.8.5/src/unicode.rs
Line
Count
Source
1
use alloc::{
2
    string::{String, ToString},
3
    vec::Vec,
4
};
5
6
use crate::hir;
7
8
/// A table of inclusive `(start, end)` codepoint ranges.
///
/// These tables come from generated files, which is why the slice carries
/// the `'static` lifetime.
type Range = &'static [(char, char)];
11
12
/// The ways in which a Unicode lookup in this module can fail.
///
/// The `Error` trait is deliberately not implemented: values of this type
/// are always converted into other, public error types, and this type itself
/// is not exported.
#[derive(Debug)]
pub enum Error {
    /// A property could not be resolved (unknown name, or its data is not
    /// available).
    PropertyNotFound,
    /// A property value could not be resolved (unknown value, or its data is
    /// not available).
    PropertyValueNotFound,
    /// The data backing a Perl class (`\w`, `\s` or `\d`) is not available.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
24
25
/// The error reported when Unicode-aware simple case folding cannot be done.
///
/// It arises when the case mapping tables that Unicode-aware case folding
/// relies on are unavailable, which only happens when the `unicode-case`
/// feature is turned off. (That feature is on by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated parts, so write it out directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
45
46
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// It arises when the data tables behind the Unicode-aware Perl character
/// class `\w` are unavailable, which only happens when the `unicode-perl`
/// feature is turned off. (That feature is on by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated parts, so write it out directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
66
67
/// A stateful cursor over the simple case folding table.
///
/// Build one with `SimpleCaseFolder::new()`, which fails when the underlying
/// case folding table is unavailable.
///
/// Once built, callers are expected to invoke `SimpleCaseFolder::mapping`
/// with codepoints in strictly increasing order. Looking up `b` and then `a`,
/// for instance, is illegal and panics.
///
/// The point of this type is to make mapping lookups cheap by taking
/// advantage of how the table is laid out; the ordering requirement is what
/// makes that possible.
#[derive(Debug)]
pub struct SimpleCaseFolder {
    /// The simple case fold table: a sorted association list. Each key is a
    /// Unicode scalar value and each value is that key's equivalence class
    /// (excluding the key itself) of "simple" case folded scalar values.
    table: &'static [(char, &'static [char])],
    /// The codepoint most recently passed to a lookup, if any.
    last: Option<char>,
    /// Index of the `table` entry holding the smallest key `k` with
    /// `k > k0`, where `k0` is the most recently looked-up key. Note that
    /// `k0` itself need not be present in the table!
    next: usize,
}
94
95
impl SimpleCaseFolder {
96
    /// Create a new simple case folder, returning an error if the underlying
97
    /// case folding table is unavailable.
98
0
    pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> {
99
        #[cfg(not(feature = "unicode-case"))]
100
        {
101
0
            Err(CaseFoldError(()))
102
        }
103
        #[cfg(feature = "unicode-case")]
104
        {
105
            Ok(SimpleCaseFolder {
106
                table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE,
107
                last: None,
108
                next: 0,
109
            })
110
        }
111
0
    }
112
113
    /// Return the equivalence class of case folded codepoints for the given
114
    /// codepoint. The equivalence class returned never includes the codepoint
115
    /// given. If the given codepoint has no case folded codepoints (i.e.,
116
    /// no entry in the underlying case folding table), then this returns an
117
    /// empty slice.
118
    ///
119
    /// # Panics
120
    ///
121
    /// This panics when called with a `c` that is less than or equal to the
122
    /// previous call. In other words, callers need to use this method with
123
    /// strictly increasing values of `c`.
124
0
    pub fn mapping(&mut self, c: char) -> &'static [char] {
125
0
        if let Some(last) = self.last {
126
0
            assert!(
127
0
                last < c,
128
0
                "got codepoint U+{:X} which occurs before \
129
0
                 last codepoint U+{:X}",
130
0
                u32::from(c),
131
0
                u32::from(last),
132
            );
133
0
        }
134
0
        self.last = Some(c);
135
0
        if self.next >= self.table.len() {
136
0
            return &[];
137
0
        }
138
0
        let (k, v) = self.table[self.next];
139
0
        if k == c {
140
0
            self.next += 1;
141
0
            return v;
142
0
        }
143
0
        match self.get(c) {
144
0
            Err(i) => {
145
0
                self.next = i;
146
0
                &[]
147
            }
148
0
            Ok(i) => {
149
                // Since we require lookups to proceed
150
                // in order, anything we find should be
151
                // after whatever we thought might be
152
                // next. Otherwise, the caller is either
153
                // going out of order or we would have
154
                // found our next key at 'self.next'.
155
0
                assert!(i > self.next);
156
0
                self.next = i + 1;
157
0
                self.table[i].1
158
            }
159
        }
160
0
    }
161
162
    /// Returns true if and only if the given range overlaps with any region
163
    /// of the underlying case folding table. That is, when true, there exists
164
    /// at least one codepoint in the inclusive range `[start, end]` that has
165
    /// a non-trivial equivalence class of case folded codepoints. Conversely,
166
    /// when this returns false, all codepoints in the range `[start, end]`
167
    /// correspond to the trivial equivalence class of case folded codepoints,
168
    /// i.e., itself.
169
    ///
170
    /// This is useful to call before iterating over the codepoints in the
171
    /// range and looking up the mapping for each. If you know none of the
172
    /// mappings will return anything, then you might be able to skip doing it
173
    /// altogether.
174
    ///
175
    /// # Panics
176
    ///
177
    /// This panics when `end < start`.
178
0
    pub fn overlaps(&self, start: char, end: char) -> bool {
179
        use core::cmp::Ordering;
180
181
0
        assert!(start <= end);
182
0
        self.table
183
0
            .binary_search_by(|&(c, _)| {
184
0
                if start <= c && c <= end {
185
0
                    Ordering::Equal
186
0
                } else if c > end {
187
0
                    Ordering::Greater
188
                } else {
189
0
                    Ordering::Less
190
                }
191
0
            })
192
0
            .is_ok()
193
0
    }
194
195
    /// Returns the index at which `c` occurs in the simple case fold table. If
196
    /// `c` does not occur, then this returns an `i` such that `table[i-1].0 <
197
    /// c` and `table[i].0 > c`.
198
0
    fn get(&self, c: char) -> Result<usize, usize> {
199
0
        self.table.binary_search_by_key(&c, |&(c1, _)| c1)
200
0
    }
201
}
202
203
/// A request for a character class defined by Unicode. A class may be asked
/// for by property name alone or by a property name/value pair. The
/// name-only form generally denotes Binary properties (see UTS#44, Table 8),
/// but by special exception (see UTS#18, Section 1.2) general categories
/// (an enumeration) and scripts (a catalog) are accepted as though each of
/// their values were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Ask for the class of a Unicode binary property named by a single
    /// letter.
    OneLetter(char),
    /// Ask for the class of a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values may also be given here, as though they were
    /// binary properties.
    Binary(&'a str),
    /// Ask for the class of all codepoints whose property (named by
    /// `property_name`) has the given value (named by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
236
237
impl<'a> ClassQuery<'a> {
238
0
    fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> {
239
0
        match *self {
240
0
            ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
241
0
            ClassQuery::Binary(name) => self.canonical_binary(name),
242
0
            ClassQuery::ByValue { property_name, property_value } => {
243
0
                let property_name = symbolic_name_normalize(property_name);
244
0
                let property_value = symbolic_name_normalize(property_value);
245
246
0
                let canon_name = match canonical_prop(&property_name)? {
247
0
                    None => return Err(Error::PropertyNotFound),
248
0
                    Some(canon_name) => canon_name,
249
                };
250
0
                Ok(match canon_name {
251
0
                    "General_Category" => {
252
0
                        let canon = match canonical_gencat(&property_value)? {
253
0
                            None => return Err(Error::PropertyValueNotFound),
254
0
                            Some(canon) => canon,
255
                        };
256
0
                        CanonicalClassQuery::GeneralCategory(canon)
257
                    }
258
0
                    "Script" => {
259
0
                        let canon = match canonical_script(&property_value)? {
260
0
                            None => return Err(Error::PropertyValueNotFound),
261
0
                            Some(canon) => canon,
262
                        };
263
0
                        CanonicalClassQuery::Script(canon)
264
                    }
265
                    _ => {
266
0
                        let vals = match property_values(canon_name)? {
267
0
                            None => return Err(Error::PropertyValueNotFound),
268
0
                            Some(vals) => vals,
269
                        };
270
0
                        let canon_val =
271
0
                            match canonical_value(vals, &property_value) {
272
                                None => {
273
0
                                    return Err(Error::PropertyValueNotFound)
274
                                }
275
0
                                Some(canon_val) => canon_val,
276
                            };
277
0
                        CanonicalClassQuery::ByValue {
278
0
                            property_name: canon_name,
279
0
                            property_value: canon_val,
280
0
                        }
281
                    }
282
                })
283
            }
284
        }
285
0
    }
286
287
0
    fn canonical_binary(
288
0
        &self,
289
0
        name: &str,
290
0
    ) -> Result<CanonicalClassQuery, Error> {
291
0
        let norm = symbolic_name_normalize(name);
292
293
        // This is a special case where 'cf' refers to the 'Format' general
294
        // category, but where the 'cf' abbreviation is also an abbreviation
295
        // for the 'Case_Folding' property. But we want to treat it as
296
        // a general category. (Currently, we don't even support the
297
        // 'Case_Folding' property. But if we do in the future, users will be
298
        // required to spell it out.)
299
        //
300
        // Also 'sc' refers to the 'Currency_Symbol' general category, but is
301
        // also the abbreviation for the 'Script' property. So we avoid calling
302
        // 'canonical_prop' for it too, which would erroneously normalize it
303
        // to 'Script'.
304
        //
305
        // Another case: 'lc' is an abbreviation for the 'Cased_Letter'
306
        // general category, but is also an abbreviation for the 'Lowercase_Mapping'
307
        // property. We don't currently support the latter, so as with 'cf'
308
        // above, we treat 'lc' as 'Cased_Letter'.
309
0
        if norm != "cf" && norm != "sc" && norm != "lc" {
310
0
            if let Some(canon) = canonical_prop(&norm)? {
311
0
                return Ok(CanonicalClassQuery::Binary(canon));
312
0
            }
313
0
        }
314
0
        if let Some(canon) = canonical_gencat(&norm)? {
315
0
            return Ok(CanonicalClassQuery::GeneralCategory(canon));
316
0
        }
317
0
        if let Some(canon) = canonical_script(&norm)? {
318
0
            return Ok(CanonicalClassQuery::Script(canon));
319
0
        }
320
0
        Err(Error::PropertyNotFound)
321
0
    }
322
}
323
324
/// The canonicalized counterpart of `ClassQuery`. Besides holding canonical
/// names, it tells binary properties apart from the general categories and
/// scripts that were flattened into the same query syntax.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, by canonical name.
    Binary(&'static str),
    /// A general category, by canonical name.
    GeneralCategory(&'static str),
    /// A script, by canonical name.
    Script(&'static str),
    /// Any other property/value pairing, with both parts canonicalized.
    ///
    /// By construction the property name here is never General_Category or
    /// Script; those two are covered by the variants of the same name.
    ByValue {
        /// The property's canonical name.
        property_name: &'static str,
        /// The property value's canonical name.
        property_value: &'static str,
    },
}
348
349
/// Looks up a Unicode class given a query. If one doesn't exist, then
350
/// `None` is returned.
351
0
pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> {
352
    use self::CanonicalClassQuery::*;
353
354
0
    match query.canonicalize()? {
355
0
        Binary(name) => bool_property(name),
356
0
        GeneralCategory(name) => gencat(name),
357
0
        Script(name) => script(name),
358
0
        ByValue { property_name: "Age", property_value } => {
359
0
            let mut class = hir::ClassUnicode::empty();
360
0
            for set in ages(property_value)? {
361
0
                class.union(&hir_class(set));
362
0
            }
363
0
            Ok(class)
364
        }
365
0
        ByValue { property_name: "Script_Extensions", property_value } => {
366
0
            script_extension(property_value)
367
        }
368
        ByValue {
369
0
            property_name: "Grapheme_Cluster_Break",
370
0
            property_value,
371
0
        } => gcb(property_value),
372
0
        ByValue { property_name: "Sentence_Break", property_value } => {
373
0
            sb(property_value)
374
        }
375
0
        ByValue { property_name: "Word_Break", property_value } => {
376
0
            wb(property_value)
377
        }
378
        _ => {
379
            // What else should we support?
380
0
            Err(Error::PropertyNotFound)
381
        }
382
    }
383
0
}
384
385
/// Returns a Unicode aware class for \w.
386
///
387
/// This returns an error if the data is not available for \w.
388
0
pub fn perl_word() -> Result<hir::ClassUnicode, Error> {
389
    #[cfg(not(feature = "unicode-perl"))]
390
0
    fn imp() -> Result<hir::ClassUnicode, Error> {
391
0
        Err(Error::PerlClassNotFound)
392
0
    }
393
394
    #[cfg(feature = "unicode-perl")]
395
    fn imp() -> Result<hir::ClassUnicode, Error> {
396
        use crate::unicode_tables::perl_word::PERL_WORD;
397
        Ok(hir_class(PERL_WORD))
398
    }
399
400
0
    imp()
401
0
}
402
403
/// Returns a Unicode aware class for \s.
404
///
405
/// This returns an error if the data is not available for \s.
406
0
pub fn perl_space() -> Result<hir::ClassUnicode, Error> {
407
    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
408
0
    fn imp() -> Result<hir::ClassUnicode, Error> {
409
0
        Err(Error::PerlClassNotFound)
410
0
    }
411
412
    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
413
    fn imp() -> Result<hir::ClassUnicode, Error> {
414
        use crate::unicode_tables::perl_space::WHITE_SPACE;
415
        Ok(hir_class(WHITE_SPACE))
416
    }
417
418
    #[cfg(feature = "unicode-bool")]
419
    fn imp() -> Result<hir::ClassUnicode, Error> {
420
        use crate::unicode_tables::property_bool::WHITE_SPACE;
421
        Ok(hir_class(WHITE_SPACE))
422
    }
423
424
0
    imp()
425
0
}
426
427
/// Returns a Unicode aware class for \d.
428
///
429
/// This returns an error if the data is not available for \d.
430
0
pub fn perl_digit() -> Result<hir::ClassUnicode, Error> {
431
    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
432
0
    fn imp() -> Result<hir::ClassUnicode, Error> {
433
0
        Err(Error::PerlClassNotFound)
434
0
    }
435
436
    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
437
    fn imp() -> Result<hir::ClassUnicode, Error> {
438
        use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
439
        Ok(hir_class(DECIMAL_NUMBER))
440
    }
441
442
    #[cfg(feature = "unicode-gencat")]
443
    fn imp() -> Result<hir::ClassUnicode, Error> {
444
        use crate::unicode_tables::general_category::DECIMAL_NUMBER;
445
        Ok(hir_class(DECIMAL_NUMBER))
446
    }
447
448
0
    imp()
449
0
}
450
451
/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
452
0
pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
453
0
    let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
454
0
        .iter()
455
0
        .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
456
0
        .collect();
457
0
    hir::ClassUnicode::new(hir_ranges)
458
0
}
459
460
/// Returns true only if the given codepoint is in the `\w` character class.
461
///
462
/// If the `unicode-perl` feature is not enabled, then this returns an error.
463
0
pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> {
464
    #[cfg(not(feature = "unicode-perl"))]
465
0
    fn imp(_: char) -> Result<bool, UnicodeWordError> {
466
0
        Err(UnicodeWordError(()))
467
0
    }
468
469
    #[cfg(feature = "unicode-perl")]
470
    fn imp(c: char) -> Result<bool, UnicodeWordError> {
471
        use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD};
472
473
        if u8::try_from(c).map_or(false, is_word_byte) {
474
            return Ok(true);
475
        }
476
        Ok(PERL_WORD
477
            .binary_search_by(|&(start, end)| {
478
                use core::cmp::Ordering;
479
480
                if start <= c && c <= end {
481
                    Ordering::Equal
482
                } else if start > c {
483
                    Ordering::Greater
484
                } else {
485
                    Ordering::Less
486
                }
487
            })
488
            .is_ok())
489
    }
490
491
0
    imp(c)
492
0
}
493
494
/// The table of values belonging to one particular property.
///
/// Each entry is a pair: the normalized property value first, followed by
/// the canonical property value it stands for.
type PropertyValues = &'static [(&'static str, &'static str)];
500
501
0
fn canonical_gencat(
502
0
    normalized_value: &str,
503
0
) -> Result<Option<&'static str>, Error> {
504
0
    Ok(match normalized_value {
505
0
        "any" => Some("Any"),
506
0
        "assigned" => Some("Assigned"),
507
0
        "ascii" => Some("ASCII"),
508
        _ => {
509
0
            let gencats = property_values("General_Category")?.unwrap();
510
0
            canonical_value(gencats, normalized_value)
511
        }
512
    })
513
0
}
514
515
0
fn canonical_script(
516
0
    normalized_value: &str,
517
0
) -> Result<Option<&'static str>, Error> {
518
0
    let scripts = property_values("Script")?.unwrap();
519
0
    Ok(canonical_value(scripts, normalized_value))
520
0
}
521
522
/// Find the canonical property name for the given normalized property name.
523
///
524
/// If no such property exists, then `None` is returned.
525
///
526
/// The normalized property name must have been normalized according to
527
/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
528
///
529
/// If the property names data is not available, then an error is returned.
530
0
fn canonical_prop(
531
0
    normalized_name: &str,
532
0
) -> Result<Option<&'static str>, Error> {
533
    #[cfg(not(any(
534
        feature = "unicode-age",
535
        feature = "unicode-bool",
536
        feature = "unicode-gencat",
537
        feature = "unicode-perl",
538
        feature = "unicode-script",
539
        feature = "unicode-segment",
540
    )))]
541
0
    fn imp(_: &str) -> Result<Option<&'static str>, Error> {
542
0
        Err(Error::PropertyNotFound)
543
0
    }
544
545
    #[cfg(any(
546
        feature = "unicode-age",
547
        feature = "unicode-bool",
548
        feature = "unicode-gencat",
549
        feature = "unicode-perl",
550
        feature = "unicode-script",
551
        feature = "unicode-segment",
552
    ))]
553
    fn imp(name: &str) -> Result<Option<&'static str>, Error> {
554
        use crate::unicode_tables::property_names::PROPERTY_NAMES;
555
556
        Ok(PROPERTY_NAMES
557
            .binary_search_by_key(&name, |&(n, _)| n)
558
            .ok()
559
            .map(|i| PROPERTY_NAMES[i].1))
560
    }
561
562
0
    imp(normalized_name)
563
0
}
564
565
/// Find the canonical property value for the given normalized property
566
/// value.
567
///
568
/// The given property values should correspond to the values for the property
569
/// under question, which can be found using `property_values`.
570
///
571
/// If no such property value exists, then `None` is returned.
572
///
573
/// The normalized property value must have been normalized according to
574
/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
575
0
fn canonical_value(
576
0
    vals: PropertyValues,
577
0
    normalized_value: &str,
578
0
) -> Option<&'static str> {
579
0
    vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
580
0
        .ok()
581
0
        .map(|i| vals[i].1)
582
0
}
583
584
/// Return the table of property values for the given property name.
585
///
586
/// If the property values data is not available, then an error is returned.
587
0
fn property_values(
588
0
    canonical_property_name: &'static str,
589
0
) -> Result<Option<PropertyValues>, Error> {
590
    #[cfg(not(any(
591
        feature = "unicode-age",
592
        feature = "unicode-bool",
593
        feature = "unicode-gencat",
594
        feature = "unicode-perl",
595
        feature = "unicode-script",
596
        feature = "unicode-segment",
597
    )))]
598
0
    fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> {
599
0
        Err(Error::PropertyValueNotFound)
600
0
    }
601
602
    #[cfg(any(
603
        feature = "unicode-age",
604
        feature = "unicode-bool",
605
        feature = "unicode-gencat",
606
        feature = "unicode-perl",
607
        feature = "unicode-script",
608
        feature = "unicode-segment",
609
    ))]
610
    fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> {
611
        use crate::unicode_tables::property_values::PROPERTY_VALUES;
612
613
        Ok(PROPERTY_VALUES
614
            .binary_search_by_key(&name, |&(n, _)| n)
615
            .ok()
616
            .map(|i| PROPERTY_VALUES[i].1))
617
    }
618
619
0
    imp(canonical_property_name)
620
0
}
621
622
// This is only used in some cases, but small enough to just let it be dead
623
// instead of figuring out (and maintaining) the right set of features.
624
#[allow(dead_code)]
625
0
fn property_set(
626
0
    name_map: &'static [(&'static str, Range)],
627
0
    canonical: &'static str,
628
0
) -> Option<Range> {
629
0
    name_map
630
0
        .binary_search_by_key(&canonical, |x| x.0)
631
0
        .ok()
632
0
        .map(|i| name_map[i].1)
633
0
}
634
635
/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
636
/// of codepoints that were added in a particular revision of Unicode. The
637
/// iterator yields items in chronological order.
638
///
639
/// If the given age value isn't valid or if the data isn't available, then an
640
/// error is returned instead.
641
0
fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
642
    #[cfg(not(feature = "unicode-age"))]
643
0
    fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> {
644
        use core::option::IntoIter;
645
0
        Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
646
0
    }
647
648
    #[cfg(feature = "unicode-age")]
649
    fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
650
        use crate::unicode_tables::age;
651
652
        const AGES: &[(&str, Range)] = &[
653
            ("V1_1", age::V1_1),
654
            ("V2_0", age::V2_0),
655
            ("V2_1", age::V2_1),
656
            ("V3_0", age::V3_0),
657
            ("V3_1", age::V3_1),
658
            ("V3_2", age::V3_2),
659
            ("V4_0", age::V4_0),
660
            ("V4_1", age::V4_1),
661
            ("V5_0", age::V5_0),
662
            ("V5_1", age::V5_1),
663
            ("V5_2", age::V5_2),
664
            ("V6_0", age::V6_0),
665
            ("V6_1", age::V6_1),
666
            ("V6_2", age::V6_2),
667
            ("V6_3", age::V6_3),
668
            ("V7_0", age::V7_0),
669
            ("V8_0", age::V8_0),
670
            ("V9_0", age::V9_0),
671
            ("V10_0", age::V10_0),
672
            ("V11_0", age::V11_0),
673
            ("V12_0", age::V12_0),
674
            ("V12_1", age::V12_1),
675
            ("V13_0", age::V13_0),
676
            ("V14_0", age::V14_0),
677
            ("V15_0", age::V15_0),
678
            ("V15_1", age::V15_1),
679
            ("V16_0", age::V16_0),
680
        ];
681
        assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
682
683
        let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
684
        match pos {
685
            None => Err(Error::PropertyValueNotFound),
686
            Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),
687
        }
688
    }
689
690
0
    imp(canonical_age)
691
0
}
692
693
/// Returns the Unicode HIR class corresponding to the given general category.
694
///
695
/// Name canonicalization is assumed to be performed by the caller.
696
///
697
/// If the given general category could not be found, or if the general
698
/// category data is not available, then an error is returned.
699
0
fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
700
    #[cfg(not(feature = "unicode-gencat"))]
701
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
702
0
        Err(Error::PropertyNotFound)
703
0
    }
704
705
    #[cfg(feature = "unicode-gencat")]
706
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
707
        use crate::unicode_tables::general_category::BY_NAME;
708
        match name {
709
            "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
710
            "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
711
            "Assigned" => {
712
                let mut cls = gencat("Unassigned")?;
713
                cls.negate();
714
                Ok(cls)
715
            }
716
            name => property_set(BY_NAME, name)
717
                .map(hir_class)
718
                .ok_or(Error::PropertyValueNotFound),
719
        }
720
    }
721
722
0
    match canonical_name {
723
0
        "Decimal_Number" => perl_digit(),
724
0
        name => imp(name),
725
    }
726
0
}
727
728
/// Returns the Unicode HIR class corresponding to the given script.
729
///
730
/// Name canonicalization is assumed to be performed by the caller.
731
///
732
/// If the given script could not be found, or if the script data is not
733
/// available, then an error is returned.
734
0
fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
735
    #[cfg(not(feature = "unicode-script"))]
736
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
737
0
        Err(Error::PropertyNotFound)
738
0
    }
739
740
    #[cfg(feature = "unicode-script")]
741
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
742
        use crate::unicode_tables::script::BY_NAME;
743
        property_set(BY_NAME, name)
744
            .map(hir_class)
745
            .ok_or(Error::PropertyValueNotFound)
746
    }
747
748
0
    imp(canonical_name)
749
0
}
750
751
/// Returns the Unicode HIR class corresponding to the given script extension.
752
///
753
/// Name canonicalization is assumed to be performed by the caller.
754
///
755
/// If the given script extension could not be found, or if the script data is
756
/// not available, then an error is returned.
757
0
fn script_extension(
758
0
    canonical_name: &'static str,
759
0
) -> Result<hir::ClassUnicode, Error> {
760
    #[cfg(not(feature = "unicode-script"))]
761
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
762
0
        Err(Error::PropertyNotFound)
763
0
    }
764
765
    #[cfg(feature = "unicode-script")]
766
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
767
        use crate::unicode_tables::script_extension::BY_NAME;
768
        property_set(BY_NAME, name)
769
            .map(hir_class)
770
            .ok_or(Error::PropertyValueNotFound)
771
    }
772
773
0
    imp(canonical_name)
774
0
}
775
776
/// Returns the Unicode HIR class corresponding to the given Unicode boolean
777
/// property.
778
///
779
/// Name canonicalization is assumed to be performed by the caller.
780
///
781
/// If the given boolean property could not be found, or if the boolean
782
/// property data is not available, then an error is returned.
783
0
fn bool_property(
784
0
    canonical_name: &'static str,
785
0
) -> Result<hir::ClassUnicode, Error> {
786
    #[cfg(not(feature = "unicode-bool"))]
787
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
788
0
        Err(Error::PropertyNotFound)
789
0
    }
790
791
    #[cfg(feature = "unicode-bool")]
792
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
793
        use crate::unicode_tables::property_bool::BY_NAME;
794
        property_set(BY_NAME, name)
795
            .map(hir_class)
796
            .ok_or(Error::PropertyNotFound)
797
    }
798
799
0
    match canonical_name {
800
0
        "Decimal_Number" => perl_digit(),
801
0
        "White_Space" => perl_space(),
802
0
        name => imp(name),
803
    }
804
0
}
805
806
/// Returns the Unicode HIR class corresponding to the given grapheme cluster
807
/// break property.
808
///
809
/// Name canonicalization is assumed to be performed by the caller.
810
///
811
/// If the given property could not be found, or if the corresponding data is
812
/// not available, then an error is returned.
813
0
fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
814
    #[cfg(not(feature = "unicode-segment"))]
815
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
816
0
        Err(Error::PropertyNotFound)
817
0
    }
818
819
    #[cfg(feature = "unicode-segment")]
820
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
821
        use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
822
        property_set(BY_NAME, name)
823
            .map(hir_class)
824
            .ok_or(Error::PropertyValueNotFound)
825
    }
826
827
0
    imp(canonical_name)
828
0
}
829
830
/// Returns the Unicode HIR class corresponding to the given word break
831
/// property.
832
///
833
/// Name canonicalization is assumed to be performed by the caller.
834
///
835
/// If the given property could not be found, or if the corresponding data is
836
/// not available, then an error is returned.
837
0
fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
838
    #[cfg(not(feature = "unicode-segment"))]
839
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
840
0
        Err(Error::PropertyNotFound)
841
0
    }
842
843
    #[cfg(feature = "unicode-segment")]
844
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
845
        use crate::unicode_tables::word_break::BY_NAME;
846
        property_set(BY_NAME, name)
847
            .map(hir_class)
848
            .ok_or(Error::PropertyValueNotFound)
849
    }
850
851
0
    imp(canonical_name)
852
0
}
853
854
/// Returns the Unicode HIR class corresponding to the given sentence
855
/// break property.
856
///
857
/// Name canonicalization is assumed to be performed by the caller.
858
///
859
/// If the given property could not be found, or if the corresponding data is
860
/// not available, then an error is returned.
861
0
fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
862
    #[cfg(not(feature = "unicode-segment"))]
863
0
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
864
0
        Err(Error::PropertyNotFound)
865
0
    }
866
867
    #[cfg(feature = "unicode-segment")]
868
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
869
        use crate::unicode_tables::sentence_break::BY_NAME;
870
        property_set(BY_NAME, name)
871
            .map(hir_class)
872
            .ok_or(Error::PropertyValueNotFound)
873
    }
874
875
0
    imp(canonical_name)
876
0
}
877
878
/// Like symbolic_name_normalize_bytes, but operates on a string.
879
0
fn symbolic_name_normalize(x: &str) -> String {
880
0
    let mut tmp = x.as_bytes().to_vec();
881
0
    let len = symbolic_name_normalize_bytes(&mut tmp).len();
882
0
    tmp.truncate(len);
883
    // This should always succeed because `symbolic_name_normalize_bytes`
884
    // guarantees that `&tmp[..len]` is always valid UTF-8.
885
    //
886
    // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
887
    // to be worth skipping the additional safety check. A benchmark must
888
    // justify it first.
889
0
    String::from_utf8(tmp).unwrap()
890
0
}
891
892
/// Normalizes a symbolic name in place following UAX44-LM3 and returns the
/// normalized prefix of `slice`.
///
/// Symbolic names are typically property names and property value aliases.
/// This routine should not be applied to property string values.
///
/// Normalization drops a leading "is" (in any ASCII case), removes spaces,
/// underscores and hyphens, lowercases ASCII letters and discards every
/// non-ASCII byte. Consequently the returned slice is valid UTF-8 no matter
/// what `slice` contained.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // Nothing in the standard appears to pin down the structure of property
    // names/aliases (unlike character names), so we assume they are ASCII
    // only and throw away anything that is not.

    // An "is" prefix, in any combination of ASCII case, is ignored.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for read in start..slice.len() {
        // VALIDITY ARGUMENT: only ASCII bytes are ever written back, which is
        // what makes the returned slice valid UTF-8. Every non-ASCII byte is
        // dropped from the normalized name.
        let normalized = match slice[read] {
            b' ' | b'_' | b'-' => continue,
            upper @ b'A'..=b'Z' => upper.to_ascii_lowercase(),
            ascii @ 0x00..=0x7F => ascii,
            _ => continue,
        };
        slice[next_write] = normalized;
        next_write += 1;
    }

    // Special case: ISO_Comment is abbreviated 'isc'. Stripping the "is"
    // prefix would turn that into 'c', which is an alias for the 'Other'
    // general category rather than ISO_Comment, so restore the full 'isc'.
    // (The input had the two prefix bytes plus at least one more byte, so
    // there is always room for three bytes here.)
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
946
947
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the simple case folding orbit of `c`, panicking if the case
    /// folding tables are unavailable.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
    }

    /// Reports whether any codepoint in `start..=end` has a case mapping.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new().unwrap().overlaps(start, end)
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // KELVIN SIGN renders exactly like an ASCII 'K', so it is spelled
        // with an escape here to keep the three cases below distinguishable
        // and robust against normalization or copy/paste.
        const KELVIN_SIGN: char = '\u{212A}';

        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, alloc::vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, alloc::vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN_SIGN).collect();
        assert_eq!(xs, alloc::vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, alloc::vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, alloc::vec!['a']);
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(SimpleCaseFolder::new().is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        // The "is" prefix must survive for the ISO_Comment abbreviation.
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The stray 0xFF byte is not ASCII and must be dropped so that the
        // result remains valid UTF-8.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}