/rust/registry/src/github.com-1ecc6299db9ec823/regex-syntax-0.6.23/src/unicode.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use std::error; |
2 | | use std::fmt; |
3 | | use std::result; |
4 | | |
5 | | use hir; |
6 | | |
/// A result alias for fallible Unicode class lookups. The error type is this
/// module's `Error`.
pub type Result<T> = result::Result<T, Error>;

/// A sequence of inclusive codepoint ranges, each written as a
/// `(start, end)` pair. The data comes from a generated file (hence the
/// static lifetime).
type Range = &'static [(char, char)];
13 | | |
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved, is not supported, or the
    /// tables needed to resolve it were not compiled in.
    PropertyNotFound,
    /// The property was resolved, but the requested value for it could not be
    /// found (or the tables holding its values were not compiled in).
    PropertyValueNotFound,
    /// The data backing a Perl character class (`\w`, `\s` or `\d`) is not
    /// available.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26 | | |
/// A result alias for Unicode simple case folding operations. The error type
/// is `CaseFoldError`.
pub type FoldResult<T> = result::Result<T, CaseFoldError>;
29 | | |
/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// Case folding needs generated case mapping tables. Those tables are only
/// missing when the `unicode-case` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
49 | | |
/// The error reported when the Unicode-aware `\w` class cannot be consulted.
///
/// The `\w` class needs generated data tables. Those tables are only missing
/// when the `unicode-perl` feature is disabled. (The feature is enabled by
/// default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
69 | | |
70 | | /// Return an iterator over the equivalence class of simple case mappings |
71 | | /// for the given codepoint. The equivalence class does not include the |
72 | | /// given codepoint. |
73 | | /// |
74 | | /// If the equivalence class is empty, then this returns the next scalar |
75 | | /// value that has a non-empty equivalence class, if it exists. If no such |
76 | | /// scalar value exists, then `None` is returned. The point of this behavior |
77 | | /// is to permit callers to avoid calling `simple_fold` more than they need |
78 | | /// to, since there is some cost to fetching the equivalence class. |
79 | | /// |
80 | | /// This returns an error if the Unicode case folding tables are not available. |
81 | | pub fn simple_fold( |
82 | | c: char, |
83 | | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { |
84 | | #[cfg(not(feature = "unicode-case"))] |
85 | | fn imp( |
86 | | _: char, |
87 | | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
88 | | { |
89 | | use std::option::IntoIter; |
90 | | Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) |
91 | | } |
92 | | |
93 | | #[cfg(feature = "unicode-case")] |
94 | | fn imp( |
95 | | c: char, |
96 | | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
97 | | { |
98 | | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
99 | | |
100 | | Ok(CASE_FOLDING_SIMPLE |
101 | | .binary_search_by_key(&c, |&(c1, _)| c1) |
102 | | .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) |
103 | | .map_err(|i| { |
104 | | if i >= CASE_FOLDING_SIMPLE.len() { |
105 | | None |
106 | | } else { |
107 | | Some(CASE_FOLDING_SIMPLE[i].0) |
108 | | } |
109 | | })) |
110 | | } |
111 | | |
112 | | imp(c) |
113 | | } |
114 | | |
115 | | /// Returns true if and only if the given (inclusive) range contains at least |
116 | | /// one Unicode scalar value that has a non-empty non-trivial simple case |
117 | | /// mapping. |
118 | | /// |
119 | | /// This function panics if `end < start`. |
120 | | /// |
121 | | /// This returns an error if the Unicode case folding tables are not available. |
122 | | pub fn contains_simple_case_mapping( |
123 | | start: char, |
124 | | end: char, |
125 | | ) -> FoldResult<bool> { |
126 | | #[cfg(not(feature = "unicode-case"))] |
127 | | fn imp(_: char, _: char) -> FoldResult<bool> { |
128 | | Err(CaseFoldError(())) |
129 | | } |
130 | | |
131 | | #[cfg(feature = "unicode-case")] |
132 | | fn imp(start: char, end: char) -> FoldResult<bool> { |
133 | | use std::cmp::Ordering; |
134 | | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
135 | | |
136 | | assert!(start <= end); |
137 | | Ok(CASE_FOLDING_SIMPLE |
138 | | .binary_search_by(|&(c, _)| { |
139 | | if start <= c && c <= end { |
140 | | Ordering::Equal |
141 | | } else if c > end { |
142 | | Ordering::Greater |
143 | | } else { |
144 | | Ordering::Less |
145 | | } |
146 | | }) |
147 | | .is_ok()) |
148 | | } |
149 | | |
150 | | imp(start, end) |
151 | | } |
152 | | |
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
///
/// A query is resolved into an actual class by passing it to `class`.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter.
    ///
    /// The letter is resolved exactly like a `Binary` name of length one.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name. It does not need to be normalized by the caller.
        property_name: &'a str,
        /// A property value. It does not need to be normalized by the caller.
        property_value: &'a str,
    },
}
186 | | |
187 | | impl<'a> ClassQuery<'a> { |
188 | 0 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { |
189 | 0 | match *self { |
190 | 0 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), |
191 | 0 | ClassQuery::Binary(name) => self.canonical_binary(name), |
192 | 0 | ClassQuery::ByValue { property_name, property_value } => { |
193 | 0 | let property_name = symbolic_name_normalize(property_name); |
194 | 0 | let property_value = symbolic_name_normalize(property_value); |
195 | | |
196 | 0 | let canon_name = match canonical_prop(&property_name)? { |
197 | 0 | None => return Err(Error::PropertyNotFound), |
198 | 0 | Some(canon_name) => canon_name, |
199 | | }; |
200 | 0 | Ok(match canon_name { |
201 | 0 | "General_Category" => { |
202 | 0 | let canon = match canonical_gencat(&property_value)? { |
203 | 0 | None => return Err(Error::PropertyValueNotFound), |
204 | 0 | Some(canon) => canon, |
205 | 0 | }; |
206 | 0 | CanonicalClassQuery::GeneralCategory(canon) |
207 | | } |
208 | 0 | "Script" => { |
209 | 0 | let canon = match canonical_script(&property_value)? { |
210 | 0 | None => return Err(Error::PropertyValueNotFound), |
211 | 0 | Some(canon) => canon, |
212 | 0 | }; |
213 | 0 | CanonicalClassQuery::Script(canon) |
214 | | } |
215 | | _ => { |
216 | 0 | let vals = match property_values(canon_name)? { |
217 | 0 | None => return Err(Error::PropertyValueNotFound), |
218 | 0 | Some(vals) => vals, |
219 | | }; |
220 | 0 | let canon_val = |
221 | 0 | match canonical_value(vals, &property_value) { |
222 | 0 | None => { |
223 | 0 | return Err(Error::PropertyValueNotFound) |
224 | | } |
225 | 0 | Some(canon_val) => canon_val, |
226 | 0 | }; |
227 | 0 | CanonicalClassQuery::ByValue { |
228 | 0 | property_name: canon_name, |
229 | 0 | property_value: canon_val, |
230 | 0 | } |
231 | | } |
232 | | }) |
233 | | } |
234 | | } |
235 | 0 | } |
236 | | |
237 | 0 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { |
238 | 0 | let norm = symbolic_name_normalize(name); |
239 | 0 |
|
240 | 0 | // This is a special case where 'cf' refers to the 'Format' general |
241 | 0 | // category, but where the 'cf' abbreviation is also an abbreviation |
242 | 0 | // for the 'Case_Folding' property. But we want to treat it as |
243 | 0 | // a general category. (Currently, we don't even support the |
244 | 0 | // 'Case_Folding' property. But if we do in the future, users will be |
245 | 0 | // required to spell it out.) |
246 | 0 | if norm != "cf" { |
247 | 0 | if let Some(canon) = canonical_prop(&norm)? { |
248 | 0 | return Ok(CanonicalClassQuery::Binary(canon)); |
249 | 0 | } |
250 | 0 | } |
251 | 0 | if let Some(canon) = canonical_gencat(&norm)? { |
252 | 0 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
253 | 0 | } |
254 | 0 | if let Some(canon) = canonical_script(&norm)? { |
255 | 0 | return Ok(CanonicalClassQuery::Script(canon)); |
256 | 0 | } |
257 | 0 | Err(Error::PropertyNotFound) |
258 | 0 | } |
259 | | } |
260 | | |
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
///
/// Every string held here is a `&'static str`, i.e. it comes from a static
/// table or a literal rather than from the caller's query.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
285 | | |
286 | | /// Looks up a Unicode class given a query. If one doesn't exist, then |
287 | | /// `None` is returned. |
288 | | pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> { |
289 | | use self::CanonicalClassQuery::*; |
290 | | |
291 | 0 | match query.canonicalize()? { |
292 | 0 | Binary(name) => bool_property(name), |
293 | 0 | GeneralCategory(name) => gencat(name), |
294 | 0 | Script(name) => script(name), |
295 | 0 | ByValue { property_name: "Age", property_value } => { |
296 | 0 | let mut class = hir::ClassUnicode::empty(); |
297 | 0 | for set in ages(property_value)? { |
298 | 0 | class.union(&hir_class(set)); |
299 | 0 | } |
300 | 0 | Ok(class) |
301 | | } |
302 | 0 | ByValue { property_name: "Script_Extensions", property_value } => { |
303 | 0 | script_extension(property_value) |
304 | | } |
305 | | ByValue { |
306 | 0 | property_name: "Grapheme_Cluster_Break", |
307 | 0 | property_value, |
308 | 0 | } => gcb(property_value), |
309 | 0 | ByValue { property_name: "Sentence_Break", property_value } => { |
310 | 0 | sb(property_value) |
311 | | } |
312 | 0 | ByValue { property_name: "Word_Break", property_value } => { |
313 | 0 | wb(property_value) |
314 | | } |
315 | | _ => { |
316 | | // What else should we support? |
317 | 0 | Err(Error::PropertyNotFound) |
318 | | } |
319 | | } |
320 | 0 | } |
321 | | |
322 | | /// Returns a Unicode aware class for \w. |
323 | | /// |
324 | | /// This returns an error if the data is not available for \w. |
325 | | pub fn perl_word() -> Result<hir::ClassUnicode> { |
326 | | #[cfg(not(feature = "unicode-perl"))] |
327 | | fn imp() -> Result<hir::ClassUnicode> { |
328 | | Err(Error::PerlClassNotFound) |
329 | | } |
330 | | |
331 | | #[cfg(feature = "unicode-perl")] |
332 | | fn imp() -> Result<hir::ClassUnicode> { |
333 | | use unicode_tables::perl_word::PERL_WORD; |
334 | | Ok(hir_class(PERL_WORD)) |
335 | | } |
336 | | |
337 | | imp() |
338 | | } |
339 | | |
340 | | /// Returns a Unicode aware class for \s. |
341 | | /// |
342 | | /// This returns an error if the data is not available for \s. |
343 | | pub fn perl_space() -> Result<hir::ClassUnicode> { |
344 | | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] |
345 | | fn imp() -> Result<hir::ClassUnicode> { |
346 | | Err(Error::PerlClassNotFound) |
347 | | } |
348 | | |
349 | | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] |
350 | | fn imp() -> Result<hir::ClassUnicode> { |
351 | | use unicode_tables::perl_space::WHITE_SPACE; |
352 | | Ok(hir_class(WHITE_SPACE)) |
353 | | } |
354 | | |
355 | | #[cfg(feature = "unicode-bool")] |
356 | | fn imp() -> Result<hir::ClassUnicode> { |
357 | | use unicode_tables::property_bool::WHITE_SPACE; |
358 | | Ok(hir_class(WHITE_SPACE)) |
359 | | } |
360 | | |
361 | | imp() |
362 | | } |
363 | | |
364 | | /// Returns a Unicode aware class for \d. |
365 | | /// |
366 | | /// This returns an error if the data is not available for \d. |
367 | | pub fn perl_digit() -> Result<hir::ClassUnicode> { |
368 | | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] |
369 | | fn imp() -> Result<hir::ClassUnicode> { |
370 | | Err(Error::PerlClassNotFound) |
371 | | } |
372 | | |
373 | | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] |
374 | | fn imp() -> Result<hir::ClassUnicode> { |
375 | | use unicode_tables::perl_decimal::DECIMAL_NUMBER; |
376 | | Ok(hir_class(DECIMAL_NUMBER)) |
377 | | } |
378 | | |
379 | | #[cfg(feature = "unicode-gencat")] |
380 | | fn imp() -> Result<hir::ClassUnicode> { |
381 | | use unicode_tables::general_category::DECIMAL_NUMBER; |
382 | | Ok(hir_class(DECIMAL_NUMBER)) |
383 | | } |
384 | | |
385 | | imp() |
386 | | } |
387 | | |
388 | | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
389 | 0 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { |
390 | 0 | let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges |
391 | 0 | .iter() |
392 | 0 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
393 | 0 | .collect(); |
394 | 0 | hir::ClassUnicode::new(hir_ranges) |
395 | 0 | } |
396 | | |
397 | | /// Returns true only if the given codepoint is in the `\w` character class. |
398 | | /// |
399 | | /// If the `unicode-perl` feature is not enabled, then this returns an error. |
400 | | pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { |
401 | | #[cfg(not(feature = "unicode-perl"))] |
402 | | fn imp(_: char) -> result::Result<bool, UnicodeWordError> { |
403 | | Err(UnicodeWordError(())) |
404 | | } |
405 | | |
406 | | #[cfg(feature = "unicode-perl")] |
407 | | fn imp(c: char) -> result::Result<bool, UnicodeWordError> { |
408 | | use is_word_byte; |
409 | | use std::cmp::Ordering; |
410 | | use unicode_tables::perl_word::PERL_WORD; |
411 | | |
412 | | if c <= 0x7F as char && is_word_byte(c as u8) { |
413 | | return Ok(true); |
414 | | } |
415 | | Ok(PERL_WORD |
416 | | .binary_search_by(|&(start, end)| { |
417 | | if start <= c && c <= end { |
418 | | Ordering::Equal |
419 | | } else if start > c { |
420 | | Ordering::Greater |
421 | | } else { |
422 | | Ordering::Less |
423 | | } |
424 | | }) |
425 | | .is_ok()) |
426 | | } |
427 | | |
428 | | imp(c) |
429 | | } |
430 | | |
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
///
/// Lookups binary search on the first element, so a table must be sorted by
/// its normalized values.
type PropertyValues = &'static [(&'static str, &'static str)];
437 | | |
438 | 0 | fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { |
439 | 0 | Ok(match normalized_value { |
440 | 0 | "any" => Some("Any"), |
441 | 0 | "assigned" => Some("Assigned"), |
442 | 0 | "ascii" => Some("ASCII"), |
443 | | _ => { |
444 | 0 | let gencats = property_values("General_Category")?.unwrap(); |
445 | 0 | canonical_value(gencats, normalized_value) |
446 | | } |
447 | | }) |
448 | 0 | } |
449 | | |
450 | 0 | fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { |
451 | 0 | let scripts = property_values("Script")?.unwrap(); |
452 | 0 | Ok(canonical_value(scripts, normalized_value)) |
453 | 0 | } |
454 | | |
455 | | /// Find the canonical property name for the given normalized property name. |
456 | | /// |
457 | | /// If no such property exists, then `None` is returned. |
458 | | /// |
459 | | /// The normalized property name must have been normalized according to |
460 | | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
461 | | /// |
462 | | /// If the property names data is not available, then an error is returned. |
463 | | fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { |
464 | | #[cfg(not(any( |
465 | | feature = "unicode-age", |
466 | | feature = "unicode-bool", |
467 | | feature = "unicode-gencat", |
468 | | feature = "unicode-perl", |
469 | | feature = "unicode-script", |
470 | | feature = "unicode-segment", |
471 | | )))] |
472 | | fn imp(_: &str) -> Result<Option<&'static str>> { |
473 | | Err(Error::PropertyNotFound) |
474 | | } |
475 | | |
476 | | #[cfg(any( |
477 | | feature = "unicode-age", |
478 | | feature = "unicode-bool", |
479 | | feature = "unicode-gencat", |
480 | | feature = "unicode-perl", |
481 | | feature = "unicode-script", |
482 | | feature = "unicode-segment", |
483 | | ))] |
484 | | fn imp(name: &str) -> Result<Option<&'static str>> { |
485 | | use unicode_tables::property_names::PROPERTY_NAMES; |
486 | | |
487 | | Ok(PROPERTY_NAMES |
488 | | .binary_search_by_key(&name, |&(n, _)| n) |
489 | | .ok() |
490 | | .map(|i| PROPERTY_NAMES[i].1)) |
491 | | } |
492 | | |
493 | | imp(normalized_name) |
494 | | } |
495 | | |
496 | | /// Find the canonical property value for the given normalized property |
497 | | /// value. |
498 | | /// |
499 | | /// The given property values should correspond to the values for the property |
500 | | /// under question, which can be found using `property_values`. |
501 | | /// |
502 | | /// If no such property value exists, then `None` is returned. |
503 | | /// |
504 | | /// The normalized property value must have been normalized according to |
505 | | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
506 | 0 | fn canonical_value( |
507 | 0 | vals: PropertyValues, |
508 | 0 | normalized_value: &str, |
509 | 0 | ) -> Option<&'static str> { |
510 | 0 | vals.binary_search_by_key(&normalized_value, |&(n, _)| n) |
511 | 0 | .ok() |
512 | 0 | .map(|i| vals[i].1) |
513 | 0 | } |
514 | | |
515 | | /// Return the table of property values for the given property name. |
516 | | /// |
517 | | /// If the property values data is not available, then an error is returned. |
518 | | fn property_values( |
519 | | canonical_property_name: &'static str, |
520 | | ) -> Result<Option<PropertyValues>> { |
521 | | #[cfg(not(any( |
522 | | feature = "unicode-age", |
523 | | feature = "unicode-bool", |
524 | | feature = "unicode-gencat", |
525 | | feature = "unicode-perl", |
526 | | feature = "unicode-script", |
527 | | feature = "unicode-segment", |
528 | | )))] |
529 | | fn imp(_: &'static str) -> Result<Option<PropertyValues>> { |
530 | | Err(Error::PropertyValueNotFound) |
531 | | } |
532 | | |
533 | | #[cfg(any( |
534 | | feature = "unicode-age", |
535 | | feature = "unicode-bool", |
536 | | feature = "unicode-gencat", |
537 | | feature = "unicode-perl", |
538 | | feature = "unicode-script", |
539 | | feature = "unicode-segment", |
540 | | ))] |
541 | | fn imp(name: &'static str) -> Result<Option<PropertyValues>> { |
542 | | use unicode_tables::property_values::PROPERTY_VALUES; |
543 | | |
544 | | Ok(PROPERTY_VALUES |
545 | | .binary_search_by_key(&name, |&(n, _)| n) |
546 | | .ok() |
547 | | .map(|i| PROPERTY_VALUES[i].1)) |
548 | | } |
549 | | |
550 | | imp(canonical_property_name) |
551 | | } |
552 | | |
553 | | // This is only used in some cases, but small enough to just let it be dead |
554 | | // instead of figuring out (and maintaining) the right set of features. |
555 | | #[allow(dead_code)] |
556 | 0 | fn property_set( |
557 | 0 | name_map: &'static [(&'static str, Range)], |
558 | 0 | canonical: &'static str, |
559 | 0 | ) -> Option<Range> { |
560 | 0 | name_map |
561 | 0 | .binary_search_by_key(&canonical, |x| x.0) |
562 | 0 | .ok() |
563 | 0 | .map(|i| name_map[i].1) |
564 | 0 | } |
565 | | |
566 | | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
567 | | /// of codepoints that were added in a particular revision of Unicode. The |
568 | | /// iterator yields items in chronological order. |
569 | | /// |
570 | | /// If the given age value isn't valid or if the data isn't available, then an |
571 | | /// error is returned instead. |
572 | | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
573 | | #[cfg(not(feature = "unicode-age"))] |
574 | | fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { |
575 | | use std::option::IntoIter; |
576 | | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) |
577 | | } |
578 | | |
579 | | #[cfg(feature = "unicode-age")] |
580 | | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
581 | | use unicode_tables::age; |
582 | | |
583 | | const AGES: &'static [(&'static str, Range)] = &[ |
584 | | ("V1_1", age::V1_1), |
585 | | ("V2_0", age::V2_0), |
586 | | ("V2_1", age::V2_1), |
587 | | ("V3_0", age::V3_0), |
588 | | ("V3_1", age::V3_1), |
589 | | ("V3_2", age::V3_2), |
590 | | ("V4_0", age::V4_0), |
591 | | ("V4_1", age::V4_1), |
592 | | ("V5_0", age::V5_0), |
593 | | ("V5_1", age::V5_1), |
594 | | ("V5_2", age::V5_2), |
595 | | ("V6_0", age::V6_0), |
596 | | ("V6_1", age::V6_1), |
597 | | ("V6_2", age::V6_2), |
598 | | ("V6_3", age::V6_3), |
599 | | ("V7_0", age::V7_0), |
600 | | ("V8_0", age::V8_0), |
601 | | ("V9_0", age::V9_0), |
602 | | ("V10_0", age::V10_0), |
603 | | ("V11_0", age::V11_0), |
604 | | ("V12_0", age::V12_0), |
605 | | ("V12_1", age::V12_1), |
606 | | ("V13_0", age::V13_0), |
607 | | ]; |
608 | | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); |
609 | | |
610 | | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); |
611 | | match pos { |
612 | | None => Err(Error::PropertyValueNotFound), |
613 | | Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)), |
614 | | } |
615 | | } |
616 | | |
617 | | imp(canonical_age) |
618 | | } |
619 | | |
620 | | /// Returns the Unicode HIR class corresponding to the given general category. |
621 | | /// |
622 | | /// Name canonicalization is assumed to be performed by the caller. |
623 | | /// |
624 | | /// If the given general category could not be found, or if the general |
625 | | /// category data is not available, then an error is returned. |
626 | 0 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
627 | 0 | #[cfg(not(feature = "unicode-gencat"))] |
628 | 0 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
629 | 0 | Err(Error::PropertyNotFound) |
630 | 0 | } |
631 | 0 |
|
632 | 0 | #[cfg(feature = "unicode-gencat")] |
633 | 0 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
634 | 0 | use unicode_tables::general_category::BY_NAME; |
635 | 0 | match name { |
636 | 0 | "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), |
637 | 0 | "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), |
638 | 0 | "Assigned" => { |
639 | 0 | let mut cls = gencat("Unassigned")?; |
640 | 0 | cls.negate(); |
641 | 0 | Ok(cls) |
642 | 0 | } |
643 | 0 | name => property_set(BY_NAME, name) |
644 | 0 | .map(hir_class) |
645 | 0 | .ok_or(Error::PropertyValueNotFound), |
646 | 0 | } |
647 | 0 | } |
648 | 0 |
|
649 | 0 | match canonical_name { |
650 | 0 | "Decimal_Number" => perl_digit(), |
651 | 0 | name => imp(name), |
652 | | } |
653 | 0 | } |
654 | | |
655 | | /// Returns the Unicode HIR class corresponding to the given script. |
656 | | /// |
657 | | /// Name canonicalization is assumed to be performed by the caller. |
658 | | /// |
659 | | /// If the given script could not be found, or if the script data is not |
660 | | /// available, then an error is returned. |
661 | | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
662 | | #[cfg(not(feature = "unicode-script"))] |
663 | | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
664 | | Err(Error::PropertyNotFound) |
665 | | } |
666 | | |
667 | | #[cfg(feature = "unicode-script")] |
668 | | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
669 | | use unicode_tables::script::BY_NAME; |
670 | | property_set(BY_NAME, name) |
671 | | .map(hir_class) |
672 | | .ok_or(Error::PropertyValueNotFound) |
673 | | } |
674 | | |
675 | | imp(canonical_name) |
676 | | } |
677 | | |
678 | | /// Returns the Unicode HIR class corresponding to the given script extension. |
679 | | /// |
680 | | /// Name canonicalization is assumed to be performed by the caller. |
681 | | /// |
682 | | /// If the given script extension could not be found, or if the script data is |
683 | | /// not available, then an error is returned. |
684 | | fn script_extension( |
685 | | canonical_name: &'static str, |
686 | | ) -> Result<hir::ClassUnicode> { |
687 | | #[cfg(not(feature = "unicode-script"))] |
688 | | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
689 | | Err(Error::PropertyNotFound) |
690 | | } |
691 | | |
692 | | #[cfg(feature = "unicode-script")] |
693 | | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
694 | | use unicode_tables::script_extension::BY_NAME; |
695 | | property_set(BY_NAME, name) |
696 | | .map(hir_class) |
697 | | .ok_or(Error::PropertyValueNotFound) |
698 | | } |
699 | | |
700 | | imp(canonical_name) |
701 | | } |
702 | | |
703 | | /// Returns the Unicode HIR class corresponding to the given Unicode boolean |
704 | | /// property. |
705 | | /// |
706 | | /// Name canonicalization is assumed to be performed by the caller. |
707 | | /// |
708 | | /// If the given boolean property could not be found, or if the boolean |
709 | | /// property data is not available, then an error is returned. |
710 | 0 | fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
711 | 0 | #[cfg(not(feature = "unicode-bool"))] |
712 | 0 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
713 | 0 | Err(Error::PropertyNotFound) |
714 | 0 | } |
715 | 0 |
|
716 | 0 | #[cfg(feature = "unicode-bool")] |
717 | 0 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
718 | 0 | use unicode_tables::property_bool::BY_NAME; |
719 | 0 | property_set(BY_NAME, name) |
720 | 0 | .map(hir_class) |
721 | 0 | .ok_or(Error::PropertyNotFound) |
722 | 0 | } |
723 | 0 |
|
724 | 0 | match canonical_name { |
725 | 0 | "Decimal_Number" => perl_digit(), |
726 | 0 | "White_Space" => perl_space(), |
727 | 0 | name => imp(name), |
728 | | } |
729 | 0 | } |
730 | | |
731 | | /// Returns the Unicode HIR class corresponding to the given grapheme cluster |
732 | | /// break property. |
733 | | /// |
734 | | /// Name canonicalization is assumed to be performed by the caller. |
735 | | /// |
736 | | /// If the given property could not be found, or if the corresponding data is |
737 | | /// not available, then an error is returned. |
738 | | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
739 | | #[cfg(not(feature = "unicode-segment"))] |
740 | | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
741 | | Err(Error::PropertyNotFound) |
742 | | } |
743 | | |
744 | | #[cfg(feature = "unicode-segment")] |
745 | | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
746 | | use unicode_tables::grapheme_cluster_break::BY_NAME; |
747 | | property_set(BY_NAME, name) |
748 | | .map(hir_class) |
749 | | .ok_or(Error::PropertyValueNotFound) |
750 | | } |
751 | | |
752 | | imp(canonical_name) |
753 | | } |
754 | | |
755 | | /// Returns the Unicode HIR class corresponding to the given word break |
756 | | /// property. |
757 | | /// |
758 | | /// Name canonicalization is assumed to be performed by the caller. |
759 | | /// |
760 | | /// If the given property could not be found, or if the corresponding data is |
761 | | /// not available, then an error is returned. |
762 | | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
763 | | #[cfg(not(feature = "unicode-segment"))] |
764 | | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
765 | | Err(Error::PropertyNotFound) |
766 | | } |
767 | | |
768 | | #[cfg(feature = "unicode-segment")] |
769 | | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
770 | | use unicode_tables::word_break::BY_NAME; |
771 | | property_set(BY_NAME, name) |
772 | | .map(hir_class) |
773 | | .ok_or(Error::PropertyValueNotFound) |
774 | | } |
775 | | |
776 | | imp(canonical_name) |
777 | | } |
778 | | |
779 | | /// Returns the Unicode HIR class corresponding to the given sentence |
780 | | /// break property. |
781 | | /// |
782 | | /// Name canonicalization is assumed to be performed by the caller. |
783 | | /// |
784 | | /// If the given property could not be found, or if the corresponding data is |
785 | | /// not available, then an error is returned. |
786 | | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
787 | | #[cfg(not(feature = "unicode-segment"))] |
788 | | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
789 | | Err(Error::PropertyNotFound) |
790 | | } |
791 | | |
792 | | #[cfg(feature = "unicode-segment")] |
793 | | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
794 | | use unicode_tables::sentence_break::BY_NAME; |
795 | | property_set(BY_NAME, name) |
796 | | .map(hir_class) |
797 | | .ok_or(Error::PropertyValueNotFound) |
798 | | } |
799 | | |
800 | | imp(canonical_name) |
801 | | } |
802 | | |
803 | | /// Like symbolic_name_normalize_bytes, but operates on a string. |
804 | | fn symbolic_name_normalize(x: &str) -> String { |
805 | | let mut tmp = x.as_bytes().to_vec(); |
806 | | let len = symbolic_name_normalize_bytes(&mut tmp).len(); |
807 | | tmp.truncate(len); |
808 | | // This should always succeed because `symbolic_name_normalize_bytes` |
809 | | // guarantees that `&tmp[..len]` is always valid UTF-8. |
810 | | // |
811 | | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely |
812 | | // to be worth skipping the additional safety check. A benchmark must |
813 | | // justify it first. |
814 | | String::from_utf8(tmp).unwrap() |
815 | | } |
816 | | |
/// Normalizes a symbolic name in place, following UAX44-LM3.
///
/// "Symbolic names" are typically property names and property value
/// aliases. This routine should not be applied to property string values.
///
/// The normalization strips a leading `is` (in any letter case), removes
/// spaces, underscores and hyphens, lowercases ASCII letters and discards
/// every non-ASCII byte. The surviving bytes are compacted to the front of
/// `slice`, and the returned sub-slice covers exactly those bytes.
///
/// The returned slice is valid UTF-8 for every possible input.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // The standard does not seem to mandate a particular structure for
    // property names/aliases (unlike character names), so we assume ASCII
    // and drop anything that is not.
    let has_is_prefix =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    // An "is" prefix is ignored, so reading begins right after it.
    let read_from = if has_is_prefix { 2 } else { 0 };

    // `kept` counts the bytes written so far and doubles as the write
    // cursor. It never overtakes the read cursor, so compacting in place
    // cannot clobber a byte that has not been read yet.
    let mut kept = 0;
    for read in read_from..slice.len() {
        let byte = slice[read];
        match byte {
            // Separators carry no meaning in a symbolic name.
            b' ' | b'_' | b'-' => {}
            // VALIDITY ARGUMENT: only ASCII bytes are ever written, which
            // is what makes the resulting prefix valid UTF-8.
            _ if byte.is_ascii() => {
                slice[kept] = byte.to_ascii_lowercase();
                kept += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }

    // Special case: ISO_Comment has the abbreviation 'isc'. Stripping the
    // 'is' prefix would leave just 'c', which is an alias for the 'Other'
    // general category instead, so the full abbreviation is restored. The
    // slice is at least three bytes long here: two for the prefix plus the
    // byte that produced the 'c'.
    if has_is_prefix && kept == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        kept = 3;
    }
    &mut slice[..kept]
}
871 | | |
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// U+212A KELVIN SIGN.
    ///
    /// It renders identically to ASCII 'K' in most fonts, so it is spelled
    /// as an escape here to keep the assertions below unambiguous and
    /// robust against tools that normalize source text.
    #[cfg(feature = "unicode-case")]
    const KELVIN_SIGN: char = '\u{212A}';

    /// Returns the iterator of simple case folds for `c`, panicking if case
    /// folding is unavailable or if `c` has no folds.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Returns the value carried by the inner error of `simple_fold(c)`,
    /// panicking if case folding is unavailable or if `c` does have folds.
    ///
    /// Judging by the expectations in the tests below, the carried value is
    /// the next codepoint (if any) that has a case mapping.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwrapping shorthand for `contains_simple_case_mapping`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // 'k', 'K' and the Kelvin sign all fold to one another.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN_SIGN).collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        // None of these inputs has a fold of its own; each error reports a
        // later codepoint instead.
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        // At the very top of the codepoint space nothing follows.
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        // Ranges that include at least one codepoint with a case mapping.
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        // Ranges that fall entirely between cased codepoints.
        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        // The one-letter query 'C' must resolve to the 'Other' general
        // category.
        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        // 'isc' keeps its prefix even though "is" is otherwise stripped.
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The invalid byte 0xFF must be dropped from the output.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}