/rust/registry/src/index.crates.io-6f17d22bba15001f/ucd-trie-0.1.7/src/lib.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | The ucd-trie crate provides a compressed trie set specifically tailored for |
3 | | Unicode codepoints. The principle use case for such a trie is to represent |
4 | | properties defined by Unicode that correspond to sets of Unicode codepoints. |
5 | | (These properties are formally called boolean properties or "single valued" |
6 | | properties. See |
7 | | [UTR#23 S3.3](https://www.unicode.org/reports/tr23/#PropertyTypeDefinitions) |
8 | | for more details.) |
9 | | |
10 | | This crate has two principle types: `TrieSetOwned` and `TrieSetSlice`, |
11 | | corresponding to a similar split as there is between `Vec<T>` and `&[T]`. |
12 | | `TrieSetOwned` is the only way to construct a trie from a set of Unicode |
13 | | codepoints. |
14 | | |
15 | | The intended use of this library is to embed a static instance of |
16 | | `TrieSetSlice` into your source code, and then use its methods as defined in |
17 | | this crate to test membership. (The `ucd-generate` tool can likely generate |
18 | | this code for you.) |
19 | | |
20 | | Finally, while this crate uses the standard library by default, it provides |
21 | | `no_std` functionality by disabling the `std` feature. When `no_std` is |
22 | | enabled, then `TrieSetOwned` is not provided. Instead, only `TrieSetSlice` is |
23 | | provided, which means `no_std` crates can still embed tries into their code. |
24 | | */ |
25 | | |
26 | | #![deny(missing_docs)] |
27 | | #![cfg_attr(not(feature = "std"), no_std)] |
28 | | |
29 | | use core::fmt; |
30 | | |
31 | | #[cfg(feature = "std")] |
32 | | pub use crate::owned::{Error, Result, TrieSetOwned}; |
33 | | |
34 | | #[cfg(test)] |
35 | | #[allow(dead_code)] |
36 | | mod general_category; |
37 | | #[cfg(feature = "std")] |
38 | | mod owned; |
39 | | |
40 | | const CHUNK_SIZE: usize = 64; |
41 | | |
42 | | /// A type alias for `TrieSetSlice<'static>`. |
43 | | pub type TrieSet = TrieSetSlice<'static>; |
44 | | |
45 | | /// A borrowed trie set. |
46 | | #[derive(Clone, Copy)] |
47 | | pub struct TrieSetSlice<'a> { |
48 | | /// first tree, one level |
49 | | #[doc(hidden)] |
50 | | pub tree1_level1: &'a [u64], |
51 | | /// second tree, first level |
52 | | #[doc(hidden)] |
53 | | pub tree2_level1: &'a [u8], |
54 | | /// second tree, second level |
55 | | #[doc(hidden)] |
56 | | pub tree2_level2: &'a [u64], |
57 | | /// third tree, first level |
58 | | #[doc(hidden)] |
59 | | pub tree3_level1: &'a [u8], |
60 | | /// third tree, second level |
61 | | #[doc(hidden)] |
62 | | pub tree3_level2: &'a [u8], |
63 | | /// third tree, third level |
64 | | #[doc(hidden)] |
65 | | pub tree3_level3: &'a [u64], |
66 | | } |
67 | | |
68 | | impl<'a> fmt::Debug for TrieSetSlice<'a> { |
69 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
70 | 0 | write!(f, "TrieSetSlice(...)") |
71 | 0 | } |
72 | | } |
73 | | |
74 | | impl<'a> TrieSetSlice<'a> { |
75 | | /// Returns true if and only if the given Unicode scalar value is in this |
76 | | /// set. |
77 | 0 | pub fn contains_char(&self, c: char) -> bool { |
78 | 0 | self.contains(c as usize) |
79 | 0 | } |
80 | | |
81 | | /// Returns true if and only if the given codepoint is in this set. |
82 | | /// |
83 | | /// If the given value exceeds the codepoint range (i.e., it's greater |
84 | | /// than `0x10FFFF`), then this returns false. |
85 | 0 | pub fn contains_u32(&self, cp: u32) -> bool { |
86 | 0 | if cp > 0x10FFFF { |
87 | 0 | return false; |
88 | 0 | } |
89 | 0 | self.contains(cp as usize) |
90 | 0 | } |
91 | | |
92 | | #[inline(always)] |
93 | 0 | fn contains(&self, cp: usize) -> bool { |
94 | 0 | if cp < 0x800 { |
95 | 0 | self.chunk_contains(cp, self.tree1_level1[cp >> 6]) |
96 | 0 | } else if cp < 0x10000 { |
97 | 0 | let leaf = match self.tree2_level1.get((cp >> 6) - 0x20) { |
98 | 0 | None => return false, |
99 | 0 | Some(&leaf) => leaf, |
100 | 0 | }; |
101 | 0 | self.chunk_contains(cp, self.tree2_level2[leaf as usize]) |
102 | | } else { |
103 | 0 | let child = match self.tree3_level1.get((cp >> 12) - 0x10) { |
104 | 0 | None => return false, |
105 | 0 | Some(&child) => child, |
106 | 0 | }; |
107 | 0 | let i = ((child as usize) * CHUNK_SIZE) + ((cp >> 6) & 0b111111); |
108 | 0 | let leaf = self.tree3_level2[i]; |
109 | 0 | self.chunk_contains(cp, self.tree3_level3[leaf as usize]) |
110 | | } |
111 | 0 | } |
112 | | |
113 | | #[inline(always)] |
114 | 0 | fn chunk_contains(&self, cp: usize, chunk: u64) -> bool { |
115 | 0 | ((chunk >> (cp & 0b111111)) & 1) == 1 |
116 | 0 | } |
117 | | } |