/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zerotrie-0.2.2/src/builder/mod.rs
Line | Count | Source |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | //! # ZeroTrie Builder |
6 | | //! |
7 | | //! There are two implementations of the ZeroTrie Builder: |
8 | | //! |
9 | | //! - [konst::ZeroTrieBuilderConst] allows for human-readable const construction |
10 | | //! - [nonconst::ZeroTrieBuilder] has the full feaure set but requires `alloc` |
11 | | //! |
12 | | //! The two builders follow the same algorithm but have different capabilities. |
13 | | //! |
14 | | //! ## Builder Algorithm Overview |
15 | | //! |
16 | | //! The tries are built backwards, from the last node to the first node. The key step of the |
17 | | //! algorithm is **determining what is the next node to prepend.** |
18 | | //! |
19 | | //! In the simple case of [`ZeroTrieSimpleAscii`], all nodes are binary-search, so if the input |
20 | | //! strings are provided in lexicographic order, there is a simple, deterministic method for |
21 | | //! identifying the next node. This insight is what enables us to make the const builder. |
22 | | //! |
23 | | //! The builder works with the following intermediate state variables: |
24 | | //! |
25 | | //! - `prefix_len` indicates the byte index we are currently processing. |
26 | | //! - `i` and `j` bracket a window of strings in the input that share the same prefix. |
27 | | //! - `current_len` is the length in bytes of the current self-contained trie. |
28 | | //! - `lengths_stack` contains metadata for branch nodes. |
29 | | //! |
30 | | //! What follows is a verbal explanation of the build steps for a trie containing: |
31 | | //! |
32 | | //! - "" → 11 |
33 | | //! - "ad" → 22 |
34 | | //! - "adef" → 33 |
35 | | //! - "adghk" → 44 |
36 | | //! |
37 | | //! When a node is prepended, it is shown in **boldface**. |
38 | | //! |
39 | | //! 1. Initialize the builder by setting `i=3`, `j=4`, `prefix_len=5` (the last string), |
40 | | //! `current_len=0`, and `lengths_stack` empty. Start the main loop. |
41 | | //! 2. Top of loop. The string at `i` is equal in length to `prefix_len`, so we prepend |
42 | | //! our first node: a **value node 44**, which requires a 2-byte varint. Increase |
43 | | //! `current_len` to 2. |
44 | | //! 3. Reduce `prefix_len` to 4, read our `key_ascii="k"`, and recalculate `i` and `j` |
45 | | //! _(this calculation is a long chunk of code in the builder impls)_. Since there is no |
46 | | //! other string with the prefix "adgh", `i` and `j` stay the same, we prepend an |
47 | | //! **ASCII node "k"**, increase `current_len` to 3, and continue the main loop. |
48 | | //! 4. Top of loop. The string at `i` is of length 5, but `prefix_len` is 4, so there is |
49 | | //! no value node to prepend. |
50 | | //! 5. Reduce `prefix_len` to 3, read our `key_ascii="h"`, and recalculate `i` and `j`. |
51 | | //! There are no other strings sharing the prefix "abg", so we prepend an |
52 | | //! **ASCII node "h"**, increase `current_len` to 4, and continue the main loop. |
53 | | //! 6. Top of loop. There is still no value node to prepend. |
54 | | //! 7. Reduce `prefix_len` to 2, read our `key_ascii="g"`, and recalculate `i` and `j`. |
55 | | //! We find that `i=1` and `j=4`, the range of strings sharing the prefix "ad". Since |
56 | | //! `i` or `j` changed, proceed to evaluate the branch node. |
57 | | //! 8. The last branch byte `ascii_j` for this prefix is "g", which is the same as `key_ascii`, |
58 | | //! so we are the _last_ target of a branch node. Push an entry onto `lengths_stack`: |
59 | | //! `BranchMeta { ascii: "g", cumulative_length: 4, local_length: 4, count: 1 }`. |
60 | | //! 9. The first branch byte `ascii_i` for this prefix is "e", which is NOT equal to `key_ascii`, |
61 | | //! so we are _not the first_ target of a branch node. We therefore start evaluating the |
62 | | //! string preceding where we were at the top of the current loop. We set `i=2`, `j=3`, |
63 | | //! `prefix_len=4` (length of the string at `i`), and continue the main loop. |
64 | | //! 10. Top of loop. Since the string at `i` is equal in length to `prefix_len`, we prepend a |
65 | | //! **value node 33** (which requires a 2-byte varint) and increase `current_len` to 2. |
66 | | //! 11. Reduce `prefix_len` to 3, read our `key_ascii="f"`, and recalculate `i` and `j`. |
67 | | //! They stay the same, so we prepend an **ASCII node "f"**, increase `current_len` to 3, |
68 | | //! and continue the main loop. |
69 | | //! 12. Top of loop. No value node this time. |
70 | | //! 13. Reduce `prefix_len` to 2, read our `key_ascii="e"`, and recalculate `i` and `j`. |
71 | | //! They go back to `i=1` and `j=4`. |
72 | | //! 14. The last branch byte `ascii_j` for this prefix is "g", which is NOT equal to `key_ascii`, |
73 | | //! so we are _not the last_ target of a branch node. We peek at the entry at the front of |
74 | | //! the lengths stack and use it to push another entry onto the stack: |
75 | | //! `BranchMeta { ascii: "e", cumulative_length: 7, local_length: 3, count: 2 }` |
76 | | //! 15. The first branch byte `ascii_i` for this prefix is "e", which is the same as `key_ascii`, |
77 | | //! wo we are the _first_ target of a branch node. We can therefore proceed to prepend the |
78 | | //! metadata for the branch node. We peek at the top of the stack and find that there are 2 |
79 | | //! tries reachable from this branch and they have a total byte length of 5. We then pull off |
80 | | //! 2 entries from the stack into a local variable `branch_metas`. From here, we write out |
81 | | //! the **offset table**, **lookup table**, and **branch head node**, which are determined |
82 | | //! from the metadata entries. We set `current_len` to the length of the two tries plus the |
83 | | //! metadata, which happens to be 11. Then we return to the top of the main loop. |
84 | | //! 16. Top of loop. The string at `i` is length 2, which is the same as `prefix_len`, so we |
85 | | //! prepend a **value node 22** (2-byte varint) and increase `current_len` to 13. |
86 | | //! 17. Reduce `prefix_len` to 1, read our `key_ascii="d"`, and recalculate `i` and `j`. |
87 | | //! They stay the same, so we prepend an **ASCII node "d"**, increase `current_len` to 14, |
88 | | //! and continue the main loop. |
89 | | //! 18. Top of loop. No value node this time. |
90 | | //! 19. Reduce `prefix_len` to 0, read our `key_ascii="a"`, and recalculate `i` and `j`. |
91 | | //! They change to `i=0` and `j=4`, since all strings have the empty string as a prefix. |
92 | | //! However, `ascii_i` and `ascii_j` both equal `key_ascii`, so we prepend **ASCII node "a"**, |
93 | | //! increase `current_len` to 15, and continue the main loop. |
94 | | //! 16. Top of loop. The string at `i` is length 0, which is the same as `prefix_len`, so we |
95 | | //! prepend a **value node 11** and increase `current_len` to 16. |
96 | | //! 17. We can no longer reduce `prefix_len`, so our trie is complete. |
97 | | //! |
98 | | //! ## Perfect Hash Reordering |
99 | | //! |
100 | | //! When the PHF is added to the mix, the main change is that the strings are no longer in sorted |
101 | | //! order when they are in the trie. To resolve this issue, when adding a branch node, the target |
102 | | //! tries are rearranged in-place in the buffer to be in the correct order for the PHF. |
103 | | //! |
104 | | //! ## Example |
105 | | //! |
106 | | //! Here is the output of the trie described above. |
107 | | //! |
108 | | //! ``` |
109 | | //! use zerotrie::ZeroTrieSimpleAscii; |
110 | | //! |
111 | | //! const DATA: [(&str, usize); 4] = |
112 | | //! [("", 11), ("ad", 22), ("adef", 33), ("adghk", 44)]; |
113 | | //! |
114 | | //! // As demonstrated above, the required capacity for this trie is 16 bytes |
115 | | //! const TRIE: ZeroTrieSimpleAscii<[u8; 16]> = |
116 | | //! ZeroTrieSimpleAscii::from_sorted_str_tuples(&DATA); |
117 | | //! |
118 | | //! assert_eq!( |
119 | | //! TRIE.as_bytes(), |
120 | | //! &[ |
121 | | //! 0x8B, // value node 11 |
122 | | //! b'a', // ASCII node 'a' |
123 | | //! b'd', // ASCII node 'd' |
124 | | //! 0x90, // value node 22 lead byte |
125 | | //! 0x06, // value node 22 trail byte |
126 | | //! 0xC2, // branch node 2 |
127 | | //! b'e', // first target of branch |
128 | | //! b'g', // second target of branch |
129 | | //! 3, // offset |
130 | | //! b'f', // ASCII node 'f' |
131 | | //! 0x90, // value node 33 lead byte |
132 | | //! 0x11, // value node 33 trail byte |
133 | | //! b'h', // ASCII node 'h' |
134 | | //! b'k', // ASCII node 'k' |
135 | | //! 0x90, // value node 44 lead byte |
136 | | //! 0x1C, // value node 44 trail byte |
137 | | //! ] |
138 | | //! ); |
139 | | //! |
140 | | //! assert_eq!(TRIE.get(b""), Some(11)); |
141 | | //! assert_eq!(TRIE.get(b"ad"), Some(22)); |
142 | | //! assert_eq!(TRIE.get(b"adef"), Some(33)); |
143 | | //! assert_eq!(TRIE.get(b"adghk"), Some(44)); |
144 | | //! assert_eq!(TRIE.get(b"unknown"), None); |
145 | | //! ``` |
146 | | |
147 | | mod branch_meta; |
148 | | pub(crate) mod bytestr; |
149 | | pub(crate) mod konst; |
150 | | #[cfg(feature = "litemap")] |
151 | | mod litemap; |
152 | | #[cfg(feature = "alloc")] |
153 | | pub(crate) mod nonconst; |
154 | | |
155 | | use bytestr::ByteStr; |
156 | | |
157 | | use super::ZeroTrieSimpleAscii; |
158 | | |
159 | | impl<const N: usize> ZeroTrieSimpleAscii<[u8; N]> { |
160 | | /// **Const Constructor:** Creates an [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values. |
161 | | /// |
162 | | /// This function needs to know the exact length of the resulting trie at compile time. To |
163 | | /// figure out `N`, first set `N` to be too large (say 0xFFFF), then look at the resulting |
164 | | /// compile error which will tell you how to set `N`, like this: |
165 | | /// |
166 | | /// > the evaluated program panicked at 'Buffer too large. Size needed: 17' |
167 | | /// |
168 | | /// That error message says you need to set `N` to 17. |
169 | | /// |
170 | | /// Also see [`Self::from_sorted_str_tuples`]. |
171 | | /// |
172 | | /// # Panics |
173 | | /// |
174 | | /// Panics if `items` is not sorted or if `N` is not correct. |
175 | | /// |
176 | | /// # Examples |
177 | | /// |
178 | | /// Create a `const` ZeroTrieSimpleAscii at compile time: |
179 | | /// |
180 | | /// ``` |
181 | | /// use zerotrie::ZeroTrieSimpleAscii; |
182 | | /// |
183 | | /// // The required capacity for this trie happens to be 17 bytes |
184 | | /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = |
185 | | /// ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ |
186 | | /// (b"bar", 2), |
187 | | /// (b"bazzoo", 3), |
188 | | /// (b"foo", 1), |
189 | | /// ]); |
190 | | /// |
191 | | /// assert_eq!(TRIE.get(b"foo"), Some(1)); |
192 | | /// assert_eq!(TRIE.get(b"bar"), Some(2)); |
193 | | /// assert_eq!(TRIE.get(b"bazzoo"), Some(3)); |
194 | | /// assert_eq!(TRIE.get(b"unknown"), None); |
195 | | /// ``` |
196 | | /// |
197 | | /// Panics if strings are not sorted: |
198 | | /// |
199 | | /// ```compile_fail |
200 | | /// # use zerotrie::ZeroTrieSimpleAscii; |
201 | | /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ |
202 | | /// (b"foo", 1), |
203 | | /// (b"bar", 2), |
204 | | /// (b"bazzoo", 3), |
205 | | /// ]); |
206 | | /// ``` |
207 | | /// |
208 | | /// Panics if capacity is too small: |
209 | | /// |
210 | | /// ```compile_fail |
211 | | /// # use zerotrie::ZeroTrieSimpleAscii; |
212 | | /// const TRIE: ZeroTrieSimpleAscii<[u8; 15]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ |
213 | | /// (b"bar", 2), |
214 | | /// (b"bazzoo", 3), |
215 | | /// (b"foo", 1), |
216 | | /// ]); |
217 | | /// ``` |
218 | | /// |
219 | | /// Panics if capacity is too large: |
220 | | /// |
221 | | /// ```compile_fail |
222 | | /// # use zerotrie::ZeroTrieSimpleAscii; |
223 | | /// const TRIE: ZeroTrieSimpleAscii<[u8; 20]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ |
224 | | /// (b"bar", 2), |
225 | | /// (b"bazzoo", 3), |
226 | | /// (b"foo", 1), |
227 | | /// ]); |
228 | | /// ``` |
229 | 0 | pub const fn from_sorted_u8_tuples(tuples: &[(&[u8], usize)]) -> Self { |
230 | | use konst::*; |
231 | 0 | let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples); |
232 | 0 | let result = ZeroTrieBuilderConst::<N>::from_tuple_slice::<100>(byte_str_slice); |
233 | 0 | match result { |
234 | 0 | Ok(s) => Self::from_store(s.build_or_panic()), |
235 | 0 | Err(_) => panic!("Failed to build ZeroTrie"), |
236 | | } |
237 | 0 | } |
238 | | |
239 | | /// **Const Constructor:** Creates an [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values. |
240 | | /// |
241 | | /// This function needs to know the exact length of the resulting trie at compile time. To |
242 | | /// figure out `N`, first set `N` to be too large (say 0xFFFF), then look at the resulting |
243 | | /// compile error which will tell you how to set `N`, like this: |
244 | | /// |
245 | | /// > the evaluated program panicked at 'Buffer too large. Size needed: 17' |
246 | | /// |
247 | | /// That error message says you need to set `N` to 17. |
248 | | /// |
249 | | /// Also see [`Self::from_sorted_u8_tuples`]. |
250 | | /// |
251 | | /// # Panics |
252 | | /// |
253 | | /// Panics if `items` is not sorted, if `N` is not correct, or if any of the strings contain |
254 | | /// non-ASCII characters. |
255 | | /// |
256 | | /// # Examples |
257 | | /// |
258 | | /// Create a `const` ZeroTrieSimpleAscii at compile time: |
259 | | /// |
260 | | /// ``` |
261 | | /// use zerotrie::ZeroTrieSimpleAscii; |
262 | | /// |
263 | | /// // The required capacity for this trie happens to be 17 bytes |
264 | | /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = |
265 | | /// ZeroTrieSimpleAscii::from_sorted_str_tuples(&[ |
266 | | /// ("bar", 2), |
267 | | /// ("bazzoo", 3), |
268 | | /// ("foo", 1), |
269 | | /// ]); |
270 | | /// |
271 | | /// assert_eq!(TRIE.get(b"foo"), Some(1)); |
272 | | /// assert_eq!(TRIE.get(b"bar"), Some(2)); |
273 | | /// assert_eq!(TRIE.get(b"bazzoo"), Some(3)); |
274 | | /// assert_eq!(TRIE.get(b"unknown"), None); |
275 | | /// ``` |
276 | | /// |
277 | | /// Panics if the strings are not ASCII: |
278 | | /// |
279 | | /// ```compile_fail |
280 | | /// # use zerotrie::ZeroTrieSimpleAscii; |
281 | | /// const TRIE: ZeroTrieSimpleAscii<[u8; 100]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(&[ |
282 | | /// ("bár", 2), |
283 | | /// ("båzzöo", 3), |
284 | | /// ("foo", 1), |
285 | | /// ]); |
286 | | /// ``` |
287 | 0 | pub const fn from_sorted_str_tuples(tuples: &[(&str, usize)]) -> Self { |
288 | | use konst::*; |
289 | 0 | let byte_str_slice = ByteStr::from_str_slice_with_value(tuples); |
290 | | // 100 is the value of `K`, the size of the lengths stack. If compile errors are |
291 | | // encountered, this number may need to be increased. |
292 | 0 | let result = ZeroTrieBuilderConst::<N>::from_tuple_slice::<100>(byte_str_slice); |
293 | 0 | match result { |
294 | 0 | Ok(s) => Self::from_store(s.build_or_panic()), |
295 | 0 | Err(_) => panic!("Failed to build ZeroTrie"), |
296 | | } |
297 | 0 | } |
298 | | } |