/rust/registry/src/index.crates.io-6f17d22bba15001f/zerotrie-0.1.3/src/reader.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | //! # Internal layout of ZeroTrie |
6 | | //! |
7 | | //! A ZeroTrie is composed of a series of nodes stored in sequence in a byte slice. |
8 | | //! |
9 | | //! There are 4 types of nodes: |
10 | | //! |
11 | | //! 1. ASCII (`0xxxxxxx`): matches a literal ASCII byte. |
12 | | //! 2. Span (`101xxxxx`): matches a span of non-ASCII bytes. |
13 | | //! 3. Value (`100xxxxx`): associates a value with a string |
14 | | //! 4. Branch (`11xxxxxx`): matches one of a set of bytes. |
15 | | //! |
16 | | //! Span, Value, and Branch nodes contain a varint, which has different semantics for each: |
17 | | //! |
18 | | //! - Span varint: length of the span |
19 | | //! - Value varint: value associated with the string |
20 | | //! - Branch varint: number of edges in the branch and width of the offset table |
21 | | //! |
22 | | //! If reading an ASCII, Span, or Branch node, one or more bytes are consumed from the input |
23 | | //! string. If the next byte(s) in the input string do not match the node, we return `None`. |
24 | | //! If reading a Value node, if the string is empty, return `Some(value)`; otherwise, we skip |
25 | | //! the Value node and continue on to the next node. |
26 | | //! |
27 | | //! When a node is consumed, a shorter, well-formed ZeroTrie remains. |
28 | | //! |
29 | | //! ### Basic Example |
30 | | //! |
31 | | //! Here is an example ZeroTrie without branch nodes: |
32 | | //! |
33 | | //! ``` |
34 | | //! use zerotrie::ZeroTriePerfectHash; |
35 | | //! |
36 | | //! let bytes = [ |
37 | | //! b'a', // ASCII literal |
38 | | //! 0b10001010, // value 10 |
39 | | //! b'b', // ASCII literal |
40 | | //! 0b10100011, // span of 3 |
41 | | //! 0x81, // first byte in span |
42 | | //! 0x91, // second byte in span |
43 | | //! 0xA1, // third and final byte in span |
44 | | //! 0b10000100, // value 4 |
45 | | //! ]; |
46 | | //! |
47 | | //! let trie = ZeroTriePerfectHash::from_bytes(&bytes); |
48 | | //! |
49 | | //! // First value: "a" → 10 |
50 | | //! assert_eq!(trie.get(b"a"), Some(10)); |
51 | | //! |
52 | | //! // Second value: "ab\x81\x91\xA1" → 4 |
53 | | //! assert_eq!(trie.get(b"ab\x81\x91\xA1"), Some(4)); |
54 | | //! |
55 | | //! // A few examples of strings that do NOT have values in the trie: |
56 | | //! assert_eq!(trie.get(b"ab"), None); |
57 | | //! assert_eq!(trie.get(b"b"), None); |
58 | | //! assert_eq!(trie.get(b"b\x81\x91\xA1"), None); |
59 | | //! ``` |
60 | | //! |
61 | | //! ## Branch Nodes |
62 | | //! |
63 | | //! There are two types of branch nodes: binary search and perfect hash. `ZeroTrieSimpleAscii` |
64 | | //! contains only binary search nodes, whereas `ZeroTriePerfectHash` can contain either. |
65 | | //! |
66 | | //! The head node of the branch has a varint that encodes two things: |
67 | | //! |
68 | | //! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256 |
69 | | //! - Bits 9 and 10: width of the offset table (`W`) |
70 | | //! |
71 | | //! Note that N is always in the range [1, 256]. There can't be more than 256 edges because |
72 | | //! there are only 256 unique u8 values. |
73 | | //! |
74 | | //! A few examples of the head node of the branch: |
75 | | //! |
76 | | //! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0 |
77 | | //! - `0b11000110`: varint bits `110`: N = 6; W = 0 |
78 | | //! - `0b11100000 0b00000101`: varint bits `1000101`: N = 69; W = 0 |
79 | | //! - `0b11100010 0b00000000`: varint bits `101000000`: N = 64; W = 1 |
80 | | //! |
81 | | //! In `ZeroTriePerfectHash`, if N <= 15, the branch is assumed to be a binary search, and if |
82 | | //! N > 15, the branch is assumed to be a perfect hash. |
83 | | //! |
84 | | //! ### Binary Search Branch Nodes |
85 | | //! |
86 | | //! A binary search branch node is used when: |
87 | | //! |
88 | | //! 1. The trie is a `ZeroTrieSimpleAscii`, OR |
89 | | //! 2. There are 15 or fewer items in the branch. |
90 | | //! |
91 | | //! The head branch node is followed by N sorted bytes. When evaluating a branch node, one byte |
92 | | //! is consumed from the input. If it is one of the N sorted bytes (scanned using binary search), |
93 | | //! the index `i` of the byte within the list is used to index into the offset table (described |
94 | | //! below). If the byte is not in the list, the string is not in the trie, so return `None`. |
95 | | //! |
96 | | //! ### Perfect Hash Branch Nodes |
97 | | //! |
98 | | //! A perfect hash branch node is used when: |
99 | | //! |
100 | | //! 1. The trie is NOT a `ZeroTrieSimpleAscii`, AND |
101 | | //! 2. There are 16 or more items in the branch. |
102 | | //! |
103 | | //! The head branch node is followed by 1 byte containing parameter `p`, N bytes containing |
104 | | //! parameters `q`, and N bytes containing the bytes to match. From these parameters, either an |
105 | | //! index within the hash table `i` is resolved and used as input to index into the offset |
106 | | //! table (described below), or the value is determined to not be present and `None` is |
107 | | //! returned. For more detail on resolving the perfect hash function, see [`crate::byte_phf`]. |
108 | | //! |
109 | | //! ### Offset Tables |
110 | | //! |
111 | | //! The _offset table_ encodes the range of the remaining buffer containing the trie reachable |
112 | | //! from the byte matched in the branch node. Both types of branch nodes include an offset |
113 | | //! table followig the key lookup. Given the index `i` from the first step, the range |
114 | | //! `[s_i, s_(i+1))` brackets the next step in the trie. |
115 | | //! |
116 | | //! Offset tables utilize the `W` parameter stored in the branch head node. The special case |
117 | | //! when `W == 0`, with `N - 1` bytes, is easiest to understand: |
118 | | //! |
119 | | //! **Offset table, W = 0:** `[s_1, s_2, ..., s_(N-1)]` |
120 | | //! |
121 | | //! Note that `s_0` is always 0 and `s_N` is always the length of the remaining slice, so those |
122 | | //! values are not explicitly included in the offset table. |
123 | | //! |
124 | | //! When W > 0, the high and low bits of the offsets are in separate bytes, arranged as follows: |
125 | | //! |
126 | | //! **Generalized offset table:** `[a_1, a_2, ..., a_(N-1), b_1, b_2, ..., b_(N-1), c_1, ...]` |
127 | | //! |
128 | | //! where `s_i = (a_i << 8 + b_i) << 8 + c_i ...` (high bits first, low bits last) |
129 | | //! |
130 | | //! ### Advanced Example |
131 | | //! |
132 | | //! The following trie encodes the following map. It has multiple varints and branch nodes, which |
133 | | //! are all binary search with W = 0. Note that there is a value for the empty string. |
134 | | //! |
135 | | //! - "" → 0 |
136 | | //! - "axb" → 100 |
137 | | //! - "ayc" → 2 |
138 | | //! - "azd" → 3 |
139 | | //! - "bxe" → 4 |
140 | | //! - "bxefg" → 500 |
141 | | //! - "bxefh" → 6 |
142 | | //! - "bxei" → 7 |
143 | | //! - "bxeikl" → 8 |
144 | | //! |
145 | | //! ``` |
146 | | //! use zerotrie::ZeroTrieSimpleAscii; |
147 | | //! |
148 | | //! let bytes = [ |
149 | | //! 0b10000000, // value 0 |
150 | | //! 0b11000010, // branch of 2 |
151 | | //! b'a', // |
152 | | //! b'b', // |
153 | | //! 13, // |
154 | | //! 0b11000011, // start of 'a' subtree: branch of 3 |
155 | | //! b'x', // |
156 | | //! b'y', // |
157 | | //! b'z', // |
158 | | //! 3, // |
159 | | //! 5, // |
160 | | //! b'b', // |
161 | | //! 0b10010000, // value 100 (lead) |
162 | | //! 0x54, // value 100 (trail) |
163 | | //! b'c', // |
164 | | //! 0b10000010, // value 2 |
165 | | //! b'd', // |
166 | | //! 0b10000011, // value 3 |
167 | | //! b'x', // start of 'b' subtree |
168 | | //! b'e', // |
169 | | //! 0b10000100, // value 4 |
170 | | //! 0b11000010, // branch of 2 |
171 | | //! b'f', // |
172 | | //! b'i', // |
173 | | //! 7, // |
174 | | //! 0b11000010, // branch of 2 |
175 | | //! b'g', // |
176 | | //! b'h', // |
177 | | //! 2, // |
178 | | //! 0b10010011, // value 500 (lead) |
179 | | //! 0x64, // value 500 (trail) |
180 | | //! 0b10000110, // value 6 |
181 | | //! 0b10000111, // value 7 |
182 | | //! b'k', // |
183 | | //! b'l', // |
184 | | //! 0b10001000, // value 8 |
185 | | //! ]; |
186 | | //! |
187 | | //! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes); |
188 | | //! |
189 | | //! // Assert that the specified items are in the map |
190 | | //! assert_eq!(trie.get(b""), Some(0)); |
191 | | //! assert_eq!(trie.get(b"axb"), Some(100)); |
192 | | //! assert_eq!(trie.get(b"ayc"), Some(2)); |
193 | | //! assert_eq!(trie.get(b"azd"), Some(3)); |
194 | | //! assert_eq!(trie.get(b"bxe"), Some(4)); |
195 | | //! assert_eq!(trie.get(b"bxefg"), Some(500)); |
196 | | //! assert_eq!(trie.get(b"bxefh"), Some(6)); |
197 | | //! assert_eq!(trie.get(b"bxei"), Some(7)); |
198 | | //! assert_eq!(trie.get(b"bxeikl"), Some(8)); |
199 | | //! |
200 | | //! // Assert that some other items are not in the map |
201 | | //! assert_eq!(trie.get(b"a"), None); |
202 | | //! assert_eq!(trie.get(b"bx"), None); |
203 | | //! assert_eq!(trie.get(b"xba"), None); |
204 | | //! ``` |
205 | | |
206 | | use crate::byte_phf::PerfectByteHashMap; |
207 | | use crate::cursor::AsciiProbeResult; |
208 | | use crate::helpers::*; |
209 | | use crate::options::*; |
210 | | use crate::varint::read_varint_meta2; |
211 | | use crate::varint::read_varint_meta3; |
212 | | |
213 | | #[cfg(feature = "alloc")] |
214 | | use alloc::string::String; |
215 | | |
216 | | /// Given a slice starting with an offset table, returns the trie for the given index. |
217 | | /// |
218 | | /// Arguments: |
219 | | /// - `trie` = a trie pointing at an offset table (after the branch node and search table) |
220 | | /// - `i` = the desired index within the offset table |
221 | | /// - `n` = the number of items in the offset table |
222 | | /// - `w` = the width of the offset table items minus one |
223 | | #[inline] |
224 | 0 | fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> &[u8] { |
225 | 0 | let mut p = 0usize; |
226 | 0 | let mut q = 0usize; |
227 | | loop { |
228 | | let indices; |
229 | 0 | (indices, trie) = trie.debug_split_at(n - 1); |
230 | 0 | p = (p << 8) |
231 | 0 | + if i == 0 { |
232 | 0 | 0 |
233 | | } else { |
234 | 0 | *indices.get(i - 1).debug_unwrap_or(&0) as usize |
235 | | }; |
236 | 0 | q = match indices.get(i) { |
237 | 0 | Some(x) => (q << 8) + *x as usize, |
238 | 0 | None => trie.len(), |
239 | | }; |
240 | 0 | if w == 0 { |
241 | 0 | break; |
242 | 0 | } |
243 | 0 | w -= 1; |
244 | | } |
245 | 0 | trie.get(p..q).debug_unwrap_or(&[]) |
246 | 0 | } Unexecuted instantiation: zerotrie::reader::get_branch Unexecuted instantiation: zerotrie::reader::get_branch |
247 | | |
248 | | /// Version of [`get_branch()`] specialized for the case `w == 0` for performance |
249 | | #[inline] |
250 | 0 | fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> &[u8] { |
251 | 0 | let indices; |
252 | 0 | (indices, trie) = trie.debug_split_at(n - 1); |
253 | 0 | let p = if i == 0 { |
254 | 0 | 0 |
255 | | } else { |
256 | 0 | *indices.get(i - 1).debug_unwrap_or(&0) as usize |
257 | | }; |
258 | 0 | let q = match indices.get(i) { |
259 | 0 | Some(x) => *x as usize, |
260 | 0 | None => trie.len(), |
261 | | }; |
262 | 0 | trie.get(p..q).debug_unwrap_or(&[]) |
263 | 0 | } Unexecuted instantiation: zerotrie::reader::get_branch_w0 Unexecuted instantiation: zerotrie::reader::get_branch_w0 |
264 | | |
265 | | /// The node type. See the module-level docs for more explanation of the four node types. |
266 | | enum NodeType { |
267 | | /// An ASCII node. Contains a single literal ASCII byte and no varint. |
268 | | Ascii, |
269 | | /// A span node. Contains a varint indicating how big the span is. |
270 | | Span, |
271 | | /// A value node. Contains a varint representing the value. |
272 | | Value, |
273 | | /// A branch node. Contains a varint of the number of output nodes, plus W in the high bits. |
274 | | Branch, |
275 | | } |
276 | | |
277 | | impl core::fmt::Debug for NodeType { |
278 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
279 | | use NodeType::*; |
280 | 0 | f.write_str(match *self { |
281 | 0 | Ascii => "a", |
282 | 0 | Span => "s", |
283 | 0 | Value => "v", |
284 | 0 | Branch => "m", |
285 | | }) |
286 | 0 | } |
287 | | } |
288 | | |
289 | | #[inline] |
290 | 0 | fn byte_type(b: u8) -> NodeType { |
291 | 0 | match b & 0b11100000 { |
292 | 0 | 0b10000000 => NodeType::Value, |
293 | 0 | 0b10100000 => NodeType::Span, |
294 | 0 | 0b11000000 => NodeType::Branch, |
295 | 0 | 0b11100000 => NodeType::Branch, |
296 | 0 | _ => NodeType::Ascii, |
297 | | } |
298 | 0 | } Unexecuted instantiation: zerotrie::reader::byte_type Unexecuted instantiation: zerotrie::reader::byte_type |
299 | | |
300 | | #[inline] |
301 | 0 | pub(crate) fn get_parameterized<T: ZeroTrieWithOptions + ?Sized>( |
302 | 0 | mut trie: &[u8], |
303 | 0 | mut ascii: &[u8], |
304 | 0 | ) -> Option<usize> { |
305 | | loop { |
306 | | let (b, x, i, search); |
307 | 0 | (b, trie) = trie.split_first()?; |
308 | 0 | let byte_type = byte_type(*b); |
309 | 0 | (x, trie) = match byte_type { |
310 | 0 | NodeType::Ascii => (0, trie), |
311 | | NodeType::Span => { |
312 | 0 | if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) { |
313 | 0 | read_varint_meta3(*b, trie) |
314 | | } else { |
315 | 0 | debug_assert!(false, "Span node found in ASCII trie!"); |
316 | 0 | return None; |
317 | | } |
318 | | } |
319 | 0 | NodeType::Value => read_varint_meta3(*b, trie), |
320 | 0 | NodeType::Branch => read_varint_meta2(*b, trie), |
321 | | }; |
322 | 0 | if let Some((c, temp)) = ascii.split_first() { |
323 | 0 | if matches!(byte_type, NodeType::Ascii) { |
324 | 0 | let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) |
325 | | { |
326 | 0 | b.to_ascii_lowercase() == c.to_ascii_lowercase() |
327 | | } else { |
328 | 0 | b == c |
329 | | }; |
330 | 0 | if is_match { |
331 | | // Matched a byte |
332 | 0 | ascii = temp; |
333 | 0 | continue; |
334 | | } else { |
335 | | // Byte that doesn't match |
336 | 0 | return None; |
337 | | } |
338 | 0 | } |
339 | 0 | if matches!(byte_type, NodeType::Value) { |
340 | | // Value node, but not at end of string |
341 | 0 | continue; |
342 | 0 | } |
343 | 0 | if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) |
344 | 0 | && matches!(byte_type, NodeType::Span) |
345 | | { |
346 | | let (trie_span, ascii_span); |
347 | 0 | (trie_span, trie) = trie.debug_split_at(x); |
348 | 0 | (ascii_span, ascii) = ascii.maybe_split_at(x)?; |
349 | 0 | if trie_span == ascii_span { |
350 | | // Matched a byte span |
351 | 0 | continue; |
352 | | } else { |
353 | | // Byte span that doesn't match |
354 | 0 | return None; |
355 | | } |
356 | 0 | } |
357 | | // Branch node |
358 | 0 | let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; |
359 | 0 | let w = if matches!(T::OPTIONS.capacity_mode, CapacityMode::Extended) { |
360 | 0 | w |
361 | | } else { |
362 | | // See the table below regarding this assertion |
363 | 0 | debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); |
364 | 0 | w & 0x3 |
365 | | }; |
366 | 0 | let x = if x == 0 { 256 } else { x }; |
367 | 0 | if matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly) || x < 16 { |
368 | | // binary search |
369 | 0 | (search, trie) = trie.debug_split_at(x); |
370 | 0 | let bsearch_result = |
371 | 0 | if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { |
372 | 0 | search.binary_search_by_key(&c.to_ascii_lowercase(), |x| { |
373 | 0 | x.to_ascii_lowercase() |
374 | 0 | }) Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroTriePerfectHash<zerovec::zerovec::ZeroVec<u8>>>::{closure#0} Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroTrieSimpleAscii<zerovec::zerovec::ZeroVec<u8>>>::{closure#0} Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroAsciiIgnoreCaseTrie<zerovec::zerovec::ZeroVec<u8>>>::{closure#0} Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroTrieExtendedCapacity<zerovec::zerovec::ZeroVec<u8>>>::{closure#0} Unexecuted instantiation: zerotrie::reader::get_parameterized::<_>::{closure#0} |
375 | | } else { |
376 | 0 | search.binary_search(c) |
377 | | }; |
378 | 0 | i = bsearch_result.ok()?; |
379 | | } else { |
380 | | // phf |
381 | 0 | (search, trie) = trie.debug_split_at(x * 2 + 1); |
382 | 0 | i = PerfectByteHashMap::from_store(search).get(*c)?; |
383 | | } |
384 | 0 | trie = if w == 0 { |
385 | 0 | get_branch_w0(trie, i, x) |
386 | | } else { |
387 | 0 | get_branch(trie, i, x, w) |
388 | | }; |
389 | 0 | ascii = temp; |
390 | 0 | continue; |
391 | | } else { |
392 | 0 | if matches!(byte_type, NodeType::Value) { |
393 | | // Value node at end of string |
394 | 0 | return Some(x); |
395 | 0 | } |
396 | 0 | return None; |
397 | | } |
398 | | } |
399 | 0 | } Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroTriePerfectHash<zerovec::zerovec::ZeroVec<u8>>> Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroTrieSimpleAscii<zerovec::zerovec::ZeroVec<u8>>> Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroAsciiIgnoreCaseTrie<zerovec::zerovec::ZeroVec<u8>>> Unexecuted instantiation: zerotrie::reader::get_parameterized::<zerotrie::zerotrie::ZeroTrieExtendedCapacity<zerovec::zerovec::ZeroVec<u8>>> Unexecuted instantiation: zerotrie::reader::get_parameterized::<_> |
400 | | |
401 | | // DISCUSS: This function is 7% faster *on aarch64* if we assert a max on w. |
402 | | // |
403 | | // | Bench | No Assert, x86_64 | No Assert, aarch64 | Assertion, x86_64 | Assertion, aarch64 | |
404 | | // |---------------|-------------------|--------------------|-------------------|--------------------| |
405 | | // | basic | ~187.51 ns | ~97.586 ns | ~199.11 ns | ~99.236 ns | |
406 | | // | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs | |
407 | | // | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs | |
408 | | |
409 | | /// Steps one node into the trie assuming all branch nodes are binary search and that |
410 | | /// there are no span nodes. |
411 | | /// |
412 | | /// The input-output argument `trie` starts at the original trie and ends pointing to |
413 | | /// the sub-trie reachable by `c`. |
414 | | #[inline] |
415 | 0 | pub(crate) fn step_parameterized<T: ZeroTrieWithOptions + ?Sized>( |
416 | 0 | trie: &mut &[u8], |
417 | 0 | c: u8, |
418 | 0 | ) -> Option<u8> { |
419 | 0 | // Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`. |
420 | 0 | // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie. |
421 | 0 | // If a span node is encountered, `None` is returned later in this function. |
422 | 0 | debug_assert!( |
423 | 0 | matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly), |
424 | 0 | "Spans not yet implemented in step function" |
425 | | ); |
426 | | // PHF can be easily implemented but the code is not yet reachable |
427 | 0 | debug_assert!( |
428 | 0 | matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly), |
429 | 0 | "PHF not yet implemented in step function" |
430 | | ); |
431 | | // Extended Capacity can be easily implemented but the code is not yet reachable |
432 | 0 | debug_assert!( |
433 | 0 | matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal), |
434 | 0 | "Extended capacity not yet implemented in step function" |
435 | | ); |
436 | | let (mut b, x, search); |
437 | | loop { |
438 | 0 | (b, *trie) = match trie.split_first() { |
439 | 0 | Some(v) => v, |
440 | | None => { |
441 | | // Empty trie or only a value node |
442 | 0 | return None; |
443 | | } |
444 | | }; |
445 | 0 | match byte_type(*b) { |
446 | | NodeType::Ascii => { |
447 | 0 | let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) |
448 | | { |
449 | 0 | b.to_ascii_lowercase() == c.to_ascii_lowercase() |
450 | | } else { |
451 | 0 | *b == c |
452 | | }; |
453 | 0 | if is_match { |
454 | | // Matched a byte |
455 | 0 | return Some(*b); |
456 | | } else { |
457 | | // Byte that doesn't match |
458 | 0 | *trie = &[]; |
459 | 0 | return None; |
460 | | } |
461 | | } |
462 | | NodeType::Branch => { |
463 | | // Proceed to the branch node logic below |
464 | 0 | (x, *trie) = read_varint_meta2(*b, trie); |
465 | 0 | break; |
466 | | } |
467 | | NodeType::Span => { |
468 | | // Question: Should we put the trie back into a valid state? |
469 | | // Currently this code is unreachable so let's not worry about it. |
470 | 0 | debug_assert!(false, "Span node found in ASCII trie!"); |
471 | 0 | return None; |
472 | | } |
473 | | NodeType::Value => { |
474 | | // Skip the value node and go to the next node |
475 | 0 | (_, *trie) = read_varint_meta3(*b, trie); |
476 | 0 | continue; |
477 | | } |
478 | | }; |
479 | | } |
480 | | // Branch node |
481 | 0 | let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; |
482 | | // See comment above regarding this assertion |
483 | 0 | debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); |
484 | 0 | let w = w & 0x3; |
485 | 0 | let x = if x == 0 { 256 } else { x }; |
486 | | // Always use binary search |
487 | 0 | (search, *trie) = trie.debug_split_at(x); |
488 | 0 | let bsearch_result = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { |
489 | 0 | search.binary_search_by_key(&c.to_ascii_lowercase(), |x| x.to_ascii_lowercase()) Unexecuted instantiation: zerotrie::reader::step_parameterized::<zerotrie::zerotrie::ZeroTrieSimpleAscii<[u8]>>::{closure#0} Unexecuted instantiation: zerotrie::reader::step_parameterized::<zerotrie::zerotrie::ZeroAsciiIgnoreCaseTrie<[u8]>>::{closure#0} |
490 | | } else { |
491 | 0 | search.binary_search(&c) |
492 | | }; |
493 | 0 | match bsearch_result { |
494 | 0 | Ok(i) => { |
495 | 0 | // Matched a byte |
496 | 0 | *trie = if w == 0 { |
497 | 0 | get_branch_w0(trie, i, x) |
498 | | } else { |
499 | 0 | get_branch(trie, i, x, w) |
500 | | }; |
501 | 0 | Some(search[i]) |
502 | | } |
503 | | Err(_) => { |
504 | | // Byte that doesn't match |
505 | 0 | *trie = &[]; |
506 | 0 | None |
507 | | } |
508 | | } |
509 | 0 | } Unexecuted instantiation: zerotrie::reader::step_parameterized::<zerotrie::zerotrie::ZeroTrieSimpleAscii<[u8]>> Unexecuted instantiation: zerotrie::reader::step_parameterized::<zerotrie::zerotrie::ZeroAsciiIgnoreCaseTrie<[u8]>> |
510 | | |
511 | | /// Steps one node into the trie, assuming all branch nodes are binary search and that |
512 | | /// there are no span nodes, using an index. |
513 | | /// |
514 | | /// The input-output argument `trie` starts at the original trie and ends pointing to |
515 | | /// the sub-trie indexed by `index`. |
516 | | #[inline] |
517 | 0 | pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>( |
518 | 0 | trie: &mut &[u8], |
519 | 0 | index: usize, |
520 | 0 | ) -> Option<AsciiProbeResult> { |
521 | 0 | // Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`. |
522 | 0 | // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie. |
523 | 0 | // If a span node is encountered, `None` is returned later in this function. |
524 | 0 | debug_assert!( |
525 | 0 | matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly), |
526 | 0 | "Spans not yet implemented in step function" |
527 | | ); |
528 | | // PHF can be easily implemented but the code is not yet reachable |
529 | 0 | debug_assert!( |
530 | 0 | matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly), |
531 | 0 | "PHF not yet implemented in step function" |
532 | | ); |
533 | | // Extended Capacity can be easily implemented but the code is not yet reachable |
534 | 0 | debug_assert!( |
535 | 0 | matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal), |
536 | 0 | "Extended capacity not yet implemented in step function" |
537 | | ); |
538 | | let (mut b, x, search); |
539 | | loop { |
540 | 0 | (b, *trie) = match trie.split_first() { |
541 | 0 | Some(v) => v, |
542 | | None => { |
543 | | // Empty trie or only a value node |
544 | 0 | return None; |
545 | | } |
546 | | }; |
547 | 0 | match byte_type(*b) { |
548 | | NodeType::Ascii => { |
549 | 0 | if index > 0 { |
550 | 0 | *trie = &[]; |
551 | 0 | return None; |
552 | 0 | } |
553 | 0 | return Some(AsciiProbeResult { |
554 | 0 | byte: *b, |
555 | 0 | total_siblings: 1, |
556 | 0 | }); |
557 | | } |
558 | | NodeType::Branch => { |
559 | | // Proceed to the branch node logic below |
560 | 0 | (x, *trie) = read_varint_meta2(*b, trie); |
561 | 0 | break; |
562 | | } |
563 | | NodeType::Span => { |
564 | | // Question: Should we put the trie back into a valid state? |
565 | | // Currently this code is unreachable so let's not worry about it. |
566 | 0 | debug_assert!(false, "Span node found in ASCII trie!"); |
567 | 0 | return None; |
568 | | } |
569 | | NodeType::Value => { |
570 | | // Skip the value node and go to the next node |
571 | 0 | (_, *trie) = read_varint_meta3(*b, trie); |
572 | 0 | continue; |
573 | | } |
574 | | }; |
575 | | } |
576 | | // Branch node |
577 | 0 | let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; |
578 | 0 | debug_assert!(u8::try_from(x).is_ok()); |
579 | 0 | let total_siblings = x as u8; |
580 | 0 | // See comment above regarding this assertion |
581 | 0 | debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); |
582 | 0 | let w = w & 0x3; |
583 | 0 | let x = if x == 0 { 256 } else { x }; |
584 | 0 | if index >= x { |
585 | 0 | *trie = &[]; |
586 | 0 | return None; |
587 | 0 | } |
588 | 0 | (search, *trie) = trie.debug_split_at(x); |
589 | 0 | *trie = if w == 0 { |
590 | 0 | get_branch_w0(trie, index, x) |
591 | | } else { |
592 | 0 | get_branch(trie, index, x, w) |
593 | | }; |
594 | 0 | Some(AsciiProbeResult { |
595 | 0 | byte: search[index], |
596 | 0 | total_siblings, |
597 | 0 | }) |
598 | 0 | } Unexecuted instantiation: zerotrie::reader::probe_parameterized::<zerotrie::zerotrie::ZeroTrieSimpleAscii<[u8]>> Unexecuted instantiation: zerotrie::reader::probe_parameterized::<zerotrie::zerotrie::ZeroAsciiIgnoreCaseTrie<[u8]>> |
599 | | |
600 | | /// Steps one node into the trie if the head node is a value node, returning the value. |
601 | | /// If the head node is not a value node, no change is made. |
602 | | /// |
603 | | /// The input-output argument `trie` starts at the original trie and ends pointing to |
604 | | /// the sub-trie with the value node removed. |
605 | 0 | pub(crate) fn take_value(trie: &mut &[u8]) -> Option<usize> { |
606 | 0 | let (b, new_trie) = trie.split_first()?; |
607 | 0 | match byte_type(*b) { |
608 | 0 | NodeType::Ascii | NodeType::Span | NodeType::Branch => None, |
609 | | NodeType::Value => { |
610 | | let x; |
611 | 0 | (x, *trie) = read_varint_meta3(*b, new_trie); |
612 | 0 | Some(x) |
613 | | } |
614 | | } |
615 | 0 | } |
616 | | |
617 | | #[cfg(feature = "alloc")] |
618 | | use alloc::vec::Vec; |
619 | | |
620 | | /// Internal iterator type for walking the strings contained in a ZeroTrie. |
621 | | #[cfg(feature = "alloc")] |
622 | | pub(crate) struct ZeroTrieIterator<'a> { |
623 | | /// Whether the PHF is enabled on this trie. |
624 | | use_phf: bool, |
625 | | /// Intermediate state during iteration: |
626 | | /// 1. A trie (usually a slice of the original, bigger trie) |
627 | | /// 2. The string that leads to the trie |
628 | | /// 3. If the trie's lead node is a branch node, the current index being evaluated |
629 | | state: Vec<(&'a [u8], Vec<u8>, usize)>, |
630 | | } |
631 | | |
632 | | #[cfg(feature = "alloc")] |
633 | | impl<'a> ZeroTrieIterator<'a> { |
634 | 0 | pub fn new<S: AsRef<[u8]> + ?Sized>(store: &'a S, use_phf: bool) -> Self { |
635 | 0 | ZeroTrieIterator { |
636 | 0 | use_phf, |
637 | 0 | state: alloc::vec![(store.as_ref(), alloc::vec![], 0)], |
638 | 0 | } |
639 | 0 | } |
640 | | } |
641 | | |
642 | | #[cfg(feature = "alloc")] |
643 | | impl<'a> Iterator for ZeroTrieIterator<'a> { |
644 | | type Item = (Vec<u8>, usize); |
645 | 0 | fn next(&mut self) -> Option<Self::Item> { |
646 | | let (mut trie, mut string, mut branch_idx); |
647 | 0 | (trie, string, branch_idx) = self.state.pop()?; |
648 | | loop { |
649 | | let (b, x, span, search); |
650 | 0 | let return_trie = trie; |
651 | 0 | (b, trie) = match trie.split_first() { |
652 | 0 | Some(tpl) => tpl, |
653 | | None => { |
654 | | // At end of current branch; step back to the branch node. |
655 | | // If there are no more branches, we are finished. |
656 | 0 | (trie, string, branch_idx) = self.state.pop()?; |
657 | 0 | continue; |
658 | | } |
659 | | }; |
660 | 0 | let byte_type = byte_type(*b); |
661 | 0 | if matches!(byte_type, NodeType::Ascii) { |
662 | 0 | string.push(*b); |
663 | 0 | continue; |
664 | 0 | } |
665 | 0 | (x, trie) = match byte_type { |
666 | 0 | NodeType::Ascii => (0, trie), |
667 | 0 | NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), |
668 | 0 | NodeType::Branch => read_varint_meta2(*b, trie), |
669 | | }; |
670 | 0 | if matches!(byte_type, NodeType::Span) { |
671 | 0 | (span, trie) = trie.debug_split_at(x); |
672 | 0 | string.extend(span); |
673 | 0 | continue; |
674 | 0 | } |
675 | 0 | if matches!(byte_type, NodeType::Value) { |
676 | 0 | let retval = string.clone(); |
677 | 0 | // Return to this position on the next step |
678 | 0 | self.state.push((trie, string, 0)); |
679 | 0 | return Some((retval, x)); |
680 | 0 | } |
681 | | // Match node |
682 | 0 | let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; |
683 | 0 | let x = if x == 0 { 256 } else { x }; |
684 | 0 | if branch_idx + 1 < x { |
685 | 0 | // Return to this branch node at the next index |
686 | 0 | self.state |
687 | 0 | .push((return_trie, string.clone(), branch_idx + 1)); |
688 | 0 | } |
689 | 0 | let byte = if x < 16 || !self.use_phf { |
690 | | // binary search |
691 | 0 | (search, trie) = trie.debug_split_at(x); |
692 | 0 | debug_unwrap!(search.get(branch_idx), return None) |
693 | | } else { |
694 | | // phf |
695 | 0 | (search, trie) = trie.debug_split_at(x * 2 + 1); |
696 | 0 | debug_unwrap!(search.get(branch_idx + x + 1), return None) |
697 | | }; |
698 | 0 | string.push(*byte); |
699 | 0 | trie = if w == 0 { |
700 | 0 | get_branch_w0(trie, branch_idx, x) |
701 | | } else { |
702 | 0 | get_branch(trie, branch_idx, x, w) |
703 | | }; |
704 | 0 | branch_idx = 0; |
705 | | } |
706 | 0 | } |
707 | | } |
708 | | |
709 | | #[cfg(feature = "alloc")] |
710 | 0 | pub(crate) fn get_iter_phf<S: AsRef<[u8]> + ?Sized>( |
711 | 0 | store: &S, |
712 | 0 | ) -> impl Iterator<Item = (Vec<u8>, usize)> + '_ { |
713 | 0 | ZeroTrieIterator::new(store, true) |
714 | 0 | } |
715 | | |
716 | | /// # Panics |
717 | | /// Panics if the trie contains non-ASCII items. |
718 | | #[cfg(feature = "alloc")] |
719 | 0 | pub(crate) fn get_iter_ascii_or_panic<S: AsRef<[u8]> + ?Sized>( |
720 | 0 | store: &S, |
721 | 0 | ) -> impl Iterator<Item = (String, usize)> + '_ { |
722 | 0 | ZeroTrieIterator::new(store, false).map(|(k, v)| { |
723 | 0 | #[allow(clippy::unwrap_used)] // in signature of function |
724 | 0 | let ascii_str = String::from_utf8(k).unwrap(); |
725 | 0 | (ascii_str, v) |
726 | 0 | }) |
727 | 0 | } |