/rust/registry/src/index.crates.io-6f17d22bba15001f/zerotrie-0.1.3/src/varint.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | //! Varint spec for ZeroTrie: |
6 | | //! |
7 | | //! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value |
8 | | //! - Trail bytes: top bit is varint extender; rest are low bits of value |
9 | | //! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte |
10 | | //! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize` |
11 | | //! |
12 | | //! This is best shown by examples. |
13 | | //! |
14 | | //! ```txt |
15 | | //! xxx0'1010 = 10 |
16 | | //! xxx0'1111 = 15 (largest single-byte value with M=3) |
17 | | //! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3) |
18 | | //! xxx1'0000 0000'0001 = 17 |
19 | | //! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3) |
20 | | //! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3) |
21 | | //! xxx1'0000 1000'0000 0000'0001 = 2065 |
22 | | //! ``` |
23 | | //! |
24 | | //! The latent values by number of bytes for M=3 are: |
25 | | //! |
26 | | //! - 1 byte: 0 |
27 | | //! - 2 bytes: 16 = 0x10 = 0b10000 |
28 | | //! - 3 bytes: 2064 = 0x810 = 0b100000010000 |
29 | | //! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000 |
30 | | //! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000 |
31 | | //! - … |
32 | | //! |
33 | | //! For M=2, the latent values are: |
34 | | //! |
35 | | //! - 1 byte: 0 |
36 | | //! - 2 bytes: 32 = 0x20 = 0b100000 |
37 | | //! - 3 bytes: 4128 = 0x1020 = 0b1000000100000 |
38 | | //! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000 |
39 | | //! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000 |
40 | | //! - … |
41 | | |
42 | | use crate::builder::konst::ConstArrayBuilder; |
43 | | |
44 | | #[cfg(feature = "alloc")] |
45 | | use crate::builder::nonconst::TrieBuilderStore; |
46 | | |
47 | | /// Reads a varint with 2 bits of metadata in the lead byte. |
48 | | /// |
49 | | /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. |
50 | | /// |
51 | | /// If the varint spills off the end of the slice, a debug assertion will fail, |
52 | | /// and the function will return the value up to that point. |
53 | 0 | pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) { |
54 | 0 | let mut value = (start & 0b00011111) as usize; |
55 | 0 | let mut remainder = remainder; |
56 | 0 | if (start & 0b00100000) != 0 { |
57 | | loop { |
58 | | let next; |
59 | 0 | (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); |
60 | | // Note: value << 7 could drop high bits. The first addition can't overflow. |
61 | | // The second addition could overflow; in such a case we just inform the |
62 | | // developer via the debug assertion. |
63 | 0 | value = (value << 7) + ((*next & 0b01111111) as usize) + 32; |
64 | 0 | if (*next & 0b10000000) == 0 { |
65 | 0 | break; |
66 | 0 | } |
67 | | } |
68 | 0 | } |
69 | 0 | (value, remainder) |
70 | 0 | } |
71 | | |
72 | | /// Reads a varint with 3 bits of metadata in the lead byte. |
73 | | /// |
74 | | /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. |
75 | | /// |
76 | | /// If the varint spills off the end of the slice, a debug assertion will fail, |
77 | | /// and the function will return the value up to that point. |
78 | 0 | pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) { |
79 | 0 | let mut value = (start & 0b00001111) as usize; |
80 | 0 | let mut remainder = remainder; |
81 | 0 | if (start & 0b00010000) != 0 { |
82 | | loop { |
83 | | let next; |
84 | 0 | (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); |
85 | | // Note: value << 7 could drop high bits. The first addition can't overflow. |
86 | | // The second addition could overflow; in such a case we just inform the |
87 | | // developer via the debug assertion. |
88 | 0 | value = (value << 7) + ((*next & 0b01111111) as usize) + 16; |
89 | 0 | if (*next & 0b10000000) == 0 { |
90 | 0 | break; |
91 | 0 | } |
92 | | } |
93 | 0 | } |
94 | 0 | (value, remainder) |
95 | 0 | } |
96 | | |
97 | | /// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`]. |
98 | | /// |
99 | | /// Returns the varint value. |
100 | | #[cfg(feature = "alloc")] |
101 | 0 | pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>( |
102 | 0 | start: u8, |
103 | 0 | remainder: &mut S, |
104 | 0 | ) -> Option<usize> { |
105 | 0 | let mut value = (start & 0b00001111) as usize; |
106 | 0 | if (start & 0b00010000) != 0 { |
107 | | loop { |
108 | 0 | let next = remainder.atbs_pop_front()?; |
109 | | // Note: value << 7 could drop high bits. The first addition can't overflow. |
110 | | // The second addition could overflow; in such a case we just inform the |
111 | | // developer via the debug assertion. |
112 | 0 | value = (value << 7) + ((next & 0b01111111) as usize) + 16; |
113 | 0 | if (next & 0b10000000) == 0 { |
114 | 0 | break; |
115 | 0 | } |
116 | | } |
117 | 0 | } |
118 | 0 | Some(value) |
119 | 0 | } |
120 | | |
121 | | #[cfg(test)] |
122 | | const MAX_VARINT: usize = usize::MAX; |
123 | | |
124 | | // *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value. |
125 | | // Add an extra 1 since the lead byte holds only 5 bits of data. |
126 | | const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7; |
127 | | |
128 | | /// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata. |
129 | 0 | pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
130 | 0 | let mut result = [0; MAX_VARINT_LENGTH]; |
131 | 0 | let mut i = MAX_VARINT_LENGTH - 1; |
132 | 0 | let mut value = value; |
133 | 0 | let mut last = true; |
134 | | loop { |
135 | 0 | if value < 32 { |
136 | 0 | result[i] = value as u8; |
137 | 0 | if !last { |
138 | 0 | result[i] |= 0b00100000; |
139 | 0 | } |
140 | 0 | break; |
141 | 0 | } |
142 | 0 | value -= 32; |
143 | 0 | result[i] = (value as u8) & 0b01111111; |
144 | 0 | if !last { |
145 | 0 | result[i] |= 0b10000000; |
146 | 0 | } else { |
147 | 0 | last = false; |
148 | 0 | } |
149 | 0 | value >>= 7; |
150 | 0 | i -= 1; |
151 | | } |
152 | | // The bytes are from i to the end. |
153 | 0 | ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) |
154 | 0 | } |
155 | | |
156 | | /// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata. |
157 | 0 | pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
158 | 0 | let mut result = [0; MAX_VARINT_LENGTH]; |
159 | 0 | let mut i = MAX_VARINT_LENGTH - 1; |
160 | 0 | let mut value = value; |
161 | 0 | let mut last = true; |
162 | | loop { |
163 | 0 | if value < 16 { |
164 | 0 | result[i] = value as u8; |
165 | 0 | if !last { |
166 | 0 | result[i] |= 0b00010000; |
167 | 0 | } |
168 | 0 | break; |
169 | 0 | } |
170 | 0 | value -= 16; |
171 | 0 | result[i] = (value as u8) & 0b01111111; |
172 | 0 | if !last { |
173 | 0 | result[i] |= 0b10000000; |
174 | 0 | } else { |
175 | 0 | last = false; |
176 | 0 | } |
177 | 0 | value >>= 7; |
178 | 0 | i -= 1; |
179 | | } |
180 | | // The bytes are from i to the end. |
181 | 0 | ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) |
182 | 0 | } |
183 | | |
184 | | /// A secondary implementation that separates the latent value while computing the varint. |
185 | | #[cfg(test)] |
186 | | pub(crate) const fn write_varint_reference( |
187 | | value: usize, |
188 | | ) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
189 | | let mut result = [0; MAX_VARINT_LENGTH]; |
190 | | if value < 32 { |
191 | | result[0] = value as u8; |
192 | | return ConstArrayBuilder::from_manual_slice(result, 0, 1); |
193 | | } |
194 | | result[0] = 32; |
195 | | let mut latent = 32; |
196 | | let mut steps = 2; |
197 | | loop { |
198 | | let next_latent = (latent << 7) + 32; |
199 | | if value < next_latent || next_latent == latent { |
200 | | break; |
201 | | } |
202 | | latent = next_latent; |
203 | | steps += 1; |
204 | | } |
205 | | let mut value = value - latent; |
206 | | let mut i = steps; |
207 | | while i > 0 { |
208 | | i -= 1; |
209 | | result[i] |= (value as u8) & 0b01111111; |
210 | | value >>= 7; |
211 | | if i > 0 && i < steps - 1 { |
212 | | result[i] |= 0b10000000; |
213 | | } |
214 | | } |
215 | | // The bytes are from 0 to `steps`. |
216 | | ConstArrayBuilder::from_manual_slice(result, 0, steps) |
217 | | } |
218 | | |
219 | | #[cfg(test)] |
220 | | mod tests { |
221 | | use super::*; |
222 | | |
223 | | #[derive(Debug)] |
224 | | struct TestCase<'a> { |
225 | | bytes: &'a [u8], |
226 | | remainder: &'a [u8], |
227 | | value: usize, |
228 | | } |
229 | | static CASES: &[TestCase] = &[ |
230 | | TestCase { |
231 | | bytes: &[0b00000000], |
232 | | remainder: &[], |
233 | | value: 0, |
234 | | }, |
235 | | TestCase { |
236 | | bytes: &[0b00001010], |
237 | | remainder: &[], |
238 | | value: 10, |
239 | | }, |
240 | | TestCase { |
241 | | bytes: &[0b00011111], |
242 | | remainder: &[], |
243 | | value: 31, |
244 | | }, |
245 | | TestCase { |
246 | | bytes: &[0b00011111, 0b10101010], |
247 | | remainder: &[0b10101010], |
248 | | value: 31, |
249 | | }, |
250 | | TestCase { |
251 | | bytes: &[0b00100000, 0b00000000], |
252 | | remainder: &[], |
253 | | value: 32, |
254 | | }, |
255 | | TestCase { |
256 | | bytes: &[0b00100000, 0b00000001], |
257 | | remainder: &[], |
258 | | value: 33, |
259 | | }, |
260 | | TestCase { |
261 | | bytes: &[0b00100000, 0b00100000], |
262 | | remainder: &[], |
263 | | value: 64, |
264 | | }, |
265 | | TestCase { |
266 | | bytes: &[0x20, 0x44], |
267 | | remainder: &[], |
268 | | value: 100, |
269 | | }, |
270 | | TestCase { |
271 | | bytes: &[0b00100000, 0b01111111], |
272 | | remainder: &[], |
273 | | value: 159, |
274 | | }, |
275 | | TestCase { |
276 | | bytes: &[0b00100001, 0b00000000], |
277 | | remainder: &[], |
278 | | value: 160, |
279 | | }, |
280 | | TestCase { |
281 | | bytes: &[0b00100001, 0b00000001], |
282 | | remainder: &[], |
283 | | value: 161, |
284 | | }, |
285 | | TestCase { |
286 | | bytes: &[0x23, 0x54], |
287 | | remainder: &[], |
288 | | value: 500, |
289 | | }, |
290 | | TestCase { |
291 | | bytes: &[0b00111111, 0b01111111], |
292 | | remainder: &[], |
293 | | value: 4127, // 32 + (1 << 12) - 1 |
294 | | }, |
295 | | TestCase { |
296 | | bytes: &[0b00100000, 0b10000000, 0b00000000], |
297 | | remainder: &[], |
298 | | value: 4128, // 32 + (1 << 12) |
299 | | }, |
300 | | TestCase { |
301 | | bytes: &[0b00100000, 0b10000000, 0b00000001], |
302 | | remainder: &[], |
303 | | value: 4129, // 32 + (1 << 12) + 1 |
304 | | }, |
305 | | TestCase { |
306 | | bytes: &[0b00100000, 0b10000000, 0b01111111], |
307 | | remainder: &[], |
308 | | value: 4255, // 32 + (1 << 12) + 127 |
309 | | }, |
310 | | TestCase { |
311 | | bytes: &[0b00100000, 0b10000001, 0b00000000], |
312 | | remainder: &[], |
313 | | value: 4256, // 32 + (1 << 12) + 128 |
314 | | }, |
315 | | TestCase { |
316 | | bytes: &[0b00100000, 0b10000001, 0b00000001], |
317 | | remainder: &[], |
318 | | value: 4257, // 32 + (1 << 12) + 129 |
319 | | }, |
320 | | TestCase { |
321 | | bytes: &[0x20, 0x86, 0x68], |
322 | | remainder: &[], |
323 | | value: 5000, |
324 | | }, |
325 | | TestCase { |
326 | | bytes: &[0b00100000, 0b11111111, 0b01111111], |
327 | | remainder: &[], |
328 | | value: 20511, // 32 + (1 << 12) + (1 << 14) - 1 |
329 | | }, |
330 | | TestCase { |
331 | | bytes: &[0b00100001, 0b10000000, 0b00000000], |
332 | | remainder: &[], |
333 | | value: 20512, // 32 + (1 << 12) + (1 << 14) |
334 | | }, |
335 | | TestCase { |
336 | | bytes: &[0b00111111, 0b11111111, 0b01111111], |
337 | | remainder: &[], |
338 | | value: 528415, // 32 + (1 << 12) + (1 << 19) - 1 |
339 | | }, |
340 | | TestCase { |
341 | | bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000], |
342 | | remainder: &[], |
343 | | value: 528416, // 32 + (1 << 12) + (1 << 19) |
344 | | }, |
345 | | TestCase { |
346 | | bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001], |
347 | | remainder: &[], |
348 | | value: 528417, // 32 + (1 << 12) + (1 << 19) + 1 |
349 | | }, |
350 | | TestCase { |
351 | | bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111], |
352 | | remainder: &[], |
353 | | value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1 |
354 | | }, |
355 | | TestCase { |
356 | | bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000], |
357 | | remainder: &[], |
358 | | value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26) |
359 | | }, |
360 | | ]; |
361 | | |
362 | | #[test] |
363 | | fn test_read() { |
364 | | for cas in CASES { |
365 | | let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); |
366 | | assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); |
367 | | } |
368 | | } |
369 | | |
370 | | #[test] |
371 | | fn test_read_write() { |
372 | | for cas in CASES { |
373 | | let reference_bytes = write_varint_reference(cas.value); |
374 | | assert_eq!( |
375 | | reference_bytes.len(), |
376 | | cas.bytes.len() - cas.remainder.len(), |
377 | | "{:?}", |
378 | | cas |
379 | | ); |
380 | | assert_eq!( |
381 | | reference_bytes.as_slice(), |
382 | | &cas.bytes[0..reference_bytes.len()], |
383 | | "{:?}", |
384 | | cas |
385 | | ); |
386 | | let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); |
387 | | assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); |
388 | | let write_bytes = write_varint_meta2(cas.value); |
389 | | assert_eq!( |
390 | | reference_bytes.as_slice(), |
391 | | write_bytes.as_slice(), |
392 | | "{:?}", |
393 | | cas |
394 | | ); |
395 | | } |
396 | | } |
397 | | |
398 | | #[test] |
399 | | fn test_roundtrip() { |
400 | | let mut i = 0usize; |
401 | | while i < MAX_VARINT { |
402 | | let bytes = write_varint_meta2(i); |
403 | | let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]); |
404 | | assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); |
405 | | i <<= 1; |
406 | | i += 1; |
407 | | } |
408 | | } |
409 | | |
410 | | #[test] |
411 | | fn test_extended_roundtrip() { |
412 | | let mut i = 0usize; |
413 | | while i < MAX_VARINT { |
414 | | let bytes = write_varint_meta3(i); |
415 | | let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]); |
416 | | assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); |
417 | | i <<= 1; |
418 | | i += 1; |
419 | | } |
420 | | } |
421 | | |
422 | | #[test] |
423 | | fn test_max() { |
424 | | let reference_bytes = write_varint_reference(MAX_VARINT); |
425 | | let write_bytes = write_varint_meta2(MAX_VARINT); |
426 | | assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH); |
427 | | assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice()); |
428 | | let subarray = write_bytes |
429 | | .as_const_slice() |
430 | | .get_subslice_or_panic(1, write_bytes.len()); |
431 | | let (recovered_value, remainder) = read_varint_meta2( |
432 | | *write_bytes.as_const_slice().first().unwrap(), |
433 | | subarray.as_slice(), |
434 | | ); |
435 | | assert!(remainder.is_empty()); |
436 | | assert_eq!(recovered_value, MAX_VARINT); |
437 | | assert_eq!( |
438 | | write_bytes.as_slice(), |
439 | | &[ |
440 | | 0b00100001, // |
441 | | 0b11011111, // |
442 | | 0b11011111, // |
443 | | 0b11011111, // |
444 | | 0b11011111, // |
445 | | 0b11011111, // |
446 | | 0b11011111, // |
447 | | 0b11011111, // |
448 | | 0b11011111, // |
449 | | 0b01011111, // |
450 | | ] |
451 | | ); |
452 | | } |
453 | | |
454 | | #[test] |
455 | | fn text_extended_max() { |
456 | | let write_bytes = write_varint_meta3(MAX_VARINT); |
457 | | assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH); |
458 | | let (lead, trailing) = write_bytes.as_slice().split_first().unwrap(); |
459 | | let (recovered_value, remainder) = read_varint_meta3(*lead, trailing); |
460 | | assert!(remainder.is_empty()); |
461 | | assert_eq!(recovered_value, MAX_VARINT); |
462 | | assert_eq!( |
463 | | write_bytes.as_slice(), |
464 | | &[ |
465 | | 0b00010001, // |
466 | | 0b11101111, // |
467 | | 0b11101111, // |
468 | | 0b11101111, // |
469 | | 0b11101111, // |
470 | | 0b11101111, // |
471 | | 0b11101111, // |
472 | | 0b11101111, // |
473 | | 0b11101111, // |
474 | | 0b01101111, // |
475 | | ] |
476 | | ); |
477 | | } |
478 | | |
479 | | #[test] |
480 | | fn test_latent_values() { |
481 | | // Same values documented in the module docs: M=2 |
482 | | let m2 = read_varint_meta2; |
483 | | assert_eq!(m2(0, &[]).0, 0); |
484 | | assert_eq!(m2(0x20, &[0x00]).0, 32); |
485 | | assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128); |
486 | | assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416); |
487 | | assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280); |
488 | | |
489 | | // Same values documented in the module docs: M=3 |
490 | | let m3 = read_varint_meta3; |
491 | | assert_eq!(m3(0, &[]).0, 0); |
492 | | assert_eq!(m3(0x10, &[0x00]).0, 16); |
493 | | assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064); |
494 | | assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208); |
495 | | assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640); |
496 | | } |
497 | | } |