/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zerotrie-0.2.3/src/varint.rs
Line | Count | Source |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | //! Varint spec for ZeroTrie: |
6 | | //! |
7 | | //! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value |
8 | | //! - Trail bytes: top bit is varint extender; rest are low bits of value |
9 | | //! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte |
10 | | //! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize` |
11 | | //! |
12 | | //! This is best shown by examples. |
13 | | //! |
14 | | //! ```txt |
15 | | //! xxx0'1010 = 10 |
16 | | //! xxx0'1111 = 15 (largest single-byte value with M=3) |
17 | | //! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3) |
18 | | //! xxx1'0000 0000'0001 = 17 |
19 | | //! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3) |
20 | | //! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3) |
21 | | //! xxx1'0000 1000'0000 0000'0001 = 2065 |
22 | | //! ``` |
23 | | //! |
24 | | //! The latent values by number of bytes for M=3 are: |
25 | | //! |
26 | | //! - 1 byte: 0 |
27 | | //! - 2 bytes: 16 = 0x10 = 0b10000 |
28 | | //! - 3 bytes: 2064 = 0x810 = 0b100000010000 |
29 | | //! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000 |
30 | | //! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000 |
31 | | //! - … |
32 | | //! |
33 | | //! For M=2, the latent values are: |
34 | | //! |
35 | | //! - 1 byte: 0 |
36 | | //! - 2 bytes: 32 = 0x20 = 0b100000 |
37 | | //! - 3 bytes: 4128 = 0x1020 = 0b1000000100000 |
38 | | //! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000 |
39 | | //! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000 |
40 | | //! - … |
41 | | |
42 | | use crate::builder::konst::ConstArrayBuilder; |
43 | | |
44 | | #[cfg(feature = "alloc")] |
45 | | use crate::builder::nonconst::TrieBuilderStore; |
46 | | |
47 | | /// Reads a varint with 2 bits of metadata in the lead byte. |
48 | | /// |
49 | | /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. |
50 | | /// |
51 | | /// If the varint spills off the end of the slice, a debug assertion will fail, |
52 | | /// and the function will return the value up to that point. |
53 | 0 | pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) { |
54 | 0 | let mut value = (start & 0b00011111) as usize; |
55 | 0 | let mut remainder = remainder; |
56 | 0 | if (start & 0b00100000) != 0 { |
57 | | loop { |
58 | | let next; |
59 | 0 | (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); |
60 | | // Note: value << 7 could drop high bits. The first addition can't overflow. |
61 | | // The second addition could overflow; in such a case we just inform the |
62 | | // developer via the debug assertion. |
63 | 0 | value = (value << 7) + ((*next & 0b01111111) as usize) + 32; |
64 | 0 | if (*next & 0b10000000) == 0 { |
65 | 0 | break; |
66 | 0 | } |
67 | | } |
68 | 0 | } |
69 | 0 | (value, remainder) |
70 | 0 | } |
71 | | |
72 | | /// Reads a varint with 3 bits of metadata in the lead byte. |
73 | | /// |
74 | | /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. |
75 | | /// |
76 | | /// If the varint spills off the end of the slice, a debug assertion will fail, |
77 | | /// and the function will return the value up to that point. |
78 | 0 | pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) { |
79 | 0 | let mut value = (start & 0b00001111) as usize; |
80 | 0 | let mut remainder = remainder; |
81 | 0 | if (start & 0b00010000) != 0 { |
82 | | loop { |
83 | | let next; |
84 | 0 | (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); |
85 | | // Note: value << 7 could drop high bits. The first addition can't overflow. |
86 | | // The second addition could overflow; in such a case we just inform the |
87 | | // developer via the debug assertion. |
88 | 0 | value = (value << 7) + ((*next & 0b01111111) as usize) + 16; |
89 | 0 | if (*next & 0b10000000) == 0 { |
90 | 0 | break; |
91 | 0 | } |
92 | | } |
93 | 0 | } |
94 | 0 | (value, remainder) |
95 | 0 | } |
96 | | |
97 | | /// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`]. |
98 | | /// |
99 | | /// Returns the varint value. |
100 | | #[cfg(feature = "alloc")] |
101 | | pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>( |
102 | | start: u8, |
103 | | remainder: &mut S, |
104 | | ) -> Option<usize> { |
105 | | let mut value = (start & 0b00001111) as usize; |
106 | | if (start & 0b00010000) != 0 { |
107 | | loop { |
108 | | let next = remainder.atbs_pop_front()?; |
109 | | // Note: value << 7 could drop high bits. The first addition can't overflow. |
110 | | // The second addition could overflow; in such a case we just inform the |
111 | | // developer via the debug assertion. |
112 | | value = (value << 7) + ((next & 0b01111111) as usize) + 16; |
113 | | if (next & 0b10000000) == 0 { |
114 | | break; |
115 | | } |
116 | | } |
117 | | } |
118 | | Some(value) |
119 | | } |
120 | | |
121 | | #[cfg(test)] |
122 | | const MAX_VARINT: usize = usize::MAX; |
123 | | |
124 | | // *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value. |
125 | | // Add an extra 1 since the lead byte holds only 5 bits of data. |
126 | | const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7; |
127 | | |
128 | | /// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata. |
129 | | #[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct |
130 | 0 | pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
131 | 0 | let mut result = [0; MAX_VARINT_LENGTH]; |
132 | 0 | let mut i = MAX_VARINT_LENGTH - 1; |
133 | 0 | let mut value = value; |
134 | 0 | let mut last = true; |
135 | | loop { |
136 | 0 | if value < 32 { |
137 | 0 | result[i] = value as u8; |
138 | 0 | if !last { |
139 | 0 | result[i] |= 0b00100000; |
140 | 0 | } |
141 | 0 | break; |
142 | 0 | } |
143 | 0 | value -= 32; |
144 | 0 | result[i] = (value as u8) & 0b01111111; |
145 | 0 | if !last { |
146 | 0 | result[i] |= 0b10000000; |
147 | 0 | } else { |
148 | 0 | last = false; |
149 | 0 | } |
150 | 0 | value >>= 7; |
151 | 0 | i -= 1; |
152 | | } |
153 | | // The bytes are from i to the end. |
154 | 0 | ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) |
155 | 0 | } |
156 | | |
157 | | /// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata. |
158 | | #[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct |
159 | 0 | pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
160 | 0 | let mut result = [0; MAX_VARINT_LENGTH]; |
161 | 0 | let mut i = MAX_VARINT_LENGTH - 1; |
162 | 0 | let mut value = value; |
163 | 0 | let mut last = true; |
164 | | loop { |
165 | 0 | if value < 16 { |
166 | 0 | result[i] = value as u8; |
167 | 0 | if !last { |
168 | 0 | result[i] |= 0b00010000; |
169 | 0 | } |
170 | 0 | break; |
171 | 0 | } |
172 | 0 | value -= 16; |
173 | 0 | result[i] = (value as u8) & 0b01111111; |
174 | 0 | if !last { |
175 | 0 | result[i] |= 0b10000000; |
176 | 0 | } else { |
177 | 0 | last = false; |
178 | 0 | } |
179 | 0 | value >>= 7; |
180 | 0 | i -= 1; |
181 | | } |
182 | | // The bytes are from i to the end. |
183 | 0 | ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) |
184 | 0 | } |
185 | | |
186 | | /// A secondary implementation that separates the latent value while computing the varint. |
187 | | #[cfg(test)] |
188 | | pub(crate) const fn write_varint_reference( |
189 | | value: usize, |
190 | | ) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
191 | | let mut result = [0; MAX_VARINT_LENGTH]; |
192 | | if value < 32 { |
193 | | result[0] = value as u8; |
194 | | return ConstArrayBuilder::from_manual_slice(result, 0, 1); |
195 | | } |
196 | | result[0] = 32; |
197 | | let mut latent = 32; |
198 | | let mut steps = 2; |
199 | | loop { |
200 | | let next_latent = (latent << 7) + 32; |
201 | | if value < next_latent || next_latent == latent { |
202 | | break; |
203 | | } |
204 | | latent = next_latent; |
205 | | steps += 1; |
206 | | } |
207 | | let mut value = value - latent; |
208 | | let mut i = steps; |
209 | | while i > 0 { |
210 | | i -= 1; |
211 | | result[i] |= (value as u8) & 0b01111111; |
212 | | value >>= 7; |
213 | | if i > 0 && i < steps - 1 { |
214 | | result[i] |= 0b10000000; |
215 | | } |
216 | | } |
217 | | // The bytes are from 0 to `steps`. |
218 | | ConstArrayBuilder::from_manual_slice(result, 0, steps) |
219 | | } |
220 | | |
221 | | #[cfg(test)] |
222 | | mod tests { |
223 | | use super::*; |
224 | | |
225 | | #[derive(Debug)] |
226 | | struct TestCase<'a> { |
227 | | bytes: &'a [u8], |
228 | | remainder: &'a [u8], |
229 | | value: usize, |
230 | | } |
231 | | static CASES: &[TestCase] = &[ |
232 | | TestCase { |
233 | | bytes: &[0b00000000], |
234 | | remainder: &[], |
235 | | value: 0, |
236 | | }, |
237 | | TestCase { |
238 | | bytes: &[0b00001010], |
239 | | remainder: &[], |
240 | | value: 10, |
241 | | }, |
242 | | TestCase { |
243 | | bytes: &[0b00011111], |
244 | | remainder: &[], |
245 | | value: 31, |
246 | | }, |
247 | | TestCase { |
248 | | bytes: &[0b00011111, 0b10101010], |
249 | | remainder: &[0b10101010], |
250 | | value: 31, |
251 | | }, |
252 | | TestCase { |
253 | | bytes: &[0b00100000, 0b00000000], |
254 | | remainder: &[], |
255 | | value: 32, |
256 | | }, |
257 | | TestCase { |
258 | | bytes: &[0b00100000, 0b00000001], |
259 | | remainder: &[], |
260 | | value: 33, |
261 | | }, |
262 | | TestCase { |
263 | | bytes: &[0b00100000, 0b00100000], |
264 | | remainder: &[], |
265 | | value: 64, |
266 | | }, |
267 | | TestCase { |
268 | | bytes: &[0x20, 0x44], |
269 | | remainder: &[], |
270 | | value: 100, |
271 | | }, |
272 | | TestCase { |
273 | | bytes: &[0b00100000, 0b01111111], |
274 | | remainder: &[], |
275 | | value: 159, |
276 | | }, |
277 | | TestCase { |
278 | | bytes: &[0b00100001, 0b00000000], |
279 | | remainder: &[], |
280 | | value: 160, |
281 | | }, |
282 | | TestCase { |
283 | | bytes: &[0b00100001, 0b00000001], |
284 | | remainder: &[], |
285 | | value: 161, |
286 | | }, |
287 | | TestCase { |
288 | | bytes: &[0x23, 0x54], |
289 | | remainder: &[], |
290 | | value: 500, |
291 | | }, |
292 | | TestCase { |
293 | | bytes: &[0b00111111, 0b01111111], |
294 | | remainder: &[], |
295 | | value: 4127, // 32 + (1 << 12) - 1 |
296 | | }, |
297 | | TestCase { |
298 | | bytes: &[0b00100000, 0b10000000, 0b00000000], |
299 | | remainder: &[], |
300 | | value: 4128, // 32 + (1 << 12) |
301 | | }, |
302 | | TestCase { |
303 | | bytes: &[0b00100000, 0b10000000, 0b00000001], |
304 | | remainder: &[], |
305 | | value: 4129, // 32 + (1 << 12) + 1 |
306 | | }, |
307 | | TestCase { |
308 | | bytes: &[0b00100000, 0b10000000, 0b01111111], |
309 | | remainder: &[], |
310 | | value: 4255, // 32 + (1 << 12) + 127 |
311 | | }, |
312 | | TestCase { |
313 | | bytes: &[0b00100000, 0b10000001, 0b00000000], |
314 | | remainder: &[], |
315 | | value: 4256, // 32 + (1 << 12) + 128 |
316 | | }, |
317 | | TestCase { |
318 | | bytes: &[0b00100000, 0b10000001, 0b00000001], |
319 | | remainder: &[], |
320 | | value: 4257, // 32 + (1 << 12) + 129 |
321 | | }, |
322 | | TestCase { |
323 | | bytes: &[0x20, 0x86, 0x68], |
324 | | remainder: &[], |
325 | | value: 5000, |
326 | | }, |
327 | | TestCase { |
328 | | bytes: &[0b00100000, 0b11111111, 0b01111111], |
329 | | remainder: &[], |
330 | | value: 20511, // 32 + (1 << 12) + (1 << 14) - 1 |
331 | | }, |
332 | | TestCase { |
333 | | bytes: &[0b00100001, 0b10000000, 0b00000000], |
334 | | remainder: &[], |
335 | | value: 20512, // 32 + (1 << 12) + (1 << 14) |
336 | | }, |
337 | | TestCase { |
338 | | bytes: &[0b00111111, 0b11111111, 0b01111111], |
339 | | remainder: &[], |
340 | | value: 528415, // 32 + (1 << 12) + (1 << 19) - 1 |
341 | | }, |
342 | | TestCase { |
343 | | bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000], |
344 | | remainder: &[], |
345 | | value: 528416, // 32 + (1 << 12) + (1 << 19) |
346 | | }, |
347 | | TestCase { |
348 | | bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001], |
349 | | remainder: &[], |
350 | | value: 528417, // 32 + (1 << 12) + (1 << 19) + 1 |
351 | | }, |
352 | | TestCase { |
353 | | bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111], |
354 | | remainder: &[], |
355 | | value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1 |
356 | | }, |
357 | | TestCase { |
358 | | bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000], |
359 | | remainder: &[], |
360 | | value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26) |
361 | | }, |
362 | | ]; |
363 | | |
364 | | #[test] |
365 | | fn test_read() { |
366 | | for cas in CASES { |
367 | | let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); |
368 | | assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}"); |
369 | | } |
370 | | } |
371 | | |
372 | | #[test] |
373 | | fn test_read_write() { |
374 | | for cas in CASES { |
375 | | let reference_bytes = write_varint_reference(cas.value); |
376 | | assert_eq!( |
377 | | reference_bytes.len(), |
378 | | cas.bytes.len() - cas.remainder.len(), |
379 | | "{cas:?}" |
380 | | ); |
381 | | assert_eq!( |
382 | | reference_bytes.as_slice(), |
383 | | &cas.bytes[0..reference_bytes.len()], |
384 | | "{cas:?}" |
385 | | ); |
386 | | let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); |
387 | | assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}"); |
388 | | let write_bytes = write_varint_meta2(cas.value); |
389 | | assert_eq!( |
390 | | reference_bytes.as_slice(), |
391 | | write_bytes.as_slice(), |
392 | | "{cas:?}" |
393 | | ); |
394 | | } |
395 | | } |
396 | | |
397 | | #[test] |
398 | | fn test_roundtrip() { |
399 | | let mut i = 0usize; |
400 | | while i < MAX_VARINT { |
401 | | let bytes = write_varint_meta2(i); |
402 | | let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]); |
403 | | assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); |
404 | | i <<= 1; |
405 | | i += 1; |
406 | | } |
407 | | } |
408 | | |
409 | | #[test] |
410 | | fn test_extended_roundtrip() { |
411 | | let mut i = 0usize; |
412 | | while i < MAX_VARINT { |
413 | | let bytes = write_varint_meta3(i); |
414 | | let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]); |
415 | | assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); |
416 | | i <<= 1; |
417 | | i += 1; |
418 | | } |
419 | | } |
420 | | |
421 | | #[test] |
422 | | fn test_max() { |
423 | | let reference_bytes = write_varint_reference(MAX_VARINT); |
424 | | let write_bytes = write_varint_meta2(MAX_VARINT); |
425 | | assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH); |
426 | | assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice()); |
427 | | let subarray = write_bytes |
428 | | .as_const_slice() |
429 | | .get_subslice_or_panic(1, write_bytes.len()); |
430 | | let (recovered_value, remainder) = read_varint_meta2( |
431 | | *write_bytes.as_const_slice().first().unwrap(), |
432 | | subarray.as_slice(), |
433 | | ); |
434 | | assert!(remainder.is_empty()); |
435 | | assert_eq!(recovered_value, MAX_VARINT); |
436 | | #[cfg(target_pointer_width = "64")] |
437 | | assert_eq!( |
438 | | write_bytes.as_slice(), |
439 | | &[ |
440 | | 0b00100001, // |
441 | | 0b11011111, // |
442 | | 0b11011111, // |
443 | | 0b11011111, // |
444 | | 0b11011111, // |
445 | | 0b11011111, // |
446 | | 0b11011111, // |
447 | | 0b11011111, // |
448 | | 0b11011111, // |
449 | | 0b01011111, // |
450 | | ] |
451 | | ); |
452 | | #[cfg(target_pointer_width = "32")] |
453 | | assert_eq!( |
454 | | write_bytes.as_slice(), |
455 | | &[ |
456 | | 0b00101111, // |
457 | | 0b11011111, // |
458 | | 0b11011111, // |
459 | | 0b11011111, // |
460 | | 0b01011111, // |
461 | | ] |
462 | | ); |
463 | | } |
464 | | |
465 | | #[test] |
466 | | fn text_extended_max() { |
467 | | let write_bytes = write_varint_meta3(MAX_VARINT); |
468 | | assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH); |
469 | | let (lead, trailing) = write_bytes.as_slice().split_first().unwrap(); |
470 | | let (recovered_value, remainder) = read_varint_meta3(*lead, trailing); |
471 | | assert!(remainder.is_empty()); |
472 | | assert_eq!(recovered_value, MAX_VARINT); |
473 | | #[cfg(target_pointer_width = "64")] |
474 | | assert_eq!( |
475 | | write_bytes.as_slice(), |
476 | | &[ |
477 | | 0b00010001, // |
478 | | 0b11101111, // |
479 | | 0b11101111, // |
480 | | 0b11101111, // |
481 | | 0b11101111, // |
482 | | 0b11101111, // |
483 | | 0b11101111, // |
484 | | 0b11101111, // |
485 | | 0b11101111, // |
486 | | 0b01101111, // |
487 | | ] |
488 | | ); |
489 | | #[cfg(target_pointer_width = "32")] |
490 | | assert_eq!( |
491 | | write_bytes.as_slice(), |
492 | | &[ |
493 | | 0b00011111, // |
494 | | 0b11101111, // |
495 | | 0b11101111, // |
496 | | 0b11101111, // |
497 | | 0b01101111, // |
498 | | ] |
499 | | ); |
500 | | } |
501 | | |
502 | | #[test] |
503 | | fn test_latent_values() { |
504 | | // Same values documented in the module docs: M=2 |
505 | | let m2 = read_varint_meta2; |
506 | | assert_eq!(m2(0, &[]).0, 0); |
507 | | assert_eq!(m2(0x20, &[0x00]).0, 32); |
508 | | assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128); |
509 | | assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416); |
510 | | assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280); |
511 | | |
512 | | // Same values documented in the module docs: M=3 |
513 | | let m3 = read_varint_meta3; |
514 | | assert_eq!(m3(0, &[]).0, 0); |
515 | | assert_eq!(m3(0x10, &[0x00]).0, 16); |
516 | | assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064); |
517 | | assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208); |
518 | | assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640); |
519 | | } |
520 | | } |