Coverage Report

Created: 2025-06-16 06:50

/rust/registry/src/index.crates.io-6f17d22bba15001f/zerotrie-0.1.3/src/varint.rs
Line
Count
Source (jump to first uncovered line)
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5
//! Varint spec for ZeroTrie:
6
//!
7
//! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value
8
//! - Trail bytes: top bit is varint extender; rest are low bits of value
9
//! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte
10
//! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize`
11
//!
12
//! This is best shown by examples.
13
//!
14
//! ```txt
15
//! xxx0'1010 = 10
16
//! xxx0'1111 = 15 (largest single-byte value with M=3)
17
//! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3)
18
//! xxx1'0000 0000'0001 = 17
19
//! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3)
20
//! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3)
21
//! xxx1'0000 1000'0000 0000'0001 = 2065
22
//! ```
23
//!
24
//! The latent values by number of bytes for M=3 are:
25
//!
26
//! - 1 byte: 0
27
//! - 2 bytes: 16 = 0x10 = 0b10000
28
//! - 3 bytes: 2064 = 0x810 = 0b100000010000
29
//! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000
30
//! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000
31
//! - …
32
//!
33
//! For M=2, the latent values are:
34
//!
35
//! - 1 byte: 0
36
//! - 2 bytes: 32 = 0x20 = 0b100000
37
//! - 3 bytes: 4128 = 0x1020 = 0b1000000100000
38
//! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000
39
//! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000
40
//! - …
41
42
use crate::builder::konst::ConstArrayBuilder;
43
44
#[cfg(feature = "alloc")]
45
use crate::builder::nonconst::TrieBuilderStore;
46
47
/// Reads a varint with 2 bits of metadata in the lead byte.
48
///
49
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
50
///
51
/// If the varint spills off the end of the slice, a debug assertion will fail,
52
/// and the function will return the value up to that point.
53
0
pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
54
0
    let mut value = (start & 0b00011111) as usize;
55
0
    let mut remainder = remainder;
56
0
    if (start & 0b00100000) != 0 {
57
        loop {
58
            let next;
59
0
            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
60
            // Note: value << 7 could drop high bits. The first addition can't overflow.
61
            // The second addition could overflow; in such a case we just inform the
62
            // developer via the debug assertion.
63
0
            value = (value << 7) + ((*next & 0b01111111) as usize) + 32;
64
0
            if (*next & 0b10000000) == 0 {
65
0
                break;
66
0
            }
67
        }
68
0
    }
69
0
    (value, remainder)
70
0
}
71
72
/// Reads a varint with 3 bits of metadata in the lead byte.
73
///
74
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
75
///
76
/// If the varint spills off the end of the slice, a debug assertion will fail,
77
/// and the function will return the value up to that point.
78
0
pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
79
0
    let mut value = (start & 0b00001111) as usize;
80
0
    let mut remainder = remainder;
81
0
    if (start & 0b00010000) != 0 {
82
        loop {
83
            let next;
84
0
            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
85
            // Note: value << 7 could drop high bits. The first addition can't overflow.
86
            // The second addition could overflow; in such a case we just inform the
87
            // developer via the debug assertion.
88
0
            value = (value << 7) + ((*next & 0b01111111) as usize) + 16;
89
0
            if (*next & 0b10000000) == 0 {
90
0
                break;
91
0
            }
92
        }
93
0
    }
94
0
    (value, remainder)
95
0
}
96
97
/// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`].
98
///
99
/// Returns the varint value.
100
#[cfg(feature = "alloc")]
101
0
pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>(
102
0
    start: u8,
103
0
    remainder: &mut S,
104
0
) -> Option<usize> {
105
0
    let mut value = (start & 0b00001111) as usize;
106
0
    if (start & 0b00010000) != 0 {
107
        loop {
108
0
            let next = remainder.atbs_pop_front()?;
109
            // Note: value << 7 could drop high bits. The first addition can't overflow.
110
            // The second addition could overflow; in such a case we just inform the
111
            // developer via the debug assertion.
112
0
            value = (value << 7) + ((next & 0b01111111) as usize) + 16;
113
0
            if (next & 0b10000000) == 0 {
114
0
                break;
115
0
            }
116
        }
117
0
    }
118
0
    Some(value)
119
0
}
120
121
#[cfg(test)]
122
const MAX_VARINT: usize = usize::MAX;
123
124
// *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value.
125
// Add an extra 1 since the lead byte holds only 5 bits of data.
126
const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7;
127
128
/// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata.
129
0
pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
130
0
    let mut result = [0; MAX_VARINT_LENGTH];
131
0
    let mut i = MAX_VARINT_LENGTH - 1;
132
0
    let mut value = value;
133
0
    let mut last = true;
134
    loop {
135
0
        if value < 32 {
136
0
            result[i] = value as u8;
137
0
            if !last {
138
0
                result[i] |= 0b00100000;
139
0
            }
140
0
            break;
141
0
        }
142
0
        value -= 32;
143
0
        result[i] = (value as u8) & 0b01111111;
144
0
        if !last {
145
0
            result[i] |= 0b10000000;
146
0
        } else {
147
0
            last = false;
148
0
        }
149
0
        value >>= 7;
150
0
        i -= 1;
151
    }
152
    // The bytes are from i to the end.
153
0
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
154
0
}
155
156
/// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata.
157
0
pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
158
0
    let mut result = [0; MAX_VARINT_LENGTH];
159
0
    let mut i = MAX_VARINT_LENGTH - 1;
160
0
    let mut value = value;
161
0
    let mut last = true;
162
    loop {
163
0
        if value < 16 {
164
0
            result[i] = value as u8;
165
0
            if !last {
166
0
                result[i] |= 0b00010000;
167
0
            }
168
0
            break;
169
0
        }
170
0
        value -= 16;
171
0
        result[i] = (value as u8) & 0b01111111;
172
0
        if !last {
173
0
            result[i] |= 0b10000000;
174
0
        } else {
175
0
            last = false;
176
0
        }
177
0
        value >>= 7;
178
0
        i -= 1;
179
    }
180
    // The bytes are from i to the end.
181
0
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
182
0
}
183
184
/// A secondary implementation that separates the latent value while computing the varint.
185
#[cfg(test)]
186
pub(crate) const fn write_varint_reference(
187
    value: usize,
188
) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
189
    let mut result = [0; MAX_VARINT_LENGTH];
190
    if value < 32 {
191
        result[0] = value as u8;
192
        return ConstArrayBuilder::from_manual_slice(result, 0, 1);
193
    }
194
    result[0] = 32;
195
    let mut latent = 32;
196
    let mut steps = 2;
197
    loop {
198
        let next_latent = (latent << 7) + 32;
199
        if value < next_latent || next_latent == latent {
200
            break;
201
        }
202
        latent = next_latent;
203
        steps += 1;
204
    }
205
    let mut value = value - latent;
206
    let mut i = steps;
207
    while i > 0 {
208
        i -= 1;
209
        result[i] |= (value as u8) & 0b01111111;
210
        value >>= 7;
211
        if i > 0 && i < steps - 1 {
212
            result[i] |= 0b10000000;
213
        }
214
    }
215
    // The bytes are from 0 to `steps`.
216
    ConstArrayBuilder::from_manual_slice(result, 0, steps)
217
}
218
219
#[cfg(test)]
220
mod tests {
221
    use super::*;
222
223
    #[derive(Debug)]
224
    struct TestCase<'a> {
225
        bytes: &'a [u8],
226
        remainder: &'a [u8],
227
        value: usize,
228
    }
229
    static CASES: &[TestCase] = &[
230
        TestCase {
231
            bytes: &[0b00000000],
232
            remainder: &[],
233
            value: 0,
234
        },
235
        TestCase {
236
            bytes: &[0b00001010],
237
            remainder: &[],
238
            value: 10,
239
        },
240
        TestCase {
241
            bytes: &[0b00011111],
242
            remainder: &[],
243
            value: 31,
244
        },
245
        TestCase {
246
            bytes: &[0b00011111, 0b10101010],
247
            remainder: &[0b10101010],
248
            value: 31,
249
        },
250
        TestCase {
251
            bytes: &[0b00100000, 0b00000000],
252
            remainder: &[],
253
            value: 32,
254
        },
255
        TestCase {
256
            bytes: &[0b00100000, 0b00000001],
257
            remainder: &[],
258
            value: 33,
259
        },
260
        TestCase {
261
            bytes: &[0b00100000, 0b00100000],
262
            remainder: &[],
263
            value: 64,
264
        },
265
        TestCase {
266
            bytes: &[0x20, 0x44],
267
            remainder: &[],
268
            value: 100,
269
        },
270
        TestCase {
271
            bytes: &[0b00100000, 0b01111111],
272
            remainder: &[],
273
            value: 159,
274
        },
275
        TestCase {
276
            bytes: &[0b00100001, 0b00000000],
277
            remainder: &[],
278
            value: 160,
279
        },
280
        TestCase {
281
            bytes: &[0b00100001, 0b00000001],
282
            remainder: &[],
283
            value: 161,
284
        },
285
        TestCase {
286
            bytes: &[0x23, 0x54],
287
            remainder: &[],
288
            value: 500,
289
        },
290
        TestCase {
291
            bytes: &[0b00111111, 0b01111111],
292
            remainder: &[],
293
            value: 4127, // 32 + (1 << 12) - 1
294
        },
295
        TestCase {
296
            bytes: &[0b00100000, 0b10000000, 0b00000000],
297
            remainder: &[],
298
            value: 4128, // 32 + (1 << 12)
299
        },
300
        TestCase {
301
            bytes: &[0b00100000, 0b10000000, 0b00000001],
302
            remainder: &[],
303
            value: 4129, // 32 + (1 << 12) + 1
304
        },
305
        TestCase {
306
            bytes: &[0b00100000, 0b10000000, 0b01111111],
307
            remainder: &[],
308
            value: 4255, // 32 + (1 << 12) + 127
309
        },
310
        TestCase {
311
            bytes: &[0b00100000, 0b10000001, 0b00000000],
312
            remainder: &[],
313
            value: 4256, // 32 + (1 << 12) + 128
314
        },
315
        TestCase {
316
            bytes: &[0b00100000, 0b10000001, 0b00000001],
317
            remainder: &[],
318
            value: 4257, // 32 + (1 << 12) + 129
319
        },
320
        TestCase {
321
            bytes: &[0x20, 0x86, 0x68],
322
            remainder: &[],
323
            value: 5000,
324
        },
325
        TestCase {
326
            bytes: &[0b00100000, 0b11111111, 0b01111111],
327
            remainder: &[],
328
            value: 20511, // 32 + (1 << 12) + (1 << 14) - 1
329
        },
330
        TestCase {
331
            bytes: &[0b00100001, 0b10000000, 0b00000000],
332
            remainder: &[],
333
            value: 20512, // 32 + (1 << 12) + (1 << 14)
334
        },
335
        TestCase {
336
            bytes: &[0b00111111, 0b11111111, 0b01111111],
337
            remainder: &[],
338
            value: 528415, // 32 + (1 << 12) + (1 << 19) - 1
339
        },
340
        TestCase {
341
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000],
342
            remainder: &[],
343
            value: 528416, // 32 + (1 << 12) + (1 << 19)
344
        },
345
        TestCase {
346
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001],
347
            remainder: &[],
348
            value: 528417, // 32 + (1 << 12) + (1 << 19) + 1
349
        },
350
        TestCase {
351
            bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111],
352
            remainder: &[],
353
            value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1
354
        },
355
        TestCase {
356
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000],
357
            remainder: &[],
358
            value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26)
359
        },
360
    ];
361
362
    #[test]
363
    fn test_read() {
364
        for cas in CASES {
365
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
366
            assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas);
367
        }
368
    }
369
370
    #[test]
371
    fn test_read_write() {
372
        for cas in CASES {
373
            let reference_bytes = write_varint_reference(cas.value);
374
            assert_eq!(
375
                reference_bytes.len(),
376
                cas.bytes.len() - cas.remainder.len(),
377
                "{:?}",
378
                cas
379
            );
380
            assert_eq!(
381
                reference_bytes.as_slice(),
382
                &cas.bytes[0..reference_bytes.len()],
383
                "{:?}",
384
                cas
385
            );
386
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
387
            assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas);
388
            let write_bytes = write_varint_meta2(cas.value);
389
            assert_eq!(
390
                reference_bytes.as_slice(),
391
                write_bytes.as_slice(),
392
                "{:?}",
393
                cas
394
            );
395
        }
396
    }
397
398
    #[test]
399
    fn test_roundtrip() {
400
        let mut i = 0usize;
401
        while i < MAX_VARINT {
402
            let bytes = write_varint_meta2(i);
403
            let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]);
404
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
405
            i <<= 1;
406
            i += 1;
407
        }
408
    }
409
410
    #[test]
411
    fn test_extended_roundtrip() {
412
        let mut i = 0usize;
413
        while i < MAX_VARINT {
414
            let bytes = write_varint_meta3(i);
415
            let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]);
416
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
417
            i <<= 1;
418
            i += 1;
419
        }
420
    }
421
422
    #[test]
423
    fn test_max() {
424
        let reference_bytes = write_varint_reference(MAX_VARINT);
425
        let write_bytes = write_varint_meta2(MAX_VARINT);
426
        assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH);
427
        assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice());
428
        let subarray = write_bytes
429
            .as_const_slice()
430
            .get_subslice_or_panic(1, write_bytes.len());
431
        let (recovered_value, remainder) = read_varint_meta2(
432
            *write_bytes.as_const_slice().first().unwrap(),
433
            subarray.as_slice(),
434
        );
435
        assert!(remainder.is_empty());
436
        assert_eq!(recovered_value, MAX_VARINT);
437
        assert_eq!(
438
            write_bytes.as_slice(),
439
            &[
440
                0b00100001, //
441
                0b11011111, //
442
                0b11011111, //
443
                0b11011111, //
444
                0b11011111, //
445
                0b11011111, //
446
                0b11011111, //
447
                0b11011111, //
448
                0b11011111, //
449
                0b01011111, //
450
            ]
451
        );
452
    }
453
454
    #[test]
455
    fn text_extended_max() {
456
        let write_bytes = write_varint_meta3(MAX_VARINT);
457
        assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH);
458
        let (lead, trailing) = write_bytes.as_slice().split_first().unwrap();
459
        let (recovered_value, remainder) = read_varint_meta3(*lead, trailing);
460
        assert!(remainder.is_empty());
461
        assert_eq!(recovered_value, MAX_VARINT);
462
        assert_eq!(
463
            write_bytes.as_slice(),
464
            &[
465
                0b00010001, //
466
                0b11101111, //
467
                0b11101111, //
468
                0b11101111, //
469
                0b11101111, //
470
                0b11101111, //
471
                0b11101111, //
472
                0b11101111, //
473
                0b11101111, //
474
                0b01101111, //
475
            ]
476
        );
477
    }
478
479
    #[test]
480
    fn test_latent_values() {
481
        // Same values documented in the module docs: M=2
482
        let m2 = read_varint_meta2;
483
        assert_eq!(m2(0, &[]).0, 0);
484
        assert_eq!(m2(0x20, &[0x00]).0, 32);
485
        assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128);
486
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416);
487
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280);
488
489
        // Same values documented in the module docs: M=3
490
        let m3 = read_varint_meta3;
491
        assert_eq!(m3(0, &[]).0, 0);
492
        assert_eq!(m3(0x10, &[0x00]).0, 16);
493
        assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064);
494
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208);
495
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640);
496
    }
497
}