Coverage Report

Created: 2026-01-13 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zerotrie-0.2.3/src/varint.rs
Line
Count
Source
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5
//! Varint spec for ZeroTrie:
6
//!
7
//! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value
8
//! - Trail bytes: top bit is varint extender; rest are low bits of value
9
//! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte
10
//! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize`
11
//!
12
//! This is best shown by examples.
13
//!
14
//! ```txt
15
//! xxx0'1010 = 10
16
//! xxx0'1111 = 15 (largest single-byte value with M=3)
17
//! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3)
18
//! xxx1'0000 0000'0001 = 17
19
//! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3)
20
//! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3)
21
//! xxx1'0000 1000'0000 0000'0001 = 2065
22
//! ```
23
//!
24
//! The latent values by number of bytes for M=3 are:
25
//!
26
//! - 1 byte: 0
27
//! - 2 bytes: 16 = 0x10 = 0b10000
28
//! - 3 bytes: 2064 = 0x810 = 0b100000010000
29
//! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000
30
//! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000
31
//! - …
32
//!
33
//! For M=2, the latent values are:
34
//!
35
//! - 1 byte: 0
36
//! - 2 bytes: 32 = 0x20 = 0b100000
37
//! - 3 bytes: 4128 = 0x1020 = 0b1000000100000
38
//! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000
39
//! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000
40
//! - …
41
42
use crate::builder::konst::ConstArrayBuilder;
43
44
#[cfg(feature = "alloc")]
45
use crate::builder::nonconst::TrieBuilderStore;
46
47
/// Reads a varint with 2 bits of metadata in the lead byte.
48
///
49
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
50
///
51
/// If the varint spills off the end of the slice, a debug assertion will fail,
52
/// and the function will return the value up to that point.
53
0
pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
54
0
    let mut value = (start & 0b00011111) as usize;
55
0
    let mut remainder = remainder;
56
0
    if (start & 0b00100000) != 0 {
57
        loop {
58
            let next;
59
0
            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
60
            // Note: value << 7 could drop high bits. The first addition can't overflow.
61
            // The second addition could overflow; in such a case we just inform the
62
            // developer via the debug assertion.
63
0
            value = (value << 7) + ((*next & 0b01111111) as usize) + 32;
64
0
            if (*next & 0b10000000) == 0 {
65
0
                break;
66
0
            }
67
        }
68
0
    }
69
0
    (value, remainder)
70
0
}
71
72
/// Reads a varint with 3 bits of metadata in the lead byte.
73
///
74
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
75
///
76
/// If the varint spills off the end of the slice, a debug assertion will fail,
77
/// and the function will return the value up to that point.
78
0
pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
79
0
    let mut value = (start & 0b00001111) as usize;
80
0
    let mut remainder = remainder;
81
0
    if (start & 0b00010000) != 0 {
82
        loop {
83
            let next;
84
0
            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
85
            // Note: value << 7 could drop high bits. The first addition can't overflow.
86
            // The second addition could overflow; in such a case we just inform the
87
            // developer via the debug assertion.
88
0
            value = (value << 7) + ((*next & 0b01111111) as usize) + 16;
89
0
            if (*next & 0b10000000) == 0 {
90
0
                break;
91
0
            }
92
        }
93
0
    }
94
0
    (value, remainder)
95
0
}
96
97
/// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`].
98
///
99
/// Returns the varint value.
100
#[cfg(feature = "alloc")]
101
pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>(
102
    start: u8,
103
    remainder: &mut S,
104
) -> Option<usize> {
105
    let mut value = (start & 0b00001111) as usize;
106
    if (start & 0b00010000) != 0 {
107
        loop {
108
            let next = remainder.atbs_pop_front()?;
109
            // Note: value << 7 could drop high bits. The first addition can't overflow.
110
            // The second addition could overflow; in such a case we just inform the
111
            // developer via the debug assertion.
112
            value = (value << 7) + ((next & 0b01111111) as usize) + 16;
113
            if (next & 0b10000000) == 0 {
114
                break;
115
            }
116
        }
117
    }
118
    Some(value)
119
}
120
121
#[cfg(test)]
122
const MAX_VARINT: usize = usize::MAX;
123
124
// *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value.
125
// Add an extra 1 since the lead byte holds only 5 bits of data.
126
const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7;
127
128
/// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata.
129
#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
130
0
pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
131
0
    let mut result = [0; MAX_VARINT_LENGTH];
132
0
    let mut i = MAX_VARINT_LENGTH - 1;
133
0
    let mut value = value;
134
0
    let mut last = true;
135
    loop {
136
0
        if value < 32 {
137
0
            result[i] = value as u8;
138
0
            if !last {
139
0
                result[i] |= 0b00100000;
140
0
            }
141
0
            break;
142
0
        }
143
0
        value -= 32;
144
0
        result[i] = (value as u8) & 0b01111111;
145
0
        if !last {
146
0
            result[i] |= 0b10000000;
147
0
        } else {
148
0
            last = false;
149
0
        }
150
0
        value >>= 7;
151
0
        i -= 1;
152
    }
153
    // The bytes are from i to the end.
154
0
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
155
0
}
156
157
/// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata.
158
#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
159
0
pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
160
0
    let mut result = [0; MAX_VARINT_LENGTH];
161
0
    let mut i = MAX_VARINT_LENGTH - 1;
162
0
    let mut value = value;
163
0
    let mut last = true;
164
    loop {
165
0
        if value < 16 {
166
0
            result[i] = value as u8;
167
0
            if !last {
168
0
                result[i] |= 0b00010000;
169
0
            }
170
0
            break;
171
0
        }
172
0
        value -= 16;
173
0
        result[i] = (value as u8) & 0b01111111;
174
0
        if !last {
175
0
            result[i] |= 0b10000000;
176
0
        } else {
177
0
            last = false;
178
0
        }
179
0
        value >>= 7;
180
0
        i -= 1;
181
    }
182
    // The bytes are from i to the end.
183
0
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
184
0
}
185
186
/// A secondary implementation that separates the latent value while computing the varint.
187
#[cfg(test)]
188
pub(crate) const fn write_varint_reference(
189
    value: usize,
190
) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
191
    let mut result = [0; MAX_VARINT_LENGTH];
192
    if value < 32 {
193
        result[0] = value as u8;
194
        return ConstArrayBuilder::from_manual_slice(result, 0, 1);
195
    }
196
    result[0] = 32;
197
    let mut latent = 32;
198
    let mut steps = 2;
199
    loop {
200
        let next_latent = (latent << 7) + 32;
201
        if value < next_latent || next_latent == latent {
202
            break;
203
        }
204
        latent = next_latent;
205
        steps += 1;
206
    }
207
    let mut value = value - latent;
208
    let mut i = steps;
209
    while i > 0 {
210
        i -= 1;
211
        result[i] |= (value as u8) & 0b01111111;
212
        value >>= 7;
213
        if i > 0 && i < steps - 1 {
214
            result[i] |= 0b10000000;
215
        }
216
    }
217
    // The bytes are from 0 to `steps`.
218
    ConstArrayBuilder::from_manual_slice(result, 0, steps)
219
}
220
221
#[cfg(test)]
222
mod tests {
223
    use super::*;
224
225
    #[derive(Debug)]
226
    struct TestCase<'a> {
227
        bytes: &'a [u8],
228
        remainder: &'a [u8],
229
        value: usize,
230
    }
231
    static CASES: &[TestCase] = &[
232
        TestCase {
233
            bytes: &[0b00000000],
234
            remainder: &[],
235
            value: 0,
236
        },
237
        TestCase {
238
            bytes: &[0b00001010],
239
            remainder: &[],
240
            value: 10,
241
        },
242
        TestCase {
243
            bytes: &[0b00011111],
244
            remainder: &[],
245
            value: 31,
246
        },
247
        TestCase {
248
            bytes: &[0b00011111, 0b10101010],
249
            remainder: &[0b10101010],
250
            value: 31,
251
        },
252
        TestCase {
253
            bytes: &[0b00100000, 0b00000000],
254
            remainder: &[],
255
            value: 32,
256
        },
257
        TestCase {
258
            bytes: &[0b00100000, 0b00000001],
259
            remainder: &[],
260
            value: 33,
261
        },
262
        TestCase {
263
            bytes: &[0b00100000, 0b00100000],
264
            remainder: &[],
265
            value: 64,
266
        },
267
        TestCase {
268
            bytes: &[0x20, 0x44],
269
            remainder: &[],
270
            value: 100,
271
        },
272
        TestCase {
273
            bytes: &[0b00100000, 0b01111111],
274
            remainder: &[],
275
            value: 159,
276
        },
277
        TestCase {
278
            bytes: &[0b00100001, 0b00000000],
279
            remainder: &[],
280
            value: 160,
281
        },
282
        TestCase {
283
            bytes: &[0b00100001, 0b00000001],
284
            remainder: &[],
285
            value: 161,
286
        },
287
        TestCase {
288
            bytes: &[0x23, 0x54],
289
            remainder: &[],
290
            value: 500,
291
        },
292
        TestCase {
293
            bytes: &[0b00111111, 0b01111111],
294
            remainder: &[],
295
            value: 4127, // 32 + (1 << 12) - 1
296
        },
297
        TestCase {
298
            bytes: &[0b00100000, 0b10000000, 0b00000000],
299
            remainder: &[],
300
            value: 4128, // 32 + (1 << 12)
301
        },
302
        TestCase {
303
            bytes: &[0b00100000, 0b10000000, 0b00000001],
304
            remainder: &[],
305
            value: 4129, // 32 + (1 << 12) + 1
306
        },
307
        TestCase {
308
            bytes: &[0b00100000, 0b10000000, 0b01111111],
309
            remainder: &[],
310
            value: 4255, // 32 + (1 << 12) + 127
311
        },
312
        TestCase {
313
            bytes: &[0b00100000, 0b10000001, 0b00000000],
314
            remainder: &[],
315
            value: 4256, // 32 + (1 << 12) + 128
316
        },
317
        TestCase {
318
            bytes: &[0b00100000, 0b10000001, 0b00000001],
319
            remainder: &[],
320
            value: 4257, // 32 + (1 << 12) + 129
321
        },
322
        TestCase {
323
            bytes: &[0x20, 0x86, 0x68],
324
            remainder: &[],
325
            value: 5000,
326
        },
327
        TestCase {
328
            bytes: &[0b00100000, 0b11111111, 0b01111111],
329
            remainder: &[],
330
            value: 20511, // 32 + (1 << 12) + (1 << 14) - 1
331
        },
332
        TestCase {
333
            bytes: &[0b00100001, 0b10000000, 0b00000000],
334
            remainder: &[],
335
            value: 20512, // 32 + (1 << 12) + (1 << 14)
336
        },
337
        TestCase {
338
            bytes: &[0b00111111, 0b11111111, 0b01111111],
339
            remainder: &[],
340
            value: 528415, // 32 + (1 << 12) + (1 << 19) - 1
341
        },
342
        TestCase {
343
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000],
344
            remainder: &[],
345
            value: 528416, // 32 + (1 << 12) + (1 << 19)
346
        },
347
        TestCase {
348
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001],
349
            remainder: &[],
350
            value: 528417, // 32 + (1 << 12) + (1 << 19) + 1
351
        },
352
        TestCase {
353
            bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111],
354
            remainder: &[],
355
            value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1
356
        },
357
        TestCase {
358
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000],
359
            remainder: &[],
360
            value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26)
361
        },
362
    ];
363
364
    #[test]
365
    fn test_read() {
366
        for cas in CASES {
367
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
368
            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
369
        }
370
    }
371
372
    #[test]
373
    fn test_read_write() {
374
        for cas in CASES {
375
            let reference_bytes = write_varint_reference(cas.value);
376
            assert_eq!(
377
                reference_bytes.len(),
378
                cas.bytes.len() - cas.remainder.len(),
379
                "{cas:?}"
380
            );
381
            assert_eq!(
382
                reference_bytes.as_slice(),
383
                &cas.bytes[0..reference_bytes.len()],
384
                "{cas:?}"
385
            );
386
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
387
            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
388
            let write_bytes = write_varint_meta2(cas.value);
389
            assert_eq!(
390
                reference_bytes.as_slice(),
391
                write_bytes.as_slice(),
392
                "{cas:?}"
393
            );
394
        }
395
    }
396
397
    #[test]
398
    fn test_roundtrip() {
399
        let mut i = 0usize;
400
        while i < MAX_VARINT {
401
            let bytes = write_varint_meta2(i);
402
            let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]);
403
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
404
            i <<= 1;
405
            i += 1;
406
        }
407
    }
408
409
    #[test]
410
    fn test_extended_roundtrip() {
411
        let mut i = 0usize;
412
        while i < MAX_VARINT {
413
            let bytes = write_varint_meta3(i);
414
            let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]);
415
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
416
            i <<= 1;
417
            i += 1;
418
        }
419
    }
420
421
    #[test]
422
    fn test_max() {
423
        let reference_bytes = write_varint_reference(MAX_VARINT);
424
        let write_bytes = write_varint_meta2(MAX_VARINT);
425
        assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH);
426
        assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice());
427
        let subarray = write_bytes
428
            .as_const_slice()
429
            .get_subslice_or_panic(1, write_bytes.len());
430
        let (recovered_value, remainder) = read_varint_meta2(
431
            *write_bytes.as_const_slice().first().unwrap(),
432
            subarray.as_slice(),
433
        );
434
        assert!(remainder.is_empty());
435
        assert_eq!(recovered_value, MAX_VARINT);
436
        #[cfg(target_pointer_width = "64")]
437
        assert_eq!(
438
            write_bytes.as_slice(),
439
            &[
440
                0b00100001, //
441
                0b11011111, //
442
                0b11011111, //
443
                0b11011111, //
444
                0b11011111, //
445
                0b11011111, //
446
                0b11011111, //
447
                0b11011111, //
448
                0b11011111, //
449
                0b01011111, //
450
            ]
451
        );
452
        #[cfg(target_pointer_width = "32")]
453
        assert_eq!(
454
            write_bytes.as_slice(),
455
            &[
456
                0b00101111, //
457
                0b11011111, //
458
                0b11011111, //
459
                0b11011111, //
460
                0b01011111, //
461
            ]
462
        );
463
    }
464
465
    #[test]
466
    fn text_extended_max() {
467
        let write_bytes = write_varint_meta3(MAX_VARINT);
468
        assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH);
469
        let (lead, trailing) = write_bytes.as_slice().split_first().unwrap();
470
        let (recovered_value, remainder) = read_varint_meta3(*lead, trailing);
471
        assert!(remainder.is_empty());
472
        assert_eq!(recovered_value, MAX_VARINT);
473
        #[cfg(target_pointer_width = "64")]
474
        assert_eq!(
475
            write_bytes.as_slice(),
476
            &[
477
                0b00010001, //
478
                0b11101111, //
479
                0b11101111, //
480
                0b11101111, //
481
                0b11101111, //
482
                0b11101111, //
483
                0b11101111, //
484
                0b11101111, //
485
                0b11101111, //
486
                0b01101111, //
487
            ]
488
        );
489
        #[cfg(target_pointer_width = "32")]
490
        assert_eq!(
491
            write_bytes.as_slice(),
492
            &[
493
                0b00011111, //
494
                0b11101111, //
495
                0b11101111, //
496
                0b11101111, //
497
                0b01101111, //
498
            ]
499
        );
500
    }
501
502
    #[test]
503
    fn test_latent_values() {
504
        // Same values documented in the module docs: M=2
505
        let m2 = read_varint_meta2;
506
        assert_eq!(m2(0, &[]).0, 0);
507
        assert_eq!(m2(0x20, &[0x00]).0, 32);
508
        assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128);
509
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416);
510
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280);
511
512
        // Same values documented in the module docs: M=3
513
        let m3 = read_varint_meta3;
514
        assert_eq!(m3(0, &[]).0, 0);
515
        assert_eq!(m3(0x10, &[0x00]).0, 16);
516
        assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064);
517
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208);
518
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640);
519
    }
520
}