Coverage Report

Created: 2025-11-16 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.2.0/src/util/alphabet.rs
Line
Count
Source
1
use core::convert::TryFrom;
2
3
use crate::util::{
4
    bytes::{DeserializeError, SerializeError},
5
    DebugByte,
6
};
7
8
/// Unit represents a single unit of input for DFA based regex engines.
9
///
10
/// **NOTE:** It is not expected for consumers of this crate to need to use
11
/// this type unless they are implementing their own DFA. And even then, it's
12
/// not required: implementors may use other techniques to handle input.
13
///
14
/// Typically, a single unit of input for a DFA would be a single byte.
15
/// However, for the DFAs in this crate, matches are delayed by a single byte
16
/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
17
/// we have consumed the haystack, we must run the DFA through one additional
18
/// transition using an input that indicates the haystack has ended.
19
///
20
/// Since there is no way to represent a sentinel with a `u8` since all
21
/// possible values *may* be valid inputs to a DFA, this type explicitly adds
22
/// room for a sentinel value.
23
///
24
/// The sentinel EOI value is always its own equivalence class and is
25
/// ultimately represented by adding 1 to the maximum equivalence class value.
26
/// So for example, the regex `^[a-z]+$` might be split into the following
27
/// equivalence classes:
28
///
29
/// ```text
30
/// 0 => [\x00-`]
31
/// 1 => [a-z]
32
/// 2 => [{-\xFF]
33
/// 3 => [EOI]
34
/// ```
35
///
36
/// Where EOI is the special sentinel value that is always in its own
37
/// singleton equivalence class.
38
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
39
pub enum Unit {
40
    U8(u8),
41
    EOI(u16),
42
}
43
44
impl Unit {
45
    /// Create a new input unit from a byte value.
46
    ///
47
    /// All possible byte values are legal. However, when creating an input
48
    /// unit for a specific DFA, one should be careful to only construct input
49
    /// units that are in that DFA's alphabet. Namely, one way to compact a
50
    /// DFA's in-memory representation is to collapse its transitions to a set
51
    /// of equivalence classes into a set of all possible byte values. If a
52
    /// DFA uses equivalence classes instead of byte values, then the byte
53
    /// given here should be the equivalence class.
54
0
    pub fn u8(byte: u8) -> Unit {
55
0
        Unit::U8(byte)
56
0
    }
57
58
0
    pub fn eoi(num_byte_equiv_classes: usize) -> Unit {
59
0
        assert!(
60
0
            num_byte_equiv_classes <= 256,
61
0
            "max number of byte-based equivalent classes is 256, but got {}",
62
            num_byte_equiv_classes,
63
        );
64
0
        Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap())
65
0
    }
66
67
0
    pub fn as_u8(self) -> Option<u8> {
68
0
        match self {
69
0
            Unit::U8(b) => Some(b),
70
0
            Unit::EOI(_) => None,
71
        }
72
0
    }
73
74
    #[cfg(feature = "alloc")]
75
    pub fn as_eoi(self) -> Option<usize> {
76
        match self {
77
            Unit::U8(_) => None,
78
            Unit::EOI(eoi) => Some(eoi as usize),
79
        }
80
    }
81
82
0
    pub fn as_usize(self) -> usize {
83
0
        match self {
84
0
            Unit::U8(b) => b as usize,
85
0
            Unit::EOI(eoi) => eoi as usize,
86
        }
87
0
    }
88
89
0
    pub fn is_eoi(&self) -> bool {
90
0
        match *self {
91
0
            Unit::EOI(_) => true,
92
0
            _ => false,
93
        }
94
0
    }
95
96
    #[cfg(feature = "alloc")]
97
    pub fn is_word_byte(&self) -> bool {
98
        self.as_u8().map_or(false, crate::util::is_word_byte)
99
    }
100
}
101
102
impl core::fmt::Debug for Unit {
103
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
104
0
        match *self {
105
0
            Unit::U8(b) => write!(f, "{:?}", DebugByte(b)),
106
0
            Unit::EOI(_) => write!(f, "EOI"),
107
        }
108
0
    }
109
}
110
111
/// A representation of byte oriented equivalence classes.
112
///
113
/// This is used in a DFA to reduce the size of the transition table. This can
114
/// have a particularly large impact not only on the total size of a dense DFA,
115
/// but also on compile times.
116
#[derive(Clone, Copy)]
117
pub struct ByteClasses([u8; 256]);
118
119
impl ByteClasses {
120
    /// Creates a new set of equivalence classes where all bytes are mapped to
121
    /// the same class.
122
0
    pub fn empty() -> ByteClasses {
123
0
        ByteClasses([0; 256])
124
0
    }
125
126
    /// Creates a new set of equivalence classes where each byte belongs to
127
    /// its own equivalence class.
128
    #[cfg(feature = "alloc")]
129
    pub fn singletons() -> ByteClasses {
130
        let mut classes = ByteClasses::empty();
131
        for i in 0..256 {
132
            classes.set(i as u8, i as u8);
133
        }
134
        classes
135
    }
136
137
    /// Deserializes a byte class map from the given slice. If the slice is of
138
    /// insufficient length or otherwise contains an impossible mapping, then
139
    /// an error is returned. Upon success, the number of bytes read along with
140
    /// the map are returned. The number of bytes read is always a multiple of
141
    /// 8.
142
0
    pub fn from_bytes(
143
0
        slice: &[u8],
144
0
    ) -> Result<(ByteClasses, usize), DeserializeError> {
145
0
        if slice.len() < 256 {
146
0
            return Err(DeserializeError::buffer_too_small("byte class map"));
147
0
        }
148
0
        let mut classes = ByteClasses::empty();
149
0
        for (b, &class) in slice[..256].iter().enumerate() {
150
0
            classes.set(b as u8, class);
151
0
        }
152
0
        for b in classes.iter() {
153
0
            if b.as_usize() >= classes.alphabet_len() {
154
0
                return Err(DeserializeError::generic(
155
0
                    "found equivalence class greater than alphabet len",
156
0
                ));
157
0
            }
158
        }
159
0
        Ok((classes, 256))
160
0
    }
161
162
    /// Writes this byte class map to the given byte buffer. if the given
163
    /// buffer is too small, then an error is returned. Upon success, the total
164
    /// number of bytes written is returned. The number of bytes written is
165
    /// guaranteed to be a multiple of 8.
166
0
    pub fn write_to(
167
0
        &self,
168
0
        mut dst: &mut [u8],
169
0
    ) -> Result<usize, SerializeError> {
170
0
        let nwrite = self.write_to_len();
171
0
        if dst.len() < nwrite {
172
0
            return Err(SerializeError::buffer_too_small("byte class map"));
173
0
        }
174
0
        for b in 0..=255 {
175
0
            dst[0] = self.get(b);
176
0
            dst = &mut dst[1..];
177
0
        }
178
0
        Ok(nwrite)
179
0
    }
180
181
    /// Returns the total number of bytes written by `write_to`.
182
0
    pub fn write_to_len(&self) -> usize {
183
0
        256
184
0
    }
185
186
    /// Set the equivalence class for the given byte.
187
    #[inline]
188
0
    pub fn set(&mut self, byte: u8, class: u8) {
189
0
        self.0[byte as usize] = class;
190
0
    }
191
192
    /// Get the equivalence class for the given byte.
193
    #[inline]
194
0
    pub fn get(&self, byte: u8) -> u8 {
195
0
        self.0[byte as usize]
196
0
    }
Unexecuted instantiation: <regex_automata::util::alphabet::ByteClasses>::get
Unexecuted instantiation: <regex_automata::util::alphabet::ByteClasses>::get
197
198
    /// Get the equivalence class for the given byte while forcefully
199
    /// eliding bounds checks.
200
    #[inline]
201
0
    pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
202
0
        *self.0.get_unchecked(byte as usize)
203
0
    }
204
205
    /// Get the equivalence class for the given input unit and return the
206
    /// class as a `usize`.
207
    #[inline]
208
0
    pub fn get_by_unit(&self, unit: Unit) -> usize {
209
0
        match unit {
210
0
            Unit::U8(b) => usize::try_from(self.get(b)).unwrap(),
211
0
            Unit::EOI(b) => usize::try_from(b).unwrap(),
212
        }
213
0
    }
214
215
    #[inline]
216
0
    pub fn eoi(&self) -> Unit {
217
0
        Unit::eoi(self.alphabet_len().checked_sub(1).unwrap())
218
0
    }
219
220
    /// Return the total number of elements in the alphabet represented by
221
    /// these equivalence classes. Equivalently, this returns the total number
222
    /// of equivalence classes.
223
    #[inline]
224
0
    pub fn alphabet_len(&self) -> usize {
225
        // Add one since the number of equivalence classes is one bigger than
226
        // the last one. But add another to account for the final EOI class
227
        // that isn't explicitly represented.
228
0
        self.0[255] as usize + 1 + 1
229
0
    }
230
231
    /// Returns the stride, as a base-2 exponent, required for these
232
    /// equivalence classes.
233
    ///
234
    /// The stride is always the smallest power of 2 that is greater than or
235
    /// equal to the alphabet length. This is done so that converting between
236
    /// state IDs and indices can be done with shifts alone, which is much
237
    /// faster than integer division.
238
    #[cfg(feature = "alloc")]
239
    pub fn stride2(&self) -> usize {
240
        self.alphabet_len().next_power_of_two().trailing_zeros() as usize
241
    }
242
243
    /// Returns true if and only if every byte in this class maps to its own
244
    /// equivalence class. Equivalently, there are 257 equivalence classes
245
    /// and each class contains exactly one byte (plus the special EOI class).
246
    #[inline]
247
0
    pub fn is_singleton(&self) -> bool {
248
0
        self.alphabet_len() == 257
249
0
    }
250
251
    /// Returns an iterator over all equivalence classes in this set.
252
0
    pub fn iter(&self) -> ByteClassIter<'_> {
253
0
        ByteClassIter { classes: self, i: 0 }
254
0
    }
255
256
    /// Returns an iterator over a sequence of representative bytes from each
257
    /// equivalence class. Namely, this yields exactly N items, where N is
258
    /// equivalent to the number of equivalence classes. Each item is an
259
    /// arbitrary byte drawn from each equivalence class.
260
    ///
261
    /// This is useful when one is determinizing an NFA and the NFA's alphabet
262
    /// hasn't been converted to equivalence classes yet. Picking an arbitrary
263
    /// byte from each equivalence class then permits a full exploration of
264
    /// the NFA instead of using every possible byte value.
265
    #[cfg(feature = "alloc")]
266
    pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
267
        ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
268
    }
269
270
    /// Returns an iterator of the bytes in the given equivalence class.
271
0
    pub fn elements(&self, class: Unit) -> ByteClassElements {
272
0
        ByteClassElements { classes: self, class, byte: 0 }
273
0
    }
274
275
    /// Returns an iterator of byte ranges in the given equivalence class.
276
    ///
277
    /// That is, a sequence of contiguous ranges are returned. Typically, every
278
    /// class maps to a single contiguous range.
279
0
    fn element_ranges(&self, class: Unit) -> ByteClassElementRanges {
280
0
        ByteClassElementRanges { elements: self.elements(class), range: None }
281
0
    }
282
}
283
284
impl core::fmt::Debug for ByteClasses {
285
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
286
0
        if self.is_singleton() {
287
0
            write!(f, "ByteClasses({{singletons}})")
288
        } else {
289
0
            write!(f, "ByteClasses(")?;
290
0
            for (i, class) in self.iter().enumerate() {
291
0
                if i > 0 {
292
0
                    write!(f, ", ")?;
293
0
                }
294
0
                write!(f, "{:?} => [", class.as_usize())?;
295
0
                for (start, end) in self.element_ranges(class) {
296
0
                    if start == end {
297
0
                        write!(f, "{:?}", start)?;
298
                    } else {
299
0
                        write!(f, "{:?}-{:?}", start, end)?;
300
                    }
301
                }
302
0
                write!(f, "]")?;
303
            }
304
0
            write!(f, ")")
305
        }
306
0
    }
307
}
308
309
/// An iterator over each equivalence class.
310
#[derive(Debug)]
311
pub struct ByteClassIter<'a> {
312
    classes: &'a ByteClasses,
313
    i: usize,
314
}
315
316
impl<'a> Iterator for ByteClassIter<'a> {
317
    type Item = Unit;
318
319
0
    fn next(&mut self) -> Option<Unit> {
320
0
        if self.i + 1 == self.classes.alphabet_len() {
321
0
            self.i += 1;
322
0
            Some(self.classes.eoi())
323
0
        } else if self.i < self.classes.alphabet_len() {
324
0
            let class = self.i as u8;
325
0
            self.i += 1;
326
0
            Some(Unit::u8(class))
327
        } else {
328
0
            None
329
        }
330
0
    }
331
}
332
333
/// An iterator over representative bytes from each equivalence class.
334
#[cfg(feature = "alloc")]
335
#[derive(Debug)]
336
pub struct ByteClassRepresentatives<'a> {
337
    classes: &'a ByteClasses,
338
    byte: usize,
339
    last_class: Option<u8>,
340
}
341
342
#[cfg(feature = "alloc")]
343
impl<'a> Iterator for ByteClassRepresentatives<'a> {
344
    type Item = Unit;
345
346
    fn next(&mut self) -> Option<Unit> {
347
        while self.byte < 256 {
348
            let byte = self.byte as u8;
349
            let class = self.classes.get(byte);
350
            self.byte += 1;
351
352
            if self.last_class != Some(class) {
353
                self.last_class = Some(class);
354
                return Some(Unit::u8(byte));
355
            }
356
        }
357
        if self.byte == 256 {
358
            self.byte += 1;
359
            return Some(self.classes.eoi());
360
        }
361
        None
362
    }
363
}
364
365
/// An iterator over all elements in an equivalence class.
366
#[derive(Debug)]
367
pub struct ByteClassElements<'a> {
368
    classes: &'a ByteClasses,
369
    class: Unit,
370
    byte: usize,
371
}
372
373
impl<'a> Iterator for ByteClassElements<'a> {
374
    type Item = Unit;
375
376
0
    fn next(&mut self) -> Option<Unit> {
377
0
        while self.byte < 256 {
378
0
            let byte = self.byte as u8;
379
0
            self.byte += 1;
380
0
            if self.class.as_u8() == Some(self.classes.get(byte)) {
381
0
                return Some(Unit::u8(byte));
382
0
            }
383
        }
384
0
        if self.byte < 257 {
385
0
            self.byte += 1;
386
0
            if self.class.is_eoi() {
387
0
                return Some(Unit::eoi(256));
388
0
            }
389
0
        }
390
0
        None
391
0
    }
392
}
393
394
/// An iterator over all elements in an equivalence class expressed as a
395
/// sequence of contiguous ranges.
396
#[derive(Debug)]
397
pub struct ByteClassElementRanges<'a> {
398
    elements: ByteClassElements<'a>,
399
    range: Option<(Unit, Unit)>,
400
}
401
402
impl<'a> Iterator for ByteClassElementRanges<'a> {
403
    type Item = (Unit, Unit);
404
405
0
    fn next(&mut self) -> Option<(Unit, Unit)> {
406
        loop {
407
0
            let element = match self.elements.next() {
408
0
                None => return self.range.take(),
409
0
                Some(element) => element,
410
            };
411
0
            match self.range.take() {
412
0
                None => {
413
0
                    self.range = Some((element, element));
414
0
                }
415
0
                Some((start, end)) => {
416
0
                    if end.as_usize() + 1 != element.as_usize()
417
0
                        || element.is_eoi()
418
                    {
419
0
                        self.range = Some((element, element));
420
0
                        return Some((start, end));
421
0
                    }
422
0
                    self.range = Some((start, element));
423
                }
424
            }
425
        }
426
0
    }
427
}
428
429
/// A byte class set keeps track of an *approximation* of equivalence classes
430
/// of bytes during NFA construction. That is, every byte in an equivalence
431
/// class cannot discriminate between a match and a non-match.
432
///
433
/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
434
/// same equivalence class because it never matters whether an `a` or a `b` is
435
/// seen, and no combination of `a`s and `b`s in the text can discriminate a
436
/// match.
437
///
438
/// Note though that this does not compute the minimal set of equivalence
439
/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
440
/// same equivalence class for the same reason that `a` and `b` are in the
441
/// same equivalence class in the aforementioned regex. However, in this
442
/// implementation, `a` and `c` are put into distinct equivalence classes. The
443
/// reason for this is implementation complexity. In the future, we should
444
/// endeavor to compute the minimal equivalence classes since they can have a
445
/// rather large impact on the size of the DFA. (Doing this will likely require
446
/// rethinking how equivalence classes are computed, including changing the
447
/// representation here, which is only able to group contiguous bytes into the
448
/// same equivalence class.)
449
#[derive(Clone, Debug)]
450
pub struct ByteClassSet(ByteSet);
451
452
impl ByteClassSet {
453
    /// Create a new set of byte classes where all bytes are part of the same
454
    /// equivalence class.
455
    #[cfg(feature = "alloc")]
456
    pub fn empty() -> Self {
457
        ByteClassSet(ByteSet::empty())
458
    }
459
460
    /// Indicate the the range of byte given (inclusive) can discriminate a
461
    /// match between it and all other bytes outside of the range.
462
    #[cfg(feature = "alloc")]
463
    pub fn set_range(&mut self, start: u8, end: u8) {
464
        debug_assert!(start <= end);
465
        if start > 0 {
466
            self.0.add(start - 1);
467
        }
468
        self.0.add(end);
469
    }
470
471
    /// Add the contiguous ranges in the set given to this byte class set.
472
    #[cfg(feature = "alloc")]
473
    pub fn add_set(&mut self, set: &ByteSet) {
474
        for (start, end) in set.iter_ranges() {
475
            self.set_range(start, end);
476
        }
477
    }
478
479
    /// Convert this boolean set to a map that maps all byte values to their
480
    /// corresponding equivalence class. The last mapping indicates the largest
481
    /// equivalence class identifier (which is never bigger than 255).
482
    #[cfg(feature = "alloc")]
483
    pub fn byte_classes(&self) -> ByteClasses {
484
        let mut classes = ByteClasses::empty();
485
        let mut class = 0u8;
486
        let mut b = 0u8;
487
        loop {
488
            classes.set(b, class);
489
            if b == 255 {
490
                break;
491
            }
492
            if self.0.contains(b) {
493
                class = class.checked_add(1).unwrap();
494
            }
495
            b = b.checked_add(1).unwrap();
496
        }
497
        classes
498
    }
499
}
500
501
/// A simple set of bytes that is reasonably cheap to copy and allocation free.
502
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
503
pub struct ByteSet {
504
    bits: BitSet,
505
}
506
507
/// The representation of a byte set. Split out so that we can define a
508
/// convenient Debug impl for it while keeping "ByteSet" in the output.
509
#[derive(Clone, Copy, Default, Eq, PartialEq)]
510
struct BitSet([u128; 2]);
511
512
impl ByteSet {
513
    /// Create an empty set of bytes.
514
    #[cfg(feature = "alloc")]
515
    pub fn empty() -> ByteSet {
516
        ByteSet { bits: BitSet([0; 2]) }
517
    }
518
519
    /// Add a byte to this set.
520
    ///
521
    /// If the given byte already belongs to this set, then this is a no-op.
522
    #[cfg(feature = "alloc")]
523
    pub fn add(&mut self, byte: u8) {
524
        let bucket = byte / 128;
525
        let bit = byte % 128;
526
        self.bits.0[bucket as usize] |= 1 << bit;
527
    }
528
529
    /// Add an inclusive range of bytes.
530
    #[cfg(feature = "alloc")]
531
    pub fn add_all(&mut self, start: u8, end: u8) {
532
        for b in start..=end {
533
            self.add(b);
534
        }
535
    }
536
537
    /// Remove a byte from this set.
538
    ///
539
    /// If the given byte is not in this set, then this is a no-op.
540
    #[cfg(feature = "alloc")]
541
    pub fn remove(&mut self, byte: u8) {
542
        let bucket = byte / 128;
543
        let bit = byte % 128;
544
        self.bits.0[bucket as usize] &= !(1 << bit);
545
    }
546
547
    /// Remove an inclusive range of bytes.
548
    #[cfg(feature = "alloc")]
549
    pub fn remove_all(&mut self, start: u8, end: u8) {
550
        for b in start..=end {
551
            self.remove(b);
552
        }
553
    }
554
555
    /// Return true if and only if the given byte is in this set.
556
0
    pub fn contains(&self, byte: u8) -> bool {
557
0
        let bucket = byte / 128;
558
0
        let bit = byte % 128;
559
0
        self.bits.0[bucket as usize] & (1 << bit) > 0
560
0
    }
561
562
    /// Return true if and only if the given inclusive range of bytes is in
563
    /// this set.
564
    #[cfg(feature = "alloc")]
565
    pub fn contains_range(&self, start: u8, end: u8) -> bool {
566
        (start..=end).all(|b| self.contains(b))
567
    }
568
569
    /// Returns an iterator over all bytes in this set.
570
    #[cfg(feature = "alloc")]
571
    pub fn iter(&self) -> ByteSetIter {
572
        ByteSetIter { set: self, b: 0 }
573
    }
574
575
    /// Returns an iterator over all contiguous ranges of bytes in this set.
576
    #[cfg(feature = "alloc")]
577
    pub fn iter_ranges(&self) -> ByteSetRangeIter {
578
        ByteSetRangeIter { set: self, b: 0 }
579
    }
580
581
    /// Return the number of bytes in this set.
582
    #[cfg(feature = "alloc")]
583
    pub fn len(&self) -> usize {
584
        (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize
585
    }
586
587
    /// Return true if and only if this set is empty.
588
    #[cfg(feature = "alloc")]
589
    pub fn is_empty(&self) -> bool {
590
        self.bits.0 == [0, 0]
591
    }
592
}
593
594
impl core::fmt::Debug for BitSet {
595
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
596
0
        let mut fmtd = f.debug_set();
597
0
        for b in (0..256).map(|b| b as u8) {
598
0
            if (ByteSet { bits: *self }).contains(b) {
599
0
                fmtd.entry(&b);
600
0
            }
601
        }
602
0
        fmtd.finish()
603
0
    }
604
}
605
606
#[derive(Debug)]
607
pub struct ByteSetIter<'a> {
608
    set: &'a ByteSet,
609
    b: usize,
610
}
611
612
impl<'a> Iterator for ByteSetIter<'a> {
613
    type Item = u8;
614
615
0
    fn next(&mut self) -> Option<u8> {
616
0
        while self.b <= 255 {
617
0
            let b = self.b as u8;
618
0
            self.b += 1;
619
0
            if self.set.contains(b) {
620
0
                return Some(b);
621
0
            }
622
        }
623
0
        None
624
0
    }
625
}
626
627
#[derive(Debug)]
628
pub struct ByteSetRangeIter<'a> {
629
    set: &'a ByteSet,
630
    b: usize,
631
}
632
633
impl<'a> Iterator for ByteSetRangeIter<'a> {
634
    type Item = (u8, u8);
635
636
0
    fn next(&mut self) -> Option<(u8, u8)> {
637
0
        while self.b <= 255 {
638
0
            let start = self.b as u8;
639
0
            self.b += 1;
640
0
            if !self.set.contains(start) {
641
0
                continue;
642
0
            }
643
644
0
            let mut end = start;
645
0
            while self.b <= 255 && self.set.contains(self.b as u8) {
646
0
                end = self.b as u8;
647
0
                self.b += 1;
648
0
            }
649
0
            return Some((start, end));
650
        }
651
0
        None
652
0
    }
653
}
654
655
#[cfg(test)]
656
#[cfg(feature = "alloc")]
657
mod tests {
658
    use alloc::{vec, vec::Vec};
659
660
    use super::*;
661
662
    #[test]
663
    fn byte_classes() {
664
        let mut set = ByteClassSet::empty();
665
        set.set_range(b'a', b'z');
666
667
        let classes = set.byte_classes();
668
        assert_eq!(classes.get(0), 0);
669
        assert_eq!(classes.get(1), 0);
670
        assert_eq!(classes.get(2), 0);
671
        assert_eq!(classes.get(b'a' - 1), 0);
672
        assert_eq!(classes.get(b'a'), 1);
673
        assert_eq!(classes.get(b'm'), 1);
674
        assert_eq!(classes.get(b'z'), 1);
675
        assert_eq!(classes.get(b'z' + 1), 2);
676
        assert_eq!(classes.get(254), 2);
677
        assert_eq!(classes.get(255), 2);
678
679
        let mut set = ByteClassSet::empty();
680
        set.set_range(0, 2);
681
        set.set_range(4, 6);
682
        let classes = set.byte_classes();
683
        assert_eq!(classes.get(0), 0);
684
        assert_eq!(classes.get(1), 0);
685
        assert_eq!(classes.get(2), 0);
686
        assert_eq!(classes.get(3), 1);
687
        assert_eq!(classes.get(4), 2);
688
        assert_eq!(classes.get(5), 2);
689
        assert_eq!(classes.get(6), 2);
690
        assert_eq!(classes.get(7), 3);
691
        assert_eq!(classes.get(255), 3);
692
    }
693
694
    #[test]
695
    fn full_byte_classes() {
696
        let mut set = ByteClassSet::empty();
697
        for i in 0..256u16 {
698
            set.set_range(i as u8, i as u8);
699
        }
700
        assert_eq!(set.byte_classes().alphabet_len(), 257);
701
    }
702
703
    #[test]
704
    fn elements_typical() {
705
        let mut set = ByteClassSet::empty();
706
        set.set_range(b'b', b'd');
707
        set.set_range(b'g', b'm');
708
        set.set_range(b'z', b'z');
709
        let classes = set.byte_classes();
710
        // class 0: \x00-a
711
        // class 1: b-d
712
        // class 2: e-f
713
        // class 3: g-m
714
        // class 4: n-y
715
        // class 5: z-z
716
        // class 6: \x7B-\xFF
717
        // class 7: EOI
718
        assert_eq!(classes.alphabet_len(), 8);
719
720
        let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
721
        assert_eq!(elements.len(), 98);
722
        assert_eq!(elements[0], Unit::u8(b'\x00'));
723
        assert_eq!(elements[97], Unit::u8(b'a'));
724
725
        let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>();
726
        assert_eq!(
727
            elements,
728
            vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')],
729
        );
730
731
        let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>();
732
        assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],);
733
734
        let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>();
735
        assert_eq!(
736
            elements,
737
            vec![
738
                Unit::u8(b'g'),
739
                Unit::u8(b'h'),
740
                Unit::u8(b'i'),
741
                Unit::u8(b'j'),
742
                Unit::u8(b'k'),
743
                Unit::u8(b'l'),
744
                Unit::u8(b'm'),
745
            ],
746
        );
747
748
        let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>();
749
        assert_eq!(elements.len(), 12);
750
        assert_eq!(elements[0], Unit::u8(b'n'));
751
        assert_eq!(elements[11], Unit::u8(b'y'));
752
753
        let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>();
754
        assert_eq!(elements, vec![Unit::u8(b'z')]);
755
756
        let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>();
757
        assert_eq!(elements.len(), 133);
758
        assert_eq!(elements[0], Unit::u8(b'\x7B'));
759
        assert_eq!(elements[132], Unit::u8(b'\xFF'));
760
761
        let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>();
762
        assert_eq!(elements, vec![Unit::eoi(256)]);
763
    }
764
765
    #[test]
766
    fn elements_singletons() {
767
        let classes = ByteClasses::singletons();
768
        assert_eq!(classes.alphabet_len(), 257);
769
770
        let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>();
771
        assert_eq!(elements, vec![Unit::u8(b'a')]);
772
773
        let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>();
774
        assert_eq!(elements, vec![Unit::eoi(256)]);
775
    }
776
777
    #[test]
778
    fn elements_empty() {
779
        let classes = ByteClasses::empty();
780
        assert_eq!(classes.alphabet_len(), 2);
781
782
        let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
783
        assert_eq!(elements.len(), 256);
784
        assert_eq!(elements[0], Unit::u8(b'\x00'));
785
        assert_eq!(elements[255], Unit::u8(b'\xFF'));
786
787
        let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>();
788
        assert_eq!(elements, vec![Unit::eoi(256)]);
789
    }
790
}