Coverage Report

Created: 2025-09-27 07:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-syntax-0.8.6/src/utf8.rs
Line
Count
Source
1
/*!
2
Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4
This is sub-module is useful for constructing byte based automatons that need
5
to embed UTF-8 decoding. The most common use of this module is in conjunction
6
with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type.
7
8
See the documentation on the `Utf8Sequences` iterator for more details and
9
an example.
10
11
# Wait, what is this?
12
13
This is simplest to explain with an example. Let's say you wanted to test
14
whether a particular byte sequence was a Cyrillic character. One possible
15
scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16
range can be expressed as a sequence of byte ranges:
17
18
```text
19
[D0-D3][80-BF]
20
```
21
22
This is simple enough: simply encode the boundaries, `0400` encodes to
23
`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24
corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26
However, what if you wanted to add the Cyrillic Supplementary characters to
27
your range? Your range might then become `[0400-052F]`. The same procedure
28
as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29
you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30
this isn't quite correct because this range doesn't capture many characters,
31
for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33
Instead, you need multiple sequences of byte ranges:
34
35
```text
36
[D0-D3][80-BF]  # matches codepoints 0400-04FF
37
[D4][80-AF]     # matches codepoints 0500-052F
38
```
39
40
This gets even more complicated if you want bigger ranges, particularly if
41
they naively contain surrogate codepoints. For example, the sequence of byte
42
ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44
```text
45
[0-7F]
46
[C2-DF][80-BF]
47
[E0][A0-BF][80-BF]
48
[E1-EC][80-BF][80-BF]
49
[ED][80-9F][80-BF]
50
[EE-EF][80-BF][80-BF]
51
```
52
53
Note that the byte ranges above will *not* match any erroneous encoding of
54
UTF-8, including encodings of surrogate codepoints.
55
56
And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58
```text
59
[0-7F]
60
[C2-DF][80-BF]
61
[E0][A0-BF][80-BF]
62
[E1-EC][80-BF][80-BF]
63
[ED][80-9F][80-BF]
64
[EE-EF][80-BF][80-BF]
65
[F0][90-BF][80-BF][80-BF]
66
[F1-F3][80-BF][80-BF][80-BF]
67
[F4][80-8F][80-BF][80-BF]
68
```
69
70
This module automates the process of creating these byte ranges from ranges of
71
Unicode scalar values.
72
73
# Lineage
74
75
I got the idea and general implementation strategy from Russ Cox in his
76
[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77
Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78
I also got the idea from
79
[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80
which uses it for executing automata on their term index.
81
*/
82
83
use core::{char, fmt, iter::FusedIterator, slice};
84
85
use alloc::{vec, vec::Vec};
86
87
const MAX_UTF8_BYTES: usize = 4;
88
89
/// Utf8Sequence represents a sequence of byte ranges.
90
///
91
/// To match a Utf8Sequence, a candidate byte sequence must match each
92
/// successive range.
93
///
94
/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
95
/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
96
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
97
pub enum Utf8Sequence {
98
    /// One byte range.
99
    One(Utf8Range),
100
    /// Two successive byte ranges.
101
    Two([Utf8Range; 2]),
102
    /// Three successive byte ranges.
103
    Three([Utf8Range; 3]),
104
    /// Four successive byte ranges.
105
    Four([Utf8Range; 4]),
106
}
107
108
impl Utf8Sequence {
109
    /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
110
    /// range.
111
    ///
112
    /// This assumes that `start` and `end` have the same length.
113
0
    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
114
0
        assert_eq!(start.len(), end.len());
115
0
        match start.len() {
116
0
            2 => Utf8Sequence::Two([
117
0
                Utf8Range::new(start[0], end[0]),
118
0
                Utf8Range::new(start[1], end[1]),
119
0
            ]),
120
0
            3 => Utf8Sequence::Three([
121
0
                Utf8Range::new(start[0], end[0]),
122
0
                Utf8Range::new(start[1], end[1]),
123
0
                Utf8Range::new(start[2], end[2]),
124
0
            ]),
125
0
            4 => Utf8Sequence::Four([
126
0
                Utf8Range::new(start[0], end[0]),
127
0
                Utf8Range::new(start[1], end[1]),
128
0
                Utf8Range::new(start[2], end[2]),
129
0
                Utf8Range::new(start[3], end[3]),
130
0
            ]),
131
0
            n => unreachable!("invalid encoded length: {}", n),
132
        }
133
0
    }
134
135
    /// Returns the underlying sequence of byte ranges as a slice.
136
0
    pub fn as_slice(&self) -> &[Utf8Range] {
137
        use self::Utf8Sequence::*;
138
0
        match *self {
139
0
            One(ref r) => slice::from_ref(r),
140
0
            Two(ref r) => &r[..],
141
0
            Three(ref r) => &r[..],
142
0
            Four(ref r) => &r[..],
143
        }
144
0
    }
145
146
    /// Returns the number of byte ranges in this sequence.
147
    ///
148
    /// The length is guaranteed to be in the closed interval `[1, 4]`.
149
0
    pub fn len(&self) -> usize {
150
0
        self.as_slice().len()
151
0
    }
152
153
    /// Reverses the ranges in this sequence.
154
    ///
155
    /// For example, if this corresponds to the following sequence:
156
    ///
157
    /// ```text
158
    /// [D0-D3][80-BF]
159
    /// ```
160
    ///
161
    /// Then after reversal, it will be
162
    ///
163
    /// ```text
164
    /// [80-BF][D0-D3]
165
    /// ```
166
    ///
167
    /// This is useful when one is constructing a UTF-8 automaton to match
168
    /// character classes in reverse.
169
0
    pub fn reverse(&mut self) {
170
0
        match *self {
171
0
            Utf8Sequence::One(_) => {}
172
0
            Utf8Sequence::Two(ref mut x) => x.reverse(),
173
0
            Utf8Sequence::Three(ref mut x) => x.reverse(),
174
0
            Utf8Sequence::Four(ref mut x) => x.reverse(),
175
        }
176
0
    }
177
178
    /// Returns true if and only if a prefix of `bytes` matches this sequence
179
    /// of byte ranges.
180
0
    pub fn matches(&self, bytes: &[u8]) -> bool {
181
0
        if bytes.len() < self.len() {
182
0
            return false;
183
0
        }
184
0
        for (&b, r) in bytes.iter().zip(self) {
185
0
            if !r.matches(b) {
186
0
                return false;
187
0
            }
188
        }
189
0
        true
190
0
    }
191
}
192
193
impl<'a> IntoIterator for &'a Utf8Sequence {
194
    type IntoIter = slice::Iter<'a, Utf8Range>;
195
    type Item = &'a Utf8Range;
196
197
0
    fn into_iter(self) -> Self::IntoIter {
198
0
        self.as_slice().iter()
199
0
    }
200
}
201
202
impl fmt::Debug for Utf8Sequence {
203
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204
        use self::Utf8Sequence::*;
205
0
        match *self {
206
0
            One(ref r) => write!(f, "{:?}", r),
207
0
            Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
208
0
            Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
209
0
            Four(ref r) => {
210
0
                write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
211
            }
212
        }
213
0
    }
214
}
215
216
/// A single inclusive range of UTF-8 bytes.
217
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
218
pub struct Utf8Range {
219
    /// Start of byte range (inclusive).
220
    pub start: u8,
221
    /// End of byte range (inclusive).
222
    pub end: u8,
223
}
224
225
impl Utf8Range {
226
0
    fn new(start: u8, end: u8) -> Self {
227
0
        Utf8Range { start, end }
228
0
    }
229
230
    /// Returns true if and only if the given byte is in this range.
231
0
    pub fn matches(&self, b: u8) -> bool {
232
0
        self.start <= b && b <= self.end
233
0
    }
234
}
235
236
impl fmt::Debug for Utf8Range {
237
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
238
0
        if self.start == self.end {
239
0
            write!(f, "[{:X}]", self.start)
240
        } else {
241
0
            write!(f, "[{:X}-{:X}]", self.start, self.end)
242
        }
243
0
    }
244
}
245
246
/// An iterator over ranges of matching UTF-8 byte sequences.
247
///
248
/// The iteration represents an alternation of comprehensive byte sequences
249
/// that match precisely the set of UTF-8 encoded scalar values.
250
///
251
/// A byte sequence corresponds to one of the scalar values in the range given
252
/// if and only if it completely matches exactly one of the sequences of byte
253
/// ranges produced by this iterator.
254
///
255
/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
256
/// sequences will match the same bytes.
257
///
258
/// # Example
259
///
260
/// This shows how to match an arbitrary byte sequence against a range of
261
/// scalar values.
262
///
263
/// ```rust
264
/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
265
///
266
/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
267
///     for range in seqs {
268
///         if range.matches(bytes) {
269
///             return true;
270
///         }
271
///     }
272
///     false
273
/// }
274
///
275
/// // Test the basic multilingual plane.
276
/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
277
///
278
/// // UTF-8 encoding of 'a'.
279
/// assert!(matches(&seqs, &[0x61]));
280
/// // UTF-8 encoding of '☃' (`\u{2603}`).
281
/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
282
/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
283
/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
284
/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
285
/// // which is invalid UTF-8, and therefore fails, despite the fact that
286
/// // the corresponding codepoint (0xD800) falls in the range given.
287
/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
288
/// // And fails against plain old invalid UTF-8.
289
/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
290
/// ```
291
///
292
/// If this example seems circuitous, that's because it is! It's meant to be
293
/// illustrative. In practice, you could just try to decode your byte sequence
294
/// and compare it with the scalar value range directly. However, this is not
295
/// always possible (for example, in a byte based automaton).
296
#[derive(Debug)]
297
pub struct Utf8Sequences {
298
    range_stack: Vec<ScalarRange>,
299
}
300
301
impl Utf8Sequences {
302
    /// Create a new iterator over UTF-8 byte ranges for the scalar value range
303
    /// given.
304
0
    pub fn new(start: char, end: char) -> Self {
305
0
        let range =
306
0
            ScalarRange { start: u32::from(start), end: u32::from(end) };
307
0
        Utf8Sequences { range_stack: vec![range] }
308
0
    }
309
310
    /// reset resets the scalar value range.
311
    /// Any existing state is cleared, but resources may be reused.
312
    ///
313
    /// N.B. Benchmarks say that this method is dubious.
314
    #[doc(hidden)]
315
0
    pub fn reset(&mut self, start: char, end: char) {
316
0
        self.range_stack.clear();
317
0
        self.push(u32::from(start), u32::from(end));
318
0
    }
319
320
0
    fn push(&mut self, start: u32, end: u32) {
321
0
        self.range_stack.push(ScalarRange { start, end });
322
0
    }
323
}
324
325
struct ScalarRange {
326
    start: u32,
327
    end: u32,
328
}
329
330
impl fmt::Debug for ScalarRange {
331
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332
0
        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
333
0
    }
334
}
335
336
impl Iterator for Utf8Sequences {
337
    type Item = Utf8Sequence;
338
339
0
    fn next(&mut self) -> Option<Self::Item> {
340
0
        'TOP: while let Some(mut r) = self.range_stack.pop() {
341
            'INNER: loop {
342
0
                if let Some((r1, r2)) = r.split() {
343
0
                    self.push(r2.start, r2.end);
344
0
                    r.start = r1.start;
345
0
                    r.end = r1.end;
346
0
                    continue 'INNER;
347
0
                }
348
0
                if !r.is_valid() {
349
0
                    continue 'TOP;
350
0
                }
351
0
                for i in 1..MAX_UTF8_BYTES {
352
0
                    let max = max_scalar_value(i);
353
0
                    if r.start <= max && max < r.end {
354
0
                        self.push(max + 1, r.end);
355
0
                        r.end = max;
356
0
                        continue 'INNER;
357
0
                    }
358
                }
359
0
                if let Some(ascii_range) = r.as_ascii() {
360
0
                    return Some(Utf8Sequence::One(ascii_range));
361
0
                }
362
0
                for i in 1..MAX_UTF8_BYTES {
363
0
                    let m = (1 << (6 * i)) - 1;
364
0
                    if (r.start & !m) != (r.end & !m) {
365
0
                        if (r.start & m) != 0 {
366
0
                            self.push((r.start | m) + 1, r.end);
367
0
                            r.end = r.start | m;
368
0
                            continue 'INNER;
369
0
                        }
370
0
                        if (r.end & m) != m {
371
0
                            self.push(r.end & !m, r.end);
372
0
                            r.end = (r.end & !m) - 1;
373
0
                            continue 'INNER;
374
0
                        }
375
0
                    }
376
                }
377
0
                let mut start = [0; MAX_UTF8_BYTES];
378
0
                let mut end = [0; MAX_UTF8_BYTES];
379
0
                let n = r.encode(&mut start, &mut end);
380
0
                return Some(Utf8Sequence::from_encoded_range(
381
0
                    &start[0..n],
382
0
                    &end[0..n],
383
0
                ));
384
            }
385
        }
386
0
        None
387
0
    }
388
}
389
390
impl FusedIterator for Utf8Sequences {}
391
392
impl ScalarRange {
393
    /// split splits this range if it overlaps with a surrogate codepoint.
394
    ///
395
    /// Either or both ranges may be invalid.
396
0
    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
397
0
        if self.start < 0xE000 && self.end > 0xD7FF {
398
0
            Some((
399
0
                ScalarRange { start: self.start, end: 0xD7FF },
400
0
                ScalarRange { start: 0xE000, end: self.end },
401
0
            ))
402
        } else {
403
0
            None
404
        }
405
0
    }
406
407
    /// is_valid returns true if and only if start <= end.
408
0
    fn is_valid(&self) -> bool {
409
0
        self.start <= self.end
410
0
    }
411
412
    /// as_ascii returns this range as a Utf8Range if and only if all scalar
413
    /// values in this range can be encoded as a single byte.
414
0
    fn as_ascii(&self) -> Option<Utf8Range> {
415
0
        if self.is_ascii() {
416
0
            let start = u8::try_from(self.start).unwrap();
417
0
            let end = u8::try_from(self.end).unwrap();
418
0
            Some(Utf8Range::new(start, end))
419
        } else {
420
0
            None
421
        }
422
0
    }
423
424
    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
425
    /// byte to encode any scalar value).
426
0
    fn is_ascii(&self) -> bool {
427
0
        self.is_valid() && self.end <= 0x7f
428
0
    }
429
430
    /// encode writes the UTF-8 encoding of the start and end of this range
431
    /// to the corresponding destination slices, and returns the number of
432
    /// bytes written.
433
    ///
434
    /// The slices should have room for at least `MAX_UTF8_BYTES`.
435
0
    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
436
0
        let cs = char::from_u32(self.start).unwrap();
437
0
        let ce = char::from_u32(self.end).unwrap();
438
0
        let ss = cs.encode_utf8(start);
439
0
        let se = ce.encode_utf8(end);
440
0
        assert_eq!(ss.len(), se.len());
441
0
        ss.len()
442
0
    }
443
}
444
445
0
fn max_scalar_value(nbytes: usize) -> u32 {
446
0
    match nbytes {
447
0
        1 => 0x007F,
448
0
        2 => 0x07FF,
449
0
        3 => 0xFFFF,
450
0
        4 => 0x0010_FFFF,
451
0
        _ => unreachable!("invalid UTF-8 byte sequence size"),
452
    }
453
0
}
454
455
#[cfg(test)]
456
mod tests {
457
    use core::char;
458
459
    use alloc::{vec, vec::Vec};
460
461
    use crate::utf8::{Utf8Range, Utf8Sequences};
462
463
    fn rutf8(s: u8, e: u8) -> Utf8Range {
464
        Utf8Range::new(s, e)
465
    }
466
467
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
468
        for cp in 0xD800..0xE000 {
469
            let buf = encode_surrogate(cp);
470
            for r in Utf8Sequences::new(start, end) {
471
                if r.matches(&buf) {
472
                    panic!(
473
                        "Sequence ({:X}, {:X}) contains range {:?}, \
474
                         which matches surrogate code point {:X} \
475
                         with encoded bytes {:?}",
476
                        u32::from(start),
477
                        u32::from(end),
478
                        r,
479
                        cp,
480
                        buf,
481
                    );
482
                }
483
            }
484
        }
485
    }
486
487
    #[test]
488
    fn codepoints_no_surrogates() {
489
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
490
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
491
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
492
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
493
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
494
    }
495
496
    #[test]
497
    fn single_codepoint_one_sequence() {
498
        // Tests that every range of scalar values that contains a single
499
        // scalar value is recognized by one sequence of byte ranges.
500
        for i in 0x0..=0x0010_FFFF {
501
            let c = match char::from_u32(i) {
502
                None => continue,
503
                Some(c) => c,
504
            };
505
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
506
            assert_eq!(seqs.len(), 1);
507
        }
508
    }
509
510
    #[test]
511
    fn bmp() {
512
        use crate::utf8::Utf8Sequence::*;
513
514
        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
515
        assert_eq!(
516
            seqs,
517
            vec![
518
                One(rutf8(0x0, 0x7F)),
519
                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
520
                Three([
521
                    rutf8(0xE0, 0xE0),
522
                    rutf8(0xA0, 0xBF),
523
                    rutf8(0x80, 0xBF)
524
                ]),
525
                Three([
526
                    rutf8(0xE1, 0xEC),
527
                    rutf8(0x80, 0xBF),
528
                    rutf8(0x80, 0xBF)
529
                ]),
530
                Three([
531
                    rutf8(0xED, 0xED),
532
                    rutf8(0x80, 0x9F),
533
                    rutf8(0x80, 0xBF)
534
                ]),
535
                Three([
536
                    rutf8(0xEE, 0xEF),
537
                    rutf8(0x80, 0xBF),
538
                    rutf8(0x80, 0xBF)
539
                ]),
540
            ]
541
        );
542
    }
543
544
    #[test]
545
    fn reverse() {
546
        use crate::utf8::Utf8Sequence::*;
547
548
        let mut s = One(rutf8(0xA, 0xB));
549
        s.reverse();
550
        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
551
552
        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
553
        s.reverse();
554
        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
555
556
        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
557
        s.reverse();
558
        assert_eq!(
559
            s.as_slice(),
560
            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
561
        );
562
563
        let mut s = Four([
564
            rutf8(0xA, 0xB),
565
            rutf8(0xB, 0xC),
566
            rutf8(0xC, 0xD),
567
            rutf8(0xD, 0xE),
568
        ]);
569
        s.reverse();
570
        assert_eq!(
571
            s.as_slice(),
572
            &[
573
                rutf8(0xD, 0xE),
574
                rutf8(0xC, 0xD),
575
                rutf8(0xB, 0xC),
576
                rutf8(0xA, 0xB)
577
            ]
578
        );
579
    }
580
581
    fn encode_surrogate(cp: u32) -> [u8; 3] {
582
        const TAG_CONT: u8 = 0b1000_0000;
583
        const TAG_THREE_B: u8 = 0b1110_0000;
584
585
        assert!(0xD800 <= cp && cp < 0xE000);
586
        let mut dst = [0; 3];
587
        dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B;
588
        dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT;
589
        dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
590
        dst
591
    }
592
}