Coverage Report

Created: 2025-08-26 06:38

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.6/src/utf8.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4
This is sub-module is useful for constructing byte based automatons that need
5
to embed UTF-8 decoding. The most common use of this module is in conjunction
6
with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type.
7
8
See the documentation on the `Utf8Sequences` iterator for more details and
9
an example.
10
11
# Wait, what is this?
12
13
This is simplest to explain with an example. Let's say you wanted to test
14
whether a particular byte sequence was a Cyrillic character. One possible
15
scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16
range can be expressed as a sequence of byte ranges:
17
18
```text
19
[D0-D3][80-BF]
20
```
21
22
This is simple enough: simply encode the boundaries, `0400` encodes to
23
`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24
corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26
However, what if you wanted to add the Cyrillic Supplementary characters to
27
your range? Your range might then become `[0400-052F]`. The same procedure
28
as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29
you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30
this isn't quite correct because this range doesn't capture many characters,
31
for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33
Instead, you need multiple sequences of byte ranges:
34
35
```text
36
[D0-D3][80-BF]  # matches codepoints 0400-04FF
37
[D4][80-AF]     # matches codepoints 0500-052F
38
```
39
40
This gets even more complicated if you want bigger ranges, particularly if
41
they naively contain surrogate codepoints. For example, the sequence of byte
42
ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44
```text
45
[0-7F]
46
[C2-DF][80-BF]
47
[E0][A0-BF][80-BF]
48
[E1-EC][80-BF][80-BF]
49
[ED][80-9F][80-BF]
50
[EE-EF][80-BF][80-BF]
51
```
52
53
Note that the byte ranges above will *not* match any erroneous encoding of
54
UTF-8, including encodings of surrogate codepoints.
55
56
And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58
```text
59
[0-7F]
60
[C2-DF][80-BF]
61
[E0][A0-BF][80-BF]
62
[E1-EC][80-BF][80-BF]
63
[ED][80-9F][80-BF]
64
[EE-EF][80-BF][80-BF]
65
[F0][90-BF][80-BF][80-BF]
66
[F1-F3][80-BF][80-BF][80-BF]
67
[F4][80-8F][80-BF][80-BF]
68
```
69
70
This module automates the process of creating these byte ranges from ranges of
71
Unicode scalar values.
72
73
# Lineage
74
75
I got the idea and general implementation strategy from Russ Cox in his
76
[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77
Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78
I also got the idea from
79
[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80
which uses it for executing automata on their term index.
81
*/
82
83
use core::{char, fmt, iter::FusedIterator, slice};
84
85
use alloc::{vec, vec::Vec};
86
87
const MAX_UTF8_BYTES: usize = 4;
88
89
/// Utf8Sequence represents a sequence of byte ranges.
90
///
91
/// To match a Utf8Sequence, a candidate byte sequence must match each
92
/// successive range.
93
///
94
/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
95
/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
96
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
97
pub enum Utf8Sequence {
98
    /// One byte range.
99
    One(Utf8Range),
100
    /// Two successive byte ranges.
101
    Two([Utf8Range; 2]),
102
    /// Three successive byte ranges.
103
    Three([Utf8Range; 3]),
104
    /// Four successive byte ranges.
105
    Four([Utf8Range; 4]),
106
}
107
108
impl Utf8Sequence {
109
    /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
110
    /// range.
111
    ///
112
    /// This assumes that `start` and `end` have the same length.
113
7.88k
    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
114
7.88k
        assert_eq!(start.len(), end.len());
115
7.88k
        match start.len() {
116
488
            2 => Utf8Sequence::Two([
117
488
                Utf8Range::new(start[0], end[0]),
118
488
                Utf8Range::new(start[1], end[1]),
119
488
            ]),
120
3.71k
            3 => Utf8Sequence::Three([
121
3.71k
                Utf8Range::new(start[0], end[0]),
122
3.71k
                Utf8Range::new(start[1], end[1]),
123
3.71k
                Utf8Range::new(start[2], end[2]),
124
3.71k
            ]),
125
3.68k
            4 => Utf8Sequence::Four([
126
3.68k
                Utf8Range::new(start[0], end[0]),
127
3.68k
                Utf8Range::new(start[1], end[1]),
128
3.68k
                Utf8Range::new(start[2], end[2]),
129
3.68k
                Utf8Range::new(start[3], end[3]),
130
3.68k
            ]),
131
0
            n => unreachable!("invalid encoded length: {}", n),
132
        }
133
7.88k
    }
134
135
    /// Returns the underlying sequence of byte ranges as a slice.
136
7.95k
    pub fn as_slice(&self) -> &[Utf8Range] {
137
        use self::Utf8Sequence::*;
138
7.95k
        match *self {
139
74
            One(ref r) => slice::from_ref(r),
140
488
            Two(ref r) => &r[..],
141
3.71k
            Three(ref r) => &r[..],
142
3.68k
            Four(ref r) => &r[..],
143
        }
144
7.95k
    }
145
146
    /// Returns the number of byte ranges in this sequence.
147
    ///
148
    /// The length is guaranteed to be in the closed interval `[1, 4]`.
149
0
    pub fn len(&self) -> usize {
150
0
        self.as_slice().len()
151
0
    }
152
153
    /// Reverses the ranges in this sequence.
154
    ///
155
    /// For example, if this corresponds to the following sequence:
156
    ///
157
    /// ```text
158
    /// [D0-D3][80-BF]
159
    /// ```
160
    ///
161
    /// Then after reversal, it will be
162
    ///
163
    /// ```text
164
    /// [80-BF][D0-D3]
165
    /// ```
166
    ///
167
    /// This is useful when one is constructing a UTF-8 automaton to match
168
    /// character classes in reverse.
169
0
    pub fn reverse(&mut self) {
170
0
        match *self {
171
0
            Utf8Sequence::One(_) => {}
172
0
            Utf8Sequence::Two(ref mut x) => x.reverse(),
173
0
            Utf8Sequence::Three(ref mut x) => x.reverse(),
174
0
            Utf8Sequence::Four(ref mut x) => x.reverse(),
175
        }
176
0
    }
177
178
    /// Returns true if and only if a prefix of `bytes` matches this sequence
179
    /// of byte ranges.
180
0
    pub fn matches(&self, bytes: &[u8]) -> bool {
181
0
        if bytes.len() < self.len() {
182
0
            return false;
183
0
        }
184
0
        for (&b, r) in bytes.iter().zip(self) {
185
0
            if !r.matches(b) {
186
0
                return false;
187
0
            }
188
        }
189
0
        true
190
0
    }
191
}
192
193
impl<'a> IntoIterator for &'a Utf8Sequence {
194
    type IntoIter = slice::Iter<'a, Utf8Range>;
195
    type Item = &'a Utf8Range;
196
197
0
    fn into_iter(self) -> Self::IntoIter {
198
0
        self.as_slice().iter()
199
0
    }
200
}
201
202
impl fmt::Debug for Utf8Sequence {
203
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204
        use self::Utf8Sequence::*;
205
0
        match *self {
206
0
            One(ref r) => write!(f, "{:?}", r),
207
0
            Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
208
0
            Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
209
0
            Four(ref r) => {
210
0
                write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
211
            }
212
        }
213
0
    }
214
}
215
216
/// A single inclusive range of UTF-8 bytes.
217
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
218
pub struct Utf8Range {
219
    /// Start of byte range (inclusive).
220
    pub start: u8,
221
    /// End of byte range (inclusive).
222
    pub end: u8,
223
}
224
225
impl Utf8Range {
226
26.9k
    fn new(start: u8, end: u8) -> Self {
227
26.9k
        Utf8Range { start, end }
228
26.9k
    }
229
230
    /// Returns true if and only if the given byte is in this range.
231
0
    pub fn matches(&self, b: u8) -> bool {
232
0
        self.start <= b && b <= self.end
233
0
    }
234
}
235
236
impl fmt::Debug for Utf8Range {
237
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
238
0
        if self.start == self.end {
239
0
            write!(f, "[{:X}]", self.start)
240
        } else {
241
0
            write!(f, "[{:X}-{:X}]", self.start, self.end)
242
        }
243
0
    }
244
}
245
246
/// An iterator over ranges of matching UTF-8 byte sequences.
247
///
248
/// The iteration represents an alternation of comprehensive byte sequences
249
/// that match precisely the set of UTF-8 encoded scalar values.
250
///
251
/// A byte sequence corresponds to one of the scalar values in the range given
252
/// if and only if it completely matches exactly one of the sequences of byte
253
/// ranges produced by this iterator.
254
///
255
/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
256
/// sequences will match the same bytes.
257
///
258
/// # Example
259
///
260
/// This shows how to match an arbitrary byte sequence against a range of
261
/// scalar values.
262
///
263
/// ```rust
264
/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
265
///
266
/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
267
///     for range in seqs {
268
///         if range.matches(bytes) {
269
///             return true;
270
///         }
271
///     }
272
///     false
273
/// }
274
///
275
/// // Test the basic multilingual plane.
276
/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
277
///
278
/// // UTF-8 encoding of 'a'.
279
/// assert!(matches(&seqs, &[0x61]));
280
/// // UTF-8 encoding of '☃' (`\u{2603}`).
281
/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
282
/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
283
/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
284
/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
285
/// // which is invalid UTF-8, and therefore fails, despite the fact that
286
/// // the corresponding codepoint (0xD800) falls in the range given.
287
/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
288
/// // And fails against plain old invalid UTF-8.
289
/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
290
/// ```
291
///
292
/// If this example seems circuitous, that's because it is! It's meant to be
293
/// illustrative. In practice, you could just try to decode your byte sequence
294
/// and compare it with the scalar value range directly. However, this is not
295
/// always possible (for example, in a byte based automaton).
296
#[derive(Debug)]
297
pub struct Utf8Sequences {
298
    range_stack: Vec<ScalarRange>,
299
}
300
301
impl Utf8Sequences {
302
    /// Create a new iterator over UTF-8 byte ranges for the scalar value range
303
    /// given.
304
6.31k
    pub fn new(start: char, end: char) -> Self {
305
6.31k
        let range =
306
6.31k
            ScalarRange { start: u32::from(start), end: u32::from(end) };
307
6.31k
        Utf8Sequences { range_stack: vec![range] }
308
6.31k
    }
309
310
    /// reset resets the scalar value range.
311
    /// Any existing state is cleared, but resources may be reused.
312
    ///
313
    /// N.B. Benchmarks say that this method is dubious.
314
    #[doc(hidden)]
315
0
    pub fn reset(&mut self, start: char, end: char) {
316
0
        self.range_stack.clear();
317
0
        self.push(u32::from(start), u32::from(end));
318
0
    }
319
320
1.64k
    fn push(&mut self, start: u32, end: u32) {
321
1.64k
        self.range_stack.push(ScalarRange { start, end });
322
1.64k
    }
323
}
324
325
struct ScalarRange {
326
    start: u32,
327
    end: u32,
328
}
329
330
impl fmt::Debug for ScalarRange {
331
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332
0
        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
333
0
    }
334
}
335
336
impl Iterator for Utf8Sequences {
337
    type Item = Utf8Sequence;
338
339
14.2k
    fn next(&mut self) -> Option<Self::Item> {
340
14.2k
        'TOP: while let Some(mut r) = self.range_stack.pop() {
341
            'INNER: loop {
342
9.59k
                if let Some((r1, r2)) = r.split() {
343
8
                    self.push(r2.start, r2.end);
344
8
                    r.start = r1.start;
345
8
                    r.end = r1.end;
346
8
                    continue 'INNER;
347
9.59k
                }
348
9.59k
                if !r.is_valid() {
349
0
                    continue 'TOP;
350
9.59k
                }
351
38.3k
                for i in 1..MAX_UTF8_BYTES {
352
28.7k
                    let max = max_scalar_value(i);
353
28.7k
                    if r.start <= max && max < r.end {
354
24
                        self.push(max + 1, r.end);
355
24
                        r.end = max;
356
24
                        continue 'INNER;
357
28.7k
                    }
358
                }
359
9.56k
                if let Some(ascii_range) = r.as_ascii() {
360
74
                    return Some(Utf8Sequence::One(ascii_range));
361
9.49k
                }
362
33.2k
                for i in 1..MAX_UTF8_BYTES {
363
25.4k
                    let m = (1 << (6 * i)) - 1;
364
25.4k
                    if (r.start & !m) != (r.end & !m) {
365
2.30k
                        if (r.start & m) != 0 {
366
1.04k
                            self.push((r.start | m) + 1, r.end);
367
1.04k
                            r.end = r.start | m;
368
1.04k
                            continue 'INNER;
369
1.26k
                        }
370
1.26k
                        if (r.end & m) != m {
371
568
                            self.push(r.end & !m, r.end);
372
568
                            r.end = (r.end & !m) - 1;
373
568
                            continue 'INNER;
374
696
                        }
375
23.1k
                    }
376
                }
377
7.88k
                let mut start = [0; MAX_UTF8_BYTES];
378
7.88k
                let mut end = [0; MAX_UTF8_BYTES];
379
7.88k
                let n = r.encode(&mut start, &mut end);
380
7.88k
                return Some(Utf8Sequence::from_encoded_range(
381
7.88k
                    &start[0..n],
382
7.88k
                    &end[0..n],
383
7.88k
                ));
384
            }
385
        }
386
6.31k
        None
387
14.2k
    }
388
}
389
390
impl FusedIterator for Utf8Sequences {}
391
392
impl ScalarRange {
393
    /// split splits this range if it overlaps with a surrogate codepoint.
394
    ///
395
    /// Either or both ranges may be invalid.
396
9.59k
    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
397
9.59k
        if self.start < 0xE000 && self.end > 0xD7FF {
398
8
            Some((
399
8
                ScalarRange { start: self.start, end: 0xD7FF },
400
8
                ScalarRange { start: 0xE000, end: self.end },
401
8
            ))
402
        } else {
403
9.59k
            None
404
        }
405
9.59k
    }
406
407
    /// is_valid returns true if and only if start <= end.
408
19.1k
    fn is_valid(&self) -> bool {
409
19.1k
        self.start <= self.end
410
19.1k
    }
411
412
    /// as_ascii returns this range as a Utf8Range if and only if all scalar
413
    /// values in this range can be encoded as a single byte.
414
9.56k
    fn as_ascii(&self) -> Option<Utf8Range> {
415
9.56k
        if self.is_ascii() {
416
74
            let start = u8::try_from(self.start).unwrap();
417
74
            let end = u8::try_from(self.end).unwrap();
418
74
            Some(Utf8Range::new(start, end))
419
        } else {
420
9.49k
            None
421
        }
422
9.56k
    }
423
424
    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
425
    /// byte to encode any scalar value).
426
9.56k
    fn is_ascii(&self) -> bool {
427
9.56k
        self.is_valid() && self.end <= 0x7f
428
9.56k
    }
429
430
    /// encode writes the UTF-8 encoding of the start and end of this range
431
    /// to the corresponding destination slices, and returns the number of
432
    /// bytes written.
433
    ///
434
    /// The slices should have room for at least `MAX_UTF8_BYTES`.
435
7.88k
    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
436
7.88k
        let cs = char::from_u32(self.start).unwrap();
437
7.88k
        let ce = char::from_u32(self.end).unwrap();
438
7.88k
        let ss = cs.encode_utf8(start);
439
7.88k
        let se = ce.encode_utf8(end);
440
7.88k
        assert_eq!(ss.len(), se.len());
441
7.88k
        ss.len()
442
7.88k
    }
443
}
444
445
28.7k
fn max_scalar_value(nbytes: usize) -> u32 {
446
28.7k
    match nbytes {
447
9.59k
        1 => 0x007F,
448
9.58k
        2 => 0x07FF,
449
9.57k
        3 => 0xFFFF,
450
0
        4 => 0x0010_FFFF,
451
0
        _ => unreachable!("invalid UTF-8 byte sequence size"),
452
    }
453
28.7k
}
454
455
#[cfg(test)]
456
mod tests {
457
    use core::char;
458
459
    use alloc::{vec, vec::Vec};
460
461
    use crate::utf8::{Utf8Range, Utf8Sequences};
462
463
    fn rutf8(s: u8, e: u8) -> Utf8Range {
464
        Utf8Range::new(s, e)
465
    }
466
467
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
468
        for cp in 0xD800..0xE000 {
469
            let buf = encode_surrogate(cp);
470
            for r in Utf8Sequences::new(start, end) {
471
                if r.matches(&buf) {
472
                    panic!(
473
                        "Sequence ({:X}, {:X}) contains range {:?}, \
474
                         which matches surrogate code point {:X} \
475
                         with encoded bytes {:?}",
476
                        u32::from(start),
477
                        u32::from(end),
478
                        r,
479
                        cp,
480
                        buf,
481
                    );
482
                }
483
            }
484
        }
485
    }
486
487
    #[test]
488
    fn codepoints_no_surrogates() {
489
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
490
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
491
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
492
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
493
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
494
    }
495
496
    #[test]
497
    fn single_codepoint_one_sequence() {
498
        // Tests that every range of scalar values that contains a single
499
        // scalar value is recognized by one sequence of byte ranges.
500
        for i in 0x0..=0x0010_FFFF {
501
            let c = match char::from_u32(i) {
502
                None => continue,
503
                Some(c) => c,
504
            };
505
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
506
            assert_eq!(seqs.len(), 1);
507
        }
508
    }
509
510
    #[test]
511
    fn bmp() {
512
        use crate::utf8::Utf8Sequence::*;
513
514
        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
515
        assert_eq!(
516
            seqs,
517
            vec![
518
                One(rutf8(0x0, 0x7F)),
519
                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
520
                Three([
521
                    rutf8(0xE0, 0xE0),
522
                    rutf8(0xA0, 0xBF),
523
                    rutf8(0x80, 0xBF)
524
                ]),
525
                Three([
526
                    rutf8(0xE1, 0xEC),
527
                    rutf8(0x80, 0xBF),
528
                    rutf8(0x80, 0xBF)
529
                ]),
530
                Three([
531
                    rutf8(0xED, 0xED),
532
                    rutf8(0x80, 0x9F),
533
                    rutf8(0x80, 0xBF)
534
                ]),
535
                Three([
536
                    rutf8(0xEE, 0xEF),
537
                    rutf8(0x80, 0xBF),
538
                    rutf8(0x80, 0xBF)
539
                ]),
540
            ]
541
        );
542
    }
543
544
    #[test]
545
    fn reverse() {
546
        use crate::utf8::Utf8Sequence::*;
547
548
        let mut s = One(rutf8(0xA, 0xB));
549
        s.reverse();
550
        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
551
552
        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
553
        s.reverse();
554
        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
555
556
        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
557
        s.reverse();
558
        assert_eq!(
559
            s.as_slice(),
560
            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
561
        );
562
563
        let mut s = Four([
564
            rutf8(0xA, 0xB),
565
            rutf8(0xB, 0xC),
566
            rutf8(0xC, 0xD),
567
            rutf8(0xD, 0xE),
568
        ]);
569
        s.reverse();
570
        assert_eq!(
571
            s.as_slice(),
572
            &[
573
                rutf8(0xD, 0xE),
574
                rutf8(0xC, 0xD),
575
                rutf8(0xB, 0xC),
576
                rutf8(0xA, 0xB)
577
            ]
578
        );
579
    }
580
581
    fn encode_surrogate(cp: u32) -> [u8; 3] {
582
        const TAG_CONT: u8 = 0b1000_0000;
583
        const TAG_THREE_B: u8 = 0b1110_0000;
584
585
        assert!(0xD800 <= cp && cp < 0xE000);
586
        let mut dst = [0; 3];
587
        dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B;
588
        dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT;
589
        dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
590
        dst
591
    }
592
}