Coverage Report

Created: 2023-04-25 07:07

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.6.25/src/utf8.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4
This is sub-module is useful for constructing byte based automatons that need
5
to embed UTF-8 decoding. The most common use of this module is in conjunction
6
with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.
7
8
See the documentation on the `Utf8Sequences` iterator for more details and
9
an example.
10
11
# Wait, what is this?
12
13
This is simplest to explain with an example. Let's say you wanted to test
14
whether a particular byte sequence was a Cyrillic character. One possible
15
scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16
range can be expressed as a sequence of byte ranges:
17
18
```text
19
[D0-D3][80-BF]
20
```
21
22
This is simple enough: simply encode the boundaries, `0400` encodes to
23
`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24
corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26
However, what if you wanted to add the Cyrillic Supplementary characters to
27
your range? Your range might then become `[0400-052F]`. The same procedure
28
as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29
you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30
this isn't quite correct because this range doesn't capture many characters,
31
for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33
Instead, you need multiple sequences of byte ranges:
34
35
```text
36
[D0-D3][80-BF]  # matches codepoints 0400-04FF
37
[D4][80-AF]     # matches codepoints 0500-052F
38
```
39
40
This gets even more complicated if you want bigger ranges, particularly if
41
they naively contain surrogate codepoints. For example, the sequence of byte
42
ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44
```text
45
[0-7F]
46
[C2-DF][80-BF]
47
[E0][A0-BF][80-BF]
48
[E1-EC][80-BF][80-BF]
49
[ED][80-9F][80-BF]
50
[EE-EF][80-BF][80-BF]
51
```
52
53
Note that the byte ranges above will *not* match any erroneous encoding of
54
UTF-8, including encodings of surrogate codepoints.
55
56
And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58
```text
59
[0-7F]
60
[C2-DF][80-BF]
61
[E0][A0-BF][80-BF]
62
[E1-EC][80-BF][80-BF]
63
[ED][80-9F][80-BF]
64
[EE-EF][80-BF][80-BF]
65
[F0][90-BF][80-BF][80-BF]
66
[F1-F3][80-BF][80-BF][80-BF]
67
[F4][80-8F][80-BF][80-BF]
68
```
69
70
This module automates the process of creating these byte ranges from ranges of
71
Unicode scalar values.
72
73
# Lineage
74
75
I got the idea and general implementation strategy from Russ Cox in his
76
[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77
Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78
I also got the idea from
79
[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80
which uses it for executing automata on their term index.
81
*/
82
83
#![deny(missing_docs)]
84
85
use std::char;
86
use std::fmt;
87
use std::iter::FusedIterator;
88
use std::slice;
89
90
const MAX_UTF8_BYTES: usize = 4;
91
92
/// Utf8Sequence represents a sequence of byte ranges.
93
///
94
/// To match a Utf8Sequence, a candidate byte sequence must match each
95
/// successive range.
96
///
97
/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
98
/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
99
0
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
100
pub enum Utf8Sequence {
101
    /// One byte range.
102
    One(Utf8Range),
103
    /// Two successive byte ranges.
104
    Two([Utf8Range; 2]),
105
    /// Three successive byte ranges.
106
    Three([Utf8Range; 3]),
107
    /// Four successive byte ranges.
108
    Four([Utf8Range; 4]),
109
}
110
111
impl Utf8Sequence {
112
    /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
113
    /// range.
114
    ///
115
    /// This assumes that `start` and `end` have the same length.
116
0
    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
117
0
        assert_eq!(start.len(), end.len());
118
0
        match start.len() {
119
0
            2 => Utf8Sequence::Two([
120
0
                Utf8Range::new(start[0], end[0]),
121
0
                Utf8Range::new(start[1], end[1]),
122
0
            ]),
123
0
            3 => Utf8Sequence::Three([
124
0
                Utf8Range::new(start[0], end[0]),
125
0
                Utf8Range::new(start[1], end[1]),
126
0
                Utf8Range::new(start[2], end[2]),
127
0
            ]),
128
0
            4 => Utf8Sequence::Four([
129
0
                Utf8Range::new(start[0], end[0]),
130
0
                Utf8Range::new(start[1], end[1]),
131
0
                Utf8Range::new(start[2], end[2]),
132
0
                Utf8Range::new(start[3], end[3]),
133
0
            ]),
134
0
            n => unreachable!("invalid encoded length: {}", n),
135
        }
136
0
    }
137
138
    /// Returns the underlying sequence of byte ranges as a slice.
139
0
    pub fn as_slice(&self) -> &[Utf8Range] {
140
0
        use self::Utf8Sequence::*;
141
0
        match *self {
142
0
            One(ref r) => slice::from_ref(r),
143
0
            Two(ref r) => &r[..],
144
0
            Three(ref r) => &r[..],
145
0
            Four(ref r) => &r[..],
146
        }
147
0
    }
148
149
    /// Returns the number of byte ranges in this sequence.
150
    ///
151
    /// The length is guaranteed to be in the closed interval `[1, 4]`.
152
0
    pub fn len(&self) -> usize {
153
0
        self.as_slice().len()
154
0
    }
155
156
    /// Reverses the ranges in this sequence.
157
    ///
158
    /// For example, if this corresponds to the following sequence:
159
    ///
160
    /// ```text
161
    /// [D0-D3][80-BF]
162
    /// ```
163
    ///
164
    /// Then after reversal, it will be
165
    ///
166
    /// ```text
167
    /// [80-BF][D0-D3]
168
    /// ```
169
    ///
170
    /// This is useful when one is constructing a UTF-8 automaton to match
171
    /// character classes in reverse.
172
0
    pub fn reverse(&mut self) {
173
0
        match *self {
174
0
            Utf8Sequence::One(_) => {}
175
0
            Utf8Sequence::Two(ref mut x) => x.reverse(),
176
0
            Utf8Sequence::Three(ref mut x) => x.reverse(),
177
0
            Utf8Sequence::Four(ref mut x) => x.reverse(),
178
        }
179
0
    }
180
181
    /// Returns true if and only if a prefix of `bytes` matches this sequence
182
    /// of byte ranges.
183
0
    pub fn matches(&self, bytes: &[u8]) -> bool {
184
0
        if bytes.len() < self.len() {
185
0
            return false;
186
0
        }
187
0
        for (&b, r) in bytes.iter().zip(self) {
188
0
            if !r.matches(b) {
189
0
                return false;
190
0
            }
191
        }
192
0
        true
193
0
    }
194
}
195
196
impl<'a> IntoIterator for &'a Utf8Sequence {
197
    type IntoIter = slice::Iter<'a, Utf8Range>;
198
    type Item = &'a Utf8Range;
199
200
0
    fn into_iter(self) -> Self::IntoIter {
201
0
        self.as_slice().into_iter()
202
0
    }
203
}
204
205
impl fmt::Debug for Utf8Sequence {
206
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
207
0
        use self::Utf8Sequence::*;
208
0
        match *self {
209
0
            One(ref r) => write!(f, "{:?}", r),
210
0
            Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
211
0
            Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
212
0
            Four(ref r) => {
213
0
                write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
214
            }
215
        }
216
0
    }
217
}
218
219
/// A single inclusive range of UTF-8 bytes.
220
0
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
221
pub struct Utf8Range {
222
    /// Start of byte range (inclusive).
223
    pub start: u8,
224
    /// End of byte range (inclusive).
225
    pub end: u8,
226
}
227
228
impl Utf8Range {
229
0
    fn new(start: u8, end: u8) -> Self {
230
0
        Utf8Range { start, end }
231
0
    }
232
233
    /// Returns true if and only if the given byte is in this range.
234
0
    pub fn matches(&self, b: u8) -> bool {
235
0
        self.start <= b && b <= self.end
236
0
    }
237
}
238
239
impl fmt::Debug for Utf8Range {
240
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
241
0
        if self.start == self.end {
242
0
            write!(f, "[{:X}]", self.start)
243
        } else {
244
0
            write!(f, "[{:X}-{:X}]", self.start, self.end)
245
        }
246
0
    }
247
}
248
249
/// An iterator over ranges of matching UTF-8 byte sequences.
250
///
251
/// The iteration represents an alternation of comprehensive byte sequences
252
/// that match precisely the set of UTF-8 encoded scalar values.
253
///
254
/// A byte sequence corresponds to one of the scalar values in the range given
255
/// if and only if it completely matches exactly one of the sequences of byte
256
/// ranges produced by this iterator.
257
///
258
/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
259
/// sequences will match the same bytes.
260
///
261
/// # Example
262
///
263
/// This shows how to match an arbitrary byte sequence against a range of
264
/// scalar values.
265
///
266
/// ```rust
267
/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
268
///
269
/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
270
///     for range in seqs {
271
///         if range.matches(bytes) {
272
///             return true;
273
///         }
274
///     }
275
///     false
276
/// }
277
///
278
/// // Test the basic multilingual plane.
279
/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
280
///
281
/// // UTF-8 encoding of 'a'.
282
/// assert!(matches(&seqs, &[0x61]));
283
/// // UTF-8 encoding of '☃' (`\u{2603}`).
284
/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
285
/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
286
/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
287
/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
288
/// // which is invalid UTF-8, and therefore fails, despite the fact that
289
/// // the corresponding codepoint (0xD800) falls in the range given.
290
/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
291
/// // And fails against plain old invalid UTF-8.
292
/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
293
/// ```
294
///
295
/// If this example seems circuitous, that's because it is! It's meant to be
296
/// illustrative. In practice, you could just try to decode your byte sequence
297
/// and compare it with the scalar value range directly. However, this is not
298
/// always possible (for example, in a byte based automaton).
299
0
#[derive(Debug)]
300
pub struct Utf8Sequences {
301
    range_stack: Vec<ScalarRange>,
302
}
303
304
impl Utf8Sequences {
305
    /// Create a new iterator over UTF-8 byte ranges for the scalar value range
306
    /// given.
307
0
    pub fn new(start: char, end: char) -> Self {
308
0
        let mut it = Utf8Sequences { range_stack: vec![] };
309
0
        it.push(start as u32, end as u32);
310
0
        it
311
0
    }
312
313
    /// reset resets the scalar value range.
314
    /// Any existing state is cleared, but resources may be reused.
315
    ///
316
    /// N.B. Benchmarks say that this method is dubious.
317
    #[doc(hidden)]
318
0
    pub fn reset(&mut self, start: char, end: char) {
319
0
        self.range_stack.clear();
320
0
        self.push(start as u32, end as u32);
321
0
    }
322
323
0
    fn push(&mut self, start: u32, end: u32) {
324
0
        self.range_stack.push(ScalarRange { start, end });
325
0
    }
326
}
327
328
struct ScalarRange {
329
    start: u32,
330
    end: u32,
331
}
332
333
impl fmt::Debug for ScalarRange {
334
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
335
0
        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
336
0
    }
337
}
338
339
impl Iterator for Utf8Sequences {
340
    type Item = Utf8Sequence;
341
342
0
    fn next(&mut self) -> Option<Self::Item> {
343
0
        'TOP: while let Some(mut r) = self.range_stack.pop() {
344
            'INNER: loop {
345
0
                if let Some((r1, r2)) = r.split() {
346
0
                    self.push(r2.start, r2.end);
347
0
                    r.start = r1.start;
348
0
                    r.end = r1.end;
349
0
                    continue 'INNER;
350
0
                }
351
0
                if !r.is_valid() {
352
0
                    continue 'TOP;
353
0
                }
354
0
                for i in 1..MAX_UTF8_BYTES {
355
0
                    let max = max_scalar_value(i);
356
0
                    if r.start <= max && max < r.end {
357
0
                        self.push(max + 1, r.end);
358
0
                        r.end = max;
359
0
                        continue 'INNER;
360
0
                    }
361
                }
362
0
                if let Some(ascii_range) = r.as_ascii() {
363
0
                    return Some(Utf8Sequence::One(ascii_range));
364
0
                }
365
0
                for i in 1..MAX_UTF8_BYTES {
366
0
                    let m = (1 << (6 * i)) - 1;
367
0
                    if (r.start & !m) != (r.end & !m) {
368
0
                        if (r.start & m) != 0 {
369
0
                            self.push((r.start | m) + 1, r.end);
370
0
                            r.end = r.start | m;
371
0
                            continue 'INNER;
372
0
                        }
373
0
                        if (r.end & m) != m {
374
0
                            self.push(r.end & !m, r.end);
375
0
                            r.end = (r.end & !m) - 1;
376
0
                            continue 'INNER;
377
0
                        }
378
0
                    }
379
                }
380
0
                let mut start = [0; MAX_UTF8_BYTES];
381
0
                let mut end = [0; MAX_UTF8_BYTES];
382
0
                let n = r.encode(&mut start, &mut end);
383
0
                return Some(Utf8Sequence::from_encoded_range(
384
0
                    &start[0..n],
385
0
                    &end[0..n],
386
0
                ));
387
            }
388
        }
389
0
        None
390
0
    }
391
}
392
393
impl FusedIterator for Utf8Sequences {}
394
395
impl ScalarRange {
396
    /// split splits this range if it overlaps with a surrogate codepoint.
397
    ///
398
    /// Either or both ranges may be invalid.
399
0
    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
400
0
        if self.start < 0xE000 && self.end > 0xD7FF {
401
0
            Some((
402
0
                ScalarRange { start: self.start, end: 0xD7FF },
403
0
                ScalarRange { start: 0xE000, end: self.end },
404
0
            ))
405
        } else {
406
0
            None
407
        }
408
0
    }
409
410
    /// is_valid returns true if and only if start <= end.
411
0
    fn is_valid(&self) -> bool {
412
0
        self.start <= self.end
413
0
    }
414
415
    /// as_ascii returns this range as a Utf8Range if and only if all scalar
416
    /// values in this range can be encoded as a single byte.
417
0
    fn as_ascii(&self) -> Option<Utf8Range> {
418
0
        if self.is_ascii() {
419
0
            Some(Utf8Range::new(self.start as u8, self.end as u8))
420
        } else {
421
0
            None
422
        }
423
0
    }
424
425
    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
426
    /// byte to encode any scalar value).
427
0
    fn is_ascii(&self) -> bool {
428
0
        self.is_valid() && self.end <= 0x7f
429
0
    }
430
431
    /// encode writes the UTF-8 encoding of the start and end of this range
432
    /// to the corresponding destination slices, and returns the number of
433
    /// bytes written.
434
    ///
435
    /// The slices should have room for at least `MAX_UTF8_BYTES`.
436
0
    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
437
0
        let cs = char::from_u32(self.start).unwrap();
438
0
        let ce = char::from_u32(self.end).unwrap();
439
0
        let ss = cs.encode_utf8(start);
440
0
        let se = ce.encode_utf8(end);
441
0
        assert_eq!(ss.len(), se.len());
442
0
        ss.len()
443
0
    }
444
}
445
446
0
fn max_scalar_value(nbytes: usize) -> u32 {
447
0
    match nbytes {
448
0
        1 => 0x007F,
449
0
        2 => 0x07FF,
450
0
        3 => 0xFFFF,
451
0
        4 => 0x10FFFF,
452
0
        _ => unreachable!("invalid UTF-8 byte sequence size"),
453
    }
454
0
}
455
456
#[cfg(test)]
457
mod tests {
458
    use std::char;
459
460
    use crate::utf8::{Utf8Range, Utf8Sequences};
461
462
    fn rutf8(s: u8, e: u8) -> Utf8Range {
463
        Utf8Range::new(s, e)
464
    }
465
466
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
467
        for cp in 0xD800..0xE000 {
468
            let buf = encode_surrogate(cp);
469
            for r in Utf8Sequences::new(start, end) {
470
                if r.matches(&buf) {
471
                    panic!(
472
                        "Sequence ({:X}, {:X}) contains range {:?}, \
473
                         which matches surrogate code point {:X} \
474
                         with encoded bytes {:?}",
475
                        start as u32, end as u32, r, cp, buf,
476
                    );
477
                }
478
            }
479
        }
480
    }
481
482
    #[test]
483
    fn codepoints_no_surrogates() {
484
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
485
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
486
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
487
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
488
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
489
    }
490
491
    #[test]
492
    fn single_codepoint_one_sequence() {
493
        // Tests that every range of scalar values that contains a single
494
        // scalar value is recognized by one sequence of byte ranges.
495
        for i in 0x0..(0x10FFFF + 1) {
496
            let c = match char::from_u32(i) {
497
                None => continue,
498
                Some(c) => c,
499
            };
500
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
501
            assert_eq!(seqs.len(), 1);
502
        }
503
    }
504
505
    #[test]
506
    fn bmp() {
507
        use crate::utf8::Utf8Sequence::*;
508
509
        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
510
        assert_eq!(
511
            seqs,
512
            vec![
513
                One(rutf8(0x0, 0x7F)),
514
                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
515
                Three([
516
                    rutf8(0xE0, 0xE0),
517
                    rutf8(0xA0, 0xBF),
518
                    rutf8(0x80, 0xBF)
519
                ]),
520
                Three([
521
                    rutf8(0xE1, 0xEC),
522
                    rutf8(0x80, 0xBF),
523
                    rutf8(0x80, 0xBF)
524
                ]),
525
                Three([
526
                    rutf8(0xED, 0xED),
527
                    rutf8(0x80, 0x9F),
528
                    rutf8(0x80, 0xBF)
529
                ]),
530
                Three([
531
                    rutf8(0xEE, 0xEF),
532
                    rutf8(0x80, 0xBF),
533
                    rutf8(0x80, 0xBF)
534
                ]),
535
            ]
536
        );
537
    }
538
539
    #[test]
540
    fn reverse() {
541
        use crate::utf8::Utf8Sequence::*;
542
543
        let mut s = One(rutf8(0xA, 0xB));
544
        s.reverse();
545
        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
546
547
        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
548
        s.reverse();
549
        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
550
551
        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
552
        s.reverse();
553
        assert_eq!(
554
            s.as_slice(),
555
            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
556
        );
557
558
        let mut s = Four([
559
            rutf8(0xA, 0xB),
560
            rutf8(0xB, 0xC),
561
            rutf8(0xC, 0xD),
562
            rutf8(0xD, 0xE),
563
        ]);
564
        s.reverse();
565
        assert_eq!(
566
            s.as_slice(),
567
            &[
568
                rutf8(0xD, 0xE),
569
                rutf8(0xC, 0xD),
570
                rutf8(0xB, 0xC),
571
                rutf8(0xA, 0xB)
572
            ]
573
        );
574
    }
575
576
    fn encode_surrogate(cp: u32) -> [u8; 3] {
577
        const TAG_CONT: u8 = 0b1000_0000;
578
        const TAG_THREE_B: u8 = 0b1110_0000;
579
580
        assert!(0xD800 <= cp && cp < 0xE000);
581
        let mut dst = [0; 3];
582
        dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
583
        dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
584
        dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
585
        dst
586
    }
587
}