Coverage Report

Created: 2026-03-31 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-1.5.6/src/utf8.rs
Line
Count
Source
1
/// A few elementary UTF-8 encoding and decoding functions used by the matching
2
/// engines.
3
///
4
/// In an ideal world, the matching engines operate on `&str` and we can just
5
/// lean on the standard library for all our UTF-8 needs. However, to support
6
/// byte based regexes (that can match on arbitrary bytes which may contain
7
/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8
/// The standard library doesn't really recognize this use case, so we have
9
/// to build it out ourselves.
10
///
11
/// Should this be factored out into a separate crate? It seems independently
12
/// useful. There are other crates that already exist (e.g., `utf-8`) that have
13
/// overlapping use cases. Not sure what to do.
14
use std::char;
15
16
const TAG_CONT: u8 = 0b1000_0000;
17
const TAG_TWO: u8 = 0b1100_0000;
18
const TAG_THREE: u8 = 0b1110_0000;
19
const TAG_FOUR: u8 = 0b1111_0000;
20
21
/// Returns the smallest possible index of the next valid UTF-8 sequence
22
/// starting after `i`.
23
0
pub fn next_utf8(text: &[u8], i: usize) -> usize {
24
0
    let b = match text.get(i) {
25
0
        None => return i + 1,
26
0
        Some(&b) => b,
27
    };
28
0
    let inc = if b <= 0x7F {
29
0
        1
30
0
    } else if b <= 0b110_11111 {
31
0
        2
32
0
    } else if b <= 0b1110_1111 {
33
0
        3
34
    } else {
35
0
        4
36
    };
37
0
    i + inc
38
0
}
Unexecuted instantiation: regex::utf8::next_utf8
Unexecuted instantiation: regex::utf8::next_utf8
39
40
/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
41
///
42
/// If no valid UTF-8 sequence could be found, then `None` is returned.
43
/// Otherwise, the decoded codepoint and the number of bytes read are returned.
44
/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
45
/// 1, 2, 3 or 4.
46
///
47
/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
48
/// codepoint that is out of range (surrogate codepoints are out of range) or
49
/// is not the shortest possible UTF-8 sequence for that codepoint.
50
#[inline]
51
1.32M
pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52
1.32M
    let b0 = match src.get(0) {
53
0
        None => return None,
54
1.32M
        Some(&b) if b <= 0x7F => return Some((b as char, 1)),
55
3.36k
        Some(&b) => b,
56
    };
57
3.36k
    match b0 {
58
3.36k
        0b110_00000..=0b110_11111 => {
59
95
            if src.len() < 2 {
60
0
                return None;
61
95
            }
62
95
            let b1 = src[1];
63
95
            if 0b11_000000 & b1 != TAG_CONT {
64
0
                return None;
65
95
            }
66
95
            let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
67
95
            match cp {
68
95
                0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
regex::utf8::decode_utf8::{closure#0}
Line
Count
Source
68
42
                0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
regex::utf8::decode_utf8::{closure#0}
Line
Count
Source
68
53
                0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
69
0
                _ => None,
70
            }
71
        }
72
3.26k
        0b1110_0000..=0b1110_1111 => {
73
2.45k
            if src.len() < 3 {
74
0
                return None;
75
2.45k
            }
76
2.45k
            let (b1, b2) = (src[1], src[2]);
77
2.45k
            if 0b11_000000 & b1 != TAG_CONT {
78
0
                return None;
79
2.45k
            }
80
2.45k
            if 0b11_000000 & b2 != TAG_CONT {
81
0
                return None;
82
2.45k
            }
83
2.45k
            let cp = ((b0 & !TAG_THREE) as u32) << 12
84
2.45k
                | ((b1 & !TAG_CONT) as u32) << 6
85
2.45k
                | ((b2 & !TAG_CONT) as u32);
86
2.45k
            match cp {
87
                // char::from_u32 will disallow surrogate codepoints.
88
2.45k
                0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
regex::utf8::decode_utf8::{closure#1}
Line
Count
Source
88
1.88k
                0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
regex::utf8::decode_utf8::{closure#1}
Line
Count
Source
88
567
                0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
89
0
                _ => None,
90
            }
91
        }
92
816
        0b11110_000..=0b11110_111 => {
93
816
            if src.len() < 4 {
94
0
                return None;
95
816
            }
96
816
            let (b1, b2, b3) = (src[1], src[2], src[3]);
97
816
            if 0b11_000000 & b1 != TAG_CONT {
98
0
                return None;
99
816
            }
100
816
            if 0b11_000000 & b2 != TAG_CONT {
101
0
                return None;
102
816
            }
103
816
            if 0b11_000000 & b3 != TAG_CONT {
104
0
                return None;
105
816
            }
106
816
            let cp = ((b0 & !TAG_FOUR) as u32) << 18
107
816
                | ((b1 & !TAG_CONT) as u32) << 12
108
816
                | ((b2 & !TAG_CONT) as u32) << 6
109
816
                | ((b3 & !TAG_CONT) as u32);
110
816
            match cp {
111
816
                0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
regex::utf8::decode_utf8::{closure#2}
Line
Count
Source
111
791
                0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
regex::utf8::decode_utf8::{closure#2}
Line
Count
Source
111
25
                0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
112
0
                _ => None,
113
            }
114
        }
115
0
        _ => None,
116
    }
117
1.32M
}
regex::utf8::decode_utf8
Line
Count
Source
51
789k
pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52
789k
    let b0 = match src.get(0) {
53
0
        None => return None,
54
789k
        Some(&b) if b <= 0x7F => return Some((b as char, 1)),
55
2.71k
        Some(&b) => b,
56
    };
57
2.71k
    match b0 {
58
2.71k
        0b110_00000..=0b110_11111 => {
59
42
            if src.len() < 2 {
60
0
                return None;
61
42
            }
62
42
            let b1 = src[1];
63
42
            if 0b11_000000 & b1 != TAG_CONT {
64
0
                return None;
65
42
            }
66
42
            let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
67
42
            match cp {
68
42
                0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
69
0
                _ => None,
70
            }
71
        }
72
2.67k
        0b1110_0000..=0b1110_1111 => {
73
1.88k
            if src.len() < 3 {
74
0
                return None;
75
1.88k
            }
76
1.88k
            let (b1, b2) = (src[1], src[2]);
77
1.88k
            if 0b11_000000 & b1 != TAG_CONT {
78
0
                return None;
79
1.88k
            }
80
1.88k
            if 0b11_000000 & b2 != TAG_CONT {
81
0
                return None;
82
1.88k
            }
83
1.88k
            let cp = ((b0 & !TAG_THREE) as u32) << 12
84
1.88k
                | ((b1 & !TAG_CONT) as u32) << 6
85
1.88k
                | ((b2 & !TAG_CONT) as u32);
86
1.88k
            match cp {
87
                // char::from_u32 will disallow surrogate codepoints.
88
1.88k
                0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
89
0
                _ => None,
90
            }
91
        }
92
791
        0b11110_000..=0b11110_111 => {
93
791
            if src.len() < 4 {
94
0
                return None;
95
791
            }
96
791
            let (b1, b2, b3) = (src[1], src[2], src[3]);
97
791
            if 0b11_000000 & b1 != TAG_CONT {
98
0
                return None;
99
791
            }
100
791
            if 0b11_000000 & b2 != TAG_CONT {
101
0
                return None;
102
791
            }
103
791
            if 0b11_000000 & b3 != TAG_CONT {
104
0
                return None;
105
791
            }
106
791
            let cp = ((b0 & !TAG_FOUR) as u32) << 18
107
791
                | ((b1 & !TAG_CONT) as u32) << 12
108
791
                | ((b2 & !TAG_CONT) as u32) << 6
109
791
                | ((b3 & !TAG_CONT) as u32);
110
791
            match cp {
111
791
                0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
112
0
                _ => None,
113
            }
114
        }
115
0
        _ => None,
116
    }
117
789k
}
regex::utf8::decode_utf8
Line
Count
Source
51
539k
pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52
539k
    let b0 = match src.get(0) {
53
0
        None => return None,
54
539k
        Some(&b) if b <= 0x7F => return Some((b as char, 1)),
55
645
        Some(&b) => b,
56
    };
57
645
    match b0 {
58
645
        0b110_00000..=0b110_11111 => {
59
53
            if src.len() < 2 {
60
0
                return None;
61
53
            }
62
53
            let b1 = src[1];
63
53
            if 0b11_000000 & b1 != TAG_CONT {
64
0
                return None;
65
53
            }
66
53
            let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
67
53
            match cp {
68
53
                0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
69
0
                _ => None,
70
            }
71
        }
72
592
        0b1110_0000..=0b1110_1111 => {
73
567
            if src.len() < 3 {
74
0
                return None;
75
567
            }
76
567
            let (b1, b2) = (src[1], src[2]);
77
567
            if 0b11_000000 & b1 != TAG_CONT {
78
0
                return None;
79
567
            }
80
567
            if 0b11_000000 & b2 != TAG_CONT {
81
0
                return None;
82
567
            }
83
567
            let cp = ((b0 & !TAG_THREE) as u32) << 12
84
567
                | ((b1 & !TAG_CONT) as u32) << 6
85
567
                | ((b2 & !TAG_CONT) as u32);
86
567
            match cp {
87
                // char::from_u32 will disallow surrogate codepoints.
88
567
                0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
89
0
                _ => None,
90
            }
91
        }
92
25
        0b11110_000..=0b11110_111 => {
93
25
            if src.len() < 4 {
94
0
                return None;
95
25
            }
96
25
            let (b1, b2, b3) = (src[1], src[2], src[3]);
97
25
            if 0b11_000000 & b1 != TAG_CONT {
98
0
                return None;
99
25
            }
100
25
            if 0b11_000000 & b2 != TAG_CONT {
101
0
                return None;
102
25
            }
103
25
            if 0b11_000000 & b3 != TAG_CONT {
104
0
                return None;
105
25
            }
106
25
            let cp = ((b0 & !TAG_FOUR) as u32) << 18
107
25
                | ((b1 & !TAG_CONT) as u32) << 12
108
25
                | ((b2 & !TAG_CONT) as u32) << 6
109
25
                | ((b3 & !TAG_CONT) as u32);
110
25
            match cp {
111
25
                0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
112
0
                _ => None,
113
            }
114
        }
115
0
        _ => None,
116
    }
117
539k
}
118
119
/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
120
/// of the first.
121
0
pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
122
0
    if src.is_empty() {
123
0
        return None;
124
0
    }
125
0
    let mut start = src.len() - 1;
126
0
    if src[start] <= 0x7F {
127
0
        return Some((src[start] as char, 1));
128
0
    }
129
0
    while start > src.len().saturating_sub(4) {
130
0
        start -= 1;
131
0
        if is_start_byte(src[start]) {
132
0
            break;
133
0
        }
134
    }
135
0
    match decode_utf8(&src[start..]) {
136
0
        None => None,
137
0
        Some((_, n)) if n < src.len() - start => None,
138
0
        Some((cp, n)) => Some((cp, n)),
139
    }
140
0
}
Unexecuted instantiation: regex::utf8::decode_last_utf8
Unexecuted instantiation: regex::utf8::decode_last_utf8
141
142
0
fn is_start_byte(b: u8) -> bool {
143
0
    b & 0b11_000000 != 0b1_0000000
144
0
}
Unexecuted instantiation: regex::utf8::is_start_byte
Unexecuted instantiation: regex::utf8::is_start_byte
145
146
#[cfg(test)]
147
mod tests {
148
    use std::str;
149
150
    use quickcheck::quickcheck;
151
152
    use super::{
153
        decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
154
    };
155
156
    #[test]
157
    fn prop_roundtrip() {
158
        fn p(given_cp: char) -> bool {
159
            let mut tmp = [0; 4];
160
            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
161
            let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
162
            encoded_len == got_len && given_cp == got_cp
163
        }
164
        quickcheck(p as fn(char) -> bool)
165
    }
166
167
    #[test]
168
    fn prop_roundtrip_last() {
169
        fn p(given_cp: char) -> bool {
170
            let mut tmp = [0; 4];
171
            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
172
            let (got_cp, got_len) =
173
                decode_last_utf8(&tmp[..encoded_len]).unwrap();
174
            encoded_len == got_len && given_cp == got_cp
175
        }
176
        quickcheck(p as fn(char) -> bool)
177
    }
178
179
    #[test]
180
    fn prop_encode_matches_std() {
181
        fn p(cp: char) -> bool {
182
            let mut got = [0; 4];
183
            let n = cp.encode_utf8(&mut got).len();
184
            let expected = cp.to_string();
185
            &got[..n] == expected.as_bytes()
186
        }
187
        quickcheck(p as fn(char) -> bool)
188
    }
189
190
    #[test]
191
    fn prop_decode_matches_std() {
192
        fn p(given_cp: char) -> bool {
193
            let mut tmp = [0; 4];
194
            let n = given_cp.encode_utf8(&mut tmp).len();
195
            let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
196
            let expected_cp =
197
                str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
198
            got_cp == expected_cp
199
        }
200
        quickcheck(p as fn(char) -> bool)
201
    }
202
203
    #[test]
204
    fn prop_decode_last_matches_std() {
205
        fn p(given_cp: char) -> bool {
206
            let mut tmp = [0; 4];
207
            let n = given_cp.encode_utf8(&mut tmp).len();
208
            let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
209
            let expected_cp = str::from_utf8(&tmp[..n])
210
                .unwrap()
211
                .chars()
212
                .rev()
213
                .next()
214
                .unwrap();
215
            got_cp == expected_cp
216
        }
217
        quickcheck(p as fn(char) -> bool)
218
    }
219
220
    #[test]
221
    fn reject_invalid() {
222
        // Invalid start byte
223
        assert_eq!(decode_utf8(&[0xFF]), None);
224
        // Surrogate codepoint (a lone surrogate, U+D801, not a pair)
225
        assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
226
        // Invalid continuation byte.
227
        assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
228
        // Bad lengths
229
        assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
230
        assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
231
        assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
232
        // Not a minimal UTF-8 sequence
233
        assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
234
        assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
235
        assert_eq!(
236
            decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
237
            None
238
        );
239
    }
240
241
    #[test]
242
    fn reject_invalid_last() {
243
        // Invalid start byte
244
        assert_eq!(decode_last_utf8(&[0xFF]), None);
245
        // Surrogate codepoint (a lone surrogate, U+D801, not a pair)
246
        assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
247
        // Bad lengths
248
        assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
249
        assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
250
        assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
251
        // Not a minimal UTF-8 sequence
252
        assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
253
        assert_eq!(
254
            decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
255
            None
256
        );
257
        assert_eq!(
258
            decode_last_utf8(
259
                &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
260
            ),
261
            None
262
        );
263
    }
264
}