Coverage Report

Created: 2025-07-11 07:02

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/util/utf8.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Utilities for dealing with UTF-8.
3
4
This module provides some UTF-8 related helper routines, including an
5
incremental decoder.
6
*/
7
8
/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// A word byte is one of `0-9`, `A-Z`, `a-z` or `_`. Every non-ASCII byte
/// (`0x80..=0xFF`) is reported as a non-word byte.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    /// Builds the 256-entry membership table at compile time.
    const fn mkwordset() -> [bool; 256] {
        // FIXME: Use as_usize() once const functions in traits are stable.
        let mut set = [false; 256];
        // A `while` loop is used because `for` is not permitted in a
        // `const fn`.
        let mut i = 0usize;
        while i < set.len() {
            // Lossless: `i` is always in `0..256` here.
            let byte = i as u8;
            set[i] = matches!(
                byte,
                b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_'
            );
            i += 1;
        }
        set
    }
    // Evaluated once at compile time; the runtime cost is a single indexed
    // load. The index can never be out of bounds since a `u8` is at most 255.
    const WORD: [bool; 256] = mkwordset();
    WORD[b as usize]
}
42
43
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
44
///
45
/// If no valid encoding of a codepoint exists at the beginning of the given
46
/// byte slice, then the first byte is returned instead.
47
///
48
/// This returns `None` if and only if `bytes` is empty.
49
///
50
/// This never panics.
51
///
52
/// *WARNING*: This is not designed for performance. If you're looking for a
53
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
54
/// crate, then please file an issue and discuss your use case.
55
#[cfg_attr(feature = "perf-inline", inline(always))]
56
0
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
57
0
    if bytes.is_empty() {
58
0
        return None;
59
0
    }
60
0
    let len = match len(bytes[0]) {
61
0
        None => return Some(Err(bytes[0])),
62
0
        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
63
0
        Some(1) => return Some(Ok(char::from(bytes[0]))),
64
0
        Some(len) => len,
65
0
    };
66
0
    match core::str::from_utf8(&bytes[..len]) {
67
0
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
68
0
        Err(_) => Some(Err(bytes[0])),
69
    }
70
0
}
71
72
/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
73
///
74
/// If no valid encoding of a codepoint exists at the end of the given byte
75
/// slice, then the last byte is returned instead.
76
///
77
/// This returns `None` if and only if `bytes` is empty.
78
#[cfg_attr(feature = "perf-inline", inline(always))]
79
0
pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
80
0
    if bytes.is_empty() {
81
0
        return None;
82
0
    }
83
0
    let mut start = bytes.len() - 1;
84
0
    let limit = bytes.len().saturating_sub(4);
85
0
    while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
86
0
        start -= 1;
87
0
    }
88
0
    match decode(&bytes[start..]) {
89
0
        None => None,
90
0
        Some(Ok(ch)) => Some(Ok(ch)),
91
0
        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
92
    }
93
0
}
94
95
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
///
/// This is a purely structural check on the high bits of the byte. Bytes such
/// as `0xC0`, `0xC1` and `0xF5..=0xF7` have the shape of a leading byte and
/// are assigned a length here even though no valid UTF-8 sequence starts with
/// them, so the full sequence still has to be validated separately.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    match byte {
        // 0xxxxxxx: ASCII.
        0x00..=0x7F => Some(1),
        // 10xxxxxx: continuation byte, never begins a sequence.
        0x80..=0xBF => None,
        // 110xxxxx
        0xC0..=0xDF => Some(2),
        // 1110xxxx
        0xE0..=0xEF => Some(3),
        // 11110xxx
        0xF0..=0xF7 => Some(4),
        // 11111xxx: not part of UTF-8 in any position.
        0xF8..=0xFF => None,
    }
}
116
117
/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// The offset `bytes.len()` is always a boundary. Any offset greater than
/// `bytes.len()` is never a boundary.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        // The position at the end of the bytes always represents an empty
        // string, which is a valid boundary. But it doesn't make much sense
        // to call anything after that a valid boundary.
        None => i == bytes.len(),
        // Continuation bytes are exactly the bytes of the form 10xxxxxx.
        // ASCII bytes never have their most significant bit set, and leading
        // bytes of multi-byte sequences always have their two most
        // significant bits set. Therefore, in valid UTF-8, `bytes[i]` begins
        // the encoding of a Unicode scalar value precisely when it is not a
        // continuation byte.
        Some(&b) => (b & 0b1100_0000) != 0b1000_0000,
    }
}
138
139
/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
///
/// Equivalently, this returns false only for continuation bytes, i.e. bytes
/// of the form `10xxxxxx` (`0x80..=0xBF`).
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    //
    // So the only bytes left over are those whose top two bits are exactly
    // `10`, which are the continuation bytes.
    (b & 0b1100_0000) != 0b1000_0000
}
165
166
/*
167
/// Returns the smallest possible index of the next valid UTF-8 sequence
168
/// starting after `i`.
169
///
170
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
171
/// value is guaranteed to be greater than `i`. (If there is no value greater
172
/// than `i` that fits in `usize`, then this panics.)
173
///
174
/// Generally speaking, this should only be called on `text` when it is
175
/// permitted to assume that it is valid UTF-8 and where either `i >=
176
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
177
///
178
/// NOTE: This method was used in a previous conception of iterators where we
179
/// specifically tried to skip over empty matches that split a codepoint by
180
/// simply requiring that our next search begin at the beginning of codepoint.
181
/// But we ended up changing that technique to always advance by 1 byte and
182
/// then filter out matches that split a codepoint after-the-fact. Thus, we no
183
/// longer use this method. But I've kept it around in case we want to switch
184
/// back to this approach. Its guarantees are a little subtle, so I'd prefer
185
/// not to rebuild it from whole cloth.
186
pub(crate) fn next(text: &[u8], i: usize) -> usize {
187
    let b = match text.get(i) {
188
        None => return i.checked_add(1).unwrap(),
189
        Some(&b) => b,
190
    };
191
    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
192
    // other than just start at the next byte.
193
    let inc = len(b).unwrap_or(1);
194
    i.checked_add(inc).unwrap()
195
}
196
*/