/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/util/utf8.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Utilities for dealing with UTF-8. |
3 | | |
4 | | This module provides some UTF-8 related helper routines, including an |
5 | | incremental decoder. |
6 | | */ |
7 | | |
8 | | /// Returns true if and only if the given byte is considered a word character. |
9 | | /// This only applies to ASCII. |
10 | | /// |
11 | | /// This was copied from regex-syntax so that we can use it to determine the |
12 | | /// starting DFA state while searching without depending on regex-syntax. The |
13 | | /// definition is never going to change, so there's no maintenance/bit-rot |
14 | | /// hazard here. |
15 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
16 | 0 | pub(crate) fn is_word_byte(b: u8) -> bool { |
17 | 0 | const fn mkwordset() -> [bool; 256] { |
18 | 0 | // FIXME: Use as_usize() once const functions in traits are stable. |
19 | 0 | let mut set = [false; 256]; |
20 | 0 | set[b'_' as usize] = true; |
21 | 0 | |
22 | 0 | let mut byte = b'0'; |
23 | 0 | while byte <= b'9' { |
24 | 0 | set[byte as usize] = true; |
25 | 0 | byte += 1; |
26 | 0 | } |
27 | 0 | byte = b'A'; |
28 | 0 | while byte <= b'Z' { |
29 | 0 | set[byte as usize] = true; |
30 | 0 | byte += 1; |
31 | 0 | } |
32 | 0 | byte = b'a'; |
33 | 0 | while byte <= b'z' { |
34 | 0 | set[byte as usize] = true; |
35 | 0 | byte += 1; |
36 | 0 | } |
37 | 0 | set |
38 | 0 | } |
39 | | const WORD: [bool; 256] = mkwordset(); |
40 | 0 | WORD[b as usize] |
41 | 0 | } |
42 | | |
43 | | /// Decodes the next UTF-8 encoded codepoint from the given byte slice. |
44 | | /// |
45 | | /// If no valid encoding of a codepoint exists at the beginning of the given |
46 | | /// byte slice, then the first byte is returned instead. |
47 | | /// |
48 | | /// This returns `None` if and only if `bytes` is empty. |
49 | | /// |
50 | | /// This never panics. |
51 | | /// |
52 | | /// *WARNING*: This is not designed for performance. If you're looking for a |
53 | | /// fast UTF-8 decoder, this is not it. If you feel like you need one in this |
54 | | /// crate, then please file an issue and discuss your use case. |
55 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
56 | 0 | pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> { |
57 | 0 | if bytes.is_empty() { |
58 | 0 | return None; |
59 | 0 | } |
60 | 0 | let len = match len(bytes[0]) { |
61 | 0 | None => return Some(Err(bytes[0])), |
62 | 0 | Some(len) if len > bytes.len() => return Some(Err(bytes[0])), |
63 | 0 | Some(1) => return Some(Ok(char::from(bytes[0]))), |
64 | 0 | Some(len) => len, |
65 | 0 | }; |
66 | 0 | match core::str::from_utf8(&bytes[..len]) { |
67 | 0 | Ok(s) => Some(Ok(s.chars().next().unwrap())), |
68 | 0 | Err(_) => Some(Err(bytes[0])), |
69 | | } |
70 | 0 | } |
71 | | |
72 | | /// Decodes the last UTF-8 encoded codepoint from the given byte slice. |
73 | | /// |
74 | | /// If no valid encoding of a codepoint exists at the end of the given byte |
75 | | /// slice, then the last byte is returned instead. |
76 | | /// |
77 | | /// This returns `None` if and only if `bytes` is empty. |
78 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
79 | 0 | pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> { |
80 | 0 | if bytes.is_empty() { |
81 | 0 | return None; |
82 | 0 | } |
83 | 0 | let mut start = bytes.len() - 1; |
84 | 0 | let limit = bytes.len().saturating_sub(4); |
85 | 0 | while start > limit && !is_leading_or_invalid_byte(bytes[start]) { |
86 | 0 | start -= 1; |
87 | 0 | } |
88 | 0 | match decode(&bytes[start..]) { |
89 | 0 | None => None, |
90 | 0 | Some(Ok(ch)) => Some(Ok(ch)), |
91 | 0 | Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), |
92 | | } |
93 | 0 | } |
94 | | |
95 | | /// Given a UTF-8 leading byte, this returns the total number of code units |
96 | | /// in the following encoded codepoint. |
97 | | /// |
98 | | /// This returns `None` only for continuation bytes and bytes above `\xF7`; |
99 | | /// the invalid bytes `\xC0`, `\xC1` and `\xF5`-`\xF7` still get a length. |
100 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
101 | 0 | fn len(byte: u8) -> Option<usize> { |
102 | 0 | if byte <= 0x7F { |
103 | 0 | return Some(1); |
104 | 0 | } else if byte & 0b1100_0000 == 0b1000_0000 { |
105 | 0 | return None; |
106 | 0 | } else if byte <= 0b1101_1111 { |
107 | 0 | Some(2) |
108 | 0 | } else if byte <= 0b1110_1111 { |
109 | 0 | Some(3) |
110 | 0 | } else if byte <= 0b1111_0111 { |
111 | 0 | Some(4) |
112 | | } else { |
113 | 0 | None |
114 | | } |
115 | 0 | } |
116 | | |
117 | | /// Returns true if and only if the given offset in the given bytes falls on a |
118 | | /// valid UTF-8 encoded codepoint boundary. |
119 | | /// |
120 | | /// If `bytes` is not valid UTF-8, then the behavior of this routine is |
121 | | /// unspecified. |
122 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
123 | 0 | pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool { |
124 | 0 | match bytes.get(i) { |
125 | | // The position at the end of the bytes always represents an empty |
126 | | // string, which is a valid boundary. But anything after that doesn't |
127 | | // make much sense to call a valid boundary. |
128 | 0 | None => i == bytes.len(), |
129 | | // Other than ASCII (where the most significant bit is never set), |
130 | | // valid starting bytes always have their most significant two bits |
131 | | // set, whereas continuation bytes never have their second most |
132 | | // significant bit set. Therefore, this only returns true when bytes[i] |
133 | | // corresponds to a byte that begins a valid UTF-8 encoding of a |
134 | | // Unicode scalar value. |
135 | 0 | Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000, |
136 | | } |
137 | 0 | } |
138 | | |
139 | | /// Returns true if and only if the given byte is either a valid leading UTF-8 |
140 | | /// byte, or is otherwise an invalid byte that can never appear anywhere in a |
141 | | /// valid UTF-8 sequence. |
142 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
143 | 0 | fn is_leading_or_invalid_byte(b: u8) -> bool { |
144 | 0 | // In the ASCII case, the most significant bit is never set. The leading |
145 | 0 | // byte of a 2/3/4-byte sequence always has the top two most significant |
146 | 0 | // bits set. For bytes that can never appear anywhere in valid UTF-8, this |
147 | 0 | // also returns true, since every such byte has its two most significant |
148 | 0 | // bits set: |
149 | 0 | // |
150 | 0 | // \xC0 :: 11000000 |
151 | 0 | // \xC1 :: 11000001 |
152 | 0 | // \xF5 :: 11110101 |
153 | 0 | // \xF6 :: 11110110 |
154 | 0 | // \xF7 :: 11110111 |
155 | 0 | // \xF8 :: 11111000 |
156 | 0 | // \xF9 :: 11111001 |
157 | 0 | // \xFA :: 11111010 |
158 | 0 | // \xFB :: 11111011 |
159 | 0 | // \xFC :: 11111100 |
160 | 0 | // \xFD :: 11111101 |
161 | 0 | // \xFE :: 11111110 |
162 | 0 | // \xFF :: 11111111 |
163 | 0 | (b & 0b1100_0000) != 0b1000_0000 |
164 | 0 | } |
165 | | |
166 | | /* |
167 | | /// Returns the smallest possible index of the next valid UTF-8 sequence |
168 | | /// starting after `i`. |
169 | | /// |
170 | | /// For all inputs, including invalid UTF-8 and any value of `i`, the return |
171 | | /// value is guaranteed to be greater than `i`. (If there is no value greater |
172 | | /// than `i` that fits in `usize`, then this panics.) |
173 | | /// |
174 | | /// Generally speaking, this should only be called on `text` when it is |
175 | | /// permitted to assume that it is valid UTF-8 and where either `i >= |
176 | | /// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. |
177 | | /// |
178 | | /// NOTE: This method was used in a previous conception of iterators where we |
179 | | /// specifically tried to skip over empty matches that split a codepoint by |
180 | | /// simply requiring that our next search begin at the beginning of a codepoint. |
181 | | /// But we ended up changing that technique to always advance by 1 byte and |
182 | | /// then filter out matches that split a codepoint after-the-fact. Thus, we no |
183 | | /// longer use this method. But I've kept it around in case we want to switch |
184 | | /// back to this approach. Its guarantees are a little subtle, so I'd prefer |
185 | | /// not to rebuild it from whole cloth. |
186 | | pub(crate) fn next(text: &[u8], i: usize) -> usize { |
187 | | let b = match text.get(i) { |
188 | | None => return i.checked_add(1).unwrap(), |
189 | | Some(&b) => b, |
190 | | }; |
191 | | // For cases where we see an invalid UTF-8 byte, there isn't much we can do |
192 | | // other than just start at the next byte. |
193 | | let inc = len(b).unwrap_or(1); |
194 | | i.checked_add(inc).unwrap() |
195 | | } |
196 | | */ |