/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-1.5.6/src/utf8.rs
Line | Count | Source |
1 | | /// A few elementary UTF-8 encoding and decoding functions used by the matching |
2 | | /// engines. |
3 | | /// |
4 | | /// In an ideal world, the matching engines operate on `&str` and we can just |
5 | | /// lean on the standard library for all our UTF-8 needs. However, to support |
6 | | /// byte based regexes (that can match on arbitrary bytes which may contain |
7 | | /// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. |
8 | | /// The standard library doesn't really recognize this use case, so we have |
9 | | /// to build it out ourselves. |
10 | | /// |
11 | | /// Should this be factored out into a separate crate? It seems independently |
12 | | /// useful. There are other crates that already exist (e.g., `utf-8`) that have |
13 | | /// overlapping use cases. Not sure what to do. |
14 | | use std::char; |
15 | | |
16 | | const TAG_CONT: u8 = 0b1000_0000; |
17 | | const TAG_TWO: u8 = 0b1100_0000; |
18 | | const TAG_THREE: u8 = 0b1110_0000; |
19 | | const TAG_FOUR: u8 = 0b1111_0000; |
20 | | |
21 | | /// Returns the smallest possible index of the next valid UTF-8 sequence |
22 | | /// starting after `i`. |
23 | 0 | pub fn next_utf8(text: &[u8], i: usize) -> usize { |
24 | 0 | let b = match text.get(i) { |
25 | 0 | None => return i + 1, |
26 | 0 | Some(&b) => b, |
27 | | }; |
28 | 0 | let inc = if b <= 0x7F { |
29 | 0 | 1 |
30 | 0 | } else if b <= 0b110_11111 { |
31 | 0 | 2 |
32 | 0 | } else if b <= 0b1110_1111 { |
33 | 0 | 3 |
34 | | } else { |
35 | 0 | 4 |
36 | | }; |
37 | 0 | i + inc |
38 | 0 | } Unexecuted instantiation: regex::utf8::next_utf8 Unexecuted instantiation: regex::utf8::next_utf8 |
39 | | |
40 | | /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. |
41 | | /// |
42 | | /// If no valid UTF-8 sequence could be found, then `None` is returned. |
43 | | /// Otherwise, the decoded codepoint and the number of bytes read is returned. |
44 | | /// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be |
45 | | /// 1, 2, 3 or 4. |
46 | | /// |
47 | | /// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a |
48 | | /// codepoint that is out of range (surrogate codepoints are out of range) or |
49 | | /// is not the shortest possible UTF-8 sequence for that codepoint. |
50 | | #[inline] |
51 | 1.32M | pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { |
52 | 1.32M | let b0 = match src.get(0) { |
53 | 0 | None => return None, |
54 | 1.32M | Some(&b) if b <= 0x7F => return Some((b as char, 1)), |
55 | 3.36k | Some(&b) => b, |
56 | | }; |
57 | 3.36k | match b0 { |
58 | 3.36k | 0b110_00000..=0b110_11111 => { |
59 | 95 | if src.len() < 2 { |
60 | 0 | return None; |
61 | 95 | } |
62 | 95 | let b1 = src[1]; |
63 | 95 | if 0b11_000000 & b1 != TAG_CONT { |
64 | 0 | return None; |
65 | 95 | } |
66 | 95 | let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); |
67 | 95 | match cp { |
68 | 95 | 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), regex::utf8::decode_utf8::{closure#0}Line | Count | Source | 68 | 42 | 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), |
regex::utf8::decode_utf8::{closure#0}Line | Count | Source | 68 | 53 | 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), |
|
69 | 0 | _ => None, |
70 | | } |
71 | | } |
72 | 3.26k | 0b1110_0000..=0b1110_1111 => { |
73 | 2.45k | if src.len() < 3 { |
74 | 0 | return None; |
75 | 2.45k | } |
76 | 2.45k | let (b1, b2) = (src[1], src[2]); |
77 | 2.45k | if 0b11_000000 & b1 != TAG_CONT { |
78 | 0 | return None; |
79 | 2.45k | } |
80 | 2.45k | if 0b11_000000 & b2 != TAG_CONT { |
81 | 0 | return None; |
82 | 2.45k | } |
83 | 2.45k | let cp = ((b0 & !TAG_THREE) as u32) << 12 |
84 | 2.45k | | ((b1 & !TAG_CONT) as u32) << 6 |
85 | 2.45k | | ((b2 & !TAG_CONT) as u32); |
86 | 2.45k | match cp { |
87 | | // char::from_u32 will disallow surrogate codepoints. |
88 | 2.45k | 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), regex::utf8::decode_utf8::{closure#1}Line | Count | Source | 88 | 1.88k | 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), |
regex::utf8::decode_utf8::{closure#1}Line | Count | Source | 88 | 567 | 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), |
|
89 | 0 | _ => None, |
90 | | } |
91 | | } |
92 | 816 | 0b11110_000..=0b11110_111 => { |
93 | 816 | if src.len() < 4 { |
94 | 0 | return None; |
95 | 816 | } |
96 | 816 | let (b1, b2, b3) = (src[1], src[2], src[3]); |
97 | 816 | if 0b11_000000 & b1 != TAG_CONT { |
98 | 0 | return None; |
99 | 816 | } |
100 | 816 | if 0b11_000000 & b2 != TAG_CONT { |
101 | 0 | return None; |
102 | 816 | } |
103 | 816 | if 0b11_000000 & b3 != TAG_CONT { |
104 | 0 | return None; |
105 | 816 | } |
106 | 816 | let cp = ((b0 & !TAG_FOUR) as u32) << 18 |
107 | 816 | | ((b1 & !TAG_CONT) as u32) << 12 |
108 | 816 | | ((b2 & !TAG_CONT) as u32) << 6 |
109 | 816 | | ((b3 & !TAG_CONT) as u32); |
110 | 816 | match cp { |
111 | 816 | 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)), regex::utf8::decode_utf8::{closure#2}Line | Count | Source | 111 | 791 | 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)), |
regex::utf8::decode_utf8::{closure#2}Line | Count | Source | 111 | 25 | 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)), |
|
112 | 0 | _ => None, |
113 | | } |
114 | | } |
115 | 0 | _ => None, |
116 | | } |
117 | 1.32M | } Line | Count | Source | 51 | 789k | pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { | 52 | 789k | let b0 = match src.get(0) { | 53 | 0 | None => return None, | 54 | 789k | Some(&b) if b <= 0x7F => return Some((b as char, 1)), | 55 | 2.71k | Some(&b) => b, | 56 | | }; | 57 | 2.71k | match b0 { | 58 | 2.71k | 0b110_00000..=0b110_11111 => { | 59 | 42 | if src.len() < 2 { | 60 | 0 | return None; | 61 | 42 | } | 62 | 42 | let b1 = src[1]; | 63 | 42 | if 0b11_000000 & b1 != TAG_CONT { | 64 | 0 | return None; | 65 | 42 | } | 66 | 42 | let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); | 67 | 42 | match cp { | 68 | 42 | 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), | 69 | 0 | _ => None, | 70 | | } | 71 | | } | 72 | 2.67k | 0b1110_0000..=0b1110_1111 => { | 73 | 1.88k | if src.len() < 3 { | 74 | 0 | return None; | 75 | 1.88k | } | 76 | 1.88k | let (b1, b2) = (src[1], src[2]); | 77 | 1.88k | if 0b11_000000 & b1 != TAG_CONT { | 78 | 0 | return None; | 79 | 1.88k | } | 80 | 1.88k | if 0b11_000000 & b2 != TAG_CONT { | 81 | 0 | return None; | 82 | 1.88k | } | 83 | 1.88k | let cp = ((b0 & !TAG_THREE) as u32) << 12 | 84 | 1.88k | | ((b1 & !TAG_CONT) as u32) << 6 | 85 | 1.88k | | ((b2 & !TAG_CONT) as u32); | 86 | 1.88k | match cp { | 87 | | // char::from_u32 will disallow surrogate codepoints. | 88 | 1.88k | 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), | 89 | 0 | _ => None, | 90 | | } | 91 | | } | 92 | 791 | 0b11110_000..=0b11110_111 => { | 93 | 791 | if src.len() < 4 { | 94 | 0 | return None; | 95 | 791 | } | 96 | 791 | let (b1, b2, b3) = (src[1], src[2], src[3]); | 97 | 791 | if 0b11_000000 & b1 != TAG_CONT { | 98 | 0 | return None; | 99 | 791 | } | 100 | 791 | if 0b11_000000 & b2 != TAG_CONT { | 101 | 0 | return None; | 102 | 791 | } | 103 | 791 | if 0b11_000000 & b3 != TAG_CONT { | 104 | 0 | return None; | 105 | 791 | } | 106 | 791 | let cp = ((b0 & !TAG_FOUR) as u32) << 18 | 107 | 791 | | ((b1 & !TAG_CONT) as u32) << 12 | 108 | 791 | | ((b2 & !TAG_CONT) as u32) << 6 | 109 | 791 | | ((b3 & !TAG_CONT) as u32); | 110 | 791 | match cp { | 111 | 791 | 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)), | 112 | 0 | _ => None, | 113 | | } | 114 | | } | 115 | 0 | _ => None, | 116 | | } | 117 | 789k | } |
Line | Count | Source | 51 | 539k | pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { | 52 | 539k | let b0 = match src.get(0) { | 53 | 0 | None => return None, | 54 | 539k | Some(&b) if b <= 0x7F => return Some((b as char, 1)), | 55 | 645 | Some(&b) => b, | 56 | | }; | 57 | 645 | match b0 { | 58 | 645 | 0b110_00000..=0b110_11111 => { | 59 | 53 | if src.len() < 2 { | 60 | 0 | return None; | 61 | 53 | } | 62 | 53 | let b1 = src[1]; | 63 | 53 | if 0b11_000000 & b1 != TAG_CONT { | 64 | 0 | return None; | 65 | 53 | } | 66 | 53 | let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); | 67 | 53 | match cp { | 68 | 53 | 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), | 69 | 0 | _ => None, | 70 | | } | 71 | | } | 72 | 592 | 0b1110_0000..=0b1110_1111 => { | 73 | 567 | if src.len() < 3 { | 74 | 0 | return None; | 75 | 567 | } | 76 | 567 | let (b1, b2) = (src[1], src[2]); | 77 | 567 | if 0b11_000000 & b1 != TAG_CONT { | 78 | 0 | return None; | 79 | 567 | } | 80 | 567 | if 0b11_000000 & b2 != TAG_CONT { | 81 | 0 | return None; | 82 | 567 | } | 83 | 567 | let cp = ((b0 & !TAG_THREE) as u32) << 12 | 84 | 567 | | ((b1 & !TAG_CONT) as u32) << 6 | 85 | 567 | | ((b2 & !TAG_CONT) as u32); | 86 | 567 | match cp { | 87 | | // char::from_u32 will disallow surrogate codepoints. | 88 | 567 | 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), | 89 | 0 | _ => None, | 90 | | } | 91 | | } | 92 | 25 | 0b11110_000..=0b11110_111 => { | 93 | 25 | if src.len() < 4 { | 94 | 0 | return None; | 95 | 25 | } | 96 | 25 | let (b1, b2, b3) = (src[1], src[2], src[3]); | 97 | 25 | if 0b11_000000 & b1 != TAG_CONT { | 98 | 0 | return None; | 99 | 25 | } | 100 | 25 | if 0b11_000000 & b2 != TAG_CONT { | 101 | 0 | return None; | 102 | 25 | } | 103 | 25 | if 0b11_000000 & b3 != TAG_CONT { | 104 | 0 | return None; | 105 | 25 | } | 106 | 25 | let cp = ((b0 & !TAG_FOUR) as u32) << 18 | 107 | 25 | | ((b1 & !TAG_CONT) as u32) << 12 | 108 | 25 | | ((b2 & !TAG_CONT) as u32) << 6 | 109 | 25 | | ((b3 & !TAG_CONT) as u32); | 110 | 25 | match cp { | 111 | 25 | 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)), | 112 | 0 | _ => None, | 113 | | } | 114 | | } | 115 | 0 | _ => None, | 116 | | } | 117 | 539k | } |
|
118 | | |
119 | | /// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead |
120 | | /// of the first. |
121 | 0 | pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { |
122 | 0 | if src.is_empty() { |
123 | 0 | return None; |
124 | 0 | } |
125 | 0 | let mut start = src.len() - 1; |
126 | 0 | if src[start] <= 0x7F { |
127 | 0 | return Some((src[start] as char, 1)); |
128 | 0 | } |
129 | 0 | while start > src.len().saturating_sub(4) { |
130 | 0 | start -= 1; |
131 | 0 | if is_start_byte(src[start]) { |
132 | 0 | break; |
133 | 0 | } |
134 | | } |
135 | 0 | match decode_utf8(&src[start..]) { |
136 | 0 | None => None, |
137 | 0 | Some((_, n)) if n < src.len() - start => None, |
138 | 0 | Some((cp, n)) => Some((cp, n)), |
139 | | } |
140 | 0 | } Unexecuted instantiation: regex::utf8::decode_last_utf8 Unexecuted instantiation: regex::utf8::decode_last_utf8 |
141 | | |
142 | 0 | fn is_start_byte(b: u8) -> bool { |
143 | 0 | b & 0b11_000000 != 0b1_0000000 |
144 | 0 | } Unexecuted instantiation: regex::utf8::is_start_byte Unexecuted instantiation: regex::utf8::is_start_byte |
145 | | |
146 | | #[cfg(test)] |
147 | | mod tests { |
148 | | use std::str; |
149 | | |
150 | | use quickcheck::quickcheck; |
151 | | |
152 | | use super::{ |
153 | | decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO, |
154 | | }; |
155 | | |
156 | | #[test] |
157 | | fn prop_roundtrip() { |
158 | | fn p(given_cp: char) -> bool { |
159 | | let mut tmp = [0; 4]; |
160 | | let encoded_len = given_cp.encode_utf8(&mut tmp).len(); |
161 | | let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); |
162 | | encoded_len == got_len && given_cp == got_cp |
163 | | } |
164 | | quickcheck(p as fn(char) -> bool) |
165 | | } |
166 | | |
167 | | #[test] |
168 | | fn prop_roundtrip_last() { |
169 | | fn p(given_cp: char) -> bool { |
170 | | let mut tmp = [0; 4]; |
171 | | let encoded_len = given_cp.encode_utf8(&mut tmp).len(); |
172 | | let (got_cp, got_len) = |
173 | | decode_last_utf8(&tmp[..encoded_len]).unwrap(); |
174 | | encoded_len == got_len && given_cp == got_cp |
175 | | } |
176 | | quickcheck(p as fn(char) -> bool) |
177 | | } |
178 | | |
179 | | #[test] |
180 | | fn prop_encode_matches_std() { |
181 | | fn p(cp: char) -> bool { |
182 | | let mut got = [0; 4]; |
183 | | let n = cp.encode_utf8(&mut got).len(); |
184 | | let expected = cp.to_string(); |
185 | | &got[..n] == expected.as_bytes() |
186 | | } |
187 | | quickcheck(p as fn(char) -> bool) |
188 | | } |
189 | | |
190 | | #[test] |
191 | | fn prop_decode_matches_std() { |
192 | | fn p(given_cp: char) -> bool { |
193 | | let mut tmp = [0; 4]; |
194 | | let n = given_cp.encode_utf8(&mut tmp).len(); |
195 | | let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); |
196 | | let expected_cp = |
197 | | str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); |
198 | | got_cp == expected_cp |
199 | | } |
200 | | quickcheck(p as fn(char) -> bool) |
201 | | } |
202 | | |
203 | | #[test] |
204 | | fn prop_decode_last_matches_std() { |
205 | | fn p(given_cp: char) -> bool { |
206 | | let mut tmp = [0; 4]; |
207 | | let n = given_cp.encode_utf8(&mut tmp).len(); |
208 | | let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); |
209 | | let expected_cp = str::from_utf8(&tmp[..n]) |
210 | | .unwrap() |
211 | | .chars() |
212 | | .rev() |
213 | | .next() |
214 | | .unwrap(); |
215 | | got_cp == expected_cp |
216 | | } |
217 | | quickcheck(p as fn(char) -> bool) |
218 | | } |
219 | | |
220 | | #[test] |
221 | | fn reject_invalid() { |
222 | | // Invalid start byte |
223 | | assert_eq!(decode_utf8(&[0xFF]), None); |
224 | | // Surrogate pair |
225 | | assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); |
226 | | // Invalid continuation byte. |
227 | | assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); |
228 | | // Bad lengths |
229 | | assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes |
230 | | assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes |
231 | | assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes |
232 | | // Not a minimal UTF-8 sequence |
233 | | assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); |
234 | | assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None); |
235 | | assert_eq!( |
236 | | decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]), |
237 | | None |
238 | | ); |
239 | | } |
240 | | |
241 | | #[test] |
242 | | fn reject_invalid_last() { |
243 | | // Invalid start byte |
244 | | assert_eq!(decode_last_utf8(&[0xFF]), None); |
245 | | // Surrogate pair |
246 | | assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); |
247 | | // Bad lengths |
248 | | assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes |
249 | | assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes |
250 | | assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes |
251 | | // Not a minimal UTF-8 sequence |
252 | | assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); |
253 | | assert_eq!( |
254 | | decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]), |
255 | | None |
256 | | ); |
257 | | assert_eq!( |
258 | | decode_last_utf8( |
259 | | &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',] |
260 | | ), |
261 | | None |
262 | | ); |
263 | | } |
264 | | } |