/rust/registry/src/index.crates.io-1949cf8c6b5b557f/bstr-1.12.1/src/utf8.rs
Line | Count | Source |
1 | | use core::{char, cmp, fmt, str}; |
2 | | |
3 | | use crate::{ascii, bstr::BStr, ext_slice::ByteSlice}; |
4 | | |
5 | | // The UTF-8 decoder provided here is based on the one presented here: |
6 | | // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
7 | | // |
8 | | // We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}` |
9 | | // using regex-automata that is roughly the same size. The real benefit of |
10 | | // Hoehrmann's formulation is that the byte class mapping below is manually |
11 | | // tailored such that each byte's class doubles as a shift to mask out the |
12 | | // bits necessary for constructing the leading bits of each codepoint value |
13 | | // from the initial byte. |
14 | | // |
15 | | // There are some minor differences between this implementation and Hoehrmann's |
16 | | // formulation. |
17 | | // |
18 | | // Firstly, we make REJECT have state ID 0, since it makes the state table |
19 | | // itself a little easier to read and is consistent with the notion that 0 |
20 | | // means "false" or "bad." |
21 | | // |
22 | | // Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast |
23 | | // path. |
24 | | // |
25 | | // Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction |
26 | | // in the core decoding loop. (Which is what regex-automata would do by |
27 | | // default.) |
28 | | // |
29 | | // Fourthly, we split the byte class mapping and transition table into two |
30 | | // arrays because it's clearer. |
31 | | // |
32 | | // It is unlikely that this is the fastest way to do UTF-8 decoding, however, |
33 | | // it is fairly simple. |
34 | | |
35 | | const ACCEPT: usize = 12; |
36 | | const REJECT: usize = 0; |
37 | | |
38 | | /// SAFETY: The decode below function relies on the correctness of these |
39 | | /// equivalence classes. |
40 | | #[rustfmt::skip] |
41 | | const CLASSES: [u8; 256] = [ |
42 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
43 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
44 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
45 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
46 | | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, |
47 | | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
48 | | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
49 | | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, |
50 | | ]; |
51 | | |
52 | | /// SAFETY: The decode below function relies on the correctness of this state |
53 | | /// machine. |
54 | | #[rustfmt::skip] |
55 | | const STATES_FORWARD: &[u8] = &[ |
56 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
57 | | 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, |
58 | | 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, |
59 | | 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, |
60 | | 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, |
61 | | 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, |
62 | | 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
63 | | 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
64 | | 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
65 | | ]; |
66 | | |
67 | | /// An iterator over Unicode scalar values in a byte string. |
68 | | /// |
69 | | /// When invalid UTF-8 byte sequences are found, they are substituted with the |
70 | | /// Unicode replacement codepoint (`U+FFFD`) using the |
71 | | /// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). |
72 | | /// |
73 | | /// This iterator is created by the |
74 | | /// [`chars`](trait.ByteSlice.html#method.chars) method provided by the |
75 | | /// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. |
76 | | #[derive(Clone, Debug)] |
77 | | pub struct Chars<'a> { |
78 | | bs: &'a [u8], |
79 | | } |
80 | | |
81 | | impl<'a> Chars<'a> { |
82 | 0 | pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> { |
83 | 0 | Chars { bs } |
84 | 0 | } Unexecuted instantiation: <bstr::utf8::Chars>::new Unexecuted instantiation: <bstr::utf8::Chars>::new Unexecuted instantiation: <bstr::utf8::Chars>::new Unexecuted instantiation: <bstr::utf8::Chars>::new |
85 | | |
86 | | /// View the underlying data as a subslice of the original data. |
87 | | /// |
88 | | /// The slice returned has the same lifetime as the original slice, and so |
89 | | /// the iterator can continue to be used while this exists. |
90 | | /// |
91 | | /// # Examples |
92 | | /// |
93 | | /// ``` |
94 | | /// use bstr::ByteSlice; |
95 | | /// |
96 | | /// let mut chars = b"abc".chars(); |
97 | | /// |
98 | | /// assert_eq!(b"abc", chars.as_bytes()); |
99 | | /// chars.next(); |
100 | | /// assert_eq!(b"bc", chars.as_bytes()); |
101 | | /// chars.next(); |
102 | | /// chars.next(); |
103 | | /// assert_eq!(b"", chars.as_bytes()); |
104 | | /// ``` |
105 | | #[inline] |
106 | 0 | pub fn as_bytes(&self) -> &'a [u8] { |
107 | 0 | self.bs |
108 | 0 | } Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes |
109 | | } |
110 | | |
111 | | impl<'a> Iterator for Chars<'a> { |
112 | | type Item = char; |
113 | | |
114 | | #[inline] |
115 | 0 | fn next(&mut self) -> Option<char> { |
116 | 0 | let (ch, size) = decode_lossy(self.bs); |
117 | 0 | if size == 0 { |
118 | 0 | return None; |
119 | 0 | } |
120 | 0 | self.bs = &self.bs[size..]; |
121 | 0 | Some(ch) |
122 | 0 | } Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next |
123 | | } |
124 | | |
125 | | impl<'a> DoubleEndedIterator for Chars<'a> { |
126 | | #[inline] |
127 | 0 | fn next_back(&mut self) -> Option<char> { |
128 | 0 | let (ch, size) = decode_last_lossy(self.bs); |
129 | 0 | if size == 0 { |
130 | 0 | return None; |
131 | 0 | } |
132 | 0 | self.bs = &self.bs[..self.bs.len() - size]; |
133 | 0 | Some(ch) |
134 | 0 | } Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back |
135 | | } |
136 | | |
137 | | /// An iterator over Unicode scalar values in a byte string and their |
138 | | /// byte index positions. |
139 | | /// |
140 | | /// When invalid UTF-8 byte sequences are found, they are substituted with the |
141 | | /// Unicode replacement codepoint (`U+FFFD`) using the |
142 | | /// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). |
143 | | /// |
144 | | /// Note that this is slightly different from the `CharIndices` iterator |
145 | | /// provided by the standard library. Aside from working on possibly invalid |
146 | | /// UTF-8, this iterator provides both the corresponding starting and ending |
147 | | /// byte indices of each codepoint yielded. The ending position is necessary to |
148 | | /// slice the original byte string when invalid UTF-8 bytes are converted into |
149 | | /// a Unicode replacement codepoint, since a single replacement codepoint can |
150 | | /// substitute anywhere from 1 to 3 invalid bytes (inclusive). |
151 | | /// |
152 | | /// This iterator is created by the |
153 | | /// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided |
154 | | /// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. |
155 | | #[derive(Clone, Debug)] |
156 | | pub struct CharIndices<'a> { |
157 | | bs: &'a [u8], |
158 | | forward_index: usize, |
159 | | reverse_index: usize, |
160 | | } |
161 | | |
162 | | impl<'a> CharIndices<'a> { |
163 | 2.30M | pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { |
164 | 2.30M | CharIndices { bs, forward_index: 0, reverse_index: bs.len() } |
165 | 2.30M | } Unexecuted instantiation: <bstr::utf8::CharIndices>::new <bstr::utf8::CharIndices>::new Line | Count | Source | 163 | 115 | pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { | 164 | 115 | CharIndices { bs, forward_index: 0, reverse_index: bs.len() } | 165 | 115 | } |
<bstr::utf8::CharIndices>::new Line | Count | Source | 163 | 2.30M | pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { | 164 | 2.30M | CharIndices { bs, forward_index: 0, reverse_index: bs.len() } | 165 | 2.30M | } |
Unexecuted instantiation: <bstr::utf8::CharIndices>::new |
166 | | |
167 | | /// View the underlying data as a subslice of the original data. |
168 | | /// |
169 | | /// The slice returned has the same lifetime as the original slice, and so |
170 | | /// the iterator can continue to be used while this exists. |
171 | | /// |
172 | | /// # Examples |
173 | | /// |
174 | | /// ``` |
175 | | /// use bstr::ByteSlice; |
176 | | /// |
177 | | /// let mut it = b"abc".char_indices(); |
178 | | /// |
179 | | /// assert_eq!(b"abc", it.as_bytes()); |
180 | | /// it.next(); |
181 | | /// assert_eq!(b"bc", it.as_bytes()); |
182 | | /// it.next(); |
183 | | /// it.next(); |
184 | | /// assert_eq!(b"", it.as_bytes()); |
185 | | /// ``` |
186 | | #[inline] |
187 | 0 | pub fn as_bytes(&self) -> &'a [u8] { |
188 | 0 | self.bs |
189 | 0 | } Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes |
190 | | } |
191 | | |
192 | | impl<'a> Iterator for CharIndices<'a> { |
193 | | type Item = (usize, usize, char); |
194 | | |
195 | | #[inline] |
196 | 96.3M | fn next(&mut self) -> Option<(usize, usize, char)> { |
197 | 96.3M | let index = self.forward_index; |
198 | 96.3M | let (ch, size) = decode_lossy(self.bs); |
199 | 96.3M | if size == 0 { |
200 | 3.13M | return None; |
201 | 93.2M | } |
202 | 93.2M | self.bs = &self.bs[size..]; |
203 | 93.2M | self.forward_index += size; |
204 | 93.2M | Some((index, index + size, ch)) |
205 | 96.3M | } Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next <bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next Line | Count | Source | 196 | 108k | fn next(&mut self) -> Option<(usize, usize, char)> { | 197 | 108k | let index = self.forward_index; | 198 | 108k | let (ch, size) = decode_lossy(self.bs); | 199 | 108k | if size == 0 { | 200 | 115 | return None; | 201 | 107k | } | 202 | 107k | self.bs = &self.bs[size..]; | 203 | 107k | self.forward_index += size; | 204 | 107k | Some((index, index + size, ch)) | 205 | 108k | } |
<bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next Line | Count | Source | 196 | 96.2M | fn next(&mut self) -> Option<(usize, usize, char)> { | 197 | 96.2M | let index = self.forward_index; | 198 | 96.2M | let (ch, size) = decode_lossy(self.bs); | 199 | 96.2M | if size == 0 { | 200 | 3.13M | return None; | 201 | 93.1M | } | 202 | 93.1M | self.bs = &self.bs[size..]; | 203 | 93.1M | self.forward_index += size; | 204 | 93.1M | Some((index, index + size, ch)) | 205 | 96.2M | } |
Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next |
206 | | } |
207 | | |
208 | | impl<'a> DoubleEndedIterator for CharIndices<'a> { |
209 | | #[inline] |
210 | 0 | fn next_back(&mut self) -> Option<(usize, usize, char)> { |
211 | 0 | let (ch, size) = decode_last_lossy(self.bs); |
212 | 0 | if size == 0 { |
213 | 0 | return None; |
214 | 0 | } |
215 | 0 | self.bs = &self.bs[..self.bs.len() - size]; |
216 | 0 | self.reverse_index -= size; |
217 | 0 | Some((self.reverse_index, self.reverse_index + size, ch)) |
218 | 0 | } Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back |
219 | | } |
220 | | |
221 | | impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {} |
222 | | |
223 | | /// An iterator over chunks of valid UTF-8 in a byte slice. |
224 | | /// |
225 | | /// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks). |
226 | | #[derive(Clone, Debug)] |
227 | | pub struct Utf8Chunks<'a> { |
228 | | pub(super) bytes: &'a [u8], |
229 | | } |
230 | | |
231 | | /// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes. |
232 | | /// |
233 | | /// This is yielded by the |
234 | | /// [`Utf8Chunks`](struct.Utf8Chunks.html) |
235 | | /// iterator, which can be created via the |
236 | | /// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks) |
237 | | /// method. |
238 | | /// |
239 | | /// The `'a` lifetime parameter corresponds to the lifetime of the bytes that |
240 | | /// are being iterated over. |
241 | | #[cfg_attr(test, derive(Debug, PartialEq))] |
242 | | pub struct Utf8Chunk<'a> { |
243 | | /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes. |
244 | | /// |
245 | | /// This is empty between adjacent invalid UTF-8 byte sequences. |
246 | | valid: &'a str, |
247 | | /// A sequence of invalid UTF-8 bytes. |
248 | | /// |
249 | | /// Can only be empty in the last chunk. |
250 | | /// |
251 | | /// Should be replaced by a single unicode replacement character, if not |
252 | | /// empty. |
253 | | invalid: &'a BStr, |
254 | | /// Indicates whether the invalid sequence could've been valid if there |
255 | | /// were more bytes. |
256 | | /// |
257 | | /// Can only be true in the last chunk. |
258 | | incomplete: bool, |
259 | | } |
260 | | |
261 | | impl<'a> Utf8Chunk<'a> { |
262 | | /// Returns the (possibly empty) valid UTF-8 bytes in this chunk. |
263 | | /// |
264 | | /// This may be empty if there are consecutive sequences of invalid UTF-8 |
265 | | /// bytes. |
266 | | #[inline] |
267 | 0 | pub fn valid(&self) -> &'a str { |
268 | 0 | self.valid |
269 | 0 | } Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid |
270 | | |
271 | | /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that |
272 | | /// immediately follow the valid UTF-8 bytes in this chunk. |
273 | | /// |
274 | | /// This is only empty when this chunk corresponds to the last chunk in |
275 | | /// the original bytes. |
276 | | /// |
277 | | /// The maximum length of this slice is 3. That is, invalid UTF-8 byte |
278 | | /// sequences greater than 1 always correspond to a valid _prefix_ of |
279 | | /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution |
280 | | /// of maximal subparts" strategy that is described in more detail in the |
281 | | /// docs for the |
282 | | /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) |
283 | | /// method. |
284 | | #[inline] |
285 | 0 | pub fn invalid(&self) -> &'a [u8] { |
286 | 0 | self.invalid.as_bytes() |
287 | 0 | } Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid |
288 | | |
289 | | /// Returns whether the invalid sequence might still become valid if more |
290 | | /// bytes are added. |
291 | | /// |
292 | | /// Returns true if the end of the input was reached unexpectedly, |
293 | | /// without encountering an unexpected byte. |
294 | | /// |
295 | | /// This can only be the case for the last chunk. |
296 | | #[inline] |
297 | 0 | pub fn incomplete(&self) -> bool { |
298 | 0 | self.incomplete |
299 | 0 | } Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete |
300 | | } |
301 | | |
302 | | impl<'a> Iterator for Utf8Chunks<'a> { |
303 | | type Item = Utf8Chunk<'a>; |
304 | | |
305 | | #[inline] |
306 | 0 | fn next(&mut self) -> Option<Utf8Chunk<'a>> { |
307 | 0 | if self.bytes.is_empty() { |
308 | 0 | return None; |
309 | 0 | } |
310 | 0 | match validate(self.bytes) { |
311 | | Ok(()) => { |
312 | 0 | let valid = self.bytes; |
313 | 0 | self.bytes = &[]; |
314 | 0 | Some(Utf8Chunk { |
315 | 0 | // SAFETY: This is safe because of the guarantees provided |
316 | 0 | // by utf8::validate. |
317 | 0 | valid: unsafe { str::from_utf8_unchecked(valid) }, |
318 | 0 | invalid: [].as_bstr(), |
319 | 0 | incomplete: false, |
320 | 0 | }) |
321 | | } |
322 | 0 | Err(e) => { |
323 | 0 | let (valid, rest) = self.bytes.split_at(e.valid_up_to()); |
324 | | // SAFETY: This is safe because of the guarantees provided by |
325 | | // utf8::validate. |
326 | 0 | let valid = unsafe { str::from_utf8_unchecked(valid) }; |
327 | 0 | let (invalid_len, incomplete) = match e.error_len() { |
328 | 0 | Some(n) => (n, false), |
329 | 0 | None => (rest.len(), true), |
330 | | }; |
331 | 0 | let (invalid, rest) = rest.split_at(invalid_len); |
332 | 0 | self.bytes = rest; |
333 | 0 | Some(Utf8Chunk { |
334 | 0 | valid, |
335 | 0 | invalid: invalid.as_bstr(), |
336 | 0 | incomplete, |
337 | 0 | }) |
338 | | } |
339 | | } |
340 | 0 | } Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next |
341 | | |
342 | | #[inline] |
343 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
344 | 0 | if self.bytes.is_empty() { |
345 | 0 | (0, Some(0)) |
346 | | } else { |
347 | 0 | (1, Some(self.bytes.len())) |
348 | | } |
349 | 0 | } Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint |
350 | | } |
351 | | |
352 | | impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {} |
353 | | |
354 | | /// An error that occurs when UTF-8 decoding fails. |
355 | | /// |
356 | | /// This error occurs when attempting to convert a non-UTF-8 byte |
357 | | /// string to a Rust string that must be valid UTF-8. For example, |
358 | | /// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method. |
359 | | /// |
360 | | /// # Example |
361 | | /// |
362 | | /// This example shows what happens when a given byte sequence is invalid, |
363 | | /// but ends with a sequence that is a possible prefix of valid UTF-8. |
364 | | /// |
365 | | /// ``` |
366 | | /// use bstr::{B, ByteSlice}; |
367 | | /// |
368 | | /// let s = B(b"foobar\xF1\x80\x80"); |
369 | | /// let err = s.to_str().unwrap_err(); |
370 | | /// assert_eq!(err.valid_up_to(), 6); |
371 | | /// assert_eq!(err.error_len(), None); |
372 | | /// ``` |
373 | | /// |
374 | | /// This example shows what happens when a given byte sequence contains |
375 | | /// invalid UTF-8. |
376 | | /// |
377 | | /// ``` |
378 | | /// use bstr::ByteSlice; |
379 | | /// |
380 | | /// let s = b"foobar\xF1\x80\x80quux"; |
381 | | /// let err = s.to_str().unwrap_err(); |
382 | | /// assert_eq!(err.valid_up_to(), 6); |
383 | | /// // The error length reports the maximum number of bytes that correspond to |
384 | | /// // a valid prefix of a UTF-8 encoded codepoint. |
385 | | /// assert_eq!(err.error_len(), Some(3)); |
386 | | /// |
387 | | /// // In contrast to the above which contains a single invalid prefix, |
388 | | /// // consider the case of multiple individual bytes that are never valid |
389 | | /// // prefixes. Note how the value of error_len changes! |
390 | | /// let s = b"foobar\xFF\xFFquux"; |
391 | | /// let err = s.to_str().unwrap_err(); |
392 | | /// assert_eq!(err.valid_up_to(), 6); |
393 | | /// assert_eq!(err.error_len(), Some(1)); |
394 | | /// |
395 | | /// // The fact that it's an invalid prefix does not change error_len even |
396 | | /// // when it immediately precedes the end of the string. |
397 | | /// let s = b"foobar\xFF"; |
398 | | /// let err = s.to_str().unwrap_err(); |
399 | | /// assert_eq!(err.valid_up_to(), 6); |
400 | | /// assert_eq!(err.error_len(), Some(1)); |
401 | | /// ``` |
402 | | #[derive(Clone, Debug, Eq, PartialEq)] |
403 | | pub struct Utf8Error { |
404 | | valid_up_to: usize, |
405 | | error_len: Option<usize>, |
406 | | } |
407 | | |
408 | | impl Utf8Error { |
409 | | /// Returns the byte index of the position immediately following the last |
410 | | /// valid UTF-8 byte. |
411 | | /// |
412 | | /// # Example |
413 | | /// |
414 | | /// This examples shows how `valid_up_to` can be used to retrieve a |
415 | | /// possibly empty prefix that is guaranteed to be valid UTF-8: |
416 | | /// |
417 | | /// ``` |
418 | | /// use bstr::ByteSlice; |
419 | | /// |
420 | | /// let s = b"foobar\xF1\x80\x80quux"; |
421 | | /// let err = s.to_str().unwrap_err(); |
422 | | /// |
423 | | /// // This is guaranteed to never panic. |
424 | | /// let string = s[..err.valid_up_to()].to_str().unwrap(); |
425 | | /// assert_eq!(string, "foobar"); |
426 | | /// ``` |
427 | | #[inline] |
428 | 4.32M | pub fn valid_up_to(&self) -> usize { |
429 | 4.32M | self.valid_up_to |
430 | 4.32M | } Unexecuted instantiation: <bstr::utf8::Utf8Error>::valid_up_to Unexecuted instantiation: <bstr::utf8::Utf8Error>::valid_up_to Unexecuted instantiation: <bstr::utf8::Utf8Error>::valid_up_to <bstr::utf8::Utf8Error>::valid_up_to Line | Count | Source | 428 | 4.32M | pub fn valid_up_to(&self) -> usize { | 429 | 4.32M | self.valid_up_to | 430 | 4.32M | } |
|
431 | | |
432 | | /// Returns the total number of invalid UTF-8 bytes immediately following |
433 | | /// the position returned by `valid_up_to`. This value is always at least |
434 | | /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8 |
435 | | /// encoded codepoint. |
436 | | /// |
437 | | /// If the end of the original input was found before a valid UTF-8 encoded |
438 | | /// codepoint could be completed, then this returns `None`. This is useful |
439 | | /// when processing streams, where a `None` value signals that more input |
440 | | /// might be needed. |
441 | | #[inline] |
442 | 4.32M | pub fn error_len(&self) -> Option<usize> { |
443 | 4.32M | self.error_len |
444 | 4.32M | } Unexecuted instantiation: <bstr::utf8::Utf8Error>::error_len Unexecuted instantiation: <bstr::utf8::Utf8Error>::error_len Unexecuted instantiation: <bstr::utf8::Utf8Error>::error_len <bstr::utf8::Utf8Error>::error_len Line | Count | Source | 442 | 4.32M | pub fn error_len(&self) -> Option<usize> { | 443 | 4.32M | self.error_len | 444 | 4.32M | } |
|
445 | | } |
446 | | |
447 | | #[cfg(feature = "std")] |
448 | | impl std::error::Error for Utf8Error { |
449 | 0 | fn description(&self) -> &str { |
450 | 0 | "invalid UTF-8" |
451 | 0 | } Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description |
452 | | } |
453 | | |
454 | | impl fmt::Display for Utf8Error { |
455 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
456 | 0 | write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to) |
457 | 0 | } Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt |
458 | | } |
459 | | |
460 | | /// Returns OK if and only if the given slice is completely valid UTF-8. |
461 | | /// |
462 | | /// If the slice isn't valid UTF-8, then an error is returned that explains |
463 | | /// the first location at which invalid UTF-8 was detected. |
464 | 17.6M | pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> { |
465 | | // The fast path for validating UTF-8. It steps through a UTF-8 automaton |
466 | | // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is |
467 | | // detected, it backs up and runs the slower version of the UTF-8 automaton |
468 | | // to determine correct error information. |
469 | 17.6M | fn fast(slice: &[u8]) -> Result<(), Utf8Error> { |
470 | 17.6M | let mut state = ACCEPT; |
471 | 17.6M | let mut i = 0; |
472 | | |
473 | 25.4M | while i < slice.len() { |
474 | 12.0M | let b = slice[i]; |
475 | | |
476 | | // ASCII fast path. If we see two consecutive ASCII bytes, then try |
477 | | // to validate as much ASCII as possible very quickly. |
478 | 12.0M | if state == ACCEPT |
479 | 10.8M | && b <= 0x7F |
480 | 6.42M | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) Unexecuted instantiation: bstr::utf8::validate::fast::{closure#0}bstr::utf8::validate::fast::{closure#0}Line | Count | Source | 480 | 30.5k | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) |
bstr::utf8::validate::fast::{closure#0}Line | Count | Source | 480 | 1.17M | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) |
bstr::utf8::validate::fast::{closure#0}Line | Count | Source | 480 | 64.8k | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) |
|
481 | | { |
482 | 1.23M | i += ascii::first_non_ascii_byte(&slice[i..]); |
483 | 1.23M | continue; |
484 | 10.8M | } |
485 | | |
486 | 10.8M | state = step(state, b); |
487 | 10.8M | if state == REJECT { |
488 | 4.32M | return Err(find_valid_up_to(slice, i)); |
489 | 6.50M | } |
490 | 6.50M | i += 1; |
491 | | } |
492 | 13.3M | if state != ACCEPT { |
493 | 2.80k | Err(find_valid_up_to(slice, slice.len())) |
494 | | } else { |
495 | 13.3M | Ok(()) |
496 | | } |
497 | 17.6M | } Unexecuted instantiation: bstr::utf8::validate::fast bstr::utf8::validate::fast Line | Count | Source | 469 | 933 | fn fast(slice: &[u8]) -> Result<(), Utf8Error> { | 470 | 933 | let mut state = ACCEPT; | 471 | 933 | let mut i = 0; | 472 | | | 473 | 107k | while i < slice.len() { | 474 | 107k | let b = slice[i]; | 475 | | | 476 | | // ASCII fast path. If we see two consecutive ASCII bytes, then try | 477 | | // to validate as much ASCII as possible very quickly. | 478 | 107k | if state == ACCEPT | 479 | 62.9k | && b <= 0x7F | 480 | 30.5k | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) | 481 | | { | 482 | 29.8k | i += ascii::first_non_ascii_byte(&slice[i..]); | 483 | 29.8k | continue; | 484 | 77.4k | } | 485 | | | 486 | 77.4k | state = step(state, b); | 487 | 77.4k | if state == REJECT { | 488 | 321 | return Err(find_valid_up_to(slice, i)); | 489 | 77.1k | } | 490 | 77.1k | i += 1; | 491 | | } | 492 | 612 | if state != ACCEPT { | 493 | 72 | Err(find_valid_up_to(slice, slice.len())) | 494 | | } else { | 495 | 540 | Ok(()) | 496 | | } | 497 | 933 | } |
bstr::utf8::validate::fast Line | Count | Source | 469 | 11.2M | fn fast(slice: &[u8]) -> Result<(), Utf8Error> { | 470 | 11.2M | let mut state = ACCEPT; | 471 | 11.2M | let mut i = 0; | 472 | | | 473 | 17.8M | while i < slice.len() { | 474 | 6.55M | let b = slice[i]; | 475 | | | 476 | | // ASCII fast path. If we see two consecutive ASCII bytes, then try | 477 | | // to validate as much ASCII as possible very quickly. | 478 | 6.55M | if state == ACCEPT | 479 | 6.41M | && b <= 0x7F | 480 | 6.32M | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) | 481 | | { | 482 | 1.16M | i += ascii::first_non_ascii_byte(&slice[i..]); | 483 | 1.16M | continue; | 484 | 5.38M | } | 485 | | | 486 | 5.38M | state = step(state, b); | 487 | 5.38M | if state == REJECT { | 488 | 1.07k | return Err(find_valid_up_to(slice, i)); | 489 | 5.38M | } | 490 | 5.38M | i += 1; | 491 | | } | 492 | 11.2M | if state != ACCEPT { | 493 | 192 | Err(find_valid_up_to(slice, slice.len())) | 494 | | } else { | 495 | 11.2M | Ok(()) | 496 | | } | 497 | 11.2M | } |
bstr::utf8::validate::fast Line | Count | Source | 469 | 6.37M | fn fast(slice: &[u8]) -> Result<(), Utf8Error> { | 470 | 6.37M | let mut state = ACCEPT; | 471 | 6.37M | let mut i = 0; | 472 | | | 473 | 7.45M | while i < slice.len() { | 474 | 5.40M | let b = slice[i]; | 475 | | | 476 | | // ASCII fast path. If we see two consecutive ASCII bytes, then try | 477 | | // to validate as much ASCII as possible very quickly. | 478 | 5.40M | if state == ACCEPT | 479 | 4.40M | && b <= 0x7F | 480 | 74.0k | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) | 481 | | { | 482 | 37.8k | i += ascii::first_non_ascii_byte(&slice[i..]); | 483 | 37.8k | continue; | 484 | 5.36M | } | 485 | | | 486 | 5.36M | state = step(state, b); | 487 | 5.36M | if state == REJECT { | 488 | 4.32M | return Err(find_valid_up_to(slice, i)); | 489 | 1.04M | } | 490 | 1.04M | i += 1; | 491 | | } | 492 | 2.05M | if state != ACCEPT { | 493 | 2.54k | Err(find_valid_up_to(slice, slice.len())) | 494 | | } else { | 495 | 2.04M | Ok(()) | 496 | | } | 497 | 6.37M | } |
|
498 | | |
499 | | // Given the first position at which a UTF-8 sequence was determined to be |
500 | | // invalid, return an error that correctly reports the position at which |
501 | | // the last complete UTF-8 sequence ends. |
502 | | #[inline(never)] |
503 | 4.32M | fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { |
504 | | // In order to find the last valid byte, we need to back up an amount |
505 | | // that guarantees every preceding byte is part of a valid UTF-8 |
506 | | // code unit sequence. To do this, we simply locate the last leading |
507 | | // byte that occurs before rejected_at. |
508 | 4.32M | let mut backup = rejected_at.saturating_sub(1); |
509 | 4.32M | while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { |
510 | 1.54k | backup -= 1; |
511 | 1.54k | } |
512 | 4.32M | let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); |
513 | 4.32M | let mut err = slow(&slice[backup..upto]).unwrap_err(); |
514 | 4.32M | err.valid_up_to += backup; |
515 | 4.32M | err |
516 | 4.32M | } Unexecuted instantiation: bstr::utf8::validate::find_valid_up_to bstr::utf8::validate::find_valid_up_to Line | Count | Source | 503 | 393 | fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { | 504 | | // In order to find the last valid byte, we need to back up an amount | 505 | | // that guarantees every preceding byte is part of a valid UTF-8 | 506 | | // code unit sequence. To do this, we simply locate the last leading | 507 | | // byte that occurs before rejected_at. | 508 | 393 | let mut backup = rejected_at.saturating_sub(1); | 509 | 451 | while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { | 510 | 58 | backup -= 1; | 511 | 58 | } | 512 | 393 | let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); | 513 | 393 | let mut err = slow(&slice[backup..upto]).unwrap_err(); | 514 | 393 | err.valid_up_to += backup; | 515 | 393 | err | 516 | 393 | } |
bstr::utf8::validate::find_valid_up_to Line | Count | Source | 503 | 1.26k | fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { | 504 | | // In order to find the last valid byte, we need to back up an amount | 505 | | // that guarantees every preceding byte is part of a valid UTF-8 | 506 | | // code unit sequence. To do this, we simply locate the last leading | 507 | | // byte that occurs before rejected_at. | 508 | 1.26k | let mut backup = rejected_at.saturating_sub(1); | 509 | 1.54k | while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { | 510 | 281 | backup -= 1; | 511 | 281 | } | 512 | 1.26k | let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); | 513 | 1.26k | let mut err = slow(&slice[backup..upto]).unwrap_err(); | 514 | 1.26k | err.valid_up_to += backup; | 515 | 1.26k | err | 516 | 1.26k | } |
bstr::utf8::validate::find_valid_up_to Line | Count | Source | 503 | 4.32M | fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { | 504 | | // In order to find the last valid byte, we need to back up an amount | 505 | | // that guarantees every preceding byte is part of a valid UTF-8 | 506 | | // code unit sequence. To do this, we simply locate the last leading | 507 | | // byte that occurs before rejected_at. | 508 | 4.32M | let mut backup = rejected_at.saturating_sub(1); | 509 | 4.32M | while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { | 510 | 1.20k | backup -= 1; | 511 | 1.20k | } | 512 | 4.32M | let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); | 513 | 4.32M | let mut err = slow(&slice[backup..upto]).unwrap_err(); | 514 | 4.32M | err.valid_up_to += backup; | 515 | 4.32M | err | 516 | 4.32M | } |
|
517 | | |
518 | | // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error |
519 | | // when an invalid sequence is found. This is split out from validate so |
520 | | // that the fast path doesn't need to keep track of the position of the |
521 | | // last valid UTF-8 byte. In particular, tracking this requires checking |
522 | | // for an ACCEPT state on each byte, which degrades throughput pretty |
523 | | // badly. |
524 | 4.32M | fn slow(slice: &[u8]) -> Result<(), Utf8Error> { |
525 | 4.32M | let mut state = ACCEPT; |
526 | 4.32M | let mut valid_up_to = 0; |
527 | 5.36M | for (i, &b) in slice.iter().enumerate() { |
528 | 5.36M | state = step(state, b); |
529 | 5.36M | if state == ACCEPT { |
530 | 48.8k | valid_up_to = i + 1; |
531 | 5.31M | } else if state == REJECT { |
532 | | // Our error length must always be at least 1. |
533 | 4.32M | let error_len = Some(cmp::max(1, i - valid_up_to)); |
534 | 4.32M | return Err(Utf8Error { valid_up_to, error_len }); |
535 | 989k | } |
536 | | } |
537 | 2.80k | if state != ACCEPT { |
538 | 2.80k | Err(Utf8Error { valid_up_to, error_len: None }) |
539 | | } else { |
540 | 0 | Ok(()) |
541 | | } |
542 | 4.32M | } Unexecuted instantiation: bstr::utf8::validate::slow bstr::utf8::validate::slow Line | Count | Source | 524 | 393 | fn slow(slice: &[u8]) -> Result<(), Utf8Error> { | 525 | 393 | let mut state = ACCEPT; | 526 | 393 | let mut valid_up_to = 0; | 527 | 750 | for (i, &b) in slice.iter().enumerate() { | 528 | 750 | state = step(state, b); | 529 | 750 | if state == ACCEPT { | 530 | 206 | valid_up_to = i + 1; | 531 | 544 | } else if state == REJECT { | 532 | | // Our error length must always be at least 1. | 533 | 321 | let error_len = Some(cmp::max(1, i - valid_up_to)); | 534 | 321 | return Err(Utf8Error { valid_up_to, error_len }); | 535 | 223 | } | 536 | | } | 537 | 72 | if state != ACCEPT { | 538 | 72 | Err(Utf8Error { valid_up_to, error_len: None }) | 539 | | } else { | 540 | 0 | Ok(()) | 541 | | } | 542 | 393 | } |
bstr::utf8::validate::slow Line | Count | Source | 524 | 1.26k | fn slow(slice: &[u8]) -> Result<(), Utf8Error> { | 525 | 1.26k | let mut state = ACCEPT; | 526 | 1.26k | let mut valid_up_to = 0; | 527 | 2.57k | for (i, &b) in slice.iter().enumerate() { | 528 | 2.57k | state = step(state, b); | 529 | 2.57k | if state == ACCEPT { | 530 | 700 | valid_up_to = i + 1; | 531 | 1.87k | } else if state == REJECT { | 532 | | // Our error length must always be at least 1. | 533 | 1.07k | let error_len = Some(cmp::max(1, i - valid_up_to)); | 534 | 1.07k | return Err(Utf8Error { valid_up_to, error_len }); | 535 | 802 | } | 536 | | } | 537 | 192 | if state != ACCEPT { | 538 | 192 | Err(Utf8Error { valid_up_to, error_len: None }) | 539 | | } else { | 540 | 0 | Ok(()) | 541 | | } | 542 | 1.26k | } |
bstr::utf8::validate::slow Line | Count | Source | 524 | 4.32M | fn slow(slice: &[u8]) -> Result<(), Utf8Error> { | 525 | 4.32M | let mut state = ACCEPT; | 526 | 4.32M | let mut valid_up_to = 0; | 527 | 5.35M | for (i, &b) in slice.iter().enumerate() { | 528 | 5.35M | state = step(state, b); | 529 | 5.35M | if state == ACCEPT { | 530 | 47.9k | valid_up_to = i + 1; | 531 | 5.30M | } else if state == REJECT { | 532 | | // Our error length must always be at least 1. | 533 | 4.32M | let error_len = Some(cmp::max(1, i - valid_up_to)); | 534 | 4.32M | return Err(Utf8Error { valid_up_to, error_len }); | 535 | 988k | } | 536 | | } | 537 | 2.54k | if state != ACCEPT { | 538 | 2.54k | Err(Utf8Error { valid_up_to, error_len: None }) | 539 | | } else { | 540 | 0 | Ok(()) | 541 | | } | 542 | 4.32M | } |
|
543 | | |
544 | | // Advance to the next state given the current state and current byte. |
545 | 16.1M | fn step(state: usize, b: u8) -> usize { |
546 | 16.1M | let class = CLASSES[b as usize]; |
547 | | // SAFETY: This is safe because 'class' is always <=11 and 'state' is |
548 | | // always <=96. Therefore, the maximal index is 96+11 = 107, where |
549 | | // STATES_FORWARD.len() = 108 such that every index is guaranteed to be |
550 | | // valid by construction of the state machine and the byte equivalence |
551 | | // classes. |
552 | | unsafe { |
553 | 16.1M | *STATES_FORWARD.get_unchecked(state + class as usize) as usize |
554 | | } |
555 | 16.1M | } Unexecuted instantiation: bstr::utf8::validate::step bstr::utf8::validate::step Line | Count | Source | 545 | 78.2k | fn step(state: usize, b: u8) -> usize { | 546 | 78.2k | let class = CLASSES[b as usize]; | 547 | | // SAFETY: This is safe because 'class' is always <=11 and 'state' is | 548 | | // always <=96. Therefore, the maximal index is 96+11 = 107, where | 549 | | // STATES_FORWARD.len() = 108 such that every index is guaranteed to be | 550 | | // valid by construction of the state machine and the byte equivalence | 551 | | // classes. | 552 | | unsafe { | 553 | 78.2k | *STATES_FORWARD.get_unchecked(state + class as usize) as usize | 554 | | } | 555 | 78.2k | } |
bstr::utf8::validate::step Line | Count | Source | 545 | 5.38M | fn step(state: usize, b: u8) -> usize { | 546 | 5.38M | let class = CLASSES[b as usize]; | 547 | | // SAFETY: This is safe because 'class' is always <=11 and 'state' is | 548 | | // always <=96. Therefore, the maximal index is 96+11 = 107, where | 549 | | // STATES_FORWARD.len() = 108 such that every index is guaranteed to be | 550 | | // valid by construction of the state machine and the byte equivalence | 551 | | // classes. | 552 | | unsafe { | 553 | 5.38M | *STATES_FORWARD.get_unchecked(state + class as usize) as usize | 554 | | } | 555 | 5.38M | } |
bstr::utf8::validate::step Line | Count | Source | 545 | 10.7M | fn step(state: usize, b: u8) -> usize { | 546 | 10.7M | let class = CLASSES[b as usize]; | 547 | | // SAFETY: This is safe because 'class' is always <=11 and 'state' is | 548 | | // always <=96. Therefore, the maximal index is 96+11 = 107, where | 549 | | // STATES_FORWARD.len() = 108 such that every index is guaranteed to be | 550 | | // valid by construction of the state machine and the byte equivalence | 551 | | // classes. | 552 | | unsafe { | 553 | 10.7M | *STATES_FORWARD.get_unchecked(state + class as usize) as usize | 554 | | } | 555 | 10.7M | } |
|
556 | | |
557 | 17.6M | fast(slice) |
558 | 17.6M | } Unexecuted instantiation: bstr::utf8::validate Line | Count | Source | 464 | 933 | pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> { | 465 | | // The fast path for validating UTF-8. It steps through a UTF-8 automaton | 466 | | // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is | 467 | | // detected, it backs up and runs the slower version of the UTF-8 automaton | 468 | | // to determine correct error information. | 469 | | fn fast(slice: &[u8]) -> Result<(), Utf8Error> { | 470 | | let mut state = ACCEPT; | 471 | | let mut i = 0; | 472 | | | 473 | | while i < slice.len() { | 474 | | let b = slice[i]; | 475 | | | 476 | | // ASCII fast path. If we see two consecutive ASCII bytes, then try | 477 | | // to validate as much ASCII as possible very quickly. | 478 | | if state == ACCEPT | 479 | | && b <= 0x7F | 480 | | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) | 481 | | { | 482 | | i += ascii::first_non_ascii_byte(&slice[i..]); | 483 | | continue; | 484 | | } | 485 | | | 486 | | state = step(state, b); | 487 | | if state == REJECT { | 488 | | return Err(find_valid_up_to(slice, i)); | 489 | | } | 490 | | i += 1; | 491 | | } | 492 | | if state != ACCEPT { | 493 | | Err(find_valid_up_to(slice, slice.len())) | 494 | | } else { | 495 | | Ok(()) | 496 | | } | 497 | | } | 498 | | | 499 | | // Given the first position at which a UTF-8 sequence was determined to be | 500 | | // invalid, return an error that correctly reports the position at which | 501 | | // the last complete UTF-8 sequence ends. | 502 | | #[inline(never)] | 503 | | fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { | 504 | | // In order to find the last valid byte, we need to back up an amount | 505 | | // that guarantees every preceding byte is part of a valid UTF-8 | 506 | | // code unit sequence. To do this, we simply locate the last leading | 507 | | // byte that occurs before rejected_at. 
| 508 | | let mut backup = rejected_at.saturating_sub(1); | 509 | | while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { | 510 | | backup -= 1; | 511 | | } | 512 | | let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); | 513 | | let mut err = slow(&slice[backup..upto]).unwrap_err(); | 514 | | err.valid_up_to += backup; | 515 | | err | 516 | | } | 517 | | | 518 | | // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error | 519 | | // when an invalid sequence is found. This is split out from validate so | 520 | | // that the fast path doesn't need to keep track of the position of the | 521 | | // last valid UTF-8 byte. In particular, tracking this requires checking | 522 | | // for an ACCEPT state on each byte, which degrades throughput pretty | 523 | | // badly. | 524 | | fn slow(slice: &[u8]) -> Result<(), Utf8Error> { | 525 | | let mut state = ACCEPT; | 526 | | let mut valid_up_to = 0; | 527 | | for (i, &b) in slice.iter().enumerate() { | 528 | | state = step(state, b); | 529 | | if state == ACCEPT { | 530 | | valid_up_to = i + 1; | 531 | | } else if state == REJECT { | 532 | | // Our error length must always be at least 1. | 533 | | let error_len = Some(cmp::max(1, i - valid_up_to)); | 534 | | return Err(Utf8Error { valid_up_to, error_len }); | 535 | | } | 536 | | } | 537 | | if state != ACCEPT { | 538 | | Err(Utf8Error { valid_up_to, error_len: None }) | 539 | | } else { | 540 | | Ok(()) | 541 | | } | 542 | | } | 543 | | | 544 | | // Advance to the next state given the current state and current byte. | 545 | | fn step(state: usize, b: u8) -> usize { | 546 | | let class = CLASSES[b as usize]; | 547 | | // SAFETY: This is safe because 'class' is always <=11 and 'state' is | 548 | | // always <=96. 
Therefore, the maximal index is 96+11 = 107, where | 549 | | // STATES_FORWARD.len() = 108 such that every index is guaranteed to be | 550 | | // valid by construction of the state machine and the byte equivalence | 551 | | // classes. | 552 | | unsafe { | 553 | | *STATES_FORWARD.get_unchecked(state + class as usize) as usize | 554 | | } | 555 | | } | 556 | | | 557 | 933 | fast(slice) | 558 | 933 | } |
Line | Count | Source | 464 | 11.2M | pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> { | 465 | | // The fast path for validating UTF-8. It steps through a UTF-8 automaton | 466 | | // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is | 467 | | // detected, it backs up and runs the slower version of the UTF-8 automaton | 468 | | // to determine correct error information. | 469 | | fn fast(slice: &[u8]) -> Result<(), Utf8Error> { | 470 | | let mut state = ACCEPT; | 471 | | let mut i = 0; | 472 | | | 473 | | while i < slice.len() { | 474 | | let b = slice[i]; | 475 | | | 476 | | // ASCII fast path. If we see two consecutive ASCII bytes, then try | 477 | | // to validate as much ASCII as possible very quickly. | 478 | | if state == ACCEPT | 479 | | && b <= 0x7F | 480 | | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) | 481 | | { | 482 | | i += ascii::first_non_ascii_byte(&slice[i..]); | 483 | | continue; | 484 | | } | 485 | | | 486 | | state = step(state, b); | 487 | | if state == REJECT { | 488 | | return Err(find_valid_up_to(slice, i)); | 489 | | } | 490 | | i += 1; | 491 | | } | 492 | | if state != ACCEPT { | 493 | | Err(find_valid_up_to(slice, slice.len())) | 494 | | } else { | 495 | | Ok(()) | 496 | | } | 497 | | } | 498 | | | 499 | | // Given the first position at which a UTF-8 sequence was determined to be | 500 | | // invalid, return an error that correctly reports the position at which | 501 | | // the last complete UTF-8 sequence ends. | 502 | | #[inline(never)] | 503 | | fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { | 504 | | // In order to find the last valid byte, we need to back up an amount | 505 | | // that guarantees every preceding byte is part of a valid UTF-8 | 506 | | // code unit sequence. To do this, we simply locate the last leading | 507 | | // byte that occurs before rejected_at. 
| 508 | | let mut backup = rejected_at.saturating_sub(1); | 509 | | while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { | 510 | | backup -= 1; | 511 | | } | 512 | | let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); | 513 | | let mut err = slow(&slice[backup..upto]).unwrap_err(); | 514 | | err.valid_up_to += backup; | 515 | | err | 516 | | } | 517 | | | 518 | | // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error | 519 | | // when an invalid sequence is found. This is split out from validate so | 520 | | // that the fast path doesn't need to keep track of the position of the | 521 | | // last valid UTF-8 byte. In particular, tracking this requires checking | 522 | | // for an ACCEPT state on each byte, which degrades throughput pretty | 523 | | // badly. | 524 | | fn slow(slice: &[u8]) -> Result<(), Utf8Error> { | 525 | | let mut state = ACCEPT; | 526 | | let mut valid_up_to = 0; | 527 | | for (i, &b) in slice.iter().enumerate() { | 528 | | state = step(state, b); | 529 | | if state == ACCEPT { | 530 | | valid_up_to = i + 1; | 531 | | } else if state == REJECT { | 532 | | // Our error length must always be at least 1. | 533 | | let error_len = Some(cmp::max(1, i - valid_up_to)); | 534 | | return Err(Utf8Error { valid_up_to, error_len }); | 535 | | } | 536 | | } | 537 | | if state != ACCEPT { | 538 | | Err(Utf8Error { valid_up_to, error_len: None }) | 539 | | } else { | 540 | | Ok(()) | 541 | | } | 542 | | } | 543 | | | 544 | | // Advance to the next state given the current state and current byte. | 545 | | fn step(state: usize, b: u8) -> usize { | 546 | | let class = CLASSES[b as usize]; | 547 | | // SAFETY: This is safe because 'class' is always <=11 and 'state' is | 548 | | // always <=96. 
Therefore, the maximal index is 96+11 = 107, where | 549 | | // STATES_FORWARD.len() = 108 such that every index is guaranteed to be | 550 | | // valid by construction of the state machine and the byte equivalence | 551 | | // classes. | 552 | | unsafe { | 553 | | *STATES_FORWARD.get_unchecked(state + class as usize) as usize | 554 | | } | 555 | | } | 556 | | | 557 | 11.2M | fast(slice) | 558 | 11.2M | } |
Line | Count | Source | 464 | 6.37M | pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> { | 465 | | // The fast path for validating UTF-8. It steps through a UTF-8 automaton | 466 | | // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is | 467 | | // detected, it backs up and runs the slower version of the UTF-8 automaton | 468 | | // to determine correct error information. | 469 | | fn fast(slice: &[u8]) -> Result<(), Utf8Error> { | 470 | | let mut state = ACCEPT; | 471 | | let mut i = 0; | 472 | | | 473 | | while i < slice.len() { | 474 | | let b = slice[i]; | 475 | | | 476 | | // ASCII fast path. If we see two consecutive ASCII bytes, then try | 477 | | // to validate as much ASCII as possible very quickly. | 478 | | if state == ACCEPT | 479 | | && b <= 0x7F | 480 | | && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) | 481 | | { | 482 | | i += ascii::first_non_ascii_byte(&slice[i..]); | 483 | | continue; | 484 | | } | 485 | | | 486 | | state = step(state, b); | 487 | | if state == REJECT { | 488 | | return Err(find_valid_up_to(slice, i)); | 489 | | } | 490 | | i += 1; | 491 | | } | 492 | | if state != ACCEPT { | 493 | | Err(find_valid_up_to(slice, slice.len())) | 494 | | } else { | 495 | | Ok(()) | 496 | | } | 497 | | } | 498 | | | 499 | | // Given the first position at which a UTF-8 sequence was determined to be | 500 | | // invalid, return an error that correctly reports the position at which | 501 | | // the last complete UTF-8 sequence ends. | 502 | | #[inline(never)] | 503 | | fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { | 504 | | // In order to find the last valid byte, we need to back up an amount | 505 | | // that guarantees every preceding byte is part of a valid UTF-8 | 506 | | // code unit sequence. To do this, we simply locate the last leading | 507 | | // byte that occurs before rejected_at. 
| 508 | | let mut backup = rejected_at.saturating_sub(1); | 509 | | while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { | 510 | | backup -= 1; | 511 | | } | 512 | | let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); | 513 | | let mut err = slow(&slice[backup..upto]).unwrap_err(); | 514 | | err.valid_up_to += backup; | 515 | | err | 516 | | } | 517 | | | 518 | | // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error | 519 | | // when an invalid sequence is found. This is split out from validate so | 520 | | // that the fast path doesn't need to keep track of the position of the | 521 | | // last valid UTF-8 byte. In particular, tracking this requires checking | 522 | | // for an ACCEPT state on each byte, which degrades throughput pretty | 523 | | // badly. | 524 | | fn slow(slice: &[u8]) -> Result<(), Utf8Error> { | 525 | | let mut state = ACCEPT; | 526 | | let mut valid_up_to = 0; | 527 | | for (i, &b) in slice.iter().enumerate() { | 528 | | state = step(state, b); | 529 | | if state == ACCEPT { | 530 | | valid_up_to = i + 1; | 531 | | } else if state == REJECT { | 532 | | // Our error length must always be at least 1. | 533 | | let error_len = Some(cmp::max(1, i - valid_up_to)); | 534 | | return Err(Utf8Error { valid_up_to, error_len }); | 535 | | } | 536 | | } | 537 | | if state != ACCEPT { | 538 | | Err(Utf8Error { valid_up_to, error_len: None }) | 539 | | } else { | 540 | | Ok(()) | 541 | | } | 542 | | } | 543 | | | 544 | | // Advance to the next state given the current state and current byte. | 545 | | fn step(state: usize, b: u8) -> usize { | 546 | | let class = CLASSES[b as usize]; | 547 | | // SAFETY: This is safe because 'class' is always <=11 and 'state' is | 548 | | // always <=96. 
Therefore, the maximal index is 96+11 = 107, where | 549 | | // STATES_FORWARD.len() = 108 such that every index is guaranteed to be | 550 | | // valid by construction of the state machine and the byte equivalence | 551 | | // classes. | 552 | | unsafe { | 553 | | *STATES_FORWARD.get_unchecked(state + class as usize) as usize | 554 | | } | 555 | | } | 556 | | | 557 | 6.37M | fast(slice) | 558 | 6.37M | } |
|
559 | | |
560 | | /// UTF-8 decode a single Unicode scalar value from the beginning of a slice. |
561 | | /// |
562 | | /// When successful, the corresponding Unicode scalar value is returned along |
563 | | /// with the number of bytes it was encoded with. The number of bytes consumed |
564 | | /// for a successful decode is always between 1 and 4, inclusive. |
565 | | /// |
566 | | /// When unsuccessful, `None` is returned along with the number of bytes that |
567 | | /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, |
568 | | /// the number of bytes consumed is always between 0 and 3, inclusive, where |
569 | | /// 0 is only returned when `slice` is empty. |
570 | | /// |
571 | | /// # Examples |
572 | | /// |
573 | | /// Basic usage: |
574 | | /// |
575 | | /// ``` |
576 | | /// use bstr::decode_utf8; |
577 | | /// |
578 | | /// // Decoding a valid codepoint. |
579 | | /// let (ch, size) = decode_utf8(b"\xE2\x98\x83"); |
580 | | /// assert_eq!(Some('☃'), ch); |
581 | | /// assert_eq!(3, size); |
582 | | /// |
583 | | /// // Decoding an incomplete codepoint. |
584 | | /// let (ch, size) = decode_utf8(b"\xE2\x98"); |
585 | | /// assert_eq!(None, ch); |
586 | | /// assert_eq!(2, size); |
587 | | /// ``` |
588 | | /// |
589 | | /// This example shows how to iterate over all codepoints in UTF-8 encoded |
590 | | /// bytes, while replacing invalid UTF-8 sequences with the replacement |
591 | | /// codepoint: |
592 | | /// |
593 | | /// ``` |
594 | | /// use bstr::{B, decode_utf8}; |
595 | | /// |
596 | | /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
597 | | /// let mut chars = vec![]; |
598 | | /// while !bytes.is_empty() { |
599 | | /// let (ch, size) = decode_utf8(bytes); |
600 | | /// bytes = &bytes[size..]; |
601 | | /// chars.push(ch.unwrap_or('\u{FFFD}')); |
602 | | /// } |
603 | | /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); |
604 | | /// ``` |
605 | | #[inline] |
606 | 96.3M | pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { |
607 | 96.3M | let slice = slice.as_ref(); |
608 | 96.3M | match slice.first() { |
609 | 3.13M | None => return (None, 0), |
610 | 93.2M | Some(&b) if b <= 0x7F => return (Some(b as char), 1), |
611 | 130k | _ => {} |
612 | | } |
613 | | |
614 | 130k | let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); |
615 | 198k | while i < slice.len() { |
616 | 198k | decode_step(&mut state, &mut cp, slice[i]); |
617 | 198k | i += 1; |
618 | | |
619 | 198k | if state == ACCEPT { |
620 | | // SAFETY: This is safe because `decode_step` guarantees that |
621 | | // `cp` is a valid Unicode scalar value in an ACCEPT state. |
622 | 14.4k | let ch = unsafe { char::from_u32_unchecked(cp) }; |
623 | 14.4k | return (Some(ch), i); |
624 | 183k | } else if state == REJECT { |
625 | | // At this point, we always want to advance at least one byte. |
626 | 115k | return (None, cmp::max(1, i.saturating_sub(1))); |
627 | 68.0k | } |
628 | | } |
629 | 33 | (None, i) |
630 | 96.3M | } Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> bstr::utf8::decode::<&[u8]> Line | Count | Source | 606 | 108k | pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { | 607 | 108k | let slice = slice.as_ref(); | 608 | 108k | match slice.first() { | 609 | 115 | None => return (None, 0), | 610 | 107k | Some(&b) if b <= 0x7F => return (Some(b as char), 1), | 611 | 33.5k | _ => {} | 612 | | } | 613 | | | 614 | 33.5k | let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); | 615 | 54.8k | while i < slice.len() { | 616 | 54.8k | decode_step(&mut state, &mut cp, slice[i]); | 617 | 54.8k | i += 1; | 618 | | | 619 | 54.8k | if state == ACCEPT { | 620 | | // SAFETY: This is safe because `decode_step` guarantees that | 621 | | // `cp` is a valid Unicode scalar value in an ACCEPT state. 
| 622 | 3.06k | let ch = unsafe { char::from_u32_unchecked(cp) }; | 623 | 3.06k | return (Some(ch), i); | 624 | 51.7k | } else if state == REJECT { | 625 | | // At this point, we always want to advance at least one byte. | 626 | 30.5k | return (None, cmp::max(1, i.saturating_sub(1))); | 627 | 21.2k | } | 628 | | } | 629 | 6 | (None, i) | 630 | 108k | } |
Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> bstr::utf8::decode::<&[u8]> Line | Count | Source | 606 | 47.9M | pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { | 607 | 47.9M | let slice = slice.as_ref(); | 608 | 47.9M | match slice.first() { | 609 | 3.12M | None => return (None, 0), | 610 | 44.8M | Some(&b) if b <= 0x7F => return (Some(b as char), 1), | 611 | 5.29k | _ => {} | 612 | | } | 613 | | | 614 | 5.29k | let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); | 615 | 17.5k | while i < slice.len() { | 616 | 17.5k | decode_step(&mut state, &mut cp, slice[i]); | 617 | 17.5k | i += 1; | 618 | | | 619 | 17.5k | if state == ACCEPT { | 620 | | // SAFETY: This is safe because `decode_step` guarantees that | 621 | | // `cp` is a valid Unicode scalar value in an ACCEPT state. | 622 | 5.29k | let ch = unsafe { char::from_u32_unchecked(cp) }; | 623 | 5.29k | return (Some(ch), i); | 624 | 12.2k | } else if state == REJECT { | 625 | | // At this point, we always want to advance at least one byte. | 626 | 0 | return (None, cmp::max(1, i.saturating_sub(1))); | 627 | 12.2k | } | 628 | | } | 629 | 0 | (None, i) | 630 | 47.9M | } |
Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> bstr::utf8::decode::<&[u8]> Line | Count | Source | 606 | 48.2M | pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { | 607 | 48.2M | let slice = slice.as_ref(); | 608 | 48.2M | match slice.first() { | 609 | 3.00k | None => return (None, 0), | 610 | 48.2M | Some(&b) if b <= 0x7F => return (Some(b as char), 1), | 611 | 91.5k | _ => {} | 612 | | } | 613 | | | 614 | 91.5k | let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); | 615 | 126k | while i < slice.len() { | 616 | 126k | decode_step(&mut state, &mut cp, slice[i]); | 617 | 126k | i += 1; | 618 | | | 619 | 126k | if state == ACCEPT { | 620 | | // SAFETY: This is safe because `decode_step` guarantees that | 621 | | // `cp` is a valid Unicode scalar value in an ACCEPT state. | 622 | 6.07k | let ch = unsafe { char::from_u32_unchecked(cp) }; | 623 | 6.07k | return (Some(ch), i); | 624 | 120k | } else if state == REJECT { | 625 | | // At this point, we always want to advance at least one byte. | 626 | 85.4k | return (None, cmp::max(1, i.saturating_sub(1))); | 627 | 34.6k | } | 628 | | } | 629 | 27 | (None, i) | 630 | 48.2M | } |
Unexecuted instantiation: bstr::utf8::decode::<&[u8]> Unexecuted instantiation: bstr::utf8::decode::<&[u8]> |
631 | | |
632 | | /// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a |
633 | | /// slice. |
634 | | /// |
635 | | /// When successful, the corresponding Unicode scalar value is returned along |
636 | | /// with the number of bytes it was encoded with. The number of bytes consumed |
637 | | /// for a successful decode is always between 1 and 4, inclusive. |
638 | | /// |
639 | | /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned |
640 | | /// along with the number of bytes that make up a maximal prefix of a valid |
641 | | /// UTF-8 code unit sequence. In this case, the number of bytes consumed is |
642 | | /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is |
643 | | /// empty. |
644 | | /// |
645 | | /// # Examples |
646 | | /// |
647 | | /// Basic usage: |
648 | | /// |
649 | | /// ```ignore |
650 | | /// use bstr::decode_utf8_lossy; |
651 | | /// |
652 | | /// // Decoding a valid codepoint. |
653 | | /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83"); |
654 | | /// assert_eq!('☃', ch); |
655 | | /// assert_eq!(3, size); |
656 | | /// |
657 | | /// // Decoding an incomplete codepoint. |
658 | | /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98"); |
659 | | /// assert_eq!('\u{FFFD}', ch); |
660 | | /// assert_eq!(2, size); |
661 | | /// ``` |
662 | | /// |
663 | | /// This example shows how to iterate over all codepoints in UTF-8 encoded |
664 | | /// bytes, while replacing invalid UTF-8 sequences with the replacement |
665 | | /// codepoint: |
666 | | /// |
667 | | /// ```ignore |
668 | | /// use bstr::{B, decode_utf8_lossy}; |
669 | | /// |
670 | | /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
671 | | /// let mut chars = vec![]; |
672 | | /// while !bytes.is_empty() { |
673 | | /// let (ch, size) = decode_utf8_lossy(bytes); |
674 | | /// bytes = &bytes[size..]; |
675 | | /// chars.push(ch); |
676 | | /// } |
677 | | /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); |
678 | | /// ``` |
679 | | #[inline] |
680 | 96.3M | pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { |
681 | 96.3M | match decode(slice) { |
682 | 93.1M | (Some(ch), size) => (ch, size), |
683 | 3.24M | (None, size) => ('\u{FFFD}', size), |
684 | | } |
685 | 96.3M | } Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> bstr::utf8::decode_lossy::<&[u8]> Line | Count | Source | 680 | 108k | pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { | 681 | 108k | match decode(slice) { | 682 | 77.4k | (Some(ch), size) => (ch, size), | 683 | 30.6k | (None, size) => ('\u{FFFD}', size), | 684 | | } | 685 | 108k | } |
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> bstr::utf8::decode_lossy::<&[u8]> Line | Count | Source | 680 | 47.9M | pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { | 681 | 47.9M | match decode(slice) { | 682 | 44.8M | (Some(ch), size) => (ch, size), | 683 | 3.12M | (None, size) => ('\u{FFFD}', size), | 684 | | } | 685 | 47.9M | } |
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<_> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> bstr::utf8::decode_lossy::<&[u8]> Line | Count | Source | 680 | 48.2M | pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { | 681 | 48.2M | match decode(slice) { | 682 | 48.2M | (Some(ch), size) => (ch, size), | 683 | 88.4k | (None, size) => ('\u{FFFD}', size), | 684 | | } | 685 | 48.2M | } |
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]> |
686 | | |
687 | | /// UTF-8 decode a single Unicode scalar value from the end of a slice. |
688 | | /// |
689 | | /// When successful, the corresponding Unicode scalar value is returned along |
690 | | /// with the number of bytes it was encoded with. The number of bytes consumed |
691 | | /// for a successful decode is always between 1 and 4, inclusive. |
692 | | /// |
693 | | /// When unsuccessful, `None` is returned along with the number of bytes that |
694 | | /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, |
695 | | /// the number of bytes consumed is always between 0 and 3, inclusive, where |
696 | | /// 0 is only returned when `slice` is empty. |
697 | | /// |
698 | | /// # Examples |
699 | | /// |
700 | | /// Basic usage: |
701 | | /// |
702 | | /// ``` |
703 | | /// use bstr::decode_last_utf8; |
704 | | /// |
705 | | /// // Decoding a valid codepoint. |
706 | | /// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83"); |
707 | | /// assert_eq!(Some('☃'), ch); |
708 | | /// assert_eq!(3, size); |
709 | | /// |
710 | | /// // Decoding an incomplete codepoint. |
711 | | /// let (ch, size) = decode_last_utf8(b"\xE2\x98"); |
712 | | /// assert_eq!(None, ch); |
713 | | /// assert_eq!(2, size); |
714 | | /// ``` |
715 | | /// |
716 | | /// This example shows how to iterate over all codepoints in UTF-8 encoded |
717 | | /// bytes in reverse, while replacing invalid UTF-8 sequences with the |
718 | | /// replacement codepoint: |
719 | | /// |
720 | | /// ``` |
721 | | /// use bstr::{B, decode_last_utf8}; |
722 | | /// |
723 | | /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
724 | | /// let mut chars = vec![]; |
725 | | /// while !bytes.is_empty() { |
726 | | /// let (ch, size) = decode_last_utf8(bytes); |
727 | | /// bytes = &bytes[..bytes.len()-size]; |
728 | | /// chars.push(ch.unwrap_or('\u{FFFD}')); |
729 | | /// } |
730 | | /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); |
731 | | /// ``` |
732 | | #[inline] |
733 | 0 | pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { |
734 | | // TODO: We could implement this by reversing the UTF-8 automaton, but for |
735 | | // now, we do it the slow way by using the forward automaton. |
736 | | |
737 | 0 | let slice = slice.as_ref(); |
738 | 0 | if slice.is_empty() { |
739 | 0 | return (None, 0); |
740 | 0 | } |
741 | 0 | let mut start = slice.len() - 1; |
742 | 0 | let limit = slice.len().saturating_sub(4); |
743 | 0 | while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) { |
744 | 0 | start -= 1; |
745 | 0 | } |
746 | 0 | let (ch, size) = decode(&slice[start..]); |
747 | | // If we didn't consume all of the bytes, then that means there's at least |
748 | | // one stray byte that never occurs in a valid code unit prefix, so we can |
749 | | // advance by one byte. |
750 | 0 | if start + size != slice.len() { |
751 | 0 | (None, 1) |
752 | | } else { |
753 | 0 | (ch, size) |
754 | | } |
755 | 0 | } Unexecuted instantiation: bstr::utf8::decode_last::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_last::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_last::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_last::<_> |
756 | | |
757 | | /// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice. |
758 | | /// |
759 | | /// When successful, the corresponding Unicode scalar value is returned along |
760 | | /// with the number of bytes it was encoded with. The number of bytes consumed |
761 | | /// for a successful decode is always between 1 and 4, inclusive. |
762 | | /// |
763 | | /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned |
764 | | /// along with the number of bytes that make up a maximal prefix of a valid |
765 | | /// UTF-8 code unit sequence. In this case, the number of bytes consumed is |
766 | | /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is |
767 | | /// empty. |
768 | | /// |
769 | | /// # Examples |
770 | | /// |
771 | | /// Basic usage: |
772 | | /// |
773 | | /// ```ignore |
774 | | /// use bstr::decode_last_utf8_lossy; |
775 | | /// |
776 | | /// // Decoding a valid codepoint. |
777 | | /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83"); |
778 | | /// assert_eq!('☃', ch); |
779 | | /// assert_eq!(3, size); |
780 | | /// |
781 | | /// // Decoding an incomplete codepoint. |
782 | | /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98"); |
783 | | /// assert_eq!('\u{FFFD}', ch); |
784 | | /// assert_eq!(2, size); |
785 | | /// ``` |
786 | | /// |
787 | | /// This example shows how to iterate over all codepoints in UTF-8 encoded |
788 | | /// bytes in reverse, while replacing invalid UTF-8 sequences with the |
789 | | /// replacement codepoint: |
790 | | /// |
791 | | /// ```ignore |
792 | | /// use bstr::decode_last_utf8_lossy; |
793 | | /// |
794 | | /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
795 | | /// let mut chars = vec![]; |
796 | | /// while !bytes.is_empty() { |
797 | | /// let (ch, size) = decode_last_utf8_lossy(bytes); |
798 | | /// bytes = &bytes[..bytes.len()-size]; |
799 | | /// chars.push(ch); |
800 | | /// } |
801 | | /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); |
802 | | /// ``` |
803 | | #[inline] |
804 | 0 | pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { |
805 | 0 | match decode_last(slice) { |
806 | 0 | (Some(ch), size) => (ch, size), |
807 | 0 | (None, size) => ('\u{FFFD}', size), |
808 | | } |
809 | 0 | } Unexecuted instantiation: bstr::utf8::decode_last_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_last_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_last_lossy::<&[u8]> Unexecuted instantiation: bstr::utf8::decode_last_lossy::<_> |
810 | | |
811 | | /// SAFETY: The decode function relies on state being equal to ACCEPT only if |
812 | | /// cp is a valid Unicode scalar value. |
813 | | #[inline] |
814 | 198k | pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { |
815 | 198k | let class = CLASSES[b as usize]; |
816 | 198k | let b = u32::from(b); |
817 | 198k | if *state == ACCEPT { |
818 | 130k | *cp = (0xFF >> class) & b; |
819 | 130k | } else { |
820 | 68.0k | *cp = (b & 0b0011_1111) | (*cp << 6); |
821 | 68.0k | } |
822 | 198k | *state = STATES_FORWARD[*state + class as usize] as usize; |
823 | 198k | } Unexecuted instantiation: bstr::utf8::decode_step Line | Count | Source | 814 | 54.8k | pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { | 815 | 54.8k | let class = CLASSES[b as usize]; | 816 | 54.8k | let b = u32::from(b); | 817 | 54.8k | if *state == ACCEPT { | 818 | 33.5k | *cp = (0xFF >> class) & b; | 819 | 33.5k | } else { | 820 | 21.2k | *cp = (b & 0b0011_1111) | (*cp << 6); | 821 | 21.2k | } | 822 | 54.8k | *state = STATES_FORWARD[*state + class as usize] as usize; | 823 | 54.8k | } |
Line | Count | Source | 814 | 143k | pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { | 815 | 143k | let class = CLASSES[b as usize]; | 816 | 143k | let b = u32::from(b); | 817 | 143k | if *state == ACCEPT { | 818 | 96.8k | *cp = (0xFF >> class) & b; | 819 | 96.8k | } else { | 820 | 46.7k | *cp = (b & 0b0011_1111) | (*cp << 6); | 821 | 46.7k | } | 822 | 143k | *state = STATES_FORWARD[*state + class as usize] as usize; | 823 | 143k | } |
Unexecuted instantiation: bstr::utf8::decode_step |
824 | | |
/// Reports whether `b` is anything other than a UTF-8 continuation byte.
///
/// That is, this returns true if and only if the given byte is either a
/// valid leading UTF-8 byte, or is otherwise an invalid byte that can never
/// appear anywhere in a valid UTF-8 sequence.
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // Continuation bytes are exactly the bytes of the form 10xxxxxx, so it
    // suffices to look at the top two bits:
    //
    //   * ASCII bytes have the most significant bit clear (00 or 01).
    //   * Leading bytes of 2/3/4-byte sequences have both top bits set (11).
    //   * Bytes that never appear in valid UTF-8 (\xC0, \xC1 and \xF5 through
    //     \xFF) also have both top bits set (11).
    //
    // Hence everything whose top two bits are not `10` qualifies.
    (b >> 6) != 0b10
}
850 | | |
#[cfg(all(test, feature = "std"))]
mod tests {
    use core::char;

    use alloc::{string::String, vec, vec::Vec};

    use crate::{
        ext_slice::{ByteSlice, B},
        tests::LOSSY_TESTS,
        utf8::{self, Utf8Error},
    };

    /// Inputs that `decode` must reject, paired with the size it reports.
    const DECODE_INVALID: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xE2\x98\xF0", 2),
        (b"\xF0\x9D\x9D", 3),
        (b"\xF0\x9D\x9D\xF0", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xCEa", 1),
        (b"\xE2\x98a", 2),
        (b"\xF0\x9D\x9Ca", 3),
    ];

    /// Inputs for which `decode_lossy` must yield U+FFFD, paired with the
    /// size it reports.
    const DECODE_LOSSY: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xE2\x98\xF0", 2),
        (b"\xF0\x9D\x9D\xF0", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xCEa", 1),
        (b"\xE2\x98a", 2),
        (b"\xF0\x9D\x9Ca", 3),
    ];

    /// Inputs whose final item is invalid, paired with the size reported by
    /// `decode_last` and `decode_last_lossy`.
    const DECODE_LAST_INVALID: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xCE", 1),
        (b"\xE2\x98\xF0", 1),
        (b"\xE2\x98", 2),
        (b"\xF0\x9D\x9D\xF0", 1),
        (b"\xF0\x9D\x9D", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xED\xA0", 1),
        (b"\xED", 1),
        (b"a\xCE", 1),
        (b"a\xE2\x98", 2),
        (b"a\xF0\x9D\x9C", 3),
    ];

    /// Builds the error reported when the input ends in the middle of a
    /// sequence (no error length is known).
    fn utf8e(valid_up_to: usize) -> Utf8Error {
        Utf8Error { valid_up_to, error_len: None }
    }

    /// Builds the error reported for an invalid sequence of known length.
    fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
        Utf8Error { valid_up_to, error_len: Some(error_len) }
    }

    #[test]
    #[cfg(not(miri))]
    fn validate_all_codepoints() {
        let mut buf = [0; 4];
        // Surrogates have no `char` representation and are skipped.
        for cp in (0..=0x10FFFF).filter_map(char::from_u32) {
            let encoded = cp.encode_utf8(&mut buf);
            assert_eq!(Ok(()), utf8::validate(encoded.as_bytes()));
        }
    }

    #[test]
    fn validate_multiple_codepoints() {
        let valid: &[&[u8]] = &[
            b"abc",
            b"a\xE2\x98\x83a",
            b"a\xF0\x9D\x9C\xB7a",
            b"\xE2\x98\x83\xF0\x9D\x9C\xB7",
            b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",
            b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",
        ];
        for &input in valid {
            assert_eq!(Ok(()), utf8::validate(input), "input: {:?}", input);
        }
    }

    #[test]
    fn validate_errors() {
        /// Asserts that validating `input` fails with exactly `expected`.
        #[track_caller]
        fn rejects(expected: Utf8Error, input: &[u8]) {
            assert_eq!(Err(expected), utf8::validate(input));
        }

        // A single invalid byte: alone, then after ASCII, then after a
        // 2-byte, 3-byte and 4-byte sequence.
        rejects(utf8e2(0, 1), b"\xFF");
        rejects(utf8e2(1, 1), b"a\xFF");
        rejects(utf8e2(2, 1), b"\xCE\xB2\xFF");
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xFF");
        rejects(utf8e2(4, 1), b"\xF0\x9D\x9D\xB1\xFF");

        // Invalid 2/3/4-byte sequences with a valid 1/2/3-byte prefix.
        rejects(utf8e2(0, 1), b"\xCE\xF0");
        rejects(utf8e2(0, 2), b"\xE2\x98\xF0");
        rejects(utf8e2(0, 3), b"\xF0\x9D\x9D\xF0");

        // An overlong sequence. Should be \xE2\x82\xAC, but the same
        // codepoint value is encoded in 4 bytes here. Besides rejecting
        // overlong sequences, this checks that valid_up_to is correct.
        rejects(utf8e2(0, 1), b"\xF0\x82\x82\xAC");
        rejects(utf8e2(1, 1), b"a\xF0\x82\x82\xAC");
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xF0\x82\x82\xAC");

        // A surrogate codepoint encoded with the UTF-8 scheme must fail
        // validation.
        rejects(utf8e2(0, 1), b"\xED\xA0\x80");
        rejects(utf8e2(1, 1), b"a\xED\xA0\x80");
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xED\xA0\x80");

        // Incomplete 2-byte sequences.
        rejects(utf8e2(0, 1), b"\xCEa");
        rejects(utf8e2(1, 1), b"a\xCEa");
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xCE\xE2\x98\x83");
        // Incomplete 3-byte sequences.
        rejects(utf8e2(0, 2), b"\xE2\x98a");
        rejects(utf8e2(1, 2), b"a\xE2\x98a");
        rejects(utf8e2(3, 2), b"\xE2\x98\x83\xE2\x98\xE2\x98\x83");
        // Incomplete 4-byte sequences.
        rejects(utf8e2(0, 3), b"\xF0\x9D\x9Ca");
        rejects(utf8e2(1, 3), b"a\xF0\x9D\x9Ca");
        rejects(utf8e2(4, 3), b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83");
        rejects(utf8e2(6, 3), b"foobar\xF1\x80\x80quux");

        // Incomplete (EOF) 2-byte sequences.
        rejects(utf8e(0), b"\xCE");
        rejects(utf8e(1), b"a\xCE");
        rejects(utf8e(3), b"\xE2\x98\x83\xCE");
        // Incomplete (EOF) 3-byte sequences.
        rejects(utf8e(0), b"\xE2\x98");
        rejects(utf8e(1), b"a\xE2\x98");
        rejects(utf8e(3), b"\xE2\x98\x83\xE2\x98");
        // Incomplete (EOF) 4-byte sequences.
        rejects(utf8e(0), b"\xF0\x9D\x9C");
        rejects(utf8e(1), b"a\xF0\x9D\x9C");
        rejects(utf8e(4), b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C");

        // Errors must be reported correctly even after long valid
        // sequences. This checks that the "backup" logic for detecting
        // errors is correct.
        rejects(utf8e2(8, 1), b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF");
    }

    #[test]
    fn decode_valid() {
        /// Decodes `text` front to back, one scalar value at a time.
        fn decode_all(text: &str) -> Vec<char> {
            let mut rest = text;
            let mut decoded = Vec::new();
            while !rest.is_empty() {
                let (ch, size) = utf8::decode(rest.as_bytes());
                rest = &rest[size..];
                decoded.push(ch.unwrap());
            }
            decoded
        }

        let cases: Vec<(&str, Vec<char>)> = vec![
            ("☃", vec!['☃']),
            ("☃☃", vec!['☃', '☃']),
            ("αβγδε", vec!['α', 'β', 'γ', 'δ', 'ε']),
            ("☃⛄⛇", vec!['☃', '⛄', '⛇']),
            ("𝗮𝗯𝗰𝗱𝗲", vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲']),
        ];
        for (input, expected) in cases {
            assert_eq!(expected, decode_all(input), "input: {:?}", input);
        }
    }

    #[test]
    fn decode_invalid() {
        for &(input, expected_size) in DECODE_INVALID {
            let (ch, size) = utf8::decode(input);
            assert_eq!(None, ch, "input: {:?}", input);
            assert_eq!(expected_size, size, "input: {:?}", input);
        }
    }

    #[test]
    fn decode_lossy() {
        for &(input, expected_size) in DECODE_LOSSY {
            let (ch, size) = utf8::decode_lossy(input);
            assert_eq!('\u{FFFD}', ch, "input: {:?}", input);
            assert_eq!(expected_size, size, "input: {:?}", input);
        }
    }

    #[test]
    fn decode_last_valid() {
        /// Decodes `text` back to front, one scalar value at a time.
        fn decode_all_rev(text: &str) -> Vec<char> {
            let mut rest = text;
            let mut decoded = Vec::new();
            while !rest.is_empty() {
                let (ch, size) = utf8::decode_last(rest.as_bytes());
                rest = &rest[..rest.len() - size];
                decoded.push(ch.unwrap());
            }
            decoded
        }

        let cases: Vec<(&str, Vec<char>)> = vec![
            ("☃", vec!['☃']),
            ("☃☃", vec!['☃', '☃']),
            ("αβγδε", vec!['ε', 'δ', 'γ', 'β', 'α']),
            ("☃⛄⛇", vec!['⛇', '⛄', '☃']),
            ("𝗮𝗯𝗰𝗱𝗲", vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮']),
        ];
        for (input, expected) in cases {
            assert_eq!(expected, decode_all_rev(input), "input: {:?}", input);
        }
    }

    #[test]
    fn decode_last_invalid() {
        for &(input, expected_size) in DECODE_LAST_INVALID {
            let (ch, size) = utf8::decode_last(input);
            assert_eq!(None, ch, "input: {:?}", input);
            assert_eq!(expected_size, size, "input: {:?}", input);
        }
    }

    #[test]
    fn decode_last_lossy() {
        for &(input, expected_size) in DECODE_LAST_INVALID {
            let (ch, size) = utf8::decode_last_lossy(input);
            assert_eq!('\u{FFFD}', ch, "input: {:?}", input);
            assert_eq!(expected_size, size, "input: {:?}", input);
        }
    }

    #[test]
    fn chars() {
        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            // Forward iteration must reproduce the lossy decoding.
            let forward: String = B(input).chars().collect();
            assert_eq!(
                expected, forward,
                "chars(ith: {:?}, given: {:?})",
                i, input,
            );
            let forward_indexed: String =
                B(input).char_indices().map(|(_, _, ch)| ch).collect();
            assert_eq!(
                expected, forward_indexed,
                "char_indices(ith: {:?}, given: {:?})",
                i, input,
            );

            // Reverse iteration must yield the very same characters,
            // back to front.
            let expected: String = expected.chars().rev().collect();

            let backward: String = B(input).chars().rev().collect();
            assert_eq!(
                expected, backward,
                "chars.rev(ith: {:?}, given: {:?})",
                i, input,
            );
            let backward_indexed: String =
                B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
            assert_eq!(
                expected, backward_indexed,
                "char_indices.rev(ith: {:?}, given: {:?})",
                i, input,
            );
        }
    }

    #[test]
    fn utf8_chunks() {
        /// Shorthand for building the chunk an iterator is expected to yield.
        fn chunk<'a>(
            valid: &'a str,
            invalid: &'a [u8],
            incomplete: bool,
        ) -> utf8::Utf8Chunk<'a> {
            utf8::Utf8Chunk { valid, invalid: invalid.as_bstr(), incomplete }
        }

        // A byte that can never appear in valid UTF-8.
        let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" };
        assert_eq!(Some(chunk("123", b"\xC0", false)), c.next());
        assert_eq!(None, c.next());

        // Two invalid bytes in a row are reported one chunk at a time.
        let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" };
        assert_eq!(Some(chunk("123", b"\xFF", false)), c.next());
        assert_eq!(Some(chunk("", b"\xFF", false)), c.next());
        assert_eq!(None, c.next());

        // A leading byte cut off by the end of input is incomplete.
        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" };
        assert_eq!(Some(chunk("123", b"\xD0", true)), c.next());
        assert_eq!(None, c.next());

        // The same leading byte followed by more data is simply invalid.
        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" };
        assert_eq!(Some(chunk("123", b"\xD0", false)), c.next());
        assert_eq!(Some(chunk("456", b"", false)), c.next());
        assert_eq!(None, c.next());

        // A truncated 3-byte sequence at the end of input.
        let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" };
        assert_eq!(Some(chunk("123", b"\xE2\x98", true)), c.next());
        assert_eq!(None, c.next());

        // A truncated 4-byte sequence at the end of input.
        let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" };
        assert_eq!(Some(chunk("123", b"\xF4\x8F\xBF", true)), c.next());
        assert_eq!(None, c.next());
    }
}