Coverage Report

Created: 2025-11-16 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/bstr-1.12.1/src/utf8.rs
Line
Count
Source
1
use core::{char, cmp, fmt, str};
2
3
use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
4
5
// The UTF-8 decoder provided here is based on the one presented here:
6
// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
7
//
8
// We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
9
// using regex-automata that is roughly the same size. The real benefit of
10
// Hoehrmann's formulation is that the byte class mapping below is manually
11
// tailored such that each byte's class doubles as a shift to mask out the
12
// bits necessary for constructing the leading bits of each codepoint value
13
// from the initial byte.
14
//
15
// There are some minor differences between this implementation and Hoehrmann's
16
// formulation.
17
//
18
// Firstly, we make REJECT have state ID 0, since it makes the state table
19
// itself a little easier to read and is consistent with the notion that 0
20
// means "false" or "bad."
21
//
22
// Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
23
// path.
24
//
25
// Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
26
// in the core decoding loop. (Which is what regex-automata would do by
27
// default.)
28
//
29
// Fourthly, we split the byte class mapping and transition table into two
30
// arrays because it's clearer.
31
//
32
// It is unlikely that this is the fastest way to do UTF-8 decoding, however,
33
// it is fairly simple.
34
35
const ACCEPT: usize = 12;
36
const REJECT: usize = 0;
37
38
/// SAFETY: The decode below function relies on the correctness of these
39
/// equivalence classes.
40
#[rustfmt::skip]
41
const CLASSES: [u8; 256] = [
42
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
43
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
44
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
45
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
46
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
47
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
48
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
49
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
50
];
51
52
/// SAFETY: The decode below function relies on the correctness of this state
53
/// machine.
54
#[rustfmt::skip]
55
const STATES_FORWARD: &[u8] = &[
56
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57
  12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
58
  0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
59
  0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
60
  0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
61
  0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
62
  0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
63
  0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
64
  0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65
];
66
67
/// An iterator over Unicode scalar values in a byte string.
68
///
69
/// When invalid UTF-8 byte sequences are found, they are substituted with the
70
/// Unicode replacement codepoint (`U+FFFD`) using the
71
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
72
///
73
/// This iterator is created by the
74
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
75
/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
76
#[derive(Clone, Debug)]
77
pub struct Chars<'a> {
78
    bs: &'a [u8],
79
}
80
81
impl<'a> Chars<'a> {
82
0
    pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
83
0
        Chars { bs }
84
0
    }
Unexecuted instantiation: <bstr::utf8::Chars>::new
Unexecuted instantiation: <bstr::utf8::Chars>::new
Unexecuted instantiation: <bstr::utf8::Chars>::new
Unexecuted instantiation: <bstr::utf8::Chars>::new
85
86
    /// View the underlying data as a subslice of the original data.
87
    ///
88
    /// The slice returned has the same lifetime as the original slice, and so
89
    /// the iterator can continue to be used while this exists.
90
    ///
91
    /// # Examples
92
    ///
93
    /// ```
94
    /// use bstr::ByteSlice;
95
    ///
96
    /// let mut chars = b"abc".chars();
97
    ///
98
    /// assert_eq!(b"abc", chars.as_bytes());
99
    /// chars.next();
100
    /// assert_eq!(b"bc", chars.as_bytes());
101
    /// chars.next();
102
    /// chars.next();
103
    /// assert_eq!(b"", chars.as_bytes());
104
    /// ```
105
    #[inline]
106
0
    pub fn as_bytes(&self) -> &'a [u8] {
107
0
        self.bs
108
0
    }
Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes
Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes
Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes
Unexecuted instantiation: <bstr::utf8::Chars>::as_bytes
109
}
110
111
impl<'a> Iterator for Chars<'a> {
112
    type Item = char;
113
114
    #[inline]
115
0
    fn next(&mut self) -> Option<char> {
116
0
        let (ch, size) = decode_lossy(self.bs);
117
0
        if size == 0 {
118
0
            return None;
119
0
        }
120
0
        self.bs = &self.bs[size..];
121
0
        Some(ch)
122
0
    }
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::iterator::Iterator>::next
123
}
124
125
impl<'a> DoubleEndedIterator for Chars<'a> {
126
    #[inline]
127
0
    fn next_back(&mut self) -> Option<char> {
128
0
        let (ch, size) = decode_last_lossy(self.bs);
129
0
        if size == 0 {
130
0
            return None;
131
0
        }
132
0
        self.bs = &self.bs[..self.bs.len() - size];
133
0
        Some(ch)
134
0
    }
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
Unexecuted instantiation: <bstr::utf8::Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
135
}
136
137
/// An iterator over Unicode scalar values in a byte string and their
138
/// byte index positions.
139
///
140
/// When invalid UTF-8 byte sequences are found, they are substituted with the
141
/// Unicode replacement codepoint (`U+FFFD`) using the
142
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
143
///
144
/// Note that this is slightly different from the `CharIndices` iterator
145
/// provided by the standard library. Aside from working on possibly invalid
146
/// UTF-8, this iterator provides both the corresponding starting and ending
147
/// byte indices of each codepoint yielded. The ending position is necessary to
148
/// slice the original byte string when invalid UTF-8 bytes are converted into
149
/// a Unicode replacement codepoint, since a single replacement codepoint can
150
/// substitute anywhere from 1 to 3 invalid bytes (inclusive).
151
///
152
/// This iterator is created by the
153
/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
154
/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
155
#[derive(Clone, Debug)]
156
pub struct CharIndices<'a> {
157
    bs: &'a [u8],
158
    forward_index: usize,
159
    reverse_index: usize,
160
}
161
162
impl<'a> CharIndices<'a> {
163
2.30M
    pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
164
2.30M
        CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
165
2.30M
    }
Unexecuted instantiation: <bstr::utf8::CharIndices>::new
<bstr::utf8::CharIndices>::new
Line
Count
Source
163
115
    pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
164
115
        CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
165
115
    }
<bstr::utf8::CharIndices>::new
Line
Count
Source
163
2.30M
    pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
164
2.30M
        CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
165
2.30M
    }
Unexecuted instantiation: <bstr::utf8::CharIndices>::new
166
167
    /// View the underlying data as a subslice of the original data.
168
    ///
169
    /// The slice returned has the same lifetime as the original slice, and so
170
    /// the iterator can continue to be used while this exists.
171
    ///
172
    /// # Examples
173
    ///
174
    /// ```
175
    /// use bstr::ByteSlice;
176
    ///
177
    /// let mut it = b"abc".char_indices();
178
    ///
179
    /// assert_eq!(b"abc", it.as_bytes());
180
    /// it.next();
181
    /// assert_eq!(b"bc", it.as_bytes());
182
    /// it.next();
183
    /// it.next();
184
    /// assert_eq!(b"", it.as_bytes());
185
    /// ```
186
    #[inline]
187
0
    pub fn as_bytes(&self) -> &'a [u8] {
188
0
        self.bs
189
0
    }
Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes
Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes
Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes
Unexecuted instantiation: <bstr::utf8::CharIndices>::as_bytes
190
}
191
192
impl<'a> Iterator for CharIndices<'a> {
193
    type Item = (usize, usize, char);
194
195
    #[inline]
196
96.3M
    fn next(&mut self) -> Option<(usize, usize, char)> {
197
96.3M
        let index = self.forward_index;
198
96.3M
        let (ch, size) = decode_lossy(self.bs);
199
96.3M
        if size == 0 {
200
3.13M
            return None;
201
93.2M
        }
202
93.2M
        self.bs = &self.bs[size..];
203
93.2M
        self.forward_index += size;
204
93.2M
        Some((index, index + size, ch))
205
96.3M
    }
Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next
<bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next
Line
Count
Source
196
108k
    fn next(&mut self) -> Option<(usize, usize, char)> {
197
108k
        let index = self.forward_index;
198
108k
        let (ch, size) = decode_lossy(self.bs);
199
108k
        if size == 0 {
200
115
            return None;
201
107k
        }
202
107k
        self.bs = &self.bs[size..];
203
107k
        self.forward_index += size;
204
107k
        Some((index, index + size, ch))
205
108k
    }
<bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next
Line
Count
Source
196
96.2M
    fn next(&mut self) -> Option<(usize, usize, char)> {
197
96.2M
        let index = self.forward_index;
198
96.2M
        let (ch, size) = decode_lossy(self.bs);
199
96.2M
        if size == 0 {
200
3.13M
            return None;
201
93.1M
        }
202
93.1M
        self.bs = &self.bs[size..];
203
93.1M
        self.forward_index += size;
204
93.1M
        Some((index, index + size, ch))
205
96.2M
    }
Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::iterator::Iterator>::next
206
}
207
208
impl<'a> DoubleEndedIterator for CharIndices<'a> {
209
    #[inline]
210
0
    fn next_back(&mut self) -> Option<(usize, usize, char)> {
211
0
        let (ch, size) = decode_last_lossy(self.bs);
212
0
        if size == 0 {
213
0
            return None;
214
0
        }
215
0
        self.bs = &self.bs[..self.bs.len() - size];
216
0
        self.reverse_index -= size;
217
0
        Some((self.reverse_index, self.reverse_index + size, ch))
218
0
    }
Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
Unexecuted instantiation: <bstr::utf8::CharIndices as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
219
}
220
221
impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
222
223
/// An iterator over chunks of valid UTF-8 in a byte slice.
224
///
225
/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
226
#[derive(Clone, Debug)]
227
pub struct Utf8Chunks<'a> {
228
    pub(super) bytes: &'a [u8],
229
}
230
231
/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
232
///
233
/// This is yielded by the
234
/// [`Utf8Chunks`](struct.Utf8Chunks.html)
235
/// iterator, which can be created via the
236
/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
237
/// method.
238
///
239
/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
240
/// are being iterated over.
241
#[cfg_attr(test, derive(Debug, PartialEq))]
242
pub struct Utf8Chunk<'a> {
243
    /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
244
    ///
245
    /// This is empty between adjacent invalid UTF-8 byte sequences.
246
    valid: &'a str,
247
    /// A sequence of invalid UTF-8 bytes.
248
    ///
249
    /// Can only be empty in the last chunk.
250
    ///
251
    /// Should be replaced by a single unicode replacement character, if not
252
    /// empty.
253
    invalid: &'a BStr,
254
    /// Indicates whether the invalid sequence could've been valid if there
255
    /// were more bytes.
256
    ///
257
    /// Can only be true in the last chunk.
258
    incomplete: bool,
259
}
260
261
impl<'a> Utf8Chunk<'a> {
262
    /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
263
    ///
264
    /// This may be empty if there are consecutive sequences of invalid UTF-8
265
    /// bytes.
266
    #[inline]
267
0
    pub fn valid(&self) -> &'a str {
268
0
        self.valid
269
0
    }
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::valid
270
271
    /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
272
    /// immediately follow the valid UTF-8 bytes in this chunk.
273
    ///
274
    /// This is only empty when this chunk corresponds to the last chunk in
275
    /// the original bytes.
276
    ///
277
    /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
278
    /// sequences greater than 1 always correspond to a valid _prefix_ of
279
    /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
280
    /// of maximal subparts" strategy that is described in more detail in the
281
    /// docs for the
282
    /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
283
    /// method.
284
    #[inline]
285
0
    pub fn invalid(&self) -> &'a [u8] {
286
0
        self.invalid.as_bytes()
287
0
    }
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::invalid
288
289
    /// Returns whether the invalid sequence might still become valid if more
290
    /// bytes are added.
291
    ///
292
    /// Returns true if the end of the input was reached unexpectedly,
293
    /// without encountering an unexpected byte.
294
    ///
295
    /// This can only be the case for the last chunk.
296
    #[inline]
297
0
    pub fn incomplete(&self) -> bool {
298
0
        self.incomplete
299
0
    }
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete
Unexecuted instantiation: <bstr::utf8::Utf8Chunk>::incomplete
300
}
301
302
impl<'a> Iterator for Utf8Chunks<'a> {
303
    type Item = Utf8Chunk<'a>;
304
305
    #[inline]
306
0
    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
307
0
        if self.bytes.is_empty() {
308
0
            return None;
309
0
        }
310
0
        match validate(self.bytes) {
311
            Ok(()) => {
312
0
                let valid = self.bytes;
313
0
                self.bytes = &[];
314
0
                Some(Utf8Chunk {
315
0
                    // SAFETY: This is safe because of the guarantees provided
316
0
                    // by utf8::validate.
317
0
                    valid: unsafe { str::from_utf8_unchecked(valid) },
318
0
                    invalid: [].as_bstr(),
319
0
                    incomplete: false,
320
0
                })
321
            }
322
0
            Err(e) => {
323
0
                let (valid, rest) = self.bytes.split_at(e.valid_up_to());
324
                // SAFETY: This is safe because of the guarantees provided by
325
                // utf8::validate.
326
0
                let valid = unsafe { str::from_utf8_unchecked(valid) };
327
0
                let (invalid_len, incomplete) = match e.error_len() {
328
0
                    Some(n) => (n, false),
329
0
                    None => (rest.len(), true),
330
                };
331
0
                let (invalid, rest) = rest.split_at(invalid_len);
332
0
                self.bytes = rest;
333
0
                Some(Utf8Chunk {
334
0
                    valid,
335
0
                    invalid: invalid.as_bstr(),
336
0
                    incomplete,
337
0
                })
338
            }
339
        }
340
0
    }
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::next
341
342
    #[inline]
343
0
    fn size_hint(&self) -> (usize, Option<usize>) {
344
0
        if self.bytes.is_empty() {
345
0
            (0, Some(0))
346
        } else {
347
0
            (1, Some(self.bytes.len()))
348
        }
349
0
    }
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint
Unexecuted instantiation: <bstr::utf8::Utf8Chunks as core::iter::traits::iterator::Iterator>::size_hint
350
}
351
352
impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
353
354
/// An error that occurs when UTF-8 decoding fails.
355
///
356
/// This error occurs when attempting to convert a non-UTF-8 byte
357
/// string to a Rust string that must be valid UTF-8. For example,
358
/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
359
///
360
/// # Example
361
///
362
/// This example shows what happens when a given byte sequence is invalid,
363
/// but ends with a sequence that is a possible prefix of valid UTF-8.
364
///
365
/// ```
366
/// use bstr::{B, ByteSlice};
367
///
368
/// let s = B(b"foobar\xF1\x80\x80");
369
/// let err = s.to_str().unwrap_err();
370
/// assert_eq!(err.valid_up_to(), 6);
371
/// assert_eq!(err.error_len(), None);
372
/// ```
373
///
374
/// This example shows what happens when a given byte sequence contains
375
/// invalid UTF-8.
376
///
377
/// ```
378
/// use bstr::ByteSlice;
379
///
380
/// let s = b"foobar\xF1\x80\x80quux";
381
/// let err = s.to_str().unwrap_err();
382
/// assert_eq!(err.valid_up_to(), 6);
383
/// // The error length reports the maximum number of bytes that correspond to
384
/// // a valid prefix of a UTF-8 encoded codepoint.
385
/// assert_eq!(err.error_len(), Some(3));
386
///
387
/// // In contrast to the above which contains a single invalid prefix,
388
/// // consider the case of multiple individual bytes that are never valid
389
/// // prefixes. Note how the value of error_len changes!
390
/// let s = b"foobar\xFF\xFFquux";
391
/// let err = s.to_str().unwrap_err();
392
/// assert_eq!(err.valid_up_to(), 6);
393
/// assert_eq!(err.error_len(), Some(1));
394
///
395
/// // The fact that it's an invalid prefix does not change error_len even
396
/// // when it immediately precedes the end of the string.
397
/// let s = b"foobar\xFF";
398
/// let err = s.to_str().unwrap_err();
399
/// assert_eq!(err.valid_up_to(), 6);
400
/// assert_eq!(err.error_len(), Some(1));
401
/// ```
402
#[derive(Clone, Debug, Eq, PartialEq)]
403
pub struct Utf8Error {
404
    valid_up_to: usize,
405
    error_len: Option<usize>,
406
}
407
408
impl Utf8Error {
409
    /// Returns the byte index of the position immediately following the last
410
    /// valid UTF-8 byte.
411
    ///
412
    /// # Example
413
    ///
414
    /// This examples shows how `valid_up_to` can be used to retrieve a
415
    /// possibly empty prefix that is guaranteed to be valid UTF-8:
416
    ///
417
    /// ```
418
    /// use bstr::ByteSlice;
419
    ///
420
    /// let s = b"foobar\xF1\x80\x80quux";
421
    /// let err = s.to_str().unwrap_err();
422
    ///
423
    /// // This is guaranteed to never panic.
424
    /// let string = s[..err.valid_up_to()].to_str().unwrap();
425
    /// assert_eq!(string, "foobar");
426
    /// ```
427
    #[inline]
428
4.32M
    pub fn valid_up_to(&self) -> usize {
429
4.32M
        self.valid_up_to
430
4.32M
    }
Unexecuted instantiation: <bstr::utf8::Utf8Error>::valid_up_to
Unexecuted instantiation: <bstr::utf8::Utf8Error>::valid_up_to
Unexecuted instantiation: <bstr::utf8::Utf8Error>::valid_up_to
<bstr::utf8::Utf8Error>::valid_up_to
Line
Count
Source
428
4.32M
    pub fn valid_up_to(&self) -> usize {
429
4.32M
        self.valid_up_to
430
4.32M
    }
431
432
    /// Returns the total number of invalid UTF-8 bytes immediately following
433
    /// the position returned by `valid_up_to`. This value is always at least
434
    /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8
435
    /// encoded codepoint.
436
    ///
437
    /// If the end of the original input was found before a valid UTF-8 encoded
438
    /// codepoint could be completed, then this returns `None`. This is useful
439
    /// when processing streams, where a `None` value signals that more input
440
    /// might be needed.
441
    #[inline]
442
4.32M
    pub fn error_len(&self) -> Option<usize> {
443
4.32M
        self.error_len
444
4.32M
    }
Unexecuted instantiation: <bstr::utf8::Utf8Error>::error_len
Unexecuted instantiation: <bstr::utf8::Utf8Error>::error_len
Unexecuted instantiation: <bstr::utf8::Utf8Error>::error_len
<bstr::utf8::Utf8Error>::error_len
Line
Count
Source
442
4.32M
    pub fn error_len(&self) -> Option<usize> {
443
4.32M
        self.error_len
444
4.32M
    }
445
}
446
447
#[cfg(feature = "std")]
448
impl std::error::Error for Utf8Error {
449
0
    fn description(&self) -> &str {
450
0
        "invalid UTF-8"
451
0
    }
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::error::Error>::description
452
}
453
454
impl fmt::Display for Utf8Error {
455
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
456
0
        write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
457
0
    }
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt
Unexecuted instantiation: <bstr::utf8::Utf8Error as core::fmt::Display>::fmt
458
}
459
460
/// Returns OK if and only if the given slice is completely valid UTF-8.
461
///
462
/// If the slice isn't valid UTF-8, then an error is returned that explains
463
/// the first location at which invalid UTF-8 was detected.
464
17.6M
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
465
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
466
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
467
    // detected, it backs up and runs the slower version of the UTF-8 automaton
468
    // to determine correct error information.
469
17.6M
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470
17.6M
        let mut state = ACCEPT;
471
17.6M
        let mut i = 0;
472
473
25.4M
        while i < slice.len() {
474
12.0M
            let b = slice[i];
475
476
            // ASCII fast path. If we see two consecutive ASCII bytes, then try
477
            // to validate as much ASCII as possible very quickly.
478
12.0M
            if state == ACCEPT
479
10.8M
                && b <= 0x7F
480
6.42M
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
Unexecuted instantiation: bstr::utf8::validate::fast::{closure#0}
bstr::utf8::validate::fast::{closure#0}
Line
Count
Source
480
30.5k
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
bstr::utf8::validate::fast::{closure#0}
Line
Count
Source
480
1.17M
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
bstr::utf8::validate::fast::{closure#0}
Line
Count
Source
480
64.8k
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
481
            {
482
1.23M
                i += ascii::first_non_ascii_byte(&slice[i..]);
483
1.23M
                continue;
484
10.8M
            }
485
486
10.8M
            state = step(state, b);
487
10.8M
            if state == REJECT {
488
4.32M
                return Err(find_valid_up_to(slice, i));
489
6.50M
            }
490
6.50M
            i += 1;
491
        }
492
13.3M
        if state != ACCEPT {
493
2.80k
            Err(find_valid_up_to(slice, slice.len()))
494
        } else {
495
13.3M
            Ok(())
496
        }
497
17.6M
    }
Unexecuted instantiation: bstr::utf8::validate::fast
bstr::utf8::validate::fast
Line
Count
Source
469
933
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470
933
        let mut state = ACCEPT;
471
933
        let mut i = 0;
472
473
107k
        while i < slice.len() {
474
107k
            let b = slice[i];
475
476
            // ASCII fast path. If we see two consecutive ASCII bytes, then try
477
            // to validate as much ASCII as possible very quickly.
478
107k
            if state == ACCEPT
479
62.9k
                && b <= 0x7F
480
30.5k
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
481
            {
482
29.8k
                i += ascii::first_non_ascii_byte(&slice[i..]);
483
29.8k
                continue;
484
77.4k
            }
485
486
77.4k
            state = step(state, b);
487
77.4k
            if state == REJECT {
488
321
                return Err(find_valid_up_to(slice, i));
489
77.1k
            }
490
77.1k
            i += 1;
491
        }
492
612
        if state != ACCEPT {
493
72
            Err(find_valid_up_to(slice, slice.len()))
494
        } else {
495
540
            Ok(())
496
        }
497
933
    }
bstr::utf8::validate::fast
Line
Count
Source
469
11.2M
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470
11.2M
        let mut state = ACCEPT;
471
11.2M
        let mut i = 0;
472
473
17.8M
        while i < slice.len() {
474
6.55M
            let b = slice[i];
475
476
            // ASCII fast path. If we see two consecutive ASCII bytes, then try
477
            // to validate as much ASCII as possible very quickly.
478
6.55M
            if state == ACCEPT
479
6.41M
                && b <= 0x7F
480
6.32M
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
481
            {
482
1.16M
                i += ascii::first_non_ascii_byte(&slice[i..]);
483
1.16M
                continue;
484
5.38M
            }
485
486
5.38M
            state = step(state, b);
487
5.38M
            if state == REJECT {
488
1.07k
                return Err(find_valid_up_to(slice, i));
489
5.38M
            }
490
5.38M
            i += 1;
491
        }
492
11.2M
        if state != ACCEPT {
493
192
            Err(find_valid_up_to(slice, slice.len()))
494
        } else {
495
11.2M
            Ok(())
496
        }
497
11.2M
    }
bstr::utf8::validate::fast
Line
Count
Source
469
6.37M
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470
6.37M
        let mut state = ACCEPT;
471
6.37M
        let mut i = 0;
472
473
7.45M
        while i < slice.len() {
474
5.40M
            let b = slice[i];
475
476
            // ASCII fast path. If we see two consecutive ASCII bytes, then try
477
            // to validate as much ASCII as possible very quickly.
478
5.40M
            if state == ACCEPT
479
4.40M
                && b <= 0x7F
480
74.0k
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
481
            {
482
37.8k
                i += ascii::first_non_ascii_byte(&slice[i..]);
483
37.8k
                continue;
484
5.36M
            }
485
486
5.36M
            state = step(state, b);
487
5.36M
            if state == REJECT {
488
4.32M
                return Err(find_valid_up_to(slice, i));
489
1.04M
            }
490
1.04M
            i += 1;
491
        }
492
2.05M
        if state != ACCEPT {
493
2.54k
            Err(find_valid_up_to(slice, slice.len()))
494
        } else {
495
2.04M
            Ok(())
496
        }
497
6.37M
    }
498
499
    // Given the first position at which a UTF-8 sequence was determined to be
500
    // invalid, return an error that correctly reports the position at which
501
    // the last complete UTF-8 sequence ends.
502
    #[inline(never)]
503
4.32M
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504
        // In order to find the last valid byte, we need to back up an amount
505
        // that guarantees every preceding byte is part of a valid UTF-8
506
        // code unit sequence. To do this, we simply locate the last leading
507
        // byte that occurs before rejected_at.
508
4.32M
        let mut backup = rejected_at.saturating_sub(1);
509
4.32M
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510
1.54k
            backup -= 1;
511
1.54k
        }
512
4.32M
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
513
4.32M
        let mut err = slow(&slice[backup..upto]).unwrap_err();
514
4.32M
        err.valid_up_to += backup;
515
4.32M
        err
516
4.32M
    }
Unexecuted instantiation: bstr::utf8::validate::find_valid_up_to
bstr::utf8::validate::find_valid_up_to
Line
Count
Source
503
393
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504
        // In order to find the last valid byte, we need to back up an amount
505
        // that guarantees every preceding byte is part of a valid UTF-8
506
        // code unit sequence. To do this, we simply locate the last leading
507
        // byte that occurs before rejected_at.
508
393
        let mut backup = rejected_at.saturating_sub(1);
509
451
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510
58
            backup -= 1;
511
58
        }
512
393
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
513
393
        let mut err = slow(&slice[backup..upto]).unwrap_err();
514
393
        err.valid_up_to += backup;
515
393
        err
516
393
    }
bstr::utf8::validate::find_valid_up_to
Line
Count
Source
503
1.26k
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504
        // In order to find the last valid byte, we need to back up an amount
505
        // that guarantees every preceding byte is part of a valid UTF-8
506
        // code unit sequence. To do this, we simply locate the last leading
507
        // byte that occurs before rejected_at.
508
1.26k
        let mut backup = rejected_at.saturating_sub(1);
509
1.54k
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510
281
            backup -= 1;
511
281
        }
512
1.26k
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
513
1.26k
        let mut err = slow(&slice[backup..upto]).unwrap_err();
514
1.26k
        err.valid_up_to += backup;
515
1.26k
        err
516
1.26k
    }
bstr::utf8::validate::find_valid_up_to
Line
Count
Source
503
4.32M
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504
        // In order to find the last valid byte, we need to back up an amount
505
        // that guarantees every preceding byte is part of a valid UTF-8
506
        // code unit sequence. To do this, we simply locate the last leading
507
        // byte that occurs before rejected_at.
508
4.32M
        let mut backup = rejected_at.saturating_sub(1);
509
4.32M
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510
1.20k
            backup -= 1;
511
1.20k
        }
512
4.32M
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
513
4.32M
        let mut err = slow(&slice[backup..upto]).unwrap_err();
514
4.32M
        err.valid_up_to += backup;
515
4.32M
        err
516
4.32M
    }
517
518
    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
519
    // when an invalid sequence is found. This is split out from validate so
520
    // that the fast path doesn't need to keep track of the position of the
521
    // last valid UTF-8 byte. In particular, tracking this requires checking
522
    // for an ACCEPT state on each byte, which degrades throughput pretty
523
    // badly.
524
4.32M
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525
4.32M
        let mut state = ACCEPT;
526
4.32M
        let mut valid_up_to = 0;
527
5.36M
        for (i, &b) in slice.iter().enumerate() {
528
5.36M
            state = step(state, b);
529
5.36M
            if state == ACCEPT {
530
48.8k
                valid_up_to = i + 1;
531
5.31M
            } else if state == REJECT {
532
                // Our error length must always be at least 1.
533
4.32M
                let error_len = Some(cmp::max(1, i - valid_up_to));
534
4.32M
                return Err(Utf8Error { valid_up_to, error_len });
535
989k
            }
536
        }
537
2.80k
        if state != ACCEPT {
538
2.80k
            Err(Utf8Error { valid_up_to, error_len: None })
539
        } else {
540
0
            Ok(())
541
        }
542
4.32M
    }
Unexecuted instantiation: bstr::utf8::validate::slow
bstr::utf8::validate::slow
Line
Count
Source
524
393
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525
393
        let mut state = ACCEPT;
526
393
        let mut valid_up_to = 0;
527
750
        for (i, &b) in slice.iter().enumerate() {
528
750
            state = step(state, b);
529
750
            if state == ACCEPT {
530
206
                valid_up_to = i + 1;
531
544
            } else if state == REJECT {
532
                // Our error length must always be at least 1.
533
321
                let error_len = Some(cmp::max(1, i - valid_up_to));
534
321
                return Err(Utf8Error { valid_up_to, error_len });
535
223
            }
536
        }
537
72
        if state != ACCEPT {
538
72
            Err(Utf8Error { valid_up_to, error_len: None })
539
        } else {
540
0
            Ok(())
541
        }
542
393
    }
bstr::utf8::validate::slow
Line
Count
Source
524
1.26k
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525
1.26k
        let mut state = ACCEPT;
526
1.26k
        let mut valid_up_to = 0;
527
2.57k
        for (i, &b) in slice.iter().enumerate() {
528
2.57k
            state = step(state, b);
529
2.57k
            if state == ACCEPT {
530
700
                valid_up_to = i + 1;
531
1.87k
            } else if state == REJECT {
532
                // Our error length must always be at least 1.
533
1.07k
                let error_len = Some(cmp::max(1, i - valid_up_to));
534
1.07k
                return Err(Utf8Error { valid_up_to, error_len });
535
802
            }
536
        }
537
192
        if state != ACCEPT {
538
192
            Err(Utf8Error { valid_up_to, error_len: None })
539
        } else {
540
0
            Ok(())
541
        }
542
1.26k
    }
bstr::utf8::validate::slow
Line
Count
Source
524
4.32M
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525
4.32M
        let mut state = ACCEPT;
526
4.32M
        let mut valid_up_to = 0;
527
5.35M
        for (i, &b) in slice.iter().enumerate() {
528
5.35M
            state = step(state, b);
529
5.35M
            if state == ACCEPT {
530
47.9k
                valid_up_to = i + 1;
531
5.30M
            } else if state == REJECT {
532
                // Our error length must always be at least 1.
533
4.32M
                let error_len = Some(cmp::max(1, i - valid_up_to));
534
4.32M
                return Err(Utf8Error { valid_up_to, error_len });
535
988k
            }
536
        }
537
2.54k
        if state != ACCEPT {
538
2.54k
            Err(Utf8Error { valid_up_to, error_len: None })
539
        } else {
540
0
            Ok(())
541
        }
542
4.32M
    }
543
544
    // Advance to the next state given the current state and current byte.
545
16.1M
    fn step(state: usize, b: u8) -> usize {
546
16.1M
        let class = CLASSES[b as usize];
547
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
548
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
549
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550
        // valid by construction of the state machine and the byte equivalence
551
        // classes.
552
        unsafe {
553
16.1M
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
554
        }
555
16.1M
    }
Unexecuted instantiation: bstr::utf8::validate::step
bstr::utf8::validate::step
Line
Count
Source
545
78.2k
    fn step(state: usize, b: u8) -> usize {
546
78.2k
        let class = CLASSES[b as usize];
547
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
548
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
549
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550
        // valid by construction of the state machine and the byte equivalence
551
        // classes.
552
        unsafe {
553
78.2k
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
554
        }
555
78.2k
    }
bstr::utf8::validate::step
Line
Count
Source
545
5.38M
    fn step(state: usize, b: u8) -> usize {
546
5.38M
        let class = CLASSES[b as usize];
547
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
548
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
549
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550
        // valid by construction of the state machine and the byte equivalence
551
        // classes.
552
        unsafe {
553
5.38M
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
554
        }
555
5.38M
    }
bstr::utf8::validate::step
Line
Count
Source
545
10.7M
    fn step(state: usize, b: u8) -> usize {
546
10.7M
        let class = CLASSES[b as usize];
547
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
548
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
549
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550
        // valid by construction of the state machine and the byte equivalence
551
        // classes.
552
        unsafe {
553
10.7M
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
554
        }
555
10.7M
    }
556
557
17.6M
    fast(slice)
558
17.6M
}
Unexecuted instantiation: bstr::utf8::validate
bstr::utf8::validate
Line
Count
Source
464
933
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
465
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
466
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
467
    // detected, it backs up and runs the slower version of the UTF-8 automaton
468
    // to determine correct error information.
469
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470
        let mut state = ACCEPT;
471
        let mut i = 0;
472
473
        while i < slice.len() {
474
            let b = slice[i];
475
476
            // ASCII fast path. If we see two consecutive ASCII bytes, then try
477
            // to validate as much ASCII as possible very quickly.
478
            if state == ACCEPT
479
                && b <= 0x7F
480
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
481
            {
482
                i += ascii::first_non_ascii_byte(&slice[i..]);
483
                continue;
484
            }
485
486
            state = step(state, b);
487
            if state == REJECT {
488
                return Err(find_valid_up_to(slice, i));
489
            }
490
            i += 1;
491
        }
492
        if state != ACCEPT {
493
            Err(find_valid_up_to(slice, slice.len()))
494
        } else {
495
            Ok(())
496
        }
497
    }
498
499
    // Given the first position at which a UTF-8 sequence was determined to be
500
    // invalid, return an error that correctly reports the position at which
501
    // the last complete UTF-8 sequence ends.
502
    #[inline(never)]
503
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504
        // In order to find the last valid byte, we need to back up an amount
505
        // that guarantees every preceding byte is part of a valid UTF-8
506
        // code unit sequence. To do this, we simply locate the last leading
507
        // byte that occurs before rejected_at.
508
        let mut backup = rejected_at.saturating_sub(1);
509
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510
            backup -= 1;
511
        }
512
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
513
        let mut err = slow(&slice[backup..upto]).unwrap_err();
514
        err.valid_up_to += backup;
515
        err
516
    }
517
518
    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
519
    // when an invalid sequence is found. This is split out from validate so
520
    // that the fast path doesn't need to keep track of the position of the
521
    // last valid UTF-8 byte. In particular, tracking this requires checking
522
    // for an ACCEPT state on each byte, which degrades throughput pretty
523
    // badly.
524
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525
        let mut state = ACCEPT;
526
        let mut valid_up_to = 0;
527
        for (i, &b) in slice.iter().enumerate() {
528
            state = step(state, b);
529
            if state == ACCEPT {
530
                valid_up_to = i + 1;
531
            } else if state == REJECT {
532
                // Our error length must always be at least 1.
533
                let error_len = Some(cmp::max(1, i - valid_up_to));
534
                return Err(Utf8Error { valid_up_to, error_len });
535
            }
536
        }
537
        if state != ACCEPT {
538
            Err(Utf8Error { valid_up_to, error_len: None })
539
        } else {
540
            Ok(())
541
        }
542
    }
543
544
    // Advance to the next state given the current state and current byte.
545
    fn step(state: usize, b: u8) -> usize {
546
        let class = CLASSES[b as usize];
547
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
548
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
549
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550
        // valid by construction of the state machine and the byte equivalence
551
        // classes.
552
        unsafe {
553
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
554
        }
555
    }
556
557
933
    fast(slice)
558
933
}
bstr::utf8::validate
Line
Count
Source
464
11.2M
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
465
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
466
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
467
    // detected, it backs up and runs the slower version of the UTF-8 automaton
468
    // to determine correct error information.
469
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470
        let mut state = ACCEPT;
471
        let mut i = 0;
472
473
        while i < slice.len() {
474
            let b = slice[i];
475
476
            // ASCII fast path. If we see two consecutive ASCII bytes, then try
477
            // to validate as much ASCII as possible very quickly.
478
            if state == ACCEPT
479
                && b <= 0x7F
480
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
481
            {
482
                i += ascii::first_non_ascii_byte(&slice[i..]);
483
                continue;
484
            }
485
486
            state = step(state, b);
487
            if state == REJECT {
488
                return Err(find_valid_up_to(slice, i));
489
            }
490
            i += 1;
491
        }
492
        if state != ACCEPT {
493
            Err(find_valid_up_to(slice, slice.len()))
494
        } else {
495
            Ok(())
496
        }
497
    }
498
499
    // Given the first position at which a UTF-8 sequence was determined to be
500
    // invalid, return an error that correctly reports the position at which
501
    // the last complete UTF-8 sequence ends.
502
    #[inline(never)]
503
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504
        // In order to find the last valid byte, we need to back up an amount
505
        // that guarantees every preceding byte is part of a valid UTF-8
506
        // code unit sequence. To do this, we simply locate the last leading
507
        // byte that occurs before rejected_at.
508
        let mut backup = rejected_at.saturating_sub(1);
509
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510
            backup -= 1;
511
        }
512
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
513
        let mut err = slow(&slice[backup..upto]).unwrap_err();
514
        err.valid_up_to += backup;
515
        err
516
    }
517
518
    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
519
    // when an invalid sequence is found. This is split out from validate so
520
    // that the fast path doesn't need to keep track of the position of the
521
    // last valid UTF-8 byte. In particular, tracking this requires checking
522
    // for an ACCEPT state on each byte, which degrades throughput pretty
523
    // badly.
524
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525
        let mut state = ACCEPT;
526
        let mut valid_up_to = 0;
527
        for (i, &b) in slice.iter().enumerate() {
528
            state = step(state, b);
529
            if state == ACCEPT {
530
                valid_up_to = i + 1;
531
            } else if state == REJECT {
532
                // Our error length must always be at least 1.
533
                let error_len = Some(cmp::max(1, i - valid_up_to));
534
                return Err(Utf8Error { valid_up_to, error_len });
535
            }
536
        }
537
        if state != ACCEPT {
538
            Err(Utf8Error { valid_up_to, error_len: None })
539
        } else {
540
            Ok(())
541
        }
542
    }
543
544
    // Advance to the next state given the current state and current byte.
545
    fn step(state: usize, b: u8) -> usize {
546
        let class = CLASSES[b as usize];
547
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
548
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
549
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550
        // valid by construction of the state machine and the byte equivalence
551
        // classes.
552
        unsafe {
553
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
554
        }
555
    }
556
557
11.2M
    fast(slice)
558
11.2M
}
bstr::utf8::validate
Line
Count
Source
464
6.37M
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
465
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
466
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
467
    // detected, it backs up and runs the slower version of the UTF-8 automaton
468
    // to determine correct error information.
469
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470
        let mut state = ACCEPT;
471
        let mut i = 0;
472
473
        while i < slice.len() {
474
            let b = slice[i];
475
476
            // ASCII fast path. If we see two consecutive ASCII bytes, then try
477
            // to validate as much ASCII as possible very quickly.
478
            if state == ACCEPT
479
                && b <= 0x7F
480
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
481
            {
482
                i += ascii::first_non_ascii_byte(&slice[i..]);
483
                continue;
484
            }
485
486
            state = step(state, b);
487
            if state == REJECT {
488
                return Err(find_valid_up_to(slice, i));
489
            }
490
            i += 1;
491
        }
492
        if state != ACCEPT {
493
            Err(find_valid_up_to(slice, slice.len()))
494
        } else {
495
            Ok(())
496
        }
497
    }
498
499
    // Given the first position at which a UTF-8 sequence was determined to be
500
    // invalid, return an error that correctly reports the position at which
501
    // the last complete UTF-8 sequence ends.
502
    #[inline(never)]
503
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504
        // In order to find the last valid byte, we need to back up an amount
505
        // that guarantees every preceding byte is part of a valid UTF-8
506
        // code unit sequence. To do this, we simply locate the last leading
507
        // byte that occurs before rejected_at.
508
        let mut backup = rejected_at.saturating_sub(1);
509
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510
            backup -= 1;
511
        }
512
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
513
        let mut err = slow(&slice[backup..upto]).unwrap_err();
514
        err.valid_up_to += backup;
515
        err
516
    }
517
518
    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
519
    // when an invalid sequence is found. This is split out from validate so
520
    // that the fast path doesn't need to keep track of the position of the
521
    // last valid UTF-8 byte. In particular, tracking this requires checking
522
    // for an ACCEPT state on each byte, which degrades throughput pretty
523
    // badly.
524
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525
        let mut state = ACCEPT;
526
        let mut valid_up_to = 0;
527
        for (i, &b) in slice.iter().enumerate() {
528
            state = step(state, b);
529
            if state == ACCEPT {
530
                valid_up_to = i + 1;
531
            } else if state == REJECT {
532
                // Our error length must always be at least 1.
533
                let error_len = Some(cmp::max(1, i - valid_up_to));
534
                return Err(Utf8Error { valid_up_to, error_len });
535
            }
536
        }
537
        if state != ACCEPT {
538
            Err(Utf8Error { valid_up_to, error_len: None })
539
        } else {
540
            Ok(())
541
        }
542
    }
543
544
    // Advance to the next state given the current state and current byte.
545
    fn step(state: usize, b: u8) -> usize {
546
        let class = CLASSES[b as usize];
547
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
548
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
549
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550
        // valid by construction of the state machine and the byte equivalence
551
        // classes.
552
        unsafe {
553
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
554
        }
555
    }
556
557
6.37M
    fast(slice)
558
6.37M
}
559
560
/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
561
///
562
/// When successful, the corresponding Unicode scalar value is returned along
563
/// with the number of bytes it was encoded with. The number of bytes consumed
564
/// for a successful decode is always between 1 and 4, inclusive.
565
///
566
/// When unsuccessful, `None` is returned along with the number of bytes that
567
/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
568
/// the number of bytes consumed is always between 0 and 3, inclusive, where
569
/// 0 is only returned when `slice` is empty.
570
///
571
/// # Examples
572
///
573
/// Basic usage:
574
///
575
/// ```
576
/// use bstr::decode_utf8;
577
///
578
/// // Decoding a valid codepoint.
579
/// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
580
/// assert_eq!(Some('☃'), ch);
581
/// assert_eq!(3, size);
582
///
583
/// // Decoding an incomplete codepoint.
584
/// let (ch, size) = decode_utf8(b"\xE2\x98");
585
/// assert_eq!(None, ch);
586
/// assert_eq!(2, size);
587
/// ```
588
///
589
/// This example shows how to iterate over all codepoints in UTF-8 encoded
590
/// bytes, while replacing invalid UTF-8 sequences with the replacement
591
/// codepoint:
592
///
593
/// ```
594
/// use bstr::{B, decode_utf8};
595
///
596
/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
597
/// let mut chars = vec![];
598
/// while !bytes.is_empty() {
599
///     let (ch, size) = decode_utf8(bytes);
600
///     bytes = &bytes[size..];
601
///     chars.push(ch.unwrap_or('\u{FFFD}'));
602
/// }
603
/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
604
/// ```
605
#[inline]
606
96.3M
pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
607
96.3M
    let slice = slice.as_ref();
608
96.3M
    match slice.first() {
609
3.13M
        None => return (None, 0),
610
93.2M
        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
611
130k
        _ => {}
612
    }
613
614
130k
    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
615
198k
    while i < slice.len() {
616
198k
        decode_step(&mut state, &mut cp, slice[i]);
617
198k
        i += 1;
618
619
198k
        if state == ACCEPT {
620
            // SAFETY: This is safe because `decode_step` guarantees that
621
            // `cp` is a valid Unicode scalar value in an ACCEPT state.
622
14.4k
            let ch = unsafe { char::from_u32_unchecked(cp) };
623
14.4k
            return (Some(ch), i);
624
183k
        } else if state == REJECT {
625
            // At this point, we always want to advance at least one byte.
626
115k
            return (None, cmp::max(1, i.saturating_sub(1)));
627
68.0k
        }
628
    }
629
33
    (None, i)
630
96.3M
}
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
bstr::utf8::decode::<&[u8]>
Line
Count
Source
606
108k
pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
607
108k
    let slice = slice.as_ref();
608
108k
    match slice.first() {
609
115
        None => return (None, 0),
610
107k
        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
611
33.5k
        _ => {}
612
    }
613
614
33.5k
    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
615
54.8k
    while i < slice.len() {
616
54.8k
        decode_step(&mut state, &mut cp, slice[i]);
617
54.8k
        i += 1;
618
619
54.8k
        if state == ACCEPT {
620
            // SAFETY: This is safe because `decode_step` guarantees that
621
            // `cp` is a valid Unicode scalar value in an ACCEPT state.
622
3.06k
            let ch = unsafe { char::from_u32_unchecked(cp) };
623
3.06k
            return (Some(ch), i);
624
51.7k
        } else if state == REJECT {
625
            // At this point, we always want to advance at least one byte.
626
30.5k
            return (None, cmp::max(1, i.saturating_sub(1)));
627
21.2k
        }
628
    }
629
6
    (None, i)
630
108k
}
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
bstr::utf8::decode::<&[u8]>
Line
Count
Source
606
47.9M
pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
607
47.9M
    let slice = slice.as_ref();
608
47.9M
    match slice.first() {
609
3.12M
        None => return (None, 0),
610
44.8M
        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
611
5.29k
        _ => {}
612
    }
613
614
5.29k
    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
615
17.5k
    while i < slice.len() {
616
17.5k
        decode_step(&mut state, &mut cp, slice[i]);
617
17.5k
        i += 1;
618
619
17.5k
        if state == ACCEPT {
620
            // SAFETY: This is safe because `decode_step` guarantees that
621
            // `cp` is a valid Unicode scalar value in an ACCEPT state.
622
5.29k
            let ch = unsafe { char::from_u32_unchecked(cp) };
623
5.29k
            return (Some(ch), i);
624
12.2k
        } else if state == REJECT {
625
            // At this point, we always want to advance at least one byte.
626
0
            return (None, cmp::max(1, i.saturating_sub(1)));
627
12.2k
        }
628
    }
629
0
    (None, i)
630
47.9M
}
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
bstr::utf8::decode::<&[u8]>
Line
Count
Source
606
48.2M
pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
607
48.2M
    let slice = slice.as_ref();
608
48.2M
    match slice.first() {
609
3.00k
        None => return (None, 0),
610
48.2M
        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
611
91.5k
        _ => {}
612
    }
613
614
91.5k
    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
615
126k
    while i < slice.len() {
616
126k
        decode_step(&mut state, &mut cp, slice[i]);
617
126k
        i += 1;
618
619
126k
        if state == ACCEPT {
620
            // SAFETY: This is safe because `decode_step` guarantees that
621
            // `cp` is a valid Unicode scalar value in an ACCEPT state.
622
6.07k
            let ch = unsafe { char::from_u32_unchecked(cp) };
623
6.07k
            return (Some(ch), i);
624
120k
        } else if state == REJECT {
625
            // At this point, we always want to advance at least one byte.
626
85.4k
            return (None, cmp::max(1, i.saturating_sub(1)));
627
34.6k
        }
628
    }
629
27
    (None, i)
630
48.2M
}
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode::<&[u8]>
631
632
/// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
633
/// slice.
634
///
635
/// When successful, the corresponding Unicode scalar value is returned along
636
/// with the number of bytes it was encoded with. The number of bytes consumed
637
/// for a successful decode is always between 1 and 4, inclusive.
638
///
639
/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
640
/// along with the number of bytes that make up a maximal prefix of a valid
641
/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
642
/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
643
/// empty.
644
///
645
/// # Examples
646
///
647
/// Basic usage:
648
///
649
/// ```ignore
650
/// use bstr::decode_utf8_lossy;
651
///
652
/// // Decoding a valid codepoint.
653
/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
654
/// assert_eq!('☃', ch);
655
/// assert_eq!(3, size);
656
///
657
/// // Decoding an incomplete codepoint.
658
/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
659
/// assert_eq!('\u{FFFD}', ch);
660
/// assert_eq!(2, size);
661
/// ```
662
///
663
/// This example shows how to iterate over all codepoints in UTF-8 encoded
664
/// bytes, while replacing invalid UTF-8 sequences with the replacement
665
/// codepoint:
666
///
667
/// ```ignore
668
/// use bstr::{B, decode_utf8_lossy};
669
///
670
/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
671
/// let mut chars = vec![];
672
/// while !bytes.is_empty() {
673
///     let (ch, size) = decode_utf8_lossy(bytes);
674
///     bytes = &bytes[size..];
675
///     chars.push(ch);
676
/// }
677
/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
678
/// ```
679
#[inline]
680
96.3M
pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
681
96.3M
    match decode(slice) {
682
93.1M
        (Some(ch), size) => (ch, size),
683
3.24M
        (None, size) => ('\u{FFFD}', size),
684
    }
685
96.3M
}
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
bstr::utf8::decode_lossy::<&[u8]>
Line
Count
Source
680
108k
pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
681
108k
    match decode(slice) {
682
77.4k
        (Some(ch), size) => (ch, size),
683
30.6k
        (None, size) => ('\u{FFFD}', size),
684
    }
685
108k
}
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
bstr::utf8::decode_lossy::<&[u8]>
Line
Count
Source
680
47.9M
pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
681
47.9M
    match decode(slice) {
682
44.8M
        (Some(ch), size) => (ch, size),
683
3.12M
        (None, size) => ('\u{FFFD}', size),
684
    }
685
47.9M
}
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<_>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
bstr::utf8::decode_lossy::<&[u8]>
Line
Count
Source
680
48.2M
pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
681
48.2M
    match decode(slice) {
682
48.2M
        (Some(ch), size) => (ch, size),
683
88.4k
        (None, size) => ('\u{FFFD}', size),
684
    }
685
48.2M
}
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_lossy::<&[u8]>
686
687
/// UTF-8 decode a single Unicode scalar value from the end of a slice.
688
///
689
/// When successful, the corresponding Unicode scalar value is returned along
690
/// with the number of bytes it was encoded with. The number of bytes consumed
691
/// for a successful decode is always between 1 and 4, inclusive.
692
///
693
/// When unsuccessful, `None` is returned along with the number of bytes that
694
/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
695
/// the number of bytes consumed is always between 0 and 3, inclusive, where
696
/// 0 is only returned when `slice` is empty.
697
///
698
/// # Examples
699
///
700
/// Basic usage:
701
///
702
/// ```
703
/// use bstr::decode_last_utf8;
704
///
705
/// // Decoding a valid codepoint.
706
/// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
707
/// assert_eq!(Some('☃'), ch);
708
/// assert_eq!(3, size);
709
///
710
/// // Decoding an incomplete codepoint.
711
/// let (ch, size) = decode_last_utf8(b"\xE2\x98");
712
/// assert_eq!(None, ch);
713
/// assert_eq!(2, size);
714
/// ```
715
///
716
/// This example shows how to iterate over all codepoints in UTF-8 encoded
717
/// bytes in reverse, while replacing invalid UTF-8 sequences with the
718
/// replacement codepoint:
719
///
720
/// ```
721
/// use bstr::{B, decode_last_utf8};
722
///
723
/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
724
/// let mut chars = vec![];
725
/// while !bytes.is_empty() {
726
///     let (ch, size) = decode_last_utf8(bytes);
727
///     bytes = &bytes[..bytes.len()-size];
728
///     chars.push(ch.unwrap_or('\u{FFFD}'));
729
/// }
730
/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
731
/// ```
732
#[inline]
733
0
pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
734
    // TODO: We could implement this by reversing the UTF-8 automaton, but for
735
    // now, we do it the slow way by using the forward automaton.
736
737
0
    let slice = slice.as_ref();
738
0
    if slice.is_empty() {
739
0
        return (None, 0);
740
0
    }
741
0
    let mut start = slice.len() - 1;
742
0
    let limit = slice.len().saturating_sub(4);
743
0
    while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
744
0
        start -= 1;
745
0
    }
746
0
    let (ch, size) = decode(&slice[start..]);
747
    // If we didn't consume all of the bytes, then that means there's at least
748
    // one stray byte that never occurs in a valid code unit prefix, so we can
749
    // advance by one byte.
750
0
    if start + size != slice.len() {
751
0
        (None, 1)
752
    } else {
753
0
        (ch, size)
754
    }
755
0
}
Unexecuted instantiation: bstr::utf8::decode_last::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_last::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_last::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_last::<_>
756
757
/// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
758
///
759
/// When successful, the corresponding Unicode scalar value is returned along
760
/// with the number of bytes it was encoded with. The number of bytes consumed
761
/// for a successful decode is always between 1 and 4, inclusive.
762
///
763
/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
764
/// along with the number of bytes that make up a maximal prefix of a valid
765
/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
766
/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
767
/// empty.
768
///
769
/// # Examples
770
///
771
/// Basic usage:
772
///
773
/// ```ignore
774
/// use bstr::decode_last_utf8_lossy;
775
///
776
/// // Decoding a valid codepoint.
777
/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
778
/// assert_eq!('☃', ch);
779
/// assert_eq!(3, size);
780
///
781
/// // Decoding an incomplete codepoint.
782
/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
783
/// assert_eq!('\u{FFFD}', ch);
784
/// assert_eq!(2, size);
785
/// ```
786
///
787
/// This example shows how to iterate over all codepoints in UTF-8 encoded
788
/// bytes in reverse, while replacing invalid UTF-8 sequences with the
789
/// replacement codepoint:
790
///
791
/// ```ignore
792
/// use bstr::{B, decode_last_utf8_lossy};
793
///
794
/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
795
/// let mut chars = vec![];
796
/// while !bytes.is_empty() {
797
///     let (ch, size) = decode_last_utf8_lossy(bytes);
798
///     bytes = &bytes[..bytes.len()-size];
799
///     chars.push(ch);
800
/// }
801
/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
802
/// ```
803
#[inline]
804
0
pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
805
0
    match decode_last(slice) {
806
0
        (Some(ch), size) => (ch, size),
807
0
        (None, size) => ('\u{FFFD}', size),
808
    }
809
0
}
Unexecuted instantiation: bstr::utf8::decode_last_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_last_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_last_lossy::<&[u8]>
Unexecuted instantiation: bstr::utf8::decode_last_lossy::<_>
810
811
/// SAFETY: The decode function relies on state being equal to ACCEPT only if
812
/// cp is a valid Unicode scalar value.
813
#[inline]
814
198k
pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
815
198k
    let class = CLASSES[b as usize];
816
198k
    let b = u32::from(b);
817
198k
    if *state == ACCEPT {
818
130k
        *cp = (0xFF >> class) & b;
819
130k
    } else {
820
68.0k
        *cp = (b & 0b0011_1111) | (*cp << 6);
821
68.0k
    }
822
198k
    *state = STATES_FORWARD[*state + class as usize] as usize;
823
198k
}
Unexecuted instantiation: bstr::utf8::decode_step
bstr::utf8::decode_step
Line
Count
Source
814
54.8k
pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
815
54.8k
    let class = CLASSES[b as usize];
816
54.8k
    let b = u32::from(b);
817
54.8k
    if *state == ACCEPT {
818
33.5k
        *cp = (0xFF >> class) & b;
819
33.5k
    } else {
820
21.2k
        *cp = (b & 0b0011_1111) | (*cp << 6);
821
21.2k
    }
822
54.8k
    *state = STATES_FORWARD[*state + class as usize] as usize;
823
54.8k
}
bstr::utf8::decode_step
Line
Count
Source
814
143k
pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
815
143k
    let class = CLASSES[b as usize];
816
143k
    let b = u32::from(b);
817
143k
    if *state == ACCEPT {
818
96.8k
        *cp = (0xFF >> class) & b;
819
96.8k
    } else {
820
46.7k
        *cp = (b & 0b0011_1111) | (*cp << 6);
821
46.7k
    }
822
143k
    *state = STATES_FORWARD[*state + class as usize] as usize;
823
143k
}
Unexecuted instantiation: bstr::utf8::decode_step
824
825
/// Returns true if and only if the given byte is either a valid leading UTF-8
826
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
827
/// valid UTF-8 sequence.
828
33.6k
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
829
    // In the ASCII case, the most significant bit is never set. The leading
830
    // byte of a 2/3/4-byte sequence always has the top two most significant
831
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
832
    // also returns true, since every such byte has its two most significant
833
    // bits set:
834
    //
835
    //     \xC0 :: 11000000
836
    //     \xC1 :: 11000001
837
    //     \xF5 :: 11110101
838
    //     \xF6 :: 11110110
839
    //     \xF7 :: 11110111
840
    //     \xF8 :: 11111000
841
    //     \xF9 :: 11111001
842
    //     \xFA :: 11111010
843
    //     \xFB :: 11111011
844
    //     \xFC :: 11111100
845
    //     \xFD :: 11111101
846
    //     \xFE :: 11111110
847
    //     \xFF :: 11111111
848
33.6k
    (b & 0b1100_0000) != 0b1000_0000
849
33.6k
}
Unexecuted instantiation: bstr::utf8::is_leading_or_invalid_utf8_byte
bstr::utf8::is_leading_or_invalid_utf8_byte
Line
Count
Source
828
381
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
829
    // In the ASCII case, the most significant bit is never set. The leading
830
    // byte of a 2/3/4-byte sequence always has the top two most significant
831
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
832
    // also returns true, since every such byte has its two most significant
833
    // bits set:
834
    //
835
    //     \xC0 :: 11000000
836
    //     \xC1 :: 11000001
837
    //     \xF5 :: 11110101
838
    //     \xF6 :: 11110110
839
    //     \xF7 :: 11110111
840
    //     \xF8 :: 11111000
841
    //     \xF9 :: 11111001
842
    //     \xFA :: 11111010
843
    //     \xFB :: 11111011
844
    //     \xFC :: 11111100
845
    //     \xFD :: 11111101
846
    //     \xFE :: 11111110
847
    //     \xFF :: 11111111
848
381
    (b & 0b1100_0000) != 0b1000_0000
849
381
}
bstr::utf8::is_leading_or_invalid_utf8_byte
Line
Count
Source
828
1.33k
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
829
    // In the ASCII case, the most significant bit is never set. The leading
830
    // byte of a 2/3/4-byte sequence always has the top two most significant
831
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
832
    // also returns true, since every such byte has its two most significant
833
    // bits set:
834
    //
835
    //     \xC0 :: 11000000
836
    //     \xC1 :: 11000001
837
    //     \xF5 :: 11110101
838
    //     \xF6 :: 11110110
839
    //     \xF7 :: 11110111
840
    //     \xF8 :: 11111000
841
    //     \xF9 :: 11111001
842
    //     \xFA :: 11111010
843
    //     \xFB :: 11111011
844
    //     \xFC :: 11111100
845
    //     \xFD :: 11111101
846
    //     \xFE :: 11111110
847
    //     \xFF :: 11111111
848
1.33k
    (b & 0b1100_0000) != 0b1000_0000
849
1.33k
}
bstr::utf8::is_leading_or_invalid_utf8_byte
Line
Count
Source
828
31.9k
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
829
    // In the ASCII case, the most significant bit is never set. The leading
830
    // byte of a 2/3/4-byte sequence always has the top two most significant
831
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
832
    // also returns true, since every such byte has its two most significant
833
    // bits set:
834
    //
835
    //     \xC0 :: 11000000
836
    //     \xC1 :: 11000001
837
    //     \xF5 :: 11110101
838
    //     \xF6 :: 11110110
839
    //     \xF7 :: 11110111
840
    //     \xF8 :: 11111000
841
    //     \xF9 :: 11111001
842
    //     \xFA :: 11111010
843
    //     \xFB :: 11111011
844
    //     \xFC :: 11111100
845
    //     \xFD :: 11111101
846
    //     \xFE :: 11111110
847
    //     \xFF :: 11111111
848
31.9k
    (b & 0b1100_0000) != 0b1000_0000
849
31.9k
}
850
851
#[cfg(all(test, feature = "std"))]
852
mod tests {
853
    use core::char;
854
855
    use alloc::{string::String, vec, vec::Vec};
856
857
    use crate::{
858
        ext_slice::{ByteSlice, B},
859
        tests::LOSSY_TESTS,
860
        utf8::{self, Utf8Error},
861
    };
862
863
    fn utf8e(valid_up_to: usize) -> Utf8Error {
864
        Utf8Error { valid_up_to, error_len: None }
865
    }
866
867
    fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
868
        Utf8Error { valid_up_to, error_len: Some(error_len) }
869
    }
870
871
    #[test]
872
    #[cfg(not(miri))]
873
    fn validate_all_codepoints() {
874
        for i in 0..(0x10FFFF + 1) {
875
            let cp = match char::from_u32(i) {
876
                None => continue,
877
                Some(cp) => cp,
878
            };
879
            let mut buf = [0; 4];
880
            let s = cp.encode_utf8(&mut buf);
881
            assert_eq!(Ok(()), utf8::validate(s.as_bytes()));
882
        }
883
    }
884
885
    #[test]
886
    fn validate_multiple_codepoints() {
887
        assert_eq!(Ok(()), utf8::validate(b"abc"));
888
        assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a"));
889
        assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a"));
890
        assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",));
891
        assert_eq!(
892
            Ok(()),
893
            utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",)
894
        );
895
        assert_eq!(
896
            Ok(()),
897
            utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",)
898
        );
899
    }
900
901
    #[test]
902
    fn validate_errors() {
903
        // single invalid byte
904
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF"));
905
        // single invalid byte after ASCII
906
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF"));
907
        // single invalid byte after 2 byte sequence
908
        assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF"));
909
        // single invalid byte after 3 byte sequence
910
        assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF"));
911
        // single invalid byte after 4 byte sequence
912
        assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF"));
913
914
        // An invalid 2-byte sequence with a valid 1-byte prefix.
915
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0"));
916
        // An invalid 3-byte sequence with a valid 2-byte prefix.
917
        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0"));
918
        // An invalid 4-byte sequence with a valid 3-byte prefix.
919
        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0"));
920
921
        // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
922
        // same codepoint value in 4 bytes. This not only tests that we reject
923
        // overlong sequences, but that we get valid_up_to correct.
924
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC"));
925
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC"));
926
        assert_eq!(
927
            Err(utf8e2(3, 1)),
928
            utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",)
929
        );
930
931
        // Check that encoding a surrogate codepoint using the UTF-8 scheme
932
        // fails validation.
933
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80"));
934
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80"));
935
        assert_eq!(
936
            Err(utf8e2(3, 1)),
937
            utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",)
938
        );
939
940
        // Check that an incomplete 2-byte sequence fails.
941
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa"));
942
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa"));
943
        assert_eq!(
944
            Err(utf8e2(3, 1)),
945
            utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",)
946
        );
947
        // Check that an incomplete 3-byte sequence fails.
948
        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a"));
949
        assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a"));
950
        assert_eq!(
951
            Err(utf8e2(3, 2)),
952
            utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",)
953
        );
954
        // Check that an incomplete 4-byte sequence fails.
955
        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca"));
956
        assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca"));
957
        assert_eq!(
958
            Err(utf8e2(4, 3)),
959
            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",)
960
        );
961
        assert_eq!(
962
            Err(utf8e2(6, 3)),
963
            utf8::validate(b"foobar\xF1\x80\x80quux",)
964
        );
965
966
        // Check that an incomplete (EOF) 2-byte sequence fails.
967
        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE"));
968
        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE"));
969
        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE"));
970
        // Check that an incomplete (EOF) 3-byte sequence fails.
971
        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98"));
972
        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98"));
973
        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98"));
974
        // Check that an incomplete (EOF) 4-byte sequence fails.
975
        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C"));
976
        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C"));
977
        assert_eq!(
978
            Err(utf8e(4)),
979
            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",)
980
        );
981
982
        // Test that we error correctly even after long valid sequences. This
983
        // checks that our "backup" logic for detecting errors is correct.
984
        assert_eq!(
985
            Err(utf8e2(8, 1)),
986
            utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",)
987
        );
988
    }
989
990
    #[test]
991
    fn decode_valid() {
992
        fn d(mut s: &str) -> Vec<char> {
993
            let mut chars = vec![];
994
            while !s.is_empty() {
995
                let (ch, size) = utf8::decode(s.as_bytes());
996
                s = &s[size..];
997
                chars.push(ch.unwrap());
998
            }
999
            chars
1000
        }
1001
1002
        assert_eq!(vec!['☃'], d("☃"));
1003
        assert_eq!(vec!['☃', '☃'], d("☃☃"));
1004
        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
1005
        assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
1006
        assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
1007
    }
1008
1009
    #[test]
1010
    fn decode_invalid() {
1011
        let (ch, size) = utf8::decode(b"");
1012
        assert_eq!(None, ch);
1013
        assert_eq!(0, size);
1014
1015
        let (ch, size) = utf8::decode(b"\xFF");
1016
        assert_eq!(None, ch);
1017
        assert_eq!(1, size);
1018
1019
        let (ch, size) = utf8::decode(b"\xCE\xF0");
1020
        assert_eq!(None, ch);
1021
        assert_eq!(1, size);
1022
1023
        let (ch, size) = utf8::decode(b"\xE2\x98\xF0");
1024
        assert_eq!(None, ch);
1025
        assert_eq!(2, size);
1026
1027
        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D");
1028
        assert_eq!(None, ch);
1029
        assert_eq!(3, size);
1030
1031
        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0");
1032
        assert_eq!(None, ch);
1033
        assert_eq!(3, size);
1034
1035
        let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC");
1036
        assert_eq!(None, ch);
1037
        assert_eq!(1, size);
1038
1039
        let (ch, size) = utf8::decode(b"\xED\xA0\x80");
1040
        assert_eq!(None, ch);
1041
        assert_eq!(1, size);
1042
1043
        let (ch, size) = utf8::decode(b"\xCEa");
1044
        assert_eq!(None, ch);
1045
        assert_eq!(1, size);
1046
1047
        let (ch, size) = utf8::decode(b"\xE2\x98a");
1048
        assert_eq!(None, ch);
1049
        assert_eq!(2, size);
1050
1051
        let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca");
1052
        assert_eq!(None, ch);
1053
        assert_eq!(3, size);
1054
    }
1055
1056
    #[test]
1057
    fn decode_lossy() {
1058
        let (ch, size) = utf8::decode_lossy(b"");
1059
        assert_eq!('\u{FFFD}', ch);
1060
        assert_eq!(0, size);
1061
1062
        let (ch, size) = utf8::decode_lossy(b"\xFF");
1063
        assert_eq!('\u{FFFD}', ch);
1064
        assert_eq!(1, size);
1065
1066
        let (ch, size) = utf8::decode_lossy(b"\xCE\xF0");
1067
        assert_eq!('\u{FFFD}', ch);
1068
        assert_eq!(1, size);
1069
1070
        let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0");
1071
        assert_eq!('\u{FFFD}', ch);
1072
        assert_eq!(2, size);
1073
1074
        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0");
1075
        assert_eq!('\u{FFFD}', ch);
1076
        assert_eq!(3, size);
1077
1078
        let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC");
1079
        assert_eq!('\u{FFFD}', ch);
1080
        assert_eq!(1, size);
1081
1082
        let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80");
1083
        assert_eq!('\u{FFFD}', ch);
1084
        assert_eq!(1, size);
1085
1086
        let (ch, size) = utf8::decode_lossy(b"\xCEa");
1087
        assert_eq!('\u{FFFD}', ch);
1088
        assert_eq!(1, size);
1089
1090
        let (ch, size) = utf8::decode_lossy(b"\xE2\x98a");
1091
        assert_eq!('\u{FFFD}', ch);
1092
        assert_eq!(2, size);
1093
1094
        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca");
1095
        assert_eq!('\u{FFFD}', ch);
1096
        assert_eq!(3, size);
1097
    }
1098
1099
    #[test]
1100
    fn decode_last_valid() {
1101
        fn d(mut s: &str) -> Vec<char> {
1102
            let mut chars = vec![];
1103
            while !s.is_empty() {
1104
                let (ch, size) = utf8::decode_last(s.as_bytes());
1105
                s = &s[..s.len() - size];
1106
                chars.push(ch.unwrap());
1107
            }
1108
            chars
1109
        }
1110
1111
        assert_eq!(vec!['☃'], d("☃"));
1112
        assert_eq!(vec!['☃', '☃'], d("☃☃"));
1113
        assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε"));
1114
        assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇"));
1115
        assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲"));
1116
    }
1117
1118
    #[test]
1119
    fn decode_last_invalid() {
1120
        let (ch, size) = utf8::decode_last(b"");
1121
        assert_eq!(None, ch);
1122
        assert_eq!(0, size);
1123
1124
        let (ch, size) = utf8::decode_last(b"\xFF");
1125
        assert_eq!(None, ch);
1126
        assert_eq!(1, size);
1127
1128
        let (ch, size) = utf8::decode_last(b"\xCE\xF0");
1129
        assert_eq!(None, ch);
1130
        assert_eq!(1, size);
1131
1132
        let (ch, size) = utf8::decode_last(b"\xCE");
1133
        assert_eq!(None, ch);
1134
        assert_eq!(1, size);
1135
1136
        let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0");
1137
        assert_eq!(None, ch);
1138
        assert_eq!(1, size);
1139
1140
        let (ch, size) = utf8::decode_last(b"\xE2\x98");
1141
        assert_eq!(None, ch);
1142
        assert_eq!(2, size);
1143
1144
        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0");
1145
        assert_eq!(None, ch);
1146
        assert_eq!(1, size);
1147
1148
        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D");
1149
        assert_eq!(None, ch);
1150
        assert_eq!(3, size);
1151
1152
        let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC");
1153
        assert_eq!(None, ch);
1154
        assert_eq!(1, size);
1155
1156
        let (ch, size) = utf8::decode_last(b"\xED\xA0\x80");
1157
        assert_eq!(None, ch);
1158
        assert_eq!(1, size);
1159
1160
        let (ch, size) = utf8::decode_last(b"\xED\xA0");
1161
        assert_eq!(None, ch);
1162
        assert_eq!(1, size);
1163
1164
        let (ch, size) = utf8::decode_last(b"\xED");
1165
        assert_eq!(None, ch);
1166
        assert_eq!(1, size);
1167
1168
        let (ch, size) = utf8::decode_last(b"a\xCE");
1169
        assert_eq!(None, ch);
1170
        assert_eq!(1, size);
1171
1172
        let (ch, size) = utf8::decode_last(b"a\xE2\x98");
1173
        assert_eq!(None, ch);
1174
        assert_eq!(2, size);
1175
1176
        let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C");
1177
        assert_eq!(None, ch);
1178
        assert_eq!(3, size);
1179
    }
1180
1181
    #[test]
1182
    fn decode_last_lossy() {
1183
        let (ch, size) = utf8::decode_last_lossy(b"");
1184
        assert_eq!('\u{FFFD}', ch);
1185
        assert_eq!(0, size);
1186
1187
        let (ch, size) = utf8::decode_last_lossy(b"\xFF");
1188
        assert_eq!('\u{FFFD}', ch);
1189
        assert_eq!(1, size);
1190
1191
        let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0");
1192
        assert_eq!('\u{FFFD}', ch);
1193
        assert_eq!(1, size);
1194
1195
        let (ch, size) = utf8::decode_last_lossy(b"\xCE");
1196
        assert_eq!('\u{FFFD}', ch);
1197
        assert_eq!(1, size);
1198
1199
        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0");
1200
        assert_eq!('\u{FFFD}', ch);
1201
        assert_eq!(1, size);
1202
1203
        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98");
1204
        assert_eq!('\u{FFFD}', ch);
1205
        assert_eq!(2, size);
1206
1207
        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0");
1208
        assert_eq!('\u{FFFD}', ch);
1209
        assert_eq!(1, size);
1210
1211
        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D");
1212
        assert_eq!('\u{FFFD}', ch);
1213
        assert_eq!(3, size);
1214
1215
        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC");
1216
        assert_eq!('\u{FFFD}', ch);
1217
        assert_eq!(1, size);
1218
1219
        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80");
1220
        assert_eq!('\u{FFFD}', ch);
1221
        assert_eq!(1, size);
1222
1223
        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0");
1224
        assert_eq!('\u{FFFD}', ch);
1225
        assert_eq!(1, size);
1226
1227
        let (ch, size) = utf8::decode_last_lossy(b"\xED");
1228
        assert_eq!('\u{FFFD}', ch);
1229
        assert_eq!(1, size);
1230
1231
        let (ch, size) = utf8::decode_last_lossy(b"a\xCE");
1232
        assert_eq!('\u{FFFD}', ch);
1233
        assert_eq!(1, size);
1234
1235
        let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98");
1236
        assert_eq!('\u{FFFD}', ch);
1237
        assert_eq!(2, size);
1238
1239
        let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C");
1240
        assert_eq!('\u{FFFD}', ch);
1241
        assert_eq!(3, size);
1242
    }
1243
1244
    #[test]
1245
    fn chars() {
1246
        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
1247
            let got: String = B(input).chars().collect();
1248
            assert_eq!(
1249
                expected, got,
1250
                "chars(ith: {:?}, given: {:?})",
1251
                i, input,
1252
            );
1253
            let got: String =
1254
                B(input).char_indices().map(|(_, _, ch)| ch).collect();
1255
            assert_eq!(
1256
                expected, got,
1257
                "char_indices(ith: {:?}, given: {:?})",
1258
                i, input,
1259
            );
1260
1261
            let expected: String = expected.chars().rev().collect();
1262
1263
            let got: String = B(input).chars().rev().collect();
1264
            assert_eq!(
1265
                expected, got,
1266
                "chars.rev(ith: {:?}, given: {:?})",
1267
                i, input,
1268
            );
1269
            let got: String =
1270
                B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
1271
            assert_eq!(
1272
                expected, got,
1273
                "char_indices.rev(ith: {:?}, given: {:?})",
1274
                i, input,
1275
            );
1276
        }
1277
    }
1278
1279
    // Checks what `Utf8Chunks` yields for inputs made of the valid prefix
    // "123" followed by an invalid or truncated UTF-8 sequence. Every case
    // also confirms that the iterator reports `None` once the input has been
    // consumed.
    #[test]
    fn utf8_chunks() {
        // Shorthand for spelling out the chunk a case expects to receive.
        let chunk = |valid: &'static str,
                     invalid: &'static [u8],
                     incomplete: bool| {
            utf8::Utf8Chunk { valid, invalid: invalid.as_bstr(), incomplete }
        };

        // A trailing 0xC0 is expected to be flagged invalid, not incomplete.
        let mut chunks = utf8::Utf8Chunks { bytes: b"123\xC0" };
        assert_eq!(chunks.next(), Some(chunk("123", b"\xC0", false)));
        assert_eq!(chunks.next(), None);

        // Two consecutive 0xFF bytes are expected as two separate chunks,
        // the second one carrying an empty valid prefix.
        let mut chunks = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" };
        assert_eq!(chunks.next(), Some(chunk("123", b"\xFF", false)));
        assert_eq!(chunks.next(), Some(chunk("", b"\xFF", false)));
        assert_eq!(chunks.next(), None);

        // A 0xD0 lead byte at the very end of the input is expected to be
        // reported as incomplete.
        let mut chunks = utf8::Utf8Chunks { bytes: b"123\xD0" };
        assert_eq!(chunks.next(), Some(chunk("123", b"\xD0", true)));
        assert_eq!(chunks.next(), None);

        // The same lead byte followed by ASCII is expected to be invalid but
        // not incomplete; the ASCII tail arrives as its own chunk with an
        // empty invalid part.
        let mut chunks = utf8::Utf8Chunks { bytes: b"123\xD0456" };
        assert_eq!(chunks.next(), Some(chunk("123", b"\xD0", false)));
        assert_eq!(chunks.next(), Some(chunk("456", b"", false)));
        assert_eq!(chunks.next(), None);

        // Two bytes of a sequence introduced by 0xE2, cut off by the end of
        // the input, are expected together in one incomplete chunk.
        let mut chunks = utf8::Utf8Chunks { bytes: b"123\xE2\x98" };
        assert_eq!(chunks.next(), Some(chunk("123", b"\xE2\x98", true)));
        assert_eq!(chunks.next(), None);

        // Three bytes of a sequence introduced by 0xF4, cut off by the end
        // of the input, are likewise expected in one incomplete chunk.
        let mut chunks = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" };
        assert_eq!(chunks.next(), Some(chunk("123", b"\xF4\x8F\xBF", true)));
        assert_eq!(chunks.next(), None);
    }
1369
}