Coverage Report

Created: 2025-07-23 07:04

/rust/registry/src/index.crates.io-6f17d22bba15001f/utf8_iter-1.0.4/src/lib.rs
Line
Count
Source (jump to first uncovered line)
1
// Copyright Mozilla Foundation
2
//
3
// Licensed under the Apache License (Version 2.0), or the MIT license,
4
// (the "Licenses") at your option. You may not use this file except in
5
// compliance with one of the Licenses. You may obtain copies of the
6
// Licenses at:
7
//
8
//    https://www.apache.org/licenses/LICENSE-2.0
9
//    https://opensource.org/licenses/MIT
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the Licenses is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the Licenses for the specific language governing permissions and
15
// limitations under the Licenses.
16
17
#![no_std]
18
19
//! Provides iteration by `char` over `&[u8]` containing potentially-invalid
20
//! UTF-8 such that errors are handled according to the [WHATWG Encoding
21
//! Standard](https://encoding.spec.whatwg.org/#utf-8-decoder) (i.e. the same
22
//! way as in `String::from_utf8_lossy`).
23
//!
24
//! The trait `Utf8CharsEx` provides the convenience method `chars()` on
25
//! byte slices themselves instead of having to use the more verbose
26
//! `Utf8Chars::new(slice)`.
27
//!
28
//! ```rust
29
//! use utf8_iter::Utf8CharsEx;
30
//! let data = b"\xFF\xC2\xE2\xE2\x98\xF0\xF0\x9F\xF0\x9F\x92\xE2\x98\x83";
31
//! let from_iter: String = data.chars().collect();
32
//! let from_std = String::from_utf8_lossy(data);
33
//! assert_eq!(from_iter, from_std);
34
//! ```
35
36
mod indices;
37
mod report;
38
39
pub use crate::indices::Utf8CharIndices;
40
pub use crate::report::ErrorReportingUtf8Chars;
41
pub use crate::report::Utf8CharsError;
42
use core::iter::FusedIterator;
43
44
#[repr(align(64))] // Align to cache lines
45
struct Utf8Data {
46
    pub table: [u8; 384],
47
}
48
49
// This is generated code copied and pasted from utf_8.rs of encoding_rs.
50
// Please don't edit by hand but instead regenerate as instructed in that
51
// file.
52
53
static UTF8_DATA: Utf8Data = Utf8Data {
54
    table: [
55
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
56
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
57
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
58
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
59
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
60
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
61
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
62
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
63
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
64
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
65
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
66
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
67
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
68
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
69
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
70
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
71
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
72
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
73
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
74
    ],
75
};
76
77
// End manually copypasted generated code.
78
79
#[inline(always)]
80
0
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
81
0
    i.wrapping_sub(start) <= (end - start)
82
0
}
83
84
/// Iterator by `char` over `&[u8]` that contains
85
/// potentially-invalid UTF-8. See the crate documentation.
86
#[derive(Debug, Clone)]
87
pub struct Utf8Chars<'a> {
88
    remaining: &'a [u8],
89
}
90
91
impl<'a> Utf8Chars<'a> {
92
    #[inline(always)]
93
    /// Creates the iterator from a byte slice.
94
0
    pub fn new(bytes: &'a [u8]) -> Self {
95
0
        Utf8Chars::<'a> { remaining: bytes }
96
0
    }
97
98
    /// Views the current remaining data in the iterator as a subslice
99
    /// of the original slice.
100
    #[inline(always)]
101
0
    pub fn as_slice(&self) -> &'a [u8] {
102
0
        self.remaining
103
0
    }
104
105
    #[inline(never)]
106
0
    fn next_fallback(&mut self) -> Option<char> {
107
0
        if self.remaining.is_empty() {
108
0
            return None;
109
0
        }
110
0
        let first = self.remaining[0];
111
0
        if first < 0x80 {
112
0
            self.remaining = &self.remaining[1..];
113
0
            return Some(char::from(first));
114
0
        }
115
0
        if !in_inclusive_range8(first, 0xC2, 0xF4) || self.remaining.len() == 1 {
116
0
            self.remaining = &self.remaining[1..];
117
0
            return Some('\u{FFFD}');
118
0
        }
119
0
        let second = self.remaining[1];
120
0
        let (lower_bound, upper_bound) = match first {
121
0
            0xE0 => (0xA0, 0xBF),
122
0
            0xED => (0x80, 0x9F),
123
0
            0xF0 => (0x90, 0xBF),
124
0
            0xF4 => (0x80, 0x8F),
125
0
            _ => (0x80, 0xBF),
126
        };
127
0
        if !in_inclusive_range8(second, lower_bound, upper_bound) {
128
0
            self.remaining = &self.remaining[1..];
129
0
            return Some('\u{FFFD}');
130
0
        }
131
0
        if first < 0xE0 {
132
0
            self.remaining = &self.remaining[2..];
133
0
            let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
134
0
            return Some(unsafe { char::from_u32_unchecked(point) });
135
0
        }
136
0
        if self.remaining.len() == 2 {
137
0
            self.remaining = &self.remaining[2..];
138
0
            return Some('\u{FFFD}');
139
0
        }
140
0
        let third = self.remaining[2];
141
0
        if !in_inclusive_range8(third, 0x80, 0xBF) {
142
0
            self.remaining = &self.remaining[2..];
143
0
            return Some('\u{FFFD}');
144
0
        }
145
0
        if first < 0xF0 {
146
0
            self.remaining = &self.remaining[3..];
147
0
            let point = ((u32::from(first) & 0xF) << 12)
148
0
                | ((u32::from(second) & 0x3F) << 6)
149
0
                | (u32::from(third) & 0x3F);
150
0
            return Some(unsafe { char::from_u32_unchecked(point) });
151
0
        }
152
0
        // At this point, we have a valid 3-byte prefix of a
153
0
        // four-byte sequence that has to be incomplete, because
154
0
        // otherwise `next()` would have succeeded.
155
0
        self.remaining = &self.remaining[3..];
156
0
        Some('\u{FFFD}')
157
0
    }
158
}
159
160
impl<'a> Iterator for Utf8Chars<'a> {
161
    type Item = char;
162
163
    #[inline]
164
0
    fn next(&mut self) -> Option<char> {
165
0
        // Not delegating directly to `ErrorReportingUtf8Chars` to avoid
166
0
        // an extra branch in the common case based on a cursory inspection
167
0
        // of generated code in a similar case. Be sure to inspect the
168
0
        // generated code as inlined into an actual usage site carefully
169
0
        // if attempting to consolidate the source code here.
170
0
171
0
        // This loop is only broken out of as goto forward
172
0
        #[allow(clippy::never_loop)]
173
0
        loop {
174
0
            if self.remaining.len() < 4 {
175
0
                break;
176
0
            }
177
0
            let first = self.remaining[0];
178
0
            if first < 0x80 {
179
0
                self.remaining = &self.remaining[1..];
180
0
                return Some(char::from(first));
181
0
            }
182
0
            let second = self.remaining[1];
183
0
            if in_inclusive_range8(first, 0xC2, 0xDF) {
184
0
                if !in_inclusive_range8(second, 0x80, 0xBF) {
185
0
                    break;
186
0
                }
187
0
                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
188
0
                self.remaining = &self.remaining[2..];
189
0
                return Some(unsafe { char::from_u32_unchecked(point) });
190
0
            }
191
0
            // This table-based formulation was benchmark-based in encoding_rs,
192
0
            // but it hasn't been re-benchmarked in this iterator context.
193
0
            let third = self.remaining[2];
194
0
            if first < 0xF0 {
195
0
                if ((UTF8_DATA.table[usize::from(second)]
196
0
                    & UTF8_DATA.table[usize::from(first) + 0x80])
197
0
                    | (third >> 6))
198
0
                    != 2
199
                {
200
0
                    break;
201
0
                }
202
0
                let point = ((u32::from(first) & 0xF) << 12)
203
0
                    | ((u32::from(second) & 0x3F) << 6)
204
0
                    | (u32::from(third) & 0x3F);
205
0
                self.remaining = &self.remaining[3..];
206
0
                return Some(unsafe { char::from_u32_unchecked(point) });
207
0
            }
208
0
            let fourth = self.remaining[3];
209
0
            if (u16::from(
210
0
                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
211
0
            ) | u16::from(third >> 6)
212
0
                | (u16::from(fourth & 0xC0) << 2))
213
0
                != 0x202
214
            {
215
0
                break;
216
0
            }
217
0
            let point = ((u32::from(first) & 0x7) << 18)
218
0
                | ((u32::from(second) & 0x3F) << 12)
219
0
                | ((u32::from(third) & 0x3F) << 6)
220
0
                | (u32::from(fourth) & 0x3F);
221
0
            self.remaining = &self.remaining[4..];
222
0
            return Some(unsafe { char::from_u32_unchecked(point) });
223
        }
224
0
        self.next_fallback()
225
0
    }
Unexecuted instantiation: <utf8_iter::Utf8Chars as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <utf8_iter::Utf8Chars as core::iter::traits::iterator::Iterator>::next
226
}
227
228
impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
229
    #[inline]
230
0
    fn next_back(&mut self) -> Option<char> {
231
0
        if self.remaining.is_empty() {
232
0
            return None;
233
0
        }
234
0
        let mut attempt = 1;
235
0
        for b in self.remaining.iter().rev() {
236
0
            if b & 0xC0 != 0x80 {
237
0
                let (head, tail) = self.remaining.split_at(self.remaining.len() - attempt);
238
0
                let mut inner = Utf8Chars::new(tail);
239
0
                let candidate = inner.next();
240
0
                if inner.as_slice().is_empty() {
241
0
                    self.remaining = head;
242
0
                    return candidate;
243
0
                }
244
0
                break;
245
0
            }
246
0
            if attempt == 4 {
247
0
                break;
248
0
            }
249
0
            attempt += 1;
250
        }
251
252
0
        self.remaining = &self.remaining[..self.remaining.len() - 1];
253
0
        Some('\u{FFFD}')
254
0
    }
Unexecuted instantiation: <utf8_iter::Utf8Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
Unexecuted instantiation: <utf8_iter::Utf8Chars as core::iter::traits::double_ended::DoubleEndedIterator>::next_back
255
}
256
257
impl FusedIterator for Utf8Chars<'_> {}
258
259
/// Convenience trait that adds `chars()` and `char_indices()` methods
260
/// similar to the ones on string slices to byte slices.
261
pub trait Utf8CharsEx {
262
    fn chars(&self) -> Utf8Chars<'_>;
263
    fn char_indices(&self) -> Utf8CharIndices<'_>;
264
}
265
266
impl Utf8CharsEx for [u8] {
267
    /// Convenience method for creating an UTF-8 iterator
268
    /// for the slice.
269
    #[inline]
270
0
    fn chars(&self) -> Utf8Chars<'_> {
271
0
        Utf8Chars::new(self)
272
0
    }
Unexecuted instantiation: <[u8] as utf8_iter::Utf8CharsEx>::chars
Unexecuted instantiation: <[u8] as utf8_iter::Utf8CharsEx>::chars
Unexecuted instantiation: <[u8] as utf8_iter::Utf8CharsEx>::chars
273
    /// Convenience method for creating a byte index and
274
    /// UTF-8 iterator for the slice.
275
    #[inline]
276
0
    fn char_indices(&self) -> Utf8CharIndices<'_> {
277
0
        Utf8CharIndices::new(self)
278
0
    }
279
}
280
281
// No manually-written tests for forward-iteration, because the code passed multiple
282
// days of fuzzing comparing with known-good behavior.