Coverage Report

Created: 2026-03-31 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/html5ever/tendril/src/utf8_decode.rs
Line
Count
Source
1
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4
// option. This file may not be copied, modified, or distributed
5
// except according to those terms.
6
7
use crate::fmt;
8
use crate::{Atomicity, Tendril};
9
10
use std::cmp;
11
use std::str;
12
13
/// U+FFFD REPLACEMENT CHARACTER as a string slice.
///
/// Lossy decoding emits this once for each decoding error it encounters.
pub(crate) const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
15
16
#[derive(Debug, Copy, Clone)]
17
pub(crate) enum DecodeError<'a> {
18
    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
19
    /// then call `decode()` again with `remaining_input`.
20
    Invalid {
21
        valid_prefix: &'a str,
22
        invalid_sequence: &'a [u8],
23
    },
24
25
    /// Call the `incomplete_suffix.try_to_complete_codepoint` method with more input when available.
26
    /// If no more input is available, this is an invalid byte sequence.
27
    Incomplete {
28
        valid_prefix: &'a str,
29
        incomplete_suffix: IncompleteUtf8,
30
    },
31
}
32
33
/// The leading bytes of a UTF-8 sequence whose remaining bytes have not been
/// seen yet.
#[derive(Clone, Copy, Debug)]
pub struct IncompleteUtf8 {
    /// Storage for the pending bytes; only the first `buffer_len` are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes of `buffer` currently in use.
    pub buffer_len: u8,
}
38
39
29.9M
pub(crate) fn decode_utf8(input: &[u8]) -> Result<&str, DecodeError<'_>> {
40
29.9M
    let error = match str::from_utf8(input) {
41
28.2k
        Ok(valid) => return Ok(valid),
42
29.9M
        Err(error) => error,
43
    };
44
45
    // FIXME: separate function from here to guide inlining?
46
29.9M
    let (valid, after_valid) = input.split_at(error.valid_up_to());
47
29.9M
    let valid = unsafe { str::from_utf8_unchecked(valid) };
48
49
29.9M
    match error.error_len() {
50
29.9M
        Some(invalid_sequence_length) => {
51
29.9M
            let invalid = &after_valid[..invalid_sequence_length];
52
29.9M
            Err(DecodeError::Invalid {
53
29.9M
                valid_prefix: valid,
54
29.9M
                invalid_sequence: invalid,
55
29.9M
            })
56
        },
57
4.67k
        None => Err(DecodeError::Incomplete {
58
4.67k
            valid_prefix: valid,
59
4.67k
            incomplete_suffix: IncompleteUtf8::new(after_valid),
60
4.67k
        }),
61
    }
62
29.9M
}
63
64
/// Outcome of trying to finish a pending code point with additional input.
enum Utf8CompletionResult {
    /// The buffered bytes plus the new input are still a truncated sequence.
    NotEnoughInput,
    /// The buffer now starts with an invalid byte sequence.
    MalformedUtf8Buffer,
    /// The buffer now starts with valid UTF-8.
    Valid,
}
69
70
impl IncompleteUtf8 {
71
4.67k
    fn new(bytes: &[u8]) -> Self {
72
4.67k
        let mut buffer = [0, 0, 0, 0];
73
4.67k
        let len = bytes.len();
74
4.67k
        buffer[..len].copy_from_slice(bytes);
75
76
4.67k
        Self {
77
4.67k
            buffer,
78
4.67k
            buffer_len: len as u8,
79
4.67k
        }
80
4.67k
    }
81
82
3.81k
    fn take_buffer(&mut self) -> &[u8] {
83
3.81k
        let len = self.buffer_len as usize;
84
3.81k
        self.buffer_len = 0;
85
3.81k
        &self.buffer[..len]
86
3.81k
    }
87
88
    /// Consumes bytes from the input and attempts to form a valid utf8 codepoint.
89
    ///
90
    /// Returns how many bytes were consumed and whether a valid code point was found.
91
3.81k
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Utf8CompletionResult) {
92
3.81k
        let initial_buffer_len = self.buffer_len as usize;
93
        let copied_from_input;
94
3.81k
        {
95
3.81k
            let unwritten = &mut self.buffer[initial_buffer_len..];
96
3.81k
            copied_from_input = cmp::min(unwritten.len(), input.len());
97
3.81k
            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
98
3.81k
        }
99
3.81k
        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
100
3.81k
        match str::from_utf8(spliced) {
101
            Ok(_) => {
102
385
                self.buffer_len = spliced.len() as u8;
103
385
                (copied_from_input, Utf8CompletionResult::Valid)
104
            },
105
3.43k
            Err(error) => {
106
3.43k
                let valid_up_to = error.valid_up_to();
107
3.43k
                if valid_up_to > 0 {
108
670
                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
109
670
                    self.buffer_len = valid_up_to as u8;
110
670
                    (consumed, Utf8CompletionResult::Valid)
111
                } else {
112
2.76k
                    match error.error_len() {
113
2.76k
                        Some(invalid_sequence_length) => {
114
2.76k
                            let consumed = invalid_sequence_length
115
2.76k
                                .checked_sub(initial_buffer_len)
116
2.76k
                                .unwrap();
117
2.76k
                            self.buffer_len = invalid_sequence_length as u8;
118
2.76k
                            (consumed, Utf8CompletionResult::MalformedUtf8Buffer)
119
                        },
120
                        None => {
121
2
                            self.buffer_len = spliced.len() as u8;
122
2
                            (copied_from_input, Utf8CompletionResult::NotEnoughInput)
123
                        },
124
                    }
125
                }
126
            },
127
        }
128
3.81k
    }
129
130
    /// Attempts to complete the codepoint given the bytes from `input`.
131
    ///
132
    /// Returns `None` if more input is required to complete the codepoint. In this case, no
133
    /// input is consumed.
134
    ///
135
    /// Otherwise, returns either the decoded `&str` or malformed `&[u8]` and the remaining input.
136
    #[allow(clippy::type_complexity)]
137
3.81k
    pub fn try_to_complete_codepoint<'input>(
138
3.81k
        &mut self,
139
3.81k
        input: &'input [u8],
140
3.81k
    ) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
141
3.81k
        let (consumed, completion_result) = self.try_complete_offsets(input);
142
3.81k
        let result = match completion_result {
143
2
            Utf8CompletionResult::NotEnoughInput => return None,
144
2.76k
            Utf8CompletionResult::MalformedUtf8Buffer => Err(self.take_buffer()),
145
            Utf8CompletionResult::Valid => {
146
1.05k
                Ok(unsafe { str::from_utf8_unchecked(self.take_buffer()) })
147
            },
148
        };
149
3.81k
        let remaining_input = &input[consumed..];
150
151
3.81k
        Some((result, remaining_input))
152
3.81k
    }
153
154
0
    pub fn try_complete<A, F>(
155
0
        &mut self,
156
0
        mut input: Tendril<fmt::Bytes, A>,
157
0
        mut push_utf8: F,
158
0
    ) -> Result<Tendril<fmt::Bytes, A>, ()>
159
0
    where
160
0
        A: Atomicity,
161
0
        F: FnMut(Tendril<fmt::UTF8, A>),
162
    {
163
0
        let Some((result, remaining_input)) = self.try_to_complete_codepoint(&input) else {
164
            // Not enough input to complete codepoint
165
0
            return Err(());
166
        };
167
168
0
        push_utf8(Tendril::from_slice(result.unwrap_or(REPLACEMENT_CHARACTER)));
169
0
        let resume_at = input.len() - remaining_input.len();
170
0
        input.pop_front(resume_at as u32);
171
0
        Ok(input)
172
0
    }
173
}
174
175
impl<A> Tendril<fmt::Bytes, A>
176
where
177
    A: Atomicity,
178
{
179
0
    pub fn decode_utf8_lossy<F>(mut self, mut push_utf8: F) -> Option<IncompleteUtf8>
180
0
    where
181
0
        F: FnMut(Tendril<fmt::UTF8, A>),
182
    {
183
        loop {
184
0
            if self.is_empty() {
185
0
                return None;
186
0
            }
187
0
            let unborrowed_result = match decode_utf8(&self) {
188
0
                Ok(string) => {
189
0
                    debug_assert!(string.as_ptr() == self.as_ptr());
190
0
                    debug_assert!(string.len() == self.len());
191
0
                    Ok(())
192
                },
193
                Err(DecodeError::Invalid {
194
0
                    valid_prefix,
195
0
                    invalid_sequence,
196
                    ..
197
                }) => {
198
0
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
199
0
                    debug_assert!(valid_prefix.len() <= self.len());
200
0
                    Err((
201
0
                        valid_prefix.len(),
202
0
                        Err(valid_prefix.len() + invalid_sequence.len()),
203
0
                    ))
204
                },
205
                Err(DecodeError::Incomplete {
206
0
                    valid_prefix,
207
0
                    incomplete_suffix,
208
                }) => {
209
0
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
210
0
                    debug_assert!(valid_prefix.len() <= self.len());
211
0
                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
212
                },
213
            };
214
0
            match unborrowed_result {
215
                Ok(()) => {
216
0
                    unsafe { push_utf8(self.reinterpret_without_validating()) }
217
0
                    return None;
218
                },
219
0
                Err((valid_len, and_then)) => {
220
0
                    if valid_len > 0 {
221
0
                        let subtendril = self.subtendril(0, valid_len as u32);
222
0
                        unsafe { push_utf8(subtendril.reinterpret_without_validating()) }
223
0
                    }
224
0
                    match and_then {
225
0
                        Ok(incomplete) => return Some(incomplete),
226
0
                        Err(offset) => {
227
0
                            push_utf8(Tendril::from_slice(REPLACEMENT_CHARACTER));
228
0
                            self.pop_front(offset as u32)
229
                        },
230
                    }
231
                },
232
            }
233
        }
234
0
    }
235
}