/src/html5ever/tendril/src/utf8_decode.rs

Source
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use crate::fmt;
use crate::{Atomicity, Tendril};

use std::cmp;
use std::str;

/// The replacement character, U+FFFD, as a string slice.
///
/// In lossy decoding, insert it once for every decoding error.
pub(crate) const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";

/// Describes why `decode_utf8` could not return its whole input as a `&str`.
///
/// Both variants carry the leading part of the input that did decode, so a
/// caller can use that text before dealing with the problem bytes.
#[derive(Debug, Copy, Clone)]
pub(crate) enum DecodeError<'a> {
    /// The input contains bytes that are not valid UTF-8.
    ///
    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, then call
    /// `decode_utf8()` again on the input that follows `invalid_sequence`.
    Invalid {
        /// Leading part of the input that is valid UTF-8 (may be empty).
        valid_prefix: &'a str,
        /// The offending bytes, located immediately after `valid_prefix`.
        invalid_sequence: &'a [u8],
    },

    /// The input ends partway through a code point.
    ///
    /// Call the `incomplete_suffix.try_to_complete_codepoint` method with more input when available.
    /// If no more input is available, this is an invalid byte sequence.
    Incomplete {
        /// Leading part of the input that is valid UTF-8 (may be empty).
        valid_prefix: &'a str,
        /// Copy of the trailing bytes that start, but do not finish, a code point.
        incomplete_suffix: IncompleteUtf8,
    },
}

/// The first bytes of a code point whose remaining bytes have not arrived yet.
///
/// `decode_utf8` produces one of these when its input ends in the middle of a
/// code point; feed it the next chunk of input through
/// `try_to_complete_codepoint` or `try_complete`.
#[derive(Debug, Copy, Clone)]
pub struct IncompleteUtf8 {
    /// Storage for the pending bytes; only the first `buffer_len` bytes are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes at the start of `buffer` that are currently in use.
    pub buffer_len: u8,
}

/// Decodes `input` as UTF-8, reporting how far decoding got when it fails.
///
/// Returns the whole input as a `&str` when every byte is valid. Otherwise the
/// error carries the valid leading text together with either the offending
/// bytes (`DecodeError::Invalid`) or the unfinished trailing code point
/// (`DecodeError::Incomplete`).
pub(crate) fn decode_utf8(input: &[u8]) -> Result<&str, DecodeError<'_>> {
    match str::from_utf8(input) {
        Ok(text) => Ok(text),
        Err(utf8_error) => Err(classify_utf8_error(input, utf8_error)),
    }
}

/// Translates the standard library's UTF-8 error for `input` into a `DecodeError`.
///
/// Kept apart from `decode_utf8` so that the success path stays small.
fn classify_utf8_error<'a>(input: &'a [u8], utf8_error: str::Utf8Error) -> DecodeError<'a> {
    let (prefix_bytes, rest) = input.split_at(utf8_error.valid_up_to());
    // SAFETY: `valid_up_to()` is the length of the leading part of `input`
    // that `str::from_utf8` has just verified to be valid UTF-8.
    let valid_prefix = unsafe { str::from_utf8_unchecked(prefix_bytes) };

    if let Some(bad_len) = utf8_error.error_len() {
        DecodeError::Invalid {
            valid_prefix,
            invalid_sequence: &rest[..bad_len],
        }
    } else {
        // No error length: the input simply stopped in the middle of a code point.
        DecodeError::Incomplete {
            valid_prefix,
            incomplete_suffix: IncompleteUtf8::new(rest),
        }
    }
}

/// Outcome of `IncompleteUtf8::try_complete_offsets`.
enum Utf8CompletionResult {
    /// The buffered bytes plus the available input still end partway through a code point.
    NotEnoughInput,
    /// The buffer starts with a byte sequence that is not valid UTF-8.
    MalformedUtf8Buffer,
    /// The buffer now starts with valid UTF-8 text.
    Valid,
}

impl IncompleteUtf8 {
    /// Stores a copy of `bytes`, the start of a code point that is not complete yet.
    ///
    /// Panics if `bytes` does not fit in the four-byte buffer.
    fn new(bytes: &[u8]) -> Self {
        let mut pending = Self {
            buffer: [0; 4],
            buffer_len: bytes.len() as u8,
        };
        pending.buffer[..bytes.len()].copy_from_slice(bytes);
        pending
    }

    /// Returns the bytes currently held and marks the buffer as empty.
    fn take_buffer(&mut self) -> &[u8] {
        let filled = usize::from(std::mem::take(&mut self.buffer_len));
        &self.buffer[..filled]
    }

    /// Tops up the buffer with bytes from `input` and classifies what it now holds.
    ///
    /// Returns how many bytes of `input` were used, together with the
    /// classification. On return `buffer_len` covers exactly the bytes the
    /// classification refers to: the valid text, the malformed sequence, or
    /// everything gathered so far when more input is still needed.
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Utf8CompletionResult) {
        let already_buffered = usize::from(self.buffer_len);

        // Fill as much of the free space as the input allows.
        let free_space = &mut self.buffer[already_buffered..];
        let borrowed = cmp::min(free_space.len(), input.len());
        free_space[..borrowed].copy_from_slice(&input[..borrowed]);

        let candidate = &self.buffer[..already_buffered + borrowed];
        let (keep, consumed, outcome) = match str::from_utf8(candidate) {
            // Everything gathered so far is valid text.
            Ok(_) => (candidate.len(), borrowed, Utf8CompletionResult::Valid),
            Err(error) => match (error.valid_up_to(), error.error_len()) {
                // Still only the beginning of a code point.
                (0, None) => (
                    candidate.len(),
                    borrowed,
                    Utf8CompletionResult::NotEnoughInput,
                ),
                // The buffer begins with bytes that are not valid UTF-8.
                (0, Some(bad_len)) => (
                    bad_len,
                    bad_len.checked_sub(already_buffered).unwrap(),
                    Utf8CompletionResult::MalformedUtf8Buffer,
                ),
                // Valid text followed by something else: keep only the valid part.
                (valid_len, _) => (
                    valid_len,
                    valid_len.checked_sub(already_buffered).unwrap(),
                    Utf8CompletionResult::Valid,
                ),
            },
        };

        self.buffer_len = keep as u8;
        (consumed, outcome)
    }

    /// Tries to finish the pending code point using the bytes at the start of `input`.
    ///
    /// Returns `None` when `input` does not contain enough bytes to decide. The
    /// bytes taken from `input` are then kept in the internal buffer, so the next
    /// call should be given the input that follows.
    ///
    /// Otherwise returns the decoded text (`Ok`) or the malformed bytes (`Err`),
    /// paired with the part of `input` that was not used. The buffer is empty
    /// afterwards.
    #[allow(clippy::type_complexity)]
    pub fn try_to_complete_codepoint<'input>(
        &mut self,
        input: &'input [u8],
    ) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
        let (consumed, outcome) = self.try_complete_offsets(input);
        let rest = &input[consumed..];

        match outcome {
            Utf8CompletionResult::NotEnoughInput => None,
            Utf8CompletionResult::MalformedUtf8Buffer => Some((Err(self.take_buffer()), rest)),
            Utf8CompletionResult::Valid => {
                let bytes = self.take_buffer();
                // SAFETY: `try_complete_offsets` reported `Valid`, which it only does
                // after `str::from_utf8` accepted exactly the bytes `buffer_len` covers.
                Some((Ok(unsafe { str::from_utf8_unchecked(bytes) }), rest))
            },
        }
    }

    /// Tendril flavour of `try_to_complete_codepoint`.
    ///
    /// When the pending code point can be resolved, `push_utf8` receives either
    /// the decoded text or U+FFFD (for a malformed sequence), and the unused
    /// remainder of `input` is returned in `Ok`.
    ///
    /// Returns `Err(())` when `input` is too short to decide; nothing is pushed.
    pub fn try_complete<A, F>(
        &mut self,
        mut input: Tendril<fmt::Bytes, A>,
        mut push_utf8: F,
    ) -> Result<Tendril<fmt::Bytes, A>, ()>
    where
        A: Atomicity,
        F: FnMut(Tendril<fmt::UTF8, A>),
    {
        match self.try_to_complete_codepoint(&input) {
            // Not enough input to complete codepoint
            None => Err(()),
            Some((decoded, leftover)) => {
                let text = match decoded {
                    Ok(valid) => valid,
                    Err(_) => REPLACEMENT_CHARACTER,
                };
                push_utf8(Tendril::from_slice(text));

                // Drop the bytes that were used from the front of `input`.
                let used = input.len() - leftover.len();
                input.pop_front(used as u32);
                Ok(input)
            },
        }
    }
}

impl<A> Tendril<fmt::Bytes, A>
where
    A: Atomicity,
{
    /// Decodes this byte tendril as UTF-8, substituting U+FFFD for invalid sequences.
    ///
    /// `push_utf8` is called, in order, with every run of valid text and with a
    /// replacement character for every invalid byte sequence.
    ///
    /// Returns `Some` with the pending bytes when the tendril ends partway
    /// through a code point, and `None` once everything has been pushed.
    pub fn decode_utf8_lossy<F>(mut self, mut push_utf8: F) -> Option<IncompleteUtf8>
    where
        F: FnMut(Tendril<fmt::UTF8, A>),
    {
        while !self.is_empty() {
            // Boil the decode result down to plain offsets first, so that nothing
            // borrows `self` when it is sliced or consumed below.
            let failure = match decode_utf8(&self) {
                Ok(text) => {
                    debug_assert!(text.as_ptr() == self.as_ptr());
                    debug_assert!(text.len() == self.len());
                    None
                },
                Err(DecodeError::Invalid {
                    valid_prefix,
                    invalid_sequence,
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
                    debug_assert!(valid_prefix.len() <= self.len());
                    let prefix_len = valid_prefix.len();
                    Some((prefix_len, Err(prefix_len + invalid_sequence.len())))
                },
                Err(DecodeError::Incomplete {
                    valid_prefix,
                    incomplete_suffix,
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
                    debug_assert!(valid_prefix.len() <= self.len());
                    Some((valid_prefix.len(), Ok(incomplete_suffix)))
                },
            };

            let Some((valid_len, next_step)) = failure else {
                // SAFETY: `decode_utf8` accepted the entire content as UTF-8.
                unsafe { push_utf8(self.reinterpret_without_validating()) }
                return None;
            };

            if valid_len > 0 {
                let valid_part = self.subtendril(0, valid_len as u32);
                // SAFETY: `decode_utf8` reported the first `valid_len` bytes as valid UTF-8.
                unsafe { push_utf8(valid_part.reinterpret_without_validating()) }
            }

            match next_step {
                // The tendril ended mid code point: hand the pending bytes to the caller.
                Ok(incomplete) => return Some(incomplete),
                // Replace the invalid sequence and continue right after it.
                Err(resume_at) => {
                    push_utf8(Tendril::from_slice(REPLACEMENT_CHARACTER));
                    self.pop_front(resume_at as u32);
                },
            }
        }

        None
    }
}

Coverage Report

Created: 2026-03-31 06:51

Line	Count	Source
1		// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2		// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3		// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4		// option. This file may not be copied, modified, or distributed
5		// except according to those terms.
6
7		use crate::fmt;
8		use crate::{Atomicity, Tendril};
9
10		use std::cmp;
11		use std::str;
12
13		/// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error.
14		pub(crate) const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
15
16		#[derive(Debug, Copy, Clone)]
17		pub(crate) enum DecodeError<'a> {
18		/// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
19		/// then call `decode()` again with `remaining_input`.
20		Invalid {
21		valid_prefix: &'a str,
22		invalid_sequence: &'a [u8],
23		},
24
25		/// Call the `incomplete_suffix.try_to_complete_codepoint` method with more input when available.
26		/// If no more input is available, this is an invalid byte sequence.
27		Incomplete {
28		valid_prefix: &'a str,
29		incomplete_suffix: IncompleteUtf8,
30		},
31		}
32
33		#[derive(Debug, Copy, Clone)]
34		pub struct IncompleteUtf8 {
35		pub buffer: [u8; 4],
36		pub buffer_len: u8,
37		}
38
39	29.9M	pub(crate) fn decode_utf8(input: &[u8]) -> Result<&str, DecodeError<'_>> {
40	29.9M	let error = match str::from_utf8(input) {
41	28.2k	Ok(valid) => return Ok(valid),
42	29.9M	Err(error) => error,
43		};
44
45		// FIXME: separate function from here to guide inlining?
46	29.9M	let (valid, after_valid) = input.split_at(error.valid_up_to());
47	29.9M	let valid = unsafe { str::from_utf8_unchecked(valid) };
48
49	29.9M	match error.error_len() {
50	29.9M	Some(invalid_sequence_length) => {
51	29.9M	let invalid = &after_valid[..invalid_sequence_length];
52	29.9M	Err(DecodeError::Invalid {
53	29.9M	valid_prefix: valid,
54	29.9M	invalid_sequence: invalid,
55	29.9M	})
56		},
57	4.67k	None => Err(DecodeError::Incomplete {
58	4.67k	valid_prefix: valid,
59	4.67k	incomplete_suffix: IncompleteUtf8::new(after_valid),
60	4.67k	}),
61		}
62	29.9M	}
63
64		enum Utf8CompletionResult {
65		NotEnoughInput,
66		MalformedUtf8Buffer,
67		Valid,
68		}
69
70		impl IncompleteUtf8 {
71	4.67k	fn new(bytes: &[u8]) -> Self {
72	4.67k	let mut buffer = [0, 0, 0, 0];
73	4.67k	let len = bytes.len();
74	4.67k	buffer[..len].copy_from_slice(bytes);
75
76	4.67k	Self {
77	4.67k	buffer,
78	4.67k	buffer_len: len as u8,
79	4.67k	}
80	4.67k	}
81
82	3.81k	fn take_buffer(&mut self) -> &[u8] {
83	3.81k	let len = self.buffer_len as usize;
84	3.81k	self.buffer_len = 0;
85	3.81k	&self.buffer[..len]
86	3.81k	}
87
88		/// Consumes bytes from the input and attempts to form a valid utf8 codepoint.
89		///
90		/// Returns how many bytes were consumed and whether a valid code point was found.
91	3.81k	fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Utf8CompletionResult) {
92	3.81k	let initial_buffer_len = self.buffer_len as usize;
93		let copied_from_input;
94	3.81k	{
95	3.81k	let unwritten = &mut self.buffer[initial_buffer_len..];
96	3.81k	copied_from_input = cmp::min(unwritten.len(), input.len());
97	3.81k	unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
98	3.81k	}
99	3.81k	let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
100	3.81k	match str::from_utf8(spliced) {
101		Ok(_) => {
102	385	self.buffer_len = spliced.len() as u8;
103	385	(copied_from_input, Utf8CompletionResult::Valid)
104		},
105	3.43k	Err(error) => {
106	3.43k	let valid_up_to = error.valid_up_to();
107	3.43k	if valid_up_to > 0 {
108	670	let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
109	670	self.buffer_len = valid_up_to as u8;
110	670	(consumed, Utf8CompletionResult::Valid)
111		} else {
112	2.76k	match error.error_len() {
113	2.76k	Some(invalid_sequence_length) => {
114	2.76k	let consumed = invalid_sequence_length
115	2.76k	.checked_sub(initial_buffer_len)
116	2.76k	.unwrap();
117	2.76k	self.buffer_len = invalid_sequence_length as u8;
118	2.76k	(consumed, Utf8CompletionResult::MalformedUtf8Buffer)
119		},
120		None => {
121	2	self.buffer_len = spliced.len() as u8;
122	2	(copied_from_input, Utf8CompletionResult::NotEnoughInput)
123		},
124		}
125		}
126		},
127		}
128	3.81k	}
129
130		/// Attempts to complete the codepoint given the bytes from `input`.
131		///
132		/// Returns `None` if more input is required to complete the codepoint. In this case, no
133		/// input is consumed.
134		///
135		/// Otherwise, returns either the decoded `&str` or malformed `&[u8]` and the remaining input.
136		#[allow(clippy::type_complexity)]
137	3.81k	pub fn try_to_complete_codepoint<'input>(
138	3.81k	&mut self,
139	3.81k	input: &'input [u8],
140	3.81k	) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
141	3.81k	let (consumed, completion_result) = self.try_complete_offsets(input);
142	3.81k	let result = match completion_result {
143	2	Utf8CompletionResult::NotEnoughInput => return None,
144	2.76k	Utf8CompletionResult::MalformedUtf8Buffer => Err(self.take_buffer()),
145		Utf8CompletionResult::Valid => {
146	1.05k	Ok(unsafe { str::from_utf8_unchecked(self.take_buffer()) })
147		},
148		};
149	3.81k	let remaining_input = &input[consumed..];
150
151	3.81k	Some((result, remaining_input))
152	3.81k	}
153
154	0	pub fn try_complete<A, F>(
155	0	&mut self,
156	0	mut input: Tendril<fmt::Bytes, A>,
157	0	mut push_utf8: F,
158	0	) -> Result<Tendril<fmt::Bytes, A>, ()>
159	0	where
160	0	A: Atomicity,
161	0	F: FnMut(Tendril<fmt::UTF8, A>),
162		{
163	0	let Some((result, remaining_input)) = self.try_to_complete_codepoint(&input) else {
164		// Not enough input to complete codepoint
165	0	return Err(());
166		};
167
168	0	push_utf8(Tendril::from_slice(result.unwrap_or(REPLACEMENT_CHARACTER)));
169	0	let resume_at = input.len() - remaining_input.len();
170	0	input.pop_front(resume_at as u32);
171	0	Ok(input)
172	0	}
173		}
174
175		impl<A> Tendril<fmt::Bytes, A>
176		where
177		A: Atomicity,
178		{
179	0	pub fn decode_utf8_lossy<F>(mut self, mut push_utf8: F) -> Option<IncompleteUtf8>
180	0	where
181	0	F: FnMut(Tendril<fmt::UTF8, A>),
182		{
183		loop {
184	0	if self.is_empty() {
185	0	return None;
186	0	}
187	0	let unborrowed_result = match decode_utf8(&self) {
188	0	Ok(string) => {
189	0	debug_assert!(string.as_ptr() == self.as_ptr());
190	0	debug_assert!(string.len() == self.len());
191	0	Ok(())
192		},
193		Err(DecodeError::Invalid {
194	0	valid_prefix,
195	0	invalid_sequence,
196		..
197		}) => {
198	0	debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
199	0	debug_assert!(valid_prefix.len() <= self.len());
200	0	Err((
201	0	valid_prefix.len(),
202	0	Err(valid_prefix.len() + invalid_sequence.len()),
203	0	))
204		},
205		Err(DecodeError::Incomplete {
206	0	valid_prefix,
207	0	incomplete_suffix,
208		}) => {
209	0	debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
210	0	debug_assert!(valid_prefix.len() <= self.len());
211	0	Err((valid_prefix.len(), Ok(incomplete_suffix)))
212		},
213		};
214	0	match unborrowed_result {
215		Ok(()) => {
216	0	unsafe { push_utf8(self.reinterpret_without_validating()) }
217	0	return None;
218		},
219	0	Err((valid_len, and_then)) => {
220	0	if valid_len > 0 {
221	0	let subtendril = self.subtendril(0, valid_len as u32);
222	0	unsafe { push_utf8(subtendril.reinterpret_without_validating()) }
223	0	}
224	0	match and_then {
225	0	Ok(incomplete) => return Some(incomplete),
226	0	Err(offset) => {
227	0	push_utf8(Tendril::from_slice(REPLACEMENT_CHARACTER));
228	0	self.pop_front(offset as u32)
229		},
230		}
231		},
232		}
233		}
234	0	}
235		}