/src/html5ever/tendril/src/utf8_decode.rs
Line | Count | Source |
1 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
2 | | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
3 | | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
4 | | // option. This file may not be copied, modified, or distributed |
5 | | // except according to those terms. |
6 | | |
7 | | use crate::fmt; |
8 | | use crate::{Atomicity, Tendril}; |
9 | | |
10 | | use std::cmp; |
11 | | use std::str; |
12 | | |
/// U+FFFD REPLACEMENT CHARACTER as a string slice.
///
/// Lossy decoding emits this once for every decoding error it encounters.
pub(crate) const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
15 | | |
16 | | #[derive(Debug, Copy, Clone)] |
17 | | pub(crate) enum DecodeError<'a> { |
18 | | /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, |
19 | | /// then call `decode()` again with `remaining_input`. |
20 | | Invalid { |
21 | | valid_prefix: &'a str, |
22 | | invalid_sequence: &'a [u8], |
23 | | }, |
24 | | |
25 | | /// Call the `incomplete_suffix.try_to_complete_codepoint` method with more input when available. |
26 | | /// If no more input is available, this is an invalid byte sequence. |
27 | | Incomplete { |
28 | | valid_prefix: &'a str, |
29 | | incomplete_suffix: IncompleteUtf8, |
30 | | }, |
31 | | } |
32 | | |
/// Small fixed-size buffer holding the leading bytes of a code point whose
/// remaining bytes have not been seen yet.
#[derive(Clone, Copy, Debug)]
pub struct IncompleteUtf8 {
    /// Backing storage; only the first `buffer_len` bytes are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes of `buffer` currently in use.
    pub buffer_len: u8,
}
38 | | |
39 | 29.9M | pub(crate) fn decode_utf8(input: &[u8]) -> Result<&str, DecodeError<'_>> { |
40 | 29.9M | let error = match str::from_utf8(input) { |
41 | 28.2k | Ok(valid) => return Ok(valid), |
42 | 29.9M | Err(error) => error, |
43 | | }; |
44 | | |
45 | | // FIXME: separate function from here to guide inlining? |
46 | 29.9M | let (valid, after_valid) = input.split_at(error.valid_up_to()); |
47 | 29.9M | let valid = unsafe { str::from_utf8_unchecked(valid) }; |
48 | | |
49 | 29.9M | match error.error_len() { |
50 | 29.9M | Some(invalid_sequence_length) => { |
51 | 29.9M | let invalid = &after_valid[..invalid_sequence_length]; |
52 | 29.9M | Err(DecodeError::Invalid { |
53 | 29.9M | valid_prefix: valid, |
54 | 29.9M | invalid_sequence: invalid, |
55 | 29.9M | }) |
56 | | }, |
57 | 4.67k | None => Err(DecodeError::Incomplete { |
58 | 4.67k | valid_prefix: valid, |
59 | 4.67k | incomplete_suffix: IncompleteUtf8::new(after_valid), |
60 | 4.67k | }), |
61 | | } |
62 | 29.9M | } |
63 | | |
/// Outcome of trying to finish a buffered, partially received code point.
enum Utf8CompletionResult {
    /// The buffered bytes are still only a prefix of a code point.
    NotEnoughInput,
    /// The buffer now starts with bytes that can never be valid UTF-8.
    MalformedUtf8Buffer,
    /// The buffer now starts with valid UTF-8.
    Valid,
}
69 | | |
70 | | impl IncompleteUtf8 { |
71 | 4.67k | fn new(bytes: &[u8]) -> Self { |
72 | 4.67k | let mut buffer = [0, 0, 0, 0]; |
73 | 4.67k | let len = bytes.len(); |
74 | 4.67k | buffer[..len].copy_from_slice(bytes); |
75 | | |
76 | 4.67k | Self { |
77 | 4.67k | buffer, |
78 | 4.67k | buffer_len: len as u8, |
79 | 4.67k | } |
80 | 4.67k | } |
81 | | |
82 | 3.81k | fn take_buffer(&mut self) -> &[u8] { |
83 | 3.81k | let len = self.buffer_len as usize; |
84 | 3.81k | self.buffer_len = 0; |
85 | 3.81k | &self.buffer[..len] |
86 | 3.81k | } |
87 | | |
88 | | /// Consumes bytes from the input and attempts to form a valid utf8 codepoint. |
89 | | /// |
90 | | /// Returns how many bytes were consumed and whether a valid code point was found. |
91 | 3.81k | fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Utf8CompletionResult) { |
92 | 3.81k | let initial_buffer_len = self.buffer_len as usize; |
93 | | let copied_from_input; |
94 | 3.81k | { |
95 | 3.81k | let unwritten = &mut self.buffer[initial_buffer_len..]; |
96 | 3.81k | copied_from_input = cmp::min(unwritten.len(), input.len()); |
97 | 3.81k | unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); |
98 | 3.81k | } |
99 | 3.81k | let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; |
100 | 3.81k | match str::from_utf8(spliced) { |
101 | | Ok(_) => { |
102 | 385 | self.buffer_len = spliced.len() as u8; |
103 | 385 | (copied_from_input, Utf8CompletionResult::Valid) |
104 | | }, |
105 | 3.43k | Err(error) => { |
106 | 3.43k | let valid_up_to = error.valid_up_to(); |
107 | 3.43k | if valid_up_to > 0 { |
108 | 670 | let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); |
109 | 670 | self.buffer_len = valid_up_to as u8; |
110 | 670 | (consumed, Utf8CompletionResult::Valid) |
111 | | } else { |
112 | 2.76k | match error.error_len() { |
113 | 2.76k | Some(invalid_sequence_length) => { |
114 | 2.76k | let consumed = invalid_sequence_length |
115 | 2.76k | .checked_sub(initial_buffer_len) |
116 | 2.76k | .unwrap(); |
117 | 2.76k | self.buffer_len = invalid_sequence_length as u8; |
118 | 2.76k | (consumed, Utf8CompletionResult::MalformedUtf8Buffer) |
119 | | }, |
120 | | None => { |
121 | 2 | self.buffer_len = spliced.len() as u8; |
122 | 2 | (copied_from_input, Utf8CompletionResult::NotEnoughInput) |
123 | | }, |
124 | | } |
125 | | } |
126 | | }, |
127 | | } |
128 | 3.81k | } |
129 | | |
130 | | /// Attempts to complete the codepoint given the bytes from `input`. |
131 | | /// |
132 | | /// Returns `None` if more input is required to complete the codepoint. In this case, no |
133 | | /// input is consumed. |
134 | | /// |
135 | | /// Otherwise, returns either the decoded `&str` or malformed `&[u8]` and the remaining input. |
136 | | #[allow(clippy::type_complexity)] |
137 | 3.81k | pub fn try_to_complete_codepoint<'input>( |
138 | 3.81k | &mut self, |
139 | 3.81k | input: &'input [u8], |
140 | 3.81k | ) -> Option<(Result<&str, &[u8]>, &'input [u8])> { |
141 | 3.81k | let (consumed, completion_result) = self.try_complete_offsets(input); |
142 | 3.81k | let result = match completion_result { |
143 | 2 | Utf8CompletionResult::NotEnoughInput => return None, |
144 | 2.76k | Utf8CompletionResult::MalformedUtf8Buffer => Err(self.take_buffer()), |
145 | | Utf8CompletionResult::Valid => { |
146 | 1.05k | Ok(unsafe { str::from_utf8_unchecked(self.take_buffer()) }) |
147 | | }, |
148 | | }; |
149 | 3.81k | let remaining_input = &input[consumed..]; |
150 | | |
151 | 3.81k | Some((result, remaining_input)) |
152 | 3.81k | } |
153 | | |
154 | 0 | pub fn try_complete<A, F>( |
155 | 0 | &mut self, |
156 | 0 | mut input: Tendril<fmt::Bytes, A>, |
157 | 0 | mut push_utf8: F, |
158 | 0 | ) -> Result<Tendril<fmt::Bytes, A>, ()> |
159 | 0 | where |
160 | 0 | A: Atomicity, |
161 | 0 | F: FnMut(Tendril<fmt::UTF8, A>), |
162 | | { |
163 | 0 | let Some((result, remaining_input)) = self.try_to_complete_codepoint(&input) else { |
164 | | // Not enough input to complete codepoint |
165 | 0 | return Err(()); |
166 | | }; |
167 | | |
168 | 0 | push_utf8(Tendril::from_slice(result.unwrap_or(REPLACEMENT_CHARACTER))); |
169 | 0 | let resume_at = input.len() - remaining_input.len(); |
170 | 0 | input.pop_front(resume_at as u32); |
171 | 0 | Ok(input) |
172 | 0 | } |
173 | | } |
174 | | |
175 | | impl<A> Tendril<fmt::Bytes, A> |
176 | | where |
177 | | A: Atomicity, |
178 | | { |
179 | 0 | pub fn decode_utf8_lossy<F>(mut self, mut push_utf8: F) -> Option<IncompleteUtf8> |
180 | 0 | where |
181 | 0 | F: FnMut(Tendril<fmt::UTF8, A>), |
182 | | { |
183 | | loop { |
184 | 0 | if self.is_empty() { |
185 | 0 | return None; |
186 | 0 | } |
187 | 0 | let unborrowed_result = match decode_utf8(&self) { |
188 | 0 | Ok(string) => { |
189 | 0 | debug_assert!(string.as_ptr() == self.as_ptr()); |
190 | 0 | debug_assert!(string.len() == self.len()); |
191 | 0 | Ok(()) |
192 | | }, |
193 | | Err(DecodeError::Invalid { |
194 | 0 | valid_prefix, |
195 | 0 | invalid_sequence, |
196 | | .. |
197 | | }) => { |
198 | 0 | debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); |
199 | 0 | debug_assert!(valid_prefix.len() <= self.len()); |
200 | 0 | Err(( |
201 | 0 | valid_prefix.len(), |
202 | 0 | Err(valid_prefix.len() + invalid_sequence.len()), |
203 | 0 | )) |
204 | | }, |
205 | | Err(DecodeError::Incomplete { |
206 | 0 | valid_prefix, |
207 | 0 | incomplete_suffix, |
208 | | }) => { |
209 | 0 | debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); |
210 | 0 | debug_assert!(valid_prefix.len() <= self.len()); |
211 | 0 | Err((valid_prefix.len(), Ok(incomplete_suffix))) |
212 | | }, |
213 | | }; |
214 | 0 | match unborrowed_result { |
215 | | Ok(()) => { |
216 | 0 | unsafe { push_utf8(self.reinterpret_without_validating()) } |
217 | 0 | return None; |
218 | | }, |
219 | 0 | Err((valid_len, and_then)) => { |
220 | 0 | if valid_len > 0 { |
221 | 0 | let subtendril = self.subtendril(0, valid_len as u32); |
222 | 0 | unsafe { push_utf8(subtendril.reinterpret_without_validating()) } |
223 | 0 | } |
224 | 0 | match and_then { |
225 | 0 | Ok(incomplete) => return Some(incomplete), |
226 | 0 | Err(offset) => { |
227 | 0 | push_utf8(Tendril::from_slice(REPLACEMENT_CHARACTER)); |
228 | 0 | self.pop_front(offset as u32) |
229 | | }, |
230 | | } |
231 | | }, |
232 | | } |
233 | | } |
234 | 0 | } |
235 | | } |