/src/quick-xml/src/reader/state.rs
Line | Count | Source |
1 | | #[cfg(feature = "encoding")] |
2 | | use encoding_rs::UTF_8; |
3 | | |
4 | | use crate::encoding::Decoder; |
5 | | use crate::errors::{Error, IllFormedError, Result}; |
6 | | use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event}; |
7 | | use crate::parser::{Parser, PiParser}; |
8 | | #[cfg(feature = "encoding")] |
9 | | use crate::reader::EncodingRef; |
10 | | use crate::reader::{BangType, Config, DtdParser, ParseState}; |
11 | | use crate::utils::{is_whitespace, name_len}; |
12 | | |
13 | | /// A struct that holds a current reader state and a parser configuration. |
14 | | /// It is independent of the way the data is read: the reader feeds data into it and |
15 | | /// gets back the produced [`Event`]s. |
16 | | #[derive(Clone, Debug)] |
17 | | pub(super) struct ReaderState { |
18 | | /// Number of bytes read from the source of data since the reader was created |
19 | | pub offset: u64, |
20 | | /// A snapshot of the `offset` of the last error returned. It can be less than |
21 | | /// `offset`, because some errors are more conveniently reported at an earlier position, |
22 | | /// and changing `offset` itself is not possible, because `Error::IllFormed` errors |
23 | | /// are recoverable. |
24 | | pub last_error_offset: u64, |
25 | | /// Defines how to process next byte |
26 | | pub state: ParseState, |
27 | | /// User-defined settings that affect parsing |
28 | | pub config: Config, |
29 | | /// All currently Started elements which didn't have a matching |
30 | | /// End element yet. |
31 | | /// |
32 | | /// For an XML |
33 | | /// |
34 | | /// ```xml |
35 | | /// <root><one/><inner attr="value">|<tag></inner></root> |
36 | | /// ``` |
37 | | /// when the cursor is at the `|` position, the buffer contains: |
38 | | /// |
39 | | /// ```text |
40 | | /// rootinner |
41 | | /// ^ ^ |
42 | | /// ``` |
43 | | /// |
44 | | /// The `^` symbols show which positions are stored in [`Self::opened_starts`] |
45 | | /// (0 and 4 in that case). |
46 | | opened_buffer: Vec<u8>, |
47 | | /// Opened name start indexes into [`Self::opened_buffer`]. See documentation |
48 | | /// for that field for details |
49 | | opened_starts: Vec<usize>, |
50 | | |
51 | | #[cfg(feature = "encoding")] |
52 | | /// Reference to the encoding used to read an XML |
53 | | pub encoding: EncodingRef, |
54 | | } |
55 | | |
56 | | impl ReaderState { |
57 | | /// Trims trailing whitespace from `bytes`, if required, and returns a text event. |
58 | | /// |
59 | | /// # Parameters |
60 | | /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<` |
61 | 548k | pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> { |
62 | 548k | let mut content = bytes; |
63 | | |
64 | 548k | if self.config.trim_text_end { |
65 | | // Skip the ending '<' |
66 | 233k | let len = bytes |
67 | 233k | .iter() |
68 | 1.52M | .rposition(|&b| !is_whitespace(b)) |
69 | 233k | .map_or(0, |p| p + 1); |
70 | 233k | content = &bytes[..len]; |
71 | 314k | } |
72 | 548k | BytesText::wrap(content, self.decoder()) |
73 | 548k | } |
74 | | |
75 | | /// Returns `Comment`, `CData` or `DocType` event. |
76 | | /// |
77 | | /// `buf` contains data between `<` and `>`: |
78 | | /// - CDATA: `![CDATA[...]]` |
79 | | /// - Comment: `!--...--` |
80 | | /// - Doctype (uppercase): `!D...` |
81 | | /// - Doctype (lowercase): `!d...` |
82 | 73.4k | pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> { |
83 | 73.4k | debug_assert_eq!( |
84 | 0 | buf.first(), |
85 | | Some(&b'!'), |
86 | 0 | "CDATA, comment or DOCTYPE should start from '!'" |
87 | | ); |
88 | | |
89 | 73.4k | let uncased_starts_with = |string: &[u8], prefix: &[u8]| { |
90 | 4.75k | string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) |
91 | 4.75k | }; |
92 | | |
93 | 73.4k | let len = buf.len(); |
94 | 60.9k | match bang_type { |
95 | 7.76k | BangType::Comment if buf.starts_with(b"!--") => { |
96 | 7.70k | debug_assert!(buf.ends_with(b"--")); |
97 | 7.70k | if self.config.check_comments { |
98 | | // check that `--` does not occur inside the comment |
99 | 4.62k | let mut haystack = &buf[3..len - 2]; |
100 | 4.62k | let mut off = 0; |
101 | 9.86k | while let Some(p) = memchr::memchr(b'-', haystack) { |
102 | 5.28k | off += p + 1; |
103 | | // if next byte after `-` is also `-`, return an error |
104 | 5.28k | if buf[3 + off] == b'-' { |
105 | | // Explanation of the magic: |
106 | | // |
107 | | // - `self.offset` is just after `>`, |
108 | | // - `buf` contains `!-- con--tent --` |
109 | | // - `p` is counted from byte after `<!--` |
110 | | // |
111 | | // <!-- con--tent -->: |
112 | | // ~~~~~~~~~~~~~~~~ : - buf |
113 | | // : =========== : - zone of search (possible values of `p`) |
114 | | // : |---p : - p is counted from | (| is 0) |
115 | | // : : : ^ - self.offset |
116 | | // ^ : : - self.offset - len |
117 | | // ^ : - self.offset - len + 2 |
118 | | // ^ - self.offset - len + 2 + p |
119 | 44 | self.last_error_offset = self.offset - len as u64 + 2 + p as u64; |
120 | 44 | return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); |
121 | 5.24k | } |
122 | | // Continue search after single `-` (+1 to skip it) |
123 | 5.24k | haystack = &haystack[p + 1..]; |
124 | | } |
125 | 3.08k | } |
126 | 7.66k | Ok(Event::Comment(BytesText::wrap( |
127 | 7.66k | // Cut off `!--` and `--` from start and end |
128 | 7.66k | &buf[3..len - 2], |
129 | 7.66k | self.decoder(), |
130 | 7.66k | ))) |
131 | | } |
132 | | // XML requires uppercase only: |
133 | | // https://www.w3.org/TR/xml11/#sec-cdata-sect |
134 | | // Even HTML5 required uppercase only: |
135 | | // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state |
136 | 60.9k | BangType::CData if buf.starts_with(b"![CDATA[") => { |
137 | 60.6k | debug_assert!(buf.ends_with(b"]]")); |
138 | 60.6k | Ok(Event::CData(BytesCData::wrap( |
139 | 60.6k | // Cut off `![CDATA[` and `]]` from start and end |
140 | 60.6k | &buf[8..len - 2], |
141 | 60.6k | self.decoder(), |
142 | 60.6k | ))) |
143 | | } |
144 | | // XML requires uppercase only, but we will check that on validation stage: |
145 | | // https://www.w3.org/TR/xml11/#sec-prolog-dtd |
146 | | // HTML5 allows mixed case for doctype declarations: |
147 | | // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state |
148 | 4.75k | BangType::DocType(DtdParser::Finished) if uncased_starts_with(buf, b"!DOCTYPE") => { |
149 | 17.0k | match buf[8..].iter().position(|&b| !is_whitespace(b)) { |
150 | 4.07k | Some(start) => Ok(Event::DocType(BytesText::wrap( |
151 | 4.07k | // Cut off `!DOCTYPE` and any amount of whitespace from start |
152 | 4.07k | &buf[8 + start..], |
153 | 4.07k | self.decoder(), |
154 | 4.07k | ))), |
155 | | None => { |
156 | | // Because we are here, we have read at least `<!DOCTYPE>` and the offset is after `>`. |
157 | | // We want to report the error at the place where the name is expected - this is just |
158 | | // before `>` |
159 | 128 | self.last_error_offset = self.offset - 1; |
160 | 128 | Err(Error::IllFormed(IllFormedError::MissingDoctypeName)) |
161 | | } |
162 | | } |
163 | | } |
164 | | _ => { |
165 | | // <!....> |
166 | | // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. |
167 | | // ^------- We report error at that position, so we need to subtract 2 and buf len |
168 | 899 | self.last_error_offset = self.offset - len as u64 - 2; |
169 | 899 | Err(bang_type.to_err().into()) |
170 | | } |
171 | | } |
172 | 73.4k | } |
173 | | |
174 | | /// Wraps content of `buf` into the [`Event::End`] event. Does the check that |
175 | | /// end name matches the last opened start name if `self.config.check_end_names` is set. |
176 | | /// |
177 | | /// `buf` contains data between `<` and `>`, for example `/tag`. |
178 | 20.8k | pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> { |
179 | 20.8k | debug_assert_eq!( |
180 | 0 | buf.first(), |
181 | | Some(&b'/'), |
182 | 0 | "closing tag should start from '/'" |
183 | | ); |
184 | | |
185 | | // Strip the `/` character. `content` contains data between `</` and `>` |
186 | 20.8k | let content = &buf[1..]; |
187 | | // The XML standard permits whitespace after the markup name in closing tags. |
188 | | // Let's strip them from the buffer before comparing tag names. |
189 | 20.8k | let name = if self.config.trim_markup_names_in_closing_tags { |
190 | 8.47k | if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) { |
191 | 3.26k | &content[..pos_end_name + 1] |
192 | | } else { |
193 | 5.20k | content |
194 | | } |
195 | | } else { |
196 | 12.3k | content |
197 | | }; |
198 | | |
199 | 20.8k | let decoder = self.decoder(); |
200 | | |
201 | | // Get the index in self.opened_buffer of the name of the last opened tag |
202 | 20.8k | match self.opened_starts.pop() { |
203 | 18.6k | Some(start) => { |
204 | 18.6k | if self.config.check_end_names { |
205 | 7.39k | let expected = &self.opened_buffer[start..]; |
206 | 7.39k | if name != expected { |
207 | 820 | let expected = decoder.decode(expected).unwrap_or_default().into_owned(); |
208 | | // #513: In order to allow error recovery we should drop content of the buffer |
209 | 820 | self.opened_buffer.truncate(start); |
210 | | |
211 | | // Report error at start of the end tag at `<` character |
212 | | // -2 for `<` and `>` |
213 | 820 | self.last_error_offset = self.offset - buf.len() as u64 - 2; |
214 | 820 | return Err(Error::IllFormed(IllFormedError::MismatchedEndTag { |
215 | 820 | expected, |
216 | 820 | found: decoder.decode(name).unwrap_or_default().into_owned(), |
217 | 820 | })); |
218 | 6.57k | } |
219 | 11.2k | } |
220 | | |
221 | 17.8k | self.opened_buffer.truncate(start); |
222 | | } |
223 | | None => { |
224 | 2.18k | if !self.config.allow_unmatched_ends { |
225 | | // Report error at start of the end tag at `<` character |
226 | | // -2 for `<` and `>` |
227 | 492 | self.last_error_offset = self.offset - buf.len() as u64 - 2; |
228 | 492 | return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag( |
229 | 492 | decoder.decode(name).unwrap_or_default().into_owned(), |
230 | 492 | ))); |
231 | 1.69k | } |
232 | | } |
233 | | } |
234 | | |
235 | 19.5k | Ok(Event::End(BytesEnd::wrap(name.into()))) |
236 | 20.8k | } |
237 | | |
238 | | /// `buf` contains data between `<` and `>` and the first byte is `?`. |
239 | | /// `self.offset` is already after the `>` |
240 | | /// |
241 | | /// Returns `Decl` or `PI` event |
242 | 92.0k | pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> { |
243 | 92.0k | debug_assert!(!buf.is_empty()); |
244 | 92.0k | debug_assert_eq!(buf[0], b'?'); |
245 | | |
246 | 92.0k | let len = buf.len(); |
247 | | // We accept at least <??> |
248 | | // ~~ - len = 2 |
249 | 92.0k | if len > 1 && buf[len - 1] == b'?' { |
250 | | // Cut off `?` and `?` from start and end |
251 | 91.8k | let content = &buf[1..len - 1]; |
252 | 91.8k | let len = content.len(); |
253 | | |
254 | 91.8k | if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { |
255 | 68.4k | let event = BytesDecl::from_start(BytesStart::wrap(content, 3, self.decoder())); |
256 | | |
257 | | // Try getting encoding from the declaration event |
258 | | #[cfg(feature = "encoding")] |
259 | | if self.encoding.can_be_refined() { |
260 | | if let Some(encoding) = event.encoder() { |
261 | | self.encoding = EncodingRef::XmlDetected(encoding); |
262 | | } |
263 | | } |
264 | | |
265 | 68.4k | Ok(Event::Decl(event)) |
266 | | } else { |
267 | 23.4k | Ok(Event::PI(BytesPI::wrap( |
268 | 23.4k | content, |
269 | 23.4k | name_len(content), |
270 | 23.4k | self.decoder(), |
271 | 23.4k | ))) |
272 | | } |
273 | | } else { |
274 | | // <?....> |
275 | | // ^^^^^ - `buf` does not contain `<`, but we want to report error at `<`, |
276 | | // so we move offset to it (-2 for `<` and `>`) |
277 | 117 | self.last_error_offset = self.offset - len as u64 - 2; |
278 | 117 | Err(Error::Syntax(PiParser(false).eof_error(buf))) |
279 | | } |
280 | 92.0k | } |
281 | | |
282 | | /// Converts content of a tag to a `Start` or an `Empty` event |
283 | | /// |
284 | | /// # Parameters |
285 | | /// - `content`: Content of a tag between `<` and `>` |
286 | 6.72M | pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> { |
287 | 6.72M | if let Some(content) = content.strip_suffix(b"/") { |
288 | | // This is self-closed tag `<something/>` |
289 | 296k | let event = BytesStart::wrap(content, name_len(content), self.decoder()); |
290 | | |
291 | 296k | if self.config.expand_empty_elements { |
292 | 283k | self.state = ParseState::InsideEmpty; |
293 | 283k | self.opened_starts.push(self.opened_buffer.len()); |
294 | 283k | self.opened_buffer.extend(event.name().as_ref()); |
295 | 283k | Event::Start(event) |
296 | | } else { |
297 | 13.0k | Event::Empty(event) |
298 | | } |
299 | | } else { |
300 | 6.43M | let event = BytesStart::wrap(content, name_len(content), self.decoder()); |
301 | | |
302 | | // #514: Always store names, even when .check_end_names == false, |
303 | | // because checks can be temporarily disabled and when they are |
304 | | // re-enabled, we should have that information |
305 | 6.43M | self.opened_starts.push(self.opened_buffer.len()); |
306 | 6.43M | self.opened_buffer.extend(event.name().as_ref()); |
307 | 6.43M | Event::Start(event) |
308 | | } |
309 | 6.72M | } |
310 | | |
311 | | #[inline] |
312 | 283k | pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> { |
313 | 283k | self.state = ParseState::InsideText; |
314 | 283k | let name = self |
315 | 283k | .opened_buffer |
316 | 283k | .split_off(self.opened_starts.pop().unwrap()); |
317 | 283k | BytesEnd::wrap(name.into()) |
318 | 283k | } |
<quick_xml::reader::state::ReaderState>::close_expanded_empty
Line | Count | Source |
312 | 1.93k | pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> { |
313 | 1.93k | self.state = ParseState::InsideText; |
314 | 1.93k | let name = self |
315 | 1.93k | .opened_buffer |
316 | 1.93k | .split_off(self.opened_starts.pop().unwrap()); |
317 | 1.93k | BytesEnd::wrap(name.into()) |
318 | 1.93k | } |
<quick_xml::reader::state::ReaderState>::close_expanded_empty
Line | Count | Source |
312 | 281k | pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> { |
313 | 281k | self.state = ParseState::InsideText; |
314 | 281k | let name = self |
315 | 281k | .opened_buffer |
316 | 281k | .split_off(self.opened_starts.pop().unwrap()); |
317 | 281k | BytesEnd::wrap(name.into()) |
318 | 281k | } |
319 | | |
320 | | /// Get the decoder, used to decode bytes, read by this reader, to the strings. |
321 | | /// |
322 | | /// If [`encoding`] feature is enabled, the used encoding may change after |
323 | | /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. |
324 | | /// |
325 | | /// If [`encoding`] feature is enabled and no encoding is specified in declaration, |
326 | | /// defaults to UTF-8. |
327 | | /// |
328 | | /// [`encoding`]: ../../index.html#encoding |
329 | 18.0M | pub const fn decoder(&self) -> Decoder { |
330 | 18.0M | Decoder { |
331 | 18.0M | #[cfg(feature = "encoding")] |
332 | 18.0M | encoding: self.encoding.encoding(), |
333 | 18.0M | } |
334 | 18.0M | } |
335 | | } |
336 | | |
337 | | impl Default for ReaderState { |
338 | 28.7k | fn default() -> Self { |
339 | 28.7k | Self { |
340 | 28.7k | offset: 0, |
341 | 28.7k | last_error_offset: 0, |
342 | 28.7k | state: ParseState::Init, |
343 | 28.7k | config: Config::default(), |
344 | 28.7k | opened_buffer: Vec::new(), |
345 | 28.7k | opened_starts: Vec::new(), |
346 | 28.7k | |
347 | 28.7k | #[cfg(feature = "encoding")] |
348 | 28.7k | encoding: EncodingRef::Implicit(UTF_8), |
349 | 28.7k | } |
350 | 28.7k | } |
351 | | } |