/rust/registry/src/index.crates.io-6f17d22bba15001f/quick-xml-0.29.0/src/reader/parser.rs
Line | Count | Source (jump to first uncovered line) |
1 | | #[cfg(feature = "encoding")] |
2 | | use encoding_rs::UTF_8; |
3 | | |
4 | | use crate::encoding::Decoder; |
5 | | use crate::errors::{Error, Result}; |
6 | | use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; |
7 | | #[cfg(feature = "encoding")] |
8 | | use crate::reader::EncodingRef; |
9 | | use crate::reader::{is_whitespace, BangType, ParseState}; |
10 | | |
11 | | use memchr; |
12 | | |
13 | | /// A struct that holds a current parse state and a parser configuration. |
14 | | /// It is independent on a way of reading data: the reader feed data into it and |
15 | | /// get back produced [`Event`]s. |
16 | | #[derive(Clone)] |
17 | | pub(super) struct Parser { |
18 | | /// Number of bytes read from the source of data since the parser was created |
19 | | pub offset: usize, |
20 | | /// Defines how to process next byte |
21 | | pub state: ParseState, |
22 | | /// Expand empty element into an opening and closing element |
23 | | pub expand_empty_elements: bool, |
24 | | /// Trims leading whitespace in Text events, skip the element if text is empty |
25 | | pub trim_text_start: bool, |
26 | | /// Trims trailing whitespace in Text events. |
27 | | pub trim_text_end: bool, |
28 | | /// Trims trailing whitespaces from markup names in closing tags `</a >` |
29 | | pub trim_markup_names_in_closing_tags: bool, |
30 | | /// Check if [`Event::End`] nodes match last [`Event::Start`] node |
31 | | pub check_end_names: bool, |
32 | | /// Check if comments contains `--` (false per default) |
33 | | pub check_comments: bool, |
34 | | /// All currently Started elements which didn't have a matching |
35 | | /// End element yet. |
36 | | /// |
37 | | /// For an XML |
38 | | /// |
39 | | /// ```xml |
40 | | /// <root><one/><inner attr="value">|<tag></inner></root> |
41 | | /// ``` |
42 | | /// when cursor at the `|` position buffer contains: |
43 | | /// |
44 | | /// ```text |
45 | | /// rootinner |
46 | | /// ^ ^ |
47 | | /// ``` |
48 | | /// |
49 | | /// The `^` symbols shows which positions stored in the [`Self::opened_starts`] |
50 | | /// (0 and 4 in that case). |
51 | | opened_buffer: Vec<u8>, |
52 | | /// Opened name start indexes into [`Self::opened_buffer`]. See documentation |
53 | | /// for that field for details |
54 | | opened_starts: Vec<usize>, |
55 | | |
56 | | #[cfg(feature = "encoding")] |
57 | | /// Reference to the encoding used to read an XML |
58 | | pub encoding: EncodingRef, |
59 | | } |
60 | | |
61 | | impl Parser { |
62 | | /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event. |
63 | | /// |
64 | | /// # Parameters |
65 | | /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<` |
66 | | /// |
67 | | /// [`Text`]: Event::Text |
68 | 84.0M | pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> { |
69 | 84.0M | let mut content = bytes; |
70 | 84.0M | |
71 | 84.0M | if self.trim_text_end { |
72 | 0 | // Skip the ending '<' |
73 | 0 | let len = bytes |
74 | 0 | .iter() |
75 | 0 | .rposition(|&b| !is_whitespace(b)) |
76 | 0 | .map_or_else(|| bytes.len(), |p| p + 1); |
77 | 0 | content = &bytes[..len]; |
78 | 84.0M | } |
79 | | |
80 | 84.0M | Ok(Event::Text(BytesText::wrap(content, self.decoder()))) |
81 | 84.0M | } |
82 | | |
83 | | /// reads `BytesElement` starting with a `!`, |
84 | | /// return `Comment`, `CData` or `DocType` event |
85 | 12.7k | pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> { |
86 | 12.7k | let uncased_starts_with = |string: &[u8], prefix: &[u8]| { |
87 | 12.7k | string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) |
88 | 12.7k | }; |
89 | | |
90 | 12.7k | let len = buf.len(); |
91 | 0 | match bang_type { |
92 | 0 | BangType::Comment if buf.starts_with(b"!--") => { |
93 | 0 | debug_assert!(buf.ends_with(b"--")); |
94 | 0 | if self.check_comments { |
95 | | // search if '--' not in comments |
96 | 0 | if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) |
97 | 0 | .position(|p| buf[3 + p + 1] == b'-') |
98 | | { |
99 | 0 | self.offset += len - p; |
100 | 0 | return Err(Error::UnexpectedToken("--".to_string())); |
101 | 0 | } |
102 | 0 | } |
103 | 0 | Ok(Event::Comment(BytesText::wrap( |
104 | 0 | &buf[3..len - 2], |
105 | 0 | self.decoder(), |
106 | 0 | ))) |
107 | | } |
108 | 0 | BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { |
109 | 0 | debug_assert!(buf.ends_with(b"]]")); |
110 | 0 | Ok(Event::CData(BytesCData::wrap( |
111 | 0 | &buf[8..len - 2], |
112 | 0 | self.decoder(), |
113 | 0 | ))) |
114 | | } |
115 | 12.7k | BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { |
116 | 12.7k | let start = buf[8..] |
117 | 12.7k | .iter() |
118 | 25.5k | .position(|b| !is_whitespace(*b)) |
119 | 12.7k | .unwrap_or(len - 8); |
120 | 12.7k | if start + 8 >= len { |
121 | 0 | return Err(Error::EmptyDocType); |
122 | 12.7k | } |
123 | 12.7k | Ok(Event::DocType(BytesText::wrap( |
124 | 12.7k | &buf[8 + start..], |
125 | 12.7k | self.decoder(), |
126 | 12.7k | ))) |
127 | | } |
128 | 0 | _ => Err(bang_type.to_err()), |
129 | | } |
130 | 12.7k | } |
131 | | |
132 | | /// Wraps content of `buf` into the [`Event::End`] event. Does the check that |
133 | | /// end name matches the last opened start name if `self.check_end_names` is set. |
134 | 41.1M | pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> { |
135 | | // XML standard permits whitespaces after the markup name in closing tags. |
136 | | // Let's strip them from the buffer before comparing tag names. |
137 | 41.1M | let name = if self.trim_markup_names_in_closing_tags { |
138 | 41.1M | if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) { |
139 | 41.1M | let (name, _) = buf[1..].split_at(pos_end_name + 1); |
140 | 41.1M | name |
141 | | } else { |
142 | 0 | &buf[1..] |
143 | | } |
144 | | } else { |
145 | 0 | &buf[1..] |
146 | | }; |
147 | | |
148 | 41.1M | let decoder = self.decoder(); |
149 | 41.1M | let mismatch_err = |expected: String, found: &[u8], offset: &mut usize| { |
150 | 0 | *offset -= buf.len(); |
151 | 0 | Err(Error::EndEventMismatch { |
152 | 0 | expected, |
153 | 0 | found: decoder.decode(found).unwrap_or_default().into_owned(), |
154 | 0 | }) |
155 | 0 | }; |
156 | | |
157 | | // Get the index in self.opened_buffer of the name of the last opened tag |
158 | 41.1M | match self.opened_starts.pop() { |
159 | 41.1M | Some(start) => { |
160 | 41.1M | if self.check_end_names { |
161 | 41.1M | let expected = &self.opened_buffer[start..]; |
162 | 41.1M | if name != expected { |
163 | 0 | let expected = decoder.decode(expected).unwrap_or_default().into_owned(); |
164 | 0 | // #513: In order to allow error recovery we should drop content of the buffer |
165 | 0 | self.opened_buffer.truncate(start); |
166 | 0 |
|
167 | 0 | return mismatch_err(expected, name, &mut self.offset); |
168 | 41.1M | } |
169 | 0 | } |
170 | | |
171 | 41.1M | self.opened_buffer.truncate(start); |
172 | | } |
173 | | None => { |
174 | 0 | if self.check_end_names { |
175 | 0 | return mismatch_err("".to_string(), &buf[1..], &mut self.offset); |
176 | 0 | } |
177 | | } |
178 | | } |
179 | | |
180 | 41.1M | Ok(Event::End(BytesEnd::wrap(name.into()))) |
181 | 41.1M | } |
182 | | |
183 | | /// reads `BytesElement` starting with a `?`, |
184 | | /// return `Decl` or `PI` event |
185 | 12.7k | pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> { |
186 | 12.7k | let len = buf.len(); |
187 | 12.7k | if len > 2 && buf[len - 1] == b'?' { |
188 | 12.7k | if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) { |
189 | 12.7k | let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3)); |
190 | 12.7k | |
191 | 12.7k | // Try getting encoding from the declaration event |
192 | 12.7k | #[cfg(feature = "encoding")] |
193 | 12.7k | if self.encoding.can_be_refined() { |
194 | 12.7k | if let Some(encoding) = event.encoder() { |
195 | 12.7k | self.encoding = EncodingRef::XmlDetected(encoding); |
196 | 12.7k | } |
197 | 12.7k | } |
198 | 12.7k | |
199 | 12.7k | Ok(Event::Decl(event)) |
200 | | } else { |
201 | 0 | Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder()))) |
202 | | } |
203 | | } else { |
204 | 0 | self.offset -= len; |
205 | 0 | Err(Error::UnexpectedEof("XmlDecl".to_string())) |
206 | | } |
207 | 12.7k | } |
208 | | |
209 | | /// Converts content of a tag to a `Start` or an `Empty` event |
210 | | /// |
211 | | /// # Parameters |
212 | | /// - `content`: Content of a tag between `<` and `>` |
213 | 43.0M | pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> { |
214 | 43.0M | let len = content.len(); |
215 | 43.0M | let name_end = content |
216 | 43.0M | .iter() |
217 | 232M | .position(|&b| is_whitespace(b)) |
218 | 43.0M | .unwrap_or(len); |
219 | 43.0M | if let Some(&b'/') = content.last() { |
220 | | // This is self-closed tag `<something/>` |
221 | 1.84M | let name_len = if name_end < len { name_end } else { len - 1 }; |
222 | 1.84M | let event = BytesStart::wrap(&content[..len - 1], name_len); |
223 | 1.84M | |
224 | 1.84M | if self.expand_empty_elements { |
225 | 0 | self.state = ParseState::Empty; |
226 | 0 | self.opened_starts.push(self.opened_buffer.len()); |
227 | 0 | self.opened_buffer.extend(&content[..name_len]); |
228 | 0 | Ok(Event::Start(event)) |
229 | | } else { |
230 | 1.84M | Ok(Event::Empty(event)) |
231 | | } |
232 | | } else { |
233 | | // #514: Always store names event when .check_end_names == false, |
234 | | // because checks can be temporary disabled and when they would be |
235 | | // enabled, we should have that information |
236 | 41.1M | self.opened_starts.push(self.opened_buffer.len()); |
237 | 41.1M | self.opened_buffer.extend(&content[..name_end]); |
238 | 41.1M | Ok(Event::Start(BytesStart::wrap(content, name_end))) |
239 | | } |
240 | 43.0M | } |
241 | | |
242 | | #[inline] |
243 | 0 | pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> { |
244 | 0 | self.state = ParseState::ClosedTag; |
245 | 0 | let name = self |
246 | 0 | .opened_buffer |
247 | 0 | .split_off(self.opened_starts.pop().unwrap()); |
248 | 0 | Ok(Event::End(BytesEnd::wrap(name.into()))) |
249 | 0 | } |
250 | | |
251 | | /// Get the decoder, used to decode bytes, read by this reader, to the strings. |
252 | | /// |
253 | | /// If `encoding` feature is enabled, the used encoding may change after |
254 | | /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. |
255 | | /// |
256 | | /// If `encoding` feature is enabled and no encoding is specified in declaration, |
257 | | /// defaults to UTF-8. |
258 | 156M | pub fn decoder(&self) -> Decoder { |
259 | 156M | Decoder { |
260 | 156M | #[cfg(feature = "encoding")] |
261 | 156M | encoding: self.encoding.encoding(), |
262 | 156M | } |
263 | 156M | } |
264 | | } |
265 | | |
266 | | impl Default for Parser { |
267 | 12.7k | fn default() -> Self { |
268 | 12.7k | Self { |
269 | 12.7k | offset: 0, |
270 | 12.7k | state: ParseState::Init, |
271 | 12.7k | expand_empty_elements: false, |
272 | 12.7k | trim_text_start: false, |
273 | 12.7k | trim_text_end: false, |
274 | 12.7k | trim_markup_names_in_closing_tags: true, |
275 | 12.7k | check_end_names: true, |
276 | 12.7k | check_comments: false, |
277 | 12.7k | opened_buffer: Vec::new(), |
278 | 12.7k | opened_starts: Vec::new(), |
279 | 12.7k | |
280 | 12.7k | #[cfg(feature = "encoding")] |
281 | 12.7k | encoding: EncodingRef::Implicit(UTF_8), |
282 | 12.7k | } |
283 | 12.7k | } |
284 | | } |