/rust/registry/src/index.crates.io-6f17d22bba15001f/quick-xml-0.29.0/src/reader/mod.rs
Line | Count | Source (jump to first uncovered line) |
1 | | //! Contains high-level interface for a pull-based XML parser. |
2 | | |
3 | | #[cfg(feature = "encoding")] |
4 | | use encoding_rs::Encoding; |
5 | | use std::ops::Range; |
6 | | |
7 | | use crate::encoding::Decoder; |
8 | | use crate::errors::{Error, Result}; |
9 | | use crate::events::Event; |
10 | | use crate::reader::parser::Parser; |
11 | | |
12 | | use memchr; |
13 | | |
14 | | macro_rules! configure_methods { |
15 | | ($($holder:ident)?) => { |
16 | | /// Changes whether empty elements should be split into an `Open` and a `Close` event. |
17 | | /// |
18 | | /// When set to `true`, all [`Empty`] events produced by a self-closing tag like `<tag/>` are |
19 | | /// expanded into a [`Start`] event followed by an [`End`] event. When set to `false` (the |
20 | | /// default), those tags are represented by an [`Empty`] event instead. |
21 | | /// |
22 | | /// Note that setting this to `true` will lead to additional allocations that are |
23 | | /// needed to store the tag name for an [`End`] event. However, if [`check_end_names`] |
24 | | /// is also set, only one additional allocation will be performed that supports |
25 | | /// both of these options. |
26 | | /// |
27 | | /// (`false` by default) |
28 | | /// |
29 | | /// [`Empty`]: Event::Empty |
30 | | /// [`Start`]: Event::Start |
31 | | /// [`End`]: Event::End |
32 | | /// [`check_end_names`]: Self::check_end_names |
33 | 0 | pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self { |
34 | 0 | self $(.$holder)? .parser.expand_empty_elements = val; |
35 | 0 | self |
36 | 0 | } Unexecuted instantiation: <quick_xml::reader::ns_reader::NsReader<_>>::expand_empty_elements Unexecuted instantiation: <quick_xml::reader::Reader<_>>::expand_empty_elements |
37 | | |
38 | | /// Changes whether whitespace before and after character data should be removed. |
39 | | /// |
40 | | /// When set to `true`, all [`Text`] events are trimmed. |
41 | | /// If after that the event is empty it will not be pushed. |
42 | | /// |
43 | | /// Changing this option automatically changes the [`trim_text_end`] option. |
44 | | /// |
45 | | /// (`false` by default). |
46 | | /// |
47 | | /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;"> |
48 | | /// |
49 | | /// WARNING: With this option every text event will be trimmed, which is |
50 | | /// incorrect behavior when text events are delimited by comments, processing |
51 | | /// instructions or CDATA sections. To trim data correctly, manually apply |
52 | | /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`] |
53 | | /// only to the necessary events. |
54 | | /// </div> |
55 | | /// |
56 | | /// [`Text`]: Event::Text |
57 | | /// [`trim_text_end`]: Self::trim_text_end |
58 | | /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start |
59 | | /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end |
60 | 0 | pub fn trim_text(&mut self, val: bool) -> &mut Self { |
61 | 0 | self $(.$holder)? .parser.trim_text_start = val; |
62 | 0 | self $(.$holder)? .parser.trim_text_end = val; |
63 | 0 | self |
64 | 0 | } Unexecuted instantiation: <quick_xml::reader::ns_reader::NsReader<_>>::trim_text Unexecuted instantiation: <quick_xml::reader::Reader<_>>::trim_text |
65 | | |
66 | | /// Changes whether whitespace after character data should be removed. |
67 | | /// |
68 | | /// When set to `true`, trailing whitespace is trimmed in [`Text`] events. |
69 | | /// If after that the event is empty it will not be pushed. |
70 | | /// |
71 | | /// (`false` by default). |
72 | | /// |
73 | | /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;"> |
74 | | /// |
75 | | /// WARNING: With this option every text event will be trimmed, which is |
76 | | /// incorrect behavior when text events are delimited by comments, processing |
77 | | /// instructions or CDATA sections. To trim data correctly, manually apply |
78 | | /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`] |
79 | | /// only to the necessary events. |
80 | | /// </div> |
81 | | /// |
82 | | /// [`Text`]: Event::Text |
83 | | /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start |
84 | | /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end |
85 | 0 | pub fn trim_text_end(&mut self, val: bool) -> &mut Self { |
86 | 0 | self $(.$holder)? .parser.trim_text_end = val; |
87 | 0 | self |
88 | 0 | } Unexecuted instantiation: <quick_xml::reader::ns_reader::NsReader<_>>::trim_text_end Unexecuted instantiation: <quick_xml::reader::Reader<_>>::trim_text_end |
89 | | |
90 | | /// Changes whether trailing whitespaces after the markup name are trimmed in closing tags |
91 | | /// `</a >`. |
92 | | /// |
93 | | /// If true the emitted [`End`] event is stripped of trailing whitespace after the markup name. |
94 | | /// |
95 | | /// Note that if set to `false` and `check_end_names` is true the comparison of markup names is |
96 | | /// going to fail erroneously if a closing tag contains trailing whitespaces. |
97 | | /// |
98 | | /// (`true` by default) |
99 | | /// |
100 | | /// [`End`]: Event::End |
101 | 0 | pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self { |
102 | 0 | self $(.$holder)? .parser.trim_markup_names_in_closing_tags = val; |
103 | 0 | self |
104 | 0 | } Unexecuted instantiation: <quick_xml::reader::ns_reader::NsReader<_>>::trim_markup_names_in_closing_tags Unexecuted instantiation: <quick_xml::reader::Reader<_>>::trim_markup_names_in_closing_tags |
105 | | |
106 | | /// Changes whether mismatched closing tag names should be detected. |
107 | | /// |
108 | | /// Note, that start and end tags [should match literally][spec], they cannot |
109 | | /// have different prefixes even if both prefixes resolve to the same namespace. |
110 | | /// The XML |
111 | | /// |
112 | | /// ```xml |
113 | | /// <outer xmlns="namespace" xmlns:p="namespace"> |
114 | | /// </p:outer> |
115 | | /// ``` |
116 | | /// |
117 | | /// is not valid, even though semantically the start tag is the same as the |
118 | | /// end tag. The reason is that namespaces are an extension of the original |
119 | | /// XML specification (without namespaces) and it should be backward-compatible. |
120 | | /// |
121 | | /// When set to `false`, it won't check if a closing tag matches the corresponding opening tag. |
122 | | /// For example, `<mytag></different_tag>` will be permitted. |
123 | | /// |
124 | | /// If the XML is known to be sane (already processed, etc.) this saves extra time. |
125 | | /// |
126 | | /// Note that the emitted [`End`] event will not be modified if this is disabled, ie. it will |
127 | | /// contain the data of the mismatched end tag. |
128 | | /// |
129 | | /// Note that setting this to `true` will lead to additional allocations that are |
130 | | /// needed to store the tag name for an [`End`] event. However, if [`expand_empty_elements`] |
131 | | /// is also set, only one additional allocation will be performed that supports |
132 | | /// both of these options. |
133 | | /// |
134 | | /// (`true` by default) |
135 | | /// |
136 | | /// [spec]: https://www.w3.org/TR/xml11/#dt-etag |
137 | | /// [`End`]: Event::End |
138 | | /// [`expand_empty_elements`]: Self::expand_empty_elements |
139 | 0 | pub fn check_end_names(&mut self, val: bool) -> &mut Self { |
140 | 0 | self $(.$holder)? .parser.check_end_names = val; |
141 | 0 | self |
142 | 0 | } Unexecuted instantiation: <quick_xml::reader::ns_reader::NsReader<_>>::check_end_names Unexecuted instantiation: <quick_xml::reader::Reader<_>>::check_end_names |
143 | | |
144 | | /// Changes whether comments should be validated. |
145 | | /// |
146 | | /// When set to `true`, every [`Comment`] event will be checked for not containing `--`, which |
147 | | /// is not allowed in XML comments. Most of the time we don't want comments at all so we don't |
148 | | /// really care about comment correctness, thus the default value is `false` to improve |
149 | | /// performance. |
150 | | /// |
151 | | /// (`false` by default) |
152 | | /// |
153 | | /// [`Comment`]: Event::Comment |
154 | 0 | pub fn check_comments(&mut self, val: bool) -> &mut Self { |
155 | 0 | self $(.$holder)? .parser.check_comments = val; |
156 | 0 | self |
157 | 0 | } Unexecuted instantiation: <quick_xml::reader::ns_reader::NsReader<_>>::check_comments Unexecuted instantiation: <quick_xml::reader::Reader<_>>::check_comments |
158 | | }; |
159 | | } |
160 | | |
161 | | macro_rules! read_event_impl { |
162 | | ( |
163 | | $self:ident, $buf:ident, |
164 | | $reader:expr, |
165 | | $read_until_open:ident, |
166 | | $read_until_close:ident |
167 | | $(, $await:ident)? |
168 | | ) => {{ |
169 | | let event = loop { |
170 | | match $self.parser.state { |
171 | | ParseState::Init => { // Go to OpenedTag state |
172 | | // If the encoding is set explicitly, we do not need to detect it. For example, |
173 | | // explicit UTF-8 is set automatically if the Reader was created using `from_str`. |
174 | | // But we still need to remove the BOM for consistency with the code path that is |
175 | | // used when the `encoding` feature is not enabled |
176 | | #[cfg(feature = "encoding")] |
177 | | if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? { |
178 | | if $self.parser.encoding.can_be_refined() { |
179 | | $self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding); |
180 | | } |
181 | | } |
182 | | |
183 | | // Remove the UTF-8 BOM if it is present |
184 | | #[cfg(not(feature = "encoding"))] |
185 | | $reader.remove_utf8_bom() $(.$await)? ?; |
186 | | |
187 | | // Go to OpenedTag state |
188 | | match $self.$read_until_open($buf) $(.$await)? { |
189 | | Ok(Ok(ev)) => break Ok(ev), |
190 | | Ok(Err(b)) => $buf = b, |
191 | | Err(err) => break Err(err), |
192 | | } |
193 | | }, |
194 | | ParseState::ClosedTag => { // Go to OpenedTag state |
195 | | match $self.$read_until_open($buf) $(.$await)? { |
196 | | Ok(Ok(ev)) => break Ok(ev), |
197 | | Ok(Err(b)) => $buf = b, |
198 | | Err(err) => break Err(err), |
199 | | } |
200 | | }, |
201 | | // Go to ClosedTag state in next two arms |
202 | | ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?, |
203 | | ParseState::Empty => break $self.parser.close_expanded_empty(), |
204 | | ParseState::Exit => break Ok(Event::Eof), |
205 | | }; |
206 | | }; |
207 | | match event { |
208 | | Err(_) | Ok(Event::Eof) => $self.parser.state = ParseState::Exit, |
209 | | _ => {} |
210 | | } |
211 | | event |
212 | | }}; |
213 | | } |
214 | | |
215 | | /// Reads bytes up to `<` and skips it. If the current byte (after skipping all whitespace |
216 | | /// characters if [`Parser::trim_text_start`] is `true`) is already `<`, it is consumed, no event |
217 | | /// is produced and `$buf` is handed back; otherwise the bytes before `<` are emitted as text. |
218 | | /// |
219 | | /// Moves the parser to the `OpenedTag` state. |
220 | | /// |
221 | | /// This code is executed in two cases: |
222 | | /// - after the start of parsing, just after skipping the BOM if it is present |
223 | | /// - after parsing `</tag>` or `<tag>` |
224 | | macro_rules! read_until_open { |
225 | | ( |
226 | | $self:ident, $buf:ident, |
227 | | $reader:expr, |
228 | | $read_event:ident |
229 | | $(, $await:ident)? |
230 | | ) => {{ |
231 | | $self.parser.state = ParseState::OpenedTag; |
232 | | |
233 | | if $self.parser.trim_text_start { |
234 | | $reader.skip_whitespace(&mut $self.parser.offset) $(.$await)? ?; |
235 | | } |
236 | | |
237 | | // If we are already at the `<` symbol, do not try to return an empty Text event |
238 | | if $reader.skip_one(b'<', &mut $self.parser.offset) $(.$await)? ? { |
239 | | // Pass $buf to the next iteration of the parsing loop |
240 | | return Ok(Err($buf)); |
241 | | } |
242 | | |
243 | | match $reader |
244 | | .read_bytes_until(b'<', $buf, &mut $self.parser.offset) |
245 | | $(.$await)? |
246 | | { |
247 | | // Return a Text event with `bytes` content |
248 | | Ok(Some(bytes)) => $self.parser.emit_text(bytes).map(Ok), |
249 | | Ok(None) => Ok(Ok(Event::Eof)), |
250 | | Err(e) => Err(e), |
251 | | } |
252 | | }}; |
253 | | } |
254 | | |
255 | | /// Read bytes up to the `>` and skip it. This method is expected to be called |
256 | | /// after seeing the `<` symbol and skipping it. Inspects the next (current) |
257 | | /// symbol and returns an appropriate [`Event`]: |
258 | | /// |
259 | | /// |Symbol |Event |
260 | | /// |-------|------------------------------------- |
261 | | /// |`!` |[`Comment`], [`CData`] or [`DocType`] |
262 | | /// |`/` |[`End`] |
263 | | /// |`?` |[`PI`] |
264 | | /// |_other_|[`Start`] or [`Empty`] |
265 | | /// |
266 | | /// Moves parser to the `ClosedTag` state. |
267 | | /// |
268 | | /// [`Comment`]: Event::Comment |
269 | | /// [`CData`]: Event::CData |
270 | | /// [`DocType`]: Event::DocType |
271 | | /// [`End`]: Event::End |
272 | | /// [`PI`]: Event::PI |
273 | | /// [`Start`]: Event::Start |
274 | | /// [`Empty`]: Event::Empty |
275 | | macro_rules! read_until_close { |
276 | | ( |
277 | | $self:ident, $buf:ident, |
278 | | $reader:expr |
279 | | $(, $await:ident)? |
280 | | ) => {{ |
281 | | $self.parser.state = ParseState::ClosedTag; |
282 | | |
283 | | match $reader.peek_one() $(.$await)? { |
284 | | // `<!` - comment, CDATA or DOCTYPE declaration |
285 | | Ok(Some(b'!')) => match $reader |
286 | | .read_bang_element($buf, &mut $self.parser.offset) |
287 | | $(.$await)? |
288 | | { |
289 | | Ok(None) => Ok(Event::Eof), |
290 | | Ok(Some((bang_type, bytes))) => $self.parser.emit_bang(bang_type, bytes), |
291 | | Err(e) => Err(e), |
292 | | }, |
293 | | // `</` - closing tag |
294 | | Ok(Some(b'/')) => match $reader |
295 | | .read_bytes_until(b'>', $buf, &mut $self.parser.offset) |
296 | | $(.$await)? |
297 | | { |
298 | | Ok(None) => Ok(Event::Eof), |
299 | | Ok(Some(bytes)) => $self.parser.emit_end(bytes), |
300 | | Err(e) => Err(e), |
301 | | }, |
302 | | // `<?` - processing instruction |
303 | | Ok(Some(b'?')) => match $reader |
304 | | .read_bytes_until(b'>', $buf, &mut $self.parser.offset) |
305 | | $(.$await)? |
306 | | { |
307 | | Ok(None) => Ok(Event::Eof), |
308 | | Ok(Some(bytes)) => $self.parser.emit_question_mark(bytes), |
309 | | Err(e) => Err(e), |
310 | | }, |
311 | | // `<...` - opening or self-closed tag |
312 | | Ok(Some(_)) => match $reader |
313 | | .read_element($buf, &mut $self.parser.offset) |
314 | | $(.$await)? |
315 | | { |
316 | | Ok(None) => Ok(Event::Eof), |
317 | | Ok(Some(bytes)) => $self.parser.emit_start(bytes), |
318 | | Err(e) => Err(e), |
319 | | }, |
320 | | Ok(None) => Ok(Event::Eof), |
321 | | Err(e) => Err(e), |
322 | | } |
323 | | }}; |
324 | | } |
325 | | |
326 | | /// Generalization of `read_to_end` method for buffered and borrowed readers |
327 | | macro_rules! read_to_end { |
328 | | ( |
329 | | $self:expr, $end:expr, $buf:expr, |
330 | | $read_event:ident, |
331 | | // Code block that performs clearing of internal buffer after read of each event |
332 | | $clear:block |
333 | | $(, $await:ident)? |
334 | | ) => {{ |
335 | | let start = $self.buffer_position(); |
336 | | let mut depth = 0; |
337 | | loop { |
338 | | $clear |
339 | | let end = $self.buffer_position(); |
340 | | match $self.$read_event($buf) $(.$await)? { |
341 | | Err(e) => return Err(e), |
342 | | |
343 | | Ok(Event::Start(e)) if e.name() == $end => depth += 1, |
344 | | Ok(Event::End(e)) if e.name() == $end => { |
345 | | if depth == 0 { |
346 | | break start..end; |
347 | | } |
348 | | depth -= 1; |
349 | | } |
350 | | Ok(Event::Eof) => { |
351 | | let name = $self.decoder().decode($end.as_ref()); |
352 | | return Err(Error::UnexpectedEof(format!("</{:?}>", name))); |
353 | | } |
354 | | _ => (), |
355 | | } |
356 | | } |
357 | | }}; |
358 | | } |
359 | | |
360 | | #[cfg(feature = "async-tokio")] |
361 | | mod async_tokio; |
362 | | mod buffered_reader; |
363 | | mod ns_reader; |
364 | | mod parser; |
365 | | mod slice_reader; |
366 | | |
367 | | pub use ns_reader::NsReader; |
368 | | |
369 | | /// Range of input in bytes, that corresponds to some piece of XML |
370 | | pub type Span = Range<usize>; |
371 | | |
372 | | //////////////////////////////////////////////////////////////////////////////////////////////////// |
373 | | |
374 | | /// Possible reader states. The state transition diagram (`true` and `false` show the |
375 | | /// value of the [`Reader::expand_empty_elements()`] option): |
376 | | /// |
377 | | /// ```mermaid |
378 | | /// flowchart LR |
379 | | /// subgraph _ |
380 | | /// direction LR |
381 | | /// |
382 | | /// Init -- "(no event)"\n --> OpenedTag |
383 | | /// OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag |
384 | | /// ClosedTag -- "#lt;false#gt;\n(no event)"\nText --> OpenedTag |
385 | | /// end |
386 | | /// ClosedTag -- "#lt;true#gt;"\nStart --> Empty |
387 | | /// Empty -- End --> ClosedTag |
388 | | /// _ -. Eof .-> Exit |
389 | | /// ``` |
390 | | #[derive(Clone)] |
391 | | enum ParseState { |
392 | | /// Initial state in which the reader stays after creation. Transition from that |
393 | | /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next |
394 | | /// state is always `OpenedTag`. The reader will never return to this state. The |
395 | | /// event emitted during the transition to `OpenedTag` is a `Text` event if the |
396 | | /// first symbol is not `<`, otherwise no event is emitted. |
397 | | Init, |
398 | | /// State after seeing the `<` symbol. Depending on the next symbol all other |
399 | | /// events could be generated. |
400 | | /// |
401 | | /// After generating one event the reader moves to the `ClosedTag` state. |
402 | | OpenedTag, |
403 | | /// State in which the reader searches for the `<` symbol of a markup. All bytes before |
404 | | /// that symbol will be returned in the [`Event::Text`] event. After that |
405 | | /// the reader moves to the `OpenedTag` state. |
406 | | ClosedTag, |
407 | | /// This state is used only if the option [`expand_empty_elements`] is set to `true`. |
408 | | /// The reader enters this state when it is in a `ClosedTag` state and emits an |
409 | | /// [`Event::Start`] event. The next event emitted will be an [`Event::End`], |
410 | | /// after which the reader returns to the `ClosedTag` state. |
411 | | /// |
412 | | /// [`expand_empty_elements`]: Parser::expand_empty_elements |
413 | | Empty, |
414 | | /// The reader enters this state when an `Eof` event is generated or an error occurs. |
415 | | /// This is the last state; the reader stays in it forever. |
416 | | Exit, |
417 | | } |
418 | | |
419 | | /// A reference to an encoding together with information about how it was retrieved. |
420 | | /// |
421 | | /// The state transition diagram: |
422 | | /// |
423 | | /// ```mermaid |
424 | | /// flowchart LR |
425 | | /// Implicit -- from_str --> Explicit |
426 | | /// Implicit -- BOM --> BomDetected |
427 | | /// Implicit -- "encoding=..." --> XmlDetected |
428 | | /// BomDetected -- "encoding=..." --> XmlDetected |
429 | | /// ``` |
430 | | #[cfg(feature = "encoding")] |
431 | | #[derive(Clone, Copy)] |
432 | | enum EncodingRef { |
433 | | /// Encoding was implicitly assumed to have a specified value. It can be refined |
434 | | /// using a BOM or by the XML declaration event (`<?xml encoding=... ?>`) |
435 | | Implicit(&'static Encoding), |
436 | | /// Encoding was explicitly set to the desired value. It cannot be changed |
437 | | /// either by a BOM or by parsing the XML declaration (`<?xml encoding=... ?>`) |
438 | | Explicit(&'static Encoding), |
439 | | /// Encoding was detected from a byte order mark (BOM) or by the first bytes |
440 | | /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`) |
441 | | BomDetected(&'static Encoding), |
442 | | /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`). |
443 | | /// It can no longer change |
444 | | XmlDetected(&'static Encoding), |
445 | | } |
446 | | #[cfg(feature = "encoding")] |
447 | | impl EncodingRef { |
448 | | #[inline] |
449 | | fn encoding(&self) -> &'static Encoding { |
450 | | match self { |
451 | | Self::Implicit(e) => e, |
452 | | Self::Explicit(e) => e, |
453 | | Self::BomDetected(e) => e, |
454 | | Self::XmlDetected(e) => e, |
455 | | } |
456 | | } |
457 | | #[inline] |
458 | | fn can_be_refined(&self) -> bool { |
459 | | match self { |
460 | | Self::Implicit(_) | Self::BomDetected(_) => true, |
461 | | Self::Explicit(_) | Self::XmlDetected(_) => false, |
462 | | } |
463 | | } |
464 | | } |
465 | | |
466 | | //////////////////////////////////////////////////////////////////////////////////////////////////// |
467 | | |
468 | | /// A low level encoding-agnostic XML event reader. |
469 | | /// |
470 | | /// Consumes bytes and streams XML [`Event`]s. |
471 | | /// |
472 | | /// This reader does not manage namespace declarations and is not able to resolve |
473 | | /// prefixes. If you want these features, use the [`NsReader`]. |
474 | | /// |
475 | | /// # Examples |
476 | | /// |
477 | | /// ``` |
478 | | /// use quick_xml::events::Event; |
479 | | /// use quick_xml::reader::Reader; |
480 | | /// |
481 | | /// let xml = r#"<tag1 att1 = "test"> |
482 | | /// <tag2><!--Test comment-->Test</tag2> |
483 | | /// <tag2>Test 2</tag2> |
484 | | /// </tag1>"#; |
485 | | /// let mut reader = Reader::from_str(xml); |
486 | | /// reader.trim_text(true); |
487 | | /// |
488 | | /// let mut count = 0; |
489 | | /// let mut txt = Vec::new(); |
490 | | /// let mut buf = Vec::new(); |
491 | | /// |
492 | | /// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s) |
493 | | /// loop { |
494 | | /// // NOTE: this is the generic case when we don't know about the input BufRead. |
495 | | /// // when the input is a &str or a &[u8], we don't actually need to use another |
496 | | /// // buffer, we could directly call `reader.read_event()` |
497 | | /// match reader.read_event_into(&mut buf) { |
498 | | /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), |
499 | | /// // exits the loop when reaching end of file |
500 | | /// Ok(Event::Eof) => break, |
501 | | /// |
502 | | /// Ok(Event::Start(e)) => { |
503 | | /// match e.name().as_ref() { |
504 | | /// b"tag1" => println!("attributes values: {:?}", |
505 | | /// e.attributes().map(|a| a.unwrap().value) |
506 | | /// .collect::<Vec<_>>()), |
507 | | /// b"tag2" => count += 1, |
508 | | /// _ => (), |
509 | | /// } |
510 | | /// } |
511 | | /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), |
512 | | /// |
513 | | /// // There are several other `Event`s we do not consider here |
514 | | /// _ => (), |
515 | | /// } |
516 | | /// // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low |
517 | | /// buf.clear(); |
518 | | /// } |
519 | | /// ``` |
520 | | /// |
521 | | /// [`NsReader`]: crate::reader::NsReader |
522 | | #[derive(Clone)] |
523 | | pub struct Reader<R> { |
524 | | /// Source of data for parse |
525 | | reader: R, |
526 | | /// Configuration and current parse state |
527 | | parser: Parser, |
528 | | } |
529 | | |
530 | | /// Builder methods |
531 | | impl<R> Reader<R> { |
532 | | /// Creates a `Reader` that reads from a given reader. |
533 | 13.1k | pub fn from_reader(reader: R) -> Self { |
534 | 13.1k | Self { |
535 | 13.1k | reader, |
536 | 13.1k | parser: Parser::default(), |
537 | 13.1k | } |
538 | 13.1k | } |
539 | | |
540 | | configure_methods!(); |
541 | | } |
542 | | |
543 | | /// Getters |
544 | | impl<R> Reader<R> { |
545 | | /// Consumes `Reader` returning the underlying reader |
546 | | /// |
547 | | /// Can be used to compute line and column of a parsing error position |
548 | | /// |
549 | | /// # Examples |
550 | | /// |
551 | | /// ``` |
552 | | /// # use pretty_assertions::assert_eq; |
553 | | /// use std::{str, io::Cursor}; |
554 | | /// use quick_xml::events::Event; |
555 | | /// use quick_xml::reader::Reader; |
556 | | /// |
557 | | /// let xml = r#"<tag1 att1 = "test"> |
558 | | /// <tag2><!--Test comment-->Test</tag2> |
559 | | /// <tag3>Test 2</tag3> |
560 | | /// </tag1>"#; |
561 | | /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); |
562 | | /// let mut buf = Vec::new(); |
563 | | /// |
564 | | /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) { |
565 | | /// let end_pos = reader.buffer_position(); |
566 | | /// let mut cursor = reader.into_inner(); |
567 | | /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) |
568 | | /// .expect("can't make a string"); |
569 | | /// let mut line = 1; |
570 | | /// let mut column = 0; |
571 | | /// for c in s.chars() { |
572 | | /// if c == '\n' { |
573 | | /// line += 1; |
574 | | /// column = 0; |
575 | | /// } else { |
576 | | /// column += 1; |
577 | | /// } |
578 | | /// } |
579 | | /// (line, column) |
580 | | /// } |
581 | | /// |
582 | | /// loop { |
583 | | /// match reader.read_event_into(&mut buf) { |
584 | | /// Ok(Event::Start(ref e)) => match e.name().as_ref() { |
585 | | /// b"tag1" | b"tag2" => (), |
586 | | /// tag => { |
587 | | /// assert_eq!(b"tag3", tag); |
588 | | /// assert_eq!((3, 22), into_line_and_column(reader)); |
589 | | /// break; |
590 | | /// } |
591 | | /// }, |
592 | | /// Ok(Event::Eof) => unreachable!(), |
593 | | /// _ => (), |
594 | | /// } |
595 | | /// buf.clear(); |
596 | | /// } |
597 | | /// ``` |
598 | 0 | pub fn into_inner(self) -> R { |
599 | 0 | self.reader |
600 | 0 | } |
601 | | |
602 | | /// Gets a reference to the underlying reader. |
603 | 0 | pub fn get_ref(&self) -> &R { |
604 | 0 | &self.reader |
605 | 0 | } |
606 | | |
607 | | /// Gets a mutable reference to the underlying reader. |
608 | 0 | pub fn get_mut(&mut self) -> &mut R { |
609 | 0 | &mut self.reader |
610 | 0 | } |
611 | | |
612 | | /// Gets the current byte position in the input data. |
613 | | /// |
614 | | /// Useful when debugging errors. |
615 | 86.5M | pub fn buffer_position(&self) -> usize { |
616 | 86.5M | // when internal state is OpenedTag, we have actually read until '<', |
617 | 86.5M | // which we don't want to show |
618 | 86.5M | if let ParseState::OpenedTag = self.parser.state { |
619 | 28.7M | self.parser.offset - 1 |
620 | | } else { |
621 | 57.7M | self.parser.offset |
622 | | } |
623 | 86.5M | } |
624 | | |
625 | | /// Get the decoder used to decode bytes read by this reader into strings. |
626 | | /// |
627 | | /// If `encoding` feature is enabled, the used encoding may change after |
628 | | /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. |
629 | | /// |
630 | | /// If `encoding` feature is enabled and no encoding is specified in declaration, |
631 | | /// defaults to UTF-8. |
632 | | #[inline] |
633 | 28.8M | pub fn decoder(&self) -> Decoder { |
634 | 28.8M | self.parser.decoder() |
635 | 28.8M | } |
636 | | } |
637 | | |
638 | | /// Private sync reading methods |
639 | | impl<R> Reader<R> { |
640 | | /// Read text into the given buffer, and return an event that borrows from |
641 | | /// either that buffer or from the input itself, based on the type of the |
642 | | /// reader. |
643 | 149M | fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>> |
644 | 149M | where |
645 | 149M | R: XmlSource<'i, B>, |
646 | 149M | { |
647 | 149M | read_event_impl!(self, buf, self.reader, read_until_open, read_until_close) |
648 | 149M | } |
649 | | |
650 | | /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. |
651 | | /// |
652 | | /// Returns inner `Ok` if the loop should be broken and an event returned. |
653 | | /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular. |
654 | 74.9M | fn read_until_open<'i, B>(&mut self, buf: B) -> Result<std::result::Result<Event<'i>, B>> |
655 | 74.9M | where |
656 | 74.9M | R: XmlSource<'i, B>, |
657 | 74.9M | { |
658 | 74.9M | read_until_open!(self, buf, self.reader, read_event_impl) |
659 | 74.9M | } |
660 | | |
661 | | /// Private function to read until `>` is found. This function expects that |
662 | | /// it was called just after encountering a `<` symbol. |
663 | 74.9M | fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>> |
664 | 74.9M | where |
665 | 74.9M | R: XmlSource<'i, B>, |
666 | 74.9M | { |
667 | 74.9M | read_until_close!(self, buf, self.reader) |
668 | 74.9M | } |
669 | | } |
670 | | |
671 | | //////////////////////////////////////////////////////////////////////////////////////////////////// |
672 | | |
673 | | /// Represents an input for a reader that can return borrowed data. |
674 | | /// |
675 | | /// There are two implementors of this trait: generic one that read data from |
676 | | /// `Self`, copies some part of it into a provided buffer of type `B` and then |
677 | | /// returns data that borrow from that buffer. |
678 | | /// |
679 | | /// The other implementor is for `&[u8]` and instead of copying data returns |
680 | | /// borrowed data from `Self` instead. This implementation allows zero-copy |
681 | | /// deserialization. |
682 | | /// |
683 | | /// # Parameters |
684 | | /// - `'r`: lifetime of a buffer from which events will borrow |
685 | | /// - `B`: a type of a buffer that can be used to store data read from `Self` and |
686 | | /// from which events can borrow |
687 | | trait XmlSource<'r, B> { |
688 | | /// Removes UTF-8 BOM if it is present |
689 | | #[cfg(not(feature = "encoding"))] |
690 | | fn remove_utf8_bom(&mut self) -> Result<()>; |
691 | | |
692 | | /// Determines encoding from the start of input and removes BOM if it is present |
693 | | #[cfg(feature = "encoding")] |
694 | | fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>; |
695 | | |
696 | | /// Read input until `byte` is found or end of input is reached. |
697 | | /// |
698 | | /// Returns a slice of data read up to `byte`; the `byte` itself is not included in the result. |
699 | | /// If input (`Self`) is exhausted, returns `None`. |
700 | | /// |
701 | | /// # Example |
702 | | /// |
703 | | /// ```ignore |
704 | | /// let mut position = 0; |
705 | | /// let mut input = b"abc*def".as_ref(); |
706 | | /// // ^= 4 |
707 | | /// |
708 | | /// assert_eq!( |
709 | | /// input.read_bytes_until(b'*', (), &mut position).unwrap(), |
710 | | /// Some(b"abc".as_ref()) |
711 | | /// ); |
712 | | /// assert_eq!(position, 4); // position after the symbol matched |
713 | | /// ``` |
714 | | /// |
715 | | /// # Parameters |
716 | | /// - `byte`: Byte for search |
717 | | /// - `buf`: Buffer that could be filled from an input (`Self`) and |
718 | | /// from which [events] could borrow their data |
719 | | /// - `position`: Will be increased by amount of bytes consumed |
720 | | /// |
721 | | /// [events]: crate::events::Event |
722 | | fn read_bytes_until( |
723 | | &mut self, |
724 | | byte: u8, |
725 | | buf: B, |
726 | | position: &mut usize, |
727 | | ) -> Result<Option<&'r [u8]>>; |
728 | | |
729 | | /// Read input until a comment, CDATA section or DOCTYPE declaration is finished. |
730 | | /// |
731 | | /// This method expects that `<` has already been read. |
732 | | /// |
733 | | /// Returns a slice of data read up to the end of the comment, CDATA section or DOCTYPE |
734 | | /// declaration (`>`), which is not included in the result. |
735 | | /// |
736 | | /// If input (`Self`) is exhausted and nothing was read, returns `None`. |
737 | | /// |
738 | | /// # Parameters |
739 | | /// - `buf`: Buffer that could be filled from an input (`Self`) and |
740 | | /// from which [events] could borrow their data |
741 | | /// - `position`: Will be increased by amount of bytes consumed |
742 | | /// |
743 | | /// [events]: crate::events::Event |
744 | | fn read_bang_element( |
745 | | &mut self, |
746 | | buf: B, |
747 | | position: &mut usize, |
748 | | ) -> Result<Option<(BangType, &'r [u8])>>; |
749 | | |
750 | | /// Read input until an XML element is closed by a `>` symbol.
751 | | /// Returns `Some(buffer)` that contains the data between `<` and `>` or
752 | | /// `None` if end-of-input was reached and nothing was read.
753 | | /// |
754 | | /// Derived from `read_until`, but modified to handle XML attributes |
755 | | /// using a minimal state machine. |
756 | | /// |
757 | | /// Attribute values are [defined] as follows: |
758 | | /// ```plain |
759 | | /// AttValue := '"' (([^<&"]) | Reference)* '"' |
760 | | /// | "'" (([^<&']) | Reference)* "'" |
761 | | /// ``` |
762 | | /// (`Reference` is something like `&quot;`, but we don't care about
763 | | /// escaped characters at this level) |
764 | | /// |
765 | | /// # Parameters |
766 | | /// - `buf`: Buffer that could be filled from an input (`Self`) and |
767 | | /// from which [events] could borrow their data |
768 | | /// - `position`: Will be increased by amount of bytes consumed |
769 | | /// |
770 | | /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue |
771 | | /// [events]: crate::events::Event |
772 | | fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>; |
773 | | |
774 | | /// Consume and discard all the whitespace until the next non-whitespace |
775 | | /// character or EOF. |
776 | | /// |
777 | | /// # Parameters |
778 | | /// - `position`: Will be increased by amount of bytes consumed |
779 | | fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; |
780 | | |
781 | | /// Consume and discard one character if it matches the given byte. Return |
782 | | /// `true` if it matched. |
783 | | /// |
784 | | /// # Parameters |
785 | | /// - `position`: Will be increased by 1 if byte is matched |
786 | | fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>; |
787 | | |
788 | | /// Return one character without consuming it, so that future `read_*` calls |
789 | | /// will still include it. On EOF, return `None`. |
790 | | fn peek_one(&mut self) -> Result<Option<u8>>; |
791 | | } |
792 | | |
793 | | /// Possible elements started with `<!` |
794 | | #[derive(Debug, PartialEq)] |
795 | | enum BangType { |
796 | | /// <![CDATA[...]]> |
797 | | CData, |
798 | | /// <!--...--> |
799 | | Comment, |
800 | | /// <!DOCTYPE...> |
801 | | DocType, |
802 | | } |
803 | | impl BangType { |
804 | | #[inline(always)] |
805 | 13.1k | fn new(byte: Option<u8>) -> Result<Self> { |
806 | 13.1k | Ok(match byte { |
807 | 0 | Some(b'[') => Self::CData, |
808 | 0 | Some(b'-') => Self::Comment, |
809 | 13.1k | Some(b'D') | Some(b'd') => Self::DocType, |
810 | 0 | Some(b) => return Err(Error::UnexpectedBang(b)), |
811 | 0 | None => return Err(Error::UnexpectedEof("Bang".to_string())), |
812 | | }) |
813 | 13.1k | } |
814 | | |
815 | | /// If the element is finished, returns its content up to the `>` symbol and
816 | | /// the position just after this symbol in `chunk`, otherwise returns `None`
817 | | /// |
818 | | /// # Parameters |
819 | | /// - `buf`: buffer with data consumed on previous iterations |
820 | | /// - `chunk`: data read on current iteration and not yet consumed from reader |
821 | | #[inline(always)] |
822 | 13.1k | fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { |
823 | 13.1k | for i in memchr::memchr_iter(b'>', chunk) { |
824 | 0 | match self { |
825 | 0 | // Need to read at least 6 symbols (`!---->`) for properly finished comment |
826 | 0 | // <!----> - XML comment |
827 | 0 | // 012345 - i |
828 | 0 | Self::Comment if buf.len() + i > 4 => { |
829 | 0 | if chunk[..i].ends_with(b"--") { |
830 | | // We cannot strip the last `--` from the buffer because we need it when the
831 | | // `check_comments` option is enabled. The XML standard requires that a comment
832 | | // does not end with the `--->` sequence because this is a special case of
833 | | // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
834 | 0 | return Some((&chunk[..i], i + 1)); // +1 for `>` |
835 | 0 | } |
836 | 0 | // End sequence `-|->` was split at |
837 | 0 | // buf --/ \-- chunk |
838 | 0 | if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { |
839 | 0 | return Some((&chunk[..i], i + 1)); // +1 for `>` |
840 | 0 | } |
841 | 0 | // End sequence `--|>` was split at |
842 | 0 | // buf --/ \-- chunk |
843 | 0 | if i == 0 && buf.ends_with(b"--") { |
844 | 0 | return Some((&[], i + 1)); // +1 for `>` |
845 | 0 | } |
846 | | } |
847 | 0 | Self::Comment => {} |
848 | | Self::CData => { |
849 | 0 | if chunk[..i].ends_with(b"]]") { |
850 | 0 | return Some((&chunk[..i], i + 1)); // +1 for `>` |
851 | 0 | } |
852 | 0 | // End sequence `]|]>` was split at |
853 | 0 | // buf --/ \-- chunk |
854 | 0 | if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' { |
855 | 0 | return Some((&chunk[..i], i + 1)); // +1 for `>` |
856 | 0 | } |
857 | 0 | // End sequence `]]|>` was split at |
858 | 0 | // buf --/ \-- chunk |
859 | 0 | if i == 0 && buf.ends_with(b"]]") { |
860 | 0 | return Some((&[], i + 1)); // +1 for `>` |
861 | 0 | } |
862 | | } |
863 | | Self::DocType => { |
864 | 13.1k | let content = &chunk[..i]; |
865 | 13.1k | let balance = memchr::memchr2_iter(b'<', b'>', content) |
866 | 13.1k | .map(|p| if content[p] == b'<' { 1i32 } else { -1 }) |
867 | 13.1k | .sum::<i32>(); |
868 | 13.1k | if balance == 0 { |
869 | 13.1k | return Some((content, i + 1)); // +1 for `>` |
870 | 0 | } |
871 | | } |
872 | | } |
873 | | } |
874 | 0 | None |
875 | 13.1k | } |
876 | | #[inline] |
877 | 0 | fn to_err(&self) -> Error { |
878 | 0 | let bang_str = match self { |
879 | 0 | Self::CData => "CData", |
880 | 0 | Self::Comment => "Comment", |
881 | 0 | Self::DocType => "DOCTYPE", |
882 | | }; |
883 | 0 | Error::UnexpectedEof(bang_str.to_string()) |
884 | 0 | } |
885 | | } |
886 | | |
887 | | /// State machine for the [`XmlSource::read_element`] |
888 | | #[derive(Clone, Copy)] |
889 | | enum ReadElementState { |
890 | | /// The initial state (inside element, but outside of attribute value) |
891 | | Elem, |
892 | | /// Inside a single-quoted attribute value |
893 | | SingleQ, |
894 | | /// Inside a double-quoted attribute value |
895 | | DoubleQ, |
896 | | } |
897 | | impl ReadElementState { |
898 | | /// Changes state by analyzing part of input. |
899 | | /// Returns a tuple with part of chunk up to element closing symbol `>` |
900 | | /// and a position after that symbol or `None` if such symbol was not found |
901 | | #[inline(always)] |
902 | 38.5M | fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { |
903 | 42.7M | for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) { |
904 | 42.7M | *self = match (*self, chunk[i]) { |
905 | | // only allowed to match `>` while we are in state `Elem` |
906 | 38.5M | (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)), |
907 | 0 | (Self::Elem, b'\'') => Self::SingleQ, |
908 | 2.06M | (Self::Elem, b'\"') => Self::DoubleQ, |
909 | | |
910 | | // the only end_byte that gets us out is the same character
911 | 2.06M | (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem, |
912 | | |
913 | | // all other bytes: no state change |
914 | 36.3k | _ => *self, |
915 | | }; |
916 | | } |
917 | 0 | None |
918 | 38.5M | } |
919 | | } |
920 | | |
921 | | /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) |
922 | | #[inline] |
923 | 236M | pub(crate) const fn is_whitespace(b: u8) -> bool { |
924 | 236M | matches!(b, b' ' | b'\r' | b'\n' | b'\t') |
925 | 236M | } |
926 | | |
927 | | //////////////////////////////////////////////////////////////////////////////////////////////////// |
928 | | |
929 | | #[cfg(test)] |
930 | | mod test { |
931 | | /// Checks the internal implementation of the various reader methods |
932 | | macro_rules! check { |
933 | | ( |
934 | | #[$test:meta] |
935 | | $read_event:ident, |
936 | | $read_until_close:ident, |
937 | | // constructor of the XML source on which internal functions will be called |
938 | | $source:path, |
939 | | // constructor of the buffer to which read data will be stored
940 | | $buf:expr |
941 | | $(, $async:ident, $await:ident)? |
942 | | ) => { |
943 | | mod read_bytes_until { |
944 | | use super::*; |
945 | | // Use Bytes for printing bytes as strings for ASCII range |
946 | | use crate::utils::Bytes; |
947 | | use pretty_assertions::assert_eq; |
948 | | |
949 | | /// Checks that search in the empty buffer returns `None` |
950 | | #[$test] |
951 | | $($async)? fn empty() { |
952 | | let buf = $buf; |
953 | | let mut position = 0; |
954 | | let mut input = b"".as_ref(); |
955 | | // ^= 0 |
956 | | |
957 | | assert_eq!( |
958 | | $source(&mut input) |
959 | | .read_bytes_until(b'*', buf, &mut position) |
960 | | $(.$await)? |
961 | | .unwrap() |
962 | | .map(Bytes), |
963 | | None |
964 | | ); |
965 | | assert_eq!(position, 0); |
966 | | } |
967 | | |
968 | | /// Checks that searching the buffer for a non-existent value returns the entire
969 | | /// buffer as the result and sets `position` to `len()`
970 | | #[$test] |
971 | | $($async)? fn non_existent() { |
972 | | let buf = $buf; |
973 | | let mut position = 0; |
974 | | let mut input = b"abcdef".as_ref(); |
975 | | // ^= 6 |
976 | | |
977 | | assert_eq!( |
978 | | $source(&mut input) |
979 | | .read_bytes_until(b'*', buf, &mut position) |
980 | | $(.$await)? |
981 | | .unwrap() |
982 | | .map(Bytes), |
983 | | Some(Bytes(b"abcdef")) |
984 | | ); |
985 | | assert_eq!(position, 6); |
986 | | } |
987 | | |
988 | | /// Checks that searching the buffer for a symbol located at the very start of the
989 | | /// buffer returns an empty slice as the result and sets `position` to one symbol
990 | | /// after the match (`1`)
991 | | #[$test] |
992 | | $($async)? fn at_the_start() { |
993 | | let buf = $buf; |
994 | | let mut position = 0; |
995 | | let mut input = b"*abcdef".as_ref(); |
996 | | // ^= 1 |
997 | | |
998 | | assert_eq!( |
999 | | $source(&mut input) |
1000 | | .read_bytes_until(b'*', buf, &mut position) |
1001 | | $(.$await)? |
1002 | | .unwrap() |
1003 | | .map(Bytes), |
1004 | | Some(Bytes(b"")) |
1005 | | ); |
1006 | | assert_eq!(position, 1); // position after the symbol matched |
1007 | | } |
1008 | | |
1009 | | /// Checks that searching the buffer for a symbol located in the middle of the
1010 | | /// buffer returns the slice before that symbol as the result and sets `position`
1011 | | /// to one symbol after the match
1012 | | #[$test] |
1013 | | $($async)? fn inside() { |
1014 | | let buf = $buf; |
1015 | | let mut position = 0; |
1016 | | let mut input = b"abc*def".as_ref(); |
1017 | | // ^= 4 |
1018 | | |
1019 | | assert_eq!( |
1020 | | $source(&mut input) |
1021 | | .read_bytes_until(b'*', buf, &mut position) |
1022 | | $(.$await)? |
1023 | | .unwrap() |
1024 | | .map(Bytes), |
1025 | | Some(Bytes(b"abc")) |
1026 | | ); |
1027 | | assert_eq!(position, 4); // position after the symbol matched |
1028 | | } |
1029 | | |
1030 | | /// Checks that searching the buffer for a symbol located at the end of the
1031 | | /// buffer returns the slice before that symbol as the result and sets `position`
1032 | | /// to one symbol after the match (`len()`)
1033 | | #[$test] |
1034 | | $($async)? fn in_the_end() { |
1035 | | let buf = $buf; |
1036 | | let mut position = 0; |
1037 | | let mut input = b"abcdef*".as_ref(); |
1038 | | // ^= 7 |
1039 | | |
1040 | | assert_eq!( |
1041 | | $source(&mut input) |
1042 | | .read_bytes_until(b'*', buf, &mut position) |
1043 | | $(.$await)? |
1044 | | .unwrap() |
1045 | | .map(Bytes), |
1046 | | Some(Bytes(b"abcdef")) |
1047 | | ); |
1048 | | assert_eq!(position, 7); // position after the symbol matched |
1049 | | } |
1050 | | } |
1051 | | |
1052 | | mod read_bang_element { |
1053 | | use super::*; |
1054 | | |
1055 | | /// Checks that reading CDATA content works correctly |
1056 | | mod cdata { |
1057 | | use super::*; |
1058 | | use crate::errors::Error; |
1059 | | use crate::reader::BangType; |
1060 | | use crate::utils::Bytes; |
1061 | | use pretty_assertions::assert_eq; |
1062 | | |
1063 | | /// Checks that if input begins like CDATA element, but CDATA start sequence |
1064 | | /// is not finished, parsing ends with an error |
1065 | | #[$test] |
1066 | | #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"] |
1067 | | $($async)? fn not_properly_start() { |
1068 | | let buf = $buf; |
1069 | | let mut position = 0; |
1070 | | let mut input = b"![]]>other content".as_ref(); |
1071 | | // ^= 0 |
1072 | | |
1073 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1074 | | Err(Error::UnexpectedEof(s)) if s == "CData" => {} |
1075 | | x => assert!( |
1076 | | false, |
1077 | | r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#, |
1078 | | x |
1079 | | ), |
1080 | | } |
1081 | | assert_eq!(position, 0); |
1082 | | } |
1083 | | |
1084 | | /// Checks that if CDATA startup sequence was matched, but an end sequence |
1085 | | /// is not found, parsing ends with an error |
1086 | | #[$test] |
1087 | | $($async)? fn not_closed() { |
1088 | | let buf = $buf; |
1089 | | let mut position = 0; |
1090 | | let mut input = b"![CDATA[other content".as_ref(); |
1091 | | // ^= 0 |
1092 | | |
1093 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1094 | | Err(Error::UnexpectedEof(s)) if s == "CData" => {} |
1095 | | x => assert!( |
1096 | | false, |
1097 | | r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#, |
1098 | | x |
1099 | | ), |
1100 | | } |
1101 | | assert_eq!(position, 0); |
1102 | | } |
1103 | | |
1104 | | /// Checks that CDATA element without content inside parsed successfully |
1105 | | #[$test] |
1106 | | $($async)? fn empty() { |
1107 | | let buf = $buf; |
1108 | | let mut position = 0; |
1109 | | let mut input = b"![CDATA[]]>other content".as_ref(); |
1110 | | // ^= 11 |
1111 | | |
1112 | | assert_eq!( |
1113 | | $source(&mut input) |
1114 | | .read_bang_element(buf, &mut position) |
1115 | | $(.$await)? |
1116 | | .unwrap() |
1117 | | .map(|(ty, data)| (ty, Bytes(data))), |
1118 | | Some((BangType::CData, Bytes(b"![CDATA[]]"))) |
1119 | | ); |
1120 | | assert_eq!(position, 11); |
1121 | | } |
1122 | | |
1123 | | /// Checks that CDATA element with content parsed successfully. |
1124 | | /// Additionally checks that sequences inside CDATA that may look like |
1125 | | /// a CDATA end sequence do not interrupt CDATA parsing |
1126 | | #[$test] |
1127 | | $($async)? fn with_content() { |
1128 | | let buf = $buf; |
1129 | | let mut position = 0; |
1130 | | let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); |
1131 | | // ^= 28 |
1132 | | |
1133 | | assert_eq!( |
1134 | | $source(&mut input) |
1135 | | .read_bang_element(buf, &mut position) |
1136 | | $(.$await)? |
1137 | | .unwrap() |
1138 | | .map(|(ty, data)| (ty, Bytes(data))), |
1139 | | Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))) |
1140 | | ); |
1141 | | assert_eq!(position, 28); |
1142 | | } |
1143 | | } |
1144 | | |
1145 | | /// Checks that reading XML comments works correctly. According to the [specification], |
1146 | | /// comment data can contain any sequence except `--`: |
1147 | | /// |
1148 | | /// ```peg |
1149 | | /// comment = '<--' (!'--' char)* '-->'; |
1150 | | /// char = [#x1-#x2C] |
1151 | | /// / [#x2E-#xD7FF] |
1152 | | /// / [#xE000-#xFFFD] |
1153 | | /// / [#x10000-#x10FFFF] |
1154 | | /// ``` |
1155 | | /// |
1156 | | /// The presence of this limitation, however, is simply a poorly designed specification
1157 | | /// (maybe for the purpose of building an LL(1) XML parser) and quick-xml does not check for
1158 | | /// the presence of these sequences by default. These tests allow such content.
1159 | | /// |
1160 | | /// [specification]: https://www.w3.org/TR/xml11/#dt-comment |
1161 | | mod comment { |
1162 | | use super::*; |
1163 | | use crate::errors::Error; |
1164 | | use crate::reader::BangType; |
1165 | | use crate::utils::Bytes; |
1166 | | use pretty_assertions::assert_eq; |
1167 | | |
1168 | | #[$test] |
1169 | | #[ignore = "start comment sequence fully checked outside of `read_bang_element`"] |
1170 | | $($async)? fn not_properly_start() { |
1171 | | let buf = $buf; |
1172 | | let mut position = 0; |
1173 | | let mut input = b"!- -->other content".as_ref(); |
1174 | | // ^= 0 |
1175 | | |
1176 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1177 | | Err(Error::UnexpectedEof(s)) if s == "Comment" => {} |
1178 | | x => assert!( |
1179 | | false, |
1180 | | r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#, |
1181 | | x |
1182 | | ), |
1183 | | } |
1184 | | assert_eq!(position, 0); |
1185 | | } |
1186 | | |
1187 | | #[$test] |
1188 | | $($async)? fn not_properly_end() { |
1189 | | let buf = $buf; |
1190 | | let mut position = 0; |
1191 | | let mut input = b"!->other content".as_ref(); |
1192 | | // ^= 0 |
1193 | | |
1194 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1195 | | Err(Error::UnexpectedEof(s)) if s == "Comment" => {} |
1196 | | x => assert!( |
1197 | | false, |
1198 | | r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#, |
1199 | | x |
1200 | | ), |
1201 | | } |
1202 | | assert_eq!(position, 0); |
1203 | | } |
1204 | | |
1205 | | #[$test] |
1206 | | $($async)? fn not_closed1() { |
1207 | | let buf = $buf; |
1208 | | let mut position = 0; |
1209 | | let mut input = b"!--other content".as_ref(); |
1210 | | // ^= 0 |
1211 | | |
1212 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1213 | | Err(Error::UnexpectedEof(s)) if s == "Comment" => {} |
1214 | | x => assert!( |
1215 | | false, |
1216 | | r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#, |
1217 | | x |
1218 | | ), |
1219 | | } |
1220 | | assert_eq!(position, 0); |
1221 | | } |
1222 | | |
1223 | | #[$test] |
1224 | | $($async)? fn not_closed2() { |
1225 | | let buf = $buf; |
1226 | | let mut position = 0; |
1227 | | let mut input = b"!-->other content".as_ref(); |
1228 | | // ^= 0 |
1229 | | |
1230 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1231 | | Err(Error::UnexpectedEof(s)) if s == "Comment" => {} |
1232 | | x => assert!( |
1233 | | false, |
1234 | | r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#, |
1235 | | x |
1236 | | ), |
1237 | | } |
1238 | | assert_eq!(position, 0); |
1239 | | } |
1240 | | |
1241 | | #[$test] |
1242 | | $($async)? fn not_closed3() { |
1243 | | let buf = $buf; |
1244 | | let mut position = 0; |
1245 | | let mut input = b"!--->other content".as_ref(); |
1246 | | // ^= 0 |
1247 | | |
1248 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1249 | | Err(Error::UnexpectedEof(s)) if s == "Comment" => {} |
1250 | | x => assert!( |
1251 | | false, |
1252 | | r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#, |
1253 | | x |
1254 | | ), |
1255 | | } |
1256 | | assert_eq!(position, 0); |
1257 | | } |
1258 | | |
1259 | | #[$test] |
1260 | | $($async)? fn empty() { |
1261 | | let buf = $buf; |
1262 | | let mut position = 0; |
1263 | | let mut input = b"!---->other content".as_ref(); |
1264 | | // ^= 6 |
1265 | | |
1266 | | assert_eq!( |
1267 | | $source(&mut input) |
1268 | | .read_bang_element(buf, &mut position) |
1269 | | $(.$await)? |
1270 | | .unwrap() |
1271 | | .map(|(ty, data)| (ty, Bytes(data))), |
1272 | | Some((BangType::Comment, Bytes(b"!----"))) |
1273 | | ); |
1274 | | assert_eq!(position, 6); |
1275 | | } |
1276 | | |
1277 | | #[$test] |
1278 | | $($async)? fn with_content() { |
1279 | | let buf = $buf; |
1280 | | let mut position = 0; |
1281 | | let mut input = b"!--->comment<--->other content".as_ref(); |
1282 | | // ^= 17 |
1283 | | |
1284 | | assert_eq!( |
1285 | | $source(&mut input) |
1286 | | .read_bang_element(buf, &mut position) |
1287 | | $(.$await)? |
1288 | | .unwrap() |
1289 | | .map(|(ty, data)| (ty, Bytes(data))), |
1290 | | Some((BangType::Comment, Bytes(b"!--->comment<---"))) |
1291 | | ); |
1292 | | assert_eq!(position, 17); |
1293 | | } |
1294 | | } |
1295 | | |
1296 | | /// Checks that reading DOCTYPE definition works correctly |
1297 | | mod doctype { |
1298 | | use super::*; |
1299 | | |
1300 | | mod uppercase { |
1301 | | use super::*; |
1302 | | use crate::errors::Error; |
1303 | | use crate::reader::BangType; |
1304 | | use crate::utils::Bytes; |
1305 | | use pretty_assertions::assert_eq; |
1306 | | |
1307 | | #[$test] |
1308 | | $($async)? fn not_properly_start() { |
1309 | | let buf = $buf; |
1310 | | let mut position = 0; |
1311 | | let mut input = b"!D other content".as_ref(); |
1312 | | // ^= 0 |
1313 | | |
1314 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1315 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1316 | | x => assert!( |
1317 | | false, |
1318 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1319 | | x |
1320 | | ), |
1321 | | } |
1322 | | assert_eq!(position, 0); |
1323 | | } |
1324 | | |
1325 | | #[$test] |
1326 | | $($async)? fn without_space() { |
1327 | | let buf = $buf; |
1328 | | let mut position = 0; |
1329 | | let mut input = b"!DOCTYPEother content".as_ref(); |
1330 | | // ^= 0 |
1331 | | |
1332 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1333 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1334 | | x => assert!( |
1335 | | false, |
1336 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1337 | | x |
1338 | | ), |
1339 | | } |
1340 | | assert_eq!(position, 0); |
1341 | | } |
1342 | | |
1343 | | #[$test] |
1344 | | $($async)? fn empty() { |
1345 | | let buf = $buf; |
1346 | | let mut position = 0; |
1347 | | let mut input = b"!DOCTYPE>other content".as_ref(); |
1348 | | // ^= 9 |
1349 | | |
1350 | | assert_eq!( |
1351 | | $source(&mut input) |
1352 | | .read_bang_element(buf, &mut position) |
1353 | | $(.$await)? |
1354 | | .unwrap() |
1355 | | .map(|(ty, data)| (ty, Bytes(data))), |
1356 | | Some((BangType::DocType, Bytes(b"!DOCTYPE"))) |
1357 | | ); |
1358 | | assert_eq!(position, 9); |
1359 | | } |
1360 | | |
1361 | | #[$test] |
1362 | | $($async)? fn not_closed() { |
1363 | | let buf = $buf; |
1364 | | let mut position = 0; |
1365 | | let mut input = b"!DOCTYPE other content".as_ref(); |
1366 | | // ^= 0 |
1367 | | |
1368 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1369 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1370 | | x => assert!( |
1371 | | false, |
1372 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1373 | | x |
1374 | | ), |
1375 | | } |
1376 | | assert_eq!(position, 0); |
1377 | | } |
1378 | | } |
1379 | | |
1380 | | mod lowercase { |
1381 | | use super::*; |
1382 | | use crate::errors::Error; |
1383 | | use crate::reader::BangType; |
1384 | | use crate::utils::Bytes; |
1385 | | use pretty_assertions::assert_eq; |
1386 | | |
1387 | | #[$test] |
1388 | | $($async)? fn not_properly_start() { |
1389 | | let buf = $buf; |
1390 | | let mut position = 0; |
1391 | | let mut input = b"!d other content".as_ref(); |
1392 | | // ^= 0 |
1393 | | |
1394 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1395 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1396 | | x => assert!( |
1397 | | false, |
1398 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1399 | | x |
1400 | | ), |
1401 | | } |
1402 | | assert_eq!(position, 0); |
1403 | | } |
1404 | | |
1405 | | #[$test] |
1406 | | $($async)? fn without_space() { |
1407 | | let buf = $buf; |
1408 | | let mut position = 0; |
1409 | | let mut input = b"!doctypeother content".as_ref(); |
1410 | | // ^= 0 |
1411 | | |
1412 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1413 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1414 | | x => assert!( |
1415 | | false, |
1416 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1417 | | x |
1418 | | ), |
1419 | | } |
1420 | | assert_eq!(position, 0); |
1421 | | } |
1422 | | |
1423 | | #[$test] |
1424 | | $($async)? fn empty() { |
1425 | | let buf = $buf; |
1426 | | let mut position = 0; |
1427 | | let mut input = b"!doctype>other content".as_ref(); |
1428 | | // ^= 9 |
1429 | | |
1430 | | assert_eq!( |
1431 | | $source(&mut input) |
1432 | | .read_bang_element(buf, &mut position) |
1433 | | $(.$await)? |
1434 | | .unwrap() |
1435 | | .map(|(ty, data)| (ty, Bytes(data))), |
1436 | | Some((BangType::DocType, Bytes(b"!doctype"))) |
1437 | | ); |
1438 | | assert_eq!(position, 9); |
1439 | | } |
1440 | | |
1441 | | #[$test] |
1442 | | $($async)? fn not_closed() { |
1443 | | let buf = $buf; |
1444 | | let mut position = 0; |
1445 | | let mut input = b"!doctype other content".as_ref(); |
1446 | | // ^= 0 |
1447 | | |
1448 | | match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { |
1449 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1450 | | x => assert!( |
1451 | | false, |
1452 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1453 | | x |
1454 | | ), |
1455 | | } |
1456 | | assert_eq!(position, 0); |
1457 | | } |
1458 | | } |
1459 | | } |
1460 | | } |
1461 | | |
1462 | | mod read_element { |
1463 | | use super::*; |
1464 | | use crate::utils::Bytes; |
1465 | | use pretty_assertions::assert_eq; |
1466 | | |
1467 | | /// Checks that nothing was read from empty buffer |
1468 | | #[$test] |
1469 | | $($async)? fn empty() { |
1470 | | let buf = $buf; |
1471 | | let mut position = 0; |
1472 | | let mut input = b"".as_ref(); |
1473 | | // ^= 0 |
1474 | | |
1475 | | assert_eq!( |
1476 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1477 | | None |
1478 | | ); |
1479 | | assert_eq!(position, 0); |
1480 | | } |
1481 | | |
1482 | | mod open { |
1483 | | use super::*; |
1484 | | use crate::utils::Bytes; |
1485 | | use pretty_assertions::assert_eq; |
1486 | | |
1487 | | #[$test] |
1488 | | $($async)? fn empty_tag() { |
1489 | | let buf = $buf; |
1490 | | let mut position = 0; |
1491 | | let mut input = b">".as_ref(); |
1492 | | // ^= 1 |
1493 | | |
1494 | | assert_eq!( |
1495 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1496 | | Some(Bytes(b"")) |
1497 | | ); |
1498 | | assert_eq!(position, 1); |
1499 | | } |
1500 | | |
1501 | | #[$test] |
1502 | | $($async)? fn normal() { |
1503 | | let buf = $buf; |
1504 | | let mut position = 0; |
1505 | | let mut input = b"tag>".as_ref(); |
1506 | | // ^= 4 |
1507 | | |
1508 | | assert_eq!( |
1509 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1510 | | Some(Bytes(b"tag")) |
1511 | | ); |
1512 | | assert_eq!(position, 4); |
1513 | | } |
1514 | | |
1515 | | #[$test] |
1516 | | $($async)? fn empty_ns_empty_tag() { |
1517 | | let buf = $buf; |
1518 | | let mut position = 0; |
1519 | | let mut input = b":>".as_ref(); |
1520 | | // ^= 2 |
1521 | | |
1522 | | assert_eq!( |
1523 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1524 | | Some(Bytes(b":")) |
1525 | | ); |
1526 | | assert_eq!(position, 2); |
1527 | | } |
1528 | | |
1529 | | #[$test] |
1530 | | $($async)? fn empty_ns() { |
1531 | | let buf = $buf; |
1532 | | let mut position = 0; |
1533 | | let mut input = b":tag>".as_ref(); |
1534 | | // ^= 5 |
1535 | | |
1536 | | assert_eq!( |
1537 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1538 | | Some(Bytes(b":tag")) |
1539 | | ); |
1540 | | assert_eq!(position, 5); |
1541 | | } |
1542 | | |
1543 | | #[$test] |
1544 | | $($async)? fn with_attributes() { |
1545 | | let buf = $buf; |
1546 | | let mut position = 0; |
1547 | | let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); |
1548 | | // ^= 38 |
1549 | | |
1550 | | assert_eq!( |
1551 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1552 | | Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)) |
1553 | | ); |
1554 | | assert_eq!(position, 38); |
1555 | | } |
1556 | | } |
1557 | | |
1558 | | mod self_closed { |
1559 | | use super::*; |
1560 | | use crate::utils::Bytes; |
1561 | | use pretty_assertions::assert_eq; |
1562 | | |
1563 | | #[$test] |
1564 | | $($async)? fn empty_tag() { |
1565 | | let buf = $buf; |
1566 | | let mut position = 0; |
1567 | | let mut input = b"/>".as_ref(); |
1568 | | // ^= 2 |
1569 | | |
1570 | | assert_eq!( |
1571 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1572 | | Some(Bytes(b"/")) |
1573 | | ); |
1574 | | assert_eq!(position, 2); |
1575 | | } |
1576 | | |
1577 | | #[$test] |
1578 | | $($async)? fn normal() { |
1579 | | let buf = $buf; |
1580 | | let mut position = 0; |
1581 | | let mut input = b"tag/>".as_ref(); |
1582 | | // ^= 5 |
1583 | | |
1584 | | assert_eq!( |
1585 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1586 | | Some(Bytes(b"tag/")) |
1587 | | ); |
1588 | | assert_eq!(position, 5); |
1589 | | } |
1590 | | |
1591 | | #[$test] |
1592 | | $($async)? fn empty_ns_empty_tag() { |
1593 | | let buf = $buf; |
1594 | | let mut position = 0; |
1595 | | let mut input = b":/>".as_ref(); |
1596 | | // ^= 3 |
1597 | | |
1598 | | assert_eq!( |
1599 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1600 | | Some(Bytes(b":/")) |
1601 | | ); |
1602 | | assert_eq!(position, 3); |
1603 | | } |
1604 | | |
1605 | | #[$test] |
1606 | | $($async)? fn empty_ns() { |
1607 | | let buf = $buf; |
1608 | | let mut position = 0; |
1609 | | let mut input = b":tag/>".as_ref(); |
1610 | | // ^= 6 |
1611 | | |
1612 | | assert_eq!( |
1613 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1614 | | Some(Bytes(b":tag/")) |
1615 | | ); |
1616 | | assert_eq!(position, 6); |
1617 | | } |
1618 | | |
1619 | | #[$test] |
1620 | | $($async)? fn with_attributes() { |
1621 | | let buf = $buf; |
1622 | | let mut position = 0; |
1623 | | let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); |
1624 | | // ^= 41 |
1625 | | |
1626 | | assert_eq!( |
1627 | | $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes), |
1628 | | Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)) |
1629 | | ); |
1630 | | assert_eq!(position, 41); |
1631 | | } |
1632 | | } |
1633 | | } |
1634 | | |
1635 | | mod issue_344 { |
1636 | | use crate::errors::Error; |
1637 | | use crate::reader::Reader; |
1638 | | |
1639 | | #[$test] |
1640 | | $($async)? fn cdata() { |
1641 | | // `![]]>` must be rejected with `UnexpectedEof("CData")`; any other result fails the test |
1642 | | let mut reader = Reader::from_str("![]]>"); |
1643 | | |
1644 | | match reader.$read_until_close($buf) $(.$await)? { |
1645 | | Err(Error::UnexpectedEof(s)) if s == "CData" => {} |
1646 | | x => panic!( |
1647 | | r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#, |
1648 | | x |
1649 | | ), |
1650 | | } |
1651 | | } |
1652 | | |
1653 | | #[$test] |
1654 | | $($async)? fn comment() { |
1655 | | // `!- -->` must be rejected with `UnexpectedEof("Comment")`; any other result fails the test |
1656 | | let mut reader = Reader::from_str("!- -->"); |
1657 | | |
1658 | | match reader.$read_until_close($buf) $(.$await)? { |
1659 | | Err(Error::UnexpectedEof(s)) if s == "Comment" => {} |
1660 | | x => panic!( |
1661 | | r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#, |
1662 | | x |
1663 | | ), |
1664 | | } |
1665 | | } |
1666 | | |
1667 | | #[$test] |
1668 | | $($async)? fn doctype_uppercase() { |
1669 | | // `!D>` must be rejected with `UnexpectedEof("DOCTYPE")`; any other result fails the test |
1670 | | let mut reader = Reader::from_str("!D>"); |
1671 | | |
1672 | | match reader.$read_until_close($buf) $(.$await)? { |
1673 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1674 | | x => panic!( |
1675 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1676 | | x |
1677 | | ), |
1678 | | } |
1679 | | } |
1680 | | |
1681 | | #[$test] |
1682 | | $($async)? fn doctype_lowercase() { |
1683 | | // `!d>` (lowercase) must be rejected with the same `UnexpectedEof("DOCTYPE")` error |
1684 | | let mut reader = Reader::from_str("!d>"); |
1685 | | |
1686 | | match reader.$read_until_close($buf) $(.$await)? { |
1687 | | Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} |
1688 | | x => panic!( |
1689 | | r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#, |
1690 | | x |
1691 | | ), |
1692 | | } |
1693 | | } |
1694 | | } |
1695 | | |
1696 | | /// Ensures that no empty `Text` events are generated: each test checks the first event(s) returned for a minimal input |
1697 | | mod $read_event { |
1698 | | use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; |
1699 | | use crate::reader::Reader; |
1700 | | use pretty_assertions::assert_eq; |
1701 | | |
1702 | | /// When the `encoding` feature is enabled, the encoding should be detected |
1703 | | /// from the BOM (UTF-8) and the BOM should be stripped. |
1704 | | /// |
1705 | | /// When the `encoding` feature is disabled, UTF-8 is assumed and the BOM |
1706 | | /// character should be stripped for consistency. Only the leading BOM is removed: the second one is returned as text. |
1707 | | #[$test] |
1708 | | $($async)? fn bom_from_reader() { |
1709 | | let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes()); |
1710 | | |
1711 | | assert_eq!( |
1712 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1713 | | Event::Text(BytesText::from_escaped("\u{feff}")) |
1714 | | ); |
1715 | | |
1716 | | assert_eq!( |
1717 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1718 | | Event::Eof |
1719 | | ); |
1720 | | } |
1721 | | |
1722 | | /// When parsing from `&str`, the encoding is fixed (UTF-8), so |
1723 | | /// - when the `encoding` feature is disabled, the behavior is the |
1724 | | /// same as in the `bom_from_reader` test |
1725 | | /// - when the `encoding` feature is enabled, the behavior should |
1726 | | /// stay consistent, so the first BOM character is stripped |
1727 | | #[$test] |
1728 | | $($async)? fn bom_from_str() { |
1729 | | let mut reader = Reader::from_str("\u{feff}\u{feff}"); |
1730 | | |
1731 | | assert_eq!( |
1732 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1733 | | Event::Text(BytesText::from_escaped("\u{feff}")) |
1734 | | ); |
1735 | | |
1736 | | assert_eq!( |
1737 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1738 | | Event::Eof |
1739 | | ); |
1740 | | } |
1741 | | |
1742 | | #[$test] |
1743 | | $($async)? fn declaration() { |
1744 | | let mut reader = Reader::from_str("<?xml ?>"); |
1745 | | |
1746 | | assert_eq!( |
1747 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1748 | | Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3))) |
1749 | | ); |
1750 | | } |
1751 | | |
1752 | | #[$test] |
1753 | | $($async)? fn doctype() { |
1754 | | let mut reader = Reader::from_str("<!DOCTYPE x>"); |
1755 | | |
1756 | | assert_eq!( |
1757 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1758 | | Event::DocType(BytesText::from_escaped("x")) |
1759 | | ); |
1760 | | } |
1761 | | |
1762 | | #[$test] |
1763 | | $($async)? fn processing_instruction() { |
1764 | | let mut reader = Reader::from_str("<?xml-stylesheet?>"); |
1765 | | |
1766 | | assert_eq!( |
1767 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1768 | | Event::PI(BytesText::from_escaped("xml-stylesheet")) |
1769 | | ); |
1770 | | } |
1771 | | |
1772 | | #[$test] |
1773 | | $($async)? fn start() { |
1774 | | let mut reader = Reader::from_str("<tag>"); |
1775 | | |
1776 | | assert_eq!( |
1777 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1778 | | Event::Start(BytesStart::new("tag")) |
1779 | | ); |
1780 | | } |
1781 | | |
1782 | | #[$test] |
1783 | | $($async)? fn end() { |
1784 | | let mut reader = Reader::from_str("</tag>"); |
1785 | | // Because the input is invalid XML (an end tag with no start tag), |
1786 | | // do not check that the end name is paired with a start name |
1787 | | reader.check_end_names(false); |
1788 | | |
1789 | | assert_eq!( |
1790 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1791 | | Event::End(BytesEnd::new("tag")) |
1792 | | ); |
1793 | | } |
1794 | | |
1795 | | #[$test] |
1796 | | $($async)? fn empty() { |
1797 | | let mut reader = Reader::from_str("<tag/>"); |
1798 | | |
1799 | | assert_eq!( |
1800 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1801 | | Event::Empty(BytesStart::new("tag")) |
1802 | | ); |
1803 | | } |
1804 | | |
1805 | | #[$test] |
1806 | | $($async)? fn text() { |
1807 | | let mut reader = Reader::from_str("text"); |
1808 | | |
1809 | | assert_eq!( |
1810 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1811 | | Event::Text(BytesText::from_escaped("text")) |
1812 | | ); |
1813 | | } |
1814 | | |
1815 | | #[$test] |
1816 | | $($async)? fn cdata() { |
1817 | | let mut reader = Reader::from_str("<![CDATA[]]>"); |
1818 | | |
1819 | | assert_eq!( |
1820 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1821 | | Event::CData(BytesCData::new("")) |
1822 | | ); |
1823 | | } |
1824 | | |
1825 | | #[$test] |
1826 | | $($async)? fn comment() { |
1827 | | let mut reader = Reader::from_str("<!---->"); |
1828 | | |
1829 | | assert_eq!( |
1830 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1831 | | Event::Comment(BytesText::from_escaped("")) |
1832 | | ); |
1833 | | } |
1834 | | |
1835 | | #[$test] |
1836 | | $($async)? fn eof() { |
1837 | | let mut reader = Reader::from_str(""); |
1838 | | |
1839 | | assert_eq!( |
1840 | | reader.$read_event($buf) $(.$await)? .unwrap(), |
1841 | | Event::Eof |
1842 | | ); |
1843 | | } |
1844 | | } |
1845 | | }; |
1846 | | } |
1847 | | |
1848 | | /// Tests for https://github.com/tafia/quick-xml/issues/469: each test sizes the `$BufReader` buffer so that it ends inside the event's closing sequence |
1849 | | macro_rules! small_buffers { |
1850 | | ( |
1851 | | #[$test:meta] |
1852 | | $read_event:ident: $BufReader:ty |
1853 | | $(, $async:ident, $await:ident)? |
1854 | | ) => { |
1855 | | mod small_buffers { |
1856 | | use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event}; |
1857 | | use crate::reader::Reader; |
1858 | | use pretty_assertions::assert_eq; |
1859 | | |
1860 | | #[$test] |
1861 | | $($async)? fn decl() { |
1862 | | let xml = "<?xml ?>"; |
1863 | | // ^^^^^^^ data that fits into the buffer (up to the first byte of `?>`) |
1864 | | let size = xml.find("?>").unwrap() + 1; |
1865 | | let br = <$BufReader>::with_capacity(size, xml.as_bytes()); |
1866 | | let mut reader = Reader::from_reader(br); |
1867 | | let mut buf = Vec::new(); |
1868 | | |
1869 | | assert_eq!( |
1870 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1871 | | Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3))) |
1872 | | ); |
1873 | | assert_eq!( |
1874 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1875 | | Event::Eof |
1876 | | ); |
1877 | | } |
1878 | | |
1879 | | #[$test] |
1880 | | $($async)? fn pi() { |
1881 | | let xml = "<?pi?>"; |
1882 | | // ^^^^^ data that fits into the buffer (up to the first byte of `?>`) |
1883 | | let size = xml.find("?>").unwrap() + 1; |
1884 | | let br = <$BufReader>::with_capacity(size, xml.as_bytes()); |
1885 | | let mut reader = Reader::from_reader(br); |
1886 | | let mut buf = Vec::new(); |
1887 | | |
1888 | | assert_eq!( |
1889 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1890 | | Event::PI(BytesText::new("pi")) |
1891 | | ); |
1892 | | assert_eq!( |
1893 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1894 | | Event::Eof |
1895 | | ); |
1896 | | } |
1897 | | |
1898 | | #[$test] |
1899 | | $($async)? fn empty() { |
1900 | | let xml = "<empty/>"; |
1901 | | // ^^^^^^^ data that fits into the buffer (up to the first byte of `/>`) |
1902 | | let size = xml.find("/>").unwrap() + 1; |
1903 | | let br = <$BufReader>::with_capacity(size, xml.as_bytes()); |
1904 | | let mut reader = Reader::from_reader(br); |
1905 | | let mut buf = Vec::new(); |
1906 | | |
1907 | | assert_eq!( |
1908 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1909 | | Event::Empty(BytesStart::new("empty")) |
1910 | | ); |
1911 | | assert_eq!( |
1912 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1913 | | Event::Eof |
1914 | | ); |
1915 | | } |
1916 | | |
1917 | | #[$test] |
1918 | | $($async)? fn cdata1() { |
1919 | | let xml = "<![CDATA[cdata]]>"; |
1920 | | // ^^^^^^^^^^^^^^^ data that fits into the buffer (up to the first byte of `]]>`) |
1921 | | let size = xml.find("]]>").unwrap() + 1; |
1922 | | let br = <$BufReader>::with_capacity(size, xml.as_bytes()); |
1923 | | let mut reader = Reader::from_reader(br); |
1924 | | let mut buf = Vec::new(); |
1925 | | |
1926 | | assert_eq!( |
1927 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1928 | | Event::CData(BytesCData::new("cdata")) |
1929 | | ); |
1930 | | assert_eq!( |
1931 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1932 | | Event::Eof |
1933 | | ); |
1934 | | } |
1935 | | |
1936 | | #[$test] |
1937 | | $($async)? fn cdata2() { |
1938 | | let xml = "<![CDATA[cdata]]>"; |
1939 | | // ^^^^^^^^^^^^^^^^ data that fits into the buffer (up to the second byte of `]]>`) |
1940 | | let size = xml.find("]]>").unwrap() + 2; |
1941 | | let br = <$BufReader>::with_capacity(size, xml.as_bytes()); |
1942 | | let mut reader = Reader::from_reader(br); |
1943 | | let mut buf = Vec::new(); |
1944 | | |
1945 | | assert_eq!( |
1946 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1947 | | Event::CData(BytesCData::new("cdata")) |
1948 | | ); |
1949 | | assert_eq!( |
1950 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1951 | | Event::Eof |
1952 | | ); |
1953 | | } |
1954 | | |
1955 | | #[$test] |
1956 | | $($async)? fn comment1() { |
1957 | | let xml = "<!--comment-->"; |
1958 | | // ^^^^^^^^^^^^ data that fits into the buffer (up to the first byte of `-->`) |
1959 | | let size = xml.find("-->").unwrap() + 1; |
1960 | | let br = <$BufReader>::with_capacity(size, xml.as_bytes()); |
1961 | | let mut reader = Reader::from_reader(br); |
1962 | | let mut buf = Vec::new(); |
1963 | | |
1964 | | assert_eq!( |
1965 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1966 | | Event::Comment(BytesText::new("comment")) |
1967 | | ); |
1968 | | assert_eq!( |
1969 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1970 | | Event::Eof |
1971 | | ); |
1972 | | } |
1973 | | |
1974 | | #[$test] |
1975 | | $($async)? fn comment2() { |
1976 | | let xml = "<!--comment-->"; |
1977 | | // ^^^^^^^^^^^^^ data that fits into the buffer (up to the second byte of `-->`) |
1978 | | let size = xml.find("-->").unwrap() + 2; |
1979 | | let br = <$BufReader>::with_capacity(size, xml.as_bytes()); |
1980 | | let mut reader = Reader::from_reader(br); |
1981 | | let mut buf = Vec::new(); |
1982 | | |
1983 | | assert_eq!( |
1984 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1985 | | Event::Comment(BytesText::new("comment")) |
1986 | | ); |
1987 | | assert_eq!( |
1988 | | reader.$read_event(&mut buf) $(.$await)? .unwrap(), |
1989 | | Event::Eof |
1990 | | ); |
1991 | | } |
1992 | | } |
1993 | | }; |
1994 | | } |
1995 | | |
1996 | | // Export macros for the child modules: |
1997 | | // - buffered_reader |
1998 | | // - slice_reader |
1999 | | pub(super) use check; |
2000 | | pub(super) use small_buffers; |
2001 | | } |