/rust/registry/src/index.crates.io-6f17d22bba15001f/quick-xml-0.29.0/src/reader/parser.rs

Source (jump to first uncovered line)
#[cfg(feature = "encoding")]
use encoding_rs::UTF_8;

use crate::encoding::Decoder;
use crate::errors::{Error, Result};
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
#[cfg(feature = "encoding")]
use crate::reader::EncodingRef;
use crate::reader::{is_whitespace, BangType, ParseState};

use memchr;

/// A struct that holds the current parse state and the parser configuration.
/// It is independent of the way data is read: the reader feeds data into it and
/// gets back the produced [`Event`]s.
#[derive(Clone)]
pub(super) struct Parser {
    /// Number of bytes read from the source of data since the parser was created
    pub offset: usize,
    /// Defines how to process the next byte
    pub state: ParseState,
    /// Expand an empty element into an opening and a closing element
    pub expand_empty_elements: bool,
    /// Trims leading whitespace in Text events, skip the element if text is empty
    pub trim_text_start: bool,
    /// Trims trailing whitespace in Text events.
    pub trim_text_end: bool,
    /// Trims trailing whitespace from markup names in closing tags `</a >`
    pub trim_markup_names_in_closing_tags: bool,
    /// Check if [`Event::End`] nodes match the last [`Event::Start`] node
    pub check_end_names: bool,
    /// Check if comments contain `--` (false by default)
    pub check_comments: bool,
    /// Names of all currently started elements which do not have a matching
    /// end element yet, stored back to back.
    ///
    /// For an XML
    ///
    /// ```xml
    /// <root><one/><inner attr="value">|<tag></inner></root>
    /// ```
    /// when the cursor is at the `|` position the buffer contains:
    ///
    /// ```text
    /// rootinner
    /// ^   ^
    /// ```
    ///
    /// The `^` symbols show which positions are stored in [`Self::opened_starts`]
    /// (0 and 4 in that case).
    opened_buffer: Vec<u8>,
    /// Start indexes of the opened names inside [`Self::opened_buffer`]. See the
    /// documentation of that field for details
    opened_starts: Vec<usize>,

    #[cfg(feature = "encoding")]
    /// Reference to the encoding used to read an XML
    pub encoding: EncodingRef,
}

impl Parser {
    /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
    ///
    /// When `self.trim_text_end` is set, trailing whitespace (as classified by
    /// `is_whitespace`) is removed. Text that consists only of whitespace is
    /// trimmed down to an empty slice.
    ///
    /// # Parameters
    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
    ///
    /// [`Text`]: Event::Text
    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
        let mut content = bytes;

        if self.trim_text_end {
            // Keep everything up to and including the last non-whitespace byte.
            // If there is no such byte the text is whitespace only, so nothing
            // is kept (previously the whole slice was returned untrimmed).
            let len = bytes
                .iter()
                .rposition(|&b| !is_whitespace(b))
                .map_or(0, |p| p + 1);
            content = &bytes[..len];
        }

        Ok(Event::Text(BytesText::wrap(content, self.decoder())))
    }

    /// Converts markup that starts with a `!` into a `Comment`, `CData` or
    /// `DocType` event.
    ///
    /// # Parameters
    /// - `bang_type`: the kind of markup the caller has detected
    /// - `buf`: the markup content beginning with `!`. For comments and CDATA
    ///   it is expected to end with `--` and `]]` respectively (checked with
    ///   `debug_assert!`); those two trailing bytes are not part of the event.
    ///
    /// # Errors
    /// - [`Error::UnexpectedToken`] if `self.check_comments` is set and a `-`
    ///   inside the comment body is immediately followed by another `-`
    ///   (this includes a body that ends with `-`, right before the closing `--`)
    /// - [`Error::EmptyDocType`] if only whitespace follows `!DOCTYPE`
    /// - `bang_type.to_err()` if `buf` does not start with the prefix that
    ///   `bang_type` requires (`!--`, `![CDATA[` or `!DOCTYPE`; the latter two
    ///   are matched ASCII case-insensitively)
    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
        };

        let len = buf.len();
        match bang_type {
            BangType::Comment if buf.starts_with(b"!--") => {
                debug_assert!(buf.ends_with(b"--"));
                if self.check_comments {
                    // Look for a `-` in the comment body that is directly followed
                    // by another `-`. `find` yields the byte offset of that dash
                    // inside the body; `position` would only yield its ordinal
                    // among the dashes found.
                    if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
                        .find(|&p| buf[3 + p + 1] == b'-')
                    {
                        // Rewind the offset so that it points to the offending `--`,
                        // i.e. to `buf[3 + p]`. Never subtracts more than the `len`
                        // that the other error paths of this impl subtract.
                        // NOTE(review): assumes `self.offset` is already positioned
                        // just past the `>` that follows `buf` - confirm in the reader.
                        self.offset -= len - (2 + p);
                        return Err(Error::UnexpectedToken("--".to_string()));
                    }
                }
                // Strip the leading `!--` and the trailing `--`
                Ok(Event::Comment(BytesText::wrap(
                    &buf[3..len - 2],
                    self.decoder(),
                )))
            }
            BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
                debug_assert!(buf.ends_with(b"]]"));
                // Strip the leading `![CDATA[` and the trailing `]]`
                Ok(Event::CData(BytesCData::wrap(
                    &buf[8..len - 2],
                    self.decoder(),
                )))
            }
            BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
                // Number of whitespace bytes between `!DOCTYPE` and the first
                // meaningful byte; equals the rest of the buffer if there is none
                let start = buf[8..]
                    .iter()
                    .position(|b| !is_whitespace(*b))
                    .unwrap_or(len - 8);
                if start + 8 >= len {
                    return Err(Error::EmptyDocType);
                }
                Ok(Event::DocType(BytesText::wrap(
                    &buf[8 + start..],
                    self.decoder(),
                )))
            }
            _ => Err(bang_type.to_err()),
        }
    }

    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
    /// end name matches the last opened start name if `self.check_end_names` is set.
    ///
    /// The innermost opened name is always removed from the internal stack, even
    /// when the names do not match, so that parsing can continue after an error.
    ///
    /// # Parameters
    /// - `buf`: content of the closing tag; its first byte is skipped and the
    ///   rest is treated as the name, optionally followed by whitespace
    ///
    /// # Errors
    /// [`Error::EndEventMismatch`] if `self.check_end_names` is set and the name
    /// differs from the innermost opened name, or if no element is opened at all
    /// (`expected` is empty in that case). `self.offset` is decreased by
    /// `buf.len()` before the error is returned.
    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
        // Everything after the first byte is the name, possibly padded with whitespace
        let content = &buf[1..];

        // XML standard permits whitespaces after the markup name in closing tags.
        // Let's strip them from the buffer before comparing tag names.
        // The crate's `is_whitespace` is used like in the other methods here,
        // rather than `u8::is_ascii_whitespace`, which also accepts form feed.
        let name = if self.trim_markup_names_in_closing_tags {
            match content.iter().rposition(|&b| !is_whitespace(b)) {
                Some(pos_end_name) => &content[..=pos_end_name],
                None => content,
            }
        } else {
            content
        };

        let decoder = self.decoder();
        let mismatch_err = |expected: String, found: &[u8], offset: &mut usize| {
            // Roll the position back over the content of the offending tag
            *offset -= buf.len();
            Err(Error::EndEventMismatch {
                expected,
                found: decoder.decode(found).unwrap_or_default().into_owned(),
            })
        };

        // Get the index in self.opened_buffer of the name of the last opened tag
        match self.opened_starts.pop() {
            Some(start) => {
                if self.check_end_names {
                    let expected = &self.opened_buffer[start..];
                    if name != expected {
                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();
                        // #513: In order to allow error recovery we should drop content of the buffer
                        self.opened_buffer.truncate(start);

                        return mismatch_err(expected, name, &mut self.offset);
                    }
                }

                self.opened_buffer.truncate(start);
            }
            None => {
                if self.check_end_names {
                    // Report the same (trimmed) name that the mismatch branch reports
                    return mismatch_err("".to_string(), name, &mut self.offset);
                }
            }
        }

        Ok(Event::End(BytesEnd::wrap(name.into())))
    }

    /// Converts markup that starts with a `?` into a `Decl` or a `PI` event.
    ///
    /// The content is reported as an XML declaration when it starts with `?xml`
    /// followed by a whitespace byte, and as a processing instruction otherwise.
    /// With the `encoding` feature enabled, an encoding found in the declaration
    /// replaces the current one if `self.encoding.can_be_refined()` allows that.
    ///
    /// # Parameters
    /// - `buf`: the markup content; the first and the last byte (the last one
    ///   must be `?`) are not part of the event
    ///
    /// # Errors
    /// [`Error::UnexpectedEof`] if `buf` is not longer than 2 bytes or does not
    /// end with `?`. `self.offset` is decreased by `buf.len()` in that case.
    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
        let len = buf.len();
        if len > 2 && buf[len - 1] == b'?' {
            if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
                // The name of the wrapped start tag is `xml`, i.e. 3 bytes long
                let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));

                // Try getting encoding from the declaration event
                #[cfg(feature = "encoding")]
                if self.encoding.can_be_refined() {
                    if let Some(encoding) = event.encoder() {
                        self.encoding = EncodingRef::XmlDetected(encoding);
                    }
                }

                Ok(Event::Decl(event))
            } else {
                Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder())))
            }
        } else {
            self.offset -= len;
            Err(Error::UnexpectedEof("XmlDecl".to_string()))
        }
    }

    /// Converts content of a tag to a `Start` or an `Empty` event
    ///
    /// The name of the element ends at the first whitespace byte or at the end
    /// of `content`. Content that ends with `/` denotes a self-closed tag: it is
    /// reported as `Empty`, or, if `self.expand_empty_elements` is set, as
    /// `Start` with the state switched to [`ParseState::Empty`]. The name of
    /// every element reported as `Start` is remembered for the end tag check.
    ///
    /// # Parameters
    /// - `content`: Content of a tag between `<` and `>`
    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
        let len = content.len();
        let name_end = content
            .iter()
            .position(|&b| is_whitespace(b))
            .unwrap_or(len);
        if let Some(&b'/') = content.last() {
            // This is self-closed tag `<something/>`
            // Without whitespace the name runs up to, but not including, the `/`
            let name_len = if name_end < len { name_end } else { len - 1 };
            let event = BytesStart::wrap(&content[..len - 1], name_len);

            if self.expand_empty_elements {
                self.state = ParseState::Empty;
                self.opened_starts.push(self.opened_buffer.len());
                self.opened_buffer.extend(&content[..name_len]);
                Ok(Event::Start(event))
            } else {
                Ok(Event::Empty(event))
            }
        } else {
            // #514: Always store names event when .check_end_names == false,
            // because checks can be temporary disabled and when they would be
            // enabled, we should have that information
            self.opened_starts.push(self.opened_buffer.len());
            self.opened_buffer.extend(&content[..name_end]);
            Ok(Event::Start(BytesStart::wrap(content, name_end)))
        }
    }

    /// Produces the `End` event for the most recently remembered element name
    /// and switches the state to [`ParseState::ClosedTag`].
    ///
    /// The name is removed from the internal buffer and moved into the event.
    ///
    /// # Panics
    /// Panics if no element name is remembered, i.e. `opened_starts` is empty.
    #[inline]
    pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
        self.state = ParseState::ClosedTag;
        let name = self
            .opened_buffer
            .split_off(self.opened_starts.pop().unwrap());
        Ok(Event::End(BytesEnd::wrap(name.into())))
    }

    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
    ///
    /// If `encoding` feature is enabled, the used encoding may change after
    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
    ///
    /// If `encoding` feature is enabled and no encoding is specified in declaration,
    /// defaults to UTF-8.
    pub fn decoder(&self) -> Decoder {
        Decoder {
            #[cfg(feature = "encoding")]
            encoding: self.encoding.encoding(),
        }
    }
}

impl Default for Parser {
    fn default() -> Self {
        Self {
            offset: 0,
            state: ParseState::Init,
            expand_empty_elements: false,
            trim_text_start: false,
            trim_text_end: false,
            trim_markup_names_in_closing_tags: true,
            check_end_names: true,
            check_comments: false,
            opened_buffer: Vec::new(),
            opened_starts: Vec::new(),

            #[cfg(feature = "encoding")]
            encoding: EncodingRef::Implicit(UTF_8),
        }
    }
}

Coverage Report

Created: 2025-08-12 06:35

Line	Count	Source (jump to first uncovered line)
1		#[cfg(feature = "encoding")]
2		use encoding_rs::UTF_8;
3
4		use crate::encoding::Decoder;
5		use crate::errors::{Error, Result};
6		use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
7		#[cfg(feature = "encoding")]
8		use crate::reader::EncodingRef;
9		use crate::reader::{is_whitespace, BangType, ParseState};
10
11		use memchr;
12
13		/// A struct that holds a current parse state and a parser configuration.
14		/// It is independent on a way of reading data: the reader feed data into it and
15		/// get back produced [`Event`]s.
16		#[derive(Clone)]
17		pub(super) struct Parser {
18		/// Number of bytes read from the source of data since the parser was created
19		pub offset: usize,
20		/// Defines how to process next byte
21		pub state: ParseState,
22		/// Expand empty element into an opening and closing element
23		pub expand_empty_elements: bool,
24		/// Trims leading whitespace in Text events, skip the element if text is empty
25		pub trim_text_start: bool,
26		/// Trims trailing whitespace in Text events.
27		pub trim_text_end: bool,
28		/// Trims trailing whitespaces from markup names in closing tags `</a >`
29		pub trim_markup_names_in_closing_tags: bool,
30		/// Check if [`Event::End`] nodes match last [`Event::Start`] node
31		pub check_end_names: bool,
32		/// Check if comments contains `--` (false per default)
33		pub check_comments: bool,
34		/// All currently Started elements which didn't have a matching
35		/// End element yet.
36		///
37		/// For an XML
38		///
39		/// ```xml
40		/// <root><one/><inner attr="value">\|<tag></inner></root>
41		/// ```
42		/// when cursor at the `\|` position buffer contains:
43		///
44		/// ```text
45		/// rootinner
46		/// ^ ^
47		/// ```
48		///
49		/// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
50		/// (0 and 4 in that case).
51		opened_buffer: Vec<u8>,
52		/// Opened name start indexes into [`Self::opened_buffer`]. See documentation
53		/// for that field for details
54		opened_starts: Vec<usize>,
55
56		#[cfg(feature = "encoding")]
57		/// Reference to the encoding used to read an XML
58		pub encoding: EncodingRef,
59		}
60
61		impl Parser {
62		/// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
63		///
64		/// # Parameters
65		/// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
66		///
67		/// [`Text`]: Event::Text
68	84.0M	pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
69	84.0M	let mut content = bytes;
70	84.0M
71	84.0M	if self.trim_text_end {
72	0	// Skip the ending '<'
73	0	let len = bytes
74	0	.iter()
75	0	.rposition(\|&b\| !is_whitespace(b))
76	0	.map_or_else(\|\| bytes.len(), \|p\| p + 1);
77	0	content = &bytes[..len];
78	84.0M	}
79
80	84.0M	Ok(Event::Text(BytesText::wrap(content, self.decoder())))
81	84.0M	}
82
83		/// reads `BytesElement` starting with a `!`,
84		/// return `Comment`, `CData` or `DocType` event
85	12.7k	pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
86	12.7k	let uncased_starts_with = \|string: &[u8], prefix: &[u8]\| {
87	12.7k	string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
88	12.7k	};
89
90	12.7k	let len = buf.len();
91	0	match bang_type {
92	0	BangType::Comment if buf.starts_with(b"!--") => {
93	0	debug_assert!(buf.ends_with(b"--"));
94	0	if self.check_comments {
95		// search if '--' not in comments
96	0	if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
97	0	.position(\|p\| buf[3 + p + 1] == b'-')
98		{
99	0	self.offset += len - p;
100	0	return Err(Error::UnexpectedToken("--".to_string()));
101	0	}
102	0	}
103	0	Ok(Event::Comment(BytesText::wrap(
104	0	&buf[3..len - 2],
105	0	self.decoder(),
106	0	)))
107		}
108	0	BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
109	0	debug_assert!(buf.ends_with(b"]]"));
110	0	Ok(Event::CData(BytesCData::wrap(
111	0	&buf[8..len - 2],
112	0	self.decoder(),
113	0	)))
114		}
115	12.7k	BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
116	12.7k	let start = buf[8..]
117	12.7k	.iter()
118	25.5k	.position(\|b\| !is_whitespace(*b))
119	12.7k	.unwrap_or(len - 8);
120	12.7k	if start + 8 >= len {
121	0	return Err(Error::EmptyDocType);
122	12.7k	}
123	12.7k	Ok(Event::DocType(BytesText::wrap(
124	12.7k	&buf[8 + start..],
125	12.7k	self.decoder(),
126	12.7k	)))
127		}
128	0	_ => Err(bang_type.to_err()),
129		}
130	12.7k	}
131
132		/// Wraps content of `buf` into the [`Event::End`] event. Does the check that
133		/// end name matches the last opened start name if `self.check_end_names` is set.
134	41.1M	pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
135		// XML standard permits whitespaces after the markup name in closing tags.
136		// Let's strip them from the buffer before comparing tag names.
137	41.1M	let name = if self.trim_markup_names_in_closing_tags {
138	41.1M	if let Some(pos_end_name) = buf[1..].iter().rposition(\|&b\| !b.is_ascii_whitespace()) {
139	41.1M	let (name, _) = buf[1..].split_at(pos_end_name + 1);
140	41.1M	name
141		} else {
142	0	&buf[1..]
143		}
144		} else {
145	0	&buf[1..]
146		};
147
148	41.1M	let decoder = self.decoder();
149	41.1M	let mismatch_err = \|expected: String, found: &[u8], offset: &mut usize\| {
150	0	*offset -= buf.len();
151	0	Err(Error::EndEventMismatch {
152	0	expected,
153	0	found: decoder.decode(found).unwrap_or_default().into_owned(),
154	0	})
155	0	};
156
157		// Get the index in self.opened_buffer of the name of the last opened tag
158	41.1M	match self.opened_starts.pop() {
159	41.1M	Some(start) => {
160	41.1M	if self.check_end_names {
161	41.1M	let expected = &self.opened_buffer[start..];
162	41.1M	if name != expected {
163	0	let expected = decoder.decode(expected).unwrap_or_default().into_owned();
164	0	// #513: In order to allow error recovery we should drop content of the buffer
165	0	self.opened_buffer.truncate(start);
166	0
167	0	return mismatch_err(expected, name, &mut self.offset);
168	41.1M	}
169	0	}
170
171	41.1M	self.opened_buffer.truncate(start);
172		}
173		None => {
174	0	if self.check_end_names {
175	0	return mismatch_err("".to_string(), &buf[1..], &mut self.offset);
176	0	}
177		}
178		}
179
180	41.1M	Ok(Event::End(BytesEnd::wrap(name.into())))
181	41.1M	}
182
183		/// reads `BytesElement` starting with a `?`,
184		/// return `Decl` or `PI` event
185	12.7k	pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
186	12.7k	let len = buf.len();
187	12.7k	if len > 2 && buf[len - 1] == b'?' {
188	12.7k	if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
189	12.7k	let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
190	12.7k
191	12.7k	// Try getting encoding from the declaration event
192	12.7k	#[cfg(feature = "encoding")]
193	12.7k	if self.encoding.can_be_refined() {
194	12.7k	if let Some(encoding) = event.encoder() {
195	12.7k	self.encoding = EncodingRef::XmlDetected(encoding);
196	12.7k	}
197	12.7k	}
198	12.7k
199	12.7k	Ok(Event::Decl(event))
200		} else {
201	0	Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder())))
202		}
203		} else {
204	0	self.offset -= len;
205	0	Err(Error::UnexpectedEof("XmlDecl".to_string()))
206		}
207	12.7k	}
208
209		/// Converts content of a tag to a `Start` or an `Empty` event
210		///
211		/// # Parameters
212		/// - `content`: Content of a tag between `<` and `>`
213	43.0M	pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
214	43.0M	let len = content.len();
215	43.0M	let name_end = content
216	43.0M	.iter()
217	232M	.position(\|&b\| is_whitespace(b))
218	43.0M	.unwrap_or(len);
219	43.0M	if let Some(&b'/') = content.last() {
220		// This is self-closed tag `<something/>`
221	1.84M	let name_len = if name_end < len { name_end } else { len - 1 };
222	1.84M	let event = BytesStart::wrap(&content[..len - 1], name_len);
223	1.84M
224	1.84M	if self.expand_empty_elements {
225	0	self.state = ParseState::Empty;
226	0	self.opened_starts.push(self.opened_buffer.len());
227	0	self.opened_buffer.extend(&content[..name_len]);
228	0	Ok(Event::Start(event))
229		} else {
230	1.84M	Ok(Event::Empty(event))
231		}
232		} else {
233		// #514: Always store names event when .check_end_names == false,
234		// because checks can be temporary disabled and when they would be
235		// enabled, we should have that information
236	41.1M	self.opened_starts.push(self.opened_buffer.len());
237	41.1M	self.opened_buffer.extend(&content[..name_end]);
238	41.1M	Ok(Event::Start(BytesStart::wrap(content, name_end)))
239		}
240	43.0M	}
241
242		#[inline]
243	0	pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
244	0	self.state = ParseState::ClosedTag;
245	0	let name = self
246	0	.opened_buffer
247	0	.split_off(self.opened_starts.pop().unwrap());
248	0	Ok(Event::End(BytesEnd::wrap(name.into())))
249	0	}
250
251		/// Get the decoder, used to decode bytes, read by this reader, to the strings.
252		///
253		/// If `encoding` feature is enabled, the used encoding may change after
254		/// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
255		///
256		/// If `encoding` feature is enabled and no encoding is specified in declaration,
257		/// defaults to UTF-8.
258	156M	pub fn decoder(&self) -> Decoder {
259	156M	Decoder {
260	156M	#[cfg(feature = "encoding")]
261	156M	encoding: self.encoding.encoding(),
262	156M	}
263	156M	}
264		}
265
266		impl Default for Parser {
267	12.7k	fn default() -> Self {
268	12.7k	Self {
269	12.7k	offset: 0,
270	12.7k	state: ParseState::Init,
271	12.7k	expand_empty_elements: false,
272	12.7k	trim_text_start: false,
273	12.7k	trim_text_end: false,
274	12.7k	trim_markup_names_in_closing_tags: true,
275	12.7k	check_end_names: true,
276	12.7k	check_comments: false,
277	12.7k	opened_buffer: Vec::new(),
278	12.7k	opened_starts: Vec::new(),
279	12.7k
280	12.7k	#[cfg(feature = "encoding")]
281	12.7k	encoding: EncodingRef::Implicit(UTF_8),
282	12.7k	}
283	12.7k	}
284		}