Coverage Report

Created: 2025-08-12 06:35

/rust/registry/src/index.crates.io-6f17d22bba15001f/quick-xml-0.29.0/src/reader/parser.rs
Line
Count
Source (jump to first uncovered line)
1
#[cfg(feature = "encoding")]
2
use encoding_rs::UTF_8;
3
4
use crate::encoding::Decoder;
5
use crate::errors::{Error, Result};
6
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
7
#[cfg(feature = "encoding")]
8
use crate::reader::EncodingRef;
9
use crate::reader::{is_whitespace, BangType, ParseState};
10
11
use memchr;
12
13
/// A struct that holds a current parse state and a parser configuration.
14
/// It is independent on a way of reading data: the reader feed data into it and
15
/// get back produced [`Event`]s.
16
#[derive(Clone)]
17
pub(super) struct Parser {
18
    /// Number of bytes read from the source of data since the parser was created
19
    pub offset: usize,
20
    /// Defines how to process next byte
21
    pub state: ParseState,
22
    /// Expand empty element into an opening and closing element
23
    pub expand_empty_elements: bool,
24
    /// Trims leading whitespace in Text events, skip the element if text is empty
25
    pub trim_text_start: bool,
26
    /// Trims trailing whitespace in Text events.
27
    pub trim_text_end: bool,
28
    /// Trims trailing whitespaces from markup names in closing tags `</a >`
29
    pub trim_markup_names_in_closing_tags: bool,
30
    /// Check if [`Event::End`] nodes match last [`Event::Start`] node
31
    pub check_end_names: bool,
32
    /// Check if comments contains `--` (false per default)
33
    pub check_comments: bool,
34
    /// All currently Started elements which didn't have a matching
35
    /// End element yet.
36
    ///
37
    /// For an XML
38
    ///
39
    /// ```xml
40
    /// <root><one/><inner attr="value">|<tag></inner></root>
41
    /// ```
42
    /// when cursor at the `|` position buffer contains:
43
    ///
44
    /// ```text
45
    /// rootinner
46
    /// ^   ^
47
    /// ```
48
    ///
49
    /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
50
    /// (0 and 4 in that case).
51
    opened_buffer: Vec<u8>,
52
    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
53
    /// for that field for details
54
    opened_starts: Vec<usize>,
55
56
    #[cfg(feature = "encoding")]
57
    /// Reference to the encoding used to read an XML
58
    pub encoding: EncodingRef,
59
}
60
61
impl Parser {
62
    /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
63
    ///
64
    /// # Parameters
65
    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
66
    ///
67
    /// [`Text`]: Event::Text
68
84.0M
    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
69
84.0M
        let mut content = bytes;
70
84.0M
71
84.0M
        if self.trim_text_end {
72
0
            // Skip the ending '<'
73
0
            let len = bytes
74
0
                .iter()
75
0
                .rposition(|&b| !is_whitespace(b))
76
0
                .map_or_else(|| bytes.len(), |p| p + 1);
77
0
            content = &bytes[..len];
78
84.0M
        }
79
80
84.0M
        Ok(Event::Text(BytesText::wrap(content, self.decoder())))
81
84.0M
    }
82
83
    /// reads `BytesElement` starting with a `!`,
84
    /// return `Comment`, `CData` or `DocType` event
85
12.7k
    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
86
12.7k
        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
87
12.7k
            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
88
12.7k
        };
89
90
12.7k
        let len = buf.len();
91
0
        match bang_type {
92
0
            BangType::Comment if buf.starts_with(b"!--") => {
93
0
                debug_assert!(buf.ends_with(b"--"));
94
0
                if self.check_comments {
95
                    // search if '--' not in comments
96
0
                    if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
97
0
                        .position(|p| buf[3 + p + 1] == b'-')
98
                    {
99
0
                        self.offset += len - p;
100
0
                        return Err(Error::UnexpectedToken("--".to_string()));
101
0
                    }
102
0
                }
103
0
                Ok(Event::Comment(BytesText::wrap(
104
0
                    &buf[3..len - 2],
105
0
                    self.decoder(),
106
0
                )))
107
            }
108
0
            BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
109
0
                debug_assert!(buf.ends_with(b"]]"));
110
0
                Ok(Event::CData(BytesCData::wrap(
111
0
                    &buf[8..len - 2],
112
0
                    self.decoder(),
113
0
                )))
114
            }
115
12.7k
            BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
116
12.7k
                let start = buf[8..]
117
12.7k
                    .iter()
118
25.5k
                    .position(|b| !is_whitespace(*b))
119
12.7k
                    .unwrap_or(len - 8);
120
12.7k
                if start + 8 >= len {
121
0
                    return Err(Error::EmptyDocType);
122
12.7k
                }
123
12.7k
                Ok(Event::DocType(BytesText::wrap(
124
12.7k
                    &buf[8 + start..],
125
12.7k
                    self.decoder(),
126
12.7k
                )))
127
            }
128
0
            _ => Err(bang_type.to_err()),
129
        }
130
12.7k
    }
131
132
    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
133
    /// end name matches the last opened start name if `self.check_end_names` is set.
134
41.1M
    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
135
        // XML standard permits whitespaces after the markup name in closing tags.
136
        // Let's strip them from the buffer before comparing tag names.
137
41.1M
        let name = if self.trim_markup_names_in_closing_tags {
138
41.1M
            if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) {
139
41.1M
                let (name, _) = buf[1..].split_at(pos_end_name + 1);
140
41.1M
                name
141
            } else {
142
0
                &buf[1..]
143
            }
144
        } else {
145
0
            &buf[1..]
146
        };
147
148
41.1M
        let decoder = self.decoder();
149
41.1M
        let mismatch_err = |expected: String, found: &[u8], offset: &mut usize| {
150
0
            *offset -= buf.len();
151
0
            Err(Error::EndEventMismatch {
152
0
                expected,
153
0
                found: decoder.decode(found).unwrap_or_default().into_owned(),
154
0
            })
155
0
        };
156
157
        // Get the index in self.opened_buffer of the name of the last opened tag
158
41.1M
        match self.opened_starts.pop() {
159
41.1M
            Some(start) => {
160
41.1M
                if self.check_end_names {
161
41.1M
                    let expected = &self.opened_buffer[start..];
162
41.1M
                    if name != expected {
163
0
                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();
164
0
                        // #513: In order to allow error recovery we should drop content of the buffer
165
0
                        self.opened_buffer.truncate(start);
166
0
167
0
                        return mismatch_err(expected, name, &mut self.offset);
168
41.1M
                    }
169
0
                }
170
171
41.1M
                self.opened_buffer.truncate(start);
172
            }
173
            None => {
174
0
                if self.check_end_names {
175
0
                    return mismatch_err("".to_string(), &buf[1..], &mut self.offset);
176
0
                }
177
            }
178
        }
179
180
41.1M
        Ok(Event::End(BytesEnd::wrap(name.into())))
181
41.1M
    }
182
183
    /// reads `BytesElement` starting with a `?`,
184
    /// return `Decl` or `PI` event
185
12.7k
    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
186
12.7k
        let len = buf.len();
187
12.7k
        if len > 2 && buf[len - 1] == b'?' {
188
12.7k
            if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
189
12.7k
                let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
190
12.7k
191
12.7k
                // Try getting encoding from the declaration event
192
12.7k
                #[cfg(feature = "encoding")]
193
12.7k
                if self.encoding.can_be_refined() {
194
12.7k
                    if let Some(encoding) = event.encoder() {
195
12.7k
                        self.encoding = EncodingRef::XmlDetected(encoding);
196
12.7k
                    }
197
12.7k
                }
198
12.7k
199
12.7k
                Ok(Event::Decl(event))
200
            } else {
201
0
                Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder())))
202
            }
203
        } else {
204
0
            self.offset -= len;
205
0
            Err(Error::UnexpectedEof("XmlDecl".to_string()))
206
        }
207
12.7k
    }
208
209
    /// Converts content of a tag to a `Start` or an `Empty` event
210
    ///
211
    /// # Parameters
212
    /// - `content`: Content of a tag between `<` and `>`
213
43.0M
    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
214
43.0M
        let len = content.len();
215
43.0M
        let name_end = content
216
43.0M
            .iter()
217
232M
            .position(|&b| is_whitespace(b))
218
43.0M
            .unwrap_or(len);
219
43.0M
        if let Some(&b'/') = content.last() {
220
            // This is self-closed tag `<something/>`
221
1.84M
            let name_len = if name_end < len { name_end } else { len - 1 };
222
1.84M
            let event = BytesStart::wrap(&content[..len - 1], name_len);
223
1.84M
224
1.84M
            if self.expand_empty_elements {
225
0
                self.state = ParseState::Empty;
226
0
                self.opened_starts.push(self.opened_buffer.len());
227
0
                self.opened_buffer.extend(&content[..name_len]);
228
0
                Ok(Event::Start(event))
229
            } else {
230
1.84M
                Ok(Event::Empty(event))
231
            }
232
        } else {
233
            // #514: Always store names event when .check_end_names == false,
234
            // because checks can be temporary disabled and when they would be
235
            // enabled, we should have that information
236
41.1M
            self.opened_starts.push(self.opened_buffer.len());
237
41.1M
            self.opened_buffer.extend(&content[..name_end]);
238
41.1M
            Ok(Event::Start(BytesStart::wrap(content, name_end)))
239
        }
240
43.0M
    }
241
242
    #[inline]
243
0
    pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
244
0
        self.state = ParseState::ClosedTag;
245
0
        let name = self
246
0
            .opened_buffer
247
0
            .split_off(self.opened_starts.pop().unwrap());
248
0
        Ok(Event::End(BytesEnd::wrap(name.into())))
249
0
    }
250
251
    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
252
    ///
253
    /// If `encoding` feature is enabled, the used encoding may change after
254
    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
255
    ///
256
    /// If `encoding` feature is enabled and no encoding is specified in declaration,
257
    /// defaults to UTF-8.
258
156M
    pub fn decoder(&self) -> Decoder {
259
156M
        Decoder {
260
156M
            #[cfg(feature = "encoding")]
261
156M
            encoding: self.encoding.encoding(),
262
156M
        }
263
156M
    }
264
}
265
266
impl Default for Parser {
267
12.7k
    fn default() -> Self {
268
12.7k
        Self {
269
12.7k
            offset: 0,
270
12.7k
            state: ParseState::Init,
271
12.7k
            expand_empty_elements: false,
272
12.7k
            trim_text_start: false,
273
12.7k
            trim_text_end: false,
274
12.7k
            trim_markup_names_in_closing_tags: true,
275
12.7k
            check_end_names: true,
276
12.7k
            check_comments: false,
277
12.7k
            opened_buffer: Vec::new(),
278
12.7k
            opened_starts: Vec::new(),
279
12.7k
280
12.7k
            #[cfg(feature = "encoding")]
281
12.7k
            encoding: EncodingRef::Implicit(UTF_8),
282
12.7k
        }
283
12.7k
    }
284
}