Coverage Report

Created: 2026-02-14 06:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/quick-xml/src/reader/state.rs
Line
Count
Source
1
#[cfg(feature = "encoding")]
2
use encoding_rs::UTF_8;
3
4
use crate::encoding::Decoder;
5
use crate::errors::{Error, IllFormedError, Result};
6
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
7
use crate::parser::{Parser, PiParser};
8
#[cfg(feature = "encoding")]
9
use crate::reader::EncodingRef;
10
use crate::reader::{BangType, Config, DtdParser, ParseState};
11
use crate::utils::{is_whitespace, name_len};
12
13
/// A struct that holds a current reader state and a parser configuration.
14
/// It is independent of the way data is read: the reader feeds data into it and
15
/// gets back the produced [`Event`]s.
16
#[derive(Clone, Debug)]
17
pub(super) struct ReaderState {
18
    /// Number of bytes read from the source of data since the reader was created
19
    pub offset: u64,
20
    /// A snapshot of an `offset` of the last error returned. It can be less than
21
    /// `offset`, because some errors are more conveniently reported at an earlier position,
22
    /// and changing `offset` is not possible, because `Error::IllFormed` errors
23
    /// are recoverable.
24
    pub last_error_offset: u64,
25
    /// Defines how to process next byte
26
    pub state: ParseState,
27
    /// User-defined settings that affect parsing
28
    pub config: Config,
29
    /// All currently Started elements which didn't have a matching
30
    /// End element yet.
31
    ///
32
    /// For an XML
33
    ///
34
    /// ```xml
35
    /// <root><one/><inner attr="value">|<tag></inner></root>
36
    /// ```
37
    /// when the cursor is at the `|` position, the buffer contains:
38
    ///
39
    /// ```text
40
    /// rootinner
41
    /// ^   ^
42
    /// ```
43
    ///
44
    /// The `^` symbols show which positions are stored in [`Self::opened_starts`]
45
    /// (0 and 4 in that case).
46
    opened_buffer: Vec<u8>,
47
    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
48
    /// for that field for details
49
    opened_starts: Vec<usize>,
50
51
    #[cfg(feature = "encoding")]
52
    /// Reference to the encoding used to read an XML
53
    pub encoding: EncodingRef,
54
}
55
56
impl ReaderState {
57
    /// Trims trailing whitespace from `bytes`, if required, and returns a text event.
58
    ///
59
    /// # Parameters
60
    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
61
548k
    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
62
548k
        let mut content = bytes;
63
64
548k
        if self.config.trim_text_end {
65
            // Skip the ending '<'
66
233k
            let len = bytes
67
233k
                .iter()
68
1.52M
                .rposition(|&b| !is_whitespace(b))
69
233k
                .map_or(0, |p| p + 1);
70
233k
            content = &bytes[..len];
71
314k
        }
72
548k
        BytesText::wrap(content, self.decoder())
73
548k
    }
74
75
    /// Returns `Comment`, `CData` or `DocType` event.
76
    ///
77
    /// `buf` contains data between `<` and `>`:
78
    /// - CDATA: `![CDATA[...]]`
79
    /// - Comment: `!--...--`
80
    /// - Doctype (uppercase): `!D...`
81
    /// - Doctype (lowercase): `!d...`
82
73.4k
    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
83
73.4k
        debug_assert_eq!(
84
0
            buf.first(),
85
            Some(&b'!'),
86
0
            "CDATA, comment or DOCTYPE should start from '!'"
87
        );
88
89
73.4k
        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
90
4.75k
            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
91
4.75k
        };
92
93
73.4k
        let len = buf.len();
94
60.9k
        match bang_type {
95
7.76k
            BangType::Comment if buf.starts_with(b"!--") => {
96
7.70k
                debug_assert!(buf.ends_with(b"--"));
97
7.70k
                if self.config.check_comments {
98
                    // check that '--' does not occur inside the comment
99
4.62k
                    let mut haystack = &buf[3..len - 2];
100
4.62k
                    let mut off = 0;
101
9.86k
                    while let Some(p) = memchr::memchr(b'-', haystack) {
102
5.28k
                        off += p + 1;
103
                        // if next byte after `-` is also `-`, return an error
104
5.28k
                        if buf[3 + off] == b'-' {
105
                            // Explanation of the magic:
106
                            //
107
                            // - `self.offset` is just after `>`,
108
                            // - `buf` contains `!-- con--tent --`
109
                            // - `p` is counted from byte after `<!--`
110
                            //
111
                            // <!-- con--tent -->:
112
                            //  ~~~~~~~~~~~~~~~~ : - buf
113
                            //   : ===========   : - zone of search (possible values of `p`)
114
                            //   : |---p         : - p is counted from | (| is 0)
115
                            //   : :   :         ^ - self.offset
116
                            //   ^ :   :           - self.offset - len
117
                            //     ^   :           - self.offset - len + 2
118
                            //         ^           - self.offset - len + 2 + p
119
44
                            self.last_error_offset = self.offset - len as u64 + 2 + p as u64;
120
44
                            return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
121
5.24k
                        }
122
                        // Continue search after single `-` (+1 to skip it)
123
5.24k
                        haystack = &haystack[p + 1..];
124
                    }
125
3.08k
                }
126
7.66k
                Ok(Event::Comment(BytesText::wrap(
127
7.66k
                    // Cut off `!--` and `--` from start and end
128
7.66k
                    &buf[3..len - 2],
129
7.66k
                    self.decoder(),
130
7.66k
                )))
131
            }
132
            // XML requires uppercase only:
133
            // https://www.w3.org/TR/xml11/#sec-cdata-sect
134
            // Even HTML5 requires uppercase only:
135
            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
136
60.9k
            BangType::CData if buf.starts_with(b"![CDATA[") => {
137
60.6k
                debug_assert!(buf.ends_with(b"]]"));
138
60.6k
                Ok(Event::CData(BytesCData::wrap(
139
60.6k
                    // Cut off `![CDATA[` and `]]` from start and end
140
60.6k
                    &buf[8..len - 2],
141
60.6k
                    self.decoder(),
142
60.6k
                )))
143
            }
144
            // XML requires uppercase only, but we will check that on validation stage:
145
            // https://www.w3.org/TR/xml11/#sec-prolog-dtd
146
            // HTML5 allows mixed case for doctype declarations:
147
            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
148
4.75k
            BangType::DocType(DtdParser::Finished) if uncased_starts_with(buf, b"!DOCTYPE") => {
149
17.0k
                match buf[8..].iter().position(|&b| !is_whitespace(b)) {
150
4.07k
                    Some(start) => Ok(Event::DocType(BytesText::wrap(
151
4.07k
                        // Cut off `!DOCTYPE` and any number of spaces from start
152
4.07k
                        &buf[8 + start..],
153
4.07k
                        self.decoder(),
154
4.07k
                    ))),
155
                    None => {
156
                        // Because we are here, we have read at least `<!DOCTYPE>` and the offset is after `>`.
157
                        // We want to report the error at the place where the name is expected - this is just
158
                        // before `>`
159
128
                        self.last_error_offset = self.offset - 1;
160
128
                        Err(Error::IllFormed(IllFormedError::MissingDoctypeName))
161
                    }
162
                }
163
            }
164
            _ => {
165
                // <!....>
166
                //  ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
167
                // ^------- We report error at that position, so we need to subtract 2 and buf len
168
899
                self.last_error_offset = self.offset - len as u64 - 2;
169
899
                Err(bang_type.to_err().into())
170
            }
171
        }
172
73.4k
    }
173
174
    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
175
    /// end name matches the last opened start name if `self.config.check_end_names` is set.
176
    ///
177
    /// `buf` contains data between `<` and `>`, for example `/tag`.
178
20.8k
    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
179
20.8k
        debug_assert_eq!(
180
0
            buf.first(),
181
            Some(&b'/'),
182
0
            "closing tag should start from '/'"
183
        );
184
185
        // Strip the `/` character. `content` contains data between `</` and `>`
186
20.8k
        let content = &buf[1..];
187
        // XML standard permits whitespaces after the markup name in closing tags.
188
        // Let's strip them from the buffer before comparing tag names.
189
20.8k
        let name = if self.config.trim_markup_names_in_closing_tags {
190
8.47k
            if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
191
3.26k
                &content[..pos_end_name + 1]
192
            } else {
193
5.20k
                content
194
            }
195
        } else {
196
12.3k
            content
197
        };
198
199
20.8k
        let decoder = self.decoder();
200
201
        // Get the index in self.opened_buffer of the name of the last opened tag
202
20.8k
        match self.opened_starts.pop() {
203
18.6k
            Some(start) => {
204
18.6k
                if self.config.check_end_names {
205
7.39k
                    let expected = &self.opened_buffer[start..];
206
7.39k
                    if name != expected {
207
820
                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();
208
                        // #513: In order to allow error recovery we should drop content of the buffer
209
820
                        self.opened_buffer.truncate(start);
210
211
                        // Report error at start of the end tag at `<` character
212
                        // -2 for `<` and `>`
213
820
                        self.last_error_offset = self.offset - buf.len() as u64 - 2;
214
820
                        return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
215
820
                            expected,
216
820
                            found: decoder.decode(name).unwrap_or_default().into_owned(),
217
820
                        }));
218
6.57k
                    }
219
11.2k
                }
220
221
17.8k
                self.opened_buffer.truncate(start);
222
            }
223
            None => {
224
2.18k
                if !self.config.allow_unmatched_ends {
225
                    // Report error at start of the end tag at `<` character
226
                    // -2 for `<` and `>`
227
492
                    self.last_error_offset = self.offset - buf.len() as u64 - 2;
228
492
                    return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
229
492
                        decoder.decode(name).unwrap_or_default().into_owned(),
230
492
                    )));
231
1.69k
                }
232
            }
233
        }
234
235
19.5k
        Ok(Event::End(BytesEnd::wrap(name.into())))
236
20.8k
    }
237
238
    /// `buf` contains data between `<` and `>` and the first byte is `?`.
239
    /// `self.offset` is already after the `>`
240
    ///
241
    /// Returns `Decl` or `PI` event
242
92.0k
    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
243
92.0k
        debug_assert!(!buf.is_empty());
244
92.0k
        debug_assert_eq!(buf[0], b'?');
245
246
92.0k
        let len = buf.len();
247
        // We accept at least <??>
248
        //                     ~~ - len = 2
249
92.0k
        if len > 1 && buf[len - 1] == b'?' {
250
            // Cut off `?` and `?` from start and end
251
91.8k
            let content = &buf[1..len - 1];
252
91.8k
            let len = content.len();
253
254
91.8k
            if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
255
68.4k
                let event = BytesDecl::from_start(BytesStart::wrap(content, 3, self.decoder()));
256
257
                // Try getting encoding from the declaration event
258
                #[cfg(feature = "encoding")]
259
                if self.encoding.can_be_refined() {
260
                    if let Some(encoding) = event.encoder() {
261
                        self.encoding = EncodingRef::XmlDetected(encoding);
262
                    }
263
                }
264
265
68.4k
                Ok(Event::Decl(event))
266
            } else {
267
23.4k
                Ok(Event::PI(BytesPI::wrap(
268
23.4k
                    content,
269
23.4k
                    name_len(content),
270
23.4k
                    self.decoder(),
271
23.4k
                )))
272
            }
273
        } else {
274
            // <?....>
275
            //  ^^^^^ - `buf` does not contain `<`, but we want to report error at `<`,
276
            //          so we move offset to it (-2 for `<` and `>`)
277
117
            self.last_error_offset = self.offset - len as u64 - 2;
278
117
            Err(Error::Syntax(PiParser(false).eof_error(buf)))
279
        }
280
92.0k
    }
281
282
    /// Converts content of a tag to a `Start` or an `Empty` event
283
    ///
284
    /// # Parameters
285
    /// - `content`: Content of a tag between `<` and `>`
286
6.72M
    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
287
6.72M
        if let Some(content) = content.strip_suffix(b"/") {
288
            // This is self-closed tag `<something/>`
289
296k
            let event = BytesStart::wrap(content, name_len(content), self.decoder());
290
291
296k
            if self.config.expand_empty_elements {
292
283k
                self.state = ParseState::InsideEmpty;
293
283k
                self.opened_starts.push(self.opened_buffer.len());
294
283k
                self.opened_buffer.extend(event.name().as_ref());
295
283k
                Event::Start(event)
296
            } else {
297
13.0k
                Event::Empty(event)
298
            }
299
        } else {
300
6.43M
            let event = BytesStart::wrap(content, name_len(content), self.decoder());
301
302
            // #514: Always store names even when .check_end_names == false,
303
            // because checks can be temporarily disabled and when they would be
304
            // enabled, we should have that information
305
6.43M
            self.opened_starts.push(self.opened_buffer.len());
306
6.43M
            self.opened_buffer.extend(event.name().as_ref());
307
6.43M
            Event::Start(event)
308
        }
309
6.72M
    }
310
311
    #[inline]
312
283k
    pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
313
283k
        self.state = ParseState::InsideText;
314
283k
        let name = self
315
283k
            .opened_buffer
316
283k
            .split_off(self.opened_starts.pop().unwrap());
317
283k
        BytesEnd::wrap(name.into())
318
283k
    }
<quick_xml::reader::state::ReaderState>::close_expanded_empty
Line
Count
Source
312
1.93k
    pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
313
1.93k
        self.state = ParseState::InsideText;
314
1.93k
        let name = self
315
1.93k
            .opened_buffer
316
1.93k
            .split_off(self.opened_starts.pop().unwrap());
317
1.93k
        BytesEnd::wrap(name.into())
318
1.93k
    }
<quick_xml::reader::state::ReaderState>::close_expanded_empty
Line
Count
Source
312
281k
    pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
313
281k
        self.state = ParseState::InsideText;
314
281k
        let name = self
315
281k
            .opened_buffer
316
281k
            .split_off(self.opened_starts.pop().unwrap());
317
281k
        BytesEnd::wrap(name.into())
318
281k
    }
319
320
    /// Get the decoder used to decode bytes read by this reader into strings.
321
    ///
322
    /// If [`encoding`] feature is enabled, the used encoding may change after
323
    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
324
    ///
325
    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
326
    /// defaults to UTF-8.
327
    ///
328
    /// [`encoding`]: ../../index.html#encoding
329
18.0M
    pub const fn decoder(&self) -> Decoder {
330
18.0M
        Decoder {
331
18.0M
            #[cfg(feature = "encoding")]
332
18.0M
            encoding: self.encoding.encoding(),
333
18.0M
        }
334
18.0M
    }
335
}
336
337
impl Default for ReaderState {
338
28.7k
    fn default() -> Self {
339
28.7k
        Self {
340
28.7k
            offset: 0,
341
28.7k
            last_error_offset: 0,
342
28.7k
            state: ParseState::Init,
343
28.7k
            config: Config::default(),
344
28.7k
            opened_buffer: Vec::new(),
345
28.7k
            opened_starts: Vec::new(),
346
28.7k
347
28.7k
            #[cfg(feature = "encoding")]
348
28.7k
            encoding: EncodingRef::Implicit(UTF_8),
349
28.7k
        }
350
28.7k
    }
351
}