/src/html5ever/xml5ever/src/tokenizer/char_ref/mod.rs
Line | Count | Source |
1 | | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | | // COPYRIGHT file at the top-level directory of this distribution. |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | | // option. This file may not be copied, modified, or distributed |
8 | | // except according to those terms. |
9 | | |
10 | | use super::{TokenSink, XmlTokenizer}; |
11 | | use crate::data; |
12 | | use crate::tendril::StrTendril; |
13 | | use log::debug; |
14 | | use markup5ever::buffer_queue::BufferQueue; |
15 | | use std::borrow::Cow::{self, Borrowed}; |
16 | | use std::char::from_u32; |
17 | | |
18 | | use self::State::*; |
19 | | pub use self::Status::*; |
20 | | |
//§ tokenizing-character-references
/// The expansion of one character reference: up to two characters
/// (a few named entities expand to a pair of code points).
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
29 | | |
/// Outcome of one `step` of the character-reference tokenizer.
pub enum Status {
    /// No progress could be made; more input is needed.
    Stuck,
    /// Input was consumed but tokenization is not finished; step again.
    Progress,
    /// Finished; retrieve the expansion with `get_result`.
    Done,
}
35 | | |
/// Internal state machine for parsing what follows a `&`.
#[derive(Debug)]
enum State {
    /// Just after `&`: decide between numeric, named, or not-a-reference.
    Begin,
    /// Just after `&#`: decide between hexadecimal and decimal.
    Octothorpe,
    /// Accumulating digits; the payload is the numeric base (10 or 16).
    Numeric(u32), // base
    /// Digits finished; expecting (or reporting a missing) `;`.
    NumericSemicolon,
    /// Matching characters against the named-entity table.
    Named,
    /// Consuming an invalid name to decide whether to emit a parse error.
    BogusName,
}
45 | | |
/// Single-use tokenizer for one character reference.
pub struct CharRefTokenizer {
    // Current position in the state machine.
    state: State,
    // Extra character that ends the reference without error; set iff we
    // are tokenizing inside an attribute value (see `new`).
    addnl_allowed: Option<char>,
    // Filled in exactly once when tokenization completes; consumed by
    // `get_result`.
    result: Option<CharRef>,

    // Numeric-reference accumulator.
    num: u32,
    // Latched when `num` exceeds U+10FFFF, so later wrapping arithmetic
    // cannot mask the overflow.
    num_too_big: bool,
    seen_digit: bool,
    // The `x`/`X` consumed for a hex reference, kept so it can be
    // unconsumed if no digits follow.
    hex_marker: Option<char>,

    // Named-reference matching: buffer of consumed name characters,
    // the longest full entity match so far (as two code points, second
    // may be 0), and the buffer length at the time of that match.
    name_buf_opt: Option<StrTendril>,
    name_match: Option<(u32, u32)>,
    name_len: usize,
}
60 | | |
61 | | impl CharRefTokenizer { |
62 | | // NB: We assume that we have an additional allowed character iff we're |
63 | | // tokenizing in an attribute value. |
64 | 737k | pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { |
65 | 737k | CharRefTokenizer { |
66 | 737k | state: Begin, |
67 | 737k | addnl_allowed, |
68 | 737k | result: None, |
69 | 737k | num: 0, |
70 | 737k | num_too_big: false, |
71 | 737k | seen_digit: false, |
72 | 737k | hex_marker: None, |
73 | 737k | name_buf_opt: None, |
74 | 737k | name_match: None, |
75 | 737k | name_len: 0, |
76 | 737k | } |
77 | 737k | } |
78 | | |
79 | | // A CharRefTokenizer can only tokenize one character reference, |
80 | | // so this method consumes the tokenizer. |
81 | 737k | pub fn get_result(self) -> CharRef { |
82 | 737k | self.result.expect("get_result called before done") |
83 | 737k | } |
84 | | |
85 | 2.77M | fn name_buf(&self) -> &StrTendril { |
86 | 2.77M | self.name_buf_opt |
87 | 2.77M | .as_ref() |
88 | 2.77M | .expect("name_buf missing in named character reference") |
89 | 2.77M | } |
90 | | |
91 | 7.02M | fn name_buf_mut(&mut self) -> &mut StrTendril { |
92 | 7.02M | self.name_buf_opt |
93 | 7.02M | .as_mut() |
94 | 7.02M | .expect("name_buf missing in named character reference") |
95 | 7.02M | } |
96 | | |
97 | 569k | fn finish_none(&mut self) -> Status { |
98 | 569k | self.result = Some(CharRef { |
99 | 569k | chars: ['\0', '\0'], |
100 | 569k | num_chars: 0, |
101 | 569k | }); |
102 | 569k | Done |
103 | 569k | } |
104 | | |
105 | 70.9k | fn finish_one(&mut self, c: char) -> Status { |
106 | 70.9k | self.result = Some(CharRef { |
107 | 70.9k | chars: [c, '\0'], |
108 | 70.9k | num_chars: 1, |
109 | 70.9k | }); |
110 | 70.9k | Done |
111 | 70.9k | } |
112 | | } |
113 | | |
impl CharRefTokenizer {
    /// Advance the state machine by at most one input character.
    /// Returns `Done` immediately (idempotently) once a result exists.
    pub fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, base, input),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    /// First character after `&`: whitespace, `<`, `&`, or the additional
    /// allowed character mean this is not a character reference; `#`
    /// starts a numeric reference; anything else starts a named one.
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some('\t' | '\n' | '\x0C' | ' ' | '<' | '&') => self.finish_none(),
            Some(c) if Some(c) == self.addnl_allowed => self.finish_none(),
            Some('#') => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },
            Some(_) => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
            None => Stuck,
        }
    }

    /// After `&#`: `x`/`X` selects base 16 (the marker is saved so it can
    /// be unconsumed if no digits follow); otherwise base 10.
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(c @ ('x' | 'X')) => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },
            Some(_) => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
            None => return Stuck,
        }
        Progress
    }

    /// Accumulate one digit in `base`, or transition out when a non-digit
    /// is seen: with no digits consumed at all, back out entirely;
    /// otherwise go look for the terminating semicolon.
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        base: u32,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                // Wrapping arithmetic is safe: `num_too_big` latches as
                // soon as the value passes U+10FFFF, before any u32 wrap
                // can occur.
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            // `&#` (or `&#x`) with no digits: not a reference after all.
            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    /// Consume the `;` if present; a missing semicolon is a parse error
    /// but the numeric value is still used.
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(';') => tokenizer.discard_char(input),
            Some(_) => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
            None => return Stuck,
        };
        self.finish_numeric(tokenizer)
    }

    /// Push `#` (and the hex marker, if any) back onto the input and
    /// finish with no expansion: `&#` without digits is not a reference.
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c);
        }

        tokenizer.unconsume(input, unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    /// Map the accumulated code point to its final character, applying
    /// the replacement rules (overflow, NUL, surrogates, C1 controls,
    /// noncharacters) and emitting a parse error where required.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
        // Only reached for values the match arms below have already
        // proven to be valid `char`s.
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (including latched overflow): U+FFFD.
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogates: U+FFFD.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls: remap through the Windows-1252 table where a
            // replacement exists; either way it's a parse error.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and noncharacter ranges: keep the
            // character but flag a parse error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // U+xxFFFE / U+xxFFFF noncharacters in every plane.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = if tokenizer.opts.exact_errors {
                Cow::from(format!(
                    "Invalid numeric character reference value 0x{:06X}",
                    self.num
                ))
            } else {
                Cow::from("Invalid numeric character reference")
            };
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    /// Consume one character of a named reference, doing greedy longest-
    /// match against the entity table; falls through to `finish_named`
    /// when the buffer can no longer be a prefix of any entity.
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.get_char(input) else {
            return Stuck;
        };
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, Some(c), input),
        }
    }

    /// Emit the "invalid character reference" parse error, with the
    /// offending name included when exact errors are requested.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
        let msg = if tokenizer.opts.exact_errors {
            Cow::from(format!("Invalid character reference &{}", self.name_buf()))
        } else {
            Cow::from("Invalid character reference")
        };
        tokenizer.emit_error(msg);
    }

    /// Push the entire name buffer back onto the input.
    fn unconsume_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) {
        tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
    }

    /// Resolve a named reference once matching has stopped. `end_char` is
    /// the character that ended the match (`None` at EOF).
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        end_char: Option<char>,
        input: &BufferQueue,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(tokenizer, input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf. Usually
                // at least one, but several in cases like
                //
                // &not    => match for U+00AC
                // &noti   => valid prefix for &notin
                // &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed(
                            "Equals sign after character reference in attribute",
                        ));
                        true
                    },
                    (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none()
                } else {
                    // Give back the characters consumed past the match,
                    // then emit the matched expansion (one or two chars).
                    tokenizer
                        .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    /// Consume the remainder of an unrecognized name; a trailing `;`
    /// makes it a parse error, and everything is unconsumed either way.
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.get_char(input) else {
            return Stuck;
        };
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(tokenizer, input);
        self.finish_none()
    }

    /// Drive the state machine to completion at end of input, emitting
    /// the appropriate EOF parse errors. Loops because some states (e.g.
    /// `Numeric` without digits) finish via another state's handler.
    pub fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, None, input)),

                BogusName => {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none();
                },

                Octothorpe => {
                    tokenizer.unconsume(input, StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}