/src/html5ever/xml5ever/src/tokenizer/char_ref/mod.rs
Line | Count | Source |
1 | | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | | // COPYRIGHT file at the top-level directory of this distribution. |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | | // option. This file may not be copied, modified, or distributed |
8 | | // except according to those terms. |
9 | | |
10 | | use super::{TokenSink, XmlTokenizer}; |
11 | | use crate::data; |
12 | | use crate::tendril::StrTendril; |
13 | | use log::debug; |
14 | | use markup5ever::buffer_queue::BufferQueue; |
15 | | use std::borrow::Cow::{self, Borrowed}; |
16 | | use std::char::from_u32; |
17 | | |
18 | | use self::State::*; |
19 | | pub use self::Status::*; |
20 | | |
21 | | //ยง tokenizing-character-references |
22 | | pub struct CharRef { |
23 | | /// The resulting character(s) |
24 | | pub chars: [char; 2], |
25 | | |
26 | | /// How many slots in `chars` are valid? |
27 | | pub num_chars: u8, |
28 | | } |
29 | | |
30 | | pub enum Status { |
31 | | Stuck, |
32 | | Progress, |
33 | | Done, |
34 | | } |
35 | | |
36 | | #[derive(Debug)] |
37 | | enum State { |
38 | | Begin, |
39 | | Octothorpe, |
40 | | Numeric(u32), // base |
41 | | NumericSemicolon, |
42 | | Named, |
43 | | BogusName, |
44 | | } |
45 | | |
46 | | pub struct CharRefTokenizer { |
47 | | state: State, |
48 | | addnl_allowed: Option<char>, |
49 | | result: Option<CharRef>, |
50 | | |
51 | | num: u32, |
52 | | num_too_big: bool, |
53 | | seen_digit: bool, |
54 | | hex_marker: Option<char>, |
55 | | |
56 | | name_buf_opt: Option<StrTendril>, |
57 | | name_match: Option<(u32, u32)>, |
58 | | name_len: usize, |
59 | | } |
60 | | |
61 | | impl CharRefTokenizer { |
62 | | // NB: We assume that we have an additional allowed character iff we're |
63 | | // tokenizing in an attribute value. |
64 | 737k | pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { |
65 | 737k | CharRefTokenizer { |
66 | 737k | state: Begin, |
67 | 737k | addnl_allowed, |
68 | 737k | result: None, |
69 | 737k | num: 0, |
70 | 737k | num_too_big: false, |
71 | 737k | seen_digit: false, |
72 | 737k | hex_marker: None, |
73 | 737k | name_buf_opt: None, |
74 | 737k | name_match: None, |
75 | 737k | name_len: 0, |
76 | 737k | } |
77 | 737k | } |
78 | | |
79 | | // A CharRefTokenizer can only tokenize one character reference, |
80 | | // so this method consumes the tokenizer. |
81 | 737k | pub fn get_result(self) -> CharRef { |
82 | 737k | self.result.expect("get_result called before done") |
83 | 737k | } |
84 | | |
85 | 2.77M | fn name_buf(&self) -> &StrTendril { |
86 | 2.77M | self.name_buf_opt |
87 | 2.77M | .as_ref() |
88 | 2.77M | .expect("name_buf missing in named character reference") |
89 | 2.77M | } |
90 | | |
91 | 7.02M | fn name_buf_mut(&mut self) -> &mut StrTendril { |
92 | 7.02M | self.name_buf_opt |
93 | 7.02M | .as_mut() |
94 | 7.02M | .expect("name_buf missing in named character reference") |
95 | 7.02M | } |
96 | | |
97 | 569k | fn finish_none(&mut self) -> Status { |
98 | 569k | self.result = Some(CharRef { |
99 | 569k | chars: ['\0', '\0'], |
100 | 569k | num_chars: 0, |
101 | 569k | }); |
102 | 569k | Done |
103 | 569k | } |
104 | | |
105 | 70.9k | fn finish_one(&mut self, c: char) -> Status { |
106 | 70.9k | self.result = Some(CharRef { |
107 | 70.9k | chars: [c, '\0'], |
108 | 70.9k | num_chars: 1, |
109 | 70.9k | }); |
110 | 70.9k | Done |
111 | 70.9k | } |
112 | | } |
113 | | |
114 | | impl CharRefTokenizer { |
115 | 8.36M | pub fn step<Sink: TokenSink>( |
116 | 8.36M | &mut self, |
117 | 8.36M | tokenizer: &XmlTokenizer<Sink>, |
118 | 8.36M | input: &BufferQueue, |
119 | 8.36M | ) -> Status { |
120 | 8.36M | if self.result.is_some() { |
121 | 0 | return Done; |
122 | 8.36M | } |
123 | | |
124 | 8.36M | debug!("char ref tokenizer stepping in state {:?}", self.state); |
125 | 8.36M | match self.state { |
126 | 749k | Begin => self.do_begin(tokenizer, input), |
127 | 121k | Octothorpe => self.do_octothorpe(tokenizer, input), |
128 | 262k | Numeric(base) => self.do_numeric(tokenizer, base, input), |
129 | 70.6k | NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), |
130 | 2.24M | Named => self.do_named(tokenizer, input), |
131 | 4.90M | BogusName => self.do_bogus_name(tokenizer, input), |
132 | | } |
133 | 8.36M | } |
134 | | |
135 | 749k | fn do_begin<Sink: TokenSink>( |
136 | 749k | &mut self, |
137 | 749k | tokenizer: &XmlTokenizer<Sink>, |
138 | 749k | input: &BufferQueue, |
139 | 749k | ) -> Status { |
140 | 749k | match tokenizer.peek(input) { |
141 | 42.7k | Some('\t' | '\n' | '\x0C' | ' ' | '<' | '&') => self.finish_none(), |
142 | 694k | Some(c) if Some(c) == self.addnl_allowed => self.finish_none(), |
143 | | Some('#') => { |
144 | 118k | tokenizer.discard_char(input); |
145 | 118k | self.state = Octothorpe; |
146 | 118k | Progress |
147 | | }, |
148 | | Some(_) => { |
149 | 575k | self.state = Named; |
150 | 575k | self.name_buf_opt = Some(StrTendril::new()); |
151 | 575k | Progress |
152 | | }, |
153 | 12.3k | None => Stuck, |
154 | | } |
155 | 749k | } |
156 | | |
157 | 121k | fn do_octothorpe<Sink: TokenSink>( |
158 | 121k | &mut self, |
159 | 121k | tokenizer: &XmlTokenizer<Sink>, |
160 | 121k | input: &BufferQueue, |
161 | 121k | ) -> Status { |
162 | 121k | match tokenizer.peek(input) { |
163 | 27.1k | Some(c @ ('x' | 'X')) => { |
164 | 27.1k | tokenizer.discard_char(input); |
165 | 27.1k | self.hex_marker = Some(c); |
166 | 27.1k | self.state = Numeric(16); |
167 | 27.1k | }, |
168 | 91.2k | Some(_) => { |
169 | 91.2k | self.hex_marker = None; |
170 | 91.2k | self.state = Numeric(10); |
171 | 91.2k | }, |
172 | 2.99k | None => return Stuck, |
173 | | } |
174 | 118k | Progress |
175 | 121k | } |
176 | | |
177 | 262k | fn do_numeric<Sink: TokenSink>( |
178 | 262k | &mut self, |
179 | 262k | tokenizer: &XmlTokenizer<Sink>, |
180 | 262k | base: u32, |
181 | 262k | input: &BufferQueue, |
182 | 262k | ) -> Status { |
183 | 262k | let Some(c) = tokenizer.peek(input) else { |
184 | 1.64k | return Stuck; |
185 | | }; |
186 | 261k | match c.to_digit(base) { |
187 | 143k | Some(n) => { |
188 | 143k | tokenizer.discard_char(input); |
189 | 143k | self.num = self.num.wrapping_mul(base); |
190 | 143k | if self.num > 0x10FFFF { |
191 | 2.95k | // We might overflow, and the character is definitely invalid. |
192 | 2.95k | // We still parse digits and semicolon, but don't use the result. |
193 | 2.95k | self.num_too_big = true; |
194 | 140k | } |
195 | 143k | self.num = self.num.wrapping_add(n); |
196 | 143k | self.seen_digit = true; |
197 | 143k | Progress |
198 | | }, |
199 | | |
200 | 47.4k | None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), |
201 | | |
202 | | None => { |
203 | 70.6k | self.state = NumericSemicolon; |
204 | 70.6k | Progress |
205 | | }, |
206 | | } |
207 | 262k | } |
208 | | |
209 | 70.6k | fn do_numeric_semicolon<Sink: TokenSink>( |
210 | 70.6k | &mut self, |
211 | 70.6k | tokenizer: &XmlTokenizer<Sink>, |
212 | 70.6k | input: &BufferQueue, |
213 | 70.6k | ) -> Status { |
214 | 70.6k | match tokenizer.peek(input) { |
215 | 396 | Some(';') => tokenizer.discard_char(input), |
216 | 70.2k | Some(_) => tokenizer.emit_error(Borrowed( |
217 | 70.2k | "Semicolon missing after numeric character reference", |
218 | 70.2k | )), |
219 | 0 | None => return Stuck, |
220 | | }; |
221 | 70.6k | self.finish_numeric(tokenizer) |
222 | 70.6k | } |
223 | | |
224 | 47.4k | fn unconsume_numeric<Sink: TokenSink>( |
225 | 47.4k | &mut self, |
226 | 47.4k | tokenizer: &XmlTokenizer<Sink>, |
227 | 47.4k | input: &BufferQueue, |
228 | 47.4k | ) -> Status { |
229 | 47.4k | let mut unconsume = StrTendril::from_char('#'); |
230 | 47.4k | if let Some(c) = self.hex_marker { |
231 | 7.38k | unconsume.push_char(c); |
232 | 40.0k | } |
233 | | |
234 | 47.4k | tokenizer.unconsume(input, unconsume); |
235 | 47.4k | tokenizer.emit_error(Borrowed("Numeric character reference without digits")); |
236 | 47.4k | self.finish_none() |
237 | 47.4k | } |
238 | | |
239 | 70.9k | fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status { |
240 | 62.4k | fn conv(n: u32) -> char { |
241 | 62.4k | from_u32(n).expect("invalid char missed by error handling cases") |
242 | 62.4k | } |
243 | | |
244 | 70.9k | let (c, error) = match self.num { |
245 | 70.9k | n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), |
246 | 6.32k | 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), |
247 | | |
248 | 4.67k | 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { |
249 | 1.59k | Some(c) => (c, true), |
250 | 1.48k | None => (conv(self.num), true), |
251 | | }, |
252 | | |
253 | 58.0k | 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), |
254 | | |
255 | 49.3k | n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), |
256 | | |
257 | 48.9k | n => (conv(n), false), |
258 | | }; |
259 | | |
260 | 70.9k | if error { |
261 | 21.9k | let msg = if tokenizer.opts.exact_errors { |
262 | 0 | Cow::from(format!( |
263 | 0 | "Invalid numeric character reference value 0x{:06X}", |
264 | | self.num |
265 | | )) |
266 | | } else { |
267 | 21.9k | Cow::from("Invalid numeric character reference") |
268 | | }; |
269 | 21.9k | tokenizer.emit_error(msg); |
270 | 48.9k | } |
271 | | |
272 | 70.9k | self.finish_one(c) |
273 | 70.9k | } |
274 | | |
275 | 2.24M | fn do_named<Sink: TokenSink>( |
276 | 2.24M | &mut self, |
277 | 2.24M | tokenizer: &XmlTokenizer<Sink>, |
278 | 2.24M | input: &BufferQueue, |
279 | 2.24M | ) -> Status { |
280 | 2.24M | let Some(c) = tokenizer.get_char(input) else { |
281 | 29.8k | return Stuck; |
282 | | }; |
283 | 2.21M | self.name_buf_mut().push_char(c); |
284 | 2.21M | match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { |
285 | | // We have either a full match or a prefix of one. |
286 | 1.64M | Some(&m) => { |
287 | 1.64M | if m.0 != 0 { |
288 | 98.9k | // We have a full match, but there might be a longer one to come. |
289 | 98.9k | self.name_match = Some(m); |
290 | 98.9k | self.name_len = self.name_buf().len(); |
291 | 1.54M | } |
292 | | // Otherwise we just have a prefix match. |
293 | 1.64M | Progress |
294 | | }, |
295 | | |
296 | | // Can't continue the match. |
297 | 575k | None => self.finish_named(tokenizer, Some(c), input), |
298 | | } |
299 | 2.24M | } |
300 | | |
301 | 71.4k | fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) { |
302 | 71.4k | let msg = if tokenizer.opts.exact_errors { |
303 | 0 | Cow::from(format!("Invalid character reference &{}", self.name_buf())) |
304 | | } else { |
305 | 71.4k | Cow::from("Invalid character reference") |
306 | | }; |
307 | 71.4k | tokenizer.emit_error(msg); |
308 | 71.4k | } |
309 | | |
310 | 478k | fn unconsume_name<Sink: TokenSink>( |
311 | 478k | &mut self, |
312 | 478k | tokenizer: &XmlTokenizer<Sink>, |
313 | 478k | input: &BufferQueue, |
314 | 478k | ) { |
315 | 478k | tokenizer.unconsume(input, self.name_buf_opt.take().unwrap()); |
316 | 478k | } |
317 | | |
318 | 575k | fn finish_named<Sink: TokenSink>( |
319 | 575k | &mut self, |
320 | 575k | tokenizer: &XmlTokenizer<Sink>, |
321 | 575k | end_char: Option<char>, |
322 | 575k | input: &BufferQueue, |
323 | 575k | ) -> Status { |
324 | 575k | match self.name_match { |
325 | | None => { |
326 | 68.3k | match end_char { |
327 | 477k | Some(c) if c.is_ascii_alphanumeric() => { |
328 | | // Keep looking for a semicolon, to determine whether |
329 | | // we emit a parse error. |
330 | 205k | self.state = BogusName; |
331 | 205k | return Progress; |
332 | | }, |
333 | | |
334 | | // Check length because &; is not a parse error. |
335 | 68.3k | Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), |
336 | | |
337 | 204k | _ => (), |
338 | | } |
339 | 272k | self.unconsume_name(tokenizer, input); |
340 | 272k | self.finish_none() |
341 | | }, |
342 | | |
343 | 98.5k | Some((c1, c2)) => { |
344 | | // We have a complete match, but we may have consumed |
345 | | // additional characters into self.name_buf. Usually |
346 | | // at least one, but several in cases like |
347 | | // |
348 | | // ¬ => match for U+00AC |
349 | | // ¬i => valid prefix for ¬in |
350 | | // ¬it => can't continue match |
351 | | |
352 | 98.5k | let name_len = self.name_len; |
353 | 98.5k | assert!(name_len > 0); |
354 | 98.5k | let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); |
355 | | |
356 | | // There might not be a next character after the match, if |
357 | | // we had a full match and then hit EOF. |
358 | 98.5k | let next_after = if name_len == self.name_buf().len() { |
359 | 120 | None |
360 | | } else { |
361 | 98.4k | Some(self.name_buf()[name_len..].chars().next().unwrap()) |
362 | | }; |
363 | | |
364 | | // "If the character reference is being consumed as part of an |
365 | | // attribute, and the last character matched is not a U+003B |
366 | | // SEMICOLON character (;), and the next character is either a |
367 | | // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII |
368 | | // character, then, for historical reasons, all the characters |
369 | | // that were matched after the U+0026 AMPERSAND character (&) |
370 | | // must be unconsumed, and nothing is returned. However, if |
371 | | // this next character is in fact a U+003D EQUALS SIGN |
372 | | // character (=), then this is a parse error" |
373 | | |
374 | 98.5k | let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { |
375 | 77.3k | (_, ';', _) => false, |
376 | | (Some(_), _, Some('=')) => { |
377 | 194 | tokenizer.emit_error(Borrowed( |
378 | 194 | "Equals sign after character reference in attribute", |
379 | 194 | )); |
380 | 194 | true |
381 | | }, |
382 | 1.91k | (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true, |
383 | | _ => { |
384 | 19.9k | tokenizer.emit_error(Borrowed( |
385 | 19.9k | "Character reference does not end with semicolon", |
386 | 19.9k | )); |
387 | 19.9k | false |
388 | | }, |
389 | | }; |
390 | | |
391 | 98.5k | if unconsume_all { |
392 | 1.25k | self.unconsume_name(tokenizer, input); |
393 | 1.25k | self.finish_none() |
394 | | } else { |
395 | 97.2k | tokenizer |
396 | 97.2k | .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..])); |
397 | | self.result = Some(CharRef { |
398 | 97.2k | chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], |
399 | 97.2k | num_chars: if c2 == 0 { 1 } else { 2 }, |
400 | | }); |
401 | 97.2k | Done |
402 | | } |
403 | | }, |
404 | | } |
405 | 575k | } |
406 | | |
407 | 4.90M | fn do_bogus_name<Sink: TokenSink>( |
408 | 4.90M | &mut self, |
409 | 4.90M | tokenizer: &XmlTokenizer<Sink>, |
410 | 4.90M | input: &BufferQueue, |
411 | 4.90M | ) -> Status { |
412 | 4.90M | let Some(c) = tokenizer.get_char(input) else { |
413 | 103k | return Stuck; |
414 | | }; |
415 | 4.80M | self.name_buf_mut().push_char(c); |
416 | 4.80M | match c { |
417 | 4.80M | _ if c.is_ascii_alphanumeric() => return Progress, |
418 | 3.67k | ';' => self.emit_name_error(tokenizer), |
419 | 201k | _ => (), |
420 | | } |
421 | 205k | self.unconsume_name(tokenizer, input); |
422 | 205k | self.finish_none() |
423 | 4.90M | } |
424 | | |
425 | 729 | pub fn end_of_file<Sink: TokenSink>( |
426 | 729 | &mut self, |
427 | 729 | tokenizer: &XmlTokenizer<Sink>, |
428 | 729 | input: &BufferQueue, |
429 | 729 | ) { |
430 | 1.45k | while self.result.is_none() { |
431 | 330 | match self.state { |
432 | 147 | Begin => drop(self.finish_none()), |
433 | | |
434 | 16 | Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), |
435 | | |
436 | 314 | Numeric(_) | NumericSemicolon => { |
437 | 314 | tokenizer.emit_error(Borrowed("EOF in numeric character reference")); |
438 | 314 | self.finish_numeric(tokenizer); |
439 | 314 | }, |
440 | | |
441 | 160 | Named => drop(self.finish_named(tokenizer, None, input)), |
442 | | |
443 | 72 | BogusName => { |
444 | 72 | self.unconsume_name(tokenizer, input); |
445 | 72 | self.finish_none(); |
446 | 72 | }, |
447 | | |
448 | 20 | Octothorpe => { |
449 | 20 | tokenizer.unconsume(input, StrTendril::from_slice("#")); |
450 | 20 | tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); |
451 | 20 | self.finish_none(); |
452 | 20 | }, |
453 | | } |
454 | | } |
455 | 729 | } |
456 | | } |