Coverage Report

Created: 2026-03-31 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/html5ever/xml5ever/src/tokenizer/char_ref/mod.rs
Line
Count
Source
1
// Copyright 2014-2017 The html5ever Project Developers. See the
2
// COPYRIGHT file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
use super::{TokenSink, XmlTokenizer};
11
use crate::data;
12
use crate::tendril::StrTendril;
13
use log::debug;
14
use markup5ever::buffer_queue::BufferQueue;
15
use std::borrow::Cow::{self, Borrowed};
16
use std::char::from_u32;
17
18
use self::State::*;
19
pub use self::Status::*;
20
21
//ยง tokenizing-character-references
22
pub struct CharRef {
23
    /// The resulting character(s)
24
    pub chars: [char; 2],
25
26
    /// How many slots in `chars` are valid?
27
    pub num_chars: u8,
28
}
29
30
pub enum Status {
31
    Stuck,
32
    Progress,
33
    Done,
34
}
35
36
#[derive(Debug)]
37
enum State {
38
    Begin,
39
    Octothorpe,
40
    Numeric(u32), // base
41
    NumericSemicolon,
42
    Named,
43
    BogusName,
44
}
45
46
pub struct CharRefTokenizer {
47
    state: State,
48
    addnl_allowed: Option<char>,
49
    result: Option<CharRef>,
50
51
    num: u32,
52
    num_too_big: bool,
53
    seen_digit: bool,
54
    hex_marker: Option<char>,
55
56
    name_buf_opt: Option<StrTendril>,
57
    name_match: Option<(u32, u32)>,
58
    name_len: usize,
59
}
60
61
impl CharRefTokenizer {
62
    // NB: We assume that we have an additional allowed character iff we're
63
    // tokenizing in an attribute value.
64
737k
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
65
737k
        CharRefTokenizer {
66
737k
            state: Begin,
67
737k
            addnl_allowed,
68
737k
            result: None,
69
737k
            num: 0,
70
737k
            num_too_big: false,
71
737k
            seen_digit: false,
72
737k
            hex_marker: None,
73
737k
            name_buf_opt: None,
74
737k
            name_match: None,
75
737k
            name_len: 0,
76
737k
        }
77
737k
    }
78
79
    // A CharRefTokenizer can only tokenize one character reference,
80
    // so this method consumes the tokenizer.
81
737k
    pub fn get_result(self) -> CharRef {
82
737k
        self.result.expect("get_result called before done")
83
737k
    }
84
85
2.77M
    fn name_buf(&self) -> &StrTendril {
86
2.77M
        self.name_buf_opt
87
2.77M
            .as_ref()
88
2.77M
            .expect("name_buf missing in named character reference")
89
2.77M
    }
90
91
7.02M
    fn name_buf_mut(&mut self) -> &mut StrTendril {
92
7.02M
        self.name_buf_opt
93
7.02M
            .as_mut()
94
7.02M
            .expect("name_buf missing in named character reference")
95
7.02M
    }
96
97
569k
    fn finish_none(&mut self) -> Status {
98
569k
        self.result = Some(CharRef {
99
569k
            chars: ['\0', '\0'],
100
569k
            num_chars: 0,
101
569k
        });
102
569k
        Done
103
569k
    }
104
105
70.9k
    fn finish_one(&mut self, c: char) -> Status {
106
70.9k
        self.result = Some(CharRef {
107
70.9k
            chars: [c, '\0'],
108
70.9k
            num_chars: 1,
109
70.9k
        });
110
70.9k
        Done
111
70.9k
    }
112
}
113
114
impl CharRefTokenizer {
115
8.36M
    pub fn step<Sink: TokenSink>(
116
8.36M
        &mut self,
117
8.36M
        tokenizer: &XmlTokenizer<Sink>,
118
8.36M
        input: &BufferQueue,
119
8.36M
    ) -> Status {
120
8.36M
        if self.result.is_some() {
121
0
            return Done;
122
8.36M
        }
123
124
8.36M
        debug!("char ref tokenizer stepping in state {:?}", self.state);
125
8.36M
        match self.state {
126
749k
            Begin => self.do_begin(tokenizer, input),
127
121k
            Octothorpe => self.do_octothorpe(tokenizer, input),
128
262k
            Numeric(base) => self.do_numeric(tokenizer, base, input),
129
70.6k
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
130
2.24M
            Named => self.do_named(tokenizer, input),
131
4.90M
            BogusName => self.do_bogus_name(tokenizer, input),
132
        }
133
8.36M
    }
134
135
749k
    fn do_begin<Sink: TokenSink>(
136
749k
        &mut self,
137
749k
        tokenizer: &XmlTokenizer<Sink>,
138
749k
        input: &BufferQueue,
139
749k
    ) -> Status {
140
749k
        match tokenizer.peek(input) {
141
42.7k
            Some('\t' | '\n' | '\x0C' | ' ' | '<' | '&') => self.finish_none(),
142
694k
            Some(c) if Some(c) == self.addnl_allowed => self.finish_none(),
143
            Some('#') => {
144
118k
                tokenizer.discard_char(input);
145
118k
                self.state = Octothorpe;
146
118k
                Progress
147
            },
148
            Some(_) => {
149
575k
                self.state = Named;
150
575k
                self.name_buf_opt = Some(StrTendril::new());
151
575k
                Progress
152
            },
153
12.3k
            None => Stuck,
154
        }
155
749k
    }
156
157
121k
    fn do_octothorpe<Sink: TokenSink>(
158
121k
        &mut self,
159
121k
        tokenizer: &XmlTokenizer<Sink>,
160
121k
        input: &BufferQueue,
161
121k
    ) -> Status {
162
121k
        match tokenizer.peek(input) {
163
27.1k
            Some(c @ ('x' | 'X')) => {
164
27.1k
                tokenizer.discard_char(input);
165
27.1k
                self.hex_marker = Some(c);
166
27.1k
                self.state = Numeric(16);
167
27.1k
            },
168
91.2k
            Some(_) => {
169
91.2k
                self.hex_marker = None;
170
91.2k
                self.state = Numeric(10);
171
91.2k
            },
172
2.99k
            None => return Stuck,
173
        }
174
118k
        Progress
175
121k
    }
176
177
262k
    fn do_numeric<Sink: TokenSink>(
178
262k
        &mut self,
179
262k
        tokenizer: &XmlTokenizer<Sink>,
180
262k
        base: u32,
181
262k
        input: &BufferQueue,
182
262k
    ) -> Status {
183
262k
        let Some(c) = tokenizer.peek(input) else {
184
1.64k
            return Stuck;
185
        };
186
261k
        match c.to_digit(base) {
187
143k
            Some(n) => {
188
143k
                tokenizer.discard_char(input);
189
143k
                self.num = self.num.wrapping_mul(base);
190
143k
                if self.num > 0x10FFFF {
191
2.95k
                    // We might overflow, and the character is definitely invalid.
192
2.95k
                    // We still parse digits and semicolon, but don't use the result.
193
2.95k
                    self.num_too_big = true;
194
140k
                }
195
143k
                self.num = self.num.wrapping_add(n);
196
143k
                self.seen_digit = true;
197
143k
                Progress
198
            },
199
200
47.4k
            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
201
202
            None => {
203
70.6k
                self.state = NumericSemicolon;
204
70.6k
                Progress
205
            },
206
        }
207
262k
    }
208
209
70.6k
    fn do_numeric_semicolon<Sink: TokenSink>(
210
70.6k
        &mut self,
211
70.6k
        tokenizer: &XmlTokenizer<Sink>,
212
70.6k
        input: &BufferQueue,
213
70.6k
    ) -> Status {
214
70.6k
        match tokenizer.peek(input) {
215
396
            Some(';') => tokenizer.discard_char(input),
216
70.2k
            Some(_) => tokenizer.emit_error(Borrowed(
217
70.2k
                "Semicolon missing after numeric character reference",
218
70.2k
            )),
219
0
            None => return Stuck,
220
        };
221
70.6k
        self.finish_numeric(tokenizer)
222
70.6k
    }
223
224
47.4k
    fn unconsume_numeric<Sink: TokenSink>(
225
47.4k
        &mut self,
226
47.4k
        tokenizer: &XmlTokenizer<Sink>,
227
47.4k
        input: &BufferQueue,
228
47.4k
    ) -> Status {
229
47.4k
        let mut unconsume = StrTendril::from_char('#');
230
47.4k
        if let Some(c) = self.hex_marker {
231
7.38k
            unconsume.push_char(c);
232
40.0k
        }
233
234
47.4k
        tokenizer.unconsume(input, unconsume);
235
47.4k
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
236
47.4k
        self.finish_none()
237
47.4k
    }
238
239
70.9k
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
240
62.4k
        fn conv(n: u32) -> char {
241
62.4k
            from_u32(n).expect("invalid char missed by error handling cases")
242
62.4k
        }
243
244
70.9k
        let (c, error) = match self.num {
245
70.9k
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
246
6.32k
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
247
248
4.67k
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
249
1.59k
                Some(c) => (c, true),
250
1.48k
                None => (conv(self.num), true),
251
            },
252
253
58.0k
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
254
255
49.3k
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
256
257
48.9k
            n => (conv(n), false),
258
        };
259
260
70.9k
        if error {
261
21.9k
            let msg = if tokenizer.opts.exact_errors {
262
0
                Cow::from(format!(
263
0
                    "Invalid numeric character reference value 0x{:06X}",
264
                    self.num
265
                ))
266
            } else {
267
21.9k
                Cow::from("Invalid numeric character reference")
268
            };
269
21.9k
            tokenizer.emit_error(msg);
270
48.9k
        }
271
272
70.9k
        self.finish_one(c)
273
70.9k
    }
274
275
2.24M
    fn do_named<Sink: TokenSink>(
276
2.24M
        &mut self,
277
2.24M
        tokenizer: &XmlTokenizer<Sink>,
278
2.24M
        input: &BufferQueue,
279
2.24M
    ) -> Status {
280
2.24M
        let Some(c) = tokenizer.get_char(input) else {
281
29.8k
            return Stuck;
282
        };
283
2.21M
        self.name_buf_mut().push_char(c);
284
2.21M
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
285
            // We have either a full match or a prefix of one.
286
1.64M
            Some(&m) => {
287
1.64M
                if m.0 != 0 {
288
98.9k
                    // We have a full match, but there might be a longer one to come.
289
98.9k
                    self.name_match = Some(m);
290
98.9k
                    self.name_len = self.name_buf().len();
291
1.54M
                }
292
                // Otherwise we just have a prefix match.
293
1.64M
                Progress
294
            },
295
296
            // Can't continue the match.
297
575k
            None => self.finish_named(tokenizer, Some(c), input),
298
        }
299
2.24M
    }
300
301
71.4k
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
302
71.4k
        let msg = if tokenizer.opts.exact_errors {
303
0
            Cow::from(format!("Invalid character reference &{}", self.name_buf()))
304
        } else {
305
71.4k
            Cow::from("Invalid character reference")
306
        };
307
71.4k
        tokenizer.emit_error(msg);
308
71.4k
    }
309
310
478k
    fn unconsume_name<Sink: TokenSink>(
311
478k
        &mut self,
312
478k
        tokenizer: &XmlTokenizer<Sink>,
313
478k
        input: &BufferQueue,
314
478k
    ) {
315
478k
        tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
316
478k
    }
317
318
575k
    fn finish_named<Sink: TokenSink>(
319
575k
        &mut self,
320
575k
        tokenizer: &XmlTokenizer<Sink>,
321
575k
        end_char: Option<char>,
322
575k
        input: &BufferQueue,
323
575k
    ) -> Status {
324
575k
        match self.name_match {
325
            None => {
326
68.3k
                match end_char {
327
477k
                    Some(c) if c.is_ascii_alphanumeric() => {
328
                        // Keep looking for a semicolon, to determine whether
329
                        // we emit a parse error.
330
205k
                        self.state = BogusName;
331
205k
                        return Progress;
332
                    },
333
334
                    // Check length because &; is not a parse error.
335
68.3k
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
336
337
204k
                    _ => (),
338
                }
339
272k
                self.unconsume_name(tokenizer, input);
340
272k
                self.finish_none()
341
            },
342
343
98.5k
            Some((c1, c2)) => {
344
                // We have a complete match, but we may have consumed
345
                // additional characters into self.name_buf.  Usually
346
                // at least one, but several in cases like
347
                //
348
                //     &not    => match for U+00AC
349
                //     &noti   => valid prefix for &notin
350
                //     &notit  => can't continue match
351
352
98.5k
                let name_len = self.name_len;
353
98.5k
                assert!(name_len > 0);
354
98.5k
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
355
356
                // There might not be a next character after the match, if
357
                // we had a full match and then hit EOF.
358
98.5k
                let next_after = if name_len == self.name_buf().len() {
359
120
                    None
360
                } else {
361
98.4k
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
362
                };
363
364
                // "If the character reference is being consumed as part of an
365
                // attribute, and the last character matched is not a U+003B
366
                // SEMICOLON character (;), and the next character is either a
367
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
368
                // character, then, for historical reasons, all the characters
369
                // that were matched after the U+0026 AMPERSAND character (&)
370
                // must be unconsumed, and nothing is returned. However, if
371
                // this next character is in fact a U+003D EQUALS SIGN
372
                // character (=), then this is a parse error"
373
374
98.5k
                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
375
77.3k
                    (_, ';', _) => false,
376
                    (Some(_), _, Some('=')) => {
377
194
                        tokenizer.emit_error(Borrowed(
378
194
                            "Equals sign after character reference in attribute",
379
194
                        ));
380
194
                        true
381
                    },
382
1.91k
                    (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
383
                    _ => {
384
19.9k
                        tokenizer.emit_error(Borrowed(
385
19.9k
                            "Character reference does not end with semicolon",
386
19.9k
                        ));
387
19.9k
                        false
388
                    },
389
                };
390
391
98.5k
                if unconsume_all {
392
1.25k
                    self.unconsume_name(tokenizer, input);
393
1.25k
                    self.finish_none()
394
                } else {
395
97.2k
                    tokenizer
396
97.2k
                        .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
397
                    self.result = Some(CharRef {
398
97.2k
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
399
97.2k
                        num_chars: if c2 == 0 { 1 } else { 2 },
400
                    });
401
97.2k
                    Done
402
                }
403
            },
404
        }
405
575k
    }
406
407
4.90M
    fn do_bogus_name<Sink: TokenSink>(
408
4.90M
        &mut self,
409
4.90M
        tokenizer: &XmlTokenizer<Sink>,
410
4.90M
        input: &BufferQueue,
411
4.90M
    ) -> Status {
412
4.90M
        let Some(c) = tokenizer.get_char(input) else {
413
103k
            return Stuck;
414
        };
415
4.80M
        self.name_buf_mut().push_char(c);
416
4.80M
        match c {
417
4.80M
            _ if c.is_ascii_alphanumeric() => return Progress,
418
3.67k
            ';' => self.emit_name_error(tokenizer),
419
201k
            _ => (),
420
        }
421
205k
        self.unconsume_name(tokenizer, input);
422
205k
        self.finish_none()
423
4.90M
    }
424
425
729
    pub fn end_of_file<Sink: TokenSink>(
426
729
        &mut self,
427
729
        tokenizer: &XmlTokenizer<Sink>,
428
729
        input: &BufferQueue,
429
729
    ) {
430
1.45k
        while self.result.is_none() {
431
330
            match self.state {
432
147
                Begin => drop(self.finish_none()),
433
434
16
                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
435
436
314
                Numeric(_) | NumericSemicolon => {
437
314
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
438
314
                    self.finish_numeric(tokenizer);
439
314
                },
440
441
160
                Named => drop(self.finish_named(tokenizer, None, input)),
442
443
72
                BogusName => {
444
72
                    self.unconsume_name(tokenizer, input);
445
72
                    self.finish_none();
446
72
                },
447
448
20
                Octothorpe => {
449
20
                    tokenizer.unconsume(input, StrTendril::from_slice("#"));
450
20
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
451
20
                    self.finish_none();
452
20
                },
453
            }
454
        }
455
729
    }
456
}