Coverage Report

Created: 2025-07-23 06:45

/proc/self/cwd/cpp/htmlparser/tokenizer.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef CPP_HTMLPARSER_TOKENIZER_H_
2
#define CPP_HTMLPARSER_TOKENIZER_H_
3
4
#include <memory>
5
#include <optional>
6
#include <tuple>
7
#include <vector>
8
9
#include "cpp/htmlparser/token.h"
10
11
namespace htmlparser {
12
13
class Tokenizer {
14
 public:
15
  // Constructs a new HTML Tokenizer for the given html.
16
  // The input is assumed to be UTF-8 encoded.
17
  //
18
  // If tokenizing InnerHTML fragment, context_tag is that element's tag, such
19
  // as "div" or "iframe".
20
  explicit Tokenizer(std::string_view html, std::string context_tag = "");
21
22
  Tokenizer() = delete;
23
24
  // Span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
25
  // the end is exclusive.
26
  struct Span {
27
    int start = 0;
28
    int end = 0;
29
  };
30
31
  using RawAttribute = std::tuple<Span, Span, LineCol>;
32
33
  // Sets whether or not the tokenizer recognizes `<![CDATA[foo]]>` as
34
  // the text "foo". The default value is false, which means to recognize it as
35
  // a bogus comment `<!-- [CDATA[foo]] -->` instead.
36
  //
37
  // Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
38
  // only if tokenizing foreign content, such as MathML and SVG. However,
39
  // tracking foreign-contentness is difficult to do purely in the tokenizer,
40
  // as opposed to the parser, due to HTML integration points: an <svg> element
41
  // can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
42
  // HTML. For strict compliance with the HTML5 tokenization algorithm, it is
43
  // the responsibility of the user of a tokenizer to call AllowCDATA as
44
  // appropriate.
45
  // In practice, if using the tokenizer without caring whether MathML or SVG
46
  // CDATA is text or comments, such as tokenizing HTML to find all the anchor
47
  // text, it is acceptable to ignore this responsibility.
48
  void SetAllowCDATA(bool allow_cdata);
49
50
  // NextIsNotRawText instructs the tokenizer that the next token should not be
51
  // considered as 'raw text'. Some elements, such as script and title elements,
52
  // normally require the next token after the opening tag to be 'raw text' that
53
  // has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
54
  // yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
55
  // an end tag token for "</title>". There are no distinct start tag or end tag
56
  // tokens for the "<b>" and "</b>".
57
  //
58
  // This tokenizer implementation will generally look for raw text at the right
59
  // times. Strictly speaking, an HTML5 compliant tokenizer should not look for
60
  // raw text if in foreign content: <title> generally needs raw text, but a
61
  // <title> inside an <svg> does not. Another example is that a <textarea>
62
  // generally needs raw text, but a <textarea> is not allowed as an immediate
63
  // child of a <select>; in normal parsing, a <textarea> implies </select>, but
64
  // one cannot close the implicit element when parsing a <select>'s InnerHTML.
65
  // Similarly to AllowCDATA, tracking the correct moment to override raw-text-
66
  // ness is difficult to do purely in the tokenizer, as opposed to the parser.
67
  // For strict compliance with the HTML5 tokenization algorithm, it is the
68
  // responsibility of the user of a tokenizer to call NextIsNotRawText as
69
  // appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
70
  // responsibility for basic usage.
71
  //
72
  // Note that this 'raw text' concept is different from the one offered by the
73
  // Tokenizer.Raw method.
74
  void NextIsNotRawText();
75
76
  // The following two states have special meaning.
77
  // EOF is when tokenizer reaches end of HTML input.
78
  // Error is set the first time an error is encountered during tokenization.
79
  // It is possible to have EOF without Error, which means the next Next call will return an
80
  // error token. For example, if the HTML text was just "plain", or
81
  // `<!-- xml version="1.0"` (without closing), then the first Next call would
82
  // set EOF and return TextToken or CommentToken. Subsequent calls to Next
83
  // will return ErrorToken with Error set to true.
84
23.0k
  bool IsEOF() const { return eof_; }  // Reports the eof_ flag (end of HTML input reached).
85
0
  bool Error() const { return err_; }  // Reports the err_ flag (an error was encountered).
86
87
  // Returns the unmodified text of the current token. Calling Next, Token,
88
  // Text, TagName or TagAttr may change the contents of the returned slice.
89
  std::string_view Raw();
90
91
  // Scans the next token and returns its type.
92
  TokenType Next(bool template_mode = false);
93
94
  // Returns the unescaped text of a text, comment or doctype token. The
95
  // contents of the returned slice may change on the next call to Next.
96
  std::string Text();
97
98
  // Returns the lower-cased name of a tag token (the `img` out of
99
  // `<IMG SRC="foo">`) and whether the tag has attributes.
100
  // The contents of the returned slice may change on the next call to Next.
101
  std::optional<std::tuple<std::string, bool>> TagName();
102
103
  // Returns the lower-cased key and unescaped value of the next unparsed
104
  // attribute for the current tag token and whether there are more attributes.
105
  // The contents of the returned slices may change on the next call to Next.
106
  std::optional<std::tuple<Attribute, bool>> TagAttr();
107
108
  // Returns the current Token. The result's Data and Attr values remain
109
  // valid after subsequent Next calls.
110
  Token token();
111
112
  // Returns current position of the tokenizer in the html source.
113
11.5k
  LineCol CurrentPosition() const { return current_line_col_; }  // Read-only: returns a copy of current_line_col_.
114
115
  // Count of lines processed in html source.
116
0
  int LinesProcessed() { return lines_cols_.size(); }
117
118
 private:
119
  // Fragment tokenization is allowed from these parent elements only.
120
  inline static constexpr std::array<Atom, 10> kAllowedFragmentContainers{
121
      Atom::IFRAME,    Atom::NOEMBED, Atom::NOFRAMES, Atom::NOSCRIPT,
122
      Atom::PLAINTEXT, Atom::SCRIPT,  Atom::STYLE,    Atom::TEXTAREA,
123
      Atom::TITLE,     Atom::XMP,
124
  };
125
126
  // Returns the next byte from the input stream, doing a buffered read
127
  // from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a
128
  // contiguous byte slice that holds all the bytes read so far for the current
129
  // token.
130
  // It sets z.err if the underlying reader returns an error.
131
  // Pre-condition: z.err == nil.
132
  char ReadByte();
133
134
  // Moves cursor back past one byte.
135
  void UnreadByte();
136
137
  // Reads until next ">".
138
  void ReadUntilCloseAngle();
139
140
  // Reads the next start tag token. The opening "<a" has already
141
  // been consumed, where 'a' means anything in [A-Za-z].
142
  TokenType ReadStartTag(bool template_mode = false);
143
144
  // Attempts to read a CDATA section and returns true if
145
  // successful. The opening "<!" has already been consumed.
146
  bool ReadCDATA();
147
148
  // Reads until the next "</foo>", where "foo" is z.rawTag and
149
  // is typically something like "script" or "textarea".
150
  void ReadRawOrRCDATA();
151
152
  // Attempts to read a doctype declaration and returns true if
153
  // successful. The opening "<!" has already been consumed.
154
  bool ReadDoctype();
155
156
  // Reads the next tag token and its attributes. If saveAttr, those
157
  // attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
158
  // The opening "<a" or "</a" has already been consumed, where 'a' means
159
  // anything in [A-Za-z].
160
  void ReadTag(bool save_attr, bool template_mode = false);
161
162
  // Sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
163
  // is positioned such that the first byte of the tag name (the "d" in "<div")
164
  // has already been consumed.
165
  void ReadTagName();
166
167
  // Sets z.pendingAttr[0] to the "k" in "<div k=v>".
168
  // Precondition: z.err == nil.
169
  void ReadTagAttributeKey(bool template_mode = false);
170
171
  // Sets z.pendingAttr[1] to the "v" in "<div k=v>".
172
  void ReadTagAttributeValue();
173
174
  // Attempts to read a tag like "</foo>", where "foo" is z.rawTag.
175
  // If it succeeds, it backs up the input position to reconsume the tag and
176
  // returns true. Otherwise it returns false. The opening "</" has already been
177
  // consumed.
178
  bool ReadRawEndTag();
179
180
  // Reads until the next </script> tag, following the byzantine
181
  // rules for escaping/hiding the closing tag.
182
  void ReadScript();
183
184
  // Reads the next token starting with "<!". It might be
185
  // a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
186
  // "<!a bogus comment". The opening "<!" has already been consumed.
187
  TokenType ReadMarkupDeclaration();
188
189
  // Reads the next comment token starting with "<!--". The opening
190
  // "<!--" has already been consumed.
191
  void ReadComment();
192
193
  // Skips past any white space.
194
  void SkipWhiteSpace();
195
196
  // Returns whether the start tag in buffer[data.start:data.end]
197
  // case-insensitively matches any element of ss.
198
  template <typename... Args>
199
  bool StartTagIn(Args... ss);
200
201
  std::string_view buffer_;
202
203
  // buffer_[raw.start:raw.end] holds the raw bytes of the current token.
204
  // buffer_[raw.end:] is buffered input that will yield future tokens.
205
  Span raw_ = {0, 0};
206
207
  // buffer_[data.start:data.end] holds the raw bytes of the current token's
208
  // data: a text token's text, a tag token's tag name, etc.
209
  Span data_ = {0, 0};
210
211
  // TokenType of the current token.
212
  TokenType token_type_;
213
214
  // Attribute key and value currently being tokenized.
215
  RawAttribute pending_attribute_;
216
217
  std::vector<RawAttribute> attributes_{};
218
219
  int n_attributes_returned_ = 0;
220
221
  // raw_tag_ is the "script" in "</script>" that closes the next token. If
222
  // non-empty, the subsequent call to Next will return a raw or RCDATA text
223
  // token: one that treats "<p>" as text instead of an element.
224
  // raw_tag_'s contents are lower-cased.
225
  std::string raw_tag_;
226
227
  // text_is_raw_ is whether the current text token's data is not escaped.
228
  bool text_is_raw_ = false;
229
230
  // Whether NULL bytes in the current token's data should be converted
231
  // into \ufffd replacement characters.
232
  bool convert_null_ = false;
233
234
  // allow_cdata_ is whether CDATA sections are allowed in the current context.
235
  bool allow_cdata_ = false;
236
237
  // Cursor reached the end of the buffer.
238
  bool eof_ = false;
239
  bool err_ = false;
240
241
  // Tells if the token is manufactured.
242
  // In a few cases, for example '<' followed by '?', the input is treated as a comment and
243
  // a comment token is manufactured.
244
  // This is not the same as manufactured html, head, body, tbody, thead etc.;
245
  // these are manufactured during parsing, not tokenization.
246
  // This field accounts only for special cases where illegal characters lead
247
  // to the manufacturing of a comment token.
248
  // Eg:
249
  // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-question-mark-instead-of-tag-name
250
  bool is_token_manufactured_ = false;
251
252
  // Keeps track of all the lines and columns in HTML source.
253
  std::vector<LineCol> lines_cols_;
254
255
  // Keeps track of current position of the cursor.
256
  LineCol current_line_col_;
257
258
  // Current token's line col record. One line can have several tokens.
259
  LineCol token_line_col_;
260
};
261
262
}  // namespace htmlparser
263
264
#endif  // CPP_HTMLPARSER_TOKENIZER_H_