/proc/self/cwd/cpp/htmlparser/tokenizer.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef CPP_HTMLPARSER_TOKENIZER_H_ |
2 | | #define CPP_HTMLPARSER_TOKENIZER_H_ |
3 | | |
4 | | #include <memory> |
5 | | #include <optional> |
6 | | #include <tuple> |
7 | | #include <vector> |
8 | | |
9 | | #include "cpp/htmlparser/token.h" |
10 | | |
11 | | namespace htmlparser { |
12 | | |
13 | | class Tokenizer { |
14 | | public: |
15 | | // Constructs a new HTML Tokenizer for the given html. |
16 | | // The input is assumed to be UTF-8 encoded. |
17 | | // |
18 | | // If tokenizing an InnerHTML fragment, context_tag is that element's tag, |
19 | | // such as "div" or "iframe". |
20 | | explicit Tokenizer(std::string_view html, std::string context_tag = ""); |
21 | | |
22 | | Tokenizer() = delete; |
23 | | |
24 | | // Span is a range of bytes in a Tokenizer's buffer. The start is inclusive, |
25 | | // the end is exclusive. |
26 | | struct Span { |
27 | | int start = 0; |
28 | | int end = 0; |
29 | | }; |
30 | | |
31 | | using RawAttribute = std::tuple<Span, Span, LineCol>; |
32 | | |
33 | | // Sets whether or not the tokenizer recognizes `<![CDATA[foo]]>` as |
34 | | // the text "foo". The default value is false, which means to recognize it as |
35 | | // a bogus comment `<!-- [CDATA[foo]] -->` instead. |
36 | | // |
37 | | // Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and |
38 | | // only if tokenizing foreign content, such as MathML and SVG. However, |
39 | | // tracking foreign-contentness is difficult to do purely in the tokenizer, |
40 | | // as opposed to the parser, due to HTML integration points: an <svg> element |
41 | | // can contain a <foreignObject> that is foreign-to-SVG but not foreign-to- |
42 | | // HTML. For strict compliance with the HTML5 tokenization algorithm, it is |
43 | | // the responsibility of the user of a tokenizer to call SetAllowCDATA as |
44 | | // appropriate. |
45 | | // In practice, if using the tokenizer without caring whether MathML or SVG |
46 | | // CDATA is text or comments, such as tokenizing HTML to find all the anchor |
47 | | // text, it is acceptable to ignore this responsibility. |
48 | | void SetAllowCDATA(bool allow_cdata); |
49 | | |
50 | | // NextIsNotRawText instructs the tokenizer that the next token should not be |
51 | | // considered as 'raw text'. Some elements, such as script and title elements, |
52 | | // normally require the next token after the opening tag to be 'raw text' that |
53 | | // has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>" |
54 | | // yields a start tag token for "<title>", a text token for "a<b>c</b>d", and |
55 | | // an end tag token for "</title>". There are no distinct start tag or end tag |
56 | | // tokens for the "<b>" and "</b>". |
57 | | // |
58 | | // This tokenizer implementation will generally look for raw text at the right |
59 | | // times. Strictly speaking, an HTML5 compliant tokenizer should not look for |
60 | | // raw text if in foreign content: <title> generally needs raw text, but a |
61 | | // <title> inside an <svg> does not. Another example is that a <textarea> |
62 | | // generally needs raw text, but a <textarea> is not allowed as an immediate |
63 | | // child of a <select>; in normal parsing, a <textarea> implies </select>, but |
64 | | // one cannot close the implicit element when parsing a <select>'s InnerHTML. |
65 | | // Similarly to SetAllowCDATA, tracking the correct moment to override raw- |
66 | | // text-ness is difficult to do purely in the tokenizer, as opposed to the parser. |
67 | | // For strict compliance with the HTML5 tokenization algorithm, it is the |
68 | | // responsibility of the user of a tokenizer to call NextIsNotRawText as |
69 | | // appropriate. In practice, like SetAllowCDATA, it is acceptable to ignore |
70 | | // this responsibility for basic usage. |
71 | | // |
72 | | // Note that this 'raw text' concept is different from the one offered by the |
73 | | // Tokenizer::Raw method. |
74 | | void NextIsNotRawText(); |
75 | | |
76 | | // The following two states have special meaning. |
77 | | // EOF is set when the tokenizer reaches the end of the HTML input. |
78 | | // Error is set the first time an error is encountered during tokenization. |
79 | | // It is possible to have EOF without Error, which means the next Next call |
80 | | // will return an error token. For example, if the HTML text was just "plain", or |
81 | | // `<!-- xml version="1.0"` (without closing), then the first Next call would |
82 | | // set EOF and return TextToken or CommentToken. Subsequent calls to Next |
83 | | // will return ErrorToken with Error set to true. |
84 | 23.0k | bool IsEOF() const { return eof_; } |
85 | 0 | bool Error() const { return err_; } |
86 | | |
87 | | // Returns the unmodified text of the current token. Calling Next, token, |
88 | | // Text, TagName or TagAttr may change the contents of the returned slice. |
89 | | std::string_view Raw(); |
90 | | |
91 | | // Scans the next token and returns its type. |
92 | | TokenType Next(bool template_mode = false); |
93 | | |
94 | | // Returns the unescaped text of a text, comment or doctype token. The |
95 | | // contents of the returned slice may change on the next call to Next. |
96 | | std::string Text(); |
97 | | |
98 | | // Returns the lower-cased name of a tag token (the `img` out of |
99 | | // `<IMG SRC="foo">`) and whether the tag has attributes. |
100 | | // The contents of the returned slice may change on the next call to Next. |
101 | | std::optional<std::tuple<std::string, bool>> TagName(); |
102 | | |
103 | | // Returns the lower-cased key and unescaped value of the next unparsed |
104 | | // attribute for the current tag token and whether there are more attributes. |
105 | | // The contents of the returned slices may change on the next call to Next. |
106 | | std::optional<std::tuple<Attribute, bool>> TagAttr(); |
107 | | |
108 | | // Returns the current Token. The result's Data and Attr values remain |
109 | | // valid after subsequent Next calls. |
110 | | Token token(); |
111 | | |
112 | | // Returns the current position of the tokenizer in the html source. |
113 | 11.5k | LineCol CurrentPosition() { return current_line_col_; } |
114 | | |
115 | | // Count of lines processed in the html source. |
116 | 0 | int LinesProcessed() { return lines_cols_.size(); } |
117 | | |
118 | | private: |
119 | | // Fragment tokenization is allowed from these parent elements only. |
120 | | inline static constexpr std::array<Atom, 10> kAllowedFragmentContainers{ |
121 | | Atom::IFRAME, Atom::NOEMBED, Atom::NOFRAMES, Atom::NOSCRIPT, |
122 | | Atom::PLAINTEXT, Atom::SCRIPT, Atom::STYLE, Atom::TEXTAREA, |
123 | | Atom::TITLE, Atom::XMP, |
124 | | }; |
125 | | |
126 | | // Returns the next byte from the input stream, doing a buffered read |
127 | | // from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a |
128 | | // contiguous byte slice that holds all the bytes read so far for the current |
129 | | // token. |
130 | | // It sets z.err if the underlying reader returns an error. |
131 | | // Pre-condition: z.err == nil. |
132 | | char ReadByte(); |
133 | | |
134 | | // Moves cursor back past one byte. |
135 | | void UnreadByte(); |
136 | | |
137 | | // Reads until next ">". |
138 | | void ReadUntilCloseAngle(); |
139 | | |
140 | | // Reads the next start tag token. The opening "<a" has already |
141 | | // been consumed, where 'a' means anything in [A-Za-z]. |
142 | | TokenType ReadStartTag(bool template_mode = false); |
143 | | |
144 | | // Attempts to read a CDATA section and returns true if |
145 | | // successful. The opening "<!" has already been consumed. |
146 | | bool ReadCDATA(); |
147 | | |
148 | | // Reads until the next "</foo>", where "foo" is z.rawTag and |
149 | | // is typically something like "script" or "textarea". |
150 | | void ReadRawOrRCDATA(); |
151 | | |
152 | | // Attempts to read a doctype declaration and returns true if |
153 | | // successful. The opening "<!" has already been consumed. |
154 | | bool ReadDoctype(); |
155 | | |
156 | | // Reads the next tag token and its attributes. If saveAttr, those |
157 | | // attributes are saved in z.attr, otherwise z.attr is set to an empty slice. |
158 | | // The opening "<a" or "</a" has already been consumed, where 'a' means |
159 | | // anything in [A-Za-z]. |
160 | | void ReadTag(bool save_attr, bool template_mode = false); |
161 | | |
162 | | // Sets z.data to the "div" in "<div k=v>". The reader (z.raw.end) |
163 | | // is positioned such that the first byte of the tag name (the "d" in "<div") |
164 | | // has already been consumed. |
165 | | void ReadTagName(); |
166 | | |
167 | | // Sets z.pendingAttr[0] to the "k" in "<div k=v>". |
168 | | // Precondition: z.err == nil. |
169 | | void ReadTagAttributeKey(bool template_mode = false); |
170 | | |
171 | | // Sets z.pendingAttr[1] to the "v" in "<div k=v>". |
172 | | void ReadTagAttributeValue(); |
173 | | |
174 | | // Attempts to read a tag like "</foo>", where "foo" is z.rawTag. |
175 | | // If it succeeds, it backs up the input position to reconsume the tag and |
176 | | // returns true. Otherwise it returns false. The opening "</" has already been |
177 | | // consumed. |
178 | | bool ReadRawEndTag(); |
179 | | |
180 | | // Reads until the next </script> tag, following the byzantine |
181 | | // rules for escaping/hiding the closing tag. |
182 | | void ReadScript(); |
183 | | |
184 | | // Reads the next token starting with "<!". It might be |
185 | | // a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or |
186 | | // "<!a bogus comment". The opening "<!" has already been consumed. |
187 | | TokenType ReadMarkupDeclaration(); |
188 | | |
189 | | // Reads the next comment token starting with "<!--". The opening |
190 | | // "<!--" has already been consumed. |
191 | | void ReadComment(); |
192 | | |
193 | | // Skips past any white space. |
194 | | void SkipWhiteSpace(); |
195 | | |
196 | | // Returns whether the start tag in buffer[data.start:data.end] |
197 | | // case-insensitively matches any element of ss. |
198 | | template <typename... Args> |
199 | | bool StartTagIn(Args... ss); |
200 | | |
201 | | std::string_view buffer_; |
202 | | |
203 | | // buffer_[raw.start:raw.end] holds the raw bytes of the current token. |
204 | | // buffer_[raw.end:] is buffered input that will yield future tokens. |
205 | | Span raw_ = {0, 0}; |
206 | | |
207 | | // buffer_[data.start:data.end] holds the raw bytes of the current token's |
208 | | // data: a text token's text, a tag token's tag name, etc. |
209 | | Span data_ = {0, 0}; |
210 | | |
211 | | // TokenType of the current token. |
212 | | TokenType token_type_; |
213 | | |
214 | | // Attribute key and value currently being tokenized. |
215 | | RawAttribute pending_attribute_; |
216 | | |
217 | | std::vector<RawAttribute> attributes_{}; |
218 | | |
219 | | int n_attributes_returned_ = 0; |
220 | | |
221 | | // raw_tag_ is the "script" in "</script>" that closes the next token. If |
222 | | // non-empty, the subsequent call to Next will return a raw or RCDATA text |
223 | | // token: one that treats "<p>" as text instead of an element. |
224 | | // raw_tag_'s contents are lower-cased. |
225 | | std::string raw_tag_; |
226 | | |
227 | | // text_is_raw_ is whether the current text token's data is not escaped. |
228 | | bool text_is_raw_ = false; |
229 | | |
230 | | // Whether NULL bytes in the current token's data should be converted |
231 | | // into \ufffd replacement characters. |
232 | | bool convert_null_ = false; |
233 | | |
234 | | // allow_cdata_ is whether CDATA sections are allowed in the current context. |
235 | | bool allow_cdata_ = false; |
236 | | |
237 | | // Cursor reached the end of the buffer. |
238 | | bool eof_ = false; |
239 | | bool err_ = false; |
240 | | |
241 | | // Tells if the token is manufactured. |
242 | | // In a few cases, for example '<' followed by '?', the input is treated as a |
243 | | // comment and a comment token is manufactured. |
244 | | // This is not the same as manufactured html, head, body, tbody, thead etc.; |
245 | | // those are manufactured during parsing, not tokenization. |
246 | | // This field accounts only for special cases where illegal characters lead |
247 | | // to the manufacturing of a comment token. |
248 | | // Eg: |
249 | | // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-question-mark-instead-of-tag-name |
250 | | bool is_token_manufactured_ = false; |
251 | | |
252 | | // Keeps track of all the lines and columns in HTML source. |
253 | | std::vector<LineCol> lines_cols_; |
254 | | |
255 | | // Keeps track of current position of the cursor. |
256 | | LineCol current_line_col_; |
257 | | |
258 | | // Current token's line col record. One line can have several tokens. |
259 | | LineCol token_line_col_; |
260 | | }; |
261 | | |
262 | | } // namespace htmlparser |
263 | | |
264 | | #endif // CPP_HTMLPARSER_TOKENIZER_H_ |