/src/nokogiri/gumbo-parser/src/parser.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright 2017-2018 Craig Barnes. |
3 | | Copyright 2010 Google Inc. |
4 | | |
5 | | Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | you may not use this file except in compliance with the License. |
7 | | You may obtain a copy of the License at |
8 | | |
9 | | https://www.apache.org/licenses/LICENSE-2.0 |
10 | | |
11 | | Unless required by applicable law or agreed to in writing, software |
12 | | distributed under the License is distributed on an "AS IS" BASIS, |
13 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | See the License for the specific language governing permissions and |
15 | | limitations under the License. |
16 | | */ |
17 | | |
18 | | #include <assert.h> |
19 | | #include <stdarg.h> |
20 | | #include <stdint.h> |
21 | | #include <stdlib.h> |
22 | | #include <string.h> |
23 | | |
24 | | #include "ascii.h" |
25 | | #include "attribute.h" |
26 | | #include "error.h" |
27 | | #include "nokogiri_gumbo.h" |
28 | | #include "insertion_mode.h" |
29 | | #include "macros.h" |
30 | | #include "parser.h" |
31 | | #include "replacement.h" |
32 | | #include "tokenizer.h" |
33 | | #include "tokenizer_states.h" |
34 | | #include "token_buffer.h" |
35 | | #include "utf8.h" |
36 | | #include "util.h" |
37 | | #include "vector.h" |
38 | | |
39 | | typedef uint8_t TagSet[GUMBO_TAG_LAST + 1]; |
40 | 197M | #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML) |
41 | 8.48M | #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG) |
42 | 15.3M | #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML) |
43 | | |
44 | 1.25M | #define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 } |
45 | 1.25M | #define kGumboEmptySourcePosition (const GumboSourcePosition) \ |
46 | 1.25M | GUMBO_EMPTY_SOURCE_POSITION_INIT |
47 | | |
48 | | const GumboOptions kGumboDefaultOptions = { |
49 | | .tab_stop = 8, |
50 | | .stop_on_first_error = false, |
51 | | .max_attributes = 400, |
52 | | .max_tree_depth = 400, |
53 | | .max_errors = -1, |
54 | | .fragment_context = NULL, |
55 | | .fragment_namespace = GUMBO_NAMESPACE_HTML, |
56 | | .fragment_encoding = NULL, |
57 | | .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS, |
58 | | .fragment_context_has_form_ancestor = false, |
59 | | }; |
60 | | |
61 | | #define STRING(s) {.data = s, .length = sizeof(s) - 1} |
62 | | #define TERMINATOR {.data = NULL, .length = 0} |
63 | | |
64 | | // The doctype arrays have an explicit terminator because we want to pass them |
65 | | // to a helper function, and passing them as a pointer discards sizeof |
66 | | // information. The SVG arrays are used only by one-off functions, and so loops |
67 | | // over them use sizeof directly instead of a terminator. |
68 | | |
69 | | static const GumboStringPiece kQuirksModePublicIdPrefixes[] = { |
70 | | STRING("+//Silmaril//dtd html Pro v0r11 19970101//"), |
71 | | STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), |
72 | | STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"), |
73 | | STRING("-//IETF//DTD HTML 2.0 Level 1//"), |
74 | | STRING("-//IETF//DTD HTML 2.0 Level 2//"), |
75 | | STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"), |
76 | | STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"), |
77 | | STRING("-//IETF//DTD HTML 2.0 Strict//"), |
78 | | STRING("-//IETF//DTD HTML 2.0//"), |
79 | | STRING("-//IETF//DTD HTML 2.1E//"), |
80 | | STRING("-//IETF//DTD HTML 3.0//"), |
81 | | STRING("-//IETF//DTD HTML 3.2 Final//"), |
82 | | STRING("-//IETF//DTD HTML 3.2//"), |
83 | | STRING("-//IETF//DTD HTML 3//"), |
84 | | STRING("-//IETF//DTD HTML Level 0//"), |
85 | | STRING("-//IETF//DTD HTML Level 1//"), |
86 | | STRING("-//IETF//DTD HTML Level 2//"), |
87 | | STRING("-//IETF//DTD HTML Level 3//"), |
88 | | STRING("-//IETF//DTD HTML Strict Level 0//"), |
89 | | STRING("-//IETF//DTD HTML Strict Level 1//"), |
90 | | STRING("-//IETF//DTD HTML Strict Level 2//"), |
91 | | STRING("-//IETF//DTD HTML Strict Level 3//"), |
92 | | STRING("-//IETF//DTD HTML Strict//"), |
93 | | STRING("-//IETF//DTD HTML//"), |
94 | | STRING("-//Metrius//DTD Metrius Presentational//"), |
95 | | STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), |
96 | | STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), |
97 | | STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), |
98 | | STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), |
99 | | STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), |
100 | | STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"), |
101 | | STRING("-//Netscape Comm. Corp.//DTD HTML//"), |
102 | | STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"), |
103 | | STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), |
104 | | STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), |
105 | | STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), |
106 | | STRING( |
107 | | "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" |
108 | | "extensions to HTML 4.0//"), |
109 | | STRING( |
110 | | "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" |
111 | | "extensions to HTML 4.0//"), |
112 | | STRING("-//Spyglass//DTD HTML 2.0 Extended//"), |
113 | | STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), |
114 | | STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), |
115 | | STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), |
116 | | STRING("-//W3C//DTD HTML 3 1995-03-24//"), |
117 | | STRING("-//W3C//DTD HTML 3.2 Draft//"), |
118 | | STRING("-//W3C//DTD HTML 3.2 Final//"), |
119 | | STRING("-//W3C//DTD HTML 3.2//"), |
120 | | STRING("-//W3C//DTD HTML 3.2S Draft//"), |
121 | | STRING("-//W3C//DTD HTML 4.0 Frameset//"), |
122 | | STRING("-//W3C//DTD HTML 4.0 Transitional//"), |
123 | | STRING("-//W3C//DTD HTML Experimental 19960712//"), |
124 | | STRING("-//W3C//DTD HTML Experimental 970421//"), |
125 | | STRING("-//W3C//DTD W3 HTML//"), |
126 | | STRING("-//W3O//DTD W3 HTML 3.0//"), |
127 | | STRING("-//WebTechs//DTD Mozilla HTML 2.0//"), |
128 | | STRING("-//WebTechs//DTD Mozilla HTML//"), |
129 | | TERMINATOR |
130 | | }; |
131 | | |
132 | | static const GumboStringPiece kQuirksModePublicIdExactMatches[] = { |
133 | | STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"), |
134 | | STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), |
135 | | STRING("HTML"), |
136 | | TERMINATOR |
137 | | }; |
138 | | |
139 | | static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = { |
140 | | STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"), |
141 | | TERMINATOR |
142 | | }; |
143 | | |
144 | | static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = { |
145 | | STRING("-//W3C//DTD XHTML 1.0 Frameset//"), |
146 | | STRING("-//W3C//DTD XHTML 1.0 Transitional//"), |
147 | | TERMINATOR |
148 | | }; |
149 | | |
150 | | static const GumboStringPiece kSystemIdDependentPublicIdPrefixes[] = { |
151 | | STRING("-//W3C//DTD HTML 4.01 Frameset//"), |
152 | | STRING("-//W3C//DTD HTML 4.01 Transitional//"), |
153 | | TERMINATOR |
154 | | }; |
155 | | |
156 | | // Indexed by GumboNamespaceEnum; keep in sync with that. |
157 | | static const char* kLegalXmlns[] = { |
158 | | "http://www.w3.org/1999/xhtml", |
159 | | "http://www.w3.org/2000/svg", |
160 | | "http://www.w3.org/1998/Math/MathML" |
161 | | }; |
162 | | |
163 | | // The "scope marker" for the list of active formatting elements. We use a |
164 | | // pointer to this as a generic marker element, since the particular element |
165 | | // scope doesn't matter. |
166 | | static const GumboNode kActiveFormattingScopeMarker; |
167 | | |
168 | | // The tag_is and tag_in functions use true & false to denote start & end tags, |
169 | | // but for readability, we define constants for them here. |
170 | | static const bool kStartTag = true; |
171 | | static const bool kEndTag = false; |
172 | | |
173 | | // Because GumboStringPieces are immutable, we can't insert a character directly |
174 | | // into a text node. Instead, we accumulate all pending characters here and |
175 | | // flush them out to a text node whenever a new element is inserted. |
176 | | // |
177 | | // https://html.spec.whatwg.org/multipage/parsing.html#insert-a-character |
178 | | typedef struct _TextNodeBufferState { |
179 | | // The accumulated text to be inserted into the current text node. |
180 | | GumboStringBuffer _buffer; |
181 | | |
182 | | // A pointer to the original text represented by this text node. Note that |
183 | | // because of foster parenting and other strange DOM manipulations, this may |
184 | | // include other non-text HTML tags in it; it is defined as the span of |
185 | | // original text from the first character in this text node to the last |
186 | | // character in this text node. |
187 | | const char* _start_original_text; |
188 | | |
189 | | // The source position of the start of this text node. |
190 | | GumboSourcePosition _start_position; |
191 | | |
192 | | // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). |
193 | | GumboNodeType _type; |
194 | | } TextNodeBufferState; |
195 | | |
196 | | typedef struct GumboInternalParserState { |
197 | | // https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode |
198 | | GumboInsertionMode _insertion_mode; |
199 | | |
200 | | // Used for run_generic_parsing_algorithm, which needs to switch back to the |
201 | | // original insertion mode at its conclusion. |
202 | | GumboInsertionMode _original_insertion_mode; |
203 | | |
204 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements |
205 | | GumboVector /*GumboNode*/ _open_elements; |
206 | | |
207 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements |
208 | | GumboVector /*GumboNode*/ _active_formatting_elements; |
209 | | |
210 | | // The stack of template insertion modes. |
211 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode |
212 | | GumboVector /*InsertionMode*/ _template_insertion_modes; |
213 | | |
214 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers |
215 | | GumboNode* _head_element; |
216 | | GumboNode* _form_element; |
217 | | |
218 | | // The element used as fragment context when parsing in fragment mode |
219 | | GumboNode* _fragment_ctx; |
220 | | |
221 | | // The flag for when the spec says "Reprocess the current token in..." |
222 | | bool _reprocess_current_token; |
223 | | |
224 | | // The flag for "acknowledge the token's self-closing flag". |
225 | | bool _self_closing_flag_acknowledged; |
226 | | |
227 | | // The "frameset-ok" flag from the spec. |
228 | | bool _frameset_ok; |
229 | | |
230 | | // The flag for "If the next token is a LINE FEED, ignore that token...". |
231 | | bool _ignore_next_linefeed; |
232 | | |
233 | | // The flag for "whenever a node would be inserted into the current node, it |
234 | | // must instead be foster parented". This is used for misnested table |
235 | | // content, which needs to be handled according to "in body" rules yet foster |
236 | | // parented outside of the table. |
237 | | // It would perhaps be more explicit to have this as a parameter to |
238 | | // handle_in_body and insert_element, but given how special-purpose this is |
239 | | // and the number of call-sites that would need to take the extra parameter, |
240 | | // it's easier just to have a state flag. |
241 | | bool _foster_parent_insertions; |
242 | | |
243 | | // The accumulated text node buffer state. |
244 | | TextNodeBufferState _text_node; |
245 | | |
246 | | // The accumulated character tokens in tables for error purposes. |
247 | | GumboCharacterTokenBuffer _table_character_tokens; |
248 | | |
249 | | // The current token. |
250 | | GumboToken* _current_token; |
251 | | |
252 | | // The way that the spec is written, the </body> and </html> tags are *always* |
253 | | // implicit, because encountering one of those tokens merely switches the |
254 | | // insertion mode out of "in body". So we have individual state flags for |
255 | | // those end tags that are then inspected by pop_current_node when the <body> |
256 | | // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG |
257 | | // flag appropriately. |
258 | | bool _closed_body_tag; |
259 | | bool _closed_html_tag; |
260 | | } GumboParserState; |
261 | | |
262 | 29.6k | static bool token_has_attribute(const GumboToken* token, const char* name) { |
263 | 29.6k | assert(token->type == GUMBO_TOKEN_START_TAG); |
264 | 0 | return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL; |
265 | 29.6k | } |
266 | | |
267 | | // Checks if the value of the specified attribute is a case-insensitive match |
268 | | // for the specified string. |
269 | | static bool attribute_matches ( |
270 | | const GumboVector* attributes, |
271 | | const char* name, |
272 | | const char* value |
273 | 2.59k | ) { |
274 | 2.59k | const GumboAttribute* attr = gumbo_get_attribute(attributes, name); |
275 | 2.59k | return attr ? gumbo_ascii_strcasecmp(value, attr->value) == 0 : false; |
276 | 2.59k | } |
277 | | |
278 | | // Checks if the value of the specified attribute is a case-sensitive match |
279 | | // for the specified string. |
280 | | static bool attribute_matches_case_sensitive ( |
281 | | const GumboVector* attributes, |
282 | | const char* name, |
283 | | const char* value |
284 | 13.4k | ) { |
285 | 13.4k | const GumboAttribute* attr = gumbo_get_attribute(attributes, name); |
286 | 13.4k | return attr ? strcmp(value, attr->value) == 0 : false; |
287 | 13.4k | } |
288 | | |
289 | | // Checks if the specified attribute vectors are identical. |
290 | | static bool all_attributes_match ( |
291 | | const GumboVector* attr1, |
292 | | const GumboVector* attr2 |
293 | 234k | ) { |
294 | 234k | unsigned int num_unmatched_attr2_elements = attr2->length; |
295 | 238k | for (unsigned int i = 0; i < attr1->length; ++i) { |
296 | 13.4k | const GumboAttribute* attr = attr1->data[i]; |
297 | 13.4k | if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) { |
298 | 4.01k | --num_unmatched_attr2_elements; |
299 | 9.43k | } else { |
300 | 9.43k | return false; |
301 | 9.43k | } |
302 | 13.4k | } |
303 | 224k | return num_unmatched_attr2_elements == 0; |
304 | 234k | } |
305 | | |
306 | 8.50M | static void set_frameset_not_ok(GumboParser* parser) { |
307 | 8.50M | gumbo_debug("Setting frameset_ok to false.\n"); |
308 | 8.50M | parser->_parser_state->_frameset_ok = false; |
309 | 8.50M | } |
310 | | |
311 | 1.84M | static GumboNode* create_node(GumboNodeType type) { |
312 | 1.84M | GumboNode* node = gumbo_alloc(sizeof(GumboNode)); |
313 | 1.84M | node->parent = NULL; |
314 | 1.84M | node->index_within_parent = -1; |
315 | 1.84M | node->type = type; |
316 | 1.84M | node->parse_flags = GUMBO_INSERTION_NORMAL; |
317 | 1.84M | return node; |
318 | 1.84M | } |
319 | | |
320 | 10.0k | static GumboNode* new_document_node() { |
321 | 10.0k | GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT); |
322 | 10.0k | document_node->parse_flags = GUMBO_INSERTION_BY_PARSER; |
323 | 10.0k | gumbo_vector_init(1, &document_node->v.document.children); |
324 | | |
325 | | // Must be initialized explicitly, as there's no guarantee that we'll see a |
326 | | // doc type token. |
327 | 10.0k | GumboDocument* document = &document_node->v.document; |
328 | 10.0k | document->has_doctype = false; |
329 | 10.0k | document->name = NULL; |
330 | 10.0k | document->public_identifier = NULL; |
331 | 10.0k | document->system_identifier = NULL; |
332 | 10.0k | document->doc_type_quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS; |
333 | 10.0k | return document_node; |
334 | 10.0k | } |
335 | | |
336 | 10.0k | static void output_init(GumboParser* parser) { |
337 | 10.0k | GumboOutput* output = gumbo_alloc(sizeof(GumboOutput)); |
338 | 10.0k | output->root = NULL; |
339 | 10.0k | output->document = new_document_node(); |
340 | 10.0k | output->document_error = false; |
341 | 10.0k | output->status = GUMBO_STATUS_OK; |
342 | 10.0k | parser->_output = output; |
343 | 10.0k | gumbo_init_errors(parser); |
344 | 10.0k | } |
345 | | |
346 | 10.0k | static void parser_state_init(GumboParser* parser) { |
347 | 10.0k | GumboParserState* parser_state = gumbo_alloc(sizeof(GumboParserState)); |
348 | 10.0k | parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL; |
349 | 10.0k | parser_state->_reprocess_current_token = false; |
350 | 10.0k | parser_state->_frameset_ok = true; |
351 | 10.0k | parser_state->_ignore_next_linefeed = false; |
352 | 10.0k | parser_state->_foster_parent_insertions = false; |
353 | 10.0k | parser_state->_text_node._type = GUMBO_NODE_WHITESPACE; |
354 | 10.0k | gumbo_string_buffer_init(&parser_state->_text_node._buffer); |
355 | 10.0k | gumbo_character_token_buffer_init(&parser_state->_table_character_tokens); |
356 | 10.0k | gumbo_vector_init(10, &parser_state->_open_elements); |
357 | 10.0k | gumbo_vector_init(5, &parser_state->_active_formatting_elements); |
358 | 10.0k | gumbo_vector_init(5, &parser_state->_template_insertion_modes); |
359 | 10.0k | parser_state->_head_element = NULL; |
360 | 10.0k | parser_state->_form_element = NULL; |
361 | 10.0k | parser_state->_fragment_ctx = NULL; |
362 | 10.0k | parser_state->_current_token = NULL; |
363 | 10.0k | parser_state->_closed_body_tag = false; |
364 | 10.0k | parser_state->_closed_html_tag = false; |
365 | 10.0k | parser->_parser_state = parser_state; |
366 | 10.0k | } |
367 | | |
368 | | typedef void (*TreeTraversalCallback)(GumboNode* node); |
369 | | |
370 | 10.1k | static void tree_traverse(GumboNode* node, TreeTraversalCallback callback) { |
371 | 10.1k | GumboNode* current_node = node; |
372 | 10.1k | unsigned int offset = 0; |
373 | | |
374 | 3.89M | tailcall: |
375 | 3.89M | switch (current_node->type) { |
376 | 21.5k | case GUMBO_NODE_DOCUMENT: |
377 | 38.5k | case GUMBO_NODE_TEMPLATE: |
378 | 3.27M | case GUMBO_NODE_ELEMENT: { |
379 | 3.27M | GumboVector* children = (current_node->type == GUMBO_NODE_DOCUMENT) |
380 | 3.27M | ? &current_node->v.document.children |
381 | 3.27M | : &current_node->v.element.children |
382 | 3.27M | ; |
383 | 3.27M | if (offset >= children->length) { |
384 | 1.33M | assert(offset == children->length); |
385 | 0 | break; |
386 | 1.94M | } else { |
387 | 1.94M | current_node = children->data[offset]; |
388 | 1.94M | offset = 0; |
389 | 1.94M | goto tailcall; |
390 | 1.94M | } |
391 | 3.27M | } |
392 | 308k | case GUMBO_NODE_TEXT: |
393 | 308k | case GUMBO_NODE_CDATA: |
394 | 508k | case GUMBO_NODE_COMMENT: |
395 | 620k | case GUMBO_NODE_WHITESPACE: |
396 | 620k | assert(offset == 0); |
397 | 0 | break; |
398 | 3.89M | } |
399 | | |
400 | 1.95M | offset = current_node->index_within_parent + 1; |
401 | 1.95M | GumboNode* next_node = current_node->parent; |
402 | 1.95M | callback(current_node); |
403 | 1.95M | if (current_node == node) { |
404 | 10.1k | return; |
405 | 10.1k | } |
406 | 1.94M | current_node = next_node; |
407 | 1.94M | goto tailcall; |
408 | 1.95M | } |
409 | | |
410 | 1.95M | static void destroy_node_callback(GumboNode* node) { |
411 | 1.95M | switch (node->type) { |
412 | 10.0k | case GUMBO_NODE_DOCUMENT: { |
413 | 10.0k | GumboDocument* doc = &node->v.document; |
414 | 10.0k | gumbo_free((void*) doc->children.data); |
415 | 10.0k | gumbo_free((void*) doc->name); |
416 | 10.0k | gumbo_free((void*) doc->public_identifier); |
417 | 10.0k | gumbo_free((void*) doc->system_identifier); |
418 | 10.0k | } break; |
419 | 3.19k | case GUMBO_NODE_TEMPLATE: |
420 | 1.32M | case GUMBO_NODE_ELEMENT: |
421 | 1.39M | for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) { |
422 | 72.1k | gumbo_destroy_attribute(node->v.element.attributes.data[i]); |
423 | 72.1k | } |
424 | 1.32M | gumbo_free(node->v.element.attributes.data); |
425 | 1.32M | gumbo_free(node->v.element.children.data); |
426 | 1.32M | if (node->v.element.tag == GUMBO_TAG_UNKNOWN) |
427 | 147k | gumbo_free((void *)node->v.element.name); |
428 | 1.32M | break; |
429 | 308k | case GUMBO_NODE_TEXT: |
430 | 308k | case GUMBO_NODE_CDATA: |
431 | 508k | case GUMBO_NODE_COMMENT: |
432 | 620k | case GUMBO_NODE_WHITESPACE: |
433 | 620k | gumbo_free((void*) node->v.text.text); |
434 | 620k | break; |
435 | 1.95M | } |
436 | 1.95M | gumbo_free(node); |
437 | 1.95M | } |
438 | | |
439 | 10.1k | static void destroy_node(GumboNode* node) { |
440 | 10.1k | tree_traverse(node, &destroy_node_callback); |
441 | 10.1k | } |
442 | | |
443 | | static void destroy_fragment_ctx_element(GumboNode* ctx); |
444 | | |
445 | 10.0k | static void parser_state_destroy(GumboParser* parser) { |
446 | 10.0k | GumboParserState* state = parser->_parser_state; |
447 | 10.0k | if (state->_fragment_ctx) { |
448 | 0 | destroy_fragment_ctx_element(state->_fragment_ctx); |
449 | 0 | } |
450 | 10.0k | gumbo_vector_destroy(&state->_active_formatting_elements); |
451 | 10.0k | gumbo_vector_destroy(&state->_open_elements); |
452 | 10.0k | gumbo_vector_destroy(&state->_template_insertion_modes); |
453 | 10.0k | gumbo_string_buffer_destroy(&state->_text_node._buffer); |
454 | 10.0k | gumbo_character_token_buffer_destroy(&state->_table_character_tokens); |
455 | 10.0k | gumbo_free(state); |
456 | 10.0k | } |
457 | | |
458 | 31.7k | static GumboNode* get_document_node(const GumboParser* parser) { |
459 | 31.7k | return parser->_output->document; |
460 | 31.7k | } |
461 | | |
462 | 1.07k | static bool is_fragment_parser(const GumboParser* parser) { |
463 | 1.07k | return !!parser->_parser_state->_fragment_ctx; |
464 | 1.07k | } |
465 | | |
466 | | // Returns the node at the bottom of the stack of open elements, or NULL if no |
467 | | // elements have been added yet. |
468 | 40.0M | static GumboNode* get_current_node(const GumboParser* parser) { |
469 | 40.0M | const GumboVector* open_elements = &parser->_parser_state->_open_elements; |
470 | 40.0M | if (open_elements->length == 0) { |
471 | 37.8k | assert(!parser->_output->root); |
472 | 0 | return NULL; |
473 | 37.8k | } |
474 | 39.9M | assert(open_elements->length > 0); |
475 | 0 | assert(open_elements->data != NULL); |
476 | 0 | return open_elements->data[open_elements->length - 1]; |
477 | 40.0M | } |
478 | | |
479 | 32.6M | static GumboNode* get_adjusted_current_node(const GumboParser* parser) { |
480 | 32.6M | const GumboParserState* state = parser->_parser_state; |
481 | 32.6M | if (state->_open_elements.length == 1 && state->_fragment_ctx) { |
482 | 0 | return state->_fragment_ctx; |
483 | 0 | } |
484 | 32.6M | return get_current_node(parser); |
485 | 32.6M | } |
486 | | |
487 | | // Returns true if the given needle is in the given array of literal |
488 | | // GumboStringPieces. If exact_match is true, this requires that they match |
489 | | // exactly; otherwise, this performs a prefix match to check if needle starts |
490 | | // with any of the elements in haystack. This always performs a |
491 | | // case-insensitive match. |
492 | | static bool is_in_static_list ( |
493 | | const GumboStringPiece* needle, |
494 | | const GumboStringPiece* haystack, |
495 | | bool exact_match |
496 | 175 | ) { |
497 | 175 | if (needle->length == 0) |
498 | 69 | return false; |
499 | 106 | if (exact_match) { |
500 | 112 | for (size_t i = 0; haystack[i].data; ++i) { |
501 | 81 | if (gumbo_string_equals_ignore_case(needle, &haystack[i])) |
502 | 0 | return true; |
503 | 81 | } |
504 | 75 | } else { |
505 | 1.55k | for (size_t i = 0; haystack[i].data; ++i) { |
506 | 1.47k | if (gumbo_string_prefix_ignore_case(&haystack[i], needle)) |
507 | 0 | return true; |
508 | 1.47k | } |
509 | 75 | } |
510 | 106 | return false; |
511 | 106 | } |
512 | | |
513 | 184k | static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { |
514 | 184k | parser->_parser_state->_insertion_mode = mode; |
515 | 184k | } |
516 | | |
517 | | static void push_template_insertion_mode ( |
518 | | GumboParser* parser, |
519 | | GumboInsertionMode mode |
520 | 5.39k | ) { |
521 | 5.39k | gumbo_vector_add ( |
522 | 5.39k | (void*) mode, |
523 | 5.39k | &parser->_parser_state->_template_insertion_modes |
524 | 5.39k | ); |
525 | 5.39k | } |
526 | | |
527 | 5.01k | static void pop_template_insertion_mode(GumboParser* parser) { |
528 | 5.01k | gumbo_vector_pop(&parser->_parser_state->_template_insertion_modes); |
529 | 5.01k | } |
530 | | |
531 | | // Returns the current template insertion mode. If the stack of template |
532 | | // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. |
533 | | static GumboInsertionMode get_current_template_insertion_mode ( |
534 | | const GumboParser* parser |
535 | 13.6k | ) { |
536 | 13.6k | GumboVector* modes = &parser->_parser_state->_template_insertion_modes; |
537 | 13.6k | if (modes->length == 0) { |
538 | 9.85k | return GUMBO_INSERTION_MODE_INITIAL; |
539 | 9.85k | } |
540 | 3.77k | return (GumboInsertionMode)(intptr_t) modes->data[(modes->length - 1)]; |
541 | 13.6k | } |
542 | | |
543 | | // Returns true if the specified token is either a start or end tag |
544 | | // (specified by is_start) with one of the tag types in the TagSet. |
545 | | static bool tag_in ( |
546 | | const GumboToken* token, |
547 | | bool is_start, |
548 | | const TagSet* tags |
549 | 25.4M | ) { |
550 | 25.4M | GumboTag token_tag; |
551 | 25.4M | if (is_start && token->type == GUMBO_TOKEN_START_TAG) { |
552 | 8.89M | token_tag = token->v.start_tag.tag; |
553 | 16.5M | } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { |
554 | 104k | token_tag = token->v.end_tag.tag; |
555 | 16.4M | } else { |
556 | 16.4M | return false; |
557 | 16.4M | } |
558 | 9.00M | return (*tags)[(unsigned) token_tag] != 0u; |
559 | 25.4M | } |
560 | | |
561 | | // Like tag_in, but for the single-tag case. |
562 | 68.7M | static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { |
563 | 68.7M | if (is_start && token->type == GUMBO_TOKEN_START_TAG) { |
564 | 13.1M | return token->v.start_tag.tag == tag; |
565 | 13.1M | } |
566 | 55.5M | if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { |
567 | 263k | return token->v.end_tag.tag == tag; |
568 | 263k | } |
569 | 55.3M | return false; |
570 | 55.5M | } |
571 | | |
572 | | static inline bool tagset_includes ( |
573 | | const TagSet* tagset, |
574 | | GumboNamespaceEnum ns, |
575 | | GumboTag tag |
576 | 10.5M | ) { |
577 | 10.5M | return ((*tagset)[(unsigned) tag] & (1u << (unsigned) ns)) != 0u; |
578 | 10.5M | } |
579 | | |
580 | | // Like tag_in, but checks for the tag of a node, rather than a token. |
581 | 7.64M | static bool node_tag_in_set(const GumboNode* node, const TagSet* tags) { |
582 | 7.64M | assert(node != NULL); |
583 | 7.64M | if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { |
584 | 0 | return false; |
585 | 0 | } |
586 | 7.64M | return tagset_includes ( |
587 | 7.64M | tags, |
588 | 7.64M | node->v.element.tag_namespace, |
589 | 7.64M | node->v.element.tag |
590 | 7.64M | ); |
591 | 7.64M | } |
592 | | |
593 | | static bool node_qualified_tagname_is ( |
594 | | const GumboNode* node, |
595 | | GumboNamespaceEnum ns, |
596 | | GumboTag tag, |
597 | | const char *name |
598 | 1.65M | ) { |
599 | 1.65M | assert(node); |
600 | 0 | assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); |
601 | 0 | assert(node->v.element.name); |
602 | 0 | assert(tag != GUMBO_TAG_UNKNOWN || name); |
603 | 0 | GumboTag element_tag = node->v.element.tag; |
604 | 1.65M | const char *element_name = node->v.element.name; |
605 | 1.65M | assert(element_tag != GUMBO_TAG_UNKNOWN || element_name); |
606 | 1.65M | if (node->v.element.tag_namespace != ns || element_tag != tag) |
607 | 901k | return false; |
608 | 756k | if (tag != GUMBO_TAG_UNKNOWN) |
609 | 743k | return true; |
610 | 12.3k | return !gumbo_ascii_strcasecmp(element_name, name); |
611 | 756k | } |
612 | | |
613 | | static bool node_html_tagname_is ( |
614 | | const GumboNode* node, |
615 | | GumboTag tag, |
616 | | const char *name |
617 | 497k | ) { |
618 | 497k | return node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, name); |
619 | 497k | } |
620 | | |
621 | | static bool node_tagname_is ( |
622 | | const GumboNode* node, |
623 | | GumboTag tag, |
624 | | const char *name |
625 | 10.0k | ) { |
626 | 10.0k | assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); |
627 | 0 | return node_qualified_tagname_is(node, node->v.element.tag_namespace, tag, name); |
628 | 10.0k | } |
629 | | |
630 | | // Like node_tag_in_set, but for the single-tag case. |
631 | | static bool node_qualified_tag_is ( |
632 | | const GumboNode* node, |
633 | | GumboNamespaceEnum ns, |
634 | | GumboTag tag |
635 | 16.2M | ) { |
636 | 16.2M | assert(node); |
637 | 0 | assert(tag != GUMBO_TAG_UNKNOWN); |
638 | 0 | assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); |
639 | 0 | return |
640 | 16.2M | node->v.element.tag == tag |
641 | 16.2M | && node->v.element.tag_namespace == ns; |
642 | 16.2M | } |
643 | | |
644 | | // Like node_tag_in_set, but for the single-tag case in the HTML namespace. |
645 | 12.7M | static bool node_html_tag_is(const GumboNode* node, GumboTag tag) { |
646 | 12.7M | return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); |
647 | 12.7M | } |
648 | | |
649 | | // https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately |
650 | | // This is a helper function that returns the appropriate insertion mode instead |
651 | | // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to |
652 | | // indicate that there is no appropriate insertion mode, and the loop should |
653 | | // continue. |
654 | | static GumboInsertionMode get_appropriate_insertion_mode ( |
655 | | const GumboParser* parser, |
656 | | int index |
657 | 351k | ) { |
658 | 351k | const GumboVector* open_elements = &parser->_parser_state->_open_elements; |
659 | 351k | const GumboNode* node = open_elements->data[index]; |
660 | 351k | const bool is_last = index == 0; |
661 | | |
662 | 351k | if (is_last && is_fragment_parser(parser)) { |
663 | 0 | node = parser->_parser_state->_fragment_ctx; |
664 | 0 | } |
665 | | |
666 | 351k | assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); |
667 | 351k | if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) { |
668 | 1.45k | return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; |
669 | 1.45k | } |
670 | | |
671 | 350k | switch (node->v.element.tag) { |
672 | 188 | case GUMBO_TAG_SELECT: { |
673 | 188 | if (is_last) { |
674 | 0 | return GUMBO_INSERTION_MODE_IN_SELECT; |
675 | 0 | } |
676 | 3.15k | for (int i = index; i > 0; --i) { |
677 | 3.11k | const GumboNode* ancestor = open_elements->data[i]; |
678 | 3.11k | if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) { |
679 | 108 | return GUMBO_INSERTION_MODE_IN_SELECT; |
680 | 108 | } |
681 | 3.00k | if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) { |
682 | 40 | return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE; |
683 | 40 | } |
684 | 3.00k | } |
685 | 40 | return GUMBO_INSERTION_MODE_IN_SELECT; |
686 | 188 | } |
687 | 6.50k | case GUMBO_TAG_TD: |
688 | 7.17k | case GUMBO_TAG_TH: |
689 | 7.17k | if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL; |
690 | 0 | break; |
691 | 1.27k | case GUMBO_TAG_TR: |
692 | 1.27k | return GUMBO_INSERTION_MODE_IN_ROW; |
693 | 201 | case GUMBO_TAG_TBODY: |
694 | 333 | case GUMBO_TAG_THEAD: |
695 | 601 | case GUMBO_TAG_TFOOT: |
696 | 601 | return GUMBO_INSERTION_MODE_IN_TABLE_BODY; |
697 | 382 | case GUMBO_TAG_CAPTION: |
698 | 382 | return GUMBO_INSERTION_MODE_IN_CAPTION; |
699 | 115 | case GUMBO_TAG_COLGROUP: |
700 | 115 | return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; |
701 | 563 | case GUMBO_TAG_TABLE: |
702 | 563 | return GUMBO_INSERTION_MODE_IN_TABLE; |
703 | 2.16k | case GUMBO_TAG_TEMPLATE: |
704 | 2.16k | return get_current_template_insertion_mode(parser); |
705 | 186 | case GUMBO_TAG_HEAD: |
706 | 186 | if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD; |
707 | 0 | break; |
708 | 7.89k | case GUMBO_TAG_BODY: |
709 | 7.89k | return GUMBO_INSERTION_MODE_IN_BODY; |
710 | 0 | case GUMBO_TAG_FRAMESET: |
711 | 0 | return GUMBO_INSERTION_MODE_IN_FRAMESET; |
712 | 161 | case GUMBO_TAG_HTML: |
713 | 161 | return parser->_parser_state->_head_element |
714 | 161 | ? GUMBO_INSERTION_MODE_AFTER_HEAD |
715 | 161 | : GUMBO_INSERTION_MODE_BEFORE_HEAD; |
716 | 329k | default: |
717 | 329k | break; |
718 | 350k | } |
719 | 329k | return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; |
720 | 350k | } |
721 | | |
722 | | // This performs the actual "reset the insertion mode" loop. |
723 | 20.7k | static void reset_insertion_mode_appropriately(GumboParser* parser) { |
724 | 20.7k | const GumboVector* open_elements = &parser->_parser_state->_open_elements; |
725 | 351k | for (int i = open_elements->length; --i >= 0;) { |
726 | 351k | GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i); |
727 | 351k | if (mode != GUMBO_INSERTION_MODE_INITIAL) { |
728 | 20.7k | set_insertion_mode(parser, mode); |
729 | 20.7k | return; |
730 | 20.7k | } |
731 | 351k | } |
732 | | // Should never get here, because is_last will be set on the last iteration |
733 | | // and will force GUMBO_INSERTION_MODE_IN_BODY. |
734 | 0 | assert(0); |
735 | 0 | } |
736 | | |
737 | | static void parser_add_parse_error ( |
738 | | GumboParser* parser, |
739 | | const GumboToken* token |
740 | 5.83M | ) { |
741 | 5.83M | gumbo_debug("Adding parse error.\n"); |
742 | 5.83M | GumboError* error = gumbo_add_error(parser); |
743 | 5.83M | if (!error) { |
744 | 0 | return; |
745 | 0 | } |
746 | 5.83M | error->type = GUMBO_ERR_PARSER; |
747 | 5.83M | error->position = token->position; |
748 | 5.83M | error->original_text = token->original_text; |
749 | 5.83M | GumboParserError* extra_data = &error->v.parser; |
750 | 5.83M | extra_data->input_type = token->type; |
751 | 5.83M | extra_data->input_tag = GUMBO_TAG_UNKNOWN; |
752 | 5.83M | if (token->type == GUMBO_TOKEN_START_TAG) { |
753 | 936k | extra_data->input_tag = token->v.start_tag.tag; |
754 | 4.89M | } else if (token->type == GUMBO_TOKEN_END_TAG) { |
755 | 43.2k | extra_data->input_tag = token->v.end_tag.tag; |
756 | 43.2k | } |
757 | 5.83M | const GumboParserState* state = parser->_parser_state; |
758 | 5.83M | extra_data->parser_state = state->_insertion_mode; |
759 | 5.83M | gumbo_vector_init(state->_open_elements.length, &extra_data->tag_stack); |
760 | 429M | for (unsigned int i = 0; i < state->_open_elements.length; ++i) { |
761 | 423M | const GumboNode* node = state->_open_elements.data[i]; |
762 | 423M | assert ( |
763 | 423M | node->type == GUMBO_NODE_ELEMENT |
764 | 423M | || node->type == GUMBO_NODE_TEMPLATE |
765 | 423M | ); |
766 | 0 | gumbo_vector_add ( |
767 | 423M | (void*) node->v.element.tag, |
768 | 423M | &extra_data->tag_stack |
769 | 423M | ); |
770 | 423M | } |
771 | 5.83M | } |
772 | | |
773 | | // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point |
774 | 2.08M | static bool is_mathml_integration_point(const GumboNode* node) { |
775 | 2.08M | static const TagSet mathml_integration_point_tags = { |
776 | 2.08M | TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), |
777 | 2.08M | TAG_MATHML(MS), TAG_MATHML(MTEXT) |
778 | 2.08M | }; |
779 | 2.08M | return node_tag_in_set(node, &mathml_integration_point_tags); |
780 | 2.08M | } |
781 | | |
782 | | // https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point |
783 | 2.07M | static bool is_html_integration_point(const GumboNode* node) { |
784 | 2.07M | static const TagSet html_integration_point_svg_tags = { |
785 | 2.07M | TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) |
786 | 2.07M | }; |
787 | 2.07M | if (node_tag_in_set(node, &html_integration_point_svg_tags)) { |
788 | 6.52k | return true; |
789 | 6.52k | } |
790 | | |
791 | 2.07M | const bool is_mathml_annotation_xml_element = node_qualified_tag_is ( |
792 | 2.07M | node, |
793 | 2.07M | GUMBO_NAMESPACE_MATHML, |
794 | 2.07M | GUMBO_TAG_ANNOTATION_XML |
795 | 2.07M | ); |
796 | 2.07M | const GumboVector* attributes = &node->v.element.attributes; |
797 | 2.07M | if ( |
798 | 2.07M | is_mathml_annotation_xml_element |
799 | 2.07M | && ( |
800 | 584 | attribute_matches(attributes, "encoding", "text/html") |
801 | 584 | || attribute_matches(attributes, "encoding", "application/xhtml+xml") |
802 | 584 | ) |
803 | 2.07M | ) { |
804 | 0 | return true; |
805 | 0 | } |
806 | | |
807 | 2.07M | return false; |
808 | 2.07M | } |
809 | | |
810 | | // This represents a place to insert a node, consisting of a target parent and a |
811 | | // child index within that parent. If the node should be inserted at the end of |
812 | | // the parent's child, index will be -1. |
813 | | typedef struct { |
814 | | GumboNode* target; |
815 | | int index; |
816 | | } InsertionLocation; |
817 | | |
818 | | static InsertionLocation get_appropriate_insertion_location ( |
819 | | const GumboParser* parser, |
820 | | GumboNode* override_target |
821 | 1.73M | ) { |
822 | 1.73M | InsertionLocation retval = {override_target, -1}; |
823 | 1.73M | if (retval.target == NULL) { |
824 | | // No override target; default to the current node, but special-case the |
825 | | // root node since get_current_node() assumes the stack of open elements is |
826 | | // non-empty. |
827 | 1.72M | retval.target = (parser->_output->root != NULL) |
828 | 1.72M | ? get_current_node(parser) |
829 | 1.72M | : get_document_node(parser) |
830 | 1.72M | ; |
831 | 1.72M | } |
832 | 1.73M | if ( |
833 | 1.73M | !parser->_parser_state->_foster_parent_insertions |
834 | 1.73M | || !node_tag_in_set(retval.target, &(const TagSet) { |
835 | 1.02M | TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR) |
836 | 1.02M | }) |
837 | 1.73M | ) { |
838 | 1.21M | return retval; |
839 | 1.21M | } |
840 | | |
841 | | // Foster-parenting case. |
842 | 519k | int last_template_index = -1; |
843 | 519k | int last_table_index = -1; |
844 | 519k | const GumboVector* open_elements = &parser->_parser_state->_open_elements; |
845 | 4.49M | for (unsigned int i = 0; i < open_elements->length; ++i) { |
846 | 3.97M | if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { |
847 | 21.9k | last_template_index = i; |
848 | 21.9k | } |
849 | 3.97M | if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) { |
850 | 543k | last_table_index = i; |
851 | 543k | } |
852 | 3.97M | } |
853 | 519k | if ( |
854 | 519k | last_template_index != -1 |
855 | 519k | && (last_table_index == -1 || last_template_index > last_table_index) |
856 | 519k | ) { |
857 | 1.20k | retval.target = open_elements->data[last_template_index]; |
858 | 1.20k | return retval; |
859 | 1.20k | } |
860 | 518k | if (last_table_index == -1) { |
861 | 0 | retval.target = open_elements->data[0]; |
862 | 0 | return retval; |
863 | 0 | } |
864 | 518k | const GumboNode* last_table = open_elements->data[last_table_index]; |
865 | 518k | if (last_table->parent != NULL) { |
866 | 518k | retval.target = last_table->parent; |
867 | 518k | retval.index = last_table->index_within_parent; |
868 | 518k | return retval; |
869 | 518k | } |
870 | | |
871 | 0 | retval.target = open_elements->data[last_table_index - 1]; |
872 | 0 | return retval; |
873 | 518k | } |
874 | | |
875 | | // Appends a node to the end of its parent, setting the "parent" and |
876 | | // "index_within_parent" fields appropriately. |
877 | 1.43M | static void append_node(GumboNode* parent, GumboNode* node) { |
878 | 1.43M | assert(node->parent == NULL); |
879 | 0 | assert(node->index_within_parent == (unsigned int) -1); |
880 | 0 | GumboVector* children; |
881 | 1.43M | if ( |
882 | 1.43M | parent->type == GUMBO_NODE_ELEMENT |
883 | 1.43M | || parent->type == GUMBO_NODE_TEMPLATE |
884 | 1.43M | ) { |
885 | 1.42M | children = &parent->v.element.children; |
886 | 1.42M | } else { |
887 | 11.4k | assert(parent->type == GUMBO_NODE_DOCUMENT); |
888 | 0 | children = &parent->v.document.children; |
889 | 11.4k | } |
890 | 0 | node->parent = parent; |
891 | 1.43M | node->index_within_parent = children->length; |
892 | 1.43M | gumbo_vector_add((void*) node, children); |
893 | 1.43M | assert(node->index_within_parent < children->length); |
894 | 1.43M | } |
895 | | |
896 | | // Inserts a node at the specified InsertionLocation, updating the |
897 | | // "parent" and "index_within_parent" fields of it and all its siblings. |
898 | | // If the index of the location is -1, this calls append_node. |
899 | 1.73M | static void insert_node(GumboNode* node, InsertionLocation location) { |
900 | 1.73M | assert(node->parent == NULL); |
901 | 0 | assert(node->index_within_parent == (unsigned int) -1); |
902 | 0 | GumboNode* parent = location.target; |
903 | 1.73M | int index = location.index; |
904 | 1.73M | if (index != -1) { |
905 | 518k | GumboVector* children = NULL; |
906 | 518k | if ( |
907 | 518k | parent->type == GUMBO_NODE_ELEMENT |
908 | 518k | || parent->type == GUMBO_NODE_TEMPLATE |
909 | 518k | ) { |
910 | 518k | children = &parent->v.element.children; |
911 | 518k | } else if (parent->type == GUMBO_NODE_DOCUMENT) { |
912 | 0 | children = &parent->v.document.children; |
913 | 0 | assert(children->length == 0); |
914 | 0 | } else { |
915 | 0 | assert(0); |
916 | 0 | } |
917 | | |
918 | 0 | assert(index >= 0); |
919 | 0 | assert((unsigned int) index < children->length); |
920 | 0 | node->parent = parent; |
921 | 518k | node->index_within_parent = index; |
922 | 518k | gumbo_vector_insert_at((void*) node, index, children); |
923 | 518k | assert(node->index_within_parent < children->length); |
924 | 1.03M | for (unsigned int i = index + 1; i < children->length; ++i) { |
925 | 518k | GumboNode* sibling = children->data[i]; |
926 | 518k | sibling->index_within_parent = i; |
927 | 518k | assert(sibling->index_within_parent < children->length); |
928 | 518k | } |
929 | 1.21M | } else { |
930 | 1.21M | append_node(parent, node); |
931 | 1.21M | } |
932 | 1.73M | } |
933 | | |
934 | 2.75M | static void maybe_flush_text_node_buffer(GumboParser* parser) { |
935 | 2.75M | GumboParserState* state = parser->_parser_state; |
936 | 2.75M | TextNodeBufferState* buffer_state = &state->_text_node; |
937 | 2.75M | if (buffer_state->_buffer.length == 0) { |
938 | 2.33M | return; |
939 | 2.33M | } |
940 | | |
941 | 420k | assert ( |
942 | 420k | buffer_state->_type == GUMBO_NODE_WHITESPACE |
943 | 420k | || buffer_state->_type == GUMBO_NODE_TEXT |
944 | 420k | || buffer_state->_type == GUMBO_NODE_CDATA |
945 | 420k | ); |
946 | 0 | GumboNode* text_node = create_node(buffer_state->_type); |
947 | 420k | GumboText* text_node_data = &text_node->v.text; |
948 | 420k | text_node_data->text = gumbo_string_buffer_to_string(&buffer_state->_buffer); |
949 | 420k | text_node_data->original_text.data = buffer_state->_start_original_text; |
950 | 420k | text_node_data->original_text.length = |
951 | 420k | state->_current_token->original_text.data - |
952 | 420k | buffer_state->_start_original_text; |
953 | 420k | text_node_data->start_pos = buffer_state->_start_position; |
954 | | |
955 | 420k | gumbo_debug ( |
956 | 420k | "Flushing text node buffer of %.*s.\n", |
957 | 420k | (int) buffer_state->_buffer.length, |
958 | 420k | buffer_state->_buffer.data |
959 | 420k | ); |
960 | | |
961 | 420k | InsertionLocation location = get_appropriate_insertion_location(parser, NULL); |
962 | 420k | if (location.target->type == GUMBO_NODE_DOCUMENT) { |
963 | | // The DOM does not allow Document nodes to have Text children, so per the |
964 | | // spec, they are dropped on the floor. |
965 | 0 | destroy_node(text_node); |
966 | 420k | } else { |
967 | 420k | insert_node(text_node, location); |
968 | 420k | } |
969 | | |
970 | 420k | gumbo_string_buffer_clear(&buffer_state->_buffer); |
971 | 420k | buffer_state->_type = GUMBO_NODE_WHITESPACE; |
972 | 420k | assert(buffer_state->_buffer.length == 0); |
973 | 420k | } |
974 | | |
975 | | static void record_end_of_element ( |
976 | | const GumboToken* current_token, |
977 | | GumboElement* element |
978 | 1.29M | ) { |
979 | 1.29M | element->end_pos = current_token->position; |
980 | 1.29M | element->original_end_tag = |
981 | 1.29M | (current_token->type == GUMBO_TOKEN_END_TAG) |
982 | 1.29M | ? current_token->original_text |
983 | 1.29M | : kGumboEmptyString; |
984 | 1.29M | } |
985 | | |
986 | 1.31M | static GumboNode* pop_current_node(GumboParser* parser) { |
987 | 1.31M | GumboParserState* state = parser->_parser_state; |
988 | 1.31M | maybe_flush_text_node_buffer(parser); |
989 | 1.31M | if (state->_open_elements.length > 0) { |
990 | 1.29M | assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); |
991 | 0 | gumbo_debug ( |
992 | 1.29M | "Popping %s node.\n", |
993 | 1.29M | gumbo_normalized_tagname(get_current_node(parser)->v.element.tag) |
994 | 1.29M | ); |
995 | 1.29M | } |
996 | 0 | GumboNode* current_node = gumbo_vector_pop(&state->_open_elements); |
997 | 1.31M | if (!current_node) { |
998 | 20.1k | assert(state->_open_elements.length == 0); |
999 | 0 | return NULL; |
1000 | 20.1k | } |
1001 | 1.29M | assert ( |
1002 | 1.29M | current_node->type == GUMBO_NODE_ELEMENT |
1003 | 1.29M | || current_node->type == GUMBO_NODE_TEMPLATE |
1004 | 1.29M | ); |
1005 | 0 | bool is_closed_body_or_html_tag = |
1006 | 1.29M | ( |
1007 | 1.29M | node_html_tag_is(current_node, GUMBO_TAG_BODY) |
1008 | 1.29M | && state->_closed_body_tag |
1009 | 1.29M | ) || ( |
1010 | 1.29M | node_html_tag_is(current_node, GUMBO_TAG_HTML) |
1011 | 1.29M | && state->_closed_html_tag |
1012 | 1.29M | ) |
1013 | 1.29M | ; |
1014 | 1.29M | if ( |
1015 | 1.29M | ( |
1016 | 1.29M | state->_current_token->type != GUMBO_TOKEN_END_TAG |
1017 | 1.29M | || !node_qualified_tagname_is ( |
1018 | 35.7k | current_node, |
1019 | 35.7k | GUMBO_NAMESPACE_HTML, |
1020 | 35.7k | state->_current_token->v.end_tag.tag, |
1021 | 35.7k | state->_current_token->v.end_tag.name |
1022 | 35.7k | ) |
1023 | 1.29M | ) |
1024 | 1.29M | && !is_closed_body_or_html_tag |
1025 | 1.29M | ) { |
1026 | 1.28M | current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; |
1027 | 1.28M | } |
1028 | 1.29M | if (!is_closed_body_or_html_tag) { |
1029 | 1.29M | record_end_of_element(state->_current_token, ¤t_node->v.element); |
1030 | 1.29M | } |
1031 | 1.29M | return current_node; |
1032 | 1.31M | } |
1033 | | |
1034 | | static void append_comment_node ( |
1035 | | GumboParser* parser, |
1036 | | GumboNode* node, |
1037 | | const GumboToken* token |
1038 | 200k | ) { |
1039 | 200k | maybe_flush_text_node_buffer(parser); |
1040 | 200k | GumboNode* comment = create_node(GUMBO_NODE_COMMENT); |
1041 | 200k | comment->type = GUMBO_NODE_COMMENT; |
1042 | 200k | comment->parse_flags = GUMBO_INSERTION_NORMAL; |
1043 | 200k | comment->v.text.text = token->v.text; |
1044 | 200k | comment->v.text.original_text = token->original_text; |
1045 | 200k | comment->v.text.start_pos = token->position; |
1046 | 200k | append_node(node, comment); |
1047 | 200k | } |
1048 | | |
1049 | | // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-row-context |
1050 | 27.4k | static void clear_stack_to_table_row_context(GumboParser* parser) { |
1051 | 27.4k | static const TagSet tags = {TAG(HTML), TAG(TR), TAG(TEMPLATE)}; |
1052 | 56.9k | while (!node_tag_in_set(get_current_node(parser), &tags)) { |
1053 | 29.5k | pop_current_node(parser); |
1054 | 29.5k | } |
1055 | 27.4k | } |
1056 | | |
1057 | | // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-context |
1058 | 8.60k | static void clear_stack_to_table_context(GumboParser* parser) { |
1059 | 8.60k | static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}; |
1060 | 24.1k | while (!node_tag_in_set(get_current_node(parser), &tags)) { |
1061 | 15.5k | pop_current_node(parser); |
1062 | 15.5k | } |
1063 | 8.60k | } |
1064 | | |
1065 | | // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-body-context |
1066 | 8.78k | static void clear_stack_to_table_body_context(GumboParser* parser) { |
1067 | 8.78k | static const TagSet tags = { |
1068 | 8.78k | TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE) |
1069 | 8.78k | }; |
1070 | 29.0k | while (!node_tag_in_set(get_current_node(parser), &tags)) { |
1071 | 20.2k | pop_current_node(parser); |
1072 | 20.2k | } |
1073 | 8.78k | } |
1074 | | |
1075 | | // Creates a parser-inserted element in the HTML namespace and returns it. |
1076 | 36.3k | static GumboNode* create_element(GumboParser* parser, GumboTag tag) { |
1077 | | // XXX: This will fail for creating fragments with an element with tag |
1078 | | // GUMBO_TAG_UNKNOWN |
1079 | 36.3k | assert(tag != GUMBO_TAG_UNKNOWN); |
1080 | 0 | GumboNode* node = create_node(GUMBO_NODE_ELEMENT); |
1081 | 36.3k | GumboElement* element = &node->v.element; |
1082 | 36.3k | gumbo_vector_init(1, &element->children); |
1083 | 36.3k | gumbo_vector_init(0, &element->attributes); |
1084 | 36.3k | element->tag = tag; |
1085 | 36.3k | element->name = gumbo_normalized_tagname(tag); |
1086 | 36.3k | element->tag_namespace = GUMBO_NAMESPACE_HTML; |
1087 | 36.3k | element->original_tag = kGumboEmptyString; |
1088 | 36.3k | element->original_end_tag = kGumboEmptyString; |
1089 | 36.3k | element->start_pos = (parser->_parser_state->_current_token) |
1090 | 36.3k | ? parser->_parser_state->_current_token->position |
1091 | 36.3k | : kGumboEmptySourcePosition |
1092 | 36.3k | ; |
1093 | 36.3k | element->end_pos = kGumboEmptySourcePosition; |
1094 | 36.3k | return node; |
1095 | 36.3k | } |
1096 | | |
1097 | | // Constructs an element from the given start tag token. |
1098 | | static GumboNode* create_element_from_token ( |
1099 | | GumboToken* token, |
1100 | | GumboNamespaceEnum tag_namespace |
1101 | 1.17M | ) { |
1102 | 1.17M | assert(token->type == GUMBO_TOKEN_START_TAG); |
1103 | 0 | GumboTokenStartTag* start_tag = &token->v.start_tag; |
1104 | | |
1105 | 1.17M | GumboNodeType type = |
1106 | 1.17M | ( |
1107 | 1.17M | tag_namespace == GUMBO_NAMESPACE_HTML |
1108 | 1.17M | && start_tag->tag == GUMBO_TAG_TEMPLATE |
1109 | 1.17M | ) |
1110 | 1.17M | ? GUMBO_NODE_TEMPLATE |
1111 | 1.17M | : GUMBO_NODE_ELEMENT |
1112 | 1.17M | ; |
1113 | | |
1114 | 1.17M | GumboNode* node = create_node(type); |
1115 | 1.17M | GumboElement* element = &node->v.element; |
1116 | 1.17M | gumbo_vector_init(1, &element->children); |
1117 | 1.17M | element->attributes = start_tag->attributes; |
1118 | 1.17M | element->tag = start_tag->tag; |
1119 | 1.17M | element->name = start_tag->name ? start_tag->name : gumbo_normalized_tagname(start_tag->tag); |
1120 | 1.17M | element->tag_namespace = tag_namespace; |
1121 | | |
1122 | 1.17M | assert(token->original_text.length >= 2); |
1123 | 0 | assert(token->original_text.data[0] == '<'); |
1124 | 0 | assert(token->original_text.data[token->original_text.length - 1] == '>'); |
1125 | 0 | element->original_tag = token->original_text; |
1126 | 1.17M | element->start_pos = token->position; |
1127 | 1.17M | element->original_end_tag = kGumboEmptyString; |
1128 | 1.17M | element->end_pos = kGumboEmptySourcePosition; |
1129 | | |
1130 | | // The element takes ownership of the attributes and name from the token, so |
1131 | | // any allocated-memory fields should be nulled out. |
1132 | 1.17M | start_tag->attributes = kGumboEmptyVector; |
1133 | 1.17M | start_tag->name = NULL; |
1134 | 1.17M | return node; |
1135 | 1.17M | } |
1136 | | |
1137 | | // https://html.spec.whatwg.org/multipage/parsing.html#insert-an-html-element |
1138 | | static void insert_element ( |
1139 | | GumboParser* parser, |
1140 | | GumboNode* node, |
1141 | | bool is_reconstructing_formatting_elements |
1142 | 1.21M | ) { |
1143 | 1.21M | GumboParserState* state = parser->_parser_state; |
1144 | | // NOTE(jdtang): The text node buffer must always be flushed before inserting |
1145 | | // a node, otherwise we're handling nodes in a different order than the spec |
1146 | | // mandated. However, one clause of the spec (character tokens in the body) |
1147 | | // requires that we reconstruct the active formatting elements *before* adding |
1148 | | // the character, and reconstructing the active formatting elements may itself |
1149 | | // result in the insertion of new elements (which should be pushed onto the |
1150 | | // stack of open elements before the buffer is flushed). We solve this (for |
1151 | | // the time being, the spec has been rewritten for <template> and the new |
1152 | | // version may be simpler here) with a boolean flag to this method. |
1153 | 1.21M | if (!is_reconstructing_formatting_elements) { |
1154 | 1.21M | maybe_flush_text_node_buffer(parser); |
1155 | 1.21M | } |
1156 | 1.21M | InsertionLocation location = get_appropriate_insertion_location(parser, NULL); |
1157 | 1.21M | insert_node(node, location); |
1158 | 1.21M | gumbo_vector_add((void*) node, &state->_open_elements); |
1159 | 1.21M | } |
1160 | | |
1161 | | // Convenience method that combines create_element_from_token and |
1162 | | // insert_element, inserting the generated element directly into the current |
1163 | | // node. Returns the node inserted. |
1164 | | static GumboNode* insert_element_from_token ( |
1165 | | GumboParser* parser, |
1166 | | GumboToken* token |
1167 | 1.16M | ) { |
1168 | 1.16M | GumboNode* element = create_element_from_token(token, GUMBO_NAMESPACE_HTML); |
1169 | 1.16M | insert_element(parser, element, false); |
1170 | 1.16M | gumbo_debug ( |
1171 | 1.16M | "Inserting <%s> element (@%p) from token.\n", |
1172 | 1.16M | gumbo_normalized_tagname(element->v.element.tag), |
1173 | 1.16M | (void*)element |
1174 | 1.16M | ); |
1175 | 1.16M | return element; |
1176 | 1.16M | } |
1177 | | |
1178 | | // Convenience method that combines create_element and insert_element, inserting |
1179 | | // a parser-generated element of a specific tag type. Returns the node |
1180 | | // inserted. |
1181 | | static GumboNode* insert_element_of_tag_type ( |
1182 | | GumboParser* parser, |
1183 | | GumboTag tag, |
1184 | | GumboParseFlags reason |
1185 | 36.3k | ) { |
1186 | 36.3k | GumboNode* element = create_element(parser, tag); |
1187 | 36.3k | element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason; |
1188 | 36.3k | insert_element(parser, element, false); |
1189 | 36.3k | gumbo_debug ( |
1190 | 36.3k | "Inserting %s element (@%p) from tag type.\n", |
1191 | 36.3k | gumbo_normalized_tagname(tag), |
1192 | 36.3k | (void*)element |
1193 | 36.3k | ); |
1194 | 36.3k | return element; |
1195 | 36.3k | } |
1196 | | |
1197 | | // Convenience method for creating foreign namespaced element. Returns the node |
1198 | | // inserted. |
1199 | | static GumboNode* insert_foreign_element ( |
1200 | | GumboParser* parser, |
1201 | | GumboToken* token, |
1202 | | GumboNamespaceEnum tag_namespace |
1203 | 14.4k | ) { |
1204 | 14.4k | assert(token->type == GUMBO_TOKEN_START_TAG); |
1205 | 0 | GumboNode* element = create_element_from_token(token, tag_namespace); |
1206 | 14.4k | insert_element(parser, element, false); |
1207 | 14.4k | if ( |
1208 | 14.4k | token_has_attribute(token, "xmlns") |
1209 | 14.4k | && !attribute_matches_case_sensitive ( |
1210 | 0 | &token->v.start_tag.attributes, |
1211 | 0 | "xmlns", |
1212 | 0 | kLegalXmlns[tag_namespace] |
1213 | 0 | ) |
1214 | 14.4k | ) { |
1215 | | // TODO(jdtang): Since there're multiple possible error codes here, we |
1216 | | // eventually need reason codes to differentiate them. |
1217 | 0 | parser_add_parse_error(parser, token); |
1218 | 0 | } |
1219 | 14.4k | if ( |
1220 | 14.4k | token_has_attribute(token, "xmlns:xlink") |
1221 | 14.4k | && !attribute_matches_case_sensitive ( |
1222 | 0 | &token->v.start_tag.attributes, |
1223 | 0 | "xmlns:xlink", |
1224 | 0 | "http://www.w3.org/1999/xlink" |
1225 | 0 | ) |
1226 | 14.4k | ) { |
1227 | 0 | parser_add_parse_error(parser, token); |
1228 | 0 | } |
1229 | 14.4k | return element; |
1230 | 14.4k | } |
1231 | | |
1232 | 12.1M | static void insert_text_token(GumboParser* parser, GumboToken* token) { |
1233 | 12.1M | assert ( |
1234 | 12.1M | token->type == GUMBO_TOKEN_WHITESPACE |
1235 | 12.1M | || token->type == GUMBO_TOKEN_CHARACTER |
1236 | 12.1M | || token->type == GUMBO_TOKEN_NULL |
1237 | 12.1M | || token->type == GUMBO_TOKEN_CDATA |
1238 | 12.1M | ); |
1239 | 0 | TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; |
1240 | 12.1M | if (buffer_state->_buffer.length == 0) { |
1241 | | // Initialize position fields. |
1242 | 420k | buffer_state->_start_original_text = token->original_text.data; |
1243 | 420k | buffer_state->_start_position = token->position; |
1244 | 420k | } |
1245 | 12.1M | gumbo_string_buffer_append_codepoint ( |
1246 | 12.1M | token->v.character, |
1247 | 12.1M | &buffer_state->_buffer |
1248 | 12.1M | ); |
1249 | 12.1M | if (token->type == GUMBO_TOKEN_CHARACTER) { |
1250 | 10.3M | buffer_state->_type = GUMBO_NODE_TEXT; |
1251 | 10.3M | } else if (token->type == GUMBO_TOKEN_CDATA) { |
1252 | 500k | buffer_state->_type = GUMBO_NODE_CDATA; |
1253 | 500k | } |
1254 | 12.1M | gumbo_debug("Inserting text token '%c'.\n", token->v.character); |
1255 | 12.1M | } |
1256 | | |
1257 | | // https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm |
1258 | | static void run_generic_parsing_algorithm ( |
1259 | | GumboParser* parser, |
1260 | | GumboToken* token, |
1261 | | GumboTokenizerEnum lexer_state |
1262 | 2.91k | ) { |
1263 | 2.91k | insert_element_from_token(parser, token); |
1264 | 2.91k | gumbo_tokenizer_set_state(parser, lexer_state); |
1265 | 2.91k | GumboParserState* parser_state = parser->_parser_state; |
1266 | 2.91k | parser_state->_original_insertion_mode = parser_state->_insertion_mode; |
1267 | 2.91k | parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT; |
1268 | 2.91k | } |
1269 | | |
1270 | 7.31k | static void acknowledge_self_closing_tag(GumboParser* parser) { |
1271 | 7.31k | parser->_parser_state->_self_closing_flag_acknowledged = true; |
1272 | 7.31k | } |
1273 | | |
1274 | | // Returns true if there's an anchor tag in the list of active formatting |
1275 | | // elements, and fills in its index if so. |
1276 | 89.9k | static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) { |
1277 | 89.9k | GumboVector* elements = &parser->_parser_state->_active_formatting_elements; |
1278 | 110k | for (int i = elements->length; --i >= 0;) { |
1279 | 107k | GumboNode* node = elements->data[i]; |
1280 | 107k | if (node == &kActiveFormattingScopeMarker) { |
1281 | 60.3k | return false; |
1282 | 60.3k | } |
1283 | 47.3k | if (node_html_tag_is(node, GUMBO_TAG_A)) { |
1284 | 26.9k | *anchor_index = i; |
1285 | 26.9k | return true; |
1286 | 26.9k | } |
1287 | 47.3k | } |
1288 | 2.70k | return false; |
1289 | 89.9k | } |
1290 | | |
1291 | | // Counts the number of open formatting elements in the list of active |
1292 | | // formatting elements (after the last active scope marker) that have a specific |
1293 | | // tag. If this is > 0, then earliest_matching_index will be filled in with the |
1294 | | // index of the first such element. |
1295 | | static int count_formatting_elements_of_tag ( |
1296 | | GumboParser* parser, |
1297 | | const GumboNode* desired_node, |
1298 | | int* earliest_matching_index |
1299 | 388k | ) { |
1300 | 388k | const GumboElement* desired_element = &desired_node->v.element; |
1301 | 388k | GumboVector* elements = &parser->_parser_state->_active_formatting_elements; |
1302 | 388k | int num_identical_elements = 0; |
1303 | 1.37M | for (int i = elements->length; --i >= 0;) { |
1304 | 1.34M | GumboNode* node = elements->data[i]; |
1305 | 1.34M | if (node == &kActiveFormattingScopeMarker) { |
1306 | 362k | break; |
1307 | 362k | } |
1308 | 986k | assert(node->type == GUMBO_NODE_ELEMENT); |
1309 | 0 | if ( |
1310 | 986k | node_qualified_tagname_is ( |
1311 | 986k | node, |
1312 | 986k | desired_element->tag_namespace, |
1313 | 986k | desired_element->tag, |
1314 | 986k | desired_element->name |
1315 | 986k | ) |
1316 | 986k | && all_attributes_match(&node->v.element.attributes, &desired_element->attributes) |
1317 | 986k | ) { |
1318 | 223k | num_identical_elements++; |
1319 | 223k | *earliest_matching_index = i; |
1320 | 223k | } |
1321 | 986k | } |
1322 | 388k | return num_identical_elements; |
1323 | 388k | } |
1324 | | |
1325 | | // https://html.spec.whatwg.org/multipage/parsing.html#reconstruct-the-active-formatting-elements |
1326 | 388k | static void add_formatting_element(GumboParser* parser, const GumboNode* node) { |
1327 | 388k | assert ( |
1328 | 388k | node == &kActiveFormattingScopeMarker |
1329 | 388k | || node->type == GUMBO_NODE_ELEMENT |
1330 | 388k | ); |
1331 | 0 | GumboVector* elements = &parser->_parser_state->_active_formatting_elements; |
1332 | 388k | if (node == &kActiveFormattingScopeMarker) { |
1333 | 92.3k | gumbo_debug("Adding a scope marker.\n"); |
1334 | 295k | } else { |
1335 | 295k | gumbo_debug("Adding a formatting element.\n"); |
1336 | 295k | } |
1337 | | |
1338 | | // Hunt for identical elements. |
1339 | 388k | int earliest_identical_element = elements->length; |
1340 | 388k | int num_identical_elements = count_formatting_elements_of_tag ( |
1341 | 388k | parser, |
1342 | 388k | node, |
1343 | 388k | &earliest_identical_element |
1344 | 388k | ); |
1345 | | |
1346 | | // Noah's Ark clause: if there're at least 3, remove the earliest. |
1347 | 388k | if (num_identical_elements >= 3) { |
1348 | 35.0k | gumbo_debug ( |
1349 | 35.0k | "Noah's ark clause: removing element at %d.\n", |
1350 | 35.0k | earliest_identical_element |
1351 | 35.0k | ); |
1352 | 35.0k | gumbo_vector_remove_at(earliest_identical_element, elements); |
1353 | 35.0k | } |
1354 | | |
1355 | 388k | gumbo_vector_add((void*) node, elements); |
1356 | 388k | } |
1357 | | |
1358 | 1.51M | static bool is_open_element(const GumboParser* parser, const GumboNode* node) { |
1359 | 1.51M | const GumboVector* open_elements = &parser->_parser_state->_open_elements; |
1360 | 113M | for (unsigned int i = 0; i < open_elements->length; ++i) { |
1361 | 113M | if (open_elements->data[i] == node) { |
1362 | 1.42M | return true; |
1363 | 1.42M | } |
1364 | 113M | } |
1365 | 89.5k | return false; |
1366 | 1.51M | } |
1367 | | |
1368 | | // Clones attributes, tags, etc. of a node, but does not copy the content. The |
1369 | | // clone shares no structure with the original node: all owned strings and |
1370 | | // values are fresh copies. |
1371 | | static GumboNode* clone_node ( |
1372 | | GumboNode* node, |
1373 | | GumboParseFlags reason |
1374 | 105k | ) { |
1375 | 105k | assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); |
1376 | 0 | GumboNode* new_node = gumbo_alloc(sizeof(GumboNode)); |
1377 | 105k | *new_node = *node; |
1378 | 105k | new_node->parent = NULL; |
1379 | 105k | new_node->index_within_parent = -1; |
1380 | | // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may |
1381 | | // have a separate end tag. |
1382 | 105k | new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG; |
1383 | 105k | new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER; |
1384 | 105k | GumboElement* element = &new_node->v.element; |
1385 | 105k | gumbo_vector_init(1, &element->children); |
1386 | | |
1387 | 105k | const GumboVector* old_attributes = &node->v.element.attributes; |
1388 | 105k | gumbo_vector_init(old_attributes->length, &element->attributes); |
1389 | 125k | for (unsigned int i = 0; i < old_attributes->length; ++i) { |
1390 | 20.3k | const GumboAttribute* old_attr = old_attributes->data[i]; |
1391 | 20.3k | GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute)); |
1392 | 20.3k | *attr = *old_attr; |
1393 | 20.3k | attr->name = gumbo_strdup(old_attr->name); |
1394 | 20.3k | attr->value = gumbo_strdup(old_attr->value); |
1395 | 20.3k | gumbo_vector_add(attr, &element->attributes); |
1396 | 20.3k | } |
1397 | 105k | return new_node; |
1398 | 105k | } |
1399 | | |
1400 | | // "Reconstruct active formatting elements" part of the spec. |
1401 | | // This implementation is based on the html5lib translation from the |
1402 | | // mess of GOTOs in the spec to reasonably structured programming. |
1403 | | // https://github.com/html5lib/html5lib-python/blob/master/html5lib/treebuilders/base.py |
1404 | 7.29M | static void reconstruct_active_formatting_elements(GumboParser* parser) { |
1405 | 7.29M | GumboVector* elements = &parser->_parser_state->_active_formatting_elements; |
1406 | | // Step 1 |
1407 | 7.29M | if (elements->length == 0) { |
1408 | 5.36M | return; |
1409 | 5.36M | } |
1410 | | |
1411 | | // Step 2 & 3 |
1412 | 1.92M | unsigned int i = elements->length - 1; |
1413 | 1.92M | GumboNode* element = elements->data[i]; |
1414 | 1.92M | if ( |
1415 | 1.92M | element == &kActiveFormattingScopeMarker |
1416 | 1.92M | || is_open_element(parser, element) |
1417 | 1.92M | ) { |
1418 | 1.90M | return; |
1419 | 1.90M | } |
1420 | | |
1421 | | // Step 6 |
1422 | 89.5k | do { |
1423 | 89.5k | if (i == 0) { |
1424 | | // Step 4 |
1425 | 3.50k | i = -1; // Incremented to 0 below. |
1426 | 3.50k | break; |
1427 | 3.50k | } |
1428 | | // Step 5 |
1429 | 86.0k | element = elements->data[--i]; |
1430 | 86.0k | } while ( |
1431 | 86.0k | element != &kActiveFormattingScopeMarker |
1432 | 86.0k | && !is_open_element(parser, element) |
1433 | 18.2k | ); |
1434 | | |
1435 | 0 | ++i; |
1436 | 18.2k | gumbo_debug ( |
1437 | 18.2k | "Reconstructing elements from %u on %s parent.\n", |
1438 | 18.2k | i, |
1439 | 18.2k | gumbo_normalized_tagname(get_current_node(parser)->v.element.tag) |
1440 | 18.2k | ); |
1441 | 107k | for (; i < elements->length; ++i) { |
1442 | | // Step 7 & 8. |
1443 | 89.5k | assert(elements->length > 0); |
1444 | 0 | assert(i < elements->length); |
1445 | 0 | element = elements->data[i]; |
1446 | 89.5k | assert(element != &kActiveFormattingScopeMarker); |
1447 | 0 | GumboNode* clone = clone_node ( |
1448 | 89.5k | element, |
1449 | 89.5k | GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT |
1450 | 89.5k | ); |
1451 | | // Step 9. |
1452 | 89.5k | InsertionLocation location = |
1453 | 89.5k | get_appropriate_insertion_location(parser, NULL); |
1454 | 89.5k | insert_node(clone, location); |
1455 | 89.5k | gumbo_vector_add ( |
1456 | 89.5k | (void*) clone, |
1457 | 89.5k | &parser->_parser_state->_open_elements |
1458 | 89.5k | ); |
1459 | | |
1460 | | // Step 10. |
1461 | 89.5k | elements->data[i] = clone; |
1462 | 89.5k | gumbo_debug ( |
1463 | 89.5k | "Reconstructed %s element at %u.\n", |
1464 | 89.5k | gumbo_normalized_tagname(clone->v.element.tag), |
1465 | 89.5k | i |
1466 | 89.5k | ); |
1467 | 89.5k | } |
1468 | 18.2k | } |
1469 | | |
1470 | 27.2k | static void clear_active_formatting_elements(GumboParser* parser) { |
1471 | 27.2k | GumboVector* elements = &parser->_parser_state->_active_formatting_elements; |
1472 | 27.2k | int num_elements_cleared = 0; |
1473 | 27.2k | const GumboNode* node; |
1474 | 42.4k | do { |
1475 | 42.4k | node = gumbo_vector_pop(elements); |
1476 | 42.4k | ++num_elements_cleared; |
1477 | 42.4k | } while (node && node != &kActiveFormattingScopeMarker); |
1478 | 27.2k | gumbo_debug ( |
1479 | 27.2k | "Cleared %d elements from active formatting list.\n", |
1480 | 27.2k | num_elements_cleared |
1481 | 27.2k | ); |
1482 | 27.2k | } |
1483 | | |
1484 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode |
1485 | | GumboQuirksModeEnum gumbo_compute_quirks_mode ( |
1486 | | const char *name, |
1487 | | const char *pubid_str, |
1488 | | const char *sysid_str |
1489 | 107 | ) { |
1490 | | |
1491 | 107 | GumboStringPiece pubid = { |
1492 | 107 | .data = pubid_str, |
1493 | 107 | .length = pubid_str? strlen(pubid_str) : 0, |
1494 | 107 | }; |
1495 | 107 | GumboStringPiece sysid = { |
1496 | 107 | .data = sysid_str, |
1497 | 107 | .length = sysid_str? strlen(sysid_str) : 0, |
1498 | 107 | }; |
1499 | 107 | bool has_system_identifier = !!sysid_str; |
1500 | 107 | if ( |
1501 | 107 | name == NULL |
1502 | 107 | || strcmp(name, "html") |
1503 | 107 | || is_in_static_list(&pubid, kQuirksModePublicIdPrefixes, false) |
1504 | 107 | || is_in_static_list(&pubid, kQuirksModePublicIdExactMatches, true) |
1505 | 107 | || is_in_static_list(&sysid, kQuirksModeSystemIdExactMatches, true) |
1506 | 107 | || ( |
1507 | 35 | !has_system_identifier |
1508 | 35 | && is_in_static_list(&pubid, kSystemIdDependentPublicIdPrefixes, false) |
1509 | 35 | ) |
1510 | 107 | ) { |
1511 | 72 | return GUMBO_DOCTYPE_QUIRKS; |
1512 | 72 | } |
1513 | | |
1514 | 35 | if ( |
1515 | 35 | is_in_static_list(&pubid, kLimitedQuirksPublicIdPrefixes, false) |
1516 | 35 | || ( |
1517 | 35 | has_system_identifier |
1518 | 35 | && is_in_static_list(&pubid, kSystemIdDependentPublicIdPrefixes, false) |
1519 | 35 | ) |
1520 | 35 | ) { |
1521 | 0 | return GUMBO_DOCTYPE_LIMITED_QUIRKS; |
1522 | 0 | } |
1523 | | |
1524 | 35 | return GUMBO_DOCTYPE_NO_QUIRKS; |
1525 | 35 | } |
1526 | | |
1527 | 472 | static GumboQuirksModeEnum compute_quirks_mode(const GumboTokenDocType* doctype) { |
1528 | 472 | if (doctype->force_quirks) |
1529 | 365 | return GUMBO_DOCTYPE_QUIRKS; |
1530 | 107 | return gumbo_compute_quirks_mode ( |
1531 | 107 | doctype->name, |
1532 | 107 | doctype->has_public_identifier? doctype->public_identifier : NULL, |
1533 | 107 | doctype->has_system_identifier? doctype->system_identifier : NULL |
1534 | 107 | ); |
1535 | 472 | } |
1536 | | |
1537 | | // The following functions are all defined by the "has an element in __ scope" |
1538 | | // sections of the HTML5 spec: |
1539 | | // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope |
1540 | | // The basic idea behind them is that they check for an element of the given |
1541 | | // qualified name, contained within a scope formed by a set of other qualified |
1542 | | // names. For example, "has an element in list scope" looks for an element of |
1543 | | // the given qualified name within the nearest enclosing <ol> or <ul>, along |
1544 | | // with a bunch of generic element types that serve to "firewall" their content |
1545 | | // from the rest of the document. Note that because of the way the spec is |
1546 | | // written, |
1547 | | // all elements are expected to be in the HTML namespace |
1548 | | static bool has_an_element_in_specific_scope ( |
1549 | | const GumboParser* parser, |
1550 | | int expected_size, |
1551 | | const GumboTag* expected, |
1552 | | bool negate, |
1553 | | const TagSet* tags |
1554 | 733k | ) { |
1555 | 733k | const GumboVector* open_elements = &parser->_parser_state->_open_elements; |
1556 | 3.53M | for (int i = open_elements->length; --i >= 0;) { |
1557 | 3.53M | const GumboNode* node = open_elements->data[i]; |
1558 | 3.53M | if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { |
1559 | 0 | continue; |
1560 | 0 | } |
1561 | | |
1562 | 3.53M | GumboTag node_tag = node->v.element.tag; |
1563 | 3.53M | GumboNamespaceEnum node_ns = node->v.element.tag_namespace; |
1564 | 6.52M | for (int j = 0; j < expected_size; ++j) { |
1565 | 3.64M | if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML) { |
1566 | 648k | return true; |
1567 | 648k | } |
1568 | 3.64M | } |
1569 | | |
1570 | 2.88M | bool found = tagset_includes(tags, node_ns, node_tag); |
1571 | 2.88M | if (negate != found) { |
1572 | 85.6k | return false; |
1573 | 85.6k | } |
1574 | 2.88M | } |
1575 | 0 | return false; |
1576 | 733k | } |
1577 | | |
1578 | | // Checks for the presence of an open element of the specified tag type. |
1579 | 10.6k | static bool has_open_element(const GumboParser* parser, GumboTag tag) { |
1580 | 10.6k | static const TagSet tags = {TAG(HTML)}; |
1581 | 10.6k | return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); |
1582 | 10.6k | } |
1583 | | |
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope
// Initializer list for the boundary elements of the default scope. It is
// meant to be expanded inside a TagSet initializer, optionally followed by
// further TAG(...) entries for the narrower scopes. Entries are grouped by
// namespace: HTML, then MathML, then SVG.
#define DEFAULT_SCOPE_TAGS \
  TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), \
  TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), \
  TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), \
  TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), \
  TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)
1604 | | |
1605 | | static const TagSet heading_tags = { |
1606 | | TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6) |
1607 | | }; |
1608 | | |
1609 | | static const TagSet td_th_tags = { |
1610 | | TAG(TD), TAG(TH) |
1611 | | }; |
1612 | | |
1613 | | static const TagSet dd_dt_tags = { |
1614 | | TAG(DD), TAG(DT) |
1615 | | }; |
1616 | | |
1617 | | // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope |
1618 | 45.2k | static bool has_an_element_in_scope(const GumboParser* parser, GumboTag tag) { |
1619 | 45.2k | static const TagSet tags = {DEFAULT_SCOPE_TAGS}; |
1620 | 45.2k | return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); |
1621 | 45.2k | } |
1622 | | |
1623 | | // Like "has an element in scope", but for the specific case of looking for a |
1624 | | // unique target node, not for any node with a given tag name. This duplicates |
1625 | | // much of the algorithm from has_an_element_in_specific_scope because the |
1626 | | // predicate is different when checking for an exact node, and it's easier & |
1627 | | // faster just to duplicate the code for this one case than to try and |
1628 | | // parameterize it. |
1629 | 266 | static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node) { |
1630 | 266 | static const TagSet tags = {DEFAULT_SCOPE_TAGS}; |
1631 | 266 | const GumboVector* open_elements = &parser->_parser_state->_open_elements; |
1632 | 2.23k | for (int i = open_elements->length; --i >= 0;) { |
1633 | 2.23k | const GumboNode* current = open_elements->data[i]; |
1634 | 2.23k | const GumboNodeType type = current->type; |
1635 | 2.23k | if (current == node) { |
1636 | 137 | return true; |
1637 | 137 | } |
1638 | 2.09k | if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) { |
1639 | 0 | continue; |
1640 | 0 | } |
1641 | 2.09k | if (node_tag_in_set(current, &tags)) { |
1642 | 129 | return false; |
1643 | 129 | } |
1644 | 2.09k | } |
1645 | 0 | assert(false); |
1646 | 0 | return false; |
1647 | 266 | } |
1648 | | |
1649 | | // Like has_an_element_in_scope, but restricts the expected qualified name to a |
1650 | | // range of possible qualified names instead of just a single one. |
1651 | | static bool has_an_element_in_scope_with_tagname ( |
1652 | | const GumboParser* parser, |
1653 | | int len, |
1654 | | const GumboTag expected[] |
1655 | 2.55k | ) { |
1656 | 2.55k | static const TagSet tags = {DEFAULT_SCOPE_TAGS}; |
1657 | 2.55k | return has_an_element_in_specific_scope(parser, len, expected, false, &tags); |
1658 | 2.55k | } |
1659 | | |
1660 | | // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-list-item-scope |
1661 | 311 | static bool has_an_element_in_list_scope(const GumboParser* parser, GumboTag tag) { |
1662 | 311 | static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(OL), TAG(UL)}; |
1663 | 311 | return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); |
1664 | 311 | } |
1665 | | |
1666 | | // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-button-scope |
1667 | 563k | static bool has_an_element_in_button_scope(const GumboParser* parser, GumboTag tag) { |
1668 | 563k | static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(BUTTON)}; |
1669 | 563k | return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); |
1670 | 563k | } |
1671 | | |
1672 | | // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-table-scope |
1673 | 108k | static bool has_an_element_in_table_scope(const GumboParser* parser, GumboTag tag) { |
1674 | 108k | static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}; |
1675 | 108k | return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); |
1676 | 108k | } |
1677 | | |
1678 | | // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-select-scope |
1679 | 3.43k | static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag tag) { |
1680 | 3.43k | static const TagSet tags = {TAG(OPTGROUP), TAG(OPTION)}; |
1681 | 3.43k | return has_an_element_in_specific_scope(parser, 1, &tag, true, &tags); |
1682 | 3.43k | } |
1683 | | |
1684 | | // https://html.spec.whatwg.org/multipage/parsing.html#generate-implied-end-tags |
1685 | | // "exception" is the "element to exclude from the process" listed in the spec. |
1686 | | // Pass GUMBO_TAG_LAST to not exclude any of them. |
1687 | | static void generate_implied_end_tags ( |
1688 | | GumboParser* parser, |
1689 | | GumboTag exception, |
1690 | | const char* exception_name |
1691 | 564k | ) { |
1692 | 564k | static const TagSet tags = { |
1693 | 564k | TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), |
1694 | 564k | TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC) |
1695 | 564k | }; |
1696 | 564k | while ( |
1697 | 568k | node_tag_in_set(get_current_node(parser), &tags) |
1698 | 568k | && !node_html_tagname_is(get_current_node(parser), exception, exception_name) |
1699 | 564k | ) { |
1700 | 3.21k | pop_current_node(parser); |
1701 | 3.21k | } |
1702 | 564k | } |
1703 | | |
1704 | | // This is the "generate all implied end tags thoroughly" clause of the spec. |
1705 | | // https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags |
1706 | 734 | static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) { |
1707 | 734 | static const TagSet tags = { |
1708 | 734 | TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), |
1709 | 734 | TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), |
1710 | 734 | TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) |
1711 | 734 | }; |
1712 | 1.06k | while (node_tag_in_set(get_current_node(parser), &tags)) { |
1713 | 335 | pop_current_node(parser); |
1714 | 335 | } |
1715 | 734 | } |
1716 | | |
1717 | | // This factors out the clauses in the "in body" insertion mode checking "if |
1718 | | // there is a node in the stack of open elements that is not" one of a list of |
1719 | | // elements in which case it's a parse error. |
1720 | | // This is used in "an end-of-file token", "an end tag whose tag name is |
1721 | | // 'body'", and "an end tag whose tag name is 'html'". |
1722 | | static bool stack_contains_nonclosable_element ( |
1723 | | GumboParser* parser |
1724 | 10.8k | ) { |
1725 | 10.8k | static const TagSet tags = { |
1726 | 10.8k | TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB), |
1727 | 10.8k | TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), |
1728 | 10.8k | TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML), |
1729 | 10.8k | }; |
1730 | 10.8k | GumboVector* open_elements = &parser->_parser_state->_open_elements; |
1731 | 33.5k | for (size_t i = 0; i < open_elements->length; ++i) { |
1732 | 26.0k | if (!node_tag_in_set(open_elements->data[i], &tags)) |
1733 | 3.41k | return true; |
1734 | 26.0k | } |
1735 | 7.48k | return false; |
1736 | 10.8k | } |
1737 | | |
1738 | | // This factors out the clauses relating to "act as if an end tag token with tag |
1739 | | // name "table" had been seen. Returns true if there's a table element in table |
1740 | | // scope which was successfully closed, false if not and the token should be |
1741 | | // ignored. Does not add parse errors; callers should handle that. |
1742 | 7.31k | static bool close_table(GumboParser* parser) { |
1743 | 7.31k | if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) { |
1744 | 204 | return false; |
1745 | 204 | } |
1746 | | |
1747 | 7.11k | GumboNode* node = pop_current_node(parser); |
1748 | 144k | while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) { |
1749 | 136k | node = pop_current_node(parser); |
1750 | 136k | } |
1751 | 7.11k | reset_insertion_mode_appropriately(parser); |
1752 | 7.11k | return true; |
1753 | 7.31k | } |
1754 | | |
1755 | | // This factors out the clauses relating to "act as if an end tag token with tag |
1756 | | // name `cell_tag` had been seen". |
1757 | | static void close_table_cell ( |
1758 | | GumboParser* parser, |
1759 | | const GumboToken* token, |
1760 | | GumboTag cell_tag |
1761 | 22.8k | ) { |
1762 | 22.8k | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
1763 | 22.8k | const GumboNode* node = get_current_node(parser); |
1764 | 22.8k | if (!node_html_tag_is(node, cell_tag)) |
1765 | 13.1k | parser_add_parse_error(parser, token); |
1766 | 259k | do { |
1767 | 259k | node = pop_current_node(parser); |
1768 | 259k | } while (!node_html_tag_is(node, cell_tag)); |
1769 | | |
1770 | 22.8k | clear_active_formatting_elements(parser); |
1771 | 22.8k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); |
1772 | 22.8k | } |
1773 | | |
1774 | | // https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell |
1775 | | // This holds the logic to determine whether we should close a <td> or a <th>. |
1776 | 21.3k | static void close_current_cell(GumboParser* parser, const GumboToken* token) { |
1777 | 21.3k | GumboTag cell_tag; |
1778 | 21.3k | if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) { |
1779 | 20.3k | assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH)); |
1780 | 0 | cell_tag = GUMBO_TAG_TD; |
1781 | 20.3k | } else { |
1782 | 998 | assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH)); |
1783 | 0 | cell_tag = GUMBO_TAG_TH; |
1784 | 998 | } |
1785 | 0 | close_table_cell(parser, token, cell_tag); |
1786 | 21.3k | } |
1787 | | |
1788 | | // This factors out the "act as if an end tag of tag name 'select' had been |
1789 | | // seen" clause of the spec, since it's referenced in several places. It pops |
1790 | | // all nodes from the stack until the current <select> has been closed, then |
1791 | | // resets the insertion mode appropriately. |
1792 | 10.7k | static void close_current_select(GumboParser* parser) { |
1793 | 10.7k | GumboNode* node = pop_current_node(parser); |
1794 | 24.9k | while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) { |
1795 | 14.2k | node = pop_current_node(parser); |
1796 | 14.2k | } |
1797 | 10.7k | reset_insertion_mode_appropriately(parser); |
1798 | 10.7k | } |
1799 | | |
1800 | | // The list of nodes in the "special" category: |
1801 | | // https://html.spec.whatwg.org/multipage/parsing.html#special |
1802 | 209k | static bool is_special_node(const GumboNode* node) { |
1803 | 209k | assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); |
1804 | 0 | return node_tag_in_set(node, &(const TagSet) { |
1805 | 209k | TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE), |
1806 | 209k | TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE), |
1807 | 209k | TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL), |
1808 | 209k | TAG(COLGROUP), TAG(DD), TAG(DETAILS), TAG(DIR), |
1809 | 209k | TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET), |
1810 | 209k | TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME), |
1811 | 209k | TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), |
1812 | 209k | TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME), |
1813 | 209k | TAG(IMG), TAG(INPUT), TAG(LI), TAG(LINK), TAG(LISTING), |
1814 | 209k | TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED), |
1815 | 209k | TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), |
1816 | 209k | TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), |
1817 | 209k | TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY), |
1818 | 209k | TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH), |
1819 | 209k | TAG(THEAD), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), |
1820 | | |
1821 | 209k | TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), |
1822 | 209k | TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), |
1823 | | |
1824 | 209k | TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), |
1825 | | |
1826 | | // This TagSet needs to include the "title" element in both the |
1827 | | // HTML and SVG namespaces. Using both TAG(TITLE) and TAG_SVG(TITLE) |
1828 | | // won't work, due to the simplistic way in which the TAG macros are |
1829 | | // implemented, so we do it like this instead: |
1830 | 209k | [GUMBO_TAG_TITLE] = |
1831 | 209k | (1 << GUMBO_NAMESPACE_HTML) | |
1832 | 209k | (1 << GUMBO_NAMESPACE_SVG) |
1833 | 209k | } |
1834 | 209k | ); |
1835 | 209k | } |
1836 | | |
1837 | | // Implicitly closes currently open elements until it reaches an element with |
1838 | | // the |
1839 | | // specified qualified name. If the elements closed are in the set handled by |
1840 | | // generate_implied_end_tags, this is normal operation and this function returns |
1841 | | // true. Otherwise, a parse error is recorded and this function returns false. |
1842 | | static void implicitly_close_tags ( |
1843 | | GumboParser* parser, |
1844 | | GumboToken* token, |
1845 | | GumboNamespaceEnum target_ns, |
1846 | | GumboTag target |
1847 | 537k | ) { |
1848 | 537k | assert(target != GUMBO_TAG_UNKNOWN); |
1849 | 0 | generate_implied_end_tags(parser, target, NULL); |
1850 | 537k | if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) { |
1851 | 42.7k | parser_add_parse_error(parser, token); |
1852 | 42.7k | while ( |
1853 | 153k | !node_qualified_tag_is(get_current_node(parser), target_ns, target) |
1854 | 111k | ) { |
1855 | 111k | pop_current_node(parser); |
1856 | 111k | } |
1857 | 42.7k | } |
1858 | 537k | assert(node_qualified_tag_is(get_current_node(parser), target_ns, target)); |
1859 | 0 | pop_current_node(parser); |
1860 | 537k | } |
1861 | | |
1862 | | // If the stack of open elements has a <p> tag in button scope, this acts as if |
1863 | | // a </p> tag was encountered, implicitly closing tags. Returns false if a |
1864 | | // parse error occurs. This is a convenience function because this particular |
1865 | | // clause appears several times in the spec. |
1866 | | static void maybe_implicitly_close_p_tag ( |
1867 | | GumboParser* parser, |
1868 | | GumboToken* token |
1869 | 560k | ) { |
1870 | 560k | if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { |
1871 | 533k | implicitly_close_tags ( |
1872 | 533k | parser, |
1873 | 533k | token, |
1874 | 533k | GUMBO_NAMESPACE_HTML, |
1875 | 533k | GUMBO_TAG_P |
1876 | 533k | ); |
1877 | 533k | } |
1878 | 560k | } |
1879 | | |
1880 | | // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt> |
1881 | | // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>. |
1882 | | static void maybe_implicitly_close_list_tag ( |
1883 | | GumboParser* parser, |
1884 | | GumboToken* token, |
1885 | | bool is_li |
1886 | 3.00k | ) { |
1887 | 3.00k | GumboParserState* state = parser->_parser_state; |
1888 | 3.00k | set_frameset_not_ok(parser); |
1889 | 15.5k | for (int i = state->_open_elements.length; --i >= 0;) { |
1890 | 15.5k | const GumboNode* node = state->_open_elements.data[i]; |
1891 | 15.5k | bool is_list_tag = is_li |
1892 | 15.5k | ? node_html_tag_is(node, GUMBO_TAG_LI) |
1893 | 15.5k | : node_tag_in_set(node, &dd_dt_tags) |
1894 | 15.5k | ; |
1895 | 15.5k | if (is_list_tag) { |
1896 | 1.21k | implicitly_close_tags ( |
1897 | 1.21k | parser, |
1898 | 1.21k | token, |
1899 | 1.21k | node->v.element.tag_namespace, |
1900 | 1.21k | node->v.element.tag |
1901 | 1.21k | ); |
1902 | 1.21k | return; |
1903 | 1.21k | } |
1904 | | |
1905 | 14.3k | if ( |
1906 | 14.3k | is_special_node(node) |
1907 | 14.3k | && !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)}) |
1908 | 14.3k | ) { |
1909 | 1.79k | return; |
1910 | 1.79k | } |
1911 | 14.3k | } |
1912 | 3.00k | } |
1913 | | |
1914 | | static void merge_attributes ( |
1915 | | GumboToken* token, |
1916 | | GumboNode* node |
1917 | 1.71k | ) { |
1918 | 1.71k | assert(token->type == GUMBO_TOKEN_START_TAG); |
1919 | 0 | assert(node->type == GUMBO_NODE_ELEMENT); |
1920 | 0 | const GumboVector* token_attr = &token->v.start_tag.attributes; |
1921 | 1.71k | GumboVector* node_attr = &node->v.element.attributes; |
1922 | | |
1923 | 3.34k | for (unsigned int i = 0; i < token_attr->length; ++i) { |
1924 | 1.62k | GumboAttribute* attr = token_attr->data[i]; |
1925 | 1.62k | if (!gumbo_get_attribute(node_attr, attr->name)) { |
1926 | | // Ownership of the attribute is transferred by this gumbo_vector_add, |
1927 | | // so it has to be nulled out of the original token so it doesn't get |
1928 | | // double-deleted. |
1929 | 1.25k | gumbo_vector_add(attr, node_attr); |
1930 | 1.25k | token_attr->data[i] = NULL; |
1931 | 1.25k | } |
1932 | 1.62k | } |
1933 | | // When attributes are merged, it means the token has been ignored and merged |
1934 | | // with another token, so we need to free its memory. The attributes that are |
1935 | | // transferred need to be nulled-out in the vector above so that they aren't |
1936 | | // double-deleted. |
1937 | 1.71k | gumbo_token_destroy(token); |
1938 | | |
1939 | 1.71k | #ifndef NDEBUG |
1940 | | // Mark this sentinel so the assertion in the main loop knows it's been |
1941 | | // destroyed. |
1942 | 1.71k | token->v.start_tag.attributes = kGumboEmptyVector; |
1943 | 1.71k | #endif |
1944 | 1.71k | } |
1945 | | |
1946 | 0 | const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) { |
1947 | 0 | const StringReplacement *replacement = gumbo_get_svg_tag_replacement ( |
1948 | 0 | tag->data, |
1949 | 0 | tag->length |
1950 | 0 | ); |
1951 | 0 | return replacement ? replacement->to : NULL; |
1952 | 0 | } |
1953 | | |
1954 | | // https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes |
1955 | | // This destructively modifies any matching attributes on the token and sets the |
1956 | | // namespace appropriately. |
1957 | 14.4k | static void adjust_foreign_attributes(GumboToken* token) { |
1958 | 14.4k | assert(token->type == GUMBO_TOKEN_START_TAG); |
1959 | 0 | const GumboVector* attributes = &token->v.start_tag.attributes; |
1960 | 21.2k | for (unsigned int i = 0, n = attributes->length; i < n; ++i) { |
1961 | 6.84k | GumboAttribute* attr = attributes->data[i]; |
1962 | 6.84k | const ForeignAttrReplacement* entry = gumbo_get_foreign_attr_replacement ( |
1963 | 6.84k | attr->name, |
1964 | 6.84k | strlen(attr->name) |
1965 | 6.84k | ); |
1966 | 6.84k | if (!entry) { |
1967 | 6.68k | continue; |
1968 | 6.68k | } |
1969 | 156 | gumbo_free((void*) attr->name); |
1970 | 156 | attr->attr_namespace = entry->attr_namespace; |
1971 | 156 | attr->name = gumbo_strdup(entry->local_name); |
1972 | 156 | } |
1973 | 14.4k | } |
1974 | | |
1975 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign |
1976 | | // This adjusts svg tags. |
1977 | 8.30k | static void adjust_svg_tag(GumboToken* token) { |
1978 | 8.30k | assert(token->type == GUMBO_TOKEN_START_TAG); |
1979 | 8.30k | if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) { |
1980 | 106 | assert(token->v.start_tag.name == NULL); |
1981 | 0 | token->v.start_tag.name = "foreignObject"; |
1982 | 8.20k | } else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) { |
1983 | 4.20k | assert(token->v.start_tag.name); |
1984 | 0 | const StringReplacement *replacement = gumbo_get_svg_tag_replacement( |
1985 | 4.20k | token->v.start_tag.name, |
1986 | 4.20k | strlen(token->v.start_tag.name) |
1987 | 4.20k | ); |
1988 | 4.20k | if (replacement) { |
1989 | | // This cast is safe because we allocated this memory and we'll free it. |
1990 | 3 | strcpy((char *)token->v.start_tag.name, replacement->to); |
1991 | 3 | } |
1992 | 4.20k | } |
1993 | 8.30k | } |
1994 | | |
1995 | | // https://html.spec.whatwg.org/multipage/parsing.html#adjust-svg-attributes |
1996 | | // This destructively modifies any matching attributes on the token. |
1997 | 10.3k | static void adjust_svg_attributes(GumboToken* token) { |
1998 | 10.3k | assert(token->type == GUMBO_TOKEN_START_TAG); |
1999 | 0 | const GumboVector* attributes = &token->v.start_tag.attributes; |
2000 | 16.0k | for (unsigned int i = 0, n = attributes->length; i < n; i++) { |
2001 | 5.61k | GumboAttribute* attr = (GumboAttribute*) attributes->data[i]; |
2002 | 5.61k | const StringReplacement* replacement = gumbo_get_svg_attr_replacement ( |
2003 | 5.61k | attr->name, |
2004 | 5.61k | attr->original_name.length |
2005 | 5.61k | ); |
2006 | 5.61k | if (!replacement) { |
2007 | 5.12k | continue; |
2008 | 5.12k | } |
2009 | 498 | gumbo_free((void*) attr->name); |
2010 | 498 | attr->name = gumbo_strdup(replacement->to); |
2011 | 498 | } |
2012 | 10.3k | } |
2013 | | |
2014 | | // https://html.spec.whatwg.org/multipage/parsing.html#adjust-mathml-attributes |
2015 | | // Note that this may destructively modify the token with the new attribute |
2016 | | // value. |
2017 | 4.06k | static void adjust_mathml_attributes(GumboToken* token) { |
2018 | 4.06k | assert(token->type == GUMBO_TOKEN_START_TAG); |
2019 | 0 | GumboAttribute* attr = gumbo_get_attribute ( |
2020 | 4.06k | &token->v.start_tag.attributes, |
2021 | 4.06k | "definitionurl" |
2022 | 4.06k | ); |
2023 | 4.06k | if (!attr) { |
2024 | 4.06k | return; |
2025 | 4.06k | } |
2026 | 0 | gumbo_free((void*) attr->name); |
2027 | 0 | attr->name = gumbo_strdup("definitionURL"); |
2028 | 0 | } |
2029 | | |
2030 | | static void maybe_add_doctype_error ( |
2031 | | GumboParser* parser, |
2032 | | const GumboToken* token |
2033 | 472 | ) { |
2034 | 472 | const GumboTokenDocType* doctype = &token->v.doc_type; |
2035 | 472 | if ( |
2036 | 472 | strcmp(doctype->name, "html") |
2037 | 472 | || doctype->has_public_identifier |
2038 | 472 | || (doctype->has_system_identifier |
2039 | 36 | && strcmp(doctype->system_identifier, "about:legacy-compat")) |
2040 | 472 | ) { |
2041 | 466 | parser_add_parse_error(parser, token); |
2042 | 466 | } |
2043 | 472 | } |
2044 | | |
2045 | 15.5k | static void remove_from_parent(GumboNode* node) { |
2046 | 15.5k | if (!node->parent) { |
2047 | | // The node may not have a parent if, for example, it is a newly-cloned copy |
2048 | | // of an active formatting element. DOM manipulations continue with the |
2049 | | // orphaned fragment of the DOM tree until it's appended/foster-parented to |
2050 | | // the common ancestor at the end of the adoption agency algorithm. |
2051 | 5.75k | return; |
2052 | 5.75k | } |
2053 | 9.75k | assert(node->parent->type == GUMBO_NODE_ELEMENT); |
2054 | 0 | GumboVector* children = &node->parent->v.element.children; |
2055 | 9.75k | int index = gumbo_vector_index_of(children, node); |
2056 | 9.75k | assert(index != -1); |
2057 | | |
2058 | 0 | gumbo_vector_remove_at(index, children); |
2059 | 9.75k | node->parent = NULL; |
2060 | 9.75k | node->index_within_parent = -1; |
2061 | 121k | for (unsigned int i = index; i < children->length; ++i) { |
2062 | 111k | GumboNode* child = children->data[i]; |
2063 | 111k | child->index_within_parent = i; |
2064 | 111k | } |
2065 | 9.75k | } |
2066 | | |
2067 | | // This is here to clean up memory when the spec says "Ignore current token." |
2068 | 2.71M | static void ignore_token(GumboParser* parser) { |
2069 | 2.71M | GumboToken* token = parser->_parser_state->_current_token; |
2070 | | // Ownership of the token's internal buffers are normally transferred to the |
2071 | | // element, but if no element is emitted (as happens in non-verbatim-mode |
2072 | | // when a token is ignored), we need to free it here to prevent a memory |
2073 | | // leak. |
2074 | 2.71M | gumbo_token_destroy(token); |
2075 | 2.71M | #ifndef NDEBUG |
2076 | 2.71M | if (token->type == GUMBO_TOKEN_START_TAG) { |
2077 | | // Mark this sentinel so the assertion in the main loop knows it's been |
2078 | | // destroyed. |
2079 | 36.3k | token->v.start_tag.attributes = kGumboEmptyVector; |
2080 | 36.3k | token->v.start_tag.name = NULL; |
2081 | 36.3k | } |
2082 | 2.71M | #endif |
2083 | 2.71M | } |
2084 | | |
2085 | | // The token is usually an end tag; however, the adoption agency algorithm may |
2086 | | // invoke this for an 'a' or 'nobr' start tag. |
2087 | | // Returns false if there was an error. |
2088 | | static void in_body_any_other_end_tag(GumboParser* parser, GumboToken* token) |
2089 | 9.50k | { |
2090 | 9.50k | GumboParserState* state = parser->_parser_state; |
2091 | 9.50k | GumboTag tag; |
2092 | 9.50k | const char* tagname; |
2093 | | |
2094 | 9.50k | if (token->type == GUMBO_TOKEN_END_TAG) { |
2095 | 9.38k | tag = token->v.end_tag.tag; |
2096 | 9.38k | tagname = token->v.end_tag.name; |
2097 | 9.38k | } else { |
2098 | 116 | assert(token->type == GUMBO_TOKEN_START_TAG); |
2099 | 0 | tag = token->v.start_tag.tag; |
2100 | 116 | tagname = token->v.start_tag.name; |
2101 | 116 | } |
2102 | | |
2103 | 0 | assert(state->_open_elements.length > 0); |
2104 | 0 | assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); |
2105 | | // Walk up the stack of open elements until we find one that either: |
2106 | | // a) Matches the tag name we saw |
2107 | | // b) Is in the "special" category. |
2108 | | // If we see a), implicitly close everything up to and including it. If we |
2109 | | // see b), then record a parse error, don't close anything (except the |
2110 | | // implied end tags) and ignore the end tag token. |
2111 | 126k | for (int i = state->_open_elements.length; --i >= 0;) { |
2112 | 126k | const GumboNode* node = state->_open_elements.data[i]; |
2113 | 126k | if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, tagname)) { |
2114 | 1.00k | generate_implied_end_tags(parser, tag, tagname); |
2115 | | // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example of an error. |
2116 | | // foo is the "current node" but sarcasm is node. |
2117 | | // XXX: Write a test for this. |
2118 | 1.00k | if (node != get_current_node(parser)) { |
2119 | 408 | parser_add_parse_error(parser, token); |
2120 | 408 | } |
2121 | 2.18k | while (node != pop_current_node(parser)) |
2122 | 1.18k | ; // Pop everything. |
2123 | 1.00k | return; |
2124 | 125k | } else if (is_special_node(node)) { |
2125 | 8.50k | parser_add_parse_error(parser, token); |
2126 | 8.50k | ignore_token(parser); |
2127 | 8.50k | return; |
2128 | 8.50k | } |
2129 | 126k | } |
2130 | | // <html> is in the special category, so we should never get here. |
2131 | 0 | assert(0 && "unreachable"); |
2132 | 0 | } |
2133 | | |
2134 | | // https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser |
2135 | | // Also described in the "in body" handling for end formatting tags. |
2136 | | // Returns false if there was an error. |
2137 | | static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token) |
2138 | 30.8k | { |
2139 | 30.8k | GumboParserState* state = parser->_parser_state; |
2140 | 30.8k | gumbo_debug("Entering adoption agency algorithm.\n"); |
2141 | | // Step 1. |
2142 | 30.8k | GumboTag subject; |
2143 | 30.8k | if (token->type == GUMBO_TOKEN_START_TAG) { |
2144 | 27.8k | subject = token->v.start_tag.tag; |
2145 | 27.8k | } else { |
2146 | 2.96k | assert(token->type == GUMBO_TOKEN_END_TAG); |
2147 | 0 | subject = token->v.end_tag.tag; |
2148 | 2.96k | } |
2149 | 0 | assert(subject != GUMBO_TAG_UNKNOWN); |
2150 | | |
2151 | | // Step 2. |
2152 | 0 | GumboNode* current_node = get_current_node(parser); |
2153 | 30.8k | if ( |
2154 | 30.8k | node_html_tag_is(current_node, subject) |
2155 | 30.8k | && -1 == gumbo_vector_index_of ( |
2156 | 20.2k | &state->_active_formatting_elements, |
2157 | 20.2k | current_node |
2158 | 20.2k | ) |
2159 | 30.8k | ) { |
2160 | 381 | pop_current_node(parser); |
2161 | 381 | return; |
2162 | 381 | } |
2163 | | |
2164 | | // Steps 3-5 & 21: |
2165 | 40.2k | for (unsigned int i = 0; i < 8; ++i) { |
2166 | | // Step 6. |
2167 | 39.7k | GumboNode* formatting_node = NULL; |
2168 | 39.7k | int formatting_node_in_open_elements = -1; |
2169 | 69.8k | for (int j = state->_active_formatting_elements.length; --j >= 0;) { |
2170 | 68.8k | GumboNode* current_node = state->_active_formatting_elements.data[j]; |
2171 | 68.8k | if (current_node == &kActiveFormattingScopeMarker) { |
2172 | 834 | gumbo_debug("Broke on scope marker; aborting.\n"); |
2173 | | // Last scope marker; abort the algorithm and handle according to "any |
2174 | | // other end tag" (below). |
2175 | 834 | break; |
2176 | 834 | } |
2177 | 67.9k | if (node_html_tag_is(current_node, subject)) { |
2178 | | // Found it. |
2179 | 37.8k | formatting_node = current_node; |
2180 | 37.8k | formatting_node_in_open_elements = gumbo_vector_index_of ( |
2181 | 37.8k | &state->_open_elements, |
2182 | 37.8k | formatting_node |
2183 | 37.8k | ); |
2184 | 37.8k | gumbo_debug ( |
2185 | 37.8k | "Formatting element of tag %s at %d.\n", |
2186 | 37.8k | gumbo_normalized_tagname(subject), |
2187 | 37.8k | formatting_node_in_open_elements |
2188 | 37.8k | ); |
2189 | 37.8k | break; |
2190 | 37.8k | } |
2191 | 67.9k | } |
2192 | 39.7k | if (!formatting_node) { |
2193 | | // No matching tag; not a parse error outright, but fall through to the |
2194 | | // "any other end tag" clause (which may potentially add a parse error, |
2195 | | // but not always). |
2196 | 1.86k | gumbo_debug("No active formatting elements; aborting.\n"); |
2197 | 1.86k | in_body_any_other_end_tag(parser, token); |
2198 | 1.86k | return; |
2199 | 1.86k | } |
2200 | | |
2201 | | // Step 7 |
2202 | 37.8k | if (formatting_node_in_open_elements == -1) { |
2203 | 97 | gumbo_debug("Formatting node not on stack of open elements.\n"); |
2204 | 97 | parser_add_parse_error(parser, token); |
2205 | 97 | gumbo_vector_remove ( |
2206 | 97 | formatting_node, |
2207 | 97 | &state->_active_formatting_elements |
2208 | 97 | ); |
2209 | 97 | return; |
2210 | 97 | } |
2211 | | |
2212 | | // Step 8 |
2213 | 37.7k | if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) { |
2214 | 931 | parser_add_parse_error(parser, token); |
2215 | 931 | gumbo_debug("Element not in scope.\n"); |
2216 | 931 | return; |
2217 | 931 | } |
2218 | | |
2219 | | // Step 9 |
2220 | 36.8k | if (formatting_node != get_current_node(parser)) |
2221 | 14.1k | parser_add_parse_error(parser, token); // But continue onwards. |
2222 | 36.8k | assert(formatting_node); |
2223 | 0 | assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML)); |
2224 | 0 | assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY)); |
2225 | | |
2226 | | // Step 10 |
2227 | 0 | GumboNode* furthest_block = NULL; |
2228 | 36.8k | for ( |
2229 | 36.8k | unsigned int j = formatting_node_in_open_elements; |
2230 | 96.2k | j < state->_open_elements.length; |
2231 | 59.3k | ++j |
2232 | 69.1k | ) { |
2233 | 69.1k | assert(j > 0); |
2234 | 0 | GumboNode* current = state->_open_elements.data[j]; |
2235 | 69.1k | if (is_special_node(current)) { |
2236 | 9.75k | furthest_block = current; |
2237 | 9.75k | break; |
2238 | 9.75k | } |
2239 | 69.1k | } |
2240 | | // Step 11. |
2241 | 36.8k | if (!furthest_block) { |
2242 | 35.4k | while (pop_current_node(parser) != formatting_node) |
2243 | 8.35k | ; |
2244 | 27.0k | gumbo_vector_remove ( |
2245 | 27.0k | formatting_node, |
2246 | 27.0k | &state->_active_formatting_elements |
2247 | 27.0k | ); |
2248 | 27.0k | return; |
2249 | 27.0k | } |
2250 | 9.75k | assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML)); |
2251 | | |
2252 | | // Step 12. |
2253 | | // Elements may be moved and reparented by this algorithm, so |
2254 | | // common_ancestor is not necessarily the same as formatting_node->parent. |
2255 | 0 | GumboNode* common_ancestor = state->_open_elements.data [ |
2256 | 9.75k | formatting_node_in_open_elements - 1 |
2257 | 9.75k | ]; |
2258 | 9.75k | gumbo_debug ( |
2259 | 9.75k | "Common ancestor tag = %s, furthest block tag = %s.\n", |
2260 | 9.75k | gumbo_normalized_tagname(common_ancestor->v.element.tag), |
2261 | 9.75k | gumbo_normalized_tagname(furthest_block->v.element.tag) |
2262 | 9.75k | ); |
2263 | | |
2264 | | // Step 13. |
2265 | 9.75k | int bookmark = 1 + gumbo_vector_index_of ( |
2266 | 9.75k | &state->_active_formatting_elements, |
2267 | 9.75k | formatting_node |
2268 | 9.75k | ); |
2269 | 9.75k | gumbo_debug("Bookmark at %d.\n", bookmark); |
2270 | | // Step 14. |
2271 | 9.75k | GumboNode* node = furthest_block; |
2272 | 9.75k | GumboNode* last_node = furthest_block; |
2273 | | // Must be stored explicitly, in case node is removed from the stack of open |
2274 | | // elements, to handle step 14.3. |
2275 | 9.75k | int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node); |
2276 | 9.75k | assert(saved_node_index > 0); |
2277 | | // Step 14.1. |
2278 | 23.9k | for (int j = 0;;) { |
2279 | | // Step 14.2. |
2280 | 23.9k | ++j; |
2281 | | // Step 14.3. |
2282 | 23.9k | int node_index = gumbo_vector_index_of(&state->_open_elements, node); |
2283 | 23.9k | gumbo_debug ( |
2284 | 23.9k | "Current index: %d, last index: %d.\n", |
2285 | 23.9k | node_index, |
2286 | 23.9k | saved_node_index |
2287 | 23.9k | ); |
2288 | 23.9k | if (node_index == -1) { |
2289 | 6.23k | node_index = saved_node_index; |
2290 | 6.23k | } |
2291 | 23.9k | saved_node_index = --node_index; |
2292 | 23.9k | assert(node_index > 0); |
2293 | 0 | assert((unsigned int) node_index < state->_open_elements.capacity); |
2294 | 0 | node = state->_open_elements.data[node_index]; |
2295 | 23.9k | assert(node->parent); |
2296 | | // Step 14.4. |
2297 | 23.9k | if (node == formatting_node) { |
2298 | 9.75k | break; |
2299 | 9.75k | } |
2300 | 14.1k | int formatting_index = gumbo_vector_index_of ( |
2301 | 14.1k | &state->_active_formatting_elements, |
2302 | 14.1k | node |
2303 | 14.1k | ); |
2304 | | // Step 14.5. |
2305 | 14.1k | if (j > 3 && formatting_index != -1) { |
2306 | 2.16k | gumbo_debug("Removing formatting element at %d.\n", formatting_index); |
2307 | 2.16k | gumbo_vector_remove_at ( |
2308 | 2.16k | formatting_index, |
2309 | 2.16k | &state->_active_formatting_elements |
2310 | 2.16k | ); |
2311 | | // Removing the element shifts all indices over by one, so we may need |
2312 | | // to move the bookmark. |
2313 | 2.16k | if (formatting_index < bookmark) { |
2314 | 1.70k | --bookmark; |
2315 | 1.70k | gumbo_debug("Moving bookmark to %d.\n", bookmark); |
2316 | 1.70k | } |
2317 | 2.16k | continue; |
2318 | 2.16k | } |
2319 | 11.9k | if (formatting_index == -1) { |
2320 | | // Step 14.6. |
2321 | 6.23k | gumbo_vector_remove_at(node_index, &state->_open_elements); |
2322 | 6.23k | continue; |
2323 | 6.23k | } |
2324 | | // Step 14.7. |
2325 | | // "common ancestor as the intended parent" doesn't actually mean insert |
2326 | | // it into the common ancestor; that happens below. |
2327 | 5.75k | node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); |
2328 | 5.75k | assert(formatting_index >= 0); |
2329 | 0 | state->_active_formatting_elements.data[formatting_index] = node; |
2330 | 5.75k | assert(node_index >= 0); |
2331 | 0 | state->_open_elements.data[node_index] = node; |
2332 | | // Step 14.8. |
2333 | 5.75k | if (last_node == furthest_block) { |
2334 | 3.83k | bookmark = formatting_index + 1; |
2335 | 3.83k | gumbo_debug("Bookmark moved to %d.\n", bookmark); |
2336 | 3.83k | assert((unsigned int) bookmark <= state->_active_formatting_elements.length); |
2337 | 3.83k | } |
2338 | | // Step 14.9. |
2339 | 0 | last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; |
2340 | 5.75k | remove_from_parent(last_node); |
2341 | 5.75k | append_node(node, last_node); |
2342 | | // Step 14.10. |
2343 | 5.75k | last_node = node; |
2344 | 5.75k | } // Step 14.11. |
2345 | | |
2346 | | // Step 15. |
2347 | 9.75k | gumbo_debug ( |
2348 | 9.75k | "Removing %s node from parent ", |
2349 | 9.75k | gumbo_normalized_tagname(last_node->v.element.tag) |
2350 | 9.75k | ); |
2351 | 9.75k | remove_from_parent(last_node); |
2352 | 9.75k | last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; |
2353 | 9.75k | InsertionLocation location = get_appropriate_insertion_location ( |
2354 | 9.75k | parser, |
2355 | 9.75k | common_ancestor |
2356 | 9.75k | ); |
2357 | 9.75k | gumbo_debug ( |
2358 | 9.75k | "and inserting it into %s.\n", |
2359 | 9.75k | gumbo_normalized_tagname(location.target->v.element.tag) |
2360 | 9.75k | ); |
2361 | 9.75k | insert_node(last_node, location); |
2362 | | |
2363 | | // Step 16. |
2364 | 9.75k | GumboNode* new_formatting_node = clone_node ( |
2365 | 9.75k | formatting_node, |
2366 | 9.75k | GUMBO_INSERTION_ADOPTION_AGENCY_CLONED |
2367 | 9.75k | ); |
2368 | 9.75k | formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; |
2369 | | |
2370 | | // Step 17. Instead of appending nodes one-by-one, we swap the children |
2371 | | // vector of furthest_block with the empty children of new_formatting_node, |
2372 | | // reducing memory traffic and allocations. We still have to reset their |
2373 | | // parent pointers, though. |
2374 | 9.75k | GumboVector temp = new_formatting_node->v.element.children; |
2375 | 9.75k | new_formatting_node->v.element.children = furthest_block->v.element.children; |
2376 | 9.75k | furthest_block->v.element.children = temp; |
2377 | | |
2378 | 9.75k | temp = new_formatting_node->v.element.children; |
2379 | 96.1k | for (unsigned int i = 0; i < temp.length; ++i) { |
2380 | 86.4k | GumboNode* child = temp.data[i]; |
2381 | 86.4k | child->parent = new_formatting_node; |
2382 | 86.4k | } |
2383 | | |
2384 | | // Step 18. |
2385 | 9.75k | append_node(furthest_block, new_formatting_node); |
2386 | | |
2387 | | // Step 19. |
2388 | | // If the formatting node was before the bookmark, it may shift over all |
2389 | | // indices after it, so we need to explicitly find the index and possibly |
2390 | | // adjust the bookmark. |
2391 | 9.75k | int formatting_node_index = gumbo_vector_index_of ( |
2392 | 9.75k | &state->_active_formatting_elements, |
2393 | 9.75k | formatting_node |
2394 | 9.75k | ); |
2395 | 9.75k | assert(formatting_node_index != -1); |
2396 | 9.75k | if (formatting_node_index < bookmark) { |
2397 | 9.75k | gumbo_debug ( |
2398 | 9.75k | "Formatting node at %d is before bookmark at %d; decrementing.\n", |
2399 | 9.75k | formatting_node_index, bookmark |
2400 | 9.75k | ); |
2401 | 9.75k | --bookmark; |
2402 | 9.75k | } |
2403 | 9.75k | gumbo_vector_remove_at ( |
2404 | 9.75k | formatting_node_index, |
2405 | 9.75k | &state->_active_formatting_elements |
2406 | 9.75k | ); |
2407 | 9.75k | assert(bookmark >= 0); |
2408 | 0 | assert((unsigned int) bookmark <= state->_active_formatting_elements.length); |
2409 | 0 | gumbo_vector_insert_at ( |
2410 | 9.75k | new_formatting_node, |
2411 | 9.75k | bookmark, |
2412 | 9.75k | &state->_active_formatting_elements |
2413 | 9.75k | ); |
2414 | | |
2415 | | // Step 20. |
2416 | 9.75k | gumbo_vector_remove(formatting_node, &state->_open_elements); |
2417 | 9.75k | int insert_at = 1 + gumbo_vector_index_of ( |
2418 | 9.75k | &state->_open_elements, |
2419 | 9.75k | furthest_block |
2420 | 9.75k | ); |
2421 | 9.75k | assert(insert_at >= 0); |
2422 | 0 | assert((unsigned int) insert_at <= state->_open_elements.length); |
2423 | 0 | gumbo_vector_insert_at ( |
2424 | 9.75k | new_formatting_node, |
2425 | 9.75k | insert_at, |
2426 | 9.75k | &state->_open_elements |
2427 | 9.75k | ); |
2428 | 9.75k | } // Step 21. |
2429 | 30.4k | } |
2430 | | |
2431 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-end |
2432 | 10.0k | static void finish_parsing(GumboParser* parser) { |
2433 | 10.0k | gumbo_debug("Finishing parsing"); |
2434 | 10.0k | maybe_flush_text_node_buffer(parser); |
2435 | 10.0k | GumboParserState* state = parser->_parser_state; |
2436 | 10.0k | for ( |
2437 | 10.0k | GumboNode* node = pop_current_node(parser); |
2438 | 64.6k | node; |
2439 | 54.5k | node = pop_current_node(parser) |
2440 | 54.5k | ) { |
2441 | 54.5k | if ( |
2442 | 54.5k | (node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) |
2443 | 54.5k | || (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag) |
2444 | 54.5k | ) { |
2445 | 235 | continue; |
2446 | 235 | } |
2447 | 54.3k | node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; |
2448 | 54.3k | } |
2449 | 10.0k | while (pop_current_node(parser)) |
2450 | 0 | ; // Pop them all. |
2451 | 10.0k | } |
2452 | | |
2453 | 11.9k | static void handle_initial(GumboParser* parser, GumboToken* token) { |
2454 | 11.9k | GumboDocument* document = &get_document_node(parser)->v.document; |
2455 | 11.9k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
2456 | 1.09k | ignore_token(parser); |
2457 | 1.09k | return; |
2458 | 1.09k | } |
2459 | 10.8k | if (token->type == GUMBO_TOKEN_COMMENT) { |
2460 | 800 | append_comment_node(parser, get_document_node(parser), token); |
2461 | 800 | return; |
2462 | 800 | } |
2463 | 10.0k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
2464 | 472 | document->has_doctype = true; |
2465 | 472 | document->name = token->v.doc_type.name; |
2466 | 472 | document->public_identifier = token->v.doc_type.public_identifier; |
2467 | 472 | document->system_identifier = token->v.doc_type.system_identifier; |
2468 | 472 | document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type); |
2469 | 472 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML); |
2470 | 472 | maybe_add_doctype_error(parser, token); |
2471 | 472 | return; |
2472 | 472 | } |
2473 | 9.62k | parser_add_parse_error(parser, token); |
2474 | 9.62k | document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS; |
2475 | 9.62k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML); |
2476 | 9.62k | parser->_parser_state->_reprocess_current_token = true; |
2477 | 9.62k | } |
2478 | | |
2479 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode |
2480 | 11.7k | static void handle_before_html(GumboParser* parser, GumboToken* token) { |
2481 | 11.7k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
2482 | 425 | parser_add_parse_error(parser, token); |
2483 | 425 | ignore_token(parser); |
2484 | 425 | return; |
2485 | 425 | } |
2486 | 11.3k | if (token->type == GUMBO_TOKEN_COMMENT) { |
2487 | 351 | append_comment_node(parser, get_document_node(parser), token); |
2488 | 351 | return; |
2489 | 351 | } |
2490 | 10.9k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
2491 | 617 | ignore_token(parser); |
2492 | 617 | return; |
2493 | 617 | } |
2494 | 10.3k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
2495 | 41 | GumboNode* html_node = insert_element_from_token(parser, token); |
2496 | 41 | parser->_output->root = html_node; |
2497 | 41 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); |
2498 | 41 | return; |
2499 | 41 | } |
2500 | 10.3k | if ( |
2501 | 10.3k | token->type == GUMBO_TOKEN_END_TAG |
2502 | 10.3k | && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)}) |
2503 | 10.3k | ) { |
2504 | 268 | parser_add_parse_error(parser, token); |
2505 | 268 | ignore_token(parser); |
2506 | 268 | return; |
2507 | 268 | } |
2508 | 10.0k | GumboNode* html_node = insert_element_of_tag_type ( |
2509 | 10.0k | parser, |
2510 | 10.0k | GUMBO_TAG_HTML, |
2511 | 10.0k | GUMBO_INSERTION_IMPLIED |
2512 | 10.0k | ); |
2513 | 10.0k | assert(html_node); |
2514 | 0 | parser->_output->root = html_node; |
2515 | 10.0k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); |
2516 | 10.0k | parser->_parser_state->_reprocess_current_token = true; |
2517 | 10.0k | } |
2518 | | |
2519 | | // Forward declarations because of mutual dependencies. |
2520 | | static void handle_token(GumboParser* parser, GumboToken* token); |
2521 | | static void handle_in_body(GumboParser* parser, GumboToken* token); |
2522 | | static void handle_in_template(GumboParser* parser, GumboToken* token); |
2523 | | |
2524 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode |
2525 | 10.5k | static void handle_before_head(GumboParser* parser, GumboToken* token) { |
2526 | 10.5k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
2527 | 120 | ignore_token(parser); |
2528 | 120 | return; |
2529 | 120 | } |
2530 | 10.4k | if (token->type == GUMBO_TOKEN_COMMENT) { |
2531 | 76 | append_comment_node(parser, get_current_node(parser), token); |
2532 | 76 | return; |
2533 | 76 | } |
2534 | 10.3k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
2535 | 44 | parser_add_parse_error(parser, token); |
2536 | 44 | ignore_token(parser); |
2537 | 44 | return; |
2538 | 44 | } |
2539 | 10.3k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
2540 | 73 | handle_in_body(parser, token); |
2541 | 73 | return; |
2542 | 73 | } |
2543 | 10.2k | if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) { |
2544 | 20 | GumboNode* node = insert_element_from_token(parser, token); |
2545 | 20 | parser->_parser_state->_head_element = node; |
2546 | 20 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); |
2547 | 20 | return; |
2548 | 20 | } |
2549 | 10.2k | if ( |
2550 | 10.2k | token->type == GUMBO_TOKEN_END_TAG |
2551 | 10.2k | && !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)}) |
2552 | 10.2k | ) { |
2553 | 141 | parser_add_parse_error(parser, token); |
2554 | 141 | ignore_token(parser); |
2555 | 141 | return; |
2556 | 141 | } |
2557 | 10.0k | GumboNode* node = insert_element_of_tag_type ( |
2558 | 10.0k | parser, |
2559 | 10.0k | GUMBO_TAG_HEAD, |
2560 | 10.0k | GUMBO_INSERTION_IMPLIED |
2561 | 10.0k | ); |
2562 | 10.0k | parser->_parser_state->_head_element = node; |
2563 | 10.0k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); |
2564 | 10.0k | parser->_parser_state->_reprocess_current_token = true; |
2565 | 10.0k | } |
2566 | | |
2567 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead |
2568 | 21.2k | static void handle_in_head(GumboParser* parser, GumboToken* token) { |
2569 | 21.2k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
2570 | 958 | insert_text_token(parser, token); |
2571 | 958 | return; |
2572 | 958 | } |
2573 | 20.3k | if (token->type == GUMBO_TOKEN_COMMENT) { |
2574 | 526 | append_comment_node(parser, get_current_node(parser), token); |
2575 | 526 | return; |
2576 | 526 | } |
2577 | 19.7k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
2578 | 85 | parser_add_parse_error(parser, token); |
2579 | 85 | ignore_token(parser); |
2580 | 85 | return; |
2581 | 85 | } |
2582 | 19.7k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
2583 | 60 | return handle_in_body(parser, token); |
2584 | 60 | } |
2585 | 19.6k | if ( |
2586 | 19.6k | tag_in(token, kStartTag, &(const TagSet) { |
2587 | 19.6k | TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK) |
2588 | 19.6k | }) |
2589 | 19.6k | ) { |
2590 | 2.01k | insert_element_from_token(parser, token); |
2591 | 2.01k | pop_current_node(parser); |
2592 | 2.01k | acknowledge_self_closing_tag(parser); |
2593 | 2.01k | return; |
2594 | 2.01k | } |
2595 | 17.6k | if (tag_is(token, kStartTag, GUMBO_TAG_META)) { |
2596 | 309 | insert_element_from_token(parser, token); |
2597 | 309 | pop_current_node(parser); |
2598 | 309 | acknowledge_self_closing_tag(parser); |
2599 | | // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the |
2600 | | // spec doesn't apply. If clients want to handle meta-tag re-encoding, they |
2601 | | // should specifically look for that string in the document and re-encode it |
2602 | | // before passing to Gumbo. |
2603 | 309 | return; |
2604 | 309 | } |
2605 | 17.3k | if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) { |
2606 | 359 | run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); |
2607 | 359 | return; |
2608 | 359 | } |
2609 | 16.9k | if ( |
2610 | 16.9k | tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)}) |
2611 | 16.9k | ) { |
2612 | 580 | run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); |
2613 | 580 | return; |
2614 | 580 | } |
2615 | 16.3k | if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) { |
2616 | 371 | insert_element_from_token(parser, token); |
2617 | 371 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT); |
2618 | 371 | return; |
2619 | 371 | } |
2620 | 16.0k | if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) { |
2621 | 1.26k | run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA); |
2622 | 1.26k | return; |
2623 | 1.26k | } |
2624 | 14.7k | if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) { |
2625 | 39 | GumboNode* head = pop_current_node(parser); |
2626 | 39 | UNUSED_IF_NDEBUG(head); |
2627 | 39 | assert(node_html_tag_is(head, GUMBO_TAG_HEAD)); |
2628 | 0 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); |
2629 | 39 | return; |
2630 | 39 | } |
2631 | 14.7k | if ( |
2632 | 14.7k | tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)}) |
2633 | 14.7k | ) { |
2634 | 44 | pop_current_node(parser); |
2635 | 44 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); |
2636 | 44 | parser->_parser_state->_reprocess_current_token = true; |
2637 | 44 | return; |
2638 | 44 | } |
2639 | 14.6k | if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) { |
2640 | 3.19k | insert_element_from_token(parser, token); |
2641 | 3.19k | add_formatting_element(parser, &kActiveFormattingScopeMarker); |
2642 | 3.19k | set_frameset_not_ok(parser); |
2643 | 3.19k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); |
2644 | 3.19k | push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); |
2645 | 3.19k | return; |
2646 | 3.19k | } |
2647 | 11.4k | if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { |
2648 | 1.17k | if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { |
2649 | 436 | parser_add_parse_error(parser, token); |
2650 | 436 | ignore_token(parser); |
2651 | 436 | return; |
2652 | 436 | } |
2653 | 734 | generate_all_implied_end_tags_thoroughly(parser); |
2654 | 734 | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) |
2655 | 119 | parser_add_parse_error(parser, token); |
2656 | 2.42k | while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)) |
2657 | 1.69k | ; |
2658 | 734 | clear_active_formatting_elements(parser); |
2659 | 734 | pop_template_insertion_mode(parser); |
2660 | 734 | reset_insertion_mode_appropriately(parser); |
2661 | 734 | return; |
2662 | 1.17k | } |
2663 | 10.3k | if ( |
2664 | 10.3k | tag_is(token, kStartTag, GUMBO_TAG_HEAD) |
2665 | 10.3k | || (token->type == GUMBO_TOKEN_END_TAG) |
2666 | 10.3k | ) { |
2667 | 303 | parser_add_parse_error(parser, token); |
2668 | 303 | ignore_token(parser); |
2669 | 303 | return; |
2670 | 303 | } |
2671 | 10.0k | pop_current_node(parser); |
2672 | 10.0k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); |
2673 | 10.0k | parser->_parser_state->_reprocess_current_token = true; |
2674 | 10.0k | return; |
2675 | 10.3k | } |
2676 | | |
2677 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript |
2678 | 1.20k | static void handle_in_head_noscript(GumboParser* parser, GumboToken* token) { |
2679 | 1.20k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
2680 | 7 | parser_add_parse_error(parser, token); |
2681 | 7 | ignore_token(parser); |
2682 | 7 | return; |
2683 | 7 | } |
2684 | 1.19k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
2685 | 19 | handle_in_body(parser, token); |
2686 | 19 | return; |
2687 | 19 | } |
2688 | 1.17k | if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) { |
2689 | 154 | const GumboNode* node = pop_current_node(parser); |
2690 | 154 | assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); |
2691 | 0 | UNUSED_IF_NDEBUG(node); |
2692 | 154 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); |
2693 | 154 | return; |
2694 | 154 | } |
2695 | 1.02k | if ( |
2696 | 1.02k | token->type == GUMBO_TOKEN_WHITESPACE |
2697 | 1.02k | || token->type == GUMBO_TOKEN_COMMENT |
2698 | 1.02k | || tag_in (token, kStartTag, &(const TagSet) { |
2699 | 731 | TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), |
2700 | 731 | TAG(META), TAG(NOFRAMES), TAG(STYLE) |
2701 | 731 | }) |
2702 | 1.02k | ) { |
2703 | 453 | handle_in_head(parser, token); |
2704 | 453 | return; |
2705 | 453 | } |
2706 | 572 | if ( |
2707 | 572 | tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)}) |
2708 | 572 | || ( |
2709 | 429 | token->type == GUMBO_TOKEN_END_TAG |
2710 | 429 | && !tag_is(token, kEndTag, GUMBO_TAG_BR) |
2711 | 429 | ) |
2712 | 572 | ) { |
2713 | 355 | parser_add_parse_error(parser, token); |
2714 | 355 | ignore_token(parser); |
2715 | 355 | return; |
2716 | 355 | } |
2717 | 217 | parser_add_parse_error(parser, token); |
2718 | 217 | const GumboNode* node = pop_current_node(parser); |
2719 | 217 | assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); |
2720 | 0 | UNUSED_IF_NDEBUG(node); |
2721 | 217 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); |
2722 | 217 | parser->_parser_state->_reprocess_current_token = true; |
2723 | 217 | } |
2724 | | |
2725 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode |
2726 | 11.8k | static void handle_after_head(GumboParser* parser, GumboToken* token) { |
2727 | 11.8k | GumboParserState* state = parser->_parser_state; |
2728 | 11.8k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
2729 | 353 | insert_text_token(parser, token); |
2730 | 353 | return; |
2731 | 353 | } |
2732 | 11.4k | if (token->type == GUMBO_TOKEN_COMMENT) { |
2733 | 353 | append_comment_node(parser, get_current_node(parser), token); |
2734 | 353 | return; |
2735 | 353 | } |
2736 | 11.1k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
2737 | 5 | parser_add_parse_error(parser, token); |
2738 | 5 | ignore_token(parser); |
2739 | 5 | return; |
2740 | 5 | } |
2741 | 11.1k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
2742 | 21 | handle_in_body(parser, token); |
2743 | 21 | return; |
2744 | 21 | } |
2745 | 11.1k | if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) { |
2746 | 4 | insert_element_from_token(parser, token); |
2747 | 4 | set_frameset_not_ok(parser); |
2748 | 4 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); |
2749 | 4 | return; |
2750 | 4 | } |
2751 | 11.1k | if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { |
2752 | 132 | insert_element_from_token(parser, token); |
2753 | 132 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); |
2754 | 132 | return; |
2755 | 132 | } |
2756 | 10.9k | if ( |
2757 | 10.9k | tag_in(token, kStartTag, &(const TagSet) { |
2758 | 10.9k | TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META), |
2759 | 10.9k | TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) |
2760 | 10.9k | }) |
2761 | 10.9k | ) { |
2762 | 346 | parser_add_parse_error(parser, token); |
2763 | 346 | assert(state->_head_element != NULL); |
2764 | | // This must be flushed before we push the head element on, as there may be |
2765 | | // pending character tokens that should be attached to the root. |
2766 | 0 | maybe_flush_text_node_buffer(parser); |
2767 | 346 | gumbo_vector_add(state->_head_element, &state->_open_elements); |
2768 | 346 | handle_in_head(parser, token); |
2769 | 346 | gumbo_vector_remove(state->_head_element, &state->_open_elements); |
2770 | 346 | return; |
2771 | 346 | } |
2772 | 10.6k | if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { |
2773 | 0 | handle_in_head(parser, token); |
2774 | 0 | return; |
2775 | 0 | } |
2776 | 10.6k | if ( |
2777 | 10.6k | tag_is(token, kStartTag, GUMBO_TAG_HEAD) |
2778 | 10.6k | || ( |
2779 | 10.2k | token->type == GUMBO_TOKEN_END_TAG |
2780 | 10.2k | && !tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)}) |
2781 | 10.2k | ) |
2782 | 10.6k | ) { |
2783 | 666 | parser_add_parse_error(parser, token); |
2784 | 666 | ignore_token(parser); |
2785 | 666 | return; |
2786 | 666 | } |
2787 | 9.95k | insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED); |
2788 | 9.95k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); |
2789 | 9.95k | state->_reprocess_current_token = true; |
2790 | 9.95k | } |
2791 | | |
2792 | | // In-body insertion mode handler: tests token against each rule in order and returns after the first match; anything left over goes to in_body_any_other_end_tag(). parser supplies the state read via parser->_parser_state, token is the token being processed. Spec: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody |
2793 | 10.4M | static void handle_in_body(GumboParser* parser, GumboToken* token) { |
2794 | 10.4M | GumboParserState* state = parser->_parser_state; |
2795 | 10.4M | assert(state->_open_elements.length > 0); |
2796 | 10.4M | if (token->type == GUMBO_TOKEN_NULL) { |
2797 | 2.45M | parser_add_parse_error(parser, token); |
2798 | 2.45M | ignore_token(parser); |
2799 | 2.45M | return; |
2800 | 2.45M | } |
2801 | 7.98M | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
2802 | 102k | reconstruct_active_formatting_elements(parser); |
2803 | 102k | insert_text_token(parser, token); |
2804 | 102k | return; |
2805 | 102k | } |
2806 | 7.87M | if ( |
2807 | 7.87M | token->type == GUMBO_TOKEN_CHARACTER |
2808 | 7.87M | || token->type == GUMBO_TOKEN_CDATA |
2809 | 7.87M | ) { |
2810 | 6.64M | reconstruct_active_formatting_elements(parser); |
2811 | 6.64M | insert_text_token(parser, token); |
2812 | 6.64M | set_frameset_not_ok(parser); |
2813 | 6.64M | return; |
2814 | 6.64M | } |
2815 | 1.23M | if (token->type == GUMBO_TOKEN_COMMENT) { |
2816 | 86.3k | append_comment_node(parser, get_current_node(parser), token); |
2817 | 86.3k | return; |
2818 | 86.3k | } |
2819 | 1.15M | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
2820 | 5.32k | parser_add_parse_error(parser, token); |
2821 | 5.32k | ignore_token(parser); |
2822 | 5.32k | return; |
2823 | 5.32k | } |
2824 | 1.14M | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
2825 | 1.64k | parser_add_parse_error(parser, token); |
2826 | 1.64k | if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { |
2827 | 132 | ignore_token(parser); |
2828 | 132 | return; |
2829 | 132 | } |
2830 | 1.50k | assert(parser->_output->root != NULL); |
2831 | 0 | assert(parser->_output->root->type == GUMBO_NODE_ELEMENT); |
2832 | 0 | merge_attributes(token, parser->_output->root); |
2833 | 1.50k | return; |
2834 | 1.64k | } |
2835 | 1.14M | if ( |
2836 | 1.14M | tag_in(token, kStartTag, &(const TagSet) { |
2837 | 1.14M | TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), |
2838 | 1.14M | TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), |
2839 | 1.14M | TAG(TITLE) |
2840 | 1.14M | }) |
2841 | 1.14M | || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) |
2842 | 1.14M | ) { |
2843 | 4.27k | handle_in_head(parser, token); |
2844 | 4.27k | return; |
2845 | 4.27k | } |
2846 | 1.14M | if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) { |
2847 | 654 | parser_add_parse_error(parser, token); |
2848 | 654 | if ( |
2849 | 654 | state->_open_elements.length < 2 |
2850 | 654 | || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) |
2851 | 654 | || has_open_element(parser, GUMBO_TAG_TEMPLATE) |
2852 | 654 | ) { |
2853 | 448 | ignore_token(parser); |
2854 | 448 | } else { |
2855 | 206 | set_frameset_not_ok(parser); |
2856 | 206 | merge_attributes(token, state->_open_elements.data[1]); |
2857 | 206 | } |
2858 | 654 | return; |
2859 | 654 | } |
2860 | 1.14M | if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { |
2861 | 668 | parser_add_parse_error(parser, token); |
2862 | 668 | if ( |
2863 | 668 | state->_open_elements.length < 2 |
2864 | 668 | || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) |
2865 | 668 | || !state->_frameset_ok |
2866 | 668 | ) { |
2867 | 605 | ignore_token(parser); |
2868 | 605 | return; |
2869 | 605 | } |
2870 | | // Save the body node for later removal. |
2871 | 63 | GumboNode* body_node = state->_open_elements.data[1]; |
2872 | | |
2873 | | // Pop open elements until the saved body node itself has been popped; comparing against body_node avoids re-reading the vector slot after the pop. |
2874 | 63 | GumboNode* node; |
2875 | 92 | do { |
2876 | 92 | node = pop_current_node(parser); |
2877 | 92 | } while (node != body_node); |
2878 | | |
2879 | | // Removing & destroying the body node is going to kill any nodes that have |
2880 | | // been added to the list of active formatting elements, and so we should |
2881 | | // clear it to prevent a use-after-free if the list of active formatting |
2882 | | // elements is reconstructed afterwards. This may happen if whitespace |
2883 | | // follows the </frameset>. |
2884 | 63 | clear_active_formatting_elements(parser); |
2885 | | |
2886 | | // Remove the body node. We may want to factor this out into a generic |
2887 | | // helper, but right now this is the only code that needs to do this. |
2888 | 63 | GumboVector* children = &parser->_output->root->v.element.children; |
2889 | 468 | for (unsigned int i = 0; i < children->length; ++i) { |
2890 | 468 | if (children->data[i] == body_node) { |
2891 | 63 | gumbo_vector_remove_at(i, children); |
2892 | 63 | break; |
2893 | 63 | } |
2894 | 468 | } |
2895 | 63 | destroy_node(body_node); |
2896 | | |
2897 | | // Insert the <frameset>, and switch the insertion mode. |
2898 | 63 | insert_element_from_token(parser, token); |
2899 | 63 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); |
2900 | 63 | return; |
2901 | 668 | } |
2902 | 1.14M | if (token->type == GUMBO_TOKEN_EOF) { |
2903 | 11.4k | if (get_current_template_insertion_mode(parser) != |
2904 | 11.4k | GUMBO_INSERTION_MODE_INITIAL) { |
2905 | 1.61k | handle_in_template(parser, token); |
2906 | 1.61k | return; |
2907 | 1.61k | } |
2908 | 9.85k | if (stack_contains_nonclosable_element(parser)) |
2909 | 2.70k | parser_add_parse_error(parser, token); |
2910 | 9.85k | return; |
2911 | 11.4k | } |
2912 | 1.12M | if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) { |
2913 | 408 | if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) { |
2914 | 68 | parser_add_parse_error(parser, token); |
2915 | 68 | ignore_token(parser); |
2916 | 68 | return; |
2917 | 68 | } |
2918 | 340 | if (stack_contains_nonclosable_element(parser)) |
2919 | 190 | parser_add_parse_error(parser, token); |
2920 | 340 | GumboNode* body = state->_open_elements.data[1]; |
2921 | 340 | assert(node_html_tag_is(body, GUMBO_TAG_BODY)); |
2922 | 0 | record_end_of_element(state->_current_token, &body->v.element); |
2923 | 340 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY); |
2924 | 340 | return; |
2925 | 408 | } |
2926 | 1.12M | if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { |
2927 | 798 | if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) { |
2928 | 93 | parser_add_parse_error(parser, token); |
2929 | 93 | ignore_token(parser); |
2930 | 93 | return; |
2931 | 93 | } |
2932 | 705 | if (stack_contains_nonclosable_element(parser)) |
2933 | 523 | parser_add_parse_error(parser, token); |
2934 | 705 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY); |
2935 | 705 | state->_reprocess_current_token = true; |
2936 | 705 | return; |
2937 | 798 | } |
2938 | 1.12M | if ( |
2939 | 1.12M | tag_in(token, kStartTag, &(const TagSet) { |
2940 | 1.12M | TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER), |
2941 | 1.12M | TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), |
2942 | 1.12M | TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), |
2943 | 1.12M | TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION), |
2944 | 1.12M | TAG(SUMMARY), TAG(UL), TAG(SEARCH) |
2945 | 1.12M | }) |
2946 | 1.12M | ) { |
2947 | 552k | maybe_implicitly_close_p_tag(parser, token); |
2948 | 552k | insert_element_from_token(parser, token); |
2949 | 552k | return; |
2950 | 552k | } |
2951 | 575k | if (tag_in(token, kStartTag, &heading_tags)) { |
2952 | 3.05k | maybe_implicitly_close_p_tag(parser, token); |
2953 | 3.05k | if (node_tag_in_set(get_current_node(parser), &heading_tags)) { |
2954 | 297 | parser_add_parse_error(parser, token); |
2955 | 297 | pop_current_node(parser); |
2956 | 297 | } |
2957 | 3.05k | insert_element_from_token(parser, token); |
2958 | 3.05k | return; |
2959 | 3.05k | } |
2960 | 572k | if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) { |
2961 | 707 | maybe_implicitly_close_p_tag(parser, token); |
2962 | 707 | insert_element_from_token(parser, token); |
2963 | 707 | state->_ignore_next_linefeed = true; |
2964 | 707 | set_frameset_not_ok(parser); |
2965 | 707 | return; |
2966 | 707 | } |
2967 | 571k | if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { |
2968 | 961 | if ( |
2969 | 961 | state->_form_element != NULL |
2970 | 961 | && !has_open_element(parser, GUMBO_TAG_TEMPLATE) |
2971 | 961 | ) { |
2972 | 477 | gumbo_debug("Ignoring nested form.\n"); |
2973 | 477 | parser_add_parse_error(parser, token); |
2974 | 477 | ignore_token(parser); |
2975 | 477 | return; |
2976 | 477 | } |
2977 | 484 | maybe_implicitly_close_p_tag(parser, token); |
2978 | 484 | GumboNode* form_element = insert_element_from_token(parser, token); |
2979 | 484 | if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { |
2980 | 246 | state->_form_element = form_element; |
2981 | 246 | } |
2982 | 484 | return; |
2983 | 961 | } |
2984 | 570k | if (tag_is(token, kStartTag, GUMBO_TAG_LI)) { |
2985 | 2.09k | maybe_implicitly_close_list_tag(parser, token, true); |
2986 | 2.09k | maybe_implicitly_close_p_tag(parser, token); |
2987 | 2.09k | insert_element_from_token(parser, token); |
2988 | 2.09k | return; |
2989 | 2.09k | } |
2990 | 568k | if (tag_in(token, kStartTag, &dd_dt_tags)) { |
2991 | 914 | maybe_implicitly_close_list_tag(parser, token, false); |
2992 | 914 | maybe_implicitly_close_p_tag(parser, token); |
2993 | 914 | insert_element_from_token(parser, token); |
2994 | 914 | return; |
2995 | 914 | } |
2996 | 567k | if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) { |
2997 | 45 | maybe_implicitly_close_p_tag(parser, token); |
2998 | 45 | insert_element_from_token(parser, token); |
2999 | 45 | gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT); |
3000 | 45 | return; |
3001 | 45 | } |
3002 | 567k | if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) { |
3003 | 137 | if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) { |
3004 | 55 | parser_add_parse_error(parser, token); |
3005 | | // We don't want to use implicitly_close_tags here because it may add an |
3006 | | // error and we've already added the only error the standard specifies. |
3007 | 55 | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
3008 | 170 | while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON)) |
3009 | 115 | ; |
3010 | 55 | } |
3011 | 137 | reconstruct_active_formatting_elements(parser); |
3012 | 137 | insert_element_from_token(parser, token); |
3013 | 137 | set_frameset_not_ok(parser); |
3014 | 137 | return; |
3015 | 137 | } |
3016 | 567k | if ( |
3017 | 567k | tag_in(token, kEndTag, &(const TagSet) { |
3018 | 567k | TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON), |
3019 | 567k | TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), |
3020 | 567k | TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), |
3021 | 567k | TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), |
3022 | 567k | TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL), TAG(SEARCH) |
3023 | 567k | }) |
3024 | 567k | ) { |
3025 | 291 | GumboTag tag = token->v.end_tag.tag; |
3026 | 291 | if (!has_an_element_in_scope(parser, tag)) { |
3027 | 83 | parser_add_parse_error(parser, token); |
3028 | 83 | ignore_token(parser); |
3029 | 83 | return; |
3030 | 83 | } |
3031 | 208 | implicitly_close_tags ( |
3032 | 208 | parser, token, |
3033 | 208 | GUMBO_NAMESPACE_HTML, |
3034 | 208 | token->v.end_tag.tag |
3035 | 208 | ); |
3036 | 208 | return; |
3037 | 291 | } |
3038 | 567k | if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) { |
3039 | 795 | if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { |
3040 | 267 | if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) { |
3041 | 184 | parser_add_parse_error(parser, token); |
3042 | 184 | ignore_token(parser); |
3043 | 184 | return; |
3044 | 184 | } |
3045 | 83 | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
3046 | 83 | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) |
3047 | 22 | parser_add_parse_error(parser, token); |
3048 | 231 | while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM)) |
3049 | 148 | ; |
3050 | 83 | return; |
3051 | 528 | } else { |
3052 | 528 | GumboNode* node = state->_form_element; |
3053 | 528 | assert(!node || node->type == GUMBO_NODE_ELEMENT); |
3054 | 0 | state->_form_element = NULL; |
3055 | 528 | if (!node || !has_node_in_scope(parser, node)) { |
3056 | 391 | gumbo_debug("Closing an unopened form.\n"); |
3057 | 391 | parser_add_parse_error(parser, token); |
3058 | 391 | ignore_token(parser); |
3059 | 391 | return; |
3060 | 391 | } |
3061 | | // Since we remove the form node without popping, we need to make sure |
3062 | | // that we flush any text nodes at the end of the form. |
3063 | 137 | maybe_flush_text_node_buffer(parser); |
3064 | | // This differs from implicitly_close_tags because we remove *only* the |
3065 | | // <form> element; other nodes are left in scope. |
3066 | 137 | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
3067 | 137 | if (get_current_node(parser) != node) |
3068 | 34 | parser_add_parse_error(parser, token); |
3069 | 103 | else |
3070 | 103 | record_end_of_element(token, &node->v.element); |
3071 | | |
3072 | 137 | GumboVector* open_elements = &state->_open_elements; |
3073 | 137 | int index = gumbo_vector_index_of(open_elements, node); |
3074 | 137 | assert(index >= 0); |
3075 | 0 | gumbo_vector_remove_at(index, open_elements); |
3076 | 137 | return; |
3077 | 528 | } |
3078 | 795 | } |
3079 | 566k | if (tag_is(token, kEndTag, GUMBO_TAG_P)) { |
3080 | 2.32k | if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { |
3081 | 2.09k | parser_add_parse_error(parser, token); |
3082 | | // A <p> element is synthesized for the stray end tag so it can be closed below; this branch does not reconstruct the active formatting elements first. |
3083 | 2.09k | insert_element_of_tag_type ( |
3084 | 2.09k | parser, |
3085 | 2.09k | GUMBO_TAG_P, |
3086 | 2.09k | GUMBO_INSERTION_CONVERTED_FROM_END_TAG |
3087 | 2.09k | ); |
3088 | 2.09k | } |
3089 | 2.32k | implicitly_close_tags ( |
3090 | 2.32k | parser, |
3091 | 2.32k | token, |
3092 | 2.32k | GUMBO_NAMESPACE_HTML, |
3093 | 2.32k | GUMBO_TAG_P |
3094 | 2.32k | ); |
3095 | 2.32k | return; |
3096 | 2.32k | } |
3097 | 564k | if (tag_is(token, kEndTag, GUMBO_TAG_LI)) { |
3098 | 311 | if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) { |
3099 | 194 | parser_add_parse_error(parser, token); |
3100 | 194 | ignore_token(parser); |
3101 | 194 | return; |
3102 | 194 | } |
3103 | 117 | implicitly_close_tags ( |
3104 | 117 | parser, |
3105 | 117 | token, |
3106 | 117 | GUMBO_NAMESPACE_HTML, |
3107 | 117 | GUMBO_TAG_LI |
3108 | 117 | ); |
3109 | 117 | return; |
3110 | 311 | } |
3111 | 563k | if (tag_in(token, kEndTag, &dd_dt_tags)) { |
3112 | 491 | GumboTag token_tag = token->v.end_tag.tag; |
3113 | 491 | if (!has_an_element_in_scope(parser, token_tag)) { |
3114 | 445 | parser_add_parse_error(parser, token); |
3115 | 445 | ignore_token(parser); |
3116 | 445 | return; |
3117 | 445 | } |
3118 | 46 | implicitly_close_tags ( |
3119 | 46 | parser, |
3120 | 46 | token, |
3121 | 46 | GUMBO_NAMESPACE_HTML, |
3122 | 46 | token_tag |
3123 | 46 | ); |
3124 | 46 | return; |
3125 | 491 | } |
3126 | 563k | if (tag_in(token, kEndTag, &heading_tags)) { |
3127 | 2.55k | if ( |
3128 | 2.55k | !has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) { |
3129 | 2.55k | GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4, |
3130 | 2.55k | GUMBO_TAG_H5, GUMBO_TAG_H6 |
3131 | 2.55k | }) |
3132 | 2.55k | ) { |
3133 | | // No heading open; ignore the token entirely. |
3134 | 579 | parser_add_parse_error(parser, token); |
3135 | 579 | ignore_token(parser); |
3136 | 579 | return; |
3137 | 579 | } |
3138 | 1.97k | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
3139 | 1.97k | const GumboNode* current_node = get_current_node(parser); |
3140 | 1.97k | if (!node_html_tag_is(current_node, token->v.end_tag.tag)) { |
3141 | | // There're children of the heading currently open; close them below and |
3142 | | // record a parse error. |
3143 | | // TODO(jdtang): Add a way to distinguish this error case from the one |
3144 | | // above. |
3145 | 1.74k | parser_add_parse_error(parser, token); |
3146 | 1.74k | } |
3147 | 3.00k | do { |
3148 | 3.00k | current_node = pop_current_node(parser); |
3149 | 3.00k | } while (!node_tag_in_set(current_node, &heading_tags)); |
3150 | 1.97k | return; |
3151 | 2.55k | } |
3152 | 560k | if (tag_is(token, kStartTag, GUMBO_TAG_A)) { |
3153 | 63.9k | int last_a; |
3154 | 63.9k | int has_matching_a = find_last_anchor_index(parser, &last_a); |
3155 | 63.9k | if (has_matching_a) { |
3156 | 26.0k | assert(has_matching_a == 1); |
3157 | 0 | parser_add_parse_error(parser, token); |
3158 | 26.0k | (void)adoption_agency_algorithm(parser, token); |
3159 | | // The adoption agency algorithm usually removes all instances of <a> |
3160 | | // from the list of active formatting elements, but in case it doesn't, |
3161 | | // we're supposed to do this. (The conditions where it might not are |
3162 | | // listed in the spec.) |
3163 | 26.0k | if (find_last_anchor_index(parser, &last_a)) { |
3164 | 883 | void* last_element = gumbo_vector_remove_at ( |
3165 | 883 | last_a, |
3166 | 883 | &state->_active_formatting_elements |
3167 | 883 | ); |
3168 | 883 | gumbo_vector_remove(last_element, &state->_open_elements); |
3169 | 883 | } |
3170 | 26.0k | } |
3171 | 0 | reconstruct_active_formatting_elements(parser); |
3172 | 63.9k | add_formatting_element(parser, insert_element_from_token(parser, token)); |
3173 | 63.9k | return; |
3174 | 63.9k | } |
3175 | 496k | if ( |
3176 | 496k | tag_in(token, kStartTag, &(const TagSet) { |
3177 | 496k | TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S), |
3178 | 496k | TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U) |
3179 | 496k | }) |
3180 | 496k | ) { |
3181 | 229k | reconstruct_active_formatting_elements(parser); |
3182 | 229k | add_formatting_element(parser, insert_element_from_token(parser, token)); |
3183 | 229k | return; |
3184 | 229k | } |
3185 | 267k | if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) { |
3186 | 2.52k | reconstruct_active_formatting_elements(parser); |
3187 | 2.52k | if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) { |
3188 | 1.82k | parser_add_parse_error(parser, token); |
3189 | 1.82k | adoption_agency_algorithm(parser, token); |
3190 | 1.82k | reconstruct_active_formatting_elements(parser); |
3191 | 1.82k | } |
3192 | 2.52k | insert_element_from_token(parser, token); |
3193 | 2.52k | add_formatting_element(parser, get_current_node(parser)); |
3194 | 2.52k | return; |
3195 | 2.52k | } |
3196 | 265k | if ( |
3197 | 265k | tag_in(token, kEndTag, &(const TagSet) { |
3198 | 265k | TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), |
3199 | 265k | TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), |
3200 | 265k | TAG(U) |
3201 | 265k | }) |
3202 | 265k | ) { |
3203 | 2.96k | adoption_agency_algorithm(parser, token); |
3204 | 2.96k | return; |
3205 | 2.96k | } |
3206 | 262k | if ( |
3207 | 262k | tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)}) |
3208 | 262k | ) { |
3209 | 64.0k | reconstruct_active_formatting_elements(parser); |
3210 | 64.0k | insert_element_from_token(parser, token); |
3211 | 64.0k | add_formatting_element(parser, &kActiveFormattingScopeMarker); |
3212 | 64.0k | set_frameset_not_ok(parser); |
3213 | 64.0k | return; |
3214 | 64.0k | } |
3215 | 197k | if ( |
3216 | 197k | tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)}) |
3217 | 197k | ) { |
3218 | 129 | GumboTag token_tag = token->v.end_tag.tag; |
3219 | 129 | if (!has_an_element_in_scope(parser, token_tag)) { |
3220 | 34 | parser_add_parse_error(parser, token); |
3221 | 34 | ignore_token(parser); |
3222 | 34 | return; |
3223 | 34 | } |
3224 | 95 | implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag); |
3225 | 95 | clear_active_formatting_elements(parser); |
3226 | 95 | return; |
3227 | 129 | } |
3228 | 197k | if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) { |
3229 | 8.31k | if ( |
3230 | 8.31k | get_document_node(parser)->v.document.doc_type_quirks_mode |
3231 | 8.31k | != GUMBO_DOCTYPE_QUIRKS |
3232 | 8.31k | ) { |
3233 | 91 | maybe_implicitly_close_p_tag(parser, token); |
3234 | 91 | } |
3235 | 8.31k | insert_element_from_token(parser, token); |
3236 | 8.31k | set_frameset_not_ok(parser); |
3237 | 8.31k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
3238 | 8.31k | return; |
3239 | 8.31k | } |
3240 | 189k | if (tag_is(token, kEndTag, GUMBO_TAG_BR)) { |
3241 | 290 | parser_add_parse_error(parser, token); |
3242 | 290 | reconstruct_active_formatting_elements(parser); |
3243 | 290 | insert_element_of_tag_type ( |
3244 | 290 | parser, |
3245 | 290 | GUMBO_TAG_BR, |
3246 | 290 | GUMBO_INSERTION_CONVERTED_FROM_END_TAG |
3247 | 290 | ); |
3248 | 290 | pop_current_node(parser); |
3249 | 290 | acknowledge_self_closing_tag(parser); |
3250 | 290 | set_frameset_not_ok(parser); |
3251 | 290 | return; |
3252 | 290 | } |
3253 | 189k | if ( |
3254 | 189k | tag_in(token, kStartTag, &(const TagSet) { |
3255 | 189k | TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN), |
3256 | 189k | TAG(WBR) |
3257 | 189k | }) |
3258 | 189k | ) { |
3259 | 397 | bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE); |
3260 | 397 | if (is_image) { |
3261 | 58 | parser_add_parse_error(parser, token); |
3262 | 58 | token->v.start_tag.tag = GUMBO_TAG_IMG; |
3263 | 58 | } |
3264 | 397 | reconstruct_active_formatting_elements(parser); |
3265 | 397 | GumboNode* node = insert_element_from_token(parser, token); |
3266 | 397 | if (is_image) |
3267 | 58 | node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE; |
3268 | 397 | pop_current_node(parser); |
3269 | 397 | acknowledge_self_closing_tag(parser); |
3270 | 397 | set_frameset_not_ok(parser); |
3271 | 397 | return; |
3272 | 397 | } |
3273 | 188k | if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) { |
3274 | 1.12k | reconstruct_active_formatting_elements(parser); |
3275 | 1.12k | GumboNode *input = insert_element_from_token(parser, token); |
3276 | 1.12k | pop_current_node(parser); |
3277 | 1.12k | acknowledge_self_closing_tag(parser); |
3278 | 1.12k | if (!attribute_matches(&input->v.element.attributes, "type", "hidden")) |
3279 | 969 | set_frameset_not_ok(parser); |
3280 | 1.12k | return; |
3281 | 1.12k | } |
3282 | 187k | if ( |
3283 | 187k | tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)}) |
3284 | 187k | ) { |
3285 | 256 | insert_element_from_token(parser, token); |
3286 | 256 | pop_current_node(parser); |
3287 | 256 | acknowledge_self_closing_tag(parser); |
3288 | 256 | return; |
3289 | 256 | } |
3290 | 187k | if (tag_is(token, kStartTag, GUMBO_TAG_HR)) { |
3291 | 475 | maybe_implicitly_close_p_tag(parser, token); |
3292 | 475 | insert_element_from_token(parser, token); |
3293 | 475 | pop_current_node(parser); |
3294 | 475 | acknowledge_self_closing_tag(parser); |
3295 | 475 | set_frameset_not_ok(parser); |
3296 | 475 | return; |
3297 | 475 | } |
3298 | 186k | if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) { |
3299 | 78 | run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); |
3300 | 78 | state->_ignore_next_linefeed = true; |
3301 | 78 | set_frameset_not_ok(parser); |
3302 | 78 | return; |
3303 | 78 | } |
3304 | 186k | if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) { |
3305 | 494 | maybe_implicitly_close_p_tag(parser, token); |
3306 | 494 | reconstruct_active_formatting_elements(parser); |
3307 | 494 | set_frameset_not_ok(parser); |
3308 | 494 | run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); |
3309 | 494 | return; |
3310 | 494 | } |
3311 | 186k | if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) { |
3312 | 36 | set_frameset_not_ok(parser); |
3313 | 36 | run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); |
3314 | 36 | return; |
3315 | 36 | } |
3316 | 186k | if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) { |
3317 | 101 | run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); |
3318 | 101 | return; |
3319 | 101 | } |
3320 | 186k | if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) { |
3321 | 11.0k | reconstruct_active_formatting_elements(parser); |
3322 | 11.0k | insert_element_from_token(parser, token); |
3323 | 11.0k | set_frameset_not_ok(parser); |
3324 | 11.0k | GumboInsertionMode mode = state->_insertion_mode; |
3325 | 11.0k | if ( |
3326 | 11.0k | mode == GUMBO_INSERTION_MODE_IN_TABLE |
3327 | 11.0k | || mode == GUMBO_INSERTION_MODE_IN_CAPTION |
3328 | 11.0k | || mode == GUMBO_INSERTION_MODE_IN_TABLE_BODY |
3329 | 11.0k | || mode == GUMBO_INSERTION_MODE_IN_ROW |
3330 | 11.0k | || mode == GUMBO_INSERTION_MODE_IN_CELL |
3331 | 11.0k | ) { |
3332 | 8.58k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE); |
3333 | 8.58k | } else { |
3334 | 2.50k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT); |
3335 | 2.50k | } |
3336 | 11.0k | return; |
3337 | 11.0k | } |
3338 | 175k | if ( |
3339 | 175k | tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)}) |
3340 | 175k | ) { |
3341 | 6.63k | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { |
3342 | 1.68k | pop_current_node(parser); |
3343 | 1.68k | } |
3344 | 6.63k | reconstruct_active_formatting_elements(parser); |
3345 | 6.63k | insert_element_from_token(parser, token); |
3346 | 6.63k | return; |
3347 | 6.63k | } |
3348 | 168k | if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) { |
3349 | 1.39k | if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) { |
3350 | 265 | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
3351 | 265 | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) |
3352 | 97 | parser_add_parse_error(parser, token); |
3353 | 265 | } |
3354 | 1.39k | insert_element_from_token(parser, token); |
3355 | 1.39k | return; |
3356 | 1.39k | } |
3357 | 167k | if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) { |
3358 | 1.02k | if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) { |
3359 | 76 | generate_implied_end_tags(parser, GUMBO_TAG_RTC, NULL); |
3360 | 76 | GumboNode* current = get_current_node(parser); |
3361 | 76 | if (!node_html_tag_is(current, GUMBO_TAG_RUBY) && |
3362 | 76 | !node_html_tag_is(current, GUMBO_TAG_RTC)) { |
3363 | 16 | parser_add_parse_error(parser, token); |
3364 | 16 | } |
3365 | 76 | } |
3366 | 1.02k | insert_element_from_token(parser, token); |
3367 | 1.02k | return; |
3368 | 1.02k | } |
3369 | 166k | if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) { |
3370 | 1.20k | reconstruct_active_formatting_elements(parser); |
3371 | 1.20k | adjust_mathml_attributes(token); |
3372 | 1.20k | adjust_foreign_attributes(token); |
3373 | 1.20k | insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML); |
3374 | 1.20k | if (token->v.start_tag.is_self_closing) { |
3375 | 174 | pop_current_node(parser); |
3376 | 174 | acknowledge_self_closing_tag(parser); |
3377 | 174 | } |
3378 | 1.20k | return; |
3379 | 1.20k | } |
3380 | 164k | if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) { |
3381 | 2.08k | reconstruct_active_formatting_elements(parser); |
3382 | 2.08k | adjust_svg_attributes(token); |
3383 | 2.08k | adjust_foreign_attributes(token); |
3384 | 2.08k | insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG); |
3385 | 2.08k | if (token->v.start_tag.is_self_closing) { |
3386 | 67 | pop_current_node(parser); |
3387 | 67 | acknowledge_self_closing_tag(parser); |
3388 | 67 | } |
3389 | 2.08k | return; |
3390 | 2.08k | } |
3391 | 162k | if ( |
3392 | 162k | tag_in(token, kStartTag, &(const TagSet) { |
3393 | 162k | TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD), |
3394 | 162k | TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) |
3395 | 162k | }) |
3396 | 162k | ) { |
3397 | 2.47k | parser_add_parse_error(parser, token); |
3398 | 2.47k | ignore_token(parser); |
3399 | 2.47k | return; |
3400 | 2.47k | } |
3401 | 160k | if (token->type == GUMBO_TOKEN_START_TAG) { |
3402 | 152k | reconstruct_active_formatting_elements(parser); |
3403 | 152k | insert_element_from_token(parser, token); |
3404 | 152k | return; |
3405 | 152k | } |
3406 | 7.63k | in_body_any_other_end_tag(parser, token); |
3407 | 7.63k | } |
3408 | | |
3409 | | // Text insertion mode handler: character and whitespace tokens are inserted as text; any other token pops the current node and restores _original_insertion_mode, and an EOF token is additionally recorded as a parse error and flagged for reprocessing. Spec: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata |
3410 | 1.83M | static void handle_text(GumboParser* parser, GumboToken* token) { |
3411 | 1.83M | if ( |
3412 | 1.83M | token->type == GUMBO_TOKEN_CHARACTER |
3413 | 1.83M | || token->type == GUMBO_TOKEN_WHITESPACE |
3414 | 1.83M | ) { |
3415 | 1.83M | insert_text_token(parser, token); |
3416 | 1.83M | return; |
3417 | 1.83M | } |
3418 | | // We provide only bare-bones script handling that doesn't involve any of |
3419 | | // the parser-pause/already-started/script-nesting flags or re-entrant |
3420 | | // invocations of the tokenizer. Because the intended usage of this library |
3421 | | // is mostly for templating, refactoring, and static-analysis libraries, we |
3422 | | // provide the script body as a text-node child of the <script> element. |
3423 | | // This behavior doesn't support document.write of partial HTML elements, |
3424 | | // but should be adequate for almost all other scripting support. |
3425 | 2.91k | if (token->type == GUMBO_TOKEN_EOF) { |
3426 | 1.03k | parser_add_parse_error(parser, token); |
3427 | 1.03k | parser->_parser_state->_reprocess_current_token = true; |
3428 | 1.03k | } |
3429 | 2.91k | pop_current_node(parser); |
3430 | 2.91k | set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode); |
3431 | 2.91k | } |
3432 | | |
3433 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable |
3434 | 2.35M | static void handle_in_table(GumboParser* parser, GumboToken* token) { |
3435 | 2.35M | GumboParserState* state = parser->_parser_state; |
3436 | 2.35M | if ( |
3437 | 2.35M | (token->type == GUMBO_TOKEN_CHARACTER |
3438 | 2.35M | || token->type == GUMBO_TOKEN_WHITESPACE |
3439 | 2.35M | || token->type == GUMBO_TOKEN_NULL) |
3440 | 2.35M | && node_tag_in_set(get_current_node(parser), &(const TagSet) { |
3441 | 1.53M | TAG(TABLE), TAG(TBODY), TAG(TEMPLATE), TAG(TFOOT), TAG(THEAD), TAG(TR) |
3442 | 1.53M | }) |
3443 | 2.35M | ) { |
3444 | | // The "pending table character tokens" list described in the spec is |
3445 | | // nothing more than the TextNodeBufferState. We accumulate text tokens as |
3446 | | // normal, except that when we go to flush them in the handle_in_table_text, |
3447 | | // we set _foster_parent_insertions if there're non-whitespace characters in |
3448 | | // the buffer. |
3449 | 11.3k | assert(state->_text_node._buffer.length == 0); |
3450 | 0 | assert(state->_table_character_tokens.length == 0); |
3451 | 0 | state->_original_insertion_mode = state->_insertion_mode; |
3452 | 11.3k | state->_reprocess_current_token = true; |
3453 | 11.3k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT); |
3454 | 11.3k | return; |
3455 | 11.3k | } |
3456 | 2.34M | if (token->type == GUMBO_TOKEN_COMMENT) { |
3457 | 4.38k | append_comment_node(parser, get_current_node(parser), token); |
3458 | 4.38k | return; |
3459 | 4.38k | } |
3460 | 2.34M | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
3461 | 465 | parser_add_parse_error(parser, token); |
3462 | 465 | ignore_token(parser); |
3463 | 465 | return; |
3464 | 465 | } |
3465 | 2.34M | if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) { |
3466 | 1.53k | clear_stack_to_table_context(parser); |
3467 | 1.53k | add_formatting_element(parser, &kActiveFormattingScopeMarker); |
3468 | 1.53k | insert_element_from_token(parser, token); |
3469 | 1.53k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION); |
3470 | 1.53k | return; |
3471 | 1.53k | } |
3472 | 2.34M | if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) { |
3473 | 773 | clear_stack_to_table_context(parser); |
3474 | 773 | insert_element_from_token(parser, token); |
3475 | 773 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); |
3476 | 773 | return; |
3477 | 773 | } |
3478 | 2.33M | if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { |
3479 | 722 | clear_stack_to_table_context(parser); |
3480 | 722 | insert_element_of_tag_type ( |
3481 | 722 | parser, |
3482 | 722 | GUMBO_TAG_COLGROUP, |
3483 | 722 | GUMBO_INSERTION_IMPLIED |
3484 | 722 | ); |
3485 | 722 | state->_reprocess_current_token = true; |
3486 | 722 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); |
3487 | 722 | return; |
3488 | 722 | } |
3489 | 2.33M | if ( |
3490 | 2.33M | tag_in(token, kStartTag, &(const TagSet) { |
3491 | 2.33M | TAG(TBODY), TAG(TFOOT), TAG(THEAD) |
3492 | 2.33M | }) |
3493 | 2.33M | ) { |
3494 | 3.76k | clear_stack_to_table_context(parser); |
3495 | 3.76k | insert_element_from_token(parser, token); |
3496 | 3.76k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); |
3497 | 3.76k | return; |
3498 | 3.76k | } |
3499 | 2.33M | if ( |
3500 | 2.33M | tag_in(token, kStartTag, &(const TagSet) { |
3501 | 2.33M | TAG(TD), TAG(TH), TAG(TR) |
3502 | 2.33M | }) |
3503 | 2.33M | ) { |
3504 | 1.81k | clear_stack_to_table_context(parser); |
3505 | 1.81k | insert_element_of_tag_type ( |
3506 | 1.81k | parser, |
3507 | 1.81k | GUMBO_TAG_TBODY, |
3508 | 1.81k | GUMBO_INSERTION_IMPLIED |
3509 | 1.81k | ); |
3510 | 1.81k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); |
3511 | 1.81k | state->_reprocess_current_token = true; |
3512 | 1.81k | return; |
3513 | 1.81k | } |
3514 | 2.33M | if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) { |
3515 | 6.82k | parser_add_parse_error(parser, token); |
3516 | 6.82k | if (close_table(parser)) { |
3517 | 6.64k | state->_reprocess_current_token = true; |
3518 | 6.64k | } else { |
3519 | 178 | ignore_token(parser); |
3520 | 178 | } |
3521 | 6.82k | return; |
3522 | 6.82k | } |
3523 | 2.32M | if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { |
3524 | 493 | if (!close_table(parser)) { |
3525 | 26 | parser_add_parse_error(parser, token); |
3526 | 26 | return; |
3527 | 26 | } |
3528 | 467 | return; |
3529 | 493 | } |
3530 | 2.32M | if ( |
3531 | 2.32M | tag_in(token, kEndTag, &(const TagSet) { |
3532 | 2.32M | TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), |
3533 | 2.32M | TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) |
3534 | 2.32M | }) |
3535 | 2.32M | ) { |
3536 | 2.19k | parser_add_parse_error(parser, token); |
3537 | 2.19k | ignore_token(parser); |
3538 | 2.19k | return; |
3539 | 2.19k | } |
3540 | 2.32M | if ( |
3541 | 2.32M | tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) |
3542 | 2.32M | || (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) |
3543 | 2.32M | ) { |
3544 | 883 | handle_in_head(parser, token); |
3545 | 883 | return; |
3546 | 883 | } |
3547 | 2.32M | if ( |
3548 | 2.32M | tag_is(token, kStartTag, GUMBO_TAG_INPUT) |
3549 | 2.32M | && attribute_matches(&token->v.start_tag.attributes, "type", "hidden") |
3550 | 2.32M | ) { |
3551 | 59 | parser_add_parse_error(parser, token); |
3552 | 59 | insert_element_from_token(parser, token); |
3553 | 59 | pop_current_node(parser); |
3554 | 59 | acknowledge_self_closing_tag(parser); |
3555 | 59 | return; |
3556 | 59 | } |
3557 | 2.32M | if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { |
3558 | 9.23k | parser_add_parse_error(parser, token); |
3559 | 9.23k | if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) { |
3560 | 9.08k | ignore_token(parser); |
3561 | 9.08k | return; |
3562 | 9.08k | } |
3563 | 143 | state->_form_element = insert_element_from_token(parser, token); |
3564 | 143 | pop_current_node(parser); |
3565 | 143 | return; |
3566 | 9.23k | } |
3567 | 2.31M | if (token->type == GUMBO_TOKEN_EOF) { |
3568 | 1.04k | handle_in_body(parser, token); |
3569 | 1.04k | return; |
3570 | 1.04k | } |
3571 | | // foster-parenting-start-tag or foster-parenting-end-tag error |
3572 | 2.31M | parser_add_parse_error(parser, token); |
3573 | 2.31M | state->_foster_parent_insertions = true; |
3574 | 2.31M | handle_in_body(parser, token); |
3575 | 2.31M | state->_foster_parent_insertions = false; |
3576 | 2.31M | } |
3577 | | |
3578 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext |
3579 | 420k | static void handle_in_table_text(GumboParser* parser, GumboToken* token) { |
3580 | 420k | if (token->type == GUMBO_TOKEN_NULL) { |
3581 | 12.6k | parser_add_parse_error(parser, token); |
3582 | 12.6k | ignore_token(parser); |
3583 | 12.6k | return; |
3584 | 12.6k | } |
3585 | 407k | GumboParserState* state = parser->_parser_state; |
3586 | | // Non-whitespace tokens will cause parse errors later. |
3587 | | // It's not entirely clear from the spec how this is supposed to work. |
3588 | | // https://github.com/whatwg/html/issues/4046 |
3589 | 407k | if (token->type == GUMBO_TOKEN_WHITESPACE |
3590 | 407k | || token->type == GUMBO_TOKEN_CHARACTER) { |
3591 | 396k | insert_text_token(parser, token); |
3592 | 396k | gumbo_character_token_buffer_append(token, &state->_table_character_tokens); |
3593 | 396k | return; |
3594 | 396k | } |
3595 | | |
3596 | 11.3k | GumboCharacterTokenBuffer* buffer = &state->_table_character_tokens; |
3597 | 11.3k | if (state->_text_node._type != GUMBO_NODE_WHITESPACE) { |
3598 | | // Each character in buffer is an error. Unfortunately, that means we need |
3599 | | // to emit a bunch of errors at the appropriate locations. |
3600 | 406k | for (size_t i = 0, n = buffer->length; i < n; ++i) { |
3601 | 395k | GumboToken tok; |
3602 | 395k | gumbo_character_token_buffer_get(buffer, i, &tok); |
3603 | | // foster-parenting-character error |
3604 | 395k | parser_add_parse_error(parser, &tok); |
3605 | 395k | } |
3606 | 10.8k | state->_foster_parent_insertions = true; |
3607 | 10.8k | set_frameset_not_ok(parser); |
3608 | 10.8k | reconstruct_active_formatting_elements(parser); |
3609 | 10.8k | } |
3610 | 11.3k | maybe_flush_text_node_buffer(parser); |
3611 | 11.3k | gumbo_character_token_buffer_clear(buffer); |
3612 | 11.3k | state->_foster_parent_insertions = false; |
3613 | 11.3k | state->_reprocess_current_token = true; |
3614 | 11.3k | state->_insertion_mode = state->_original_insertion_mode; |
3615 | 11.3k | } |
3616 | | |
3617 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption |
3618 | 26.6k | static void handle_in_caption(GumboParser* parser, GumboToken* token) { |
3619 | 26.6k | if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) { |
3620 | 745 | if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { |
3621 | 0 | parser_add_parse_error(parser, token); |
3622 | 0 | ignore_token(parser); |
3623 | 0 | return; |
3624 | 0 | } |
3625 | 745 | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
3626 | 745 | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) |
3627 | 708 | parser_add_parse_error(parser, token); |
3628 | 2.07k | while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION)) |
3629 | 1.33k | ; |
3630 | 745 | clear_active_formatting_elements(parser); |
3631 | 745 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
3632 | 745 | return; |
3633 | 745 | } |
3634 | 25.9k | if ( |
3635 | 25.9k | tag_in(token, kStartTag, &(const TagSet) { |
3636 | 25.9k | TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD), |
3637 | 25.9k | TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) |
3638 | 25.9k | }) |
3639 | 25.9k | || (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) |
3640 | 25.9k | ) { |
3641 | 697 | if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { |
3642 | 0 | parser_add_parse_error(parser, token); |
3643 | 0 | ignore_token(parser); |
3644 | 0 | return; |
3645 | 0 | } |
3646 | 697 | generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL); |
3647 | 697 | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) |
3648 | 404 | parser_add_parse_error(parser, token); |
3649 | 2.14k | while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION)) |
3650 | 1.44k | ; |
3651 | 697 | clear_active_formatting_elements(parser); |
3652 | 697 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
3653 | 697 | parser->_parser_state->_reprocess_current_token = true; |
3654 | 697 | return; |
3655 | 697 | } |
3656 | 25.2k | if ( |
3657 | 25.2k | tag_in(token, kEndTag, &(const TagSet) { |
3658 | 25.2k | TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), |
3659 | 25.2k | TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) |
3660 | 25.2k | }) |
3661 | 25.2k | ) { |
3662 | 47 | parser_add_parse_error(parser, token); |
3663 | 47 | ignore_token(parser); |
3664 | 47 | return; |
3665 | 47 | } |
3666 | 25.1k | handle_in_body(parser, token); |
3667 | 25.1k | } |
3668 | | |
3669 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup |
3670 | 54.3k | static void handle_in_column_group(GumboParser* parser, GumboToken* token) { |
3671 | 54.3k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
3672 | 3.38k | insert_text_token(parser, token); |
3673 | 3.38k | return; |
3674 | 3.38k | } |
3675 | 50.9k | if (token->type == GUMBO_TOKEN_COMMENT) { |
3676 | 5.41k | append_comment_node(parser, get_current_node(parser), token); |
3677 | 5.41k | return; |
3678 | 5.41k | } |
3679 | 45.5k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
3680 | 127 | parser_add_parse_error(parser, token); |
3681 | 127 | ignore_token(parser); |
3682 | 127 | return; |
3683 | 127 | } |
3684 | 45.3k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
3685 | 37 | handle_in_body(parser, token); |
3686 | 37 | return; |
3687 | 37 | } |
3688 | 45.3k | if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { |
3689 | 1.11k | insert_element_from_token(parser, token); |
3690 | 1.11k | pop_current_node(parser); |
3691 | 1.11k | acknowledge_self_closing_tag(parser); |
3692 | 1.11k | return; |
3693 | 1.11k | } |
3694 | 44.2k | if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) { |
3695 | 161 | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) { |
3696 | 19 | parser_add_parse_error(parser, token); |
3697 | 19 | ignore_token(parser); |
3698 | 19 | return; |
3699 | 19 | } |
3700 | 142 | pop_current_node(parser); |
3701 | 142 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
3702 | 142 | return; |
3703 | 161 | } |
3704 | 44.0k | if (tag_is(token, kEndTag, GUMBO_TAG_COL)) { |
3705 | 3 | parser_add_parse_error(parser, token); |
3706 | 3 | ignore_token(parser); |
3707 | 3 | return; |
3708 | 3 | } |
3709 | 44.0k | if ( |
3710 | 44.0k | tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) |
3711 | 44.0k | || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) |
3712 | 44.0k | ) { |
3713 | 242 | handle_in_head(parser, token); |
3714 | 242 | return; |
3715 | 242 | } |
3716 | 43.8k | if (token->type == GUMBO_TOKEN_EOF) { |
3717 | 314 | handle_in_body(parser, token); |
3718 | 314 | return; |
3719 | 314 | } |
3720 | 43.5k | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) { |
3721 | 42.2k | parser_add_parse_error(parser, token); |
3722 | 42.2k | ignore_token(parser); |
3723 | 42.2k | return; |
3724 | 42.2k | } |
3725 | 1.22k | pop_current_node(parser); |
3726 | 1.22k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
3727 | 1.22k | parser->_parser_state->_reprocess_current_token = true; |
3728 | 1.22k | } |
3729 | | |
3730 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody |
3731 | 851k | static void handle_in_table_body(GumboParser* parser, GumboToken* token) { |
3732 | 851k | if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { |
3733 | 4.09k | clear_stack_to_table_body_context(parser); |
3734 | 4.09k | insert_element_from_token(parser, token); |
3735 | 4.09k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); |
3736 | 4.09k | return; |
3737 | 4.09k | } |
3738 | 847k | if (tag_in(token, kStartTag, &td_th_tags)) { |
3739 | 1.32k | parser_add_parse_error(parser, token); |
3740 | 1.32k | clear_stack_to_table_body_context(parser); |
3741 | 1.32k | insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED); |
3742 | 1.32k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); |
3743 | 1.32k | parser->_parser_state->_reprocess_current_token = true; |
3744 | 1.32k | return; |
3745 | 1.32k | } |
3746 | 846k | if ( |
3747 | 846k | tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) |
3748 | 846k | ) { |
3749 | 584 | if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) { |
3750 | 87 | parser_add_parse_error(parser, token); |
3751 | 87 | ignore_token(parser); |
3752 | 87 | return; |
3753 | 87 | } |
3754 | 497 | clear_stack_to_table_body_context(parser); |
3755 | 497 | pop_current_node(parser); |
3756 | 497 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
3757 | 497 | return; |
3758 | 584 | } |
3759 | 845k | if ( |
3760 | 845k | tag_in(token, kStartTag, &(const TagSet) { |
3761 | 845k | TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), |
3762 | 845k | TAG(THEAD) |
3763 | 845k | }) |
3764 | 845k | || tag_is(token, kEndTag, GUMBO_TAG_TABLE) |
3765 | 845k | ) { |
3766 | 3.58k | if ( |
3767 | 3.58k | !( |
3768 | 3.58k | has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) |
3769 | 3.58k | || has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) |
3770 | 3.58k | || has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT) |
3771 | 3.58k | ) |
3772 | 3.58k | ) { |
3773 | 717 | parser_add_parse_error(parser, token); |
3774 | 717 | ignore_token(parser); |
3775 | 717 | return; |
3776 | 717 | } |
3777 | 2.86k | clear_stack_to_table_body_context(parser); |
3778 | 2.86k | pop_current_node(parser); |
3779 | 2.86k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
3780 | 2.86k | parser->_parser_state->_reprocess_current_token = true; |
3781 | 2.86k | return; |
3782 | 3.58k | } |
3783 | 842k | if ( |
3784 | 842k | tag_in(token, kEndTag, &(const TagSet) { |
3785 | 842k | TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD), |
3786 | 842k | TAG(TH), TAG(TR) |
3787 | 842k | }) |
3788 | 842k | ) { |
3789 | 1.34k | parser_add_parse_error(parser, token); |
3790 | 1.34k | ignore_token(parser); |
3791 | 1.34k | return; |
3792 | 1.34k | } |
3793 | 840k | handle_in_table(parser, token); |
3794 | 840k | } |
3795 | | |
3796 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr |
3797 | 432k | static void handle_in_row(GumboParser* parser, GumboToken* token) { |
3798 | 432k | if (tag_in(token, kStartTag, &td_th_tags)) { |
3799 | 23.5k | clear_stack_to_table_row_context(parser); |
3800 | 23.5k | insert_element_from_token(parser, token); |
3801 | 23.5k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL); |
3802 | 23.5k | add_formatting_element(parser, &kActiveFormattingScopeMarker); |
3803 | 23.5k | return; |
3804 | 23.5k | } |
3805 | 409k | if (tag_is(token, kEndTag, GUMBO_TAG_TR)) { |
3806 | 722 | if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { |
3807 | 12 | parser_add_parse_error(parser, token); |
3808 | 12 | ignore_token(parser); |
3809 | 12 | return; |
3810 | 12 | } |
3811 | 710 | clear_stack_to_table_row_context(parser); |
3812 | 710 | pop_current_node(parser); |
3813 | 710 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); |
3814 | 710 | return; |
3815 | 722 | } |
3816 | 408k | if ( |
3817 | 408k | tag_in(token, kStartTag, &(const TagSet) { |
3818 | 408k | TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), |
3819 | 408k | TAG(THEAD), TAG(TR) |
3820 | 408k | }) |
3821 | 408k | || tag_is(token, kEndTag, GUMBO_TAG_TABLE) |
3822 | 408k | ) { |
3823 | 3.09k | if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { |
3824 | 325 | parser_add_parse_error(parser, token); |
3825 | 325 | ignore_token(parser); |
3826 | 325 | return; |
3827 | 325 | } |
3828 | 2.76k | clear_stack_to_table_row_context(parser); |
3829 | 2.76k | pop_current_node(parser); |
3830 | 2.76k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); |
3831 | 2.76k | parser->_parser_state->_reprocess_current_token = true; |
3832 | 2.76k | return; |
3833 | 3.09k | } |
3834 | 405k | if ( |
3835 | 405k | tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) |
3836 | 405k | ) { |
3837 | 509 | if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) { |
3838 | 76 | parser_add_parse_error(parser, token); |
3839 | 76 | ignore_token(parser); |
3840 | 76 | return; |
3841 | 76 | } |
3842 | 433 | if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { |
3843 | 0 | ignore_token(parser); |
3844 | 0 | return; |
3845 | 0 | } |
3846 | 433 | clear_stack_to_table_row_context(parser); |
3847 | 433 | pop_current_node(parser); |
3848 | 433 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); |
3849 | 433 | parser->_parser_state->_reprocess_current_token = true; |
3850 | 433 | return; |
3851 | 433 | } |
3852 | 404k | if ( |
3853 | 404k | tag_in(token, kEndTag, &(const TagSet) { |
3854 | 404k | TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), |
3855 | 404k | TAG(TD), TAG(TH) |
3856 | 404k | }) |
3857 | 404k | ) { |
3858 | 2.89k | parser_add_parse_error(parser, token); |
3859 | 2.89k | ignore_token(parser); |
3860 | 2.89k | return; |
3861 | 2.89k | } |
3862 | 402k | handle_in_table(parser, token); |
3863 | 402k | } |
3864 | | |
3865 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd |
3866 | 667k | static void handle_in_cell(GumboParser* parser, GumboToken* token) { |
3867 | 667k | if (tag_in(token, kEndTag, &td_th_tags)) { |
3868 | 1.54k | GumboTag token_tag = token->v.end_tag.tag; |
3869 | 1.54k | if (!has_an_element_in_table_scope(parser, token_tag)) { |
3870 | 23 | parser_add_parse_error(parser, token); |
3871 | 23 | ignore_token(parser); |
3872 | 23 | return; |
3873 | 23 | } |
3874 | 1.52k | close_table_cell(parser, token, token_tag); |
3875 | 1.52k | return; |
3876 | 1.54k | } |
3877 | 666k | if ( |
3878 | 666k | tag_in(token, kStartTag, &(const TagSet) { |
3879 | 666k | TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD), |
3880 | 666k | TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) |
3881 | 666k | }) |
3882 | 666k | ) { |
3883 | 20.7k | gumbo_debug("Handling <td> in cell.\n"); |
3884 | 20.7k | if ( |
3885 | 20.7k | !has_an_element_in_table_scope(parser, GUMBO_TAG_TH) |
3886 | 20.7k | && !has_an_element_in_table_scope(parser, GUMBO_TAG_TD) |
3887 | 20.7k | ) { |
3888 | 0 | gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n"); |
3889 | 0 | parser_add_parse_error(parser, token); |
3890 | 0 | ignore_token(parser); |
3891 | 0 | return; |
3892 | 0 | } |
3893 | 20.7k | parser->_parser_state->_reprocess_current_token = true; |
3894 | 20.7k | close_current_cell(parser, token); |
3895 | 20.7k | return; |
3896 | 20.7k | } |
3897 | 645k | if ( |
3898 | 645k | tag_in(token, kEndTag, &(const TagSet) { |
3899 | 645k | TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML) |
3900 | 645k | }) |
3901 | 645k | ) { |
3902 | 162 | parser_add_parse_error(parser, token); |
3903 | 162 | ignore_token(parser); |
3904 | 162 | return; |
3905 | 162 | } |
3906 | 645k | if ( |
3907 | 645k | tag_in(token, kEndTag, &(const TagSet) { |
3908 | 645k | TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR) |
3909 | 645k | }) |
3910 | 645k | ) { |
3911 | 701 | if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) { |
3912 | 57 | parser_add_parse_error(parser, token); |
3913 | 57 | ignore_token(parser); |
3914 | 57 | return; |
3915 | 57 | } |
3916 | 644 | parser->_parser_state->_reprocess_current_token = true; |
3917 | 644 | close_current_cell(parser, token); |
3918 | 644 | return; |
3919 | 701 | } |
3920 | 644k | handle_in_body(parser, token); |
3921 | 644k | } |
3922 | | |
3923 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect |
3924 | 262k | static void handle_in_select(GumboParser* parser, GumboToken* token) { |
3925 | 262k | if (token->type == GUMBO_TOKEN_NULL) { |
3926 | 14.8k | parser_add_parse_error(parser, token); |
3927 | 14.8k | ignore_token(parser); |
3928 | 14.8k | return; |
3929 | 14.8k | } |
3930 | 247k | if ( |
3931 | 247k | token->type == GUMBO_TOKEN_CHARACTER |
3932 | 247k | || token->type == GUMBO_TOKEN_WHITESPACE |
3933 | 247k | ) { |
3934 | 212k | insert_text_token(parser, token); |
3935 | 212k | return; |
3936 | 212k | } |
3937 | 35.3k | if (token->type == GUMBO_TOKEN_COMMENT) { |
3938 | 507 | append_comment_node(parser, get_current_node(parser), token); |
3939 | 507 | return; |
3940 | 507 | } |
3941 | 34.8k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
3942 | 230 | parser_add_parse_error(parser, token); |
3943 | 230 | ignore_token(parser); |
3944 | 230 | return; |
3945 | 230 | } |
3946 | 34.6k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
3947 | 83 | handle_in_body(parser, token); |
3948 | 83 | return; |
3949 | 83 | } |
3950 | 34.5k | if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) { |
3951 | 8.28k | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { |
3952 | 345 | pop_current_node(parser); |
3953 | 345 | } |
3954 | 8.28k | insert_element_from_token(parser, token); |
3955 | 8.28k | return; |
3956 | 8.28k | } |
3957 | 26.2k | if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) { |
3958 | 9.13k | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { |
3959 | 692 | pop_current_node(parser); |
3960 | 692 | } |
3961 | 9.13k | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { |
3962 | 1.25k | pop_current_node(parser); |
3963 | 1.25k | } |
3964 | 9.13k | insert_element_from_token(parser, token); |
3965 | 9.13k | return; |
3966 | 9.13k | } |
3967 | 17.1k | if (tag_is(token, kStartTag, GUMBO_TAG_HR)) { |
3968 | 499 | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { |
3969 | 180 | pop_current_node(parser); |
3970 | 180 | } |
3971 | 499 | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { |
3972 | 142 | pop_current_node(parser); |
3973 | 142 | } |
3974 | 499 | insert_element_from_token(parser, token); |
3975 | 499 | pop_current_node(parser); |
3976 | 499 | acknowledge_self_closing_tag(parser); |
3977 | 499 | return; |
3978 | 499 | } |
3979 | 16.6k | if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) { |
3980 | 571 | GumboVector* open_elements = &parser->_parser_state->_open_elements; |
3981 | 571 | if ( |
3982 | 571 | node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) |
3983 | 571 | && node_html_tag_is ( |
3984 | 451 | open_elements->data[open_elements->length - 2], |
3985 | 451 | GUMBO_TAG_OPTGROUP |
3986 | 451 | ) |
3987 | 571 | ) { |
3988 | 238 | pop_current_node(parser); |
3989 | 238 | } |
3990 | 571 | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { |
3991 | 255 | pop_current_node(parser); |
3992 | 255 | return; |
3993 | 255 | } |
3994 | 316 | parser_add_parse_error(parser, token); |
3995 | 316 | ignore_token(parser); |
3996 | 316 | return; |
3997 | 571 | } |
3998 | 16.0k | if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) { |
3999 | 59 | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { |
4000 | 56 | pop_current_node(parser); |
4001 | 56 | return; |
4002 | 56 | } |
4003 | 3 | parser_add_parse_error(parser, token); |
4004 | 3 | ignore_token(parser); |
4005 | 3 | return; |
4006 | 59 | } |
4007 | 16.0k | if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) { |
4008 | 318 | if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { |
4009 | 0 | parser_add_parse_error(parser, token); |
4010 | 0 | ignore_token(parser); |
4011 | 0 | return; |
4012 | 0 | } |
4013 | 318 | close_current_select(parser); |
4014 | 318 | return; |
4015 | 318 | } |
4016 | 15.7k | if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) { |
4017 | 2.92k | parser_add_parse_error(parser, token); |
4018 | 2.92k | ignore_token(parser); |
4019 | 2.92k | if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { |
4020 | 2.92k | close_current_select(parser); |
4021 | 2.92k | } |
4022 | 2.92k | return; |
4023 | 2.92k | } |
4024 | 12.7k | if ( |
4025 | 12.7k | tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)}) |
4026 | 12.7k | ) { |
4027 | 193 | parser_add_parse_error(parser, token); |
4028 | 193 | if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { |
4029 | 0 | ignore_token(parser); |
4030 | 193 | } else { |
4031 | 193 | close_current_select(parser); |
4032 | 193 | parser->_parser_state->_reprocess_current_token = true; |
4033 | 193 | } |
4034 | 193 | return; |
4035 | 193 | } |
4036 | 12.5k | if ( |
4037 | 12.5k | tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)}) |
4038 | 12.5k | || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) |
4039 | 12.5k | ) { |
4040 | 440 | handle_in_head(parser, token); |
4041 | 440 | return; |
4042 | 440 | } |
4043 | 12.1k | if (token->type == GUMBO_TOKEN_EOF) { |
4044 | 287 | handle_in_body(parser, token); |
4045 | 287 | return; |
4046 | 287 | } |
4047 | 11.8k | parser_add_parse_error(parser, token); |
4048 | 11.8k | ignore_token(parser); |
4049 | 11.8k | } |
4050 | | |
4051 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable |
4052 | 147k | static void handle_in_select_in_table(GumboParser* parser, GumboToken* token) { |
4053 | 147k | static const TagSet tags = { |
4054 | 147k | TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), |
4055 | 147k | TAG(TR), TAG(TD), TAG(TH) |
4056 | 147k | }; |
4057 | 147k | if (tag_in(token, kStartTag, &tags)) { |
4058 | 6.21k | parser_add_parse_error(parser, token); |
4059 | 6.21k | close_current_select(parser); |
4060 | 6.21k | parser->_parser_state->_reprocess_current_token = true; |
4061 | 6.21k | return; |
4062 | 6.21k | } |
4063 | 141k | if (tag_in(token, kEndTag, &tags)) { |
4064 | 1.47k | parser_add_parse_error(parser, token); |
4065 | 1.47k | if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) { |
4066 | 346 | ignore_token(parser); |
4067 | 346 | return; |
4068 | 346 | } |
4069 | 1.12k | close_current_select(parser); |
4070 | 1.12k | parser->_parser_state->_reprocess_current_token = true; |
4071 | 1.12k | return; |
4072 | 1.47k | } |
4073 | 139k | handle_in_select(parser, token); |
4074 | 139k | } |
4075 | | |
4076 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate |
4077 | 27.1k | static void handle_in_template(GumboParser* parser, GumboToken* token) { |
4078 | 27.1k | GumboParserState* state = parser->_parser_state; |
4079 | 27.1k | switch (token->type) { |
4080 | 603 | case GUMBO_TOKEN_WHITESPACE: |
4081 | 18.9k | case GUMBO_TOKEN_CHARACTER: |
4082 | 19.0k | case GUMBO_TOKEN_COMMENT: |
4083 | 21.2k | case GUMBO_TOKEN_NULL: |
4084 | 21.5k | case GUMBO_TOKEN_DOCTYPE: |
4085 | 21.5k | handle_in_body(parser, token); |
4086 | 21.5k | return; |
4087 | 5.59k | default: |
4088 | 5.59k | break; |
4089 | 27.1k | } |
4090 | 5.59k | if ( |
4091 | 5.59k | tag_in(token, kStartTag, &(const TagSet) { |
4092 | 5.59k | TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META), |
4093 | 5.59k | TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) |
4094 | 5.59k | }) |
4095 | 5.59k | || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) |
4096 | 5.59k | ) { |
4097 | 1.08k | handle_in_head(parser, token); |
4098 | 1.08k | return; |
4099 | 1.08k | } |
4100 | 4.51k | if ( |
4101 | 4.51k | tag_in(token, kStartTag, &(const TagSet) { |
4102 | 4.51k | TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD) |
4103 | 4.51k | }) |
4104 | 4.51k | ) { |
4105 | 423 | pop_template_insertion_mode(parser); |
4106 | 423 | push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
4107 | 423 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); |
4108 | 423 | state->_reprocess_current_token = true; |
4109 | 423 | return; |
4110 | 423 | } |
4111 | 4.08k | if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { |
4112 | 186 | pop_template_insertion_mode(parser); |
4113 | 186 | push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); |
4114 | 186 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); |
4115 | 186 | state->_reprocess_current_token = true; |
4116 | 186 | return; |
4117 | 186 | } |
4118 | 3.90k | if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { |
4119 | 419 | pop_template_insertion_mode(parser); |
4120 | 419 | push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); |
4121 | 419 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); |
4122 | 419 | state->_reprocess_current_token = true; |
4123 | 419 | return; |
4124 | 419 | } |
4125 | 3.48k | if (tag_in(token, kStartTag, &td_th_tags)) { |
4126 | 167 | pop_template_insertion_mode(parser); |
4127 | 167 | push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); |
4128 | 167 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); |
4129 | 167 | state->_reprocess_current_token = true; |
4130 | 167 | return; |
4131 | 167 | } |
4132 | 3.31k | if (token->type == GUMBO_TOKEN_START_TAG) { |
4133 | 1.00k | pop_template_insertion_mode(parser); |
4134 | 1.00k | push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); |
4135 | 1.00k | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); |
4136 | 1.00k | state->_reprocess_current_token = true; |
4137 | 1.00k | return; |
4138 | 1.00k | } |
4139 | 2.31k | if (token->type == GUMBO_TOKEN_END_TAG) { |
4140 | 235 | parser_add_parse_error(parser, token); |
4141 | 235 | ignore_token(parser); |
4142 | 235 | return; |
4143 | 235 | } |
4144 | 2.07k | if (token->type == GUMBO_TOKEN_EOF) { |
4145 | 2.07k | if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { |
4146 | | // Stop parsing. |
4147 | 0 | return; |
4148 | 0 | } |
4149 | 2.07k | parser_add_parse_error(parser, token); |
4150 | 9.12k | while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)) |
4151 | 7.05k | ; |
4152 | 2.07k | clear_active_formatting_elements(parser); |
4153 | 2.07k | pop_template_insertion_mode(parser); |
4154 | 2.07k | reset_insertion_mode_appropriately(parser); |
4155 | 2.07k | state->_reprocess_current_token = true; |
4156 | 2.07k | return; |
4157 | 2.07k | } |
4158 | 0 | assert(0 && "unreachable"); |
4159 | 0 | } |
4160 | | |
4161 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody |
4162 | 1.30k | static void handle_after_body(GumboParser* parser, GumboToken* token) { |
4163 | 1.30k | if ( |
4164 | 1.30k | token->type == GUMBO_TOKEN_WHITESPACE |
4165 | 1.30k | || tag_is(token, kStartTag, GUMBO_TAG_HTML) |
4166 | 1.30k | ) { |
4167 | 140 | handle_in_body(parser, token); |
4168 | 140 | return; |
4169 | 140 | } |
4170 | 1.16k | if (token->type == GUMBO_TOKEN_COMMENT) { |
4171 | 85 | GumboNode* html_node = parser->_output->root; |
4172 | 85 | assert(html_node != NULL); |
4173 | 0 | append_comment_node(parser, html_node, token); |
4174 | 85 | return; |
4175 | 85 | } |
4176 | 1.08k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
4177 | 39 | parser_add_parse_error(parser, token); |
4178 | 39 | ignore_token(parser); |
4179 | 39 | return; |
4180 | 39 | } |
4181 | 1.04k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
4182 | 0 | handle_in_body(parser, token); |
4183 | 0 | return; |
4184 | 0 | } |
4185 | 1.04k | if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { |
4186 | | /* fragment case: ignore the closing HTML token */ |
4187 | 719 | if (is_fragment_parser(parser)) { |
4188 | 0 | parser_add_parse_error(parser, token); |
4189 | 0 | ignore_token(parser); |
4190 | 0 | return; |
4191 | 0 | } |
4192 | 719 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY); |
4193 | 719 | GumboNode* html = parser->_parser_state->_open_elements.data[0]; |
4194 | 719 | assert(node_html_tag_is(html, GUMBO_TAG_HTML)); |
4195 | 0 | record_end_of_element ( |
4196 | 719 | parser->_parser_state->_current_token, |
4197 | 719 | &html->v.element |
4198 | 719 | ); |
4199 | 719 | return; |
4200 | 719 | } |
4201 | 326 | if (token->type == GUMBO_TOKEN_EOF) { |
4202 | 19 | return; |
4203 | 19 | } |
4204 | 307 | parser_add_parse_error(parser, token); |
4205 | 307 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); |
4206 | 307 | parser->_parser_state->_reprocess_current_token = true; |
4207 | 307 | } |
4208 | | |
4209 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset |
4210 | 837k | static void handle_in_frameset(GumboParser* parser, GumboToken* token) { |
4211 | 837k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
4212 | 796k | insert_text_token(parser, token); |
4213 | 796k | return; |
4214 | 796k | } |
4215 | 40.4k | if (token->type == GUMBO_TOKEN_COMMENT) { |
4216 | 1.57k | append_comment_node(parser, get_current_node(parser), token); |
4217 | 1.57k | return; |
4218 | 1.57k | } |
4219 | 38.9k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
4220 | 169 | parser_add_parse_error(parser, token); |
4221 | 169 | ignore_token(parser); |
4222 | 169 | return; |
4223 | 169 | } |
4224 | 38.7k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
4225 | 368 | handle_in_body(parser, token); |
4226 | 368 | return; |
4227 | 368 | } |
4228 | 38.3k | if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { |
4229 | 310 | insert_element_from_token(parser, token); |
4230 | 310 | return; |
4231 | 310 | } |
4232 | 38.0k | if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) { |
4233 | 191 | if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) { |
4234 | 0 | parser_add_parse_error(parser, token); |
4235 | 0 | ignore_token(parser); |
4236 | 0 | return; |
4237 | 0 | } |
4238 | 191 | pop_current_node(parser); |
4239 | 191 | if ( |
4240 | 191 | !is_fragment_parser(parser) |
4241 | 191 | && !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET) |
4242 | 191 | ) { |
4243 | 91 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET); |
4244 | 91 | } |
4245 | 191 | return; |
4246 | 191 | } |
4247 | 37.8k | if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) { |
4248 | 21 | insert_element_from_token(parser, token); |
4249 | 21 | pop_current_node(parser); |
4250 | 21 | acknowledge_self_closing_tag(parser); |
4251 | 21 | return; |
4252 | 21 | } |
4253 | 37.8k | if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { |
4254 | 162 | handle_in_head(parser, token); |
4255 | 162 | return; |
4256 | 162 | } |
4257 | 37.6k | if (token->type == GUMBO_TOKEN_EOF) { |
4258 | 104 | if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) |
4259 | 104 | parser_add_parse_error(parser, token); |
4260 | 104 | return; |
4261 | 104 | } |
4262 | 37.5k | parser_add_parse_error(parser, token); |
4263 | 37.5k | ignore_token(parser); |
4264 | 37.5k | } |
4265 | | |
4266 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset |
4267 | 244k | static void handle_after_frameset(GumboParser* parser, GumboToken* token) { |
4268 | 244k | if (token->type == GUMBO_TOKEN_WHITESPACE) { |
4269 | 101k | insert_text_token(parser, token); |
4270 | 101k | return; |
4271 | 101k | } |
4272 | 142k | if (token->type == GUMBO_TOKEN_COMMENT) { |
4273 | 98.5k | append_comment_node(parser, get_current_node(parser), token); |
4274 | 98.5k | return; |
4275 | 98.5k | } |
4276 | 44.2k | if (token->type == GUMBO_TOKEN_DOCTYPE) { |
4277 | 171 | parser_add_parse_error(parser, token); |
4278 | 171 | ignore_token(parser); |
4279 | 171 | return; |
4280 | 171 | } |
4281 | 44.1k | if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { |
4282 | 106 | handle_in_body(parser, token); |
4283 | 106 | return; |
4284 | 106 | } |
4285 | 43.9k | if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { |
4286 | 31 | GumboNode* html = parser->_parser_state->_open_elements.data[0]; |
4287 | 31 | assert(node_html_tag_is(html, GUMBO_TAG_HTML)); |
4288 | 0 | record_end_of_element ( |
4289 | 31 | parser->_parser_state->_current_token, |
4290 | 31 | &html->v.element |
4291 | 31 | ); |
4292 | 31 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET); |
4293 | 31 | return; |
4294 | 31 | } |
4295 | 43.9k | if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { |
4296 | 171 | return handle_in_head(parser, token); |
4297 | 171 | } |
4298 | 43.7k | if (token->type == GUMBO_TOKEN_EOF) { |
4299 | 60 | return; |
4300 | 60 | } |
4301 | 43.7k | parser_add_parse_error(parser, token); |
4302 | 43.7k | ignore_token(parser); |
4303 | 43.7k | } |
4304 | | |
4305 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode |
4306 | 1.17k | static void handle_after_after_body(GumboParser* parser, GumboToken* token) { |
4307 | 1.17k | if (token->type == GUMBO_TOKEN_COMMENT) { |
4308 | 18 | append_comment_node(parser, get_document_node(parser), token); |
4309 | 18 | return; |
4310 | 18 | } |
4311 | 1.15k | if ( |
4312 | 1.15k | token->type == GUMBO_TOKEN_DOCTYPE |
4313 | 1.15k | || token->type == GUMBO_TOKEN_WHITESPACE |
4314 | 1.15k | || tag_is(token, kStartTag, GUMBO_TAG_HTML) |
4315 | 1.15k | ) { |
4316 | 437 | handle_in_body(parser, token); |
4317 | 437 | return; |
4318 | 437 | } |
4319 | 719 | if (token->type == GUMBO_TOKEN_EOF) { |
4320 | 20 | return; |
4321 | 20 | } |
4322 | 699 | parser_add_parse_error(parser, token); |
4323 | 699 | set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); |
4324 | 699 | parser->_parser_state->_reprocess_current_token = true; |
4325 | 699 | } |
4326 | | |
4327 | | // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode |
4328 | | static void handle_after_after_frameset ( |
4329 | | GumboParser* parser, |
4330 | | GumboToken* token |
4331 | 52.0k | ) { |
4332 | 52.0k | if (token->type == GUMBO_TOKEN_COMMENT) { |
4333 | 165 | append_comment_node(parser, get_document_node(parser), token); |
4334 | 165 | return; |
4335 | 165 | } |
4336 | 51.9k | if ( |
4337 | 51.9k | token->type == GUMBO_TOKEN_DOCTYPE |
4338 | 51.9k | || token->type == GUMBO_TOKEN_WHITESPACE |
4339 | 51.9k | || tag_is(token, kStartTag, GUMBO_TAG_HTML) |
4340 | 51.9k | ) { |
4341 | 681 | handle_in_body(parser, token); |
4342 | 681 | return; |
4343 | 681 | } |
4344 | 51.2k | if (token->type == GUMBO_TOKEN_EOF) { |
4345 | 31 | return; |
4346 | 31 | } |
4347 | 51.2k | if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { |
4348 | 152 | handle_in_head(parser, token); |
4349 | 152 | return; |
4350 | 152 | } |
4351 | 51.0k | parser_add_parse_error(parser, token); |
4352 | 51.0k | ignore_token(parser); |
4353 | 51.0k | } |
4354 | | |
4355 | | // Function pointers for each insertion mode. |
4356 | | // Keep in sync with insertion_mode.h. |
4357 | | typedef void (*TokenHandler)(GumboParser* parser, GumboToken* token); |
4358 | | static const TokenHandler kTokenHandlers[] = { |
4359 | | handle_initial, |
4360 | | handle_before_html, |
4361 | | handle_before_head, |
4362 | | handle_in_head, |
4363 | | handle_in_head_noscript, |
4364 | | handle_after_head, |
4365 | | handle_in_body, |
4366 | | handle_text, |
4367 | | handle_in_table, |
4368 | | handle_in_table_text, |
4369 | | handle_in_caption, |
4370 | | handle_in_column_group, |
4371 | | handle_in_table_body, |
4372 | | handle_in_row, |
4373 | | handle_in_cell, |
4374 | | handle_in_select, |
4375 | | handle_in_select_in_table, |
4376 | | handle_in_template, |
4377 | | handle_after_body, |
4378 | | handle_in_frameset, |
4379 | | handle_after_frameset, |
4380 | | handle_after_after_body, |
4381 | | handle_after_after_frameset |
4382 | | }; |
4383 | | |
4384 | 14.3M | static void handle_html_content(GumboParser* parser, GumboToken* token) { |
4385 | 14.3M | const GumboInsertionMode mode = parser->_parser_state->_insertion_mode; |
4386 | 14.3M | const TokenHandler handler = kTokenHandlers[mode]; |
4387 | 14.3M | handler(parser, token); |
4388 | 14.3M | } |
4389 | | |
4390 | | // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign |
4391 | 2.06M | static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) { |
4392 | 2.06M | gumbo_debug("Handling foreign content"); |
4393 | 2.06M | switch (token->type) { |
4394 | 270k | case GUMBO_TOKEN_NULL: |
4395 | 270k | parser_add_parse_error(parser, token); |
4396 | 270k | token->v.character = kUtf8ReplacementChar; |
4397 | 270k | insert_text_token(parser, token); |
4398 | 270k | return; |
4399 | 13.3k | case GUMBO_TOKEN_WHITESPACE: |
4400 | 13.3k | insert_text_token(parser, token); |
4401 | 13.3k | return; |
4402 | 500k | case GUMBO_TOKEN_CDATA: |
4403 | 1.76M | case GUMBO_TOKEN_CHARACTER: |
4404 | 1.76M | insert_text_token(parser, token); |
4405 | 1.76M | set_frameset_not_ok(parser); |
4406 | 1.76M | return; |
4407 | 809 | case GUMBO_TOKEN_COMMENT: |
4408 | 809 | append_comment_node(parser, get_current_node(parser), token); |
4409 | 809 | return; |
4410 | 164 | case GUMBO_TOKEN_DOCTYPE: |
4411 | 164 | parser_add_parse_error(parser, token); |
4412 | 164 | ignore_token(parser); |
4413 | 164 | return; |
4414 | 14.8k | default: |
4415 | | // Fall through to the if-statements below. |
4416 | 14.8k | break; |
4417 | 2.06M | } |
4418 | | // Order matters for these clauses. |
4419 | 14.8k | if ( |
4420 | 14.8k | tag_in(token, kStartTag, &(const TagSet) { |
4421 | 14.8k | TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), TAG(CENTER), |
4422 | 14.8k | TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), TAG(EM), TAG(EMBED), |
4423 | 14.8k | TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), TAG(HEAD), |
4424 | 14.8k | TAG(HR), TAG(I), TAG(IMG), TAG(LI), TAG(LISTING), TAG(MENU), TAG(META), |
4425 | 14.8k | TAG(NOBR), TAG(OL), TAG(P), TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), |
4426 | 14.8k | TAG(SPAN), TAG(STRONG), TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), |
4427 | 14.8k | TAG(TT), TAG(U), TAG(UL), TAG(VAR) |
4428 | 14.8k | }) |
4429 | 14.8k | || ( |
4430 | 13.6k | tag_is(token, kStartTag, GUMBO_TAG_FONT) |
4431 | 13.6k | && ( |
4432 | 312 | token_has_attribute(token, "color") |
4433 | 312 | || token_has_attribute(token, "face") |
4434 | 312 | || token_has_attribute(token, "size") |
4435 | 312 | ) |
4436 | 13.6k | ) |
4437 | 14.8k | || tag_in(token, kEndTag, &(const TagSet) { TAG(BR), TAG(P) }) |
4438 | 14.8k | ) { |
4439 | | /* Parse error */ |
4440 | 1.86k | parser_add_parse_error(parser, token); |
4441 | | |
4442 | 1.86k | while ( |
4443 | 7.00k | !( |
4444 | 7.00k | is_mathml_integration_point(get_current_node(parser)) |
4445 | 7.00k | || is_html_integration_point(get_current_node(parser)) |
4446 | 7.00k | || get_current_node(parser)->v.element.tag_namespace == GUMBO_NAMESPACE_HTML |
4447 | 7.00k | ) |
4448 | 5.14k | ) { |
4449 | 5.14k | pop_current_node(parser); |
4450 | 5.14k | } |
4451 | 1.86k | handle_html_content(parser, token); |
4452 | 1.86k | return; |
4453 | 1.86k | } |
4454 | | |
4455 | 13.0k | if (token->type == GUMBO_TOKEN_START_TAG) { |
4456 | 11.1k | const GumboNamespaceEnum current_namespace = |
4457 | 11.1k | get_adjusted_current_node(parser)->v.element.tag_namespace; |
4458 | 11.1k | if (current_namespace == GUMBO_NAMESPACE_MATHML) { |
4459 | 2.86k | adjust_mathml_attributes(token); |
4460 | 2.86k | } |
4461 | 11.1k | if (current_namespace == GUMBO_NAMESPACE_SVG) { |
4462 | 8.30k | adjust_svg_tag(token); |
4463 | 8.30k | adjust_svg_attributes(token); |
4464 | 8.30k | } |
4465 | 11.1k | adjust_foreign_attributes(token); |
4466 | 11.1k | insert_foreign_element(parser, token, current_namespace); |
4467 | 11.1k | if (token->v.start_tag.is_self_closing) { |
4468 | 514 | pop_current_node(parser); |
4469 | 514 | acknowledge_self_closing_tag(parser); |
4470 | 514 | } |
4471 | 11.1k | return; |
4472 | | // </script> tags are handled like any other end tag, putting the script's |
4473 | | // text into a text node child and closing the current node. |
4474 | 11.1k | } |
4475 | 1.85k | assert(token->type == GUMBO_TOKEN_END_TAG); |
4476 | 0 | GumboNode* node = get_current_node(parser); |
4477 | 1.85k | GumboTag tag = token->v.end_tag.tag; |
4478 | 1.85k | const char* name = token->v.end_tag.name; |
4479 | 1.85k | assert(node != NULL); |
4480 | | |
4481 | 1.85k | if (!node_tagname_is(node, tag, name)) |
4482 | 1.13k | parser_add_parse_error(parser, token); |
4483 | 1.85k | int i = parser->_parser_state->_open_elements.length; |
4484 | 8.24k | for (--i; i > 0;) { |
4485 | | // Here we move up the stack until we find an HTML element (in which |
4486 | | // case we do nothing) or we find the element that we're about to |
4487 | | // close (in which case we pop everything we've seen until that |
4488 | | // point.) |
4489 | 8.24k | gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i); |
4490 | 8.24k | if (node_tagname_is(node, tag, name)) { |
4491 | 773 | gumbo_debug("Matches.\n"); |
4492 | 1.26k | while (node != pop_current_node(parser)) { |
4493 | | // Pop all the nodes below the current one. Node is guaranteed to |
4494 | | // be an element on the stack of open elements (set below), so |
4495 | | // this loop is guaranteed to terminate. |
4496 | 491 | } |
4497 | 773 | return; |
4498 | 773 | } |
4499 | 7.46k | --i; |
4500 | 7.46k | node = parser->_parser_state->_open_elements.data[i]; |
4501 | 7.46k | if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) { |
4502 | | // The loop continues only in foreign namespaces. |
4503 | 1.08k | break; |
4504 | 1.08k | } |
4505 | 7.46k | } |
4506 | 1.08k | assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML); |
4507 | 1.08k | if (i == 0) |
4508 | 0 | return; |
4509 | | // We can't call handle_token directly because the current node is still in |
4510 | | // a foriegn namespace, so it would re-enter this and result in infinite |
4511 | | // recursion. |
4512 | 1.08k | handle_html_content(parser, token); |
4513 | 1.08k | } |
4514 | | |
4515 | | // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction |
4516 | 16.3M | static void handle_token(GumboParser* parser, GumboToken* token) { |
4517 | 16.3M | if ( |
4518 | 16.3M | parser->_parser_state->_ignore_next_linefeed |
4519 | 16.3M | && token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n' |
4520 | 16.3M | ) { |
4521 | 42 | parser->_parser_state->_ignore_next_linefeed = false; |
4522 | 42 | ignore_token(parser); |
4523 | 42 | return; |
4524 | 42 | } |
4525 | | // This needs to be reset both here and in the conditional above to catch both |
4526 | | // the case where the next token is not whitespace (so we don't ignore |
4527 | | // whitespace in the middle of <pre> tags) and where there are multiple |
4528 | | // whitespace tokens (so we don't ignore the second one). |
4529 | 16.3M | parser->_parser_state->_ignore_next_linefeed = false; |
4530 | | |
4531 | 16.3M | if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) { |
4532 | 2.94k | parser->_parser_state->_closed_body_tag = true; |
4533 | 2.94k | } |
4534 | 16.3M | if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { |
4535 | 2.79k | parser->_parser_state->_closed_html_tag = true; |
4536 | 2.79k | } |
4537 | | |
4538 | 16.3M | const GumboNode* current_node = get_adjusted_current_node(parser); |
4539 | 16.3M | assert ( |
4540 | 16.3M | !current_node |
4541 | 16.3M | || current_node->type == GUMBO_NODE_ELEMENT |
4542 | 16.3M | || current_node->type == GUMBO_NODE_TEMPLATE |
4543 | 16.3M | ); |
4544 | 16.3M | if (current_node) |
4545 | 16.3M | gumbo_debug("Current node: <%s>.\n", current_node->v.element.name); |
4546 | 16.3M | if (!current_node || |
4547 | 16.3M | current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML || |
4548 | 16.3M | (is_mathml_integration_point(current_node) && |
4549 | 2.07M | (token->type == GUMBO_TOKEN_CHARACTER || |
4550 | 4.03k | token->type == GUMBO_TOKEN_WHITESPACE || |
4551 | 4.03k | token->type == GUMBO_TOKEN_NULL || |
4552 | 4.03k | (token->type == GUMBO_TOKEN_START_TAG && |
4553 | 984 | !tag_in(token, kStartTag, |
4554 | 514 | &(const TagSet){TAG(MGLYPH), TAG(MALIGNMARK)})))) || |
4555 | 16.3M | (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML && |
4556 | 2.07M | node_qualified_tag_is( |
4557 | 206k | current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && |
4558 | 2.07M | tag_is(token, kStartTag, GUMBO_TAG_SVG)) || |
4559 | 16.3M | (is_html_integration_point(current_node) && |
4560 | 2.07M | (token->type == GUMBO_TOKEN_START_TAG || |
4561 | 6.50k | token->type == GUMBO_TOKEN_CHARACTER || |
4562 | 6.50k | token->type == GUMBO_TOKEN_NULL || |
4563 | 6.50k | token->type == GUMBO_TOKEN_WHITESPACE)) || |
4564 | 16.3M | token->type == GUMBO_TOKEN_EOF) { |
4565 | 14.3M | handle_html_content(parser, token); |
4566 | 14.3M | } else { |
4567 | 2.06M | handle_in_foreign_content(parser, token); |
4568 | 2.06M | } |
4569 | 16.3M | } |
4570 | | |
4571 | | static GumboNode* create_fragment_ctx_element ( |
4572 | | const char* tag_name, |
4573 | | GumboNamespaceEnum ns, |
4574 | | const char* encoding |
4575 | 0 | ) { |
4576 | 0 | assert(tag_name); |
4577 | 0 | GumboTag tag = gumbo_tagn_enum(tag_name, strlen(tag_name)); |
4578 | 0 | GumboNodeType type = |
4579 | 0 | ns == GUMBO_NAMESPACE_HTML && tag == GUMBO_TAG_TEMPLATE |
4580 | 0 | ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT; |
4581 | 0 | GumboNode* node = create_node(type); |
4582 | 0 | GumboElement* element = &node->v.element; |
4583 | 0 | element->children = kGumboEmptyVector; |
4584 | 0 | if (encoding) { |
4585 | 0 | gumbo_vector_init(1, &element->attributes); |
4586 | 0 | GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute)); |
4587 | 0 | attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; |
4588 | 0 | attr->name = "encoding"; // Do not free this! |
4589 | 0 | attr->original_name = kGumboEmptyString; |
4590 | 0 | attr->value = encoding; // Do not free this! |
4591 | 0 | attr->original_value = kGumboEmptyString; |
4592 | 0 | attr->name_start = kGumboEmptySourcePosition; |
4593 | 0 | gumbo_vector_add(attr, &element->attributes); |
4594 | 0 | } else { |
4595 | 0 | element->attributes = kGumboEmptyVector; |
4596 | 0 | } |
4597 | 0 | element->tag = tag; |
4598 | 0 | element->tag_namespace = ns; |
4599 | 0 | element->name = tag_name; // Do not free this! |
4600 | 0 | element->original_tag = kGumboEmptyString; |
4601 | 0 | element->original_end_tag = kGumboEmptyString; |
4602 | 0 | element->start_pos = kGumboEmptySourcePosition; |
4603 | 0 | element->end_pos = kGumboEmptySourcePosition; |
4604 | 0 | return node; |
4605 | 0 | } |
4606 | | |
4607 | 0 | static void destroy_fragment_ctx_element(GumboNode* ctx) { |
4608 | 0 | assert(ctx->type == GUMBO_NODE_ELEMENT || ctx->type == GUMBO_NODE_TEMPLATE); |
4609 | 0 | GumboElement* element = &ctx->v.element; |
4610 | 0 | element->name = NULL; // Do not free. |
4611 | 0 | if (element->attributes.length > 0) { |
4612 | 0 | assert(element->attributes.length == 1); |
4613 | 0 | GumboAttribute* attr = gumbo_vector_pop(&element->attributes); |
4614 | | // Do not free attr->name or attr->value, just free the attr. |
4615 | 0 | gumbo_free(attr); |
4616 | 0 | } |
4617 | 0 | destroy_node(ctx); |
4618 | 0 | } |
4619 | | |
4620 | | static void fragment_parser_init ( |
4621 | | GumboParser* parser, |
4622 | | const GumboOptions* options |
4623 | 0 | ) { |
4624 | 0 | assert(options->fragment_context != NULL); |
4625 | 0 | const char* fragment_ctx = options->fragment_context; |
4626 | 0 | GumboNamespaceEnum fragment_namespace = options->fragment_namespace; |
4627 | 0 | const char* fragment_encoding = options->fragment_encoding; |
4628 | 0 | GumboQuirksModeEnum quirks = options->quirks_mode; |
4629 | 0 | bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor; |
4630 | |
|
4631 | 0 | GumboNode* root; |
4632 | | // 2. |
4633 | 0 | get_document_node(parser)->v.document.doc_type_quirks_mode = quirks; |
4634 | | |
4635 | | // 3. |
4636 | 0 | parser->_parser_state->_fragment_ctx = |
4637 | 0 | create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding); |
4638 | 0 | GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag; |
4639 | | |
4640 | | // 4. |
4641 | 0 | if (fragment_namespace == GUMBO_NAMESPACE_HTML) { |
4642 | | // Non-HTML namespaces always start in the DATA state. |
4643 | 0 | switch (ctx_tag) { |
4644 | 0 | case GUMBO_TAG_TITLE: |
4645 | 0 | case GUMBO_TAG_TEXTAREA: |
4646 | 0 | gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); |
4647 | 0 | break; |
4648 | | |
4649 | 0 | case GUMBO_TAG_STYLE: |
4650 | 0 | case GUMBO_TAG_XMP: |
4651 | 0 | case GUMBO_TAG_IFRAME: |
4652 | 0 | case GUMBO_TAG_NOEMBED: |
4653 | 0 | case GUMBO_TAG_NOFRAMES: |
4654 | 0 | gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); |
4655 | 0 | break; |
4656 | | |
4657 | 0 | case GUMBO_TAG_SCRIPT: |
4658 | 0 | gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA); |
4659 | 0 | break; |
4660 | | |
4661 | 0 | case GUMBO_TAG_NOSCRIPT: |
4662 | | /* scripting is disabled in Gumbo, so leave the tokenizer |
4663 | | * in the default data state */ |
4664 | 0 | break; |
4665 | | |
4666 | 0 | case GUMBO_TAG_PLAINTEXT: |
4667 | 0 | gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT); |
4668 | 0 | break; |
4669 | | |
4670 | 0 | default: |
4671 | | /* default data state */ |
4672 | 0 | break; |
4673 | 0 | } |
4674 | 0 | } |
4675 | | |
4676 | | // 5. 6. 7. |
4677 | 0 | root = insert_element_of_tag_type ( |
4678 | 0 | parser, |
4679 | 0 | GUMBO_TAG_HTML, |
4680 | 0 | GUMBO_INSERTION_IMPLIED |
4681 | 0 | ); |
4682 | 0 | parser->_output->root = root; |
4683 | | |
4684 | | // 8. |
4685 | 0 | if (ctx_tag == GUMBO_TAG_TEMPLATE) { |
4686 | 0 | push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); |
4687 | 0 | } |
4688 | | |
4689 | | // 10. |
4690 | 0 | reset_insertion_mode_appropriately(parser); |
4691 | | |
4692 | | // 11. |
4693 | 0 | if (ctx_has_form_ancestor |
4694 | 0 | || (ctx_tag == GUMBO_TAG_FORM |
4695 | 0 | && fragment_namespace == GUMBO_NAMESPACE_HTML)) { |
4696 | 0 | static const GumboNode form_ancestor = { |
4697 | 0 | .type = GUMBO_NODE_ELEMENT, |
4698 | 0 | .parent = NULL, |
4699 | 0 | .index_within_parent = -1, |
4700 | 0 | .parse_flags = GUMBO_INSERTION_BY_PARSER, |
4701 | 0 | .v.element = { |
4702 | 0 | .children = GUMBO_EMPTY_VECTOR_INIT, |
4703 | 0 | .tag = GUMBO_TAG_FORM, |
4704 | 0 | .name = NULL, |
4705 | 0 | .tag_namespace = GUMBO_NAMESPACE_HTML, |
4706 | 0 | .original_tag = GUMBO_EMPTY_STRING_INIT, |
4707 | 0 | .original_end_tag = GUMBO_EMPTY_STRING_INIT, |
4708 | 0 | .start_pos = GUMBO_EMPTY_SOURCE_POSITION_INIT, |
4709 | 0 | .end_pos = GUMBO_EMPTY_SOURCE_POSITION_INIT, |
4710 | 0 | .attributes = GUMBO_EMPTY_VECTOR_INIT, |
4711 | 0 | }, |
4712 | 0 | }; |
4713 | | // This cast is okay because _form_element is only modified if it is |
4714 | | // in in the list of open elements. This will never be. |
4715 | 0 | parser->_parser_state->_form_element = (GumboNode *)&form_ancestor; |
4716 | 0 | } |
4717 | 0 | } |
4718 | | |
4719 | 0 | GumboOutput* gumbo_parse(const char* buffer) { |
4720 | 0 | return gumbo_parse_with_options ( |
4721 | 0 | &kGumboDefaultOptions, |
4722 | 0 | buffer, |
4723 | 0 | strlen(buffer) |
4724 | 0 | ); |
4725 | 0 | } |
4726 | | |
4727 | | GumboOutput* gumbo_parse_with_options ( |
4728 | | const GumboOptions* options, |
4729 | | const char* buffer, |
4730 | | size_t length |
4731 | 10.0k | ) { |
4732 | 10.0k | GumboParser parser; |
4733 | 10.0k | parser._options = options; |
4734 | 10.0k | output_init(&parser); |
4735 | 10.0k | gumbo_tokenizer_state_init(&parser, buffer, length); |
4736 | 10.0k | parser_state_init(&parser); |
4737 | | |
4738 | 10.0k | if (options->fragment_context != NULL) |
4739 | 0 | fragment_parser_init(&parser, options); |
4740 | | |
4741 | 10.0k | GumboParserState* state = parser._parser_state; |
4742 | 10.0k | gumbo_debug ( |
4743 | 10.0k | "Parsing %.*s.\n", |
4744 | 10.0k | (int) length, |
4745 | 10.0k | buffer |
4746 | 10.0k | ); |
4747 | | |
4748 | | // Sanity check so that infinite loops die with an assertion failure instead |
4749 | | // of hanging the process before we ever get an error. |
4750 | 10.0k | uint_fast32_t loop_count = 0; |
4751 | | |
4752 | 10.0k | const unsigned int max_tree_depth = options->max_tree_depth; |
4753 | 10.0k | GumboToken token; |
4754 | | |
4755 | 16.3M | do { |
4756 | 16.3M | if (state->_reprocess_current_token) { |
4757 | 127k | state->_reprocess_current_token = false; |
4758 | 16.2M | } else { |
4759 | 16.2M | GumboNode* adjusted_current_node = get_adjusted_current_node(&parser); |
4760 | 16.2M | gumbo_tokenizer_set_is_adjusted_current_node_foreign ( |
4761 | 16.2M | &parser, |
4762 | 16.2M | adjusted_current_node && |
4763 | 16.2M | adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML |
4764 | 16.2M | ); |
4765 | 16.2M | gumbo_lex(&parser, &token); |
4766 | 16.2M | } |
4767 | | |
4768 | 16.3M | const char* token_type = "text"; |
4769 | 16.3M | switch (token.type) { |
4770 | 7.77k | case GUMBO_TOKEN_DOCTYPE: |
4771 | 7.77k | token_type = "doctype"; |
4772 | 7.77k | break; |
4773 | 1.28M | case GUMBO_TOKEN_START_TAG: |
4774 | 1.28M | if (token.v.start_tag.tag == GUMBO_TAG_UNKNOWN) |
4775 | 157k | token_type = token.v.start_tag.name; |
4776 | 1.12M | else |
4777 | 1.12M | token_type = gumbo_normalized_tagname(token.v.start_tag.tag); |
4778 | 1.28M | break; |
4779 | 47.8k | case GUMBO_TOKEN_END_TAG: |
4780 | 47.8k | token_type = gumbo_normalized_tagname(token.v.end_tag.tag); |
4781 | 47.8k | break; |
4782 | 200k | case GUMBO_TOKEN_COMMENT: |
4783 | 200k | token_type = "comment"; |
4784 | 200k | break; |
4785 | 14.8M | default: |
4786 | 14.8M | break; |
4787 | 16.3M | } |
4788 | 16.3M | gumbo_debug ( |
4789 | 16.3M | "Handling %s token @%lu:%lu in state %u.\n", |
4790 | 16.3M | (char*) token_type, |
4791 | 16.3M | (unsigned long)token.position.line, |
4792 | 16.3M | (unsigned long)token.position.column, |
4793 | 16.3M | state->_insertion_mode |
4794 | 16.3M | ); |
4795 | | |
4796 | 16.3M | state->_current_token = &token; |
4797 | 16.3M | state->_self_closing_flag_acknowledged = false; |
4798 | | |
4799 | 16.3M | handle_token(&parser, &token); |
4800 | | |
4801 | | // Check for memory leaks when ownership is transferred from start tag |
4802 | | // tokens to nodes. |
4803 | 16.3M | assert ( |
4804 | 16.3M | state->_reprocess_current_token |
4805 | 16.3M | || token.type != GUMBO_TOKEN_START_TAG |
4806 | 16.3M | || (token.v.start_tag.attributes.data == NULL |
4807 | 16.3M | && token.v.start_tag.name == NULL) |
4808 | 16.3M | ); |
4809 | | |
4810 | 16.3M | if (!state->_reprocess_current_token) { |
4811 | | // If we're done with the token, check for unacknowledged self-closing |
4812 | | // flags on start tags. |
4813 | 16.2M | if (token.type == GUMBO_TOKEN_START_TAG && |
4814 | 16.2M | token.v.start_tag.is_self_closing && |
4815 | 16.2M | !state->_self_closing_flag_acknowledged) { |
4816 | 2.01k | GumboError* error = gumbo_add_error(&parser); |
4817 | 2.01k | if (error) { |
4818 | | // This is essentially a tokenizer error that's only caught during |
4819 | | // tree construction. |
4820 | 2.01k | error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS; |
4821 | 2.01k | error->original_text = token.original_text; |
4822 | 2.01k | error->position = token.position; |
4823 | 2.01k | } |
4824 | 2.01k | } |
4825 | | // Make sure we free the end tag's name since it doesn't get transferred |
4826 | | // to a token. |
4827 | 16.2M | if (token.type == GUMBO_TOKEN_END_TAG && |
4828 | 16.2M | token.v.end_tag.tag == GUMBO_TAG_UNKNOWN) |
4829 | 6.51k | gumbo_free(token.v.end_tag.name); |
4830 | 16.2M | } |
4831 | | |
4832 | 16.3M | if (unlikely(state->_open_elements.length > max_tree_depth)) { |
4833 | 6 | parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP; |
4834 | 6 | gumbo_debug("Tree depth limit exceeded.\n"); |
4835 | 6 | break; |
4836 | 6 | } |
4837 | | |
4838 | 16.3M | ++loop_count; |
4839 | 16.3M | assert(loop_count < 1000000000UL); |
4840 | | |
4841 | 16.3M | } while ( |
4842 | 16.3M | (token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) |
4843 | 16.3M | && !(options->stop_on_first_error && parser._output->document_error) |
4844 | 10.0k | ); |
4845 | | |
4846 | 10.0k | finish_parsing(&parser); |
4847 | | // For API uniformity reasons, if the doctype still has nulls, convert them to |
4848 | | // empty strings. |
4849 | 10.0k | GumboDocument* doc_type = &parser._output->document->v.document; |
4850 | 10.0k | if (doc_type->name == NULL) { |
4851 | 9.62k | doc_type->name = gumbo_strdup(""); |
4852 | 9.62k | } |
4853 | 10.0k | if (doc_type->public_identifier == NULL) { |
4854 | 9.62k | doc_type->public_identifier = gumbo_strdup(""); |
4855 | 9.62k | } |
4856 | 10.0k | if (doc_type->system_identifier == NULL) { |
4857 | 9.62k | doc_type->system_identifier = gumbo_strdup(""); |
4858 | 9.62k | } |
4859 | | |
4860 | 10.0k | parser_state_destroy(&parser); |
4861 | 10.0k | gumbo_tokenizer_state_destroy(&parser); |
4862 | 10.0k | return parser._output; |
4863 | 10.0k | } |
4864 | | |
4865 | 0 | const char* gumbo_status_to_string(GumboOutputStatus status) { |
4866 | 0 | switch (status) { |
4867 | 0 | case GUMBO_STATUS_OK: |
4868 | 0 | return "OK"; |
4869 | 0 | case GUMBO_STATUS_OUT_OF_MEMORY: |
4870 | 0 | return "System allocator returned NULL during parsing"; |
4871 | 0 | case GUMBO_STATUS_TOO_MANY_ATTRIBUTES: |
4872 | 0 | return "Attributes per element limit exceeded"; |
4873 | 0 | case GUMBO_STATUS_TREE_TOO_DEEP: |
4874 | 0 | return "Document tree depth limit exceeded"; |
4875 | 0 | default: |
4876 | 0 | return "Unknown GumboOutputStatus value"; |
4877 | 0 | } |
4878 | 0 | } |
4879 | | |
4880 | 0 | void gumbo_destroy_node(GumboNode* node) { |
4881 | 0 | destroy_node(node); |
4882 | 0 | } |
4883 | | |
4884 | 10.0k | void gumbo_destroy_output(GumboOutput* output) { |
4885 | 10.0k | destroy_node(output->document); |
4886 | 24.2M | for (unsigned int i = 0; i < output->errors.length; ++i) { |
4887 | 24.2M | gumbo_error_destroy(output->errors.data[i]); |
4888 | 24.2M | } |
4889 | 10.0k | gumbo_vector_destroy(&output->errors); |
4890 | 10.0k | gumbo_free(output); |
4891 | 10.0k | } |