Coverage Report

Created: 2023-11-19 06:47

/src/nokogiri/gumbo-parser/src/parser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 Copyright 2017-2018 Craig Barnes.
3
 Copyright 2010 Google Inc.
4
5
 Licensed under the Apache License, Version 2.0 (the "License");
6
 you may not use this file except in compliance with the License.
7
 You may obtain a copy of the License at
8
9
    https://www.apache.org/licenses/LICENSE-2.0
10
11
 Unless required by applicable law or agreed to in writing, software
12
 distributed under the License is distributed on an "AS IS" BASIS,
13
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 See the License for the specific language governing permissions and
15
 limitations under the License.
16
*/
17
18
#include <assert.h>
19
#include <stdarg.h>
20
#include <stdint.h>
21
#include <stdlib.h>
22
#include <string.h>
23
24
#include "ascii.h"
25
#include "attribute.h"
26
#include "error.h"
27
#include "nokogiri_gumbo.h"
28
#include "insertion_mode.h"
29
#include "macros.h"
30
#include "parser.h"
31
#include "replacement.h"
32
#include "tokenizer.h"
33
#include "tokenizer_states.h"
34
#include "token_buffer.h"
35
#include "utf8.h"
36
#include "util.h"
37
#include "vector.h"
38
39
// A TagSet has one byte per GumboTag value. Each byte is a bitmask of
// namespaces: bit (1 << ns) set at index `tag` means that the tag belongs to
// the set when it appears in namespace `ns`.
typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];

// Designated initializers for writing TagSet literals, one macro per
// namespace.
#define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
#define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
#define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
43
44
1.25M
// Brace initializer for a GumboSourcePosition whose fields are all zero.
#define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 }
// The same value as a compound literal, for use where an expression is
// required rather than an initializer.
#define kGumboEmptySourcePosition \
  (const GumboSourcePosition) GUMBO_EMPTY_SOURCE_POSITION_INIT
47
48
const GumboOptions kGumboDefaultOptions = {
49
  .tab_stop = 8,
50
  .stop_on_first_error = false,
51
  .max_attributes = 400,
52
  .max_tree_depth = 400,
53
  .max_errors = -1,
54
  .fragment_context = NULL,
55
  .fragment_namespace = GUMBO_NAMESPACE_HTML,
56
  .fragment_encoding = NULL,
57
  .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
58
  .fragment_context_has_form_ancestor = false,
59
};
60
61
// Initializer for a string piece built from a string literal; the length
// excludes the literal's terminating NUL.
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
// Sentinel entry (NULL data, zero length) that ends a list of string pieces.
#define TERMINATOR {.data = NULL, .length = 0}
63
64
// The doctype arrays have an explicit terminator because we want to pass them
65
// to a helper function, and passing them as a pointer discards sizeof
66
// information. The SVG arrays are used only by one-off functions, and so loops
67
// over them use sizeof directly instead of a terminator.
68
69
static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
70
  STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
71
  STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
72
  STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
73
  STRING("-//IETF//DTD HTML 2.0 Level 1//"),
74
  STRING("-//IETF//DTD HTML 2.0 Level 2//"),
75
  STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
76
  STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
77
  STRING("-//IETF//DTD HTML 2.0 Strict//"),
78
  STRING("-//IETF//DTD HTML 2.0//"),
79
  STRING("-//IETF//DTD HTML 2.1E//"),
80
  STRING("-//IETF//DTD HTML 3.0//"),
81
  STRING("-//IETF//DTD HTML 3.2 Final//"),
82
  STRING("-//IETF//DTD HTML 3.2//"),
83
  STRING("-//IETF//DTD HTML 3//"),
84
  STRING("-//IETF//DTD HTML Level 0//"),
85
  STRING("-//IETF//DTD HTML Level 1//"),
86
  STRING("-//IETF//DTD HTML Level 2//"),
87
  STRING("-//IETF//DTD HTML Level 3//"),
88
  STRING("-//IETF//DTD HTML Strict Level 0//"),
89
  STRING("-//IETF//DTD HTML Strict Level 1//"),
90
  STRING("-//IETF//DTD HTML Strict Level 2//"),
91
  STRING("-//IETF//DTD HTML Strict Level 3//"),
92
  STRING("-//IETF//DTD HTML Strict//"),
93
  STRING("-//IETF//DTD HTML//"),
94
  STRING("-//Metrius//DTD Metrius Presentational//"),
95
  STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
96
  STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
97
  STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
98
  STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
99
  STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
100
  STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
101
  STRING("-//Netscape Comm. Corp.//DTD HTML//"),
102
  STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
103
  STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
104
  STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
105
  STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
106
  STRING(
107
    "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
108
    "extensions to HTML 4.0//"),
109
  STRING(
110
    "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
111
    "extensions to HTML 4.0//"),
112
  STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
113
  STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
114
  STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
115
  STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
116
  STRING("-//W3C//DTD HTML 3 1995-03-24//"),
117
  STRING("-//W3C//DTD HTML 3.2 Draft//"),
118
  STRING("-//W3C//DTD HTML 3.2 Final//"),
119
  STRING("-//W3C//DTD HTML 3.2//"),
120
  STRING("-//W3C//DTD HTML 3.2S Draft//"),
121
  STRING("-//W3C//DTD HTML 4.0 Frameset//"),
122
  STRING("-//W3C//DTD HTML 4.0 Transitional//"),
123
  STRING("-//W3C//DTD HTML Experimental 19960712//"),
124
  STRING("-//W3C//DTD HTML Experimental 970421//"),
125
  STRING("-//W3C//DTD W3 HTML//"),
126
  STRING("-//W3O//DTD W3 HTML 3.0//"),
127
  STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
128
  STRING("-//WebTechs//DTD Mozilla HTML//"),
129
  TERMINATOR
130
};
131
132
static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
133
  STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
134
  STRING("-/W3C/DTD HTML 4.0 Transitional/EN"),
135
  STRING("HTML"),
136
  TERMINATOR
137
};
138
139
static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
140
  STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
141
  TERMINATOR
142
};
143
144
static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
145
  STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
146
  STRING("-//W3C//DTD XHTML 1.0 Transitional//"),
147
  TERMINATOR
148
};
149
150
static const GumboStringPiece kSystemIdDependentPublicIdPrefixes[] = {
151
  STRING("-//W3C//DTD HTML 4.01 Frameset//"),
152
  STRING("-//W3C//DTD HTML 4.01 Transitional//"),
153
  TERMINATOR
154
};
155
156
// Namespace URIs for HTML, SVG and MathML, in that order. The array is
// indexed by GumboNamespaceEnum values, so its order must be kept in sync
// with that enum.
static const char* kLegalXmlns[] = {
  "http://www.w3.org/1999/xhtml",
  "http://www.w3.org/2000/svg",
  "http://www.w3.org/1998/Math/MathML"
};
162
163
// The "scope marker" for the list of active formatting elements. We use a
164
// pointer to this as a generic marker element, since the particular element
165
// scope doesn't matter.
166
static const GumboNode kActiveFormattingScopeMarker;
167
168
// tag_is and tag_in take a bool to select start tags (true) or end tags
// (false). These named constants make call sites readable.
static const bool kStartTag = true;
static const bool kEndTag = false;
172
173
// Because GumboStringPieces are immutable, we can't insert a character directly
174
// into a text node. Instead, we accumulate all pending characters here and
175
// flush them out to a text node whenever a new element is inserted.
176
//
177
// https://html.spec.whatwg.org/multipage/parsing.html#insert-a-character
178
typedef struct _TextNodeBufferState {
179
  // The accumulated text to be inserted into the current text node.
180
  GumboStringBuffer _buffer;
181
182
  // A pointer to the original text represented by this text node. Note that
183
  // because of foster parenting and other strange DOM manipulations, this may
184
  // include other non-text HTML tags in it; it is defined as the span of
185
  // original text from the first character in this text node to the last
186
  // character in this text node.
187
  const char* _start_original_text;
188
189
  // The source position of the start of this text node.
190
  GumboSourcePosition _start_position;
191
192
  // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
193
  GumboNodeType _type;
194
} TextNodeBufferState;
195
196
typedef struct GumboInternalParserState {
197
  // https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode
198
  GumboInsertionMode _insertion_mode;
199
200
  // Used for run_generic_parsing_algorithm, which needs to switch back to the
201
  // original insertion mode at its conclusion.
202
  GumboInsertionMode _original_insertion_mode;
203
204
  // https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
205
  GumboVector /*GumboNode*/ _open_elements;
206
207
  // https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
208
  GumboVector /*GumboNode*/ _active_formatting_elements;
209
210
  // The stack of template insertion modes.
211
  // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
212
  GumboVector /*InsertionMode*/ _template_insertion_modes;
213
214
  // https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers
215
  GumboNode* _head_element;
216
  GumboNode* _form_element;
217
218
  // The element used as fragment context when parsing in fragment mode
219
  GumboNode* _fragment_ctx;
220
221
  // The flag for when the spec says "Reprocess the current token in..."
222
  bool _reprocess_current_token;
223
224
  // The flag for "acknowledge the token's self-closing flag".
225
  bool _self_closing_flag_acknowledged;
226
227
  // The "frameset-ok" flag from the spec.
228
  bool _frameset_ok;
229
230
  // The flag for "If the next token is a LINE FEED, ignore that token...".
231
  bool _ignore_next_linefeed;
232
233
  // The flag for "whenever a node would be inserted into the current node, it
234
  // must instead be foster parented". This is used for misnested table
235
  // content, which needs to be handled according to "in body" rules yet foster
236
  // parented outside of the table.
237
  // It would perhaps be more explicit to have this as a parameter to
238
  // handle_in_body and insert_element, but given how special-purpose this is
239
  // and the number of call-sites that would need to take the extra parameter,
240
  // it's easier just to have a state flag.
241
  bool _foster_parent_insertions;
242
243
  // The accumulated text node buffer state.
244
  TextNodeBufferState _text_node;
245
246
  // The accumulated character tokens in tables for error purposes.
247
  GumboCharacterTokenBuffer _table_character_tokens;
248
249
  // The current token.
250
  GumboToken* _current_token;
251
252
  // The way that the spec is written, the </body> and </html> tags are *always*
253
  // implicit, because encountering one of those tokens merely switches the
254
  // insertion mode out of "in body". So we have individual state flags for
255
  // those end tags that are then inspected by pop_current_node when the <body>
256
  // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
257
  // flag appropriately.
258
  bool _closed_body_tag;
259
  bool _closed_html_tag;
260
} GumboParserState;
261
262
29.6k
static bool token_has_attribute(const GumboToken* token, const char* name) {
263
29.6k
  assert(token->type == GUMBO_TOKEN_START_TAG);
264
0
  return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
265
29.6k
}
266
267
// Checks if the value of the specified attribute is a case-insensitive match
268
// for the specified string.
269
static bool attribute_matches (
270
  const GumboVector* attributes,
271
  const char* name,
272
  const char* value
273
2.59k
) {
274
2.59k
  const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
275
2.59k
  return attr ? gumbo_ascii_strcasecmp(value, attr->value) == 0 : false;
276
2.59k
}
277
278
// Checks if the value of the specified attribute is a case-sensitive match
279
// for the specified string.
280
static bool attribute_matches_case_sensitive (
281
  const GumboVector* attributes,
282
  const char* name,
283
  const char* value
284
13.4k
) {
285
13.4k
  const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
286
13.4k
  return attr ? strcmp(value, attr->value) == 0 : false;
287
13.4k
}
288
289
// Checks if the specified attribute vectors are identical.
290
static bool all_attributes_match (
291
  const GumboVector* attr1,
292
  const GumboVector* attr2
293
234k
) {
294
234k
  unsigned int num_unmatched_attr2_elements = attr2->length;
295
238k
  for (unsigned int i = 0; i < attr1->length; ++i) {
296
13.4k
    const GumboAttribute* attr = attr1->data[i];
297
13.4k
    if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
298
4.01k
      --num_unmatched_attr2_elements;
299
9.43k
    } else {
300
9.43k
      return false;
301
9.43k
    }
302
13.4k
  }
303
224k
  return num_unmatched_attr2_elements == 0;
304
234k
}
305
306
8.50M
// Clears the parser's frameset-ok flag and logs that it did so.
static void set_frameset_not_ok(GumboParser* parser) {
  gumbo_debug("Setting frameset_ok to false.\n");
  GumboParserState* state = parser->_parser_state;
  state->_frameset_ok = false;
}
310
311
1.84M
// Allocates a node of the given type that is not yet attached to a tree: no
// parent, index_within_parent of -1, and normal parse flags. The
// type-specific payload in `v` is left for the caller to fill in.
static GumboNode* create_node(GumboNodeType type) {
  GumboNode* fresh = gumbo_alloc(sizeof *fresh);
  fresh->type = type;
  fresh->parent = NULL;
  fresh->index_within_parent = -1;
  fresh->parse_flags = GUMBO_INSERTION_NORMAL;
  return fresh;
}
319
320
10.0k
static GumboNode* new_document_node() {
321
10.0k
  GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT);
322
10.0k
  document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
323
10.0k
  gumbo_vector_init(1, &document_node->v.document.children);
324
325
  // Must be initialized explicitly, as there's no guarantee that we'll see a
326
  // doc type token.
327
10.0k
  GumboDocument* document = &document_node->v.document;
328
10.0k
  document->has_doctype = false;
329
10.0k
  document->name = NULL;
330
10.0k
  document->public_identifier = NULL;
331
10.0k
  document->system_identifier = NULL;
332
10.0k
  document->doc_type_quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
333
10.0k
  return document_node;
334
10.0k
}
335
336
10.0k
// Allocates the parser's GumboOutput with a fresh document node, no root, no
// document error and an OK status, then initializes the error list.
static void output_init(GumboParser* parser) {
  GumboOutput* result = gumbo_alloc(sizeof *result);
  result->root = NULL;
  result->document = new_document_node();
  result->document_error = false;
  result->status = GUMBO_STATUS_OK;

  parser->_output = result;
  gumbo_init_errors(parser);
}
345
346
10.0k
// Allocates the tree-construction state and attaches it to the parser.
//
// Every field of GumboParserState is given a defined starting value.
// Previously _original_insertion_mode, _self_closing_flag_acknowledged and
// the text node's _start_original_text / _start_position were not set here,
// and create_node's explicit field-by-field setup suggests gumbo_alloc does
// not zero memory (NOTE(review): confirm against gumbo_alloc).
static void parser_state_init(GumboParser* parser) {
  GumboParserState* parser_state = gumbo_alloc(sizeof(GumboParserState));

  // Insertion modes.
  parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
  parser_state->_original_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;

  // Flags.
  parser_state->_reprocess_current_token = false;
  parser_state->_self_closing_flag_acknowledged = false;
  parser_state->_frameset_ok = true;
  parser_state->_ignore_next_linefeed = false;
  parser_state->_foster_parent_insertions = false;

  // Pending text node: empty, typed as whitespace until told otherwise.
  parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
  parser_state->_text_node._start_original_text = NULL;
  parser_state->_text_node._start_position = kGumboEmptySourcePosition;
  gumbo_string_buffer_init(&parser_state->_text_node._buffer);
  gumbo_character_token_buffer_init(&parser_state->_table_character_tokens);

  // Stacks and lists, with their initial capacities.
  gumbo_vector_init(10, &parser_state->_open_elements);
  gumbo_vector_init(5, &parser_state->_active_formatting_elements);
  gumbo_vector_init(5, &parser_state->_template_insertion_modes);

  // Element pointers and the current token.
  parser_state->_head_element = NULL;
  parser_state->_form_element = NULL;
  parser_state->_fragment_ctx = NULL;
  parser_state->_current_token = NULL;

  parser_state->_closed_body_tag = false;
  parser_state->_closed_html_tag = false;
  parser->_parser_state = parser_state;
}
367
368
typedef void (*TreeTraversalCallback)(GumboNode* node);
369
370
10.1k
static void tree_traverse(GumboNode* node, TreeTraversalCallback callback) {
371
10.1k
  GumboNode* current_node = node;
372
10.1k
  unsigned int offset = 0;
373
374
3.89M
tailcall:
375
3.89M
  switch (current_node->type) {
376
21.5k
    case GUMBO_NODE_DOCUMENT:
377
38.5k
    case GUMBO_NODE_TEMPLATE:
378
3.27M
    case GUMBO_NODE_ELEMENT: {
379
3.27M
      GumboVector* children = (current_node->type == GUMBO_NODE_DOCUMENT)
380
3.27M
        ? &current_node->v.document.children
381
3.27M
        : &current_node->v.element.children
382
3.27M
      ;
383
3.27M
      if (offset >= children->length) {
384
1.33M
        assert(offset == children->length);
385
0
        break;
386
1.94M
      } else {
387
1.94M
        current_node = children->data[offset];
388
1.94M
        offset = 0;
389
1.94M
        goto tailcall;
390
1.94M
      }
391
3.27M
    }
392
308k
    case GUMBO_NODE_TEXT:
393
308k
    case GUMBO_NODE_CDATA:
394
508k
    case GUMBO_NODE_COMMENT:
395
620k
    case GUMBO_NODE_WHITESPACE:
396
620k
      assert(offset == 0);
397
0
      break;
398
3.89M
  }
399
400
1.95M
  offset = current_node->index_within_parent + 1;
401
1.95M
  GumboNode* next_node = current_node->parent;
402
1.95M
  callback(current_node);
403
1.95M
  if (current_node == node) {
404
10.1k
    return;
405
10.1k
  }
406
1.94M
  current_node = next_node;
407
1.94M
  goto tailcall;
408
1.95M
}
409
410
1.95M
// Frees a single node together with the storage it owns directly. It does not
// free child nodes; it only releases the child vector's backing array.
static void destroy_node_callback(GumboNode* node) {
  switch (node->type) {
    case GUMBO_NODE_DOCUMENT: {
      GumboDocument* document = &node->v.document;
      gumbo_free((void*) document->children.data);
      gumbo_free((void*) document->name);
      gumbo_free((void*) document->public_identifier);
      gumbo_free((void*) document->system_identifier);
      break;
    }
    case GUMBO_NODE_TEMPLATE:
    case GUMBO_NODE_ELEMENT: {
      GumboVector* attributes = &node->v.element.attributes;
      for (unsigned int i = 0; i < attributes->length; ++i) {
        gumbo_destroy_attribute(attributes->data[i]);
      }
      gumbo_free(attributes->data);
      gumbo_free(node->v.element.children.data);
      // The element name is freed only for unknown tags.
      if (node->v.element.tag == GUMBO_TAG_UNKNOWN) {
        gumbo_free((void *)node->v.element.name);
      }
      break;
    }
    case GUMBO_NODE_TEXT:
    case GUMBO_NODE_CDATA:
    case GUMBO_NODE_COMMENT:
    case GUMBO_NODE_WHITESPACE:
      gumbo_free((void*) node->v.text.text);
      break;
  }
  gumbo_free(node);
}
438
439
10.1k
// Frees `node` and its whole subtree by running destroy_node_callback over
// it with tree_traverse.
static void destroy_node(GumboNode* node) {
  TreeTraversalCallback free_one = &destroy_node_callback;
  tree_traverse(node, free_one);
}
442
443
static void destroy_fragment_ctx_element(GumboNode* ctx);
444
445
10.0k
static void parser_state_destroy(GumboParser* parser) {
446
10.0k
  GumboParserState* state = parser->_parser_state;
447
10.0k
  if (state->_fragment_ctx) {
448
0
    destroy_fragment_ctx_element(state->_fragment_ctx);
449
0
  }
450
10.0k
  gumbo_vector_destroy(&state->_active_formatting_elements);
451
10.0k
  gumbo_vector_destroy(&state->_open_elements);
452
10.0k
  gumbo_vector_destroy(&state->_template_insertion_modes);
453
10.0k
  gumbo_string_buffer_destroy(&state->_text_node._buffer);
454
10.0k
  gumbo_character_token_buffer_destroy(&state->_table_character_tokens);
455
10.0k
  gumbo_free(state);
456
10.0k
}
457
458
31.7k
static GumboNode* get_document_node(const GumboParser* parser) {
459
31.7k
  return parser->_output->document;
460
31.7k
}
461
462
1.07k
static bool is_fragment_parser(const GumboParser* parser) {
463
1.07k
  return !!parser->_parser_state->_fragment_ctx;
464
1.07k
}
465
466
// Returns the node at the bottom of the stack of open elements, or NULL if no
467
// elements have been added yet.
468
40.0M
static GumboNode* get_current_node(const GumboParser* parser) {
469
40.0M
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
470
40.0M
  if (open_elements->length == 0) {
471
37.8k
    assert(!parser->_output->root);
472
0
    return NULL;
473
37.8k
  }
474
39.9M
  assert(open_elements->length > 0);
475
0
  assert(open_elements->data != NULL);
476
0
  return open_elements->data[open_elements->length - 1];
477
40.0M
}
478
479
32.6M
static GumboNode* get_adjusted_current_node(const GumboParser* parser) {
480
32.6M
  const GumboParserState* state = parser->_parser_state;
481
32.6M
  if (state->_open_elements.length == 1 && state->_fragment_ctx) {
482
0
    return state->_fragment_ctx;
483
0
  }
484
32.6M
  return get_current_node(parser);
485
32.6M
}
486
487
// Returns true if the given needle is in the given array of literal
488
// GumboStringPieces. If exact_match is true, this requires that they match
489
// exactly; otherwise, this performs a prefix match to check if any of the
490
// elements in haystack start with needle. This always performs a
491
// case-insensitive match.
492
static bool is_in_static_list (
493
  const GumboStringPiece* needle,
494
  const GumboStringPiece* haystack,
495
  bool exact_match
496
175
) {
497
175
  if (needle->length == 0)
498
69
    return false;
499
106
  if (exact_match) {
500
112
    for (size_t i = 0; haystack[i].data; ++i) {
501
81
      if (gumbo_string_equals_ignore_case(needle, &haystack[i]))
502
0
        return true;
503
81
    }
504
75
  } else {
505
1.55k
    for (size_t i = 0; haystack[i].data; ++i) {
506
1.47k
      if (gumbo_string_prefix_ignore_case(&haystack[i], needle))
507
0
        return true;
508
1.47k
    }
509
75
  }
510
106
  return false;
511
106
}
512
513
184k
// Makes `mode` the parser's current insertion mode.
static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
  GumboParserState* state = parser->_parser_state;
  state->_insertion_mode = mode;
}
516
517
static void push_template_insertion_mode (
518
  GumboParser* parser,
519
  GumboInsertionMode mode
520
5.39k
) {
521
5.39k
  gumbo_vector_add (
522
5.39k
    (void*) mode,
523
5.39k
    &parser->_parser_state->_template_insertion_modes
524
5.39k
  );
525
5.39k
}
526
527
5.01k
// Removes the top entry of the stack of template insertion modes.
static void pop_template_insertion_mode(GumboParser* parser) {
  GumboVector* modes = &parser->_parser_state->_template_insertion_modes;
  gumbo_vector_pop(modes);
}
530
531
// Returns the current template insertion mode. If the stack of template
532
// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
533
static GumboInsertionMode get_current_template_insertion_mode (
534
  const GumboParser* parser
535
13.6k
) {
536
13.6k
  GumboVector* modes = &parser->_parser_state->_template_insertion_modes;
537
13.6k
  if (modes->length == 0) {
538
9.85k
    return GUMBO_INSERTION_MODE_INITIAL;
539
9.85k
  }
540
3.77k
  return (GumboInsertionMode)(intptr_t) modes->data[(modes->length - 1)];
541
13.6k
}
542
543
// Returns true if the specified token is either a start or end tag
544
// (specified by is_start) with one of the tag types in the TagSet.
545
static bool tag_in (
546
  const GumboToken* token,
547
  bool is_start,
548
  const TagSet* tags
549
25.4M
) {
550
25.4M
  GumboTag token_tag;
551
25.4M
  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
552
8.89M
    token_tag = token->v.start_tag.tag;
553
16.5M
  } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
554
104k
    token_tag = token->v.end_tag.tag;
555
16.4M
  } else {
556
16.4M
    return false;
557
16.4M
  }
558
9.00M
  return (*tags)[(unsigned) token_tag] != 0u;
559
25.4M
}
560
561
// Like tag_in, but for the single-tag case.
562
68.7M
static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
563
68.7M
  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
564
13.1M
    return token->v.start_tag.tag == tag;
565
13.1M
  }
566
55.5M
  if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
567
263k
    return token->v.end_tag.tag == tag;
568
263k
  }
569
55.3M
  return false;
570
55.5M
}
571
572
static inline bool tagset_includes (
573
  const TagSet* tagset,
574
  GumboNamespaceEnum ns,
575
  GumboTag tag
576
10.5M
) {
577
10.5M
  return ((*tagset)[(unsigned) tag] & (1u << (unsigned) ns)) != 0u;
578
10.5M
}
579
580
// Like tag_in, but checks for the tag of a node, rather than a token.
581
7.64M
static bool node_tag_in_set(const GumboNode* node, const TagSet* tags) {
582
7.64M
  assert(node != NULL);
583
7.64M
  if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
584
0
    return false;
585
0
  }
586
7.64M
  return tagset_includes (
587
7.64M
    tags,
588
7.64M
    node->v.element.tag_namespace,
589
7.64M
    node->v.element.tag
590
7.64M
  );
591
7.64M
}
592
593
static bool node_qualified_tagname_is (
594
  const GumboNode* node,
595
  GumboNamespaceEnum ns,
596
  GumboTag tag,
597
  const char *name
598
1.65M
) {
599
1.65M
  assert(node);
600
0
  assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
601
0
  assert(node->v.element.name);
602
0
  assert(tag != GUMBO_TAG_UNKNOWN || name);
603
0
  GumboTag element_tag = node->v.element.tag;
604
1.65M
  const char *element_name = node->v.element.name;
605
1.65M
  assert(element_tag != GUMBO_TAG_UNKNOWN || element_name);
606
1.65M
  if (node->v.element.tag_namespace != ns || element_tag != tag)
607
901k
    return false;
608
756k
  if (tag != GUMBO_TAG_UNKNOWN)
609
743k
    return true;
610
12.3k
  return !gumbo_ascii_strcasecmp(element_name, name);
611
756k
}
612
613
static bool node_html_tagname_is (
614
  const GumboNode* node,
615
  GumboTag tag,
616
  const char *name
617
497k
) {
618
497k
  return node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, name);
619
497k
}
620
621
static bool node_tagname_is (
622
  const GumboNode* node,
623
  GumboTag tag,
624
  const char *name
625
10.0k
) {
626
10.0k
  assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
627
0
  return node_qualified_tagname_is(node, node->v.element.tag_namespace, tag, name);
628
10.0k
}
629
630
// Like node_tag_in, but for the single-tag case.
631
static bool node_qualified_tag_is (
632
  const GumboNode* node,
633
  GumboNamespaceEnum ns,
634
  GumboTag tag
635
16.2M
) {
636
16.2M
  assert(node);
637
0
  assert(tag != GUMBO_TAG_UNKNOWN);
638
0
  assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
639
0
  return
640
16.2M
    node->v.element.tag == tag
641
16.2M
    && node->v.element.tag_namespace == ns;
642
16.2M
}
643
644
// Like node_tag_in, but for the single-tag case in the HTML namespace
645
12.7M
static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
646
12.7M
  return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
647
12.7M
}
648
649
// https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
650
// This is a helper function that returns the appropriate insertion mode instead
651
// of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
652
// indicate that there is no appropriate insertion mode, and the loop should
653
// continue.
654
static GumboInsertionMode get_appropriate_insertion_mode (
655
  const GumboParser* parser,
656
  int index
657
351k
) {
658
351k
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
659
351k
  const GumboNode* node = open_elements->data[index];
660
351k
  const bool is_last = index == 0;
661
662
351k
  if (is_last && is_fragment_parser(parser)) {
663
0
    node = parser->_parser_state->_fragment_ctx;
664
0
  }
665
666
351k
  assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
667
351k
  if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) {
668
1.45k
    return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
669
1.45k
  }
670
671
350k
  switch (node->v.element.tag) {
672
188
    case GUMBO_TAG_SELECT: {
673
188
      if (is_last) {
674
0
        return GUMBO_INSERTION_MODE_IN_SELECT;
675
0
      }
676
3.15k
      for (int i = index; i > 0; --i) {
677
3.11k
        const GumboNode* ancestor = open_elements->data[i];
678
3.11k
        if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
679
108
          return GUMBO_INSERTION_MODE_IN_SELECT;
680
108
        }
681
3.00k
        if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
682
40
          return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
683
40
        }
684
3.00k
      }
685
40
      return GUMBO_INSERTION_MODE_IN_SELECT;
686
188
    }
687
6.50k
    case GUMBO_TAG_TD:
688
7.17k
    case GUMBO_TAG_TH:
689
7.17k
      if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
690
0
      break;
691
1.27k
    case GUMBO_TAG_TR:
692
1.27k
      return GUMBO_INSERTION_MODE_IN_ROW;
693
201
    case GUMBO_TAG_TBODY:
694
333
    case GUMBO_TAG_THEAD:
695
601
    case GUMBO_TAG_TFOOT:
696
601
      return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
697
382
    case GUMBO_TAG_CAPTION:
698
382
      return GUMBO_INSERTION_MODE_IN_CAPTION;
699
115
    case GUMBO_TAG_COLGROUP:
700
115
      return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
701
563
    case GUMBO_TAG_TABLE:
702
563
      return GUMBO_INSERTION_MODE_IN_TABLE;
703
2.16k
    case GUMBO_TAG_TEMPLATE:
704
2.16k
      return get_current_template_insertion_mode(parser);
705
186
    case GUMBO_TAG_HEAD:
706
186
      if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
707
0
      break;
708
7.89k
    case GUMBO_TAG_BODY:
709
7.89k
      return GUMBO_INSERTION_MODE_IN_BODY;
710
0
    case GUMBO_TAG_FRAMESET:
711
0
      return GUMBO_INSERTION_MODE_IN_FRAMESET;
712
161
    case GUMBO_TAG_HTML:
713
161
      return parser->_parser_state->_head_element
714
161
        ? GUMBO_INSERTION_MODE_AFTER_HEAD
715
161
        : GUMBO_INSERTION_MODE_BEFORE_HEAD;
716
329k
    default:
717
329k
      break;
718
350k
  }
719
329k
  return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
720
350k
}
721
722
// This performs the actual "reset the insertion mode" loop.
723
20.7k
static void reset_insertion_mode_appropriately(GumboParser* parser) {
724
20.7k
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
725
351k
  for (int i = open_elements->length; --i >= 0;) {
726
351k
    GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i);
727
351k
    if (mode != GUMBO_INSERTION_MODE_INITIAL) {
728
20.7k
      set_insertion_mode(parser, mode);
729
20.7k
      return;
730
20.7k
    }
731
351k
  }
732
  // Should never get here, because is_last will be set on the last iteration
733
  // and will force GUMBO_INSERTION_MODE_IN_BODY.
734
0
  assert(0);
735
0
}
736
737
static void parser_add_parse_error (
738
  GumboParser* parser,
739
  const GumboToken* token
740
5.83M
) {
741
5.83M
  gumbo_debug("Adding parse error.\n");
742
5.83M
  GumboError* error = gumbo_add_error(parser);
743
5.83M
  if (!error) {
744
0
    return;
745
0
  }
746
5.83M
  error->type = GUMBO_ERR_PARSER;
747
5.83M
  error->position = token->position;
748
5.83M
  error->original_text = token->original_text;
749
5.83M
  GumboParserError* extra_data = &error->v.parser;
750
5.83M
  extra_data->input_type = token->type;
751
5.83M
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
752
5.83M
  if (token->type == GUMBO_TOKEN_START_TAG) {
753
936k
    extra_data->input_tag = token->v.start_tag.tag;
754
4.89M
  } else if (token->type == GUMBO_TOKEN_END_TAG) {
755
43.2k
    extra_data->input_tag = token->v.end_tag.tag;
756
43.2k
  }
757
5.83M
  const GumboParserState* state = parser->_parser_state;
758
5.83M
  extra_data->parser_state = state->_insertion_mode;
759
5.83M
  gumbo_vector_init(state->_open_elements.length, &extra_data->tag_stack);
760
429M
  for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
761
423M
    const GumboNode* node = state->_open_elements.data[i];
762
423M
    assert (
763
423M
      node->type == GUMBO_NODE_ELEMENT
764
423M
      || node->type == GUMBO_NODE_TEMPLATE
765
423M
    );
766
0
    gumbo_vector_add (
767
423M
      (void*) node->v.element.tag,
768
423M
      &extra_data->tag_stack
769
423M
    );
770
423M
  }
771
5.83M
}
772
773
// https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point
774
2.08M
static bool is_mathml_integration_point(const GumboNode* node) {
775
2.08M
  static const TagSet mathml_integration_point_tags = {
776
2.08M
    TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
777
2.08M
    TAG_MATHML(MS), TAG_MATHML(MTEXT)
778
2.08M
  };
779
2.08M
  return node_tag_in_set(node, &mathml_integration_point_tags);
780
2.08M
}
781
782
// https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point
783
2.07M
static bool is_html_integration_point(const GumboNode* node) {
784
2.07M
  static const TagSet html_integration_point_svg_tags = {
785
2.07M
      TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)
786
2.07M
  };
787
2.07M
  if (node_tag_in_set(node, &html_integration_point_svg_tags)) {
788
6.52k
    return true;
789
6.52k
  }
790
791
2.07M
  const bool is_mathml_annotation_xml_element = node_qualified_tag_is (
792
2.07M
    node,
793
2.07M
    GUMBO_NAMESPACE_MATHML,
794
2.07M
    GUMBO_TAG_ANNOTATION_XML
795
2.07M
  );
796
2.07M
  const GumboVector* attributes = &node->v.element.attributes;
797
2.07M
  if (
798
2.07M
    is_mathml_annotation_xml_element
799
2.07M
    && (
800
584
      attribute_matches(attributes, "encoding", "text/html")
801
584
      || attribute_matches(attributes, "encoding", "application/xhtml+xml")
802
584
    )
803
2.07M
  ) {
804
0
    return true;
805
0
  }
806
807
2.07M
  return false;
808
2.07M
}
809
810
// This represents a place to insert a node, consisting of a target parent and a
811
// child index within that parent. If the node should be inserted at the end of
812
// the parent's child, index will be -1.
813
typedef struct {
814
  GumboNode* target;
815
  int index;
816
} InsertionLocation;
817
818
static InsertionLocation get_appropriate_insertion_location (
819
  const GumboParser* parser,
820
  GumboNode* override_target
821
1.73M
) {
822
1.73M
  InsertionLocation retval = {override_target, -1};
823
1.73M
  if (retval.target == NULL) {
824
    // No override target; default to the current node, but special-case the
825
    // root node since get_current_node() assumes the stack of open elements is
826
    // non-empty.
827
1.72M
    retval.target = (parser->_output->root != NULL)
828
1.72M
      ? get_current_node(parser)
829
1.72M
      : get_document_node(parser)
830
1.72M
    ;
831
1.72M
  }
832
1.73M
  if (
833
1.73M
    !parser->_parser_state->_foster_parent_insertions
834
1.73M
    || !node_tag_in_set(retval.target, &(const TagSet) {
835
1.02M
      TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
836
1.02M
    })
837
1.73M
  ) {
838
1.21M
    return retval;
839
1.21M
  }
840
841
  // Foster-parenting case.
842
519k
  int last_template_index = -1;
843
519k
  int last_table_index = -1;
844
519k
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
845
4.49M
  for (unsigned int i = 0; i < open_elements->length; ++i) {
846
3.97M
    if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
847
21.9k
      last_template_index = i;
848
21.9k
    }
849
3.97M
    if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
850
543k
      last_table_index = i;
851
543k
    }
852
3.97M
  }
853
519k
  if (
854
519k
    last_template_index != -1
855
519k
    && (last_table_index == -1 || last_template_index > last_table_index)
856
519k
  ) {
857
1.20k
    retval.target = open_elements->data[last_template_index];
858
1.20k
    return retval;
859
1.20k
  }
860
518k
  if (last_table_index == -1) {
861
0
    retval.target = open_elements->data[0];
862
0
    return retval;
863
0
  }
864
518k
  const GumboNode* last_table = open_elements->data[last_table_index];
865
518k
  if (last_table->parent != NULL) {
866
518k
    retval.target = last_table->parent;
867
518k
    retval.index = last_table->index_within_parent;
868
518k
    return retval;
869
518k
  }
870
871
0
  retval.target = open_elements->data[last_table_index - 1];
872
0
  return retval;
873
518k
}
874
875
// Appends a node to the end of its parent, setting the "parent" and
876
// "index_within_parent" fields appropriately.
877
1.43M
static void append_node(GumboNode* parent, GumboNode* node) {
878
1.43M
  assert(node->parent == NULL);
879
0
  assert(node->index_within_parent == (unsigned int) -1);
880
0
  GumboVector* children;
881
1.43M
  if (
882
1.43M
    parent->type == GUMBO_NODE_ELEMENT
883
1.43M
    || parent->type == GUMBO_NODE_TEMPLATE
884
1.43M
  ) {
885
1.42M
    children = &parent->v.element.children;
886
1.42M
  } else {
887
11.4k
    assert(parent->type == GUMBO_NODE_DOCUMENT);
888
0
    children = &parent->v.document.children;
889
11.4k
  }
890
0
  node->parent = parent;
891
1.43M
  node->index_within_parent = children->length;
892
1.43M
  gumbo_vector_add((void*) node, children);
893
1.43M
  assert(node->index_within_parent < children->length);
894
1.43M
}
895
896
// Inserts a node at the specified InsertionLocation, updating the
897
// "parent" and "index_within_parent" fields of it and all its siblings.
898
// If the index of the location is -1, this calls append_node.
899
1.73M
static void insert_node(GumboNode* node, InsertionLocation location) {
900
1.73M
  assert(node->parent == NULL);
901
0
  assert(node->index_within_parent == (unsigned int) -1);
902
0
  GumboNode* parent = location.target;
903
1.73M
  int index = location.index;
904
1.73M
  if (index != -1) {
905
518k
    GumboVector* children = NULL;
906
518k
    if (
907
518k
      parent->type == GUMBO_NODE_ELEMENT
908
518k
      || parent->type == GUMBO_NODE_TEMPLATE
909
518k
    ) {
910
518k
      children = &parent->v.element.children;
911
518k
    } else if (parent->type == GUMBO_NODE_DOCUMENT) {
912
0
      children = &parent->v.document.children;
913
0
      assert(children->length == 0);
914
0
    } else {
915
0
      assert(0);
916
0
    }
917
918
0
    assert(index >= 0);
919
0
    assert((unsigned int) index < children->length);
920
0
    node->parent = parent;
921
518k
    node->index_within_parent = index;
922
518k
    gumbo_vector_insert_at((void*) node, index, children);
923
518k
    assert(node->index_within_parent < children->length);
924
1.03M
    for (unsigned int i = index + 1; i < children->length; ++i) {
925
518k
      GumboNode* sibling = children->data[i];
926
518k
      sibling->index_within_parent = i;
927
518k
      assert(sibling->index_within_parent < children->length);
928
518k
    }
929
1.21M
  } else {
930
1.21M
    append_node(parent, node);
931
1.21M
  }
932
1.73M
}
933
934
2.75M
// Turns the pending text buffer, if it is non-empty, into a whitespace, text
// or CDATA node and inserts it at the appropriate insertion location. A node
// that would land directly under the document is destroyed instead. The buffer
// is then cleared and its type reset to whitespace.
static void maybe_flush_text_node_buffer(GumboParser* parser) {
  GumboParserState* state = parser->_parser_state;
  TextNodeBufferState* pending = &state->_text_node;
  if (pending->_buffer.length == 0) {
    return;
  }

  assert (
    pending->_type == GUMBO_NODE_WHITESPACE
    || pending->_type == GUMBO_NODE_TEXT
    || pending->_type == GUMBO_NODE_CDATA
  );
  GumboNode* text_node = create_node(pending->_type);
  GumboText* text = &text_node->v.text;
  text->text = gumbo_string_buffer_to_string(&pending->_buffer);
  // The original text runs from where buffering started up to the start of
  // the current token.
  const char* original_start = pending->_start_original_text;
  text->original_text.data = original_start;
  text->original_text.length =
    state->_current_token->original_text.data - original_start;
  text->start_pos = pending->_start_position;

  gumbo_debug (
    "Flushing text node buffer of %.*s.\n",
    (int) pending->_buffer.length,
    pending->_buffer.data
  );

  InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
  if (location.target->type != GUMBO_NODE_DOCUMENT) {
    insert_node(text_node, location);
  } else {
    // The DOM does not allow Document nodes to have Text children, so per the
    // spec the text is dropped.
    destroy_node(text_node);
  }

  gumbo_string_buffer_clear(&pending->_buffer);
  pending->_type = GUMBO_NODE_WHITESPACE;
  assert(pending->_buffer.length == 0);
}
974
975
static void record_end_of_element (
976
  const GumboToken* current_token,
977
  GumboElement* element
978
1.29M
) {
979
1.29M
  element->end_pos = current_token->position;
980
1.29M
  element->original_end_tag =
981
1.29M
    (current_token->type == GUMBO_TOKEN_END_TAG)
982
1.29M
      ? current_token->original_text
983
1.29M
      : kGumboEmptyString;
984
1.29M
}
985
986
1.31M
// Flushes any pending text, then pops the current node off the stack of open
// elements and returns it, or NULL if the stack was empty.
//
// Unless the popped node is a <body> or <html> whose closed flag is already
// set in the parser state, its end position is recorded from the current
// token, and it is flagged as having an implicit end tag when the current
// token is not an end tag naming that same HTML element.
static GumboNode* pop_current_node(GumboParser* parser) {
  GumboParserState* state = parser->_parser_state;
  GumboVector* open_elements = &state->_open_elements;
  maybe_flush_text_node_buffer(parser);
  if (open_elements->length > 0) {
    assert(node_html_tag_is(open_elements->data[0], GUMBO_TAG_HTML));
    gumbo_debug (
      "Popping %s node.\n",
      gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)
    );
  }

  GumboNode* popped = gumbo_vector_pop(open_elements);
  if (popped == NULL) {
    assert(open_elements->length == 0);
    return NULL;
  }
  assert (
    popped->type == GUMBO_NODE_ELEMENT
    || popped->type == GUMBO_NODE_TEMPLATE
  );

  const bool already_closed_body =
    node_html_tag_is(popped, GUMBO_TAG_BODY) && state->_closed_body_tag;
  const bool already_closed_html =
    node_html_tag_is(popped, GUMBO_TAG_HTML) && state->_closed_html_tag;
  const bool already_closed = already_closed_body || already_closed_html;

  const GumboToken* token = state->_current_token;
  const bool is_own_end_tag =
    token->type == GUMBO_TOKEN_END_TAG
    && node_qualified_tagname_is (
      popped,
      GUMBO_NAMESPACE_HTML,
      token->v.end_tag.tag,
      token->v.end_tag.name
    );

  if (!already_closed) {
    if (!is_own_end_tag) {
      popped->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
    }
    record_end_of_element(token, &popped->v.element);
  }
  return popped;
}
1033
1034
static void append_comment_node (
1035
  GumboParser* parser,
1036
  GumboNode* node,
1037
  const GumboToken* token
1038
200k
) {
1039
200k
  maybe_flush_text_node_buffer(parser);
1040
200k
  GumboNode* comment = create_node(GUMBO_NODE_COMMENT);
1041
200k
  comment->type = GUMBO_NODE_COMMENT;
1042
200k
  comment->parse_flags = GUMBO_INSERTION_NORMAL;
1043
200k
  comment->v.text.text = token->v.text;
1044
200k
  comment->v.text.original_text = token->original_text;
1045
200k
  comment->v.text.start_pos = token->position;
1046
200k
  append_node(node, comment);
1047
200k
}
1048
1049
// https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-row-context
1050
27.4k
static void clear_stack_to_table_row_context(GumboParser* parser) {
1051
27.4k
  static const TagSet tags = {TAG(HTML), TAG(TR), TAG(TEMPLATE)};
1052
56.9k
  while (!node_tag_in_set(get_current_node(parser), &tags)) {
1053
29.5k
    pop_current_node(parser);
1054
29.5k
  }
1055
27.4k
}
1056
1057
// https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-context
1058
8.60k
static void clear_stack_to_table_context(GumboParser* parser) {
1059
8.60k
  static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)};
1060
24.1k
  while (!node_tag_in_set(get_current_node(parser), &tags)) {
1061
15.5k
    pop_current_node(parser);
1062
15.5k
  }
1063
8.60k
}
1064
1065
// https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-body-context
1066
8.78k
static void clear_stack_to_table_body_context(GumboParser* parser) {
1067
8.78k
  static const TagSet tags = {
1068
8.78k
    TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE)
1069
8.78k
  };
1070
29.0k
  while (!node_tag_in_set(get_current_node(parser), &tags)) {
1071
20.2k
    pop_current_node(parser);
1072
20.2k
  }
1073
8.78k
}
1074
1075
// Creates a parser-inserted element in the HTML namespace and returns it.
1076
36.3k
static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
1077
  // XXX: This will fail for creating fragments with an element with tag
1078
  // GUMBO_TAG_UNKNOWN
1079
36.3k
  assert(tag != GUMBO_TAG_UNKNOWN);
1080
0
  GumboNode* node = create_node(GUMBO_NODE_ELEMENT);
1081
36.3k
  GumboElement* element = &node->v.element;
1082
36.3k
  gumbo_vector_init(1, &element->children);
1083
36.3k
  gumbo_vector_init(0, &element->attributes);
1084
36.3k
  element->tag = tag;
1085
36.3k
  element->name = gumbo_normalized_tagname(tag);
1086
36.3k
  element->tag_namespace = GUMBO_NAMESPACE_HTML;
1087
36.3k
  element->original_tag = kGumboEmptyString;
1088
36.3k
  element->original_end_tag = kGumboEmptyString;
1089
36.3k
  element->start_pos = (parser->_parser_state->_current_token)
1090
36.3k
    ? parser->_parser_state->_current_token->position
1091
36.3k
    : kGumboEmptySourcePosition
1092
36.3k
  ;
1093
36.3k
  element->end_pos = kGumboEmptySourcePosition;
1094
36.3k
  return node;
1095
36.3k
}
1096
1097
// Constructs an element from the given start tag token.
1098
static GumboNode* create_element_from_token (
1099
  GumboToken* token,
1100
  GumboNamespaceEnum tag_namespace
1101
1.17M
) {
1102
1.17M
  assert(token->type == GUMBO_TOKEN_START_TAG);
1103
0
  GumboTokenStartTag* start_tag = &token->v.start_tag;
1104
1105
1.17M
  GumboNodeType type =
1106
1.17M
    (
1107
1.17M
      tag_namespace == GUMBO_NAMESPACE_HTML
1108
1.17M
      && start_tag->tag == GUMBO_TAG_TEMPLATE
1109
1.17M
    )
1110
1.17M
    ? GUMBO_NODE_TEMPLATE
1111
1.17M
    : GUMBO_NODE_ELEMENT
1112
1.17M
  ;
1113
1114
1.17M
  GumboNode* node = create_node(type);
1115
1.17M
  GumboElement* element = &node->v.element;
1116
1.17M
  gumbo_vector_init(1, &element->children);
1117
1.17M
  element->attributes = start_tag->attributes;
1118
1.17M
  element->tag = start_tag->tag;
1119
1.17M
  element->name = start_tag->name ? start_tag->name : gumbo_normalized_tagname(start_tag->tag);
1120
1.17M
  element->tag_namespace = tag_namespace;
1121
1122
1.17M
  assert(token->original_text.length >= 2);
1123
0
  assert(token->original_text.data[0] == '<');
1124
0
  assert(token->original_text.data[token->original_text.length - 1] == '>');
1125
0
  element->original_tag = token->original_text;
1126
1.17M
  element->start_pos = token->position;
1127
1.17M
  element->original_end_tag = kGumboEmptyString;
1128
1.17M
  element->end_pos = kGumboEmptySourcePosition;
1129
1130
  // The element takes ownership of the attributes and name from the token, so
1131
  // any allocated-memory fields should be nulled out.
1132
1.17M
  start_tag->attributes = kGumboEmptyVector;
1133
1.17M
  start_tag->name = NULL;
1134
1.17M
  return node;
1135
1.17M
}
1136
1137
// https://html.spec.whatwg.org/multipage/parsing.html#insert-an-html-element
1138
static void insert_element (
1139
  GumboParser* parser,
1140
  GumboNode* node,
1141
  bool is_reconstructing_formatting_elements
1142
1.21M
) {
1143
1.21M
  GumboParserState* state = parser->_parser_state;
1144
  // NOTE(jdtang): The text node buffer must always be flushed before inserting
1145
  // a node, otherwise we're handling nodes in a different order than the spec
1146
  // mandated. However, one clause of the spec (character tokens in the body)
1147
  // requires that we reconstruct the active formatting elements *before* adding
1148
  // the character, and reconstructing the active formatting elements may itself
1149
  // result in the insertion of new elements (which should be pushed onto the
1150
  // stack of open elements before the buffer is flushed). We solve this (for
1151
  // the time being, the spec has been rewritten for <template> and the new
1152
  // version may be simpler here) with a boolean flag to this method.
1153
1.21M
  if (!is_reconstructing_formatting_elements) {
1154
1.21M
    maybe_flush_text_node_buffer(parser);
1155
1.21M
  }
1156
1.21M
  InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1157
1.21M
  insert_node(node, location);
1158
1.21M
  gumbo_vector_add((void*) node, &state->_open_elements);
1159
1.21M
}
1160
1161
// Convenience method that combines create_element_from_token and
1162
// insert_element, inserting the generated element directly into the current
1163
// node. Returns the node inserted.
1164
static GumboNode* insert_element_from_token (
1165
  GumboParser* parser,
1166
  GumboToken* token
1167
1.16M
) {
1168
1.16M
  GumboNode* element = create_element_from_token(token, GUMBO_NAMESPACE_HTML);
1169
1.16M
  insert_element(parser, element, false);
1170
1.16M
  gumbo_debug (
1171
1.16M
    "Inserting <%s> element (@%p) from token.\n",
1172
1.16M
    gumbo_normalized_tagname(element->v.element.tag),
1173
1.16M
    (void*)element
1174
1.16M
  );
1175
1.16M
  return element;
1176
1.16M
}
1177
1178
// Convenience method that combines create_element and insert_element, inserting
1179
// a parser-generated element of a specific tag type. Returns the node
1180
// inserted.
1181
static GumboNode* insert_element_of_tag_type (
1182
  GumboParser* parser,
1183
  GumboTag tag,
1184
  GumboParseFlags reason
1185
36.3k
) {
1186
36.3k
  GumboNode* element = create_element(parser, tag);
1187
36.3k
  element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1188
36.3k
  insert_element(parser, element, false);
1189
36.3k
  gumbo_debug (
1190
36.3k
    "Inserting %s element (@%p) from tag type.\n",
1191
36.3k
    gumbo_normalized_tagname(tag),
1192
36.3k
    (void*)element
1193
36.3k
  );
1194
36.3k
  return element;
1195
36.3k
}
1196
1197
// Convenience method for creating foreign namespaced element. Returns the node
1198
// inserted.
1199
static GumboNode* insert_foreign_element (
1200
  GumboParser* parser,
1201
  GumboToken* token,
1202
  GumboNamespaceEnum tag_namespace
1203
14.4k
) {
1204
14.4k
  assert(token->type == GUMBO_TOKEN_START_TAG);
1205
0
  GumboNode* element = create_element_from_token(token, tag_namespace);
1206
14.4k
  insert_element(parser, element, false);
1207
14.4k
  if (
1208
14.4k
    token_has_attribute(token, "xmlns")
1209
14.4k
    && !attribute_matches_case_sensitive (
1210
0
      &token->v.start_tag.attributes,
1211
0
      "xmlns",
1212
0
      kLegalXmlns[tag_namespace]
1213
0
    )
1214
14.4k
  ) {
1215
    // TODO(jdtang): Since there're multiple possible error codes here, we
1216
    // eventually need reason codes to differentiate them.
1217
0
    parser_add_parse_error(parser, token);
1218
0
  }
1219
14.4k
  if (
1220
14.4k
    token_has_attribute(token, "xmlns:xlink")
1221
14.4k
    && !attribute_matches_case_sensitive (
1222
0
      &token->v.start_tag.attributes,
1223
0
      "xmlns:xlink",
1224
0
      "http://www.w3.org/1999/xlink"
1225
0
    )
1226
14.4k
  ) {
1227
0
    parser_add_parse_error(parser, token);
1228
0
  }
1229
14.4k
  return element;
1230
14.4k
}
1231
1232
12.1M
// Appends the code point of a whitespace, character, NULL or CDATA token to
// the pending text buffer. The first code point added to an empty buffer also
// records where the text starts. A character token switches the buffer type to
// text and a CDATA token switches it to CDATA; other tokens leave the type
// unchanged.
static void insert_text_token(GumboParser* parser, GumboToken* token) {
  assert (
    token->type == GUMBO_TOKEN_WHITESPACE
    || token->type == GUMBO_TOKEN_CHARACTER
    || token->type == GUMBO_TOKEN_NULL
    || token->type == GUMBO_TOKEN_CDATA
  );

  TextNodeBufferState* pending = &parser->_parser_state->_text_node;
  const bool starts_new_run = pending->_buffer.length == 0;
  if (starts_new_run) {
    // Initialize position fields.
    pending->_start_original_text = token->original_text.data;
    pending->_start_position = token->position;
  }
  gumbo_string_buffer_append_codepoint(token->v.character, &pending->_buffer);

  switch (token->type) {
    case GUMBO_TOKEN_CHARACTER:
      pending->_type = GUMBO_NODE_TEXT;
      break;
    case GUMBO_TOKEN_CDATA:
      pending->_type = GUMBO_NODE_CDATA;
      break;
    default:
      // Whitespace and NULL tokens keep whatever type the buffer has.
      break;
  }
  gumbo_debug("Inserting text token '%c'.\n", token->v.character);
}
1256
1257
// https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm
1258
static void run_generic_parsing_algorithm (
1259
  GumboParser* parser,
1260
  GumboToken* token,
1261
  GumboTokenizerEnum lexer_state
1262
2.91k
) {
1263
2.91k
  insert_element_from_token(parser, token);
1264
2.91k
  gumbo_tokenizer_set_state(parser, lexer_state);
1265
2.91k
  GumboParserState* parser_state = parser->_parser_state;
1266
2.91k
  parser_state->_original_insertion_mode = parser_state->_insertion_mode;
1267
2.91k
  parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1268
2.91k
}
1269
1270
7.31k
// Marks the self-closing flag of the token being processed as acknowledged in
// the parser state.
static void acknowledge_self_closing_tag(GumboParser* parser) {
  GumboParserState* state = parser->_parser_state;
  state->_self_closing_flag_acknowledged = true;
}
1273
1274
// Returns true if there's an anchor tag in the list of active formatting
1275
// elements, and fills in its index if so.
1276
89.9k
static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1277
89.9k
  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1278
110k
  for (int i = elements->length; --i >= 0;) {
1279
107k
    GumboNode* node = elements->data[i];
1280
107k
    if (node == &kActiveFormattingScopeMarker) {
1281
60.3k
      return false;
1282
60.3k
    }
1283
47.3k
    if (node_html_tag_is(node, GUMBO_TAG_A)) {
1284
26.9k
      *anchor_index = i;
1285
26.9k
      return true;
1286
26.9k
    }
1287
47.3k
  }
1288
2.70k
  return false;
1289
89.9k
}
1290
1291
// Counts the number of open formatting elements in the list of active
1292
// formatting elements (after the last active scope marker) that have a specific
1293
// tag. If this is > 0, then earliest_matching_index will be filled in with the
1294
// index of the first such element.
1295
static int count_formatting_elements_of_tag (
1296
  GumboParser* parser,
1297
  const GumboNode* desired_node,
1298
  int* earliest_matching_index
1299
388k
) {
1300
388k
  const GumboElement* desired_element = &desired_node->v.element;
1301
388k
  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1302
388k
  int num_identical_elements = 0;
1303
1.37M
  for (int i = elements->length; --i >= 0;) {
1304
1.34M
    GumboNode* node = elements->data[i];
1305
1.34M
    if (node == &kActiveFormattingScopeMarker) {
1306
362k
      break;
1307
362k
    }
1308
986k
    assert(node->type == GUMBO_NODE_ELEMENT);
1309
0
    if (
1310
986k
      node_qualified_tagname_is (
1311
986k
        node,
1312
986k
        desired_element->tag_namespace,
1313
986k
        desired_element->tag,
1314
986k
        desired_element->name
1315
986k
      )
1316
986k
      && all_attributes_match(&node->v.element.attributes, &desired_element->attributes)
1317
986k
    ) {
1318
223k
      num_identical_elements++;
1319
223k
      *earliest_matching_index = i;
1320
223k
    }
1321
986k
  }
1322
388k
  return num_identical_elements;
1323
388k
}
1324
1325
// https://html.spec.whatwg.org/multipage/parsing.html#reconstruct-the-active-formatting-elements
1326
388k
static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1327
388k
  assert (
1328
388k
    node == &kActiveFormattingScopeMarker
1329
388k
    || node->type == GUMBO_NODE_ELEMENT
1330
388k
  );
1331
0
  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1332
388k
  if (node == &kActiveFormattingScopeMarker) {
1333
92.3k
    gumbo_debug("Adding a scope marker.\n");
1334
295k
  } else {
1335
295k
    gumbo_debug("Adding a formatting element.\n");
1336
295k
  }
1337
1338
  // Hunt for identical elements.
1339
388k
  int earliest_identical_element = elements->length;
1340
388k
  int num_identical_elements = count_formatting_elements_of_tag (
1341
388k
    parser,
1342
388k
    node,
1343
388k
    &earliest_identical_element
1344
388k
  );
1345
1346
  // Noah's Ark clause: if there're at least 3, remove the earliest.
1347
388k
  if (num_identical_elements >= 3) {
1348
35.0k
    gumbo_debug (
1349
35.0k
      "Noah's ark clause: removing element at %d.\n",
1350
35.0k
      earliest_identical_element
1351
35.0k
    );
1352
35.0k
    gumbo_vector_remove_at(earliest_identical_element, elements);
1353
35.0k
  }
1354
1355
388k
  gumbo_vector_add((void*) node, elements);
1356
388k
}
1357
1358
1.51M
static bool is_open_element(const GumboParser* parser, const GumboNode* node) {
1359
1.51M
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
1360
113M
  for (unsigned int i = 0; i < open_elements->length; ++i) {
1361
113M
    if (open_elements->data[i] == node) {
1362
1.42M
      return true;
1363
1.42M
    }
1364
113M
  }
1365
89.5k
  return false;
1366
1.51M
}
1367
1368
// Clones attributes, tags, etc. of a node, but does not copy the content. The
1369
// clone shares no structure with the original node: all owned strings and
1370
// values are fresh copies.
1371
static GumboNode* clone_node (
1372
  GumboNode* node,
1373
  GumboParseFlags reason
1374
105k
) {
1375
105k
  assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1376
0
  GumboNode* new_node = gumbo_alloc(sizeof(GumboNode));
1377
105k
  *new_node = *node;
1378
105k
  new_node->parent = NULL;
1379
105k
  new_node->index_within_parent = -1;
1380
  // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1381
  // have a separate end tag.
1382
105k
  new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1383
105k
  new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1384
105k
  GumboElement* element = &new_node->v.element;
1385
105k
  gumbo_vector_init(1, &element->children);
1386
1387
105k
  const GumboVector* old_attributes = &node->v.element.attributes;
1388
105k
  gumbo_vector_init(old_attributes->length, &element->attributes);
1389
125k
  for (unsigned int i = 0; i < old_attributes->length; ++i) {
1390
20.3k
    const GumboAttribute* old_attr = old_attributes->data[i];
1391
20.3k
    GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
1392
20.3k
    *attr = *old_attr;
1393
20.3k
    attr->name = gumbo_strdup(old_attr->name);
1394
20.3k
    attr->value = gumbo_strdup(old_attr->value);
1395
20.3k
    gumbo_vector_add(attr, &element->attributes);
1396
20.3k
  }
1397
105k
  return new_node;
1398
105k
}
1399
1400
// "Reconstruct active formatting elements" part of the spec.
1401
// This implementation is based on the html5lib translation from the
1402
// mess of GOTOs in the spec to reasonably structured programming.
1403
// https://github.com/html5lib/html5lib-python/blob/master/html5lib/treebuilders/base.py
1404
7.29M
static void reconstruct_active_formatting_elements(GumboParser* parser) {
1405
7.29M
  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1406
  // Step 1
1407
7.29M
  if (elements->length == 0) {
1408
5.36M
    return;
1409
5.36M
  }
1410
1411
  // Step 2 & 3
1412
1.92M
  unsigned int i = elements->length - 1;
1413
1.92M
  GumboNode* element = elements->data[i];
1414
1.92M
  if (
1415
1.92M
    element == &kActiveFormattingScopeMarker
1416
1.92M
    || is_open_element(parser, element)
1417
1.92M
  ) {
1418
1.90M
    return;
1419
1.90M
  }
1420
1421
  // Step 6
1422
89.5k
  do {
1423
89.5k
    if (i == 0) {
1424
      // Step 4
1425
3.50k
      i = -1;  // Incremented to 0 below.
1426
3.50k
      break;
1427
3.50k
    }
1428
    // Step 5
1429
86.0k
    element = elements->data[--i];
1430
86.0k
  } while (
1431
86.0k
    element != &kActiveFormattingScopeMarker
1432
86.0k
    && !is_open_element(parser, element)
1433
18.2k
  );
1434
1435
0
  ++i;
1436
18.2k
  gumbo_debug (
1437
18.2k
    "Reconstructing elements from %u on %s parent.\n",
1438
18.2k
    i,
1439
18.2k
    gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)
1440
18.2k
  );
1441
107k
  for (; i < elements->length; ++i) {
1442
    // Step 7 & 8.
1443
89.5k
    assert(elements->length > 0);
1444
0
    assert(i < elements->length);
1445
0
    element = elements->data[i];
1446
89.5k
    assert(element != &kActiveFormattingScopeMarker);
1447
0
    GumboNode* clone = clone_node (
1448
89.5k
      element,
1449
89.5k
      GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT
1450
89.5k
    );
1451
    // Step 9.
1452
89.5k
    InsertionLocation location =
1453
89.5k
        get_appropriate_insertion_location(parser, NULL);
1454
89.5k
    insert_node(clone, location);
1455
89.5k
    gumbo_vector_add (
1456
89.5k
      (void*) clone,
1457
89.5k
      &parser->_parser_state->_open_elements
1458
89.5k
    );
1459
1460
    // Step 10.
1461
89.5k
    elements->data[i] = clone;
1462
89.5k
    gumbo_debug (
1463
89.5k
      "Reconstructed %s element at %u.\n",
1464
89.5k
      gumbo_normalized_tagname(clone->v.element.tag),
1465
89.5k
      i
1466
89.5k
    );
1467
89.5k
  }
1468
18.2k
}
1469
1470
27.2k
// Pops entries off the list of active formatting elements until a scope marker
// has been popped or the pop returns NULL.
static void clear_active_formatting_elements(GumboParser* parser) {
  GumboVector* formatting =
    &parser->_parser_state->_active_formatting_elements;
  int num_elements_cleared = 0;
  for (;;) {
    const GumboNode* popped = gumbo_vector_pop(formatting);
    // Counts every pop attempt, including the one that ends the loop.
    ++num_elements_cleared;
    if (popped == NULL || popped == &kActiveFormattingScopeMarker) {
      break;
    }
  }
  gumbo_debug (
    "Cleared %d elements from active formatting list.\n",
    num_elements_cleared
  );
}
1483
1484
// https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode
1485
GumboQuirksModeEnum gumbo_compute_quirks_mode (
1486
  const char *name,
1487
  const char *pubid_str,
1488
  const char *sysid_str
1489
107
) {
1490
1491
107
  GumboStringPiece pubid = {
1492
107
    .data = pubid_str,
1493
107
    .length = pubid_str? strlen(pubid_str) : 0,
1494
107
  };
1495
107
  GumboStringPiece sysid = {
1496
107
    .data = sysid_str,
1497
107
    .length = sysid_str? strlen(sysid_str) : 0,
1498
107
  };
1499
107
  bool has_system_identifier = !!sysid_str;
1500
107
  if (
1501
107
    name == NULL
1502
107
    || strcmp(name, "html")
1503
107
    || is_in_static_list(&pubid, kQuirksModePublicIdPrefixes, false)
1504
107
    || is_in_static_list(&pubid, kQuirksModePublicIdExactMatches, true)
1505
107
    || is_in_static_list(&sysid, kQuirksModeSystemIdExactMatches, true)
1506
107
    || (
1507
35
      !has_system_identifier
1508
35
      && is_in_static_list(&pubid, kSystemIdDependentPublicIdPrefixes, false)
1509
35
    )
1510
107
  ) {
1511
72
    return GUMBO_DOCTYPE_QUIRKS;
1512
72
  }
1513
1514
35
  if (
1515
35
    is_in_static_list(&pubid, kLimitedQuirksPublicIdPrefixes, false)
1516
35
    || (
1517
35
      has_system_identifier
1518
35
      && is_in_static_list(&pubid, kSystemIdDependentPublicIdPrefixes, false)
1519
35
    )
1520
35
  ) {
1521
0
    return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1522
0
  }
1523
1524
35
  return GUMBO_DOCTYPE_NO_QUIRKS;
1525
35
}
1526
1527
472
static GumboQuirksModeEnum compute_quirks_mode(const GumboTokenDocType* doctype) {
1528
472
  if (doctype->force_quirks)
1529
365
    return GUMBO_DOCTYPE_QUIRKS;
1530
107
  return gumbo_compute_quirks_mode (
1531
107
    doctype->name,
1532
107
    doctype->has_public_identifier? doctype->public_identifier : NULL,
1533
107
    doctype->has_system_identifier? doctype->system_identifier : NULL
1534
107
  );
1535
472
}
1536
1537
// The following functions are all defined by the "has an element in __ scope"
1538
// sections of the HTML5 spec:
1539
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
1540
// The basic idea behind them is that they check for an element of the given
1541
// qualified name, contained within a scope formed by a set of other qualified
1542
// names. For example, "has an element in list scope" looks for an element of
1543
// the given qualified name within the nearest enclosing <ol> or <ul>, along
1544
// with a bunch of generic element types that serve to "firewall" their content
1545
// from the rest of the document. Note that because of the way the spec is
1546
// written,
1547
// all elements are expected to be in the HTML namespace
1548
static bool has_an_element_in_specific_scope (
1549
  const GumboParser* parser,
1550
  int expected_size,
1551
  const GumboTag* expected,
1552
  bool negate,
1553
  const TagSet* tags
1554
733k
) {
1555
733k
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
1556
3.53M
  for (int i = open_elements->length; --i >= 0;) {
1557
3.53M
    const GumboNode* node = open_elements->data[i];
1558
3.53M
    if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
1559
0
      continue;
1560
0
    }
1561
1562
3.53M
    GumboTag node_tag = node->v.element.tag;
1563
3.53M
    GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1564
6.52M
    for (int j = 0; j < expected_size; ++j) {
1565
3.64M
      if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML) {
1566
648k
        return true;
1567
648k
      }
1568
3.64M
    }
1569
1570
2.88M
    bool found = tagset_includes(tags, node_ns, node_tag);
1571
2.88M
    if (negate != found) {
1572
85.6k
      return false;
1573
85.6k
    }
1574
2.88M
  }
1575
0
  return false;
1576
733k
}
1577
1578
// Checks for the presence of an open element of the specified tag type.
1579
10.6k
static bool has_open_element(const GumboParser* parser, GumboTag tag) {
1580
10.6k
  static const TagSet tags = {TAG(HTML)};
1581
10.6k
  return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1582
10.6k
}
1583
1584
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope
// TagSet initializer entries for the elements that bound the default scope,
// grouped by namespace (HTML, then MathML, then SVG). Expand this inside a
// TagSet initializer; further TAG() entries may follow it.
#define DEFAULT_SCOPE_TAGS \
  TAG(APPLET), TAG(CAPTION), TAG(HTML), \
  TAG(TABLE), TAG(TD), TAG(TH), \
  TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), \
  TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), \
  TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), \
  TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)
1604
1605
static const TagSet heading_tags = {
1606
  TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6)
1607
};
1608
1609
static const TagSet td_th_tags = {
1610
  TAG(TD), TAG(TH)
1611
};
1612
1613
static const TagSet dd_dt_tags = {
1614
  TAG(DD), TAG(DT)
1615
};
1616
1617
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope
1618
45.2k
static bool has_an_element_in_scope(const GumboParser* parser, GumboTag tag) {
1619
45.2k
  static const TagSet tags = {DEFAULT_SCOPE_TAGS};
1620
45.2k
  return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1621
45.2k
}
1622
1623
// Like "has an element in scope", but for the specific case of looking for a
1624
// unique target node, not for any node with a given tag name. This duplicates
1625
// much of the algorithm from has_an_element_in_specific_scope because the
1626
// predicate is different when checking for an exact node, and it's easier &
1627
// faster just to duplicate the code for this one case than to try and
1628
// parameterize it.
1629
266
static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node) {
1630
266
  static const TagSet tags = {DEFAULT_SCOPE_TAGS};
1631
266
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
1632
2.23k
  for (int i = open_elements->length; --i >= 0;) {
1633
2.23k
    const GumboNode* current = open_elements->data[i];
1634
2.23k
    const GumboNodeType type = current->type;
1635
2.23k
    if (current == node) {
1636
137
      return true;
1637
137
    }
1638
2.09k
    if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1639
0
      continue;
1640
0
    }
1641
2.09k
    if (node_tag_in_set(current, &tags)) {
1642
129
      return false;
1643
129
    }
1644
2.09k
  }
1645
0
  assert(false);
1646
0
  return false;
1647
266
}
1648
1649
// Like has_an_element_in_scope, but restricts the expected qualified name to a
1650
// range of possible qualified names instead of just a single one.
1651
static bool has_an_element_in_scope_with_tagname (
1652
  const GumboParser* parser,
1653
  int len,
1654
  const GumboTag expected[]
1655
2.55k
) {
1656
2.55k
  static const TagSet tags = {DEFAULT_SCOPE_TAGS};
1657
2.55k
  return has_an_element_in_specific_scope(parser, len, expected, false, &tags);
1658
2.55k
}
1659
1660
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-list-item-scope
1661
311
static bool has_an_element_in_list_scope(const GumboParser* parser, GumboTag tag) {
1662
311
  static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(OL), TAG(UL)};
1663
311
  return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1664
311
}
1665
1666
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-button-scope
1667
563k
static bool has_an_element_in_button_scope(const GumboParser* parser, GumboTag tag) {
1668
563k
  static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(BUTTON)};
1669
563k
  return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1670
563k
}
1671
1672
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-table-scope
1673
108k
static bool has_an_element_in_table_scope(const GumboParser* parser, GumboTag tag) {
1674
108k
  static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)};
1675
108k
  return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1676
108k
}
1677
1678
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-select-scope
1679
3.43k
static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag tag) {
1680
3.43k
  static const TagSet tags = {TAG(OPTGROUP), TAG(OPTION)};
1681
3.43k
  return has_an_element_in_specific_scope(parser, 1, &tag, true, &tags);
1682
3.43k
}
1683
1684
// https://html.spec.whatwg.org/multipage/parsing.html#generate-implied-end-tags
1685
// "exception" is the "element to exclude from the process" listed in the spec.
1686
// Pass GUMBO_TAG_LAST to not exclude any of them.
1687
static void generate_implied_end_tags (
1688
  GumboParser* parser,
1689
  GumboTag exception,
1690
  const char* exception_name
1691
564k
) {
1692
564k
  static const TagSet tags = {
1693
564k
    TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION),
1694
564k
    TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC)
1695
564k
  };
1696
564k
  while (
1697
568k
    node_tag_in_set(get_current_node(parser), &tags)
1698
568k
    && !node_html_tagname_is(get_current_node(parser), exception, exception_name)
1699
564k
  ) {
1700
3.21k
    pop_current_node(parser);
1701
3.21k
  }
1702
564k
}
1703
1704
// This is the "generate all implied end tags thoroughly" clause of the spec.
1705
// https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
1706
734
static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1707
734
  static const TagSet tags = {
1708
734
    TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
1709
734
    TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1710
734
    TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
1711
734
  };
1712
1.06k
  while (node_tag_in_set(get_current_node(parser), &tags)) {
1713
335
    pop_current_node(parser);
1714
335
  }
1715
734
}
1716
1717
// This factors out the clauses in the "in body" insertion mode checking "if
1718
// there is a node in the stack of open elements that is not" one of a list of
1719
// elements in which case it's a parse error.
1720
// This is used in "an end-of-file token", "an end tag whose tag name is
1721
// 'body'", and "an end tag whose tag name is 'html'".
1722
static bool stack_contains_nonclosable_element (
1723
  GumboParser* parser
1724
10.8k
) {
1725
10.8k
  static const TagSet tags = {
1726
10.8k
    TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB),
1727
10.8k
    TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
1728
10.8k
    TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML),
1729
10.8k
  };
1730
10.8k
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
1731
33.5k
  for (size_t i = 0; i < open_elements->length; ++i) {
1732
26.0k
    if (!node_tag_in_set(open_elements->data[i], &tags))
1733
3.41k
      return true;
1734
26.0k
  }
1735
7.48k
  return false;
1736
10.8k
}
1737
1738
// This factors out the clauses relating to "act as if an end tag token with tag
1739
// name "table" had been seen. Returns true if there's a table element in table
1740
// scope which was successfully closed, false if not and the token should be
1741
// ignored. Does not add parse errors; callers should handle that.
1742
7.31k
static bool close_table(GumboParser* parser) {
1743
7.31k
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1744
204
    return false;
1745
204
  }
1746
1747
7.11k
  GumboNode* node = pop_current_node(parser);
1748
144k
  while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1749
136k
    node = pop_current_node(parser);
1750
136k
  }
1751
7.11k
  reset_insertion_mode_appropriately(parser);
1752
7.11k
  return true;
1753
7.31k
}
1754
1755
// This factors out the clauses relating to "act as if an end tag token with tag
1756
// name `cell_tag` had been seen".
1757
static void close_table_cell (
1758
  GumboParser* parser,
1759
  const GumboToken* token,
1760
  GumboTag cell_tag
1761
22.8k
) {
1762
22.8k
  generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
1763
22.8k
  const GumboNode* node = get_current_node(parser);
1764
22.8k
  if (!node_html_tag_is(node, cell_tag))
1765
13.1k
    parser_add_parse_error(parser, token);
1766
259k
  do {
1767
259k
    node = pop_current_node(parser);
1768
259k
  } while (!node_html_tag_is(node, cell_tag));
1769
1770
22.8k
  clear_active_formatting_elements(parser);
1771
22.8k
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1772
22.8k
}
1773
1774
// https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
1775
// This holds the logic to determine whether we should close a <td> or a <th>.
1776
21.3k
static void close_current_cell(GumboParser* parser, const GumboToken* token) {
1777
21.3k
  GumboTag cell_tag;
1778
21.3k
  if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1779
20.3k
    assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1780
0
    cell_tag = GUMBO_TAG_TD;
1781
20.3k
  } else {
1782
998
    assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1783
0
    cell_tag = GUMBO_TAG_TH;
1784
998
  }
1785
0
  close_table_cell(parser, token, cell_tag);
1786
21.3k
}
1787
1788
// This factors out the "act as if an end tag of tag name 'select' had been
1789
// seen" clause of the spec, since it's referenced in several places. It pops
1790
// all nodes from the stack until the current <select> has been closed, then
1791
// resets the insertion mode appropriately.
1792
10.7k
static void close_current_select(GumboParser* parser) {
1793
10.7k
  GumboNode* node = pop_current_node(parser);
1794
24.9k
  while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1795
14.2k
    node = pop_current_node(parser);
1796
14.2k
  }
1797
10.7k
  reset_insertion_mode_appropriately(parser);
1798
10.7k
}
1799
1800
// The list of nodes in the "special" category:
1801
// https://html.spec.whatwg.org/multipage/parsing.html#special
1802
209k
static bool is_special_node(const GumboNode* node) {
1803
209k
  assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1804
0
  return node_tag_in_set(node, &(const TagSet) {
1805
209k
      TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
1806
209k
      TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1807
209k
      TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1808
209k
      TAG(COLGROUP), TAG(DD), TAG(DETAILS), TAG(DIR),
1809
209k
      TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
1810
209k
      TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
1811
209k
      TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
1812
209k
      TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
1813
209k
      TAG(IMG), TAG(INPUT), TAG(LI), TAG(LINK), TAG(LISTING),
1814
209k
      TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1815
209k
      TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
1816
209k
      TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
1817
209k
      TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY),
1818
209k
      TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH),
1819
209k
      TAG(THEAD), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1820
1821
209k
      TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1822
209k
      TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1823
1824
209k
      TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1825
1826
      // This TagSet needs to include the "title" element in both the
1827
      // HTML and SVG namespaces. Using both TAG(TITLE) and TAG_SVG(TITLE)
1828
      // won't work, due to the simplistic way in which the TAG macros are
1829
      // implemented, so we do it like this instead:
1830
209k
      [GUMBO_TAG_TITLE] =
1831
209k
          (1 << GUMBO_NAMESPACE_HTML) |
1832
209k
          (1 << GUMBO_NAMESPACE_SVG)
1833
209k
    }
1834
209k
  );
1835
209k
}
1836
1837
// Implicitly closes currently open elements until it reaches an element with
1838
// the
1839
// specified qualified name. If the elements closed are in the set handled by
1840
// generate_implied_end_tags, this is normal operation and this function returns
1841
// true. Otherwise, a parse error is recorded and this function returns false.
1842
static void implicitly_close_tags (
1843
  GumboParser* parser,
1844
  GumboToken* token,
1845
  GumboNamespaceEnum target_ns,
1846
  GumboTag target
1847
537k
) {
1848
537k
  assert(target != GUMBO_TAG_UNKNOWN);
1849
0
  generate_implied_end_tags(parser, target, NULL);
1850
537k
  if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1851
42.7k
    parser_add_parse_error(parser, token);
1852
42.7k
    while (
1853
153k
      !node_qualified_tag_is(get_current_node(parser), target_ns, target)
1854
111k
    ) {
1855
111k
      pop_current_node(parser);
1856
111k
    }
1857
42.7k
  }
1858
537k
  assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1859
0
  pop_current_node(parser);
1860
537k
}
1861
1862
// If the stack of open elements has a <p> tag in button scope, this acts as if
1863
// a </p> tag was encountered, implicitly closing tags. Returns false if a
1864
// parse error occurs. This is a convenience function because this particular
1865
// clause appears several times in the spec.
1866
static void maybe_implicitly_close_p_tag (
1867
  GumboParser* parser,
1868
  GumboToken* token
1869
560k
) {
1870
560k
  if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1871
533k
    implicitly_close_tags (
1872
533k
      parser,
1873
533k
      token,
1874
533k
      GUMBO_NAMESPACE_HTML,
1875
533k
      GUMBO_TAG_P
1876
533k
    );
1877
533k
  }
1878
560k
}
1879
1880
// Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1881
// tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1882
static void maybe_implicitly_close_list_tag (
1883
  GumboParser* parser,
1884
  GumboToken* token,
1885
  bool is_li
1886
3.00k
) {
1887
3.00k
  GumboParserState* state = parser->_parser_state;
1888
3.00k
  set_frameset_not_ok(parser);
1889
15.5k
  for (int i = state->_open_elements.length; --i >= 0;) {
1890
15.5k
    const GumboNode* node = state->_open_elements.data[i];
1891
15.5k
    bool is_list_tag = is_li
1892
15.5k
      ? node_html_tag_is(node, GUMBO_TAG_LI)
1893
15.5k
      : node_tag_in_set(node, &dd_dt_tags)
1894
15.5k
    ;
1895
15.5k
    if (is_list_tag) {
1896
1.21k
      implicitly_close_tags (
1897
1.21k
        parser,
1898
1.21k
        token,
1899
1.21k
        node->v.element.tag_namespace,
1900
1.21k
        node->v.element.tag
1901
1.21k
      );
1902
1.21k
      return;
1903
1.21k
    }
1904
1905
14.3k
    if (
1906
14.3k
      is_special_node(node)
1907
14.3k
      && !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)})
1908
14.3k
    ) {
1909
1.79k
      return;
1910
1.79k
    }
1911
14.3k
  }
1912
3.00k
}
1913
1914
static void merge_attributes (
1915
  GumboToken* token,
1916
  GumboNode* node
1917
1.71k
) {
1918
1.71k
  assert(token->type == GUMBO_TOKEN_START_TAG);
1919
0
  assert(node->type == GUMBO_NODE_ELEMENT);
1920
0
  const GumboVector* token_attr = &token->v.start_tag.attributes;
1921
1.71k
  GumboVector* node_attr = &node->v.element.attributes;
1922
1923
3.34k
  for (unsigned int i = 0; i < token_attr->length; ++i) {
1924
1.62k
    GumboAttribute* attr = token_attr->data[i];
1925
1.62k
    if (!gumbo_get_attribute(node_attr, attr->name)) {
1926
      // Ownership of the attribute is transferred by this gumbo_vector_add,
1927
      // so it has to be nulled out of the original token so it doesn't get
1928
      // double-deleted.
1929
1.25k
      gumbo_vector_add(attr, node_attr);
1930
1.25k
      token_attr->data[i] = NULL;
1931
1.25k
    }
1932
1.62k
  }
1933
  // When attributes are merged, it means the token has been ignored and merged
1934
  // with another token, so we need to free its memory. The attributes that are
1935
  // transferred need to be nulled-out in the vector above so that they aren't
1936
  // double-deleted.
1937
1.71k
  gumbo_token_destroy(token);
1938
1939
1.71k
#ifndef NDEBUG
1940
  // Mark this sentinel so the assertion in the main loop knows it's been
1941
  // destroyed.
1942
1.71k
  token->v.start_tag.attributes = kGumboEmptyVector;
1943
1.71k
#endif
1944
1.71k
}
1945
1946
0
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1947
0
  const StringReplacement *replacement = gumbo_get_svg_tag_replacement (
1948
0
    tag->data,
1949
0
    tag->length
1950
0
  );
1951
0
  return replacement ? replacement->to : NULL;
1952
0
}
1953
1954
// https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
1955
// This destructively modifies any matching attributes on the token and sets the
1956
// namespace appropriately.
1957
14.4k
static void adjust_foreign_attributes(GumboToken* token) {
1958
14.4k
  assert(token->type == GUMBO_TOKEN_START_TAG);
1959
0
  const GumboVector* attributes = &token->v.start_tag.attributes;
1960
21.2k
  for (unsigned int i = 0, n = attributes->length; i < n; ++i) {
1961
6.84k
    GumboAttribute* attr = attributes->data[i];
1962
6.84k
    const ForeignAttrReplacement* entry = gumbo_get_foreign_attr_replacement (
1963
6.84k
      attr->name,
1964
6.84k
      strlen(attr->name)
1965
6.84k
    );
1966
6.84k
    if (!entry) {
1967
6.68k
      continue;
1968
6.68k
    }
1969
156
    gumbo_free((void*) attr->name);
1970
156
    attr->attr_namespace = entry->attr_namespace;
1971
156
    attr->name = gumbo_strdup(entry->local_name);
1972
156
  }
1973
14.4k
}
1974
1975
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
1976
// This adjusts svg tags.
1977
8.30k
static void adjust_svg_tag(GumboToken* token) {
1978
8.30k
  assert(token->type == GUMBO_TOKEN_START_TAG);
1979
8.30k
  if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) {
1980
106
    assert(token->v.start_tag.name == NULL);
1981
0
    token->v.start_tag.name = "foreignObject";
1982
8.20k
  } else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
1983
4.20k
    assert(token->v.start_tag.name);
1984
0
    const StringReplacement *replacement = gumbo_get_svg_tag_replacement(
1985
4.20k
      token->v.start_tag.name,
1986
4.20k
      strlen(token->v.start_tag.name)
1987
4.20k
    );
1988
4.20k
    if (replacement) {
1989
      // This cast is safe because we allocated this memory and we'll free it.
1990
3
      strcpy((char *)token->v.start_tag.name, replacement->to);
1991
3
    }
1992
4.20k
  }
1993
8.30k
}
1994
1995
// https://html.spec.whatwg.org/multipage/parsing.html#adjust-svg-attributes
1996
// This destructively modifies any matching attributes on the token.
1997
10.3k
static void adjust_svg_attributes(GumboToken* token) {
1998
10.3k
  assert(token->type == GUMBO_TOKEN_START_TAG);
1999
0
  const GumboVector* attributes = &token->v.start_tag.attributes;
2000
16.0k
  for (unsigned int i = 0, n = attributes->length; i < n; i++) {
2001
5.61k
    GumboAttribute* attr = (GumboAttribute*) attributes->data[i];
2002
5.61k
    const StringReplacement* replacement = gumbo_get_svg_attr_replacement (
2003
5.61k
      attr->name,
2004
5.61k
      attr->original_name.length
2005
5.61k
    );
2006
5.61k
    if (!replacement) {
2007
5.12k
      continue;
2008
5.12k
    }
2009
498
    gumbo_free((void*) attr->name);
2010
498
    attr->name = gumbo_strdup(replacement->to);
2011
498
  }
2012
10.3k
}
2013
2014
// https://html.spec.whatwg.org/multipage/parsing.html#adjust-mathml-attributes
2015
// Note that this may destructively modify the token with the new attribute
2016
// value.
2017
4.06k
static void adjust_mathml_attributes(GumboToken* token) {
2018
4.06k
  assert(token->type == GUMBO_TOKEN_START_TAG);
2019
0
  GumboAttribute* attr = gumbo_get_attribute (
2020
4.06k
    &token->v.start_tag.attributes,
2021
4.06k
    "definitionurl"
2022
4.06k
  );
2023
4.06k
  if (!attr) {
2024
4.06k
    return;
2025
4.06k
  }
2026
0
  gumbo_free((void*) attr->name);
2027
0
  attr->name = gumbo_strdup("definitionURL");
2028
0
}
2029
2030
static void maybe_add_doctype_error (
2031
  GumboParser* parser,
2032
  const GumboToken* token
2033
472
) {
2034
472
  const GumboTokenDocType* doctype = &token->v.doc_type;
2035
472
  if (
2036
472
    strcmp(doctype->name, "html")
2037
472
    || doctype->has_public_identifier
2038
472
    || (doctype->has_system_identifier
2039
36
        && strcmp(doctype->system_identifier, "about:legacy-compat"))
2040
472
  ) {
2041
466
    parser_add_parse_error(parser, token);
2042
466
  }
2043
472
}
2044
2045
15.5k
// Detaches `node` from its parent's child list, clears the node's parent
// link and index, and renumbers the siblings that followed it. A node with
// no parent is left untouched.
static void remove_from_parent(GumboNode* node) {
  GumboNode* parent = node->parent;
  if (parent == NULL) {
    // The node may not have a parent if, for example, it is a newly-cloned
    // copy of an active formatting element. DOM manipulations continue with
    // the orphaned fragment of the DOM tree until it's appended or
    // foster-parented to the common ancestor at the end of the adoption
    // agency algorithm.
    return;
  }
  assert(parent->type == GUMBO_NODE_ELEMENT);

  GumboVector* siblings = &parent->v.element.children;
  int removed_at = gumbo_vector_index_of(siblings, node);
  assert(removed_at != -1);

  gumbo_vector_remove_at(removed_at, siblings);
  node->parent = NULL;
  node->index_within_parent = -1;

  // Every child from the removal point onward now sits at a new index.
  unsigned int pos = removed_at;
  while (pos < siblings->length) {
    GumboNode* sibling = siblings->data[pos];
    sibling->index_within_parent = pos;
    ++pos;
  }
}
2066
2067
// This is here to clean up memory when the spec says "Ignore current token."
2068
2.71M
static void ignore_token(GumboParser* parser) {
2069
2.71M
  GumboToken* token = parser->_parser_state->_current_token;
2070
  // Ownership of the token's internal buffers are normally transferred to the
2071
  // element, but if no element is emitted (as happens in non-verbatim-mode
2072
  // when a token is ignored), we need to free it here to prevent a memory
2073
  // leak.
2074
2.71M
  gumbo_token_destroy(token);
2075
2.71M
#ifndef NDEBUG
2076
2.71M
  if (token->type == GUMBO_TOKEN_START_TAG) {
2077
    // Mark this sentinel so the assertion in the main loop knows it's been
2078
    // destroyed.
2079
36.3k
    token->v.start_tag.attributes = kGumboEmptyVector;
2080
36.3k
    token->v.start_tag.name = NULL;
2081
36.3k
  }
2082
2.71M
#endif
2083
2.71M
}
2084
2085
// The token is usually an end tag; however, the adoption agency algorithm may
2086
// invoke this for an 'a' or 'nobr' start tag.
2087
// Returns false if there was an error.
2088
static void in_body_any_other_end_tag(GumboParser* parser, GumboToken* token)
2089
9.50k
{
2090
9.50k
  GumboParserState* state = parser->_parser_state;
2091
9.50k
  GumboTag tag;
2092
9.50k
  const char* tagname;
2093
2094
9.50k
  if (token->type == GUMBO_TOKEN_END_TAG) {
2095
9.38k
    tag = token->v.end_tag.tag;
2096
9.38k
    tagname = token->v.end_tag.name;
2097
9.38k
  } else {
2098
116
    assert(token->type == GUMBO_TOKEN_START_TAG);
2099
0
    tag = token->v.start_tag.tag;
2100
116
    tagname = token->v.start_tag.name;
2101
116
  }
2102
2103
0
  assert(state->_open_elements.length > 0);
2104
0
  assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2105
  // Walk up the stack of open elements until we find one that either:
2106
  // a) Matches the tag name we saw
2107
  // b) Is in the "special" category.
2108
  // If we see a), implicitly close everything up to and including it. If we
2109
  // see b), then record a parse error, don't close anything (except the
2110
  // implied end tags) and ignore the end tag token.
2111
126k
  for (int i = state->_open_elements.length; --i >= 0;) {
2112
126k
    const GumboNode* node = state->_open_elements.data[i];
2113
126k
    if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, tagname)) {
2114
1.00k
      generate_implied_end_tags(parser, tag, tagname);
2115
      // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example of an error.
2116
      // foo is the "current node" but sarcasm is node.
2117
      // XXX: Write a test for this.
2118
1.00k
      if (node != get_current_node(parser)) {
2119
408
        parser_add_parse_error(parser, token);
2120
408
      }
2121
2.18k
      while (node != pop_current_node(parser))
2122
1.18k
        ;  // Pop everything.
2123
1.00k
      return;
2124
125k
    } else if (is_special_node(node)) {
2125
8.50k
      parser_add_parse_error(parser, token);
2126
8.50k
      ignore_token(parser);
2127
8.50k
      return;
2128
8.50k
    }
2129
126k
  }
2130
  // <html> is in the special category, so we should never get here.
2131
0
  assert(0 && "unreachable");
2132
0
}
2133
2134
// https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
2135
// Also described in the "in body" handling for end formatting tags.
2136
// Does not return a value (the function is declared void).
2137
static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
2138
30.8k
{
2139
30.8k
  GumboParserState* state = parser->_parser_state;
2140
30.8k
  gumbo_debug("Entering adoption agency algorithm.\n");
2141
  // Step 1.
2142
30.8k
  GumboTag subject;
2143
30.8k
  if (token->type == GUMBO_TOKEN_START_TAG) {
2144
27.8k
    subject = token->v.start_tag.tag;
2145
27.8k
  } else {
2146
2.96k
    assert(token->type == GUMBO_TOKEN_END_TAG);
2147
0
    subject = token->v.end_tag.tag;
2148
2.96k
  }
2149
0
  assert(subject != GUMBO_TAG_UNKNOWN);
2150
2151
  // Step 2.
2152
0
  GumboNode* current_node = get_current_node(parser);
2153
30.8k
  if (
2154
30.8k
    node_html_tag_is(current_node, subject)
2155
30.8k
    && -1 == gumbo_vector_index_of (
2156
20.2k
      &state->_active_formatting_elements,
2157
20.2k
      current_node
2158
20.2k
    )
2159
30.8k
  ) {
2160
381
    pop_current_node(parser);
2161
381
    return;
2162
381
  }
2163
2164
  // Steps 3-5 & 21:
2165
40.2k
  for (unsigned int i = 0; i < 8; ++i) {
2166
    // Step 6.
2167
39.7k
    GumboNode* formatting_node = NULL;
2168
39.7k
    int formatting_node_in_open_elements = -1;
2169
69.8k
    for (int j = state->_active_formatting_elements.length; --j >= 0;) {
2170
68.8k
      GumboNode* current_node = state->_active_formatting_elements.data[j];
2171
68.8k
      if (current_node == &kActiveFormattingScopeMarker) {
2172
834
        gumbo_debug("Broke on scope marker; aborting.\n");
2173
        // Last scope marker; abort the algorithm and handle according to "any
2174
        // other end tag" (below).
2175
834
        break;
2176
834
      }
2177
67.9k
      if (node_html_tag_is(current_node, subject)) {
2178
        // Found it.
2179
37.8k
        formatting_node = current_node;
2180
37.8k
        formatting_node_in_open_elements = gumbo_vector_index_of (
2181
37.8k
          &state->_open_elements,
2182
37.8k
          formatting_node
2183
37.8k
        );
2184
37.8k
        gumbo_debug (
2185
37.8k
          "Formatting element of tag %s at %d.\n",
2186
37.8k
          gumbo_normalized_tagname(subject),
2187
37.8k
          formatting_node_in_open_elements
2188
37.8k
        );
2189
37.8k
        break;
2190
37.8k
      }
2191
67.9k
    }
2192
39.7k
    if (!formatting_node) {
2193
      // No matching tag; not a parse error outright, but fall through to the
2194
      // "any other end tag" clause (which may potentially add a parse error,
2195
      // but not always).
2196
1.86k
      gumbo_debug("No active formatting elements; aborting.\n");
2197
1.86k
      in_body_any_other_end_tag(parser, token);
2198
1.86k
      return;
2199
1.86k
    }
2200
2201
    // Step 7
2202
37.8k
    if (formatting_node_in_open_elements == -1) {
2203
97
      gumbo_debug("Formatting node not on stack of open elements.\n");
2204
97
      parser_add_parse_error(parser, token);
2205
97
      gumbo_vector_remove (
2206
97
        formatting_node,
2207
97
        &state->_active_formatting_elements
2208
97
      );
2209
97
      return;
2210
97
    }
2211
2212
    // Step 8
2213
37.7k
    if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
2214
931
      parser_add_parse_error(parser, token);
2215
931
      gumbo_debug("Element not in scope.\n");
2216
931
      return;
2217
931
    }
2218
2219
    // Step 9
2220
36.8k
    if (formatting_node != get_current_node(parser))
2221
14.1k
      parser_add_parse_error(parser, token);  // But continue onwards.
2222
36.8k
    assert(formatting_node);
2223
0
    assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
2224
0
    assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
2225
2226
    // Step 10
2227
0
    GumboNode* furthest_block = NULL;
2228
36.8k
    for (
2229
36.8k
      unsigned int j = formatting_node_in_open_elements;
2230
96.2k
      j < state->_open_elements.length;
2231
59.3k
      ++j
2232
69.1k
    ) {
2233
69.1k
      assert(j > 0);
2234
0
      GumboNode* current = state->_open_elements.data[j];
2235
69.1k
      if (is_special_node(current)) {
2236
9.75k
        furthest_block = current;
2237
9.75k
        break;
2238
9.75k
      }
2239
69.1k
    }
2240
    // Step 11.
2241
36.8k
    if (!furthest_block) {
2242
35.4k
      while (pop_current_node(parser) != formatting_node)
2243
8.35k
        ;
2244
27.0k
      gumbo_vector_remove (
2245
27.0k
        formatting_node,
2246
27.0k
        &state->_active_formatting_elements
2247
27.0k
      );
2248
27.0k
      return;
2249
27.0k
    }
2250
9.75k
    assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
2251
2252
    // Step 12.
2253
    // Elements may be moved and reparented by this algorithm, so
2254
    // common_ancestor is not necessarily the same as formatting_node->parent.
2255
0
    GumboNode* common_ancestor = state->_open_elements.data [
2256
9.75k
      formatting_node_in_open_elements - 1
2257
9.75k
    ];
2258
9.75k
    gumbo_debug (
2259
9.75k
      "Common ancestor tag = %s, furthest block tag = %s.\n",
2260
9.75k
      gumbo_normalized_tagname(common_ancestor->v.element.tag),
2261
9.75k
      gumbo_normalized_tagname(furthest_block->v.element.tag)
2262
9.75k
    );
2263
2264
    // Step 13.
2265
9.75k
    int bookmark = 1 + gumbo_vector_index_of (
2266
9.75k
      &state->_active_formatting_elements,
2267
9.75k
      formatting_node
2268
9.75k
    );
2269
9.75k
    gumbo_debug("Bookmark at %d.\n", bookmark);
2270
    // Step 14.
2271
9.75k
    GumboNode* node = furthest_block;
2272
9.75k
    GumboNode* last_node = furthest_block;
2273
    // Must be stored explicitly, in case node is removed from the stack of open
2274
    // elements, to handle step 14.3.
2275
9.75k
    int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
2276
9.75k
    assert(saved_node_index > 0);
2277
    // Step 14.1.
2278
23.9k
    for (int j = 0;;) {
2279
      // Step 14.2.
2280
23.9k
      ++j;
2281
      // Step 14.3.
2282
23.9k
      int node_index = gumbo_vector_index_of(&state->_open_elements, node);
2283
23.9k
      gumbo_debug (
2284
23.9k
        "Current index: %d, last index: %d.\n",
2285
23.9k
        node_index,
2286
23.9k
        saved_node_index
2287
23.9k
      );
2288
23.9k
      if (node_index == -1) {
2289
6.23k
        node_index = saved_node_index;
2290
6.23k
      }
2291
23.9k
      saved_node_index = --node_index;
2292
23.9k
      assert(node_index > 0);
2293
0
      assert((unsigned int) node_index < state->_open_elements.capacity);
2294
0
      node = state->_open_elements.data[node_index];
2295
23.9k
      assert(node->parent);
2296
      // Step 14.4.
2297
23.9k
      if (node == formatting_node) {
2298
9.75k
        break;
2299
9.75k
      }
2300
14.1k
      int formatting_index = gumbo_vector_index_of (
2301
14.1k
        &state->_active_formatting_elements,
2302
14.1k
        node
2303
14.1k
      );
2304
      // Step 14.5.
2305
14.1k
      if (j > 3 && formatting_index != -1) {
2306
2.16k
        gumbo_debug("Removing formatting element at %d.\n", formatting_index);
2307
2.16k
        gumbo_vector_remove_at (
2308
2.16k
          formatting_index,
2309
2.16k
          &state->_active_formatting_elements
2310
2.16k
        );
2311
        // Removing the element shifts all indices over by one, so we may need
2312
        // to move the bookmark.
2313
2.16k
        if (formatting_index < bookmark) {
2314
1.70k
          --bookmark;
2315
1.70k
          gumbo_debug("Moving bookmark to %d.\n", bookmark);
2316
1.70k
        }
2317
2.16k
        continue;
2318
2.16k
      }
2319
11.9k
      if (formatting_index == -1) {
2320
        // Step 14.6.
2321
6.23k
        gumbo_vector_remove_at(node_index, &state->_open_elements);
2322
6.23k
        continue;
2323
6.23k
      }
2324
      // Step 14.7.
2325
      // "common ancestor as the intended parent" doesn't actually mean insert
2326
      // it into the common ancestor; that happens below.
2327
5.75k
      node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
2328
5.75k
      assert(formatting_index >= 0);
2329
0
      state->_active_formatting_elements.data[formatting_index] = node;
2330
5.75k
      assert(node_index >= 0);
2331
0
      state->_open_elements.data[node_index] = node;
2332
      // Step 14.8.
2333
5.75k
      if (last_node == furthest_block) {
2334
3.83k
        bookmark = formatting_index + 1;
2335
3.83k
        gumbo_debug("Bookmark moved to %d.\n", bookmark);
2336
3.83k
        assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2337
3.83k
      }
2338
      // Step 14.9.
2339
0
      last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
2340
5.75k
      remove_from_parent(last_node);
2341
5.75k
      append_node(node, last_node);
2342
      // Step 14.10.
2343
5.75k
      last_node = node;
2344
5.75k
    }  // Step 14.11.
2345
2346
    // Step 15.
2347
9.75k
    gumbo_debug (
2348
9.75k
      "Removing %s node from parent ",
2349
9.75k
      gumbo_normalized_tagname(last_node->v.element.tag)
2350
9.75k
    );
2351
9.75k
    remove_from_parent(last_node);
2352
9.75k
    last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
2353
9.75k
    InsertionLocation location = get_appropriate_insertion_location (
2354
9.75k
      parser,
2355
9.75k
      common_ancestor
2356
9.75k
    );
2357
9.75k
    gumbo_debug (
2358
9.75k
      "and inserting it into %s.\n",
2359
9.75k
      gumbo_normalized_tagname(location.target->v.element.tag)
2360
9.75k
    );
2361
9.75k
    insert_node(last_node, location);
2362
2363
    // Step 16.
2364
9.75k
    GumboNode* new_formatting_node = clone_node (
2365
9.75k
      formatting_node,
2366
9.75k
      GUMBO_INSERTION_ADOPTION_AGENCY_CLONED
2367
9.75k
    );
2368
9.75k
    formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2369
2370
    // Step 17. Instead of appending nodes one-by-one, we swap the children
2371
    // vector of furthest_block with the empty children of new_formatting_node,
2372
    // reducing memory traffic and allocations. We still have to reset their
2373
    // parent pointers, though.
2374
9.75k
    GumboVector temp = new_formatting_node->v.element.children;
2375
9.75k
    new_formatting_node->v.element.children = furthest_block->v.element.children;
2376
9.75k
    furthest_block->v.element.children = temp;
2377
2378
9.75k
    temp = new_formatting_node->v.element.children;
2379
96.1k
    for (unsigned int i = 0; i < temp.length; ++i) {
2380
86.4k
      GumboNode* child = temp.data[i];
2381
86.4k
      child->parent = new_formatting_node;
2382
86.4k
    }
2383
2384
    // Step 18.
2385
9.75k
    append_node(furthest_block, new_formatting_node);
2386
2387
    // Step 19.
2388
    // If the formatting node was before the bookmark, it may shift over all
2389
    // indices after it, so we need to explicitly find the index and possibly
2390
    // adjust the bookmark.
2391
9.75k
    int formatting_node_index = gumbo_vector_index_of (
2392
9.75k
      &state->_active_formatting_elements,
2393
9.75k
      formatting_node
2394
9.75k
    );
2395
9.75k
    assert(formatting_node_index != -1);
2396
9.75k
    if (formatting_node_index < bookmark) {
2397
9.75k
      gumbo_debug (
2398
9.75k
        "Formatting node at %d is before bookmark at %d; decrementing.\n",
2399
9.75k
        formatting_node_index, bookmark
2400
9.75k
      );
2401
9.75k
      --bookmark;
2402
9.75k
    }
2403
9.75k
    gumbo_vector_remove_at (
2404
9.75k
      formatting_node_index,
2405
9.75k
      &state->_active_formatting_elements
2406
9.75k
    );
2407
9.75k
    assert(bookmark >= 0);
2408
0
    assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2409
0
    gumbo_vector_insert_at (
2410
9.75k
      new_formatting_node,
2411
9.75k
      bookmark,
2412
9.75k
      &state->_active_formatting_elements
2413
9.75k
    );
2414
2415
    // Step 20.
2416
9.75k
    gumbo_vector_remove(formatting_node, &state->_open_elements);
2417
9.75k
    int insert_at = 1 + gumbo_vector_index_of (
2418
9.75k
      &state->_open_elements,
2419
9.75k
      furthest_block
2420
9.75k
    );
2421
9.75k
    assert(insert_at >= 0);
2422
0
    assert((unsigned int) insert_at <= state->_open_elements.length);
2423
0
    gumbo_vector_insert_at (
2424
9.75k
      new_formatting_node,
2425
9.75k
      insert_at,
2426
9.75k
      &state->_open_elements
2427
9.75k
    );
2428
9.75k
  }  // Step 21.
2429
30.4k
}
2430
2431
// https://html.spec.whatwg.org/multipage/parsing.html#the-end
2432
10.0k
static void finish_parsing(GumboParser* parser) {
2433
10.0k
  gumbo_debug("Finishing parsing");
2434
10.0k
  maybe_flush_text_node_buffer(parser);
2435
10.0k
  GumboParserState* state = parser->_parser_state;
2436
10.0k
  for (
2437
10.0k
    GumboNode* node = pop_current_node(parser);
2438
64.6k
    node;
2439
54.5k
    node = pop_current_node(parser)
2440
54.5k
  ) {
2441
54.5k
    if (
2442
54.5k
      (node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag)
2443
54.5k
      || (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)
2444
54.5k
    ) {
2445
235
      continue;
2446
235
    }
2447
54.3k
    node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2448
54.3k
  }
2449
10.0k
  while (pop_current_node(parser))
2450
0
    ;  // Pop them all.
2451
10.0k
}
2452
2453
11.9k
// Handles `token` while the parser is in its initial insertion mode.
//
// Whitespace is ignored and comments are appended to the document node. A
// DOCTYPE token is copied into the document and the parser moves on to
// "before html". Any other token is a parse error: the document is put into
// quirks mode, the parser switches to "before html", and the same token is
// flagged for reprocessing.
static void handle_initial(GumboParser* parser, GumboToken* token) {
  GumboDocument* document = &get_document_node(parser)->v.document;

  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      ignore_token(parser);
      return;

    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_document_node(parser), token);
      return;

    case GUMBO_TOKEN_DOCTYPE:
      document->has_doctype = true;
      document->name = token->v.doc_type.name;
      document->public_identifier = token->v.doc_type.public_identifier;
      document->system_identifier = token->v.doc_type.system_identifier;
      document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
      maybe_add_doctype_error(parser, token);
      return;

    default:
      break;
  }

  // Anything else: missing DOCTYPE.
  parser_add_parse_error(parser, token);
  document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
  parser->_parser_state->_reprocess_current_token = true;
}
2478
2479
// https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
2480
11.7k
static void handle_before_html(GumboParser* parser, GumboToken* token) {
2481
11.7k
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2482
425
    parser_add_parse_error(parser, token);
2483
425
    ignore_token(parser);
2484
425
    return;
2485
425
  }
2486
11.3k
  if (token->type == GUMBO_TOKEN_COMMENT) {
2487
351
    append_comment_node(parser, get_document_node(parser), token);
2488
351
    return;
2489
351
  }
2490
10.9k
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2491
617
    ignore_token(parser);
2492
617
    return;
2493
617
  }
2494
10.3k
  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2495
41
    GumboNode* html_node = insert_element_from_token(parser, token);
2496
41
    parser->_output->root = html_node;
2497
41
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2498
41
    return;
2499
41
  }
2500
10.3k
  if (
2501
10.3k
    token->type == GUMBO_TOKEN_END_TAG
2502
10.3k
    && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2503
10.3k
  ) {
2504
268
    parser_add_parse_error(parser, token);
2505
268
    ignore_token(parser);
2506
268
    return;
2507
268
  }
2508
10.0k
  GumboNode* html_node = insert_element_of_tag_type (
2509
10.0k
    parser,
2510
10.0k
    GUMBO_TAG_HTML,
2511
10.0k
    GUMBO_INSERTION_IMPLIED
2512
10.0k
  );
2513
10.0k
  assert(html_node);
2514
0
  parser->_output->root = html_node;
2515
10.0k
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2516
10.0k
  parser->_parser_state->_reprocess_current_token = true;
2517
10.0k
}
2518
2519
// Forward declarations because of mutual dependencies.
2520
static void handle_token(GumboParser* parser, GumboToken* token);
2521
static void handle_in_body(GumboParser* parser, GumboToken* token);
2522
static void handle_in_template(GumboParser* parser, GumboToken* token);
2523
2524
// https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
2525
10.5k
static void handle_before_head(GumboParser* parser, GumboToken* token) {
2526
10.5k
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2527
120
    ignore_token(parser);
2528
120
    return;
2529
120
  }
2530
10.4k
  if (token->type == GUMBO_TOKEN_COMMENT) {
2531
76
    append_comment_node(parser, get_current_node(parser), token);
2532
76
    return;
2533
76
  }
2534
10.3k
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2535
44
    parser_add_parse_error(parser, token);
2536
44
    ignore_token(parser);
2537
44
    return;
2538
44
  }
2539
10.3k
  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2540
73
    handle_in_body(parser, token);
2541
73
    return;
2542
73
  }
2543
10.2k
  if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2544
20
    GumboNode* node = insert_element_from_token(parser, token);
2545
20
    parser->_parser_state->_head_element = node;
2546
20
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2547
20
    return;
2548
20
  }
2549
10.2k
  if (
2550
10.2k
    token->type == GUMBO_TOKEN_END_TAG
2551
10.2k
    && !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2552
10.2k
  ) {
2553
141
    parser_add_parse_error(parser, token);
2554
141
    ignore_token(parser);
2555
141
    return;
2556
141
  }
2557
10.0k
  GumboNode* node = insert_element_of_tag_type (
2558
10.0k
    parser,
2559
10.0k
    GUMBO_TAG_HEAD,
2560
10.0k
    GUMBO_INSERTION_IMPLIED
2561
10.0k
  );
2562
10.0k
  parser->_parser_state->_head_element = node;
2563
10.0k
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2564
10.0k
  parser->_parser_state->_reprocess_current_token = true;
2565
10.0k
}
2566
2567
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
2568
21.2k
static void handle_in_head(GumboParser* parser, GumboToken* token) {
2569
21.2k
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2570
958
    insert_text_token(parser, token);
2571
958
    return;
2572
958
  }
2573
20.3k
  if (token->type == GUMBO_TOKEN_COMMENT) {
2574
526
    append_comment_node(parser, get_current_node(parser), token);
2575
526
    return;
2576
526
  }
2577
19.7k
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2578
85
    parser_add_parse_error(parser, token);
2579
85
    ignore_token(parser);
2580
85
    return;
2581
85
  }
2582
19.7k
  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2583
60
    return handle_in_body(parser, token);
2584
60
  }
2585
19.6k
  if (
2586
19.6k
    tag_in(token, kStartTag, &(const TagSet) {
2587
19.6k
      TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK)
2588
19.6k
    })
2589
19.6k
  ) {
2590
2.01k
    insert_element_from_token(parser, token);
2591
2.01k
    pop_current_node(parser);
2592
2.01k
    acknowledge_self_closing_tag(parser);
2593
2.01k
    return;
2594
2.01k
  }
2595
17.6k
  if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2596
309
    insert_element_from_token(parser, token);
2597
309
    pop_current_node(parser);
2598
309
    acknowledge_self_closing_tag(parser);
2599
    // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2600
    // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2601
    // should specifically look for that string in the document and re-encode it
2602
    // before passing to Gumbo.
2603
309
    return;
2604
309
  }
2605
17.3k
  if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2606
359
    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2607
359
    return;
2608
359
  }
2609
16.9k
  if (
2610
16.9k
    tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2611
16.9k
  ) {
2612
580
    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2613
580
    return;
2614
580
  }
2615
16.3k
  if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2616
371
    insert_element_from_token(parser, token);
2617
371
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2618
371
    return;
2619
371
  }
2620
16.0k
  if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2621
1.26k
    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA);
2622
1.26k
    return;
2623
1.26k
  }
2624
14.7k
  if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2625
39
    GumboNode* head = pop_current_node(parser);
2626
39
    UNUSED_IF_NDEBUG(head);
2627
39
    assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2628
0
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2629
39
    return;
2630
39
  }
2631
14.7k
  if (
2632
14.7k
    tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
2633
14.7k
  ) {
2634
44
    pop_current_node(parser);
2635
44
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2636
44
    parser->_parser_state->_reprocess_current_token = true;
2637
44
    return;
2638
44
  }
2639
14.6k
  if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2640
3.19k
    insert_element_from_token(parser, token);
2641
3.19k
    add_formatting_element(parser, &kActiveFormattingScopeMarker);
2642
3.19k
    set_frameset_not_ok(parser);
2643
3.19k
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2644
3.19k
    push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2645
3.19k
    return;
2646
3.19k
  }
2647
11.4k
  if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2648
1.17k
    if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2649
436
      parser_add_parse_error(parser, token);
2650
436
      ignore_token(parser);
2651
436
      return;
2652
436
    }
2653
734
    generate_all_implied_end_tags_thoroughly(parser);
2654
734
    if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE))
2655
119
      parser_add_parse_error(parser, token);
2656
2.42k
    while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
2657
1.69k
      ;
2658
734
    clear_active_formatting_elements(parser);
2659
734
    pop_template_insertion_mode(parser);
2660
734
    reset_insertion_mode_appropriately(parser);
2661
734
    return;
2662
1.17k
  }
2663
10.3k
  if (
2664
10.3k
    tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2665
10.3k
    || (token->type == GUMBO_TOKEN_END_TAG)
2666
10.3k
  ) {
2667
303
    parser_add_parse_error(parser, token);
2668
303
    ignore_token(parser);
2669
303
    return;
2670
303
  }
2671
10.0k
  pop_current_node(parser);
2672
10.0k
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2673
10.0k
  parser->_parser_state->_reprocess_current_token = true;
2674
10.0k
  return;
2675
10.3k
}
2676
2677
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript
2678
1.20k
static void handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2679
1.20k
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2680
7
    parser_add_parse_error(parser, token);
2681
7
    ignore_token(parser);
2682
7
    return;
2683
7
  }
2684
1.19k
  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2685
19
    handle_in_body(parser, token);
2686
19
    return;
2687
19
  }
2688
1.17k
  if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2689
154
    const GumboNode* node = pop_current_node(parser);
2690
154
    assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2691
0
    UNUSED_IF_NDEBUG(node);
2692
154
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2693
154
    return;
2694
154
  }
2695
1.02k
  if (
2696
1.02k
    token->type == GUMBO_TOKEN_WHITESPACE
2697
1.02k
    || token->type == GUMBO_TOKEN_COMMENT
2698
1.02k
    || tag_in (token, kStartTag, &(const TagSet) {
2699
731
      TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2700
731
      TAG(META), TAG(NOFRAMES), TAG(STYLE)
2701
731
    })
2702
1.02k
  ) {
2703
453
    handle_in_head(parser, token);
2704
453
    return;
2705
453
  }
2706
572
  if (
2707
572
    tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)})
2708
572
    || (
2709
429
      token->type == GUMBO_TOKEN_END_TAG
2710
429
      && !tag_is(token, kEndTag, GUMBO_TAG_BR)
2711
429
    )
2712
572
  ) {
2713
355
    parser_add_parse_error(parser, token);
2714
355
    ignore_token(parser);
2715
355
    return;
2716
355
  }
2717
217
  parser_add_parse_error(parser, token);
2718
217
  const GumboNode* node = pop_current_node(parser);
2719
217
  assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2720
0
  UNUSED_IF_NDEBUG(node);
2721
217
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2722
217
  parser->_parser_state->_reprocess_current_token = true;
2723
217
}
2724
2725
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
2726
11.8k
static void handle_after_head(GumboParser* parser, GumboToken* token) {
2727
11.8k
  GumboParserState* state = parser->_parser_state;
2728
11.8k
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2729
353
    insert_text_token(parser, token);
2730
353
    return;
2731
353
  }
2732
11.4k
  if (token->type == GUMBO_TOKEN_COMMENT) {
2733
353
    append_comment_node(parser, get_current_node(parser), token);
2734
353
    return;
2735
353
  }
2736
11.1k
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2737
5
    parser_add_parse_error(parser, token);
2738
5
    ignore_token(parser);
2739
5
    return;
2740
5
  }
2741
11.1k
  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2742
21
    handle_in_body(parser, token);
2743
21
    return;
2744
21
  }
2745
11.1k
  if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2746
4
    insert_element_from_token(parser, token);
2747
4
    set_frameset_not_ok(parser);
2748
4
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2749
4
    return;
2750
4
  }
2751
11.1k
  if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2752
132
    insert_element_from_token(parser, token);
2753
132
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2754
132
    return;
2755
132
  }
2756
10.9k
  if (
2757
10.9k
    tag_in(token, kStartTag, &(const TagSet) {
2758
10.9k
      TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
2759
10.9k
      TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
2760
10.9k
    })
2761
10.9k
  ) {
2762
346
    parser_add_parse_error(parser, token);
2763
346
    assert(state->_head_element != NULL);
2764
    // This must be flushed before we push the head element on, as there may be
2765
    // pending character tokens that should be attached to the root.
2766
0
    maybe_flush_text_node_buffer(parser);
2767
346
    gumbo_vector_add(state->_head_element, &state->_open_elements);
2768
346
    handle_in_head(parser, token);
2769
346
    gumbo_vector_remove(state->_head_element, &state->_open_elements);
2770
346
    return;
2771
346
  }
2772
10.6k
  if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2773
0
    handle_in_head(parser, token);
2774
0
    return;
2775
0
  }
2776
10.6k
  if (
2777
10.6k
    tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2778
10.6k
    || (
2779
10.2k
      token->type == GUMBO_TOKEN_END_TAG
2780
10.2k
      && !tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
2781
10.2k
    )
2782
10.6k
  ) {
2783
666
    parser_add_parse_error(parser, token);
2784
666
    ignore_token(parser);
2785
666
    return;
2786
666
  }
2787
9.95k
  insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2788
9.95k
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2789
9.95k
  state->_reprocess_current_token = true;
2790
9.95k
}
2791
2792
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
2793
10.4M
static void handle_in_body(GumboParser* parser, GumboToken* token) {
2794
10.4M
  GumboParserState* state = parser->_parser_state;
2795
10.4M
  assert(state->_open_elements.length > 0);
2796
10.4M
  if (token->type == GUMBO_TOKEN_NULL) {
2797
2.45M
    parser_add_parse_error(parser, token);
2798
2.45M
    ignore_token(parser);
2799
2.45M
    return;
2800
2.45M
  }
2801
7.98M
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2802
102k
    reconstruct_active_formatting_elements(parser);
2803
102k
    insert_text_token(parser, token);
2804
102k
    return;
2805
102k
  }
2806
7.87M
  if (
2807
7.87M
    token->type == GUMBO_TOKEN_CHARACTER
2808
7.87M
    || token->type == GUMBO_TOKEN_CDATA
2809
7.87M
  ) {
2810
6.64M
    reconstruct_active_formatting_elements(parser);
2811
6.64M
    insert_text_token(parser, token);
2812
6.64M
    set_frameset_not_ok(parser);
2813
6.64M
    return;
2814
6.64M
  }
2815
1.23M
  if (token->type == GUMBO_TOKEN_COMMENT) {
2816
86.3k
    append_comment_node(parser, get_current_node(parser), token);
2817
86.3k
    return;
2818
86.3k
  }
2819
1.15M
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2820
5.32k
    parser_add_parse_error(parser, token);
2821
5.32k
    ignore_token(parser);
2822
5.32k
    return;
2823
5.32k
  }
2824
1.14M
  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2825
1.64k
    parser_add_parse_error(parser, token);
2826
1.64k
    if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2827
132
      ignore_token(parser);
2828
132
      return;
2829
132
    }
2830
1.50k
    assert(parser->_output->root != NULL);
2831
0
    assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2832
0
    merge_attributes(token, parser->_output->root);
2833
1.50k
    return;
2834
1.64k
  }
2835
1.14M
  if (
2836
1.14M
    tag_in(token, kStartTag, &(const TagSet) {
2837
1.14M
      TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2838
1.14M
      TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE),
2839
1.14M
      TAG(TITLE)
2840
1.14M
    })
2841
1.14M
    || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
2842
1.14M
  ) {
2843
4.27k
    handle_in_head(parser, token);
2844
4.27k
    return;
2845
4.27k
  }
2846
1.14M
  if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2847
654
    parser_add_parse_error(parser, token);
2848
654
    if (
2849
654
      state->_open_elements.length < 2
2850
654
      || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)
2851
654
      || has_open_element(parser, GUMBO_TAG_TEMPLATE)
2852
654
    ) {
2853
448
      ignore_token(parser);
2854
448
    } else {
2855
206
      set_frameset_not_ok(parser);
2856
206
      merge_attributes(token, state->_open_elements.data[1]);
2857
206
    }
2858
654
    return;
2859
654
  }
2860
1.14M
  if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2861
668
    parser_add_parse_error(parser, token);
2862
668
    if (
2863
668
      state->_open_elements.length < 2
2864
668
      || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)
2865
668
      || !state->_frameset_ok
2866
668
    ) {
2867
605
      ignore_token(parser);
2868
605
      return;
2869
605
    }
2870
    // Save the body node for later removal.
2871
63
    GumboNode* body_node = state->_open_elements.data[1];
2872
2873
    // Pop all nodes except root HTML element.
2874
63
    GumboNode* node;
2875
92
    do {
2876
92
      node = pop_current_node(parser);
2877
92
    } while (node != state->_open_elements.data[1]);
2878
2879
    // Removing & destroying the body node is going to kill any nodes that have
2880
    // been added to the list of active formatting elements, and so we should
2881
    // clear it to prevent a use-after-free if the list of active formatting
2882
    // elements is reconstructed afterwards. This may happen if whitespace
2883
    // follows the </frameset>.
2884
63
    clear_active_formatting_elements(parser);
2885
2886
    // Remove the body node. We may want to factor this out into a generic
2887
    // helper, but right now this is the only code that needs to do this.
2888
63
    GumboVector* children = &parser->_output->root->v.element.children;
2889
468
    for (unsigned int i = 0; i < children->length; ++i) {
2890
468
      if (children->data[i] == body_node) {
2891
63
        gumbo_vector_remove_at(i, children);
2892
63
        break;
2893
63
      }
2894
468
    }
2895
63
    destroy_node(body_node);
2896
2897
    // Insert the <frameset>, and switch the insertion mode.
2898
63
    insert_element_from_token(parser, token);
2899
63
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2900
63
    return;
2901
668
  }
2902
1.14M
  if (token->type == GUMBO_TOKEN_EOF) {
2903
11.4k
    if (get_current_template_insertion_mode(parser) !=
2904
11.4k
        GUMBO_INSERTION_MODE_INITIAL) {
2905
1.61k
      handle_in_template(parser, token);
2906
1.61k
      return;
2907
1.61k
    }
2908
9.85k
    if (stack_contains_nonclosable_element(parser))
2909
2.70k
      parser_add_parse_error(parser, token);
2910
9.85k
    return;
2911
11.4k
  }
2912
1.12M
  if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
2913
408
    if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2914
68
      parser_add_parse_error(parser, token);
2915
68
      ignore_token(parser);
2916
68
      return;
2917
68
    }
2918
340
    if (stack_contains_nonclosable_element(parser))
2919
190
      parser_add_parse_error(parser, token);
2920
340
    GumboNode* body = state->_open_elements.data[1];
2921
340
    assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2922
0
    record_end_of_element(state->_current_token, &body->v.element);
2923
340
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2924
340
    return;
2925
408
  }
2926
1.12M
  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2927
798
    if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2928
93
      parser_add_parse_error(parser, token);
2929
93
      ignore_token(parser);
2930
93
      return;
2931
93
    }
2932
705
    if (stack_contains_nonclosable_element(parser))
2933
523
      parser_add_parse_error(parser, token);
2934
705
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2935
705
    parser->_parser_state->_reprocess_current_token = true;
2936
705
    return;
2937
798
  }
2938
1.12M
  if (
2939
1.12M
    tag_in(token, kStartTag, &(const TagSet) {
2940
1.12M
      TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
2941
1.12M
      TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2942
1.12M
      TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2943
1.12M
      TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2944
1.12M
      TAG(SUMMARY), TAG(UL), TAG(SEARCH)
2945
1.12M
    })
2946
1.12M
  ) {
2947
552k
    maybe_implicitly_close_p_tag(parser, token);
2948
552k
    insert_element_from_token(parser, token);
2949
552k
    return;
2950
552k
  }
2951
575k
  if (tag_in(token, kStartTag, &heading_tags)) {
2952
3.05k
    maybe_implicitly_close_p_tag(parser, token);
2953
3.05k
    if (node_tag_in_set(get_current_node(parser), &heading_tags)) {
2954
297
      parser_add_parse_error(parser, token);
2955
297
      pop_current_node(parser);
2956
297
    }
2957
3.05k
    insert_element_from_token(parser, token);
2958
3.05k
    return;
2959
3.05k
  }
2960
572k
  if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2961
707
    maybe_implicitly_close_p_tag(parser, token);
2962
707
    insert_element_from_token(parser, token);
2963
707
    state->_ignore_next_linefeed = true;
2964
707
    set_frameset_not_ok(parser);
2965
707
    return;
2966
707
  }
2967
571k
  if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2968
961
    if (
2969
961
      state->_form_element != NULL
2970
961
      && !has_open_element(parser, GUMBO_TAG_TEMPLATE)
2971
961
    ) {
2972
477
      gumbo_debug("Ignoring nested form.\n");
2973
477
      parser_add_parse_error(parser, token);
2974
477
      ignore_token(parser);
2975
477
      return;
2976
477
    }
2977
484
    maybe_implicitly_close_p_tag(parser, token);
2978
484
    GumboNode* form_element = insert_element_from_token(parser, token);
2979
484
    if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2980
246
      state->_form_element = form_element;
2981
246
    }
2982
484
    return;
2983
961
  }
2984
570k
  if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2985
2.09k
    maybe_implicitly_close_list_tag(parser, token, true);
2986
2.09k
    maybe_implicitly_close_p_tag(parser, token);
2987
2.09k
    insert_element_from_token(parser, token);
2988
2.09k
    return;
2989
2.09k
  }
2990
568k
  if (tag_in(token, kStartTag, &dd_dt_tags)) {
2991
914
    maybe_implicitly_close_list_tag(parser, token, false);
2992
914
    maybe_implicitly_close_p_tag(parser, token);
2993
914
    insert_element_from_token(parser, token);
2994
914
    return;
2995
914
  }
2996
567k
  if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2997
45
    maybe_implicitly_close_p_tag(parser, token);
2998
45
    insert_element_from_token(parser, token);
2999
45
    gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
3000
45
    return;
3001
45
  }
3002
567k
  if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
3003
137
    if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
3004
55
      parser_add_parse_error(parser, token);
3005
      // We don't want to use implicitly_close_tags here because it may add an
3006
      // error and we've already added the only error the standard specifies.
3007
55
      generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3008
170
      while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON))
3009
115
        ;
3010
55
    }
3011
137
    reconstruct_active_formatting_elements(parser);
3012
137
    insert_element_from_token(parser, token);
3013
137
    set_frameset_not_ok(parser);
3014
137
    return;
3015
137
  }
3016
567k
  if (
3017
567k
    tag_in(token, kEndTag, &(const TagSet) {
3018
567k
      TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
3019
567k
      TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
3020
567k
      TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
3021
567k
      TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL),
3022
567k
      TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL), TAG(SEARCH)
3023
567k
    })
3024
567k
  ) {
3025
291
    GumboTag tag = token->v.end_tag.tag;
3026
291
    if (!has_an_element_in_scope(parser, tag)) {
3027
83
      parser_add_parse_error(parser, token);
3028
83
      ignore_token(parser);
3029
83
      return;
3030
83
    }
3031
208
    return implicitly_close_tags (
3032
208
      parser,
3033
208
      token,
3034
208
      GUMBO_NAMESPACE_HTML,
3035
208
      token->v.end_tag.tag
3036
208
    );
3037
291
  }
3038
567k
  if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
3039
795
    if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3040
267
      if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
3041
184
        parser_add_parse_error(parser, token);
3042
184
        ignore_token(parser);
3043
184
        return;
3044
184
      }
3045
83
      generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3046
83
      if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM))
3047
22
        parser_add_parse_error(parser, token);
3048
231
      while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
3049
148
        ;
3050
83
      return;
3051
528
    } else {
3052
528
      GumboNode* node = state->_form_element;
3053
528
      assert(!node || node->type == GUMBO_NODE_ELEMENT);
3054
0
      state->_form_element = NULL;
3055
528
      if (!node || !has_node_in_scope(parser, node)) {
3056
391
        gumbo_debug("Closing an unopened form.\n");
3057
391
        parser_add_parse_error(parser, token);
3058
391
        ignore_token(parser);
3059
391
        return;
3060
391
      }
3061
      // Since we remove the form node without popping, we need to make sure
3062
      // that we flush any text nodes at the end of the form.
3063
137
      maybe_flush_text_node_buffer(parser);
3064
      // This differs from implicitly_close_tags because we remove *only* the
3065
      // <form> element; other nodes are left in scope.
3066
137
      generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3067
137
      if (get_current_node(parser) != node)
3068
34
        parser_add_parse_error(parser, token);
3069
103
      else
3070
103
        record_end_of_element(token, &node->v.element);
3071
3072
137
      GumboVector* open_elements = &state->_open_elements;
3073
137
      int index = gumbo_vector_index_of(open_elements, node);
3074
137
      assert(index >= 0);
3075
0
      gumbo_vector_remove_at(index, open_elements);
3076
137
      return;
3077
528
    }
3078
795
  }
3079
566k
  if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
3080
2.32k
    if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
3081
2.09k
      parser_add_parse_error(parser, token);
3082
      // reconstruct_active_formatting_elements(parser);
3083
2.09k
      insert_element_of_tag_type (
3084
2.09k
        parser,
3085
2.09k
        GUMBO_TAG_P,
3086
2.09k
        GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3087
2.09k
      );
3088
2.09k
    }
3089
2.32k
    implicitly_close_tags (
3090
2.32k
      parser,
3091
2.32k
      token,
3092
2.32k
      GUMBO_NAMESPACE_HTML,
3093
2.32k
      GUMBO_TAG_P
3094
2.32k
    );
3095
2.32k
    return;
3096
2.32k
  }
3097
564k
  if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3098
311
    if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
3099
194
      parser_add_parse_error(parser, token);
3100
194
      ignore_token(parser);
3101
194
      return;
3102
194
    }
3103
117
    implicitly_close_tags (
3104
117
      parser,
3105
117
      token,
3106
117
      GUMBO_NAMESPACE_HTML,
3107
117
      GUMBO_TAG_LI
3108
117
    );
3109
117
    return;
3110
311
  }
3111
563k
  if (tag_in(token, kEndTag, &dd_dt_tags)) {
3112
491
    GumboTag token_tag = token->v.end_tag.tag;
3113
491
    if (!has_an_element_in_scope(parser, token_tag)) {
3114
445
      parser_add_parse_error(parser, token);
3115
445
      ignore_token(parser);
3116
445
      return;
3117
445
    }
3118
46
    implicitly_close_tags (
3119
46
      parser,
3120
46
      token,
3121
46
      GUMBO_NAMESPACE_HTML,
3122
46
      token_tag
3123
46
    );
3124
46
    return;
3125
491
  }
3126
563k
  if (tag_in(token, kEndTag, &heading_tags)) {
3127
2.55k
    if (
3128
2.55k
      !has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
3129
2.55k
        GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
3130
2.55k
        GUMBO_TAG_H5, GUMBO_TAG_H6
3131
2.55k
      })
3132
2.55k
    ) {
3133
      // No heading open; ignore the token entirely.
3134
579
      parser_add_parse_error(parser, token);
3135
579
      ignore_token(parser);
3136
579
      return;
3137
579
    }
3138
1.97k
    generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3139
1.97k
    const GumboNode* current_node = get_current_node(parser);
3140
1.97k
    if (!node_html_tag_is(current_node, token->v.end_tag.tag)) {
3141
      // There're children of the heading currently open; close them below and
3142
      // record a parse error.
3143
      // TODO(jdtang): Add a way to distinguish this error case from the one
3144
      // above.
3145
1.74k
      parser_add_parse_error(parser, token);
3146
1.74k
    }
3147
3.00k
    do {
3148
3.00k
      current_node = pop_current_node(parser);
3149
3.00k
    } while (!node_tag_in_set(current_node, &heading_tags));
3150
1.97k
    return;
3151
2.55k
  }
3152
560k
  if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3153
63.9k
    int last_a;
3154
63.9k
    int has_matching_a = find_last_anchor_index(parser, &last_a);
3155
63.9k
    if (has_matching_a) {
3156
26.0k
      assert(has_matching_a == 1);
3157
0
      parser_add_parse_error(parser, token);
3158
26.0k
      (void)adoption_agency_algorithm(parser, token);
3159
      // The adoption agency algorithm usually removes all instances of <a>
3160
      // from the list of active formatting elements, but in case it doesn't,
3161
      // we're supposed to do this. (The conditions where it might not are
3162
      // listed in the spec.)
3163
26.0k
      if (find_last_anchor_index(parser, &last_a)) {
3164
883
        void* last_element = gumbo_vector_remove_at (
3165
883
          last_a,
3166
883
          &state->_active_formatting_elements
3167
883
        );
3168
883
        gumbo_vector_remove(last_element, &state->_open_elements);
3169
883
      }
3170
26.0k
    }
3171
0
    reconstruct_active_formatting_elements(parser);
3172
63.9k
    add_formatting_element(parser, insert_element_from_token(parser, token));
3173
63.9k
    return;
3174
63.9k
  }
3175
496k
  if (
3176
496k
    tag_in(token, kStartTag, &(const TagSet) {
3177
496k
      TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S),
3178
496k
      TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)
3179
496k
    })
3180
496k
  ) {
3181
229k
    reconstruct_active_formatting_elements(parser);
3182
229k
    add_formatting_element(parser, insert_element_from_token(parser, token));
3183
229k
    return;
3184
229k
  }
3185
267k
  if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3186
2.52k
    reconstruct_active_formatting_elements(parser);
3187
2.52k
    if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
3188
1.82k
      parser_add_parse_error(parser, token);
3189
1.82k
      adoption_agency_algorithm(parser, token);
3190
1.82k
      reconstruct_active_formatting_elements(parser);
3191
1.82k
    }
3192
2.52k
    insert_element_from_token(parser, token);
3193
2.52k
    add_formatting_element(parser, get_current_node(parser));
3194
2.52k
    return;
3195
2.52k
  }
3196
265k
  if (
3197
265k
    tag_in(token, kEndTag, &(const TagSet) {
3198
265k
      TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
3199
265k
      TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT),
3200
265k
      TAG(U)
3201
265k
    })
3202
265k
  ) {
3203
2.96k
    adoption_agency_algorithm(parser, token);
3204
2.96k
    return;
3205
2.96k
  }
3206
262k
  if (
3207
262k
    tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3208
262k
  ) {
3209
64.0k
    reconstruct_active_formatting_elements(parser);
3210
64.0k
    insert_element_from_token(parser, token);
3211
64.0k
    add_formatting_element(parser, &kActiveFormattingScopeMarker);
3212
64.0k
    set_frameset_not_ok(parser);
3213
64.0k
    return;
3214
64.0k
  }
3215
197k
  if (
3216
197k
    tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3217
197k
  ) {
3218
129
    GumboTag token_tag = token->v.end_tag.tag;
3219
129
    if (!has_an_element_in_scope(parser, token_tag)) {
3220
34
      parser_add_parse_error(parser, token);
3221
34
      ignore_token(parser);
3222
34
      return;
3223
34
    }
3224
95
    implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
3225
95
    clear_active_formatting_elements(parser);
3226
95
    return;
3227
129
  }
3228
197k
  if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3229
8.31k
    if (
3230
8.31k
      get_document_node(parser)->v.document.doc_type_quirks_mode
3231
8.31k
        != GUMBO_DOCTYPE_QUIRKS
3232
8.31k
    ) {
3233
91
      maybe_implicitly_close_p_tag(parser, token);
3234
91
    }
3235
8.31k
    insert_element_from_token(parser, token);
3236
8.31k
    set_frameset_not_ok(parser);
3237
8.31k
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3238
8.31k
    return;
3239
8.31k
  }
3240
189k
  if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3241
290
    parser_add_parse_error(parser, token);
3242
290
    reconstruct_active_formatting_elements(parser);
3243
290
    insert_element_of_tag_type (
3244
290
      parser,
3245
290
      GUMBO_TAG_BR,
3246
290
      GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3247
290
    );
3248
290
    pop_current_node(parser);
3249
290
    acknowledge_self_closing_tag(parser);
3250
290
    set_frameset_not_ok(parser);
3251
290
    return;
3252
290
  }
3253
189k
  if (
3254
189k
    tag_in(token, kStartTag, &(const TagSet) {
3255
189k
      TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN),
3256
189k
      TAG(WBR)
3257
189k
    })
3258
189k
  ) {
3259
397
    bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE);
3260
397
    if (is_image) {
3261
58
      parser_add_parse_error(parser, token);
3262
58
      token->v.start_tag.tag = GUMBO_TAG_IMG;
3263
58
    }
3264
397
    reconstruct_active_formatting_elements(parser);
3265
397
    GumboNode* node = insert_element_from_token(parser, token);
3266
397
    if (is_image)
3267
58
      node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
3268
397
    pop_current_node(parser);
3269
397
    acknowledge_self_closing_tag(parser);
3270
397
    set_frameset_not_ok(parser);
3271
397
    return;
3272
397
  }
3273
188k
  if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3274
1.12k
    reconstruct_active_formatting_elements(parser);
3275
1.12k
    GumboNode *input = insert_element_from_token(parser, token);
3276
1.12k
    pop_current_node(parser);
3277
1.12k
    acknowledge_self_closing_tag(parser);
3278
1.12k
    if (!attribute_matches(&input->v.element.attributes, "type", "hidden"))
3279
969
      set_frameset_not_ok(parser);
3280
1.12k
    return;
3281
1.12k
  }
3282
187k
  if (
3283
187k
    tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})
3284
187k
  ) {
3285
256
    insert_element_from_token(parser, token);
3286
256
    pop_current_node(parser);
3287
256
    acknowledge_self_closing_tag(parser);
3288
256
    return;
3289
256
  }
3290
187k
  if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3291
475
    maybe_implicitly_close_p_tag(parser, token);
3292
475
    insert_element_from_token(parser, token);
3293
475
    pop_current_node(parser);
3294
475
    acknowledge_self_closing_tag(parser);
3295
475
    set_frameset_not_ok(parser);
3296
475
    return;
3297
475
  }
3298
186k
  if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3299
78
    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
3300
78
    parser->_parser_state->_ignore_next_linefeed = true;
3301
78
    set_frameset_not_ok(parser);
3302
78
    return;
3303
78
  }
3304
186k
  if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3305
494
    maybe_implicitly_close_p_tag(parser, token);
3306
494
    reconstruct_active_formatting_elements(parser);
3307
494
    set_frameset_not_ok(parser);
3308
494
    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3309
494
    return;
3310
494
  }
3311
186k
  if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3312
36
    set_frameset_not_ok(parser);
3313
36
    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3314
36
    return;
3315
36
  }
3316
186k
  if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3317
101
    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3318
101
    return;
3319
101
  }
3320
186k
  if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3321
11.0k
    reconstruct_active_formatting_elements(parser);
3322
11.0k
    insert_element_from_token(parser, token);
3323
11.0k
    set_frameset_not_ok(parser);
3324
11.0k
    GumboInsertionMode state = parser->_parser_state->_insertion_mode;
3325
11.0k
    if (
3326
11.0k
      state == GUMBO_INSERTION_MODE_IN_TABLE
3327
11.0k
      || state == GUMBO_INSERTION_MODE_IN_CAPTION
3328
11.0k
      || state == GUMBO_INSERTION_MODE_IN_TABLE_BODY
3329
11.0k
      || state == GUMBO_INSERTION_MODE_IN_ROW
3330
11.0k
      || state == GUMBO_INSERTION_MODE_IN_CELL
3331
11.0k
    ) {
3332
8.58k
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
3333
8.58k
    } else {
3334
2.50k
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
3335
2.50k
    }
3336
11.0k
    return;
3337
11.0k
  }
3338
175k
  if (
3339
175k
    tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)})
3340
175k
  ) {
3341
6.63k
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3342
1.68k
      pop_current_node(parser);
3343
1.68k
    }
3344
6.63k
    reconstruct_active_formatting_elements(parser);
3345
6.63k
    insert_element_from_token(parser, token);
3346
6.63k
    return;
3347
6.63k
  }
3348
168k
  if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) {
3349
1.39k
    if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3350
265
      generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3351
265
      if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY))
3352
97
        parser_add_parse_error(parser, token);
3353
265
    }
3354
1.39k
    insert_element_from_token(parser, token);
3355
1.39k
    return;
3356
1.39k
  }
3357
167k
  if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) {
3358
1.02k
    if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3359
76
      generate_implied_end_tags(parser, GUMBO_TAG_RTC, NULL);
3360
76
      GumboNode* current = get_current_node(parser);
3361
76
      if (!node_html_tag_is(current, GUMBO_TAG_RUBY) &&
3362
76
          !node_html_tag_is(current, GUMBO_TAG_RTC)) {
3363
16
        parser_add_parse_error(parser, token);
3364
16
      }
3365
76
    }
3366
1.02k
    insert_element_from_token(parser, token);
3367
1.02k
    return;
3368
1.02k
  }
3369
166k
  if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3370
1.20k
    reconstruct_active_formatting_elements(parser);
3371
1.20k
    adjust_mathml_attributes(token);
3372
1.20k
    adjust_foreign_attributes(token);
3373
1.20k
    insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
3374
1.20k
    if (token->v.start_tag.is_self_closing) {
3375
174
      pop_current_node(parser);
3376
174
      acknowledge_self_closing_tag(parser);
3377
174
    }
3378
1.20k
    return;
3379
1.20k
  }
3380
164k
  if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3381
2.08k
    reconstruct_active_formatting_elements(parser);
3382
2.08k
    adjust_svg_attributes(token);
3383
2.08k
    adjust_foreign_attributes(token);
3384
2.08k
    insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
3385
2.08k
    if (token->v.start_tag.is_self_closing) {
3386
67
      pop_current_node(parser);
3387
67
      acknowledge_self_closing_tag(parser);
3388
67
    }
3389
2.08k
    return;
3390
2.08k
  }
3391
162k
  if (
3392
162k
    tag_in(token, kStartTag, &(const TagSet) {
3393
162k
      TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
3394
162k
      TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
3395
162k
    })
3396
162k
  ) {
3397
2.47k
    parser_add_parse_error(parser, token);
3398
2.47k
    ignore_token(parser);
3399
2.47k
    return;
3400
2.47k
  }
3401
160k
  if (token->type == GUMBO_TOKEN_START_TAG) {
3402
152k
    reconstruct_active_formatting_elements(parser);
3403
152k
    insert_element_from_token(parser, token);
3404
152k
    return;
3405
152k
  }
3406
7.63k
  in_body_any_other_end_tag(parser, token);
3407
7.63k
}
3408
3409
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata
3410
1.83M
static void handle_text(GumboParser* parser, GumboToken* token) {
3411
1.83M
  if (
3412
1.83M
    token->type == GUMBO_TOKEN_CHARACTER
3413
1.83M
    || token->type == GUMBO_TOKEN_WHITESPACE
3414
1.83M
  ) {
3415
1.83M
    insert_text_token(parser, token);
3416
1.83M
    return;
3417
1.83M
  }
3418
  // We provide only bare-bones script handling that doesn't involve any of
3419
  // the parser-pause/already-started/script-nesting flags or re-entrant
3420
  // invocations of the tokenizer. Because the intended usage of this library
3421
  // is mostly for templating, refactoring, and static-analysis libraries, we
3422
  // provide the script body as a text-node child of the <script> element.
3423
  // This behavior doesn't support document.write of partial HTML elements,
3424
  // but should be adequate for almost all other scripting support.
3425
2.91k
  if (token->type == GUMBO_TOKEN_EOF) {
3426
1.03k
    parser_add_parse_error(parser, token);
3427
1.03k
    parser->_parser_state->_reprocess_current_token = true;
3428
1.03k
  }
3429
2.91k
  pop_current_node(parser);
3430
2.91k
  set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3431
2.91k
}
3432
3433
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
3434
2.35M
static void handle_in_table(GumboParser* parser, GumboToken* token) {
3435
2.35M
  GumboParserState* state = parser->_parser_state;
3436
2.35M
  if (
3437
2.35M
    (token->type == GUMBO_TOKEN_CHARACTER
3438
2.35M
     || token->type == GUMBO_TOKEN_WHITESPACE
3439
2.35M
     || token->type == GUMBO_TOKEN_NULL)
3440
2.35M
    && node_tag_in_set(get_current_node(parser), &(const TagSet) {
3441
1.53M
      TAG(TABLE), TAG(TBODY), TAG(TEMPLATE), TAG(TFOOT), TAG(THEAD), TAG(TR)
3442
1.53M
    })
3443
2.35M
  ) {
3444
    // The "pending table character tokens" list described in the spec is
3445
    // nothing more than the TextNodeBufferState. We accumulate text tokens as
3446
    // normal, except that when we go to flush them in the handle_in_table_text,
3447
    // we set _foster_parent_insertions if there're non-whitespace characters in
3448
    // the buffer.
3449
11.3k
    assert(state->_text_node._buffer.length == 0);
3450
0
    assert(state->_table_character_tokens.length == 0);
3451
0
    state->_original_insertion_mode = state->_insertion_mode;
3452
11.3k
    state->_reprocess_current_token = true;
3453
11.3k
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3454
11.3k
    return;
3455
11.3k
  }
3456
2.34M
  if (token->type == GUMBO_TOKEN_COMMENT) {
3457
4.38k
    append_comment_node(parser, get_current_node(parser), token);
3458
4.38k
    return;
3459
4.38k
  }
3460
2.34M
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
3461
465
    parser_add_parse_error(parser, token);
3462
465
    ignore_token(parser);
3463
465
    return;
3464
465
  }
3465
2.34M
  if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3466
1.53k
    clear_stack_to_table_context(parser);
3467
1.53k
    add_formatting_element(parser, &kActiveFormattingScopeMarker);
3468
1.53k
    insert_element_from_token(parser, token);
3469
1.53k
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3470
1.53k
    return;
3471
1.53k
  }
3472
2.34M
  if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3473
773
    clear_stack_to_table_context(parser);
3474
773
    insert_element_from_token(parser, token);
3475
773
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3476
773
    return;
3477
773
  }
3478
2.33M
  if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3479
722
    clear_stack_to_table_context(parser);
3480
722
    insert_element_of_tag_type (
3481
722
      parser,
3482
722
      GUMBO_TAG_COLGROUP,
3483
722
      GUMBO_INSERTION_IMPLIED
3484
722
    );
3485
722
    state->_reprocess_current_token = true;
3486
722
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3487
722
    return;
3488
722
  }
3489
2.33M
  if (
3490
2.33M
    tag_in(token, kStartTag, &(const TagSet) {
3491
2.33M
      TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3492
2.33M
    })
3493
2.33M
  ) {
3494
3.76k
    clear_stack_to_table_context(parser);
3495
3.76k
    insert_element_from_token(parser, token);
3496
3.76k
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3497
3.76k
    return;
3498
3.76k
  }
3499
2.33M
  if (
3500
2.33M
    tag_in(token, kStartTag, &(const TagSet) {
3501
2.33M
      TAG(TD), TAG(TH), TAG(TR)
3502
2.33M
    })
3503
2.33M
  ) {
3504
1.81k
    clear_stack_to_table_context(parser);
3505
1.81k
    insert_element_of_tag_type (
3506
1.81k
      parser,
3507
1.81k
      GUMBO_TAG_TBODY,
3508
1.81k
      GUMBO_INSERTION_IMPLIED
3509
1.81k
    );
3510
1.81k
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3511
1.81k
    state->_reprocess_current_token = true;
3512
1.81k
    return;
3513
1.81k
  }
3514
2.33M
  if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3515
6.82k
    parser_add_parse_error(parser, token);
3516
6.82k
    if (close_table(parser)) {
3517
6.64k
      state->_reprocess_current_token = true;
3518
6.64k
    } else {
3519
178
      ignore_token(parser);
3520
178
    }
3521
6.82k
    return;
3522
6.82k
  }
3523
2.32M
  if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3524
493
    if (!close_table(parser)) {
3525
26
      parser_add_parse_error(parser, token);
3526
26
      return;
3527
26
    }
3528
467
    return;
3529
493
  }
3530
2.32M
  if (
3531
2.32M
    tag_in(token, kEndTag, &(const TagSet) {
3532
2.32M
      TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3533
2.32M
      TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
3534
2.32M
    })
3535
2.32M
  ) {
3536
2.19k
    parser_add_parse_error(parser, token);
3537
2.19k
    ignore_token(parser);
3538
2.19k
    return;
3539
2.19k
  }
3540
2.32M
  if (
3541
2.32M
    tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)})
3542
2.32M
    || (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))
3543
2.32M
  ) {
3544
883
    handle_in_head(parser, token);
3545
883
    return;
3546
883
  }
3547
2.32M
  if (
3548
2.32M
    tag_is(token, kStartTag, GUMBO_TAG_INPUT)
3549
2.32M
    && attribute_matches(&token->v.start_tag.attributes, "type", "hidden")
3550
2.32M
  ) {
3551
59
    parser_add_parse_error(parser, token);
3552
59
    insert_element_from_token(parser, token);
3553
59
    pop_current_node(parser);
3554
59
    acknowledge_self_closing_tag(parser);
3555
59
    return;
3556
59
  }
3557
2.32M
  if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3558
9.23k
    parser_add_parse_error(parser, token);
3559
9.23k
    if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3560
9.08k
      ignore_token(parser);
3561
9.08k
      return;
3562
9.08k
    }
3563
143
    state->_form_element = insert_element_from_token(parser, token);
3564
143
    pop_current_node(parser);
3565
143
    return;
3566
9.23k
  }
3567
2.31M
  if (token->type == GUMBO_TOKEN_EOF) {
3568
1.04k
    handle_in_body(parser, token);
3569
1.04k
    return;
3570
1.04k
  }
3571
  // foster-parenting-start-tag or foster-parenting-end-tag error
3572
2.31M
  parser_add_parse_error(parser, token);
3573
2.31M
  state->_foster_parent_insertions = true;
3574
2.31M
  handle_in_body(parser, token);
3575
2.31M
  state->_foster_parent_insertions = false;
3576
2.31M
}
3577
3578
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
3579
420k
// Token handler for the "in table text" insertion mode.
//
// NULL tokens are reported as parse errors and ignored. Whitespace and
// character tokens are inserted as text and additionally recorded in the
// parser state's table character token buffer. Any other token ends the run:
// the buffered text is flushed, the buffer is cleared, the original insertion
// mode is restored and the current token is flagged for reprocessing.
static void handle_in_table_text(GumboParser* parser, GumboToken* token) {
  GumboParserState* const state = parser->_parser_state;
  GumboCharacterTokenBuffer* const pending = &state->_table_character_tokens;

  switch (token->type) {
    case GUMBO_TOKEN_NULL:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;

    case GUMBO_TOKEN_WHITESPACE:
    case GUMBO_TOKEN_CHARACTER:
      // Non-whitespace tokens will cause parse errors later, once the run of
      // text ends. The spec is not entirely clear about how this is supposed
      // to work: https://github.com/whatwg/html/issues/4046
      insert_text_token(parser, token);
      gumbo_character_token_buffer_append(token, pending);
      return;

    default:
      break;
  }

  if (state->_text_node._type != GUMBO_NODE_WHITESPACE) {
    // The pending text node is not of whitespace type, so every buffered
    // character is an error and each one is reported using its own token.
    const size_t pending_count = pending->length;
    for (size_t idx = 0; idx < pending_count; ++idx) {
      GumboToken buffered;
      gumbo_character_token_buffer_get(pending, idx, &buffered);
      // foster-parenting-character error
      parser_add_parse_error(parser, &buffered);
    }
    // Foster parenting stays enabled only for the flush below.
    state->_foster_parent_insertions = true;
    set_frameset_not_ok(parser);
    reconstruct_active_formatting_elements(parser);
  }

  maybe_flush_text_node_buffer(parser);
  gumbo_character_token_buffer_clear(pending);
  state->_foster_parent_insertions = false;
  state->_reprocess_current_token = true;
  state->_insertion_mode = state->_original_insertion_mode;
}
3616
3617
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
3618
26.6k
// Token handler for the "in caption" insertion mode.
//
// A </caption> end tag closes the caption. A table-structure start tag or a
// </table> end tag closes the caption in the same way and then flags the
// token for reprocessing. A fixed set of stray end tags is reported and
// ignored; everything else is delegated to handle_in_body().
static void handle_in_caption(GumboParser* parser, GumboToken* token) {
  static const TagSet caption_closing_start_tags = {
    TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
    TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
  };
  static const TagSet ignored_end_tags = {
    TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD),
    TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
  };

  const bool explicit_close = tag_is(token, kEndTag, GUMBO_TAG_CAPTION);
  const bool implied_close =
    !explicit_close
    && (
      tag_in(token, kStartTag, &caption_closing_start_tags)
      || tag_is(token, kEndTag, GUMBO_TAG_TABLE)
    );

  if (explicit_close || implied_close) {
    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
    if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
      parser_add_parse_error(parser, token);
    }
    // Pop open elements up to and including the <caption>.
    for (;;) {
      if (node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION)) {
        break;
      }
    }
    clear_active_formatting_elements(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    if (implied_close) {
      // The token that implied the close still has to be handled in the
      // "in table" mode.
      parser->_parser_state->_reprocess_current_token = true;
    }
    return;
  }

  if (tag_in(token, kEndTag, &ignored_end_tags)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return;
  }

  handle_in_body(parser, token);
}
3668
3669
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup
3670
54.3k
// Token handler for the "in column group" insertion mode.
//
// Whitespace and comments are inserted, DOCTYPEs are rejected, <col> is
// inserted and immediately popped, and </colgroup> pops the current
// <colgroup>. Any token not otherwise handled pops the current <colgroup>
// (when there is one) and is reprocessed in the "in table" mode.
static void handle_in_column_group(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return;
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    handle_in_body(parser, token);
    return;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
    insert_element_from_token(parser, token);
    pop_current_node(parser);
    acknowledge_self_closing_tag(parser);
    return;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
      pop_current_node(parser);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    } else {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
    }
    return;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return;
  }

  const bool template_tag =
    tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)
    || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE);
  if (template_tag) {
    handle_in_head(parser, token);
    return;
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    handle_in_body(parser, token);
    return;
  }

  // Anything else.
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
    pop_current_node(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    parser->_parser_state->_reprocess_current_token = true;
  } else {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
  }
}
3729
3730
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
3731
851k
// Token handler for the "in table body" insertion mode.
//
// <tr> opens a row; <td>/<th> open an implied <tr> first and are then
// reprocessed. Section end tags close the current section. Tokens that
// belong one level up close the section and are reprocessed in the
// "in table" mode. Everything not handled here goes to handle_in_table().
static void handle_in_table_body(GumboParser* parser, GumboToken* token) {
  static const TagSet section_end_tags = {
    TAG(TBODY), TAG(TFOOT), TAG(THEAD)
  };
  static const TagSet section_closing_start_tags = {
    TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
    TAG(THEAD)
  };
  static const TagSet ignored_end_tags = {
    TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD),
    TAG(TH), TAG(TR)
  };

  if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
    clear_stack_to_table_body_context(parser);
    insert_element_from_token(parser, token);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
    return;
  }

  if (tag_in(token, kStartTag, &td_th_tags)) {
    // A cell without a row: report it, synthesize the <tr>, then let the
    // "in row" mode see the cell token.
    parser_add_parse_error(parser, token);
    clear_stack_to_table_body_context(parser);
    insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
    parser->_parser_state->_reprocess_current_token = true;
    return;
  }

  if (tag_in(token, kEndTag, &section_end_tags)) {
    if (has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
      clear_stack_to_table_body_context(parser);
      pop_current_node(parser);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    } else {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
    }
    return;
  }

  const bool closes_section =
    tag_in(token, kStartTag, &section_closing_start_tags)
    || tag_is(token, kEndTag, GUMBO_TAG_TABLE);
  if (closes_section) {
    const bool section_in_scope =
      has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY)
      || has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD)
      || has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT);
    if (!section_in_scope) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    clear_stack_to_table_body_context(parser);
    pop_current_node(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
    parser->_parser_state->_reprocess_current_token = true;
    return;
  }

  if (tag_in(token, kEndTag, &ignored_end_tags)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return;
  }

  handle_in_table(parser, token);
}
3795
3796
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
3797
432k
// Token handler for the "in row" insertion mode.
//
// <td>/<th> open a cell and push a formatting scope marker. </tr> closes the
// row. Tokens that belong to an enclosing table section close the row and are
// reprocessed in the "in table body" mode. Everything not handled here goes
// to handle_in_table().
static void handle_in_row(GumboParser* parser, GumboToken* token) {
  static const TagSet row_closing_start_tags = {
    TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
    TAG(THEAD), TAG(TR)
  };
  static const TagSet section_end_tags = {
    TAG(TBODY), TAG(TFOOT), TAG(THEAD)
  };
  static const TagSet ignored_end_tags = {
    TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
    TAG(TD), TAG(TH)
  };

  if (tag_in(token, kStartTag, &td_th_tags)) {
    clear_stack_to_table_row_context(parser);
    insert_element_from_token(parser, token);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
    add_formatting_element(parser, &kActiveFormattingScopeMarker);
    return;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
    if (has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
      clear_stack_to_table_row_context(parser);
      pop_current_node(parser);
      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
    } else {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
    }
    return;
  }

  const bool closes_row =
    tag_in(token, kStartTag, &row_closing_start_tags)
    || tag_is(token, kEndTag, GUMBO_TAG_TABLE);
  if (closes_row) {
    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    clear_stack_to_table_row_context(parser);
    pop_current_node(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
    parser->_parser_state->_reprocess_current_token = true;
    return;
  }

  if (tag_in(token, kEndTag, &section_end_tags)) {
    if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
      // Ignored without recording a parse error.
      ignore_token(parser);
      return;
    }
    clear_stack_to_table_row_context(parser);
    pop_current_node(parser);
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
    parser->_parser_state->_reprocess_current_token = true;
    return;
  }

  if (tag_in(token, kEndTag, &ignored_end_tags)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return;
  }

  handle_in_table(parser, token);
}
3864
3865
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
3866
667k
// Token handler for the "in cell" insertion mode.
//
// </td> and </th> close the matching cell. Table-structure start tags and
// table-structure end tags close the current cell and are then reprocessed.
// A fixed set of stray end tags is reported and ignored; everything else is
// delegated to handle_in_body().
static void handle_in_cell(GumboParser* parser, GumboToken* token) {
  static const TagSet cell_closing_start_tags = {
    TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
    TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
  };
  static const TagSet ignored_end_tags = {
    TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML)
  };
  static const TagSet cell_closing_end_tags = {
    TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
  };

  if (tag_in(token, kEndTag, &td_th_tags)) {
    const GumboTag cell_tag = token->v.end_tag.tag;
    if (has_an_element_in_table_scope(parser, cell_tag)) {
      close_table_cell(parser, token, cell_tag);
    } else {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
    }
    return;
  }

  if (tag_in(token, kStartTag, &cell_closing_start_tags)) {
    gumbo_debug("Handling <td> in cell.\n");
    const bool cell_in_scope =
      has_an_element_in_table_scope(parser, GUMBO_TAG_TH)
      || has_an_element_in_table_scope(parser, GUMBO_TAG_TD);
    if (!cell_in_scope) {
      gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    parser->_parser_state->_reprocess_current_token = true;
    close_current_cell(parser, token);
    return;
  }

  if (tag_in(token, kEndTag, &ignored_end_tags)) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return;
  }

  if (tag_in(token, kEndTag, &cell_closing_end_tags)) {
    if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    parser->_parser_state->_reprocess_current_token = true;
    close_current_cell(parser, token);
    return;
  }

  handle_in_body(parser, token);
}
3922
3923
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
3924
262k
// Token handler for the "in select" insertion mode.
//
// Text and comments are inserted; <option>, <optgroup> and <hr> are inserted
// after popping an open <option>/<optgroup> as needed; </select> and a nested
// <select> close the current select. <input>, <keygen> and <textarea> close
// the select and are reprocessed. <script>, <template> and </template> go to
// handle_in_head(), EOF goes to handle_in_body(), and any other token is
// reported and ignored.
static void handle_in_select(GumboParser* parser, GumboToken* token) {
  static const TagSet select_closing_start_tags = {
    TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)
  };
  static const TagSet in_head_start_tags = {TAG(SCRIPT), TAG(TEMPLATE)};

  switch (token->type) {
    case GUMBO_TOKEN_NULL:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    case GUMBO_TOKEN_CHARACTER:
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return;
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    handle_in_body(parser, token);
    return;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
      pop_current_node(parser);
    }
    insert_element_from_token(parser, token);
    return;
  }

  // <optgroup> and <hr> both close an open <option> and then an open
  // <optgroup>; <hr> is additionally popped right after being inserted.
  const bool starts_optgroup = tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP);
  if (starts_optgroup || tag_is(token, kStartTag, GUMBO_TAG_HR)) {
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
      pop_current_node(parser);
    }
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
      pop_current_node(parser);
    }
    insert_element_from_token(parser, token);
    if (!starts_optgroup) {
      pop_current_node(parser);
      acknowledge_self_closing_tag(parser);
    }
    return;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
    GumboVector* stack = &parser->_parser_state->_open_elements;
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
      // Only pop the <option> when the element directly below it on the
      // stack of open elements is an <optgroup>.
      if (node_html_tag_is(stack->data[stack->length - 2], GUMBO_TAG_OPTGROUP)) {
        pop_current_node(parser);
      }
    }
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
      pop_current_node(parser);
    } else {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
    }
    return;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
      pop_current_node(parser);
    } else {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
    }
    return;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
    if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
      close_current_select(parser);
    } else {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
    }
    return;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
    // A nested <select> is always an error; it acts like </select> when a
    // select is in select scope.
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
      close_current_select(parser);
    }
    return;
  }

  if (tag_in(token, kStartTag, &select_closing_start_tags)) {
    parser_add_parse_error(parser, token);
    if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
      close_current_select(parser);
      parser->_parser_state->_reprocess_current_token = true;
    } else {
      ignore_token(parser);
    }
    return;
  }

  const bool in_head_token =
    tag_in(token, kStartTag, &in_head_start_tags)
    || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE);
  if (in_head_token) {
    handle_in_head(parser, token);
    return;
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    handle_in_body(parser, token);
    return;
  }

  // Anything else.
  parser_add_parse_error(parser, token);
  ignore_token(parser);
}
4050
4051
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable
4052
147k
// Token handler for the "in select in table" insertion mode.
//
// Table-structure start tags, and table-structure end tags whose element is
// in table scope, close the current select and are reprocessed. Both cases
// record a parse error. Every other token is delegated to handle_in_select().
static void handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
  static const TagSet table_tags = {
    TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
    TAG(TR), TAG(TD), TAG(TH)
  };

  const bool table_start = tag_in(token, kStartTag, &table_tags);
  if (!table_start && !tag_in(token, kEndTag, &table_tags)) {
    handle_in_select(parser, token);
    return;
  }

  parser_add_parse_error(parser, token);
  if (!table_start) {
    // End tag: only acts when the matching element is in table scope.
    if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
      ignore_token(parser);
      return;
    }
  }
  close_current_select(parser);
  parser->_parser_state->_reprocess_current_token = true;
}
4075
4076
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
4077
27.1k
// Token handler for the "in template" insertion mode.
//
// Text-like tokens go to handle_in_body(); head-level start tags and
// </template> go to handle_in_head(). Any other start tag replaces the
// current template insertion mode with the mode that fits the tag and is
// then reprocessed. Other end tags are reported and ignored. At EOF, an open
// <template> is popped (with a parse error) and the token is reprocessed.
static void handle_in_template(GumboParser* parser, GumboToken* token) {
  static const TagSet in_head_start_tags = {
    TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
    TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
  };
  static const TagSet in_table_start_tags = {
    TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD)
  };
  GumboParserState* const state = parser->_parser_state;

  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
    case GUMBO_TOKEN_CHARACTER:
    case GUMBO_TOKEN_COMMENT:
    case GUMBO_TOKEN_NULL:
    case GUMBO_TOKEN_DOCTYPE:
      handle_in_body(parser, token);
      return;

    case GUMBO_TOKEN_START_TAG: {
      if (tag_in(token, kStartTag, &in_head_start_tags)) {
        handle_in_head(parser, token);
        return;
      }
      // Pick the insertion mode the tag belongs to, then swap it in for the
      // current template insertion mode and reprocess the token there.
      GumboInsertionMode next_mode;
      if (tag_in(token, kStartTag, &in_table_start_tags)) {
        next_mode = GUMBO_INSERTION_MODE_IN_TABLE;
      } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
        next_mode = GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
      } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
        next_mode = GUMBO_INSERTION_MODE_IN_TABLE_BODY;
      } else if (tag_in(token, kStartTag, &td_th_tags)) {
        next_mode = GUMBO_INSERTION_MODE_IN_ROW;
      } else {
        next_mode = GUMBO_INSERTION_MODE_IN_BODY;
      }
      pop_template_insertion_mode(parser);
      push_template_insertion_mode(parser, next_mode);
      set_insertion_mode(parser, next_mode);
      state->_reprocess_current_token = true;
      return;
    }

    case GUMBO_TOKEN_END_TAG:
      if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
        handle_in_head(parser, token);
        return;
      }
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;

    case GUMBO_TOKEN_EOF:
      if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
        // Stop parsing.
        return;
      }
      parser_add_parse_error(parser, token);
      // Pop open elements up to and including the <template>.
      for (;;) {
        if (node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)) {
          break;
        }
      }
      clear_active_formatting_elements(parser);
      pop_template_insertion_mode(parser);
      reset_insertion_mode_appropriately(parser);
      state->_reprocess_current_token = true;
      return;

    default:
      break;
  }
  assert(0 && "unreachable");
}
4160
4161
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
4162
1.30k
// Token handler for the "after body" insertion mode.
//
// Whitespace and <html> start tags are processed with the "in body" rules.
// Comments are appended to the output root node. DOCTYPEs are reported and
// ignored. </html> switches to the "after after body" mode (or is reported
// and ignored when parsing a fragment). EOF needs no further work. Any other
// token is a parse error that switches back to "in body" and is reprocessed.
static void handle_after_body(GumboParser* parser, GumboToken* token) {
  // The <html> start tag is handled here once; a second, identical check
  // further down could never be reached and has been removed.
  if (
    token->type == GUMBO_TOKEN_WHITESPACE
    || tag_is(token, kStartTag, GUMBO_TAG_HTML)
  ) {
    handle_in_body(parser, token);
    return;
  }
  if (token->type == GUMBO_TOKEN_COMMENT) {
    GumboNode* html_node = parser->_output->root;
    assert(html_node != NULL);
    append_comment_node(parser, html_node, token);
    return;
  }
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return;
  }
  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
    /* fragment case: ignore the closing HTML token */
    if (is_fragment_parser(parser)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
    // The first entry on the stack of open elements is expected to be <html>;
    // record where that element ends.
    GumboNode* html = parser->_parser_state->_open_elements.data[0];
    assert(node_html_tag_is(html, GUMBO_TAG_HTML));
    record_end_of_element (
      parser->_parser_state->_current_token,
      &html->v.element
    );
    return;
  }
  if (token->type == GUMBO_TOKEN_EOF) {
    // Stop parsing.
    return;
  }
  // Anything else: parse error, then reprocess under the "in body" rules.
  parser_add_parse_error(parser, token);
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  parser->_parser_state->_reprocess_current_token = true;
}
4208
4209
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset
4210
837k
// Token handler for the "in frameset" insertion mode.
//
// Whitespace and comments are inserted, DOCTYPEs are rejected, <frameset>
// and <frame> are inserted (<frame> is popped again immediately), and
// </frameset> pops the current node unless that node is <html>. <html> is
// handled by handle_in_body() and <noframes> by handle_in_head(). Any other
// token except EOF is reported and ignored.
static void handle_in_frameset(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_WHITESPACE:
      insert_text_token(parser, token);
      return;
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_current_node(parser), token);
      return;
    case GUMBO_TOKEN_DOCTYPE:
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    handle_in_body(parser, token);
    return;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
    insert_element_from_token(parser, token);
    return;
  }

  if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
    if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
      parser_add_parse_error(parser, token);
      ignore_token(parser);
      return;
    }
    pop_current_node(parser);
    if (!is_fragment_parser(parser)) {
      // Outside fragment parsing, leaving the outermost frameset moves the
      // parser on to the "after frameset" mode.
      if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
        set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
      }
    }
    return;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
    insert_element_from_token(parser, token);
    pop_current_node(parser);
    acknowledge_self_closing_tag(parser);
    return;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
    handle_in_head(parser, token);
    return;
  }

  if (token->type == GUMBO_TOKEN_EOF) {
    // Ending with anything but <html> as the current node is an error.
    if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
      parser_add_parse_error(parser, token);
    }
    return;
  }

  // Anything else.
  parser_add_parse_error(parser, token);
  ignore_token(parser);
}
4265
4266
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset
4267
244k
// Token handler for the "after frameset" insertion mode.
//
// Whitespace and comments are inserted, DOCTYPEs are rejected, <html> is
// handled by handle_in_body() and <noframes> by handle_in_head(). </html>
// records the end of the root element and switches to the
// "after after frameset" mode. EOF needs no further work; any other token is
// reported and ignored.
static void handle_after_frameset(GumboParser* parser, GumboToken* token) {
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
    insert_text_token(parser, token);
    return;
  }
  if (token->type == GUMBO_TOKEN_COMMENT) {
    append_comment_node(parser, get_current_node(parser), token);
    return;
  }
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
    parser_add_parse_error(parser, token);
    ignore_token(parser);
    return;
  }
  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    handle_in_body(parser, token);
    return;
  }
  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
    // The first entry on the stack of open elements is expected to be <html>.
    GumboNode* html = parser->_parser_state->_open_elements.data[0];
    assert(node_html_tag_is(html, GUMBO_TAG_HTML));
    record_end_of_element (
      parser->_parser_state->_current_token,
      &html->v.element
    );
    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
    return;
  }
  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
    // Call and return separately: ISO C does not allow `return expr;` in a
    // function returning void, even when expr itself has type void.
    handle_in_head(parser, token);
    return;
  }
  if (token->type == GUMBO_TOKEN_EOF) {
    // Stop parsing.
    return;
  }
  // Anything else.
  parser_add_parse_error(parser, token);
  ignore_token(parser);
}
4304
4305
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
4306
1.17k
// Token handler for the "after after body" insertion mode.
//
// Comments are appended to the document node. DOCTYPE, whitespace and <html>
// start tags are processed with the "in body" rules. EOF needs no further
// work. Any other token is a parse error that switches back to "in body" and
// is reprocessed.
static void handle_after_after_body(GumboParser* parser, GumboToken* token) {
  switch (token->type) {
    case GUMBO_TOKEN_COMMENT:
      append_comment_node(parser, get_document_node(parser), token);
      return;
    case GUMBO_TOKEN_DOCTYPE:
    case GUMBO_TOKEN_WHITESPACE:
      handle_in_body(parser, token);
      return;
    case GUMBO_TOKEN_EOF:
      // Stop parsing.
      return;
    default:
      break;
  }

  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
    handle_in_body(parser, token);
    return;
  }

  // Anything else.
  parser_add_parse_error(parser, token);
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  parser->_parser_state->_reprocess_current_token = true;
}
4326
4327
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode
4328
static void handle_after_after_frameset (
4329
  GumboParser* parser,
4330
  GumboToken* token
4331
52.0k
) {
4332
52.0k
  if (token->type == GUMBO_TOKEN_COMMENT) {
4333
165
    append_comment_node(parser, get_document_node(parser), token);
4334
165
    return;
4335
165
  }
4336
51.9k
  if (
4337
51.9k
    token->type == GUMBO_TOKEN_DOCTYPE
4338
51.9k
    || token->type == GUMBO_TOKEN_WHITESPACE
4339
51.9k
    || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4340
51.9k
  ) {
4341
681
    handle_in_body(parser, token);
4342
681
    return;
4343
681
  }
4344
51.2k
  if (token->type == GUMBO_TOKEN_EOF) {
4345
31
    return;
4346
31
  }
4347
51.2k
  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4348
152
    handle_in_head(parser, token);
4349
152
    return;
4350
152
  }
4351
51.0k
  parser_add_parse_error(parser, token);
4352
51.0k
  ignore_token(parser);
4353
51.0k
}
4354
4355
// Function pointers for each insertion mode.
4356
// Keep in sync with insertion_mode.h.
4357
typedef void (*TokenHandler)(GumboParser* parser, GumboToken* token);
4358
static const TokenHandler kTokenHandlers[] = {
4359
  handle_initial,
4360
  handle_before_html,
4361
  handle_before_head,
4362
  handle_in_head,
4363
  handle_in_head_noscript,
4364
  handle_after_head,
4365
  handle_in_body,
4366
  handle_text,
4367
  handle_in_table,
4368
  handle_in_table_text,
4369
  handle_in_caption,
4370
  handle_in_column_group,
4371
  handle_in_table_body,
4372
  handle_in_row,
4373
  handle_in_cell,
4374
  handle_in_select,
4375
  handle_in_select_in_table,
4376
  handle_in_template,
4377
  handle_after_body,
4378
  handle_in_frameset,
4379
  handle_after_frameset,
4380
  handle_after_after_body,
4381
  handle_after_after_frameset
4382
};
4383
4384
14.3M
// Dispatches `token` to the handler registered in kTokenHandlers for the
// parser's current insertion mode.
static void handle_html_content(GumboParser* parser, GumboToken* token) {
  kTokenHandlers[parser->_parser_state->_insertion_mode](parser, token);
}
4389
4390
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
4391
2.06M
static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4392
2.06M
  gumbo_debug("Handling foreign content");
4393
2.06M
  switch (token->type) {
4394
270k
    case GUMBO_TOKEN_NULL:
4395
270k
      parser_add_parse_error(parser, token);
4396
270k
      token->v.character = kUtf8ReplacementChar;
4397
270k
      insert_text_token(parser, token);
4398
270k
      return;
4399
13.3k
    case GUMBO_TOKEN_WHITESPACE:
4400
13.3k
      insert_text_token(parser, token);
4401
13.3k
      return;
4402
500k
    case GUMBO_TOKEN_CDATA:
4403
1.76M
    case GUMBO_TOKEN_CHARACTER:
4404
1.76M
      insert_text_token(parser, token);
4405
1.76M
      set_frameset_not_ok(parser);
4406
1.76M
      return;
4407
809
    case GUMBO_TOKEN_COMMENT:
4408
809
      append_comment_node(parser, get_current_node(parser), token);
4409
809
      return;
4410
164
    case GUMBO_TOKEN_DOCTYPE:
4411
164
      parser_add_parse_error(parser, token);
4412
164
      ignore_token(parser);
4413
164
      return;
4414
14.8k
    default:
4415
      // Fall through to the if-statements below.
4416
14.8k
      break;
4417
2.06M
  }
4418
  // Order matters for these clauses.
4419
14.8k
  if (
4420
14.8k
    tag_in(token, kStartTag, &(const TagSet) {
4421
14.8k
      TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), TAG(CENTER),
4422
14.8k
      TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), TAG(EM), TAG(EMBED),
4423
14.8k
      TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), TAG(HEAD),
4424
14.8k
      TAG(HR), TAG(I), TAG(IMG), TAG(LI), TAG(LISTING), TAG(MENU), TAG(META),
4425
14.8k
      TAG(NOBR), TAG(OL), TAG(P), TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL),
4426
14.8k
      TAG(SPAN), TAG(STRONG), TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE),
4427
14.8k
      TAG(TT), TAG(U), TAG(UL), TAG(VAR)
4428
14.8k
    })
4429
14.8k
    || (
4430
13.6k
      tag_is(token, kStartTag, GUMBO_TAG_FONT)
4431
13.6k
      && (
4432
312
        token_has_attribute(token, "color")
4433
312
        || token_has_attribute(token, "face")
4434
312
        || token_has_attribute(token, "size")
4435
312
      )
4436
13.6k
    )
4437
14.8k
    || tag_in(token, kEndTag, &(const TagSet) { TAG(BR), TAG(P) })
4438
14.8k
  ) {
4439
    /* Parse error */
4440
1.86k
    parser_add_parse_error(parser, token);
4441
4442
1.86k
    while (
4443
7.00k
      !(
4444
7.00k
        is_mathml_integration_point(get_current_node(parser))
4445
7.00k
        || is_html_integration_point(get_current_node(parser))
4446
7.00k
        || get_current_node(parser)->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
4447
7.00k
      )
4448
5.14k
    ) {
4449
5.14k
      pop_current_node(parser);
4450
5.14k
    }
4451
1.86k
    handle_html_content(parser, token);
4452
1.86k
    return;
4453
1.86k
  }
4454
4455
13.0k
  if (token->type == GUMBO_TOKEN_START_TAG) {
4456
11.1k
    const GumboNamespaceEnum current_namespace =
4457
11.1k
        get_adjusted_current_node(parser)->v.element.tag_namespace;
4458
11.1k
    if (current_namespace == GUMBO_NAMESPACE_MATHML) {
4459
2.86k
      adjust_mathml_attributes(token);
4460
2.86k
    }
4461
11.1k
    if (current_namespace == GUMBO_NAMESPACE_SVG) {
4462
8.30k
      adjust_svg_tag(token);
4463
8.30k
      adjust_svg_attributes(token);
4464
8.30k
    }
4465
11.1k
    adjust_foreign_attributes(token);
4466
11.1k
    insert_foreign_element(parser, token, current_namespace);
4467
11.1k
    if (token->v.start_tag.is_self_closing) {
4468
514
      pop_current_node(parser);
4469
514
      acknowledge_self_closing_tag(parser);
4470
514
    }
4471
11.1k
    return;
4472
    // </script> tags are handled like any other end tag, putting the script's
4473
    // text into a text node child and closing the current node.
4474
11.1k
  }
4475
1.85k
  assert(token->type == GUMBO_TOKEN_END_TAG);
4476
0
  GumboNode* node = get_current_node(parser);
4477
1.85k
  GumboTag tag = token->v.end_tag.tag;
4478
1.85k
  const char* name = token->v.end_tag.name;
4479
1.85k
  assert(node != NULL);
4480
4481
1.85k
  if (!node_tagname_is(node, tag, name))
4482
1.13k
    parser_add_parse_error(parser, token);
4483
1.85k
  int i = parser->_parser_state->_open_elements.length;
4484
8.24k
  for (--i; i > 0;) {
4485
    // Here we move up the stack until we find an HTML element (in which
4486
    // case we do nothing) or we find the element that we're about to
4487
    // close (in which case we pop everything we've seen until that
4488
    // point.)
4489
8.24k
    gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4490
8.24k
    if (node_tagname_is(node, tag, name)) {
4491
773
      gumbo_debug("Matches.\n");
4492
1.26k
      while (node != pop_current_node(parser)) {
4493
        // Pop all the nodes below the current one. Node is guaranteed to
4494
        // be an element on the stack of open elements (set below), so
4495
        // this loop is guaranteed to terminate.
4496
491
      }
4497
773
      return;
4498
773
    }
4499
7.46k
    --i;
4500
7.46k
    node = parser->_parser_state->_open_elements.data[i];
4501
7.46k
    if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4502
      // The loop continues only in foreign namespaces.
4503
1.08k
      break;
4504
1.08k
    }
4505
7.46k
  }
4506
1.08k
  assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4507
1.08k
  if (i == 0)
4508
0
    return;
4509
  // We can't call handle_token directly because the current node is still in
4510
  // a foriegn namespace, so it would re-enter this and result in infinite
4511
  // recursion.
4512
1.08k
  handle_html_content(parser, token);
4513
1.08k
}
4514
4515
// https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
4516
16.3M
static void handle_token(GumboParser* parser, GumboToken* token) {
4517
16.3M
  if (
4518
16.3M
    parser->_parser_state->_ignore_next_linefeed
4519
16.3M
    && token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n'
4520
16.3M
  ) {
4521
42
    parser->_parser_state->_ignore_next_linefeed = false;
4522
42
    ignore_token(parser);
4523
42
    return;
4524
42
  }
4525
  // This needs to be reset both here and in the conditional above to catch both
4526
  // the case where the next token is not whitespace (so we don't ignore
4527
  // whitespace in the middle of <pre> tags) and where there are multiple
4528
  // whitespace tokens (so we don't ignore the second one).
4529
16.3M
  parser->_parser_state->_ignore_next_linefeed = false;
4530
4531
16.3M
  if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
4532
2.94k
    parser->_parser_state->_closed_body_tag = true;
4533
2.94k
  }
4534
16.3M
  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4535
2.79k
    parser->_parser_state->_closed_html_tag = true;
4536
2.79k
  }
4537
4538
16.3M
  const GumboNode* current_node = get_adjusted_current_node(parser);
4539
16.3M
  assert (
4540
16.3M
    !current_node
4541
16.3M
    || current_node->type == GUMBO_NODE_ELEMENT
4542
16.3M
    || current_node->type == GUMBO_NODE_TEMPLATE
4543
16.3M
  );
4544
16.3M
  if (current_node)
4545
16.3M
    gumbo_debug("Current node: <%s>.\n", current_node->v.element.name);
4546
16.3M
  if (!current_node ||
4547
16.3M
      current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
4548
16.3M
      (is_mathml_integration_point(current_node) &&
4549
2.07M
          (token->type == GUMBO_TOKEN_CHARACTER ||
4550
4.03k
              token->type == GUMBO_TOKEN_WHITESPACE ||
4551
4.03k
              token->type == GUMBO_TOKEN_NULL ||
4552
4.03k
              (token->type == GUMBO_TOKEN_START_TAG &&
4553
984
                  !tag_in(token, kStartTag,
4554
514
                      &(const TagSet){TAG(MGLYPH), TAG(MALIGNMARK)})))) ||
4555
16.3M
      (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
4556
2.07M
          node_qualified_tag_is(
4557
206k
              current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
4558
2.07M
          tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
4559
16.3M
      (is_html_integration_point(current_node) &&
4560
2.07M
          (token->type == GUMBO_TOKEN_START_TAG ||
4561
6.50k
              token->type == GUMBO_TOKEN_CHARACTER ||
4562
6.50k
              token->type == GUMBO_TOKEN_NULL ||
4563
6.50k
              token->type == GUMBO_TOKEN_WHITESPACE)) ||
4564
16.3M
      token->type == GUMBO_TOKEN_EOF) {
4565
14.3M
    handle_html_content(parser, token);
4566
14.3M
  } else {
4567
2.06M
    handle_in_foreign_content(parser, token);
4568
2.06M
  }
4569
16.3M
}
4570
4571
static GumboNode* create_fragment_ctx_element (
4572
  const char* tag_name,
4573
  GumboNamespaceEnum ns,
4574
  const char* encoding
4575
0
) {
4576
0
  assert(tag_name);
4577
0
  GumboTag tag = gumbo_tagn_enum(tag_name, strlen(tag_name));
4578
0
  GumboNodeType type =
4579
0
    ns == GUMBO_NAMESPACE_HTML && tag == GUMBO_TAG_TEMPLATE
4580
0
    ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT;
4581
0
  GumboNode* node = create_node(type);
4582
0
  GumboElement* element = &node->v.element;
4583
0
  element->children = kGumboEmptyVector;
4584
0
  if (encoding) {
4585
0
    gumbo_vector_init(1, &element->attributes);
4586
0
    GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
4587
0
    attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
4588
0
    attr->name = "encoding"; // Do not free this!
4589
0
    attr->original_name = kGumboEmptyString;
4590
0
    attr->value = encoding; // Do not free this!
4591
0
    attr->original_value = kGumboEmptyString;
4592
0
    attr->name_start = kGumboEmptySourcePosition;
4593
0
    gumbo_vector_add(attr, &element->attributes);
4594
0
  } else {
4595
0
    element->attributes = kGumboEmptyVector;
4596
0
  }
4597
0
  element->tag = tag;
4598
0
  element->tag_namespace = ns;
4599
0
  element->name = tag_name; // Do not free this!
4600
0
  element->original_tag = kGumboEmptyString;
4601
0
  element->original_end_tag = kGumboEmptyString;
4602
0
  element->start_pos = kGumboEmptySourcePosition;
4603
0
  element->end_pos = kGumboEmptySourcePosition;
4604
0
  return node;
4605
0
}
4606
4607
0
// Releases a node built by create_fragment_ctx_element(). The element name
// and the strings of its optional single attribute are borrowed, so they are
// detached here before the node itself is destroyed.
static void destroy_fragment_ctx_element(GumboNode* ctx) {
  assert(ctx->type == GUMBO_NODE_ELEMENT || ctx->type == GUMBO_NODE_TEMPLATE);
  GumboVector* attributes = &ctx->v.element.attributes;

  // The name is not owned by the node: drop the reference, do not free it.
  ctx->v.element.name = NULL;

  if (attributes->length > 0) {
    assert(attributes->length == 1);
    // Free only the attribute struct; its name and value are borrowed.
    GumboAttribute* borrowed_attr = gumbo_vector_pop(attributes);
    gumbo_free(borrowed_attr);
  }
  destroy_node(ctx);
}
4619
4620
static void fragment_parser_init (
4621
  GumboParser* parser,
4622
  const GumboOptions* options
4623
0
) {
4624
0
  assert(options->fragment_context != NULL);
4625
0
  const char* fragment_ctx = options->fragment_context;
4626
0
  GumboNamespaceEnum fragment_namespace = options->fragment_namespace;
4627
0
  const char* fragment_encoding = options->fragment_encoding;
4628
0
  GumboQuirksModeEnum quirks = options->quirks_mode;
4629
0
  bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
4630
4631
0
  GumboNode* root;
4632
  // 2.
4633
0
  get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
4634
4635
  // 3.
4636
0
  parser->_parser_state->_fragment_ctx =
4637
0
    create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
4638
0
  GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
4639
4640
  // 4.
4641
0
  if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
4642
    // Non-HTML namespaces always start in the DATA state.
4643
0
    switch (ctx_tag) {
4644
0
      case GUMBO_TAG_TITLE:
4645
0
      case GUMBO_TAG_TEXTAREA:
4646
0
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
4647
0
        break;
4648
4649
0
      case GUMBO_TAG_STYLE:
4650
0
      case GUMBO_TAG_XMP:
4651
0
      case GUMBO_TAG_IFRAME:
4652
0
      case GUMBO_TAG_NOEMBED:
4653
0
      case GUMBO_TAG_NOFRAMES:
4654
0
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4655
0
        break;
4656
4657
0
      case GUMBO_TAG_SCRIPT:
4658
0
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
4659
0
        break;
4660
4661
0
      case GUMBO_TAG_NOSCRIPT:
4662
        /* scripting is disabled in Gumbo, so leave the tokenizer
4663
         * in the default data state */
4664
0
        break;
4665
4666
0
      case GUMBO_TAG_PLAINTEXT:
4667
0
        gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
4668
0
        break;
4669
4670
0
      default:
4671
        /* default data state */
4672
0
        break;
4673
0
    }
4674
0
  }
4675
4676
  // 5. 6. 7.
4677
0
  root = insert_element_of_tag_type (
4678
0
    parser,
4679
0
    GUMBO_TAG_HTML,
4680
0
    GUMBO_INSERTION_IMPLIED
4681
0
  );
4682
0
  parser->_output->root = root;
4683
4684
  // 8.
4685
0
  if (ctx_tag == GUMBO_TAG_TEMPLATE) {
4686
0
    push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
4687
0
  }
4688
4689
  // 10.
4690
0
  reset_insertion_mode_appropriately(parser);
4691
4692
  // 11.
4693
0
  if (ctx_has_form_ancestor
4694
0
      || (ctx_tag == GUMBO_TAG_FORM
4695
0
          && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4696
0
    static const GumboNode form_ancestor = {
4697
0
      .type = GUMBO_NODE_ELEMENT,
4698
0
      .parent = NULL,
4699
0
      .index_within_parent = -1,
4700
0
      .parse_flags = GUMBO_INSERTION_BY_PARSER,
4701
0
      .v.element = {
4702
0
        .children = GUMBO_EMPTY_VECTOR_INIT,
4703
0
        .tag = GUMBO_TAG_FORM,
4704
0
        .name = NULL,
4705
0
        .tag_namespace = GUMBO_NAMESPACE_HTML,
4706
0
        .original_tag = GUMBO_EMPTY_STRING_INIT,
4707
0
        .original_end_tag = GUMBO_EMPTY_STRING_INIT,
4708
0
        .start_pos = GUMBO_EMPTY_SOURCE_POSITION_INIT,
4709
0
        .end_pos = GUMBO_EMPTY_SOURCE_POSITION_INIT,
4710
0
        .attributes = GUMBO_EMPTY_VECTOR_INIT,
4711
0
      },
4712
0
    };
4713
    // This cast is okay because _form_element is only modified if it is
4714
    // in in the list of open elements. This will never be.
4715
0
    parser->_parser_state->_form_element = (GumboNode *)&form_ancestor;
4716
0
  }
4717
0
}
4718
4719
0
GumboOutput* gumbo_parse(const char* buffer) {
4720
0
  return gumbo_parse_with_options (
4721
0
    &kGumboDefaultOptions,
4722
0
    buffer,
4723
0
    strlen(buffer)
4724
0
  );
4725
0
}
4726
4727
GumboOutput* gumbo_parse_with_options (
4728
  const GumboOptions* options,
4729
  const char* buffer,
4730
  size_t length
4731
10.0k
) {
4732
10.0k
  GumboParser parser;
4733
10.0k
  parser._options = options;
4734
10.0k
  output_init(&parser);
4735
10.0k
  gumbo_tokenizer_state_init(&parser, buffer, length);
4736
10.0k
  parser_state_init(&parser);
4737
4738
10.0k
  if (options->fragment_context != NULL)
4739
0
    fragment_parser_init(&parser, options);
4740
4741
10.0k
  GumboParserState* state = parser._parser_state;
4742
10.0k
  gumbo_debug (
4743
10.0k
    "Parsing %.*s.\n",
4744
10.0k
    (int) length,
4745
10.0k
    buffer
4746
10.0k
  );
4747
4748
  // Sanity check so that infinite loops die with an assertion failure instead
4749
  // of hanging the process before we ever get an error.
4750
10.0k
  uint_fast32_t loop_count = 0;
4751
4752
10.0k
  const unsigned int max_tree_depth = options->max_tree_depth;
4753
10.0k
  GumboToken token;
4754
4755
16.3M
  do {
4756
16.3M
    if (state->_reprocess_current_token) {
4757
127k
      state->_reprocess_current_token = false;
4758
16.2M
    } else {
4759
16.2M
      GumboNode* adjusted_current_node = get_adjusted_current_node(&parser);
4760
16.2M
      gumbo_tokenizer_set_is_adjusted_current_node_foreign (
4761
16.2M
        &parser,
4762
16.2M
        adjusted_current_node &&
4763
16.2M
          adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4764
16.2M
      );
4765
16.2M
      gumbo_lex(&parser, &token);
4766
16.2M
    }
4767
4768
16.3M
    const char* token_type = "text";
4769
16.3M
    switch (token.type) {
4770
7.77k
      case GUMBO_TOKEN_DOCTYPE:
4771
7.77k
        token_type = "doctype";
4772
7.77k
        break;
4773
1.28M
      case GUMBO_TOKEN_START_TAG:
4774
1.28M
        if (token.v.start_tag.tag == GUMBO_TAG_UNKNOWN)
4775
157k
          token_type = token.v.start_tag.name;
4776
1.12M
        else
4777
1.12M
          token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
4778
1.28M
        break;
4779
47.8k
      case GUMBO_TOKEN_END_TAG:
4780
47.8k
        token_type = gumbo_normalized_tagname(token.v.end_tag.tag);
4781
47.8k
        break;
4782
200k
      case GUMBO_TOKEN_COMMENT:
4783
200k
        token_type = "comment";
4784
200k
        break;
4785
14.8M
      default:
4786
14.8M
        break;
4787
16.3M
    }
4788
16.3M
    gumbo_debug (
4789
16.3M
      "Handling %s token @%lu:%lu in state %u.\n",
4790
16.3M
      (char*) token_type,
4791
16.3M
      (unsigned long)token.position.line,
4792
16.3M
      (unsigned long)token.position.column,
4793
16.3M
      state->_insertion_mode
4794
16.3M
    );
4795
4796
16.3M
    state->_current_token = &token;
4797
16.3M
    state->_self_closing_flag_acknowledged = false;
4798
4799
16.3M
    handle_token(&parser, &token);
4800
4801
    // Check for memory leaks when ownership is transferred from start tag
4802
    // tokens to nodes.
4803
16.3M
    assert (
4804
16.3M
      state->_reprocess_current_token
4805
16.3M
      || token.type != GUMBO_TOKEN_START_TAG
4806
16.3M
      || (token.v.start_tag.attributes.data == NULL
4807
16.3M
          && token.v.start_tag.name == NULL)
4808
16.3M
    );
4809
4810
16.3M
    if (!state->_reprocess_current_token) {
4811
      // If we're done with the token, check for unacknowledged self-closing
4812
      // flags on start tags.
4813
16.2M
      if (token.type == GUMBO_TOKEN_START_TAG &&
4814
16.2M
          token.v.start_tag.is_self_closing &&
4815
16.2M
          !state->_self_closing_flag_acknowledged) {
4816
2.01k
        GumboError* error = gumbo_add_error(&parser);
4817
2.01k
        if (error) {
4818
          // This is essentially a tokenizer error that's only caught during
4819
          // tree construction.
4820
2.01k
          error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS;
4821
2.01k
          error->original_text = token.original_text;
4822
2.01k
          error->position = token.position;
4823
2.01k
        }
4824
2.01k
      }
4825
      // Make sure we free the end tag's name since it doesn't get transferred
4826
      // to a token.
4827
16.2M
      if (token.type == GUMBO_TOKEN_END_TAG &&
4828
16.2M
          token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
4829
6.51k
        gumbo_free(token.v.end_tag.name);
4830
16.2M
    }
4831
4832
16.3M
    if (unlikely(state->_open_elements.length > max_tree_depth)) {
4833
6
      parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4834
6
      gumbo_debug("Tree depth limit exceeded.\n");
4835
6
      break;
4836
6
    }
4837
4838
16.3M
    ++loop_count;
4839
16.3M
    assert(loop_count < 1000000000UL);
4840
4841
16.3M
  } while (
4842
16.3M
    (token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token)
4843
16.3M
    && !(options->stop_on_first_error && parser._output->document_error)
4844
10.0k
  );
4845
4846
10.0k
  finish_parsing(&parser);
4847
  // For API uniformity reasons, if the doctype still has nulls, convert them to
4848
  // empty strings.
4849
10.0k
  GumboDocument* doc_type = &parser._output->document->v.document;
4850
10.0k
  if (doc_type->name == NULL) {
4851
9.62k
    doc_type->name = gumbo_strdup("");
4852
9.62k
  }
4853
10.0k
  if (doc_type->public_identifier == NULL) {
4854
9.62k
    doc_type->public_identifier = gumbo_strdup("");
4855
9.62k
  }
4856
10.0k
  if (doc_type->system_identifier == NULL) {
4857
9.62k
    doc_type->system_identifier = gumbo_strdup("");
4858
9.62k
  }
4859
4860
10.0k
  parser_state_destroy(&parser);
4861
10.0k
  gumbo_tokenizer_state_destroy(&parser);
4862
10.0k
  return parser._output;
4863
10.0k
}
4864
4865
0
// Maps a GumboOutputStatus value to a static, human-readable description.
// Values that are not recognised yield a generic "unknown" message.
const char* gumbo_status_to_string(GumboOutputStatus status) {
  if (status == GUMBO_STATUS_OK) {
    return "OK";
  }
  if (status == GUMBO_STATUS_OUT_OF_MEMORY) {
    return "System allocator returned NULL during parsing";
  }
  if (status == GUMBO_STATUS_TOO_MANY_ATTRIBUTES) {
    return "Attributes per element limit exceeded";
  }
  if (status == GUMBO_STATUS_TREE_TOO_DEEP) {
    return "Document tree depth limit exceeded";
  }
  return "Unknown GumboOutputStatus value";
}
4879
4880
0
// Public entry point that forwards `node` to the file-local destroy_node().
void gumbo_destroy_node(GumboNode* node) {
  destroy_node(node);
}
4883
4884
10.0k
// Releases everything owned by `output`: the document tree, every recorded
// error, the error vector's storage, and finally the output struct itself.
// `output` must not be NULL and must not be used afterwards.
void gumbo_destroy_output(GumboOutput* output) {
  destroy_node(output->document);

  GumboVector* errors = &output->errors;
  unsigned int index = 0;
  while (index < errors->length) {
    gumbo_error_destroy(errors->data[index]);
    ++index;
  }
  gumbo_vector_destroy(errors);

  gumbo_free(output);
}