Coverage Report

Created: 2025-09-08 06:20

/proc/self/cwd/cpp/htmlparser/parser.cc
Line
Count
Source (jump to first uncovered line)
1
#include <algorithm>
2
#include <set>
3
#include <tuple>
4
#ifdef DUMP_NODES
5
#include <iostream>  // For DumpDocument
6
#endif               // DUMP_NODES
7
8
#include "absl/flags/flag.h"
9
#include "absl/status/status.h"
10
#include "cpp/htmlparser/atomutil.h"
11
#include "cpp/htmlparser/comparators.h"
12
#include "cpp/htmlparser/defer.h"
13
#include "cpp/htmlparser/doctype.h"
14
#include "cpp/htmlparser/foreign.h"
15
#include "cpp/htmlparser/logging.h"
16
#include "cpp/htmlparser/parser.h"
17
#include "cpp/htmlparser/strings.h"
18
19
ABSL_RETIRED_FLAG(uint32_t, htmlparser_max_nodes_depth_count, 245, "retired");
20
21
namespace htmlparser {
22
23
namespace {
24
// Internal functions forward declarations.
25
std::string ExtractWhitespace(const std::string& s);
26
27
#ifdef DUMP_NODES
28
// Writes "<namespace>: <tag>" to stderr for every descendant of `root_node`,
// visiting each child before its following siblings.
void DumpNode(Node* root_node) {
  Node* child = root_node->FirstChild();
  while (child) {
    std::cerr << child->NameSpace() << ": "
              << AtomUtil::ToString(child->DataAtom()) << std::endl;
    DumpNode(child);
    child = child->NextSibling();
  }
}
35
// Dumps the nodes in the DOM in their final order after parsing.
36
void DumpDocument(Document* doc) {
  // The dump starts at the document root; the root itself is not printed.
  Node* const root = doc->RootNode();
  DumpNode(root);
}
37
38
#endif  // DUMP_NODES
39
40
}  // namespace.
41
42
0
std::unique_ptr<Document> Parse(std::string_view html) {
43
0
  std::unique_ptr<Parser> parser = std::make_unique<Parser>(
44
0
      html,
45
0
      ParseOptions{.scripting = true,
46
0
                   .frameset_ok = true,
47
0
                   .record_node_offsets = true,
48
0
                   .record_attribute_offsets = true,
49
0
                   .count_num_terms_in_text_node = true});
50
0
  return parser->Parse();
51
0
}
52
53
std::unique_ptr<Document> ParseWithOptions(std::string_view html,
54
0
                                           const ParseOptions& options) {
55
0
  return std::make_unique<Parser>(html, options)->Parse();
56
0
}
57
58
std::unique_ptr<Document> ParseFragmentWithOptions(std::string_view html,
59
                                                   const ParseOptions& options,
60
0
                                                   Node* fragment_parent) {
61
0
  std::unique_ptr<Parser> parser = std::make_unique<Parser>(
62
0
      html, options, fragment_parent);
63
0
  Node* root = parser->document_->NewNode(NodeType::ELEMENT_NODE, Atom::HTML);
64
0
  parser->document_->root_node_->AppendChild(root);
65
0
  parser->open_elements_stack_.Push(root);
66
67
0
  if (fragment_parent && fragment_parent->DataAtom() == Atom::TEMPLATE) {
68
0
    parser->template_stack_.push_back(std::bind(&Parser::InTemplateIM,
69
0
                                                parser.get()));
70
0
  }
71
72
0
  parser->ResetInsertionMode();
73
74
0
  for (Node* node = fragment_parent; node; node = node->Parent()) {
75
0
    if (node->Type() == NodeType::ELEMENT_NODE &&
76
0
        node->DataAtom() == Atom::FORM) {
77
0
      parser->form_ = node;
78
0
      break;
79
0
    }
80
0
  }
81
82
0
  auto doc = parser->Parse();
83
84
0
  if (doc->status().ok()) {
85
0
    Node* parent = fragment_parent ? root : doc->root_node_;
86
0
    for (Node* c = parent->FirstChild(); c;) {
87
0
      Node* next = c->NextSibling();
88
0
      doc->fragment_nodes_.push_back(std::move(c));
89
0
      parent->RemoveChild(c);
90
0
      c = next;
91
0
    }
92
0
  }
93
94
0
  return doc;
95
0
}
96
97
std::unique_ptr<Document> ParseFragment(std::string_view html,
98
0
                                        Node* fragment_parent) {
99
  // Expects clients to update the offsets relative to the parent which
100
  // this fragment belongs.
101
0
  ParseOptions options = {.scripting = true,
102
0
                          .frameset_ok = true,
103
0
                          .record_node_offsets = true,
104
0
                          .record_attribute_offsets = true,
105
0
                          .count_num_terms_in_text_node = true};
106
0
  return ParseFragmentWithOptions(html, options, fragment_parent);
107
0
}
108
109
// Builds a parser over `html`.
//
// `fragment_parent` is the context element for fragment parsing; when it is
// non-null its tag name is passed to the tokenizer and the parser is flagged
// as a fragment parser. The parser always starts in the "initial" insertion
// mode.
Parser::Parser(std::string_view html, const ParseOptions& options,
               Node* fragment_parent)
    : tokenizer_(std::make_unique<Tokenizer>(
          html,
          fragment_parent ? AtomUtil::ToString(fragment_parent->atom_) : "")),
      on_node_callback_(options.on_node_callback),
      document_(new Document),
      // The marker node is allocated from the document created just above.
      scope_marker_(document_->NewNode(NodeType::SCOPE_MARKER_NODE)),
      scripting_(options.scripting),
      frameset_ok_(options.frameset_ok),
      record_node_offsets_(options.record_node_offsets),
      record_attribute_offsets_(options.record_attribute_offsets),
      count_num_terms_in_text_node_(options.count_num_terms_in_text_node),
      fragment_(fragment_parent != nullptr),
      context_node_(fragment_parent) {
  insertion_mode_ = std::bind(&Parser::InitialIM, this);
  document_->metadata_.html_src_bytes = html.size();
}
127
128
12.5k
std::unique_ptr<Document> Parser::Parse() {
129
12.5k
  bool eof = tokenizer_->IsEOF();
130
15.1M
  while (!eof) {
131
15.1M
    Node* node = open_elements_stack_.Top();
132
15.1M
    tokenizer_->SetAllowCDATA(node && !node->name_space_.empty());
133
    // Read and parse the next token.
134
15.1M
    TokenType token_type = tokenizer_->Next(!template_stack_.empty());
135
136
    // No end of input, but error token. Parsing failed.
137
15.1M
    if (token_type == TokenType::ERROR_TOKEN) {
138
12.5k
      eof = tokenizer_->IsEOF();
139
12.5k
      if (!eof && tokenizer_->Error()) {
140
0
        document_->status_ = absl::InvalidArgumentError(
141
0
            "htmlparser::Parser tokenizer error.");
142
0
        return std::move(document_);
143
0
      }
144
12.5k
    }
145
15.1M
    token_ = tokenizer_->token();
146
15.1M
    ParseCurrentToken();
147
15.1M
  }
148
149
#ifdef DUMP_NODES
150
  DumpDocument(document_.get());
151
#endif
152
153
12.5k
  document_->metadata_.document_end_location = tokenizer_->CurrentPosition();
154
12.5k
  return std::move(document_);
155
12.5k
}  // End Parser::Parse.
156
157
32.7M
// Returns the top of the open elements stack, or the document root node when
// the stack has no top element.
Node* Parser::top() {
  Node* current = open_elements_stack_.Top();
  return current ? current : document_->root_node_;
}  // End Parser::top.
165
166
template <typename... Args>
167
542k
bool Parser::PopUntil(Scope scope, Args... match_tags) {
168
542k
  std::vector<Atom> argsList{match_tags...};
169
542k
  int i = IndexOfElementInScope(scope, argsList);
170
542k
  if (i != -1) {
171
350k
    open_elements_stack_.Pop(open_elements_stack_.size() - i);
172
350k
    return true;
173
350k
  }
174
192k
  return false;
175
542k
}  // End Parser::PopUntil.
bool htmlparser::Parser::PopUntil<htmlparser::Atom>(htmlparser::Parser::Scope, htmlparser::Atom)
Line
Count
Source
167
444k
bool Parser::PopUntil(Scope scope, Args... match_tags) {
168
444k
  std::vector<Atom> argsList{match_tags...};
169
444k
  int i = IndexOfElementInScope(scope, argsList);
170
444k
  if (i != -1) {
171
252k
    open_elements_stack_.Pop(open_elements_stack_.size() - i);
172
252k
    return true;
173
252k
  }
174
191k
  return false;
175
444k
}  // End Parser::PopUntil.
bool htmlparser::Parser::PopUntil<htmlparser::Atom, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom>(htmlparser::Parser::Scope, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom)
Line
Count
Source
167
1.55k
bool Parser::PopUntil(Scope scope, Args... match_tags) {
168
1.55k
  std::vector<Atom> argsList{match_tags...};
169
1.55k
  int i = IndexOfElementInScope(scope, argsList);
170
1.55k
  if (i != -1) {
171
542
    open_elements_stack_.Pop(open_elements_stack_.size() - i);
172
542
    return true;
173
542
  }
174
1.01k
  return false;
175
1.55k
}  // End Parser::PopUntil.
bool htmlparser::Parser::PopUntil<htmlparser::Atom, htmlparser::Atom, htmlparser::Atom>(htmlparser::Parser::Scope, htmlparser::Atom, htmlparser::Atom, htmlparser::Atom)
Line
Count
Source
167
21.4k
bool Parser::PopUntil(Scope scope, Args... match_tags) {
168
21.4k
  std::vector<Atom> argsList{match_tags...};
169
21.4k
  int i = IndexOfElementInScope(scope, argsList);
170
21.4k
  if (i != -1) {
171
21.4k
    open_elements_stack_.Pop(open_elements_stack_.size() - i);
172
21.4k
    return true;
173
21.4k
  }
174
0
  return false;
175
21.4k
}  // End Parser::PopUntil.
bool htmlparser::Parser::PopUntil<htmlparser::Atom, htmlparser::Atom>(htmlparser::Parser::Scope, htmlparser::Atom, htmlparser::Atom)
Line
Count
Source
167
75.3k
bool Parser::PopUntil(Scope scope, Args... match_tags) {
168
75.3k
  std::vector<Atom> argsList{match_tags...};
169
75.3k
  int i = IndexOfElementInScope(scope, argsList);
170
75.3k
  if (i != -1) {
171
75.3k
    open_elements_stack_.Pop(open_elements_stack_.size() - i);
172
75.3k
    return true;
173
75.3k
  }
174
0
  return false;
175
75.3k
}  // End Parser::PopUntil.
176
177
int Parser::IndexOfElementInScope(Scope scope,
178
925k
                                  const std::vector<Atom>& match_tags) const {
179
313M
  for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
180
313M
    Node* node = open_elements_stack_.at(i);
181
313M
    if (node->name_space_.empty()) {
182
313M
      for (Atom a : match_tags) {
183
313M
        if (a == node->atom_) {
184
716k
          return i;
185
716k
        }
186
313M
      }
187
312M
      switch (scope) {
188
165M
        case Scope::DefaultScope:
189
          // No-op.
190
165M
          break;
191
947
        case Scope::ListItemScope:
192
947
          if (node->atom_ == Atom::OL || node->atom_ == Atom::UL) return -1;
193
559
          break;
194
145M
        case Scope::ButtonScope:
195
145M
          if (node->atom_ == Atom::BUTTON) return -1;
196
145M
          break;
197
145M
        case Scope::TableScope:
198
1.68M
          if (node->atom_ == Atom::HTML || node->atom_ == Atom::TABLE ||
199
1.68M
              node->atom_ == Atom::TEMPLATE) {
200
1.25k
            return -1;
201
1.25k
          }
202
1.67M
          break;
203
1.67M
        case Scope::SelectScope:
204
0
          if (node->atom_ != Atom::OPTGROUP && node->atom_ != Atom::OPTION) {
205
0
            return -1;
206
0
          }
207
0
          break;
208
0
        default:
209
0
          CHECK(false) << "HTML Parser reached unreachable scope";
210
312M
      }
211
312M
    }
212
213
312M
    switch (scope) {
214
165M
      case Scope::DefaultScope:
215
165M
      case Scope::ListItemScope:
216
310M
      case Scope::ButtonScope: {
217
931M
        for (auto& scope_stop_tags : kDefaultScopeStopTags) {
218
931M
          if (scope_stop_tags.first == node->name_space_) {
219
2.79G
            for (Atom t : scope_stop_tags.second) {
220
2.79G
              if (t == Atom::UNKNOWN) break;
221
2.79G
              if (t == node->atom_) return -1;
222
2.79G
            }
223
310M
          }
224
931M
        }
225
310M
        break;
226
310M
      }
227
310M
      default:
228
1.67M
        break;
229
312M
    }
230
312M
  }
231
0
  return -1;
232
925k
}  // Parser::IndexOfElementInScope.
233
234
template <typename... Args>
235
380k
bool Parser::ElementInScope(Scope scope, Args... tags) const {
236
380k
  std::vector<Atom> argsList{tags...};
237
380k
  return IndexOfElementInScope(scope, argsList) != -1;
238
380k
}  // Parser::ElementInScope.
239
240
3.60M
// Pops open elements until the top of the stack is a boundary element for
// `scope` (table, table-row or table-body scope). The boundary element itself
// stays on the stack. Any other scope value is a fatal error.
void Parser::ClearStackToContext(Scope scope) {
  for (int index = open_elements_stack_.size() - 1; index >= 0; --index) {
    const Atom tag = open_elements_stack_.at(index)->atom_;

    bool is_boundary = false;
    switch (scope) {
      case Scope::TableScope:
        is_boundary = tag == Atom::HTML || tag == Atom::TABLE ||
                      tag == Atom::TEMPLATE;
        break;
      case Scope::TableRowScope:
        is_boundary =
            tag == Atom::HTML || tag == Atom::TR || tag == Atom::TEMPLATE;
        break;
      case Scope::TableBodyScope:
        is_boundary = tag == Atom::HTML || tag == Atom::TBODY ||
                      tag == Atom::TFOOT || tag == Atom::THEAD ||
                      tag == Atom::TEMPLATE;
        break;
      default:
        CHECK(false) << "HTML Parser reached unreachable scope";
    }

    if (is_boundary) {
      // Remove everything above the boundary element.
      open_elements_stack_.Pop(open_elements_stack_.size() - index - 1);
      return;
    }
  }
}  // Parser::ClearStackToContext.
270
271
void Parser::GenerateImpliedEndTags(
272
1.89k
    const std::initializer_list<Atom>& exceptions) {
273
1.89k
  int i = open_elements_stack_.size() - 1;
274
5.80k
  for (; i >= 0; --i) {
275
5.80k
    Node* node = open_elements_stack_.at(i);
276
5.80k
    if (node->node_type_ == NodeType::ELEMENT_NODE) {
277
5.80k
      switch (node->atom_) {
278
392
        case Atom::DD:
279
611
        case Atom::DT:
280
1.21k
        case Atom::LI:
281
1.21k
        case Atom::OPTGROUP:
282
1.21k
        case Atom::OPTION:
283
1.42k
        case Atom::P:
284
1.79k
        case Atom::RB:
285
2.28k
        case Atom::RP:
286
3.90k
        case Atom::RT:
287
4.09k
        case Atom::RTC:
288
4.09k
          for (auto e : exceptions) {
289
1.41k
            if (node->atom_ == e) {
290
              // Pop nodes and return early.
291
194
              open_elements_stack_.Pop(open_elements_stack_.size() - i - 1);
292
194
              return;
293
194
            }
294
1.41k
          }
295
3.90k
          continue;
296
3.90k
        default:
297
1.70k
          break;
298
5.80k
      }
299
5.80k
    }
300
1.70k
    break;
301
5.80k
  }
302
1.70k
  open_elements_stack_.Pop(open_elements_stack_.size() - i - 1);
303
1.70k
}  // Parser::GenerateImpliedEndTags.
304
305
22.9M
// Inserts `node` into the tree: foster-parented when ShouldFosterParent()
// says so, otherwise appended to the current top node. Element nodes are also
// pushed onto the open elements stack.
void Parser::AddChild(Node* node) {
  if (!ShouldFosterParent()) {
    top()->AppendChild(node);
  } else {
    FosterParent(node);
  }

  const bool is_element = node->node_type_ == NodeType::ELEMENT_NODE;
  if (is_element) {
    open_elements_stack_.Push(node);
  }
}  // Parser::AddChild.
316
317
23.5M
bool Parser::ShouldFosterParent() {
318
23.5M
  if (!foster_parenting_) return false;
319
5.00M
  Atom a = top()->atom_;
320
5.00M
  return (a == Atom::TABLE || a == Atom::TBODY || a == Atom::TFOOT ||
321
5.00M
          a == Atom::THEAD || a == Atom::TR);
322
23.5M
}  // Parser::ShouldFosterParent.
323
324
47.2k
// Inserts `node` at its foster-parent location.
//
// The open elements stack is scanned once from the top:
//   - If a TEMPLATE is met before any TABLE, `node` is appended to that
//     template.
//   - Otherwise `node` goes immediately before the topmost TABLE, inside the
//     table's parent (or inside the element below the table on the stack when
//     the table has no parent).
//   - With neither on the stack, `node` is appended to the bottom element of
//     the stack.
// A text node that would land directly after an existing text node is merged
// into that node instead of being inserted.
void Parser::FosterParent(Node* node) {
  Node* table = nullptr;
  int table_index = -1;

  // Whichever of TABLE / TEMPLATE is nearer the top of the stack decides the
  // outcome, so the scan can stop at the first one found instead of walking
  // the stack twice.
  for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
    Node* candidate = open_elements_stack_.at(i);
    if (candidate->atom_ == Atom::TEMPLATE) {
      // The topmost template sits above any table (or there is no table).
      candidate->AppendChild(node);
      return;
    }
    if (candidate->atom_ == Atom::TABLE) {
      table = candidate;
      table_index = i;
      break;
    }
  }

  // Without a table the foster parent is the bottom element of the stack (the
  // html element).
  Node* parent = table ? table->Parent() : open_elements_stack_.at(0);
  if (!parent) {
    parent = open_elements_stack_.at(table_index - 1);
  }

  Node* prev = table ? table->PrevSibling() : parent->LastChild();

  // Adjacent text is coalesced into the preceding text node.
  if (prev && prev->node_type_ == NodeType::TEXT_NODE &&
      node->node_type_ == NodeType::TEXT_NODE) {
    prev->data_.append(node->data_);
    return;
  }

  parent->InsertBefore(node, table);
}  // Parser::FosterParent.
375
376
626k
void Parser::AddText(const std::string& text) {
377
626k
  if (text.empty()) return;
378
379
626k
  auto text_node = document_->NewNode(NodeType::TEXT_NODE);
380
626k
  if (record_node_offsets_) {
381
0
    text_node->line_col_in_html_src_ = token_.line_col_in_html_src;
382
0
  }
383
384
626k
  if (ShouldFosterParent()) {
385
28.7k
    text_node->data_.assign(text, 0, text.size());
386
28.7k
    FosterParent(text_node);
387
28.7k
    return;
388
28.7k
  }
389
390
597k
  Node* top_node = top();
391
597k
  if (top_node->LastChild() &&
392
597k
      top_node->LastChild()->node_type_ == NodeType::TEXT_NODE) {
393
7.42k
    top_node->LastChild()->data_.append(text);
394
7.42k
    return;
395
7.42k
  }
396
397
590k
  text_node->data_.assign(text, 0, text.size());
398
590k
  AddChild(text_node);
399
  // Count number of terms in ths text node, except if this is <script>,
400
  // <textarea> or a comment node.
401
590k
  if (count_num_terms_in_text_node_ && text_node->Parent() &&
402
590k
      text_node->Parent()->DataAtom() != Atom::SCRIPT &&
403
590k
      text_node->Parent()->Type() != NodeType::COMMENT_NODE &&
404
590k
      text_node->Parent()->DataAtom() != Atom::TEXTAREA) {
405
0
    text_node->num_terms_ = Strings::CountTerms(text);
406
0
  }
407
590k
}  // Parser::AddText.
408
409
15.5M
void Parser::AddElement() {
410
15.5M
  Node* element_node = document_->NewNode(NodeType::ELEMENT_NODE, token_.atom);
411
15.5M
  if (token_.atom == Atom::UNKNOWN) {
412
7.91M
    element_node->data_ = token_.data;
413
7.91M
  }
414
415
15.5M
  if (record_node_offsets_) {
416
0
    element_node->line_col_in_html_src_ = token_.line_col_in_html_src;
417
0
  }
418
419
15.5M
  switch (token_.atom) {
420
12.7k
    case Atom::HTML: {
421
12.7k
      element_node->SetManufactured(document_->metadata_.has_manufactured_html);
422
12.7k
      break;
423
0
    }
424
12.5k
    case Atom::HEAD: {
425
12.5k
      element_node->SetManufactured(document_->metadata_.has_manufactured_head);
426
12.5k
      break;
427
0
    }
428
12.5k
    case Atom::BODY: {
429
12.5k
      element_node->SetManufactured(document_->metadata_.has_manufactured_body);
430
12.5k
      break;
431
0
    }
432
15.4M
    default:
433
15.4M
      break;
434
15.5M
  }
435
436
15.5M
  std::copy(token_.attributes.begin(), token_.attributes.end(),
437
15.5M
            std::back_inserter(element_node->attributes_));
438
15.5M
  AddChild(element_node);
439
440
15.5M
  if (!record_attribute_offsets_ && !element_node->attributes_.empty()) {
441
161k
    std::transform(
442
161k
        element_node->attributes_.begin(), element_node->attributes_.end(),
443
576k
        element_node->attributes_.begin(), [](Attribute attr) -> Attribute {
444
576k
          attr.line_col_in_html_src = std::nullopt;
445
576k
          return attr;
446
576k
        });
447
161k
  }
448
449
15.5M
  if (on_node_callback_) {
450
0
    on_node_callback_(element_node, token_);
451
0
  }
452
15.5M
}  // Parser::AddElement.
453
454
// Section 12.2.4.3.
455
2.29M
void Parser::AddFormattingElement() {
456
2.29M
  Atom tag_atom = token_.atom;
457
2.29M
  AddElement();
458
459
  // Implement the Noah's Ark clause, but with three per family instead of two.
460
2.29M
  int identical_elements = 0;
461
2.91G
  for (int i = active_formatting_elements_stack_.size() - 1; i >= 0; --i) {
462
2.91G
    Node* node = active_formatting_elements_stack_.at(i);
463
2.91G
    if (node->node_type_ == NodeType::SCOPE_MARKER_NODE) break;
464
2.91G
    if (node->node_type_ != NodeType::ELEMENT_NODE) continue;
465
2.91G
    if (node->name_space_ != "") continue;
466
2.91G
    if (node->atom_ != tag_atom) continue;
467
2.63G
    if (node->attributes_.size() != token_.attributes.size()) continue;
468
469
2.63G
    bool attr_matched = false;
470
2.63G
    for (int j = 0; j < node->attributes_.size(); ++j) {
471
1.00M
      for (int k = 0; k < token_.attributes.size(); ++k) {
472
605k
        attr_matched = (node->attributes_[j] == token_.attributes[k]);
473
        // Found a match for this attribute, continue with the next attribute.
474
605k
        if (attr_matched) break;
475
605k
      }
476
477
598k
      if (attr_matched) continue;
478
479
      // If we get here, there is no attribute that matches a.
480
      // Therefore the element is not identical to the new one.
481
      // Stop processing rest of the attributes and proceed to next element.
482
402k
      break;
483
598k
    }
484
485
2.63G
    if (attr_matched) {
486
191k
      identical_elements++;
487
191k
      if (identical_elements >= 3) {
488
63.7k
        active_formatting_elements_stack_.Remove(node);
489
63.7k
      }
490
191k
    }
491
2.63G
  }
492
493
2.29M
  active_formatting_elements_stack_.Push(top());
494
2.29M
}  // Parser::AddFormattingElement.
495
496
// Section 12.2.4.3.
497
76.3k
void Parser::ClearActiveFormattingElements() {
498
165k
  while (active_formatting_elements_stack_.size() != 0) {
499
165k
    Node* node = active_formatting_elements_stack_.Pop();
500
165k
    if (node->node_type_ == NodeType::SCOPE_MARKER_NODE) break;
501
165k
  }
502
76.3k
}  // Parser::ClearActiveFormattingElements.
503
504
// Section 12.2.4.3.
505
9.10M
void Parser::ReconstructActiveFormattingElements() {
506
9.10M
  Node* node = active_formatting_elements_stack_.Top();
507
9.10M
  if (!node) return;
508
509
6.07M
  if (node->node_type_ == NodeType::SCOPE_MARKER_NODE ||
510
6.07M
      open_elements_stack_.Index(node) != -1) {
511
6.05M
    return;
512
6.05M
  }
513
514
20.5k
  int i = active_formatting_elements_stack_.size() - 1;
515
5.82M
  while (node->node_type_ != NodeType::SCOPE_MARKER_NODE &&
516
5.82M
         open_elements_stack_.Index(node) == -1) {
517
5.81M
    if (i == 0) {
518
7.98k
      i = -1;
519
7.98k
      break;
520
7.98k
    }
521
5.80M
    i--;
522
5.80M
    node = active_formatting_elements_stack_.at(i);
523
5.80M
  }
524
525
5.81M
  do {
526
5.81M
    i++;
527
5.81M
    auto clone = document_->CloneNode(active_formatting_elements_stack_.at(i));
528
5.81M
    AddChild(clone);
529
5.81M
    active_formatting_elements_stack_.Replace(i, clone);
530
5.81M
  } while (i < active_formatting_elements_stack_.size() - 1);
531
20.5k
}  // Parser::ReconstructActiveFormattingElements.
532
533
// Section 12.2.5.
534
18.5k
void Parser::AcknowledgeSelfClosingTag() {
535
18.5k
  has_self_closing_token_ = false;
536
18.5k
}  // Parser::AcknowledgeSelfClosingTag.
537
538
// Section 12.2.4.1, "using the rules for".
539
17.6k
void Parser::SetOriginalIM() {
540
17.6k
  CHECK(!original_insertion_mode_)
541
17.6k
      << "html: bad parser state: original_insertion_mode was set twice";
542
17.6k
  original_insertion_mode_ = insertion_mode_;
543
17.6k
}  // Parser::SetOriginalIM.
544
545
// Section 12.2.4.1, "reset the insertion mode".
546
117k
void Parser::ResetInsertionMode() {
547
274k
  for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
548
274k
    Node* node = open_elements_stack_.at(i);
549
274k
    bool last = (i == 0);
550
274k
    if (last && context_node_) {
551
0
      node = context_node_;
552
0
    }
553
554
274k
    switch (node->atom_) {
555
0
      case Atom::SELECT:
556
0
        if (!last) {
557
0
          Node* ancestor = node;
558
0
          Node* first = open_elements_stack_.at(0);
559
0
          while (ancestor != first) {
560
0
            ancestor = open_elements_stack_.at(
561
0
                open_elements_stack_.Index(ancestor) - 1);
562
0
            switch (ancestor->atom_) {
563
0
              case Atom::TEMPLATE:
564
0
                insertion_mode_ = std::bind(&Parser::InSelectIM, this);
565
0
                return;
566
0
              case Atom::TABLE:
567
0
                insertion_mode_ = std::bind(&Parser::InSelectInTableIM, this);
568
0
                return;
569
0
              default:
570
0
                break;
571
0
            }
572
0
          }
573
0
        }
574
0
        insertion_mode_ = std::bind(&Parser::InSelectIM, this);
575
0
        break;
576
7.41k
      case Atom::TD:
577
112k
      case Atom::TH:
578
        // https://bugs.chromium.org/p/chromium/issues/detail?id=829668
579
112k
        insertion_mode_ = std::bind(&Parser::InCellIM, this);
580
112k
        break;
581
0
      case Atom::TR:
582
0
        insertion_mode_ = std::bind(&Parser::InRowIM, this);
583
0
        break;
584
0
      case Atom::TBODY:
585
0
      case Atom::THEAD:
586
0
      case Atom::TFOOT:
587
0
        insertion_mode_ = std::bind(&Parser::InTableBodyIM, this);
588
0
        break;
589
0
      case Atom::CAPTION:
590
0
        insertion_mode_ = std::bind(&Parser::InCaptionIM, this);
591
0
        break;
592
0
      case Atom::COLGROUP:
593
0
        insertion_mode_ = std::bind(&Parser::InColumnGroupIM, this);
594
0
        break;
595
0
      case Atom::TABLE:
596
0
        insertion_mode_ = std::bind(&Parser::InTableIM, this);
597
0
        break;
598
0
      case Atom::TEMPLATE:
599
        // TODO: remove this divergence from the HTML5 spec.
600
0
        if (!node->name_space_.empty()) {
601
0
          continue;
602
0
        }
603
0
        insertion_mode_ = template_stack_.back();
604
0
        break;
605
0
      case Atom::HEAD:
606
        // https://bugs.chromium.org/p/chromium/issues/detail?id=829668
607
0
        insertion_mode_ = std::bind(&Parser::InHeadIM, this);
608
0
        break;
609
4.78k
      case Atom::BODY:
610
4.78k
        insertion_mode_ = std::bind(&Parser::InBodyIM, this);
611
4.78k
        break;
612
0
      case Atom::FRAMESET:
613
0
        insertion_mode_ = std::bind(&Parser::InFramesetIM, this);
614
0
        break;
615
0
      case Atom::HTML:
616
0
        if (head_) {
617
0
          insertion_mode_ = std::bind(&Parser::AfterHeadIM, this);
618
0
        } else {
619
0
          insertion_mode_ = std::bind(&Parser::BeforeHeadIM, this);
620
0
        }
621
0
        break;
622
156k
      default:
623
156k
        if (last) {
624
0
          insertion_mode_ = std::bind(&Parser::InBodyIM, this);
625
0
          return;
626
0
        }
627
156k
        continue;
628
274k
    }
629
117k
    return;
630
274k
  }
631
117k
}  // Parser::ResetInsertionMode.
632
633
// Section 12.2.6.4.1.
634
17.4k
bool Parser::InitialIM() {
635
17.4k
  switch (token_.token_type) {
636
3.84k
    case TokenType::TEXT_TOKEN: {
637
      // https://www.w3.org/TR/2011/WD-html5-20110113/tokenization.html#the-initial-insertion-mode
638
3.84k
      Strings::TrimLeft(&token_.data, Strings::kWhitespace);
639
3.84k
      if (token_.data.empty()) {
640
        // It was all whitespace, so ignore it.
641
1.49k
        return true;
642
1.49k
      }
643
2.35k
      break;
644
3.84k
    }
645
3.42k
    case TokenType::COMMENT_TOKEN: {
646
3.42k
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
647
3.42k
      node->data_ = std::move(token_.data);
648
3.42k
      if (record_node_offsets_) {
649
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
650
0
      }
651
3.42k
      node->SetManufactured(token_.is_manufactured);
652
3.42k
      document_->root_node_->AppendChild(node);
653
3.42k
      return true;
654
3.84k
    }
655
125
    case TokenType::DOCTYPE_TOKEN: {
656
125
      auto doctype_node = document_->NewNode(NodeType::DOCTYPE_NODE);
657
125
      bool quirks_mode = ParseDoctype(token_.data, doctype_node);
658
125
      if (record_node_offsets_) {
659
0
        doctype_node->line_col_in_html_src_ = token_.line_col_in_html_src;
660
0
      }
661
125
      document_->root_node_->AppendChild(doctype_node);
662
125
      document_->metadata_.quirks_mode = quirks_mode;
663
125
      insertion_mode_ = std::bind(&Parser::BeforeHTMLIM, this);
664
665
125
      if (on_node_callback_) {
666
0
        on_node_callback_(doctype_node, token_);
667
0
      }
668
669
125
      return true;
670
3.84k
    }
671
10.0k
    default:
672
10.0k
      break;
673
17.4k
  }
674
675
12.3k
  document_->metadata_.quirks_mode = true;
676
12.3k
  insertion_mode_ = std::bind(&Parser::BeforeHTMLIM, this);
677
12.3k
  return false;
678
17.4k
}  // Parser::InitialIM.
679
680
// Section 12.2.6.4.2.
681
26.6k
bool Parser::BeforeHTMLIM() {
682
26.6k
  switch (token_.token_type) {
683
244
    case TokenType::DOCTYPE_TOKEN: {
684
      // Ignore the token.
685
244
      return true;
686
0
    }
687
2.86k
    case TokenType::TEXT_TOKEN: {
688
      // https://www.w3.org/TR/2011/WD-html5-20110113/tokenization.html#the-before-html-insertion-mode
689
2.86k
      Strings::TrimLeft(&token_.data, Strings::kWhitespace);
690
2.86k
      if (token_.data.empty()) {
691
        // It was all whitespace, so ignore it.
692
498
        return true;
693
498
      }
694
2.36k
      break;
695
2.86k
    }
696
21.5k
    case TokenType::START_TAG_TOKEN: {
697
21.5k
      if (token_.atom == Atom::HTML) {
698
12.5k
        AddElement();
699
12.5k
        insertion_mode_ = std::bind(&Parser::BeforeHeadIM, this);
700
12.5k
        return true;
701
12.5k
      }
702
9.00k
      break;
703
21.5k
    }
704
9.00k
    case TokenType::END_TAG_TOKEN: {
705
533
      switch ((Atom)token_.atom) {
706
90
        case Atom::HEAD:
707
162
        case Atom::BODY:
708
260
        case Atom::HTML:
709
267
        case Atom::BR:
710
267
          ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::HTML,
711
267
                            AtomUtil::ToString(Atom::HTML));
712
267
          return false;
713
266
        default:
714
          // Ignore the token.
715
266
          return true;
716
533
      }
717
0
      break;
718
533
    }
719
726
    case TokenType::COMMENT_TOKEN: {
720
726
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
721
726
      node->SetManufactured(token_.is_manufactured);
722
726
      if (record_node_offsets_) {
723
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
724
0
      }
725
726
      node->data_ = std::move(token_.data);
726
726
      document_->root_node_->AppendChild(node);
727
726
      return true;
728
533
    }
729
807
    default:
730
807
      break;
731
26.6k
  }
732
12.1k
  ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::HTML,
733
12.1k
                    AtomUtil::ToString(Atom::HTML));
734
12.1k
  return false;
735
26.6k
}  // Parser::BeforeHTMLIM.
736
737
// Section 12.2.6.4.3.
738
26.6k
bool Parser::BeforeHeadIM() {
739
26.6k
  switch (token_.token_type) {
740
2.92k
    case TokenType::TEXT_TOKEN: {
741
      // https://www.w3.org/TR/2011/WD-html5-20110113/tokenization.html#the-before-head-insertion-mode
742
2.92k
      Strings::TrimLeft(&token_.data, Strings::kWhitespace);
743
2.92k
      if (token_.data.empty()) {
744
        // It was all whitespace, so ignore it.
745
551
        return true;
746
551
      }
747
2.37k
      break;
748
2.92k
    }
749
21.8k
    case TokenType::START_TAG_TOKEN: {
750
21.8k
      switch (token_.atom) {
751
12.5k
        case Atom::HEAD:
752
12.5k
          AddElement();
753
12.5k
          head_ = top();
754
12.5k
          insertion_mode_ = std::bind(&Parser::InHeadIM, this);
755
12.5k
          return true;
756
389
        case Atom::HTML:
757
389
          return InBodyIM();
758
8.92k
        default:
759
8.92k
          break;
760
21.8k
      }
761
8.92k
      break;
762
21.8k
    }
763
8.92k
    case TokenType::END_TAG_TOKEN: {
764
305
      switch (token_.atom) {
765
90
        case Atom::HEAD:
766
162
        case Atom::BODY:
767
260
        case Atom::HTML:
768
267
        case Atom::BR:
769
267
          ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::HEAD,
770
267
                            AtomUtil::ToString(Atom::HEAD));
771
267
          return false;
772
38
        default:
773
          // Ignore the token.
774
38
          return true;
775
305
      }
776
0
      break;
777
305
    }
778
734
    case TokenType::COMMENT_TOKEN: {
779
734
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
780
734
      node->SetManufactured(token_.is_manufactured);
781
734
      if (record_node_offsets_) {
782
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
783
0
      }
784
734
      node->data_ = std::move(token_.data);
785
734
      AddChild(node);
786
734
      return true;
787
305
    }
788
0
    case TokenType::DOCTYPE_TOKEN: {
789
      // Ignore the token.
790
0
      return true;
791
305
    }
792
870
    default:
793
870
      break;
794
26.6k
  }
795
796
12.1k
  ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::HEAD,
797
12.1k
                    AtomUtil::ToString(Atom::HEAD));
798
12.1k
  return false;
799
26.6k
}  // Parser::BeforeHeadIM.
800
801
// Section 12.2.6.4.4.
802
61.7k
bool Parser::InHeadIM() {
803
61.7k
  switch (token_.token_type) {
804
3.18k
    case TokenType::TEXT_TOKEN: {
805
3.18k
      std::string s = token_.data;
806
3.18k
      Strings::TrimLeft(&s, Strings::kWhitespace);
807
3.18k
      if (s.size() < token_.data.size()) {
808
        // Add the initial whitespace to the current node.
809
        // https://www.w3.org/TR/2011/WD-html5-20110113/tokenization.html#parsing-main-inhead
810
798
        AddText(token_.data.substr(0, token_.data.size() - s.size()));
811
798
        if (s.empty()) {
812
773
          return true;
813
773
        }
814
25
        token_.data = s;
815
25
      }
816
2.41k
      break;
817
3.18k
    }
818
39.7k
    case TokenType::START_TAG_TOKEN: {
819
39.7k
      switch (token_.atom) {
820
227
        case Atom::HTML:
821
227
          return InBodyIM();
822
9.10k
        case Atom::BASE:
823
9.10k
        case Atom::BASEFONT:
824
9.10k
        case Atom::BGSOUND:
825
10.1k
        case Atom::LINK:
826
10.7k
        case Atom::META: {
827
10.7k
          AddElement();
828
10.7k
          open_elements_stack_.Pop();
829
10.7k
          AcknowledgeSelfClosingTag();
830
10.7k
          if (!top() || !top()->LastChild()) return true;
831
          // Record some extra document url related info.
832
10.5k
          if (token_.atom == Atom::BASE) {
833
9.10k
            auto base_node = top()->LastChild();
834
9.10k
            RecordBaseURLMetadata(base_node);
835
9.10k
          } else if (token_.atom == Atom::LINK) {
836
824
            auto link_node = top()->LastChild();
837
824
            RecordLinkRelCanonical(link_node);
838
824
          }
839
10.5k
          return true;
840
10.7k
        }
841
2.73k
        case Atom::NOSCRIPT: {
842
2.73k
          if (scripting_) {
843
2.73k
            ParseGenericRawTextElement();
844
2.73k
            return true;
845
2.73k
          }
846
0
          AddElement();
847
0
          insertion_mode_ = std::bind(&Parser::InHeadNoscriptIM, this);
848
          // Don't let the tokenizer go into raw text mode when scripting is
849
          // disabled.
850
0
          tokenizer_->NextIsNotRawText();
851
0
          return true;
852
2.73k
        }
853
5.39k
        case Atom::SCRIPT:
854
17.6k
        case Atom::TITLE: {
855
17.6k
          AddElement();
856
17.6k
          SetOriginalIM();
857
17.6k
          insertion_mode_ = std::bind(&Parser::TextIM, this);
858
17.6k
          return true;
859
5.39k
        }
860
0
        case Atom::NOFRAMES:
861
217
        case Atom::STYLE: {
862
217
          ParseGenericRawTextElement();
863
217
          return true;
864
0
        }
865
207
        case Atom::HEAD: {
866
          // Ignore the token.
867
207
          return true;
868
0
        }
869
0
        case Atom::TEMPLATE: {
870
0
          AddElement();
871
0
          active_formatting_elements_stack_.Push(scope_marker_);
872
0
          frameset_ok_ = false;
873
0
          insertion_mode_ = std::bind(&Parser::InTemplateIM, this);
874
0
          template_stack_.push_back(std::bind(&Parser::InTemplateIM, this));
875
0
          return true;
876
0
        }
877
7.90k
        default:
878
          // Ignore remaining tags.
879
7.90k
          break;
880
39.7k
      }
881
7.90k
      break;
882
39.7k
    }
883
15.9k
    case TokenType::END_TAG_TOKEN: {
884
15.9k
      switch (token_.atom) {
885
12.5k
        case Atom::HEAD: {
886
12.5k
          open_elements_stack_.Pop();
887
12.5k
          insertion_mode_ = std::bind(&Parser::AfterHeadIM, this);
888
12.5k
          return true;
889
0
        }
890
72
        case Atom::BODY:
891
170
        case Atom::HTML:
892
177
        case Atom::BR: {
893
177
          ParseImpliedToken(TokenType::END_TAG_TOKEN, Atom::HEAD,
894
177
                            AtomUtil::ToString(Atom::HEAD));
895
177
          return false;
896
170
        }
897
0
        case Atom::TEMPLATE: {
898
0
          if (!open_elements_stack_.Contains(Atom::TEMPLATE)) return true;
899
900
          // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
901
0
          GenerateImpliedEndTags();
902
0
          for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
903
0
            Node* node = open_elements_stack_.at(i);
904
0
            if (node->name_space_.empty() && node->atom_ == Atom::TEMPLATE) {
905
0
              open_elements_stack_.Pop(open_elements_stack_.size() - i);
906
0
              break;
907
0
            }
908
0
          }
909
910
0
          ClearActiveFormattingElements();
911
0
          template_stack_.pop_back();
912
0
          ResetInsertionMode();
913
0
          return true;
914
0
        }
915
3.23k
        default:
916
          // Ignore the token.
917
3.23k
          return true;
918
15.9k
      }
919
0
      break;
920
15.9k
    }
921
957
    case TokenType::COMMENT_TOKEN: {
922
957
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
923
957
      node->SetManufactured(token_.is_manufactured);
924
957
      if (record_node_offsets_) {
925
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
926
0
      }
927
957
      node->data_ = std::move(token_.data);
928
957
      AddChild(node);
929
957
      return true;
930
15.9k
    }
931
0
    case TokenType::DOCTYPE_TOKEN: {
932
      // Ignore the token.
933
0
      return true;
934
15.9k
    }
935
1.92k
    default:
936
1.92k
      break;
937
61.7k
  }
938
939
12.2k
  ParseImpliedToken(TokenType::END_TAG_TOKEN, Atom::HEAD,
940
12.2k
                    AtomUtil::ToString(Atom::HEAD));
941
12.2k
  return false;
942
61.7k
}  // Parser::InHeadIM.
943
944
// 12.2.6.4.5.
945
0
bool Parser::InHeadNoscriptIM() {
946
0
  switch (token_.token_type) {
947
0
    case TokenType::DOCTYPE_TOKEN: {
948
      // Ignore the token.
949
0
      return true;
950
0
    }
951
0
    case TokenType::START_TAG_TOKEN: {
952
0
      switch (token_.atom) {
953
0
        case Atom::HTML: {
954
0
          return InBodyIM();
955
0
          break;
956
0
        }
957
0
        case Atom::BASEFONT:
958
0
        case Atom::BGSOUND:
959
0
        case Atom::LINK:
960
0
        case Atom::META:
961
0
        case Atom::NOFRAMES:
962
0
        case Atom::STYLE: {
963
0
          return InHeadIM();
964
0
          break;
965
0
        }
966
0
        case Atom::HEAD:
967
          // Ignore the token.
968
0
          return true;
969
0
        case Atom::NOSCRIPT: {
970
          // Don't let the tokenizer go into raw text mode even when a
971
          // <noscript> tag is in "in head noscript" insertion mode.
972
0
          tokenizer_->NextIsNotRawText();
973
          // Ignore the token.
974
0
          return true;
975
0
        }
976
0
        default:
977
0
          break;
978
0
      }
979
0
      break;
980
0
    }
981
0
    case TokenType::END_TAG_TOKEN: {
982
0
      switch (token_.atom) {
983
0
        case Atom::NOSCRIPT:
984
0
        case Atom::BR: {
985
0
          break;
986
0
        }
987
0
        default:
988
          // Ignore the token.
989
0
          return true;
990
0
      }
991
0
      break;
992
0
    }
993
0
    case TokenType::TEXT_TOKEN: {
994
0
      if (Strings::IsAllWhitespaceChars(token_.data)) {
995
        // It was all whitespace.
996
0
        return InHeadIM();
997
0
      }
998
0
      break;
999
0
    }
1000
0
    case TokenType::COMMENT_TOKEN: {
1001
0
      return InHeadIM();
1002
0
      break;
1003
0
    }
1004
0
    default:
1005
0
      break;
1006
0
  }
1007
0
  open_elements_stack_.Pop();
1008
0
  CHECK(top()->atom_ == Atom::HEAD)
1009
0
      << "html: the new current node will be a head element.";
1010
1011
0
  insertion_mode_ = std::bind(&Parser::InHeadIM, this);
1012
0
  if (token_.atom == Atom::NOSCRIPT) {
1013
0
    return true;
1014
0
  }
1015
1016
0
  return false;
1017
0
}  // Parser::InHeadNoscriptIM.
1018
1019
// Section 12.2.6.4.6.
1020
26.6k
bool Parser::AfterHeadIM() {
1021
26.6k
  switch (token_.token_type) {
1022
3.02k
    case TokenType::TEXT_TOKEN: {
1023
3.02k
      std::string s = token_.data;
1024
3.02k
      Strings::TrimLeft(&s);
1025
3.02k
      if (s.size() < token_.data.size()) {
1026
        // Add the initial whitespace to the current node.
1027
595
        AddText(token_.data.substr(0, token_.data.size() - s.size()));
1028
595
        if (s.empty()) return true;
1029
11
        token_.data = s;
1030
11
      }
1031
2.43k
      break;
1032
3.02k
    }
1033
20.5k
    case TokenType::START_TAG_TOKEN: {
1034
20.5k
      switch (token_.atom) {
1035
0
        case Atom::HTML:
1036
0
          return InBodyIM();
1037
12.5k
        case Atom::BODY: {
1038
12.5k
          AddElement();
1039
12.5k
          frameset_ok_ = false;
1040
12.5k
          insertion_mode_ = std::bind(&Parser::InBodyIM, this);
1041
12.5k
          return true;
1042
0
        }
1043
0
        case Atom::FRAMESET: {
1044
0
          AddElement();
1045
0
          insertion_mode_ = std::bind(&Parser::InFramesetIM, this);
1046
0
          return true;
1047
0
        }
1048
0
        case Atom::BASE:
1049
0
        case Atom::BASEFONT:
1050
0
        case Atom::BGSOUND:
1051
0
        case Atom::LINK:
1052
194
        case Atom::META:
1053
194
        case Atom::NOFRAMES:
1054
194
        case Atom::SCRIPT:
1055
194
        case Atom::STYLE:
1056
194
        case Atom::TEMPLATE:
1057
194
        case Atom::TITLE: {
1058
194
          open_elements_stack_.Push(head_);
1059
194
          defer(open_elements_stack_.Remove(head_));
1060
194
          return InHeadIM();
1061
194
        }
1062
572
        case Atom::HEAD:
1063
          // Ignore the token.
1064
572
          return true;
1065
7.27k
        default:
1066
7.27k
          break;
1067
20.5k
      }
1068
7.27k
      break;
1069
20.5k
    }
1070
7.27k
    case TokenType::END_TAG_TOKEN: {
1071
628
      switch (token_.atom) {
1072
72
        case Atom::BODY:
1073
170
        case Atom::HTML:
1074
177
        case Atom::BR: {
1075
          // Drop down to creating an implied <body> tag.
1076
177
          break;
1077
170
        }
1078
0
        case Atom::TEMPLATE: {
1079
0
          return InHeadIM();
1080
170
        }
1081
451
        default:
1082
          // Ignore the token.
1083
451
          return true;
1084
628
      }
1085
177
      break;
1086
628
    }
1087
410
    case TokenType::COMMENT_TOKEN: {
1088
410
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
1089
410
      node->SetManufactured(token_.is_manufactured);
1090
410
      if (record_node_offsets_) {
1091
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
1092
0
      }
1093
410
      node->data_ = std::move(token_.data);
1094
410
      AddChild(node);
1095
410
      return true;
1096
628
    }
1097
0
    case TokenType::DOCTYPE_TOKEN:
1098
      // Ignore the token.
1099
0
      return true;
1100
1.99k
    default:
1101
1.99k
      break;
1102
26.6k
  }
1103
1104
11.8k
  ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::BODY,
1105
11.8k
                    AtomUtil::ToString(Atom::BODY));
1106
11.8k
  frameset_ok_ = true;
1107
11.8k
  return false;
1108
26.6k
}  // Parser::AfterHeadIM.
1109
1110
// Section 12.2.6.4.7.
1111
11.8M
bool Parser::InBodyIM() {  // NOLINT
1112
11.8M
  switch (token_.token_type) {
1113
573k
    case TokenType::TEXT_TOKEN: {
1114
573k
      std::string d = token_.data;
1115
573k
      Node* node = open_elements_stack_.Top();
1116
573k
      switch (node->atom_) {
1117
754
        case Atom::PRE:
1118
754
        case Atom::LISTING: {
1119
754
          if (!node->FirstChild()) {
1120
            // Ignore a new line at the start of a <pre> block.
1121
676
            if (!d.empty() && d.front() == '\r') {
1122
0
              d = d.substr(1);
1123
0
            }
1124
676
            if (!d.empty() && d.front() == '\n') {
1125
263
              d = d.substr(1);
1126
263
            }
1127
676
          }
1128
754
          break;
1129
754
        }
1130
572k
        default:
1131
572k
          break;
1132
573k
      }
1133
1134
573k
      Strings::ReplaceAny(&d, Strings::kNullChar, "");
1135
      // Checks if data empty or all null characters.
1136
573k
      if (d.empty()) {
1137
438
        return true;
1138
438
      }
1139
1140
572k
      ReconstructActiveFormattingElements();
1141
572k
      AddText(d);
1142
572k
      if (frameset_ok_ && !Strings::IsAllWhitespaceChars(d)) {
1143
        // There were non-whitespace characters inserted.
1144
3.31k
        frameset_ok_ = false;
1145
3.31k
      }
1146
572k
      break;
1147
573k
    }
1148
10.0M
    case TokenType::START_TAG_TOKEN: {
1149
10.0M
      switch (token_.atom) {
1150
2.28k
        case Atom::HTML: {
1151
2.28k
          num_html_tags_++;
1152
2.28k
          if (open_elements_stack_.Contains(Atom::TEMPLATE)) {
1153
0
            return true;
1154
0
          }
1155
2.28k
          CopyAttributes(open_elements_stack_.at(0), token_);
1156
2.28k
          if (!document_->metadata_.has_manufactured_html ||
1157
2.28k
              num_html_tags_ > 1) {
1158
2.23k
            document_->metadata_.duplicate_html_elements = true;
1159
2.23k
            document_->metadata_.duplicate_html_element_location =
1160
2.23k
                token_.line_col_in_html_src;
1161
2.23k
          }
1162
2.28k
          break;
1163
2.28k
        }
1164
7.57k
        case Atom::BASE:
1165
7.57k
        case Atom::BASEFONT:
1166
7.57k
        case Atom::BGSOUND:
1167
8.32k
        case Atom::LINK:
1168
8.51k
        case Atom::META:
1169
8.51k
        case Atom::NOFRAMES:
1170
12.9k
        case Atom::SCRIPT:
1171
13.1k
        case Atom::STYLE:
1172
13.1k
        case Atom::TEMPLATE:
1173
21.7k
        case Atom::TITLE: {
1174
21.7k
          return InHeadIM();
1175
13.1k
        }
1176
23.3k
        case Atom::BODY: {
1177
23.3k
          num_body_tags_++;
1178
23.3k
          if (open_elements_stack_.Contains(Atom::TEMPLATE)) {
1179
0
            return true;
1180
0
          }
1181
23.3k
          if (open_elements_stack_.size() >= 2) {
1182
23.3k
            Node* body = open_elements_stack_.at(1);
1183
23.3k
            if (body->node_type_ == NodeType::ELEMENT_NODE &&
1184
23.3k
                body->atom_ == Atom::BODY) {
1185
23.3k
              frameset_ok_ = false;
1186
23.3k
              CopyAttributes(body, token_);
1187
23.3k
              if (!document_->metadata_.has_manufactured_body ||
1188
23.3k
                  num_body_tags_ > 1) {
1189
22.7k
                document_->metadata_.duplicate_body_elements = true;
1190
22.7k
                document_->metadata_.duplicate_body_element_location =
1191
22.7k
                    token_.line_col_in_html_src;
1192
22.7k
              }
1193
23.3k
            }
1194
23.3k
          }
1195
23.3k
          break;
1196
23.3k
        }
1197
0
        case Atom::FRAMESET: {
1198
0
          if (!frameset_ok_ || open_elements_stack_.size() < 2 ||
1199
0
              open_elements_stack_.at(1)->atom_ != Atom::BODY) {
1200
            // Ignore the token.
1201
0
            return true;
1202
0
          }
1203
0
          auto body = open_elements_stack_.at(1);
1204
0
          if (body->Parent()) {
1205
0
            auto removed_body = body->Parent()->RemoveChild(body);
1206
0
            open_elements_stack_.Remove(removed_body);
1207
0
          }
1208
          // Remove all nodes except one, the last in the stack.
1209
0
          open_elements_stack_.Pop(open_elements_stack_.size() - 1);
1210
0
          AddElement();
1211
0
          insertion_mode_ = std::bind(&Parser::InFramesetIM, this);
1212
0
          return true;
1213
0
        }
1214
0
        case Atom::ADDRESS:
1215
0
        case Atom::ARTICLE:
1216
194
        case Atom::ASIDE:
1217
194
        case Atom::BLOCKQUOTE:
1218
194
        case Atom::CENTER:
1219
194
        case Atom::DETAILS:
1220
194
        case Atom::DIALOG:
1221
389
        case Atom::DIR:
1222
881
        case Atom::DIV:
1223
17.8k
        case Atom::DL:
1224
17.8k
        case Atom::FIELDSET:
1225
17.8k
        case Atom::FIGCAPTION:
1226
17.8k
        case Atom::FIGURE:
1227
18.0k
        case Atom::FOOTER:
1228
18.2k
        case Atom::HEADER:
1229
18.2k
        case Atom::HGROUP:
1230
18.4k
        case Atom::MAIN:
1231
18.6k
        case Atom::MENU:
1232
19.0k
        case Atom::NAV:
1233
19.3k
        case Atom::OL:
1234
30.7k
        case Atom::P:
1235
30.7k
        case Atom::SECTION:
1236
30.7k
        case Atom::SUMMARY:
1237
59.9k
        case Atom::UL: {
1238
59.9k
          PopUntil(Scope::ButtonScope, Atom::P);
1239
59.9k
          AddElement();
1240
59.9k
          break;
1241
30.7k
        }
1242
1.21k
        case Atom::H1:
1243
2.14k
        case Atom::H2:
1244
3.35k
        case Atom::H3:
1245
4.30k
        case Atom::H4:
1246
4.88k
        case Atom::H5:
1247
5.61k
        case Atom::H6: {
1248
5.61k
          PopUntil(Scope::ButtonScope, Atom::P);
1249
5.61k
          Node* top_node = top();
1250
5.61k
          if (top_node) {
1251
5.61k
            switch (top_node->atom_) {
1252
484
              case Atom::H1:
1253
877
              case Atom::H2:
1254
1.37k
              case Atom::H3:
1255
1.80k
              case Atom::H4:
1256
2.18k
              case Atom::H5:
1257
2.56k
              case Atom::H6:
1258
2.56k
                open_elements_stack_.Pop();
1259
2.56k
                break;
1260
3.05k
              default:
1261
3.05k
                break;
1262
5.61k
            }
1263
5.61k
          }
1264
5.61k
          AddElement();
1265
5.61k
          break;
1266
5.61k
        }
1267
922
        case Atom::PRE:
1268
922
        case Atom::LISTING: {
1269
922
          PopUntil(Scope::ButtonScope, Atom::P);
1270
922
          AddElement();
1271
          // The newline, if any, will be dealt with by the TEXT_TOKEN case.
1272
922
          frameset_ok_ = false;
1273
922
          break;
1274
922
        }
1275
1.55k
        case Atom::FORM: {
1276
1.55k
          if (form_ && !open_elements_stack_.Contains(Atom::TEMPLATE)) {
1277
            // Ignore the token.
1278
1.02k
            return true;
1279
1.02k
          }
1280
531
          PopUntil(Scope::ButtonScope, Atom::P);
1281
531
          AddElement();
1282
531
          if (!open_elements_stack_.Contains(Atom::TEMPLATE)) {
1283
531
            form_ = top();
1284
531
          }
1285
531
          break;
1286
1.55k
        }
1287
113k
        case Atom::LI: {
1288
113k
          frameset_ok_ = false;
1289
785k
          for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
1290
785k
            Node* node = open_elements_stack_.at(i);
1291
785k
            switch (node->atom_) {
1292
1.23k
              case Atom::LI:
1293
                // Remove all except last in stack.
1294
1.23k
                open_elements_stack_.Pop(open_elements_stack_.size() - i);
1295
1.23k
                break;
1296
0
              case Atom::ADDRESS:
1297
202
              case Atom::DIV:
1298
418
              case Atom::P:
1299
418
                continue;
1300
783k
              default:
1301
783k
                if (!node->IsSpecialElement()) continue;
1302
785k
            }
1303
113k
            break;
1304
785k
          }
1305
113k
          PopUntil(Scope::ButtonScope, Atom::P);
1306
113k
          AddElement();
1307
113k
          break;
1308
113k
        }
1309
11.4k
        case Atom::DD:
1310
12.8k
        case Atom::DT: {
1311
12.8k
          frameset_ok_ = false;
1312
109k
          for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
1313
109k
            Node* node = open_elements_stack_.at(i);
1314
109k
            switch (node->atom_) {
1315
542
              case Atom::DD:
1316
1.60k
              case Atom::DT:
1317
                // Remove all except last in stack.
1318
1.60k
                open_elements_stack_.Pop(open_elements_stack_.size() - i);
1319
1.60k
                break;
1320
0
              case Atom::ADDRESS:
1321
201
              case Atom::DIV:
1322
464
              case Atom::P:
1323
464
                continue;
1324
107k
              default:
1325
107k
                if (!node->IsSpecialElement()) continue;
1326
109k
            }
1327
12.8k
            break;
1328
109k
          }
1329
12.8k
          PopUntil(Scope::ButtonScope, Atom::P);
1330
12.8k
          AddElement();
1331
12.8k
          break;
1332
12.8k
        }
1333
0
        case Atom::PLAINTEXT: {
1334
0
          PopUntil(Scope::ButtonScope, Atom::P);
1335
0
          AddElement();
1336
0
          break;
1337
12.8k
        }
1338
311
        case Atom::BUTTON: {
1339
311
          PopUntil(Scope::DefaultScope, Atom::BUTTON);
1340
311
          ReconstructActiveFormattingElements();
1341
311
          AddElement();
1342
311
          frameset_ok_ = false;
1343
311
          break;
1344
12.8k
        }
1345
177k
        case Atom::A: {
1346
767k
          for (int i = active_formatting_elements_stack_.size() - 1; i >= 0;
1347
766k
               --i) {
1348
766k
            Node* node = active_formatting_elements_stack_.at(i);
1349
766k
            if (node->node_type_ == NodeType::SCOPE_MARKER_NODE) break;
1350
759k
            if (node->node_type_ == NodeType::ELEMENT_NODE &&
1351
759k
                node->atom_ == Atom::A) {
1352
169k
              InBodyEndTagFormatting(Atom::A, "a");
1353
169k
              open_elements_stack_.Remove(node);
1354
169k
              active_formatting_elements_stack_.Remove(node);
1355
169k
              break;
1356
169k
            }
1357
759k
          }
1358
177k
          ReconstructActiveFormattingElements();
1359
177k
          AddFormattingElement();
1360
177k
          break;
1361
12.8k
        }
1362
222k
        case Atom::B:
1363
223k
        case Atom::BIG:
1364
223k
        case Atom::CODE:
1365
223k
        case Atom::EM:
1366
223k
        case Atom::FONT:
1367
722k
        case Atom::I:
1368
1.91M
        case Atom::S:
1369
1.91M
        case Atom::SMALL:
1370
1.92M
        case Atom::STRIKE:
1371
1.92M
        case Atom::STRONG:
1372
1.93M
        case Atom::TT:
1373
2.11M
        case Atom::U: {
1374
2.11M
          ReconstructActiveFormattingElements();
1375
2.11M
          AddFormattingElement();
1376
2.11M
          break;
1377
1.93M
        }
1378
403
        case Atom::NOBR: {
1379
403
          ReconstructActiveFormattingElements();
1380
403
          if (ElementInScope(Scope::DefaultScope, Atom::NOBR)) {
1381
195
            InBodyEndTagFormatting(Atom::NOBR, "nobr");
1382
195
            ReconstructActiveFormattingElements();
1383
195
          }
1384
403
          AddFormattingElement();
1385
403
          break;
1386
1.93M
        }
1387
539
        case Atom::APPLET:
1388
539
        case Atom::MARQUEE:
1389
539
        case Atom::OBJECT: {
1390
539
          ReconstructActiveFormattingElements();
1391
539
          AddElement();
1392
539
          active_formatting_elements_stack_.Push(scope_marker_);
1393
539
          frameset_ok_ = false;
1394
539
          break;
1395
539
        }
1396
1.27M
        case Atom::TABLE: {
1397
1.27M
          if (!document_->metadata_.quirks_mode) {
1398
0
            PopUntil(Scope::ButtonScope, Atom::P);
1399
0
          }
1400
1.27M
          AddElement();
1401
1.27M
          frameset_ok_ = false;
1402
1.27M
          insertion_mode_ = std::bind(&Parser::InTableIM, this);
1403
1.27M
          return true;
1404
539
        }
1405
194
        case Atom::AREA:
1406
703
        case Atom::BR:
1407
897
        case Atom::EMBED:
1408
1.15k
        case Atom::IMG:
1409
1.82k
        case Atom::INPUT:
1410
1.82k
        case Atom::KEYGEN:
1411
2.02k
        case Atom::WBR: {
1412
2.02k
          ReconstructActiveFormattingElements();
1413
2.02k
          AddElement();
1414
2.02k
          open_elements_stack_.Pop();
1415
2.02k
          AcknowledgeSelfClosingTag();
1416
2.02k
          if (token_.atom == Atom::INPUT) {
1417
1.19k
            for (auto& attr : token_.attributes) {
1418
1.19k
              if (attr.key == "type" &&
1419
1.19k
                  Strings::EqualFold(attr.value, "hidden")) {
1420
                  // Skip setting frameset_ok_ = false;
1421
0
                  return true;
1422
0
              }
1423
1.19k
            }
1424
661
          }
1425
2.02k
          frameset_ok_ = false;
1426
2.02k
          break;
1427
2.02k
        }
1428
194
        case Atom::PARAM:
1429
194
        case Atom::SOURCE:
1430
388
        case Atom::TRACK: {
1431
388
          AddElement();
1432
388
          open_elements_stack_.Pop();
1433
388
          AcknowledgeSelfClosingTag();
1434
388
          break;
1435
194
        }
1436
199
        case Atom::HR: {
1437
199
          PopUntil(Scope::ButtonScope, Atom::P);
1438
199
          AddElement();
1439
199
          open_elements_stack_.Pop();
1440
199
          AcknowledgeSelfClosingTag();
1441
199
          frameset_ok_ = false;
1442
199
          break;
1443
194
        }
1444
66
        case Atom::IMAGE: {
1445
66
          token_.atom = Atom::IMG;
1446
66
          token_.data = AtomUtil::ToString(Atom::IMG);
1447
66
          return false;
1448
194
        }
1449
0
        case Atom::TEXTAREA: {
1450
0
          AddElement();
1451
0
          SetOriginalIM();
1452
0
          frameset_ok_ = false;
1453
0
          insertion_mode_ = std::bind(&Parser::TextIM, this);
1454
0
          break;
1455
194
        }
1456
3.56k
        case Atom::XMP: {
1457
3.56k
          PopUntil(Scope::ButtonScope, Atom::P);
1458
3.56k
          ReconstructActiveFormattingElements();
1459
3.56k
          frameset_ok_ = false;
1460
3.56k
          ParseGenericRawTextElement();
1461
3.56k
          break;
1462
194
        }
1463
195
        case Atom::IFRAME: {
1464
195
          frameset_ok_ = false;
1465
195
          ParseGenericRawTextElement();
1466
195
          break;
1467
194
        }
1468
69
        case Atom::NOEMBED: {
1469
69
          ParseGenericRawTextElement();
1470
69
          break;
1471
194
        }
1472
324
        case Atom::NOSCRIPT: {
1473
324
          if (scripting_) {
1474
324
            ParseGenericRawTextElement();
1475
324
            return true;
1476
324
          }
1477
0
          ReconstructActiveFormattingElements();
1478
0
          AddElement();
1479
          // Don't let the tokenizer go into raw text mode when scripting is
1480
          // disabled.
1481
0
          tokenizer_->NextIsNotRawText();
1482
0
          break;
1483
324
        }
1484
0
        case Atom::SELECT: {
1485
0
          ReconstructActiveFormattingElements();
1486
0
          AddElement();
1487
0
          frameset_ok_ = false;
1488
0
          insertion_mode_ = std::bind(&Parser::InSelectIM, this);
1489
0
          return true;
1490
0
          break;
1491
324
        }
1492
0
        case Atom::OPTGROUP:
1493
0
        case Atom::OPTION: {
1494
0
          if (top()->atom_ == Atom::OPTION) {
1495
0
            open_elements_stack_.Pop();
1496
0
          }
1497
0
          ReconstructActiveFormattingElements();
1498
0
          AddElement();
1499
0
          break;
1500
0
        }
1501
5.32k
        case Atom::RB:
1502
5.52k
        case Atom::RTC: {
1503
5.52k
          if (ElementInScope(Scope::DefaultScope, Atom::RUBY)) {
1504
398
            GenerateImpliedEndTags();
1505
398
          }
1506
5.52k
          AddElement();
1507
5.52k
          break;
1508
5.32k
        }
1509
1.36k
        case Atom::RP:
1510
7.99k
        case Atom::RT: {
1511
7.99k
          if (ElementInScope(Scope::DefaultScope, Atom::RUBY)) {
1512
1.06k
            GenerateImpliedEndTags({Atom::RTC});
1513
1.06k
          }
1514
7.99k
          AddElement();
1515
7.99k
          break;
1516
1.36k
        }
1517
987
        case Atom::MATH:
1518
2.31k
        case Atom::SVG: {
1519
2.31k
          ReconstructActiveFormattingElements();
1520
2.31k
          if (token_.atom == Atom::MATH) {
1521
987
            AdjustMathMLAttributeNames(&token_.attributes);
1522
1.33k
          } else {
1523
1.33k
            AdjustSVGAttributeNames(&token_.attributes);
1524
1.33k
          }
1525
2.31k
          AdjustForeignAttributes(&token_.attributes);
1526
2.31k
          AddElement();
1527
2.31k
          top()->name_space_ = AtomUtil::ToString(token_.atom);
1528
2.31k
          if (has_self_closing_token_) {
1529
256
            open_elements_stack_.Pop();
1530
256
            AcknowledgeSelfClosingTag();
1531
256
          }
1532
2.31k
          return true;
1533
0
          break;
1534
987
        }
1535
0
        case Atom::CAPTION:
1536
245
        case Atom::COL:
1537
245
        case Atom::COLGROUP:
1538
439
        case Atom::FRAME:
1539
652
        case Atom::HEAD:
1540
846
        case Atom::TBODY:
1541
1.05k
        case Atom::TD:
1542
1.25k
        case Atom::TFOOT:
1543
1.77k
        case Atom::TH:
1544
2.00k
        case Atom::THEAD:
1545
2.20k
        case Atom::TR: {
1546
          // Ignore the token.
1547
2.20k
          break;
1548
2.00k
        }
1549
6.22M
        default:
1550
6.22M
          ReconstructActiveFormattingElements();
1551
6.22M
          AddElement();
1552
10.0M
      }
1553
8.76M
      break;
1554
10.0M
    }
1555
8.76M
    case TokenType::END_TAG_TOKEN: {
1556
193k
      switch (token_.atom) {
1557
9.58k
        case Atom::BODY:
1558
9.58k
          if (ElementInScope(Scope::DefaultScope, Atom::BODY)) {
1559
9.39k
            insertion_mode_ = std::bind(&Parser::AfterBodyIM, this);
1560
9.39k
          }
1561
9.58k
          break;
1562
1.84k
        case Atom::HTML: {
1563
1.84k
          if (ElementInScope(Scope::DefaultScope, Atom::BODY)) {
1564
1.84k
            ParseImpliedToken(TokenType::END_TAG_TOKEN, Atom::BODY,
1565
1.84k
                              AtomUtil::ToString(Atom::BODY));
1566
1.84k
            return false;
1567
1.84k
          }
1568
0
          return true;
1569
0
          break;
1570
1.84k
        }
1571
0
        case Atom::ADDRESS:
1572
0
        case Atom::ARTICLE:
1573
18
        case Atom::ASIDE:
1574
18
        case Atom::BLOCKQUOTE:
1575
213
        case Atom::BUTTON:
1576
213
        case Atom::CENTER:
1577
213
        case Atom::DETAILS:
1578
213
        case Atom::DIALOG:
1579
279
        case Atom::DIR:
1580
474
        case Atom::DIV:
1581
673
        case Atom::DL:
1582
673
        case Atom::FIELDSET:
1583
673
        case Atom::FIGCAPTION:
1584
673
        case Atom::FIGURE:
1585
739
        case Atom::FOOTER:
1586
933
        case Atom::HEADER:
1587
933
        case Atom::HGROUP:
1588
933
        case Atom::LISTING:
1589
936
        case Atom::MAIN:
1590
1.00k
        case Atom::MENU:
1591
1.19k
        case Atom::NAV:
1592
1.39k
        case Atom::OL:
1593
1.59k
        case Atom::PRE:
1594
1.59k
        case Atom::SECTION:
1595
1.59k
        case Atom::SUMMARY:
1596
1.78k
        case Atom::UL: {
1597
1.78k
          PopUntil(Scope::DefaultScope, token_.atom);
1598
1.78k
          break;
1599
1.59k
        }
1600
1.69k
        case Atom::FORM: {
1601
1.69k
          if (open_elements_stack_.Contains(Atom::TEMPLATE)) {
1602
0
            int i = IndexOfElementInScope(Scope::DefaultScope, {Atom::FORM});
1603
0
            if (i == -1) {
1604
              // Ignore the token.
1605
0
              return true;
1606
0
            }
1607
0
            GenerateImpliedEndTags();
1608
0
            if (open_elements_stack_.at(i)->atom_ != Atom::FORM) {
1609
              // Ignore the token.
1610
0
              return true;
1611
0
            }
1612
0
            PopUntil(Scope::DefaultScope, Atom::FORM);
1613
1.69k
          } else {
1614
1.69k
            Node* node = form_;
1615
1.69k
            form_ = nullptr;
1616
1.69k
            int i = IndexOfElementInScope(Scope::DefaultScope, {Atom::FORM});
1617
1.69k
            if (!node || i == -1 || open_elements_stack_.at(i) != node) {
1618
              // Ignore the token.
1619
1.25k
              return true;
1620
1.25k
            }
1621
434
            GenerateImpliedEndTags();
1622
434
            open_elements_stack_.Remove(node);
1623
434
          }
1624
434
          break;
1625
1.69k
        }
1626
1.71k
        case Atom::P: {
1627
1.71k
          if (!ElementInScope(Scope::ButtonScope, Atom::P)) {
1628
1.47k
            ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::P,
1629
1.47k
                              AtomUtil::ToString(Atom::P));
1630
1.47k
          }
1631
1.71k
          PopUntil(Scope::ButtonScope, Atom::P);
1632
1.71k
          break;
1633
1.69k
        }
1634
101k
        case Atom::LI: {
1635
101k
          PopUntil(Scope::ListItemScope, Atom::LI);
1636
101k
          break;
1637
1.69k
        }
1638
211
        case Atom::DD:
1639
409
        case Atom::DT: {
1640
409
          PopUntil(Scope::DefaultScope, token_.atom);
1641
409
          break;
1642
211
        }
1643
201
        case Atom::H1:
1644
401
        case Atom::H2:
1645
475
        case Atom::H3:
1646
692
        case Atom::H4:
1647
1.16k
        case Atom::H5:
1648
1.55k
        case Atom::H6: {
1649
1.55k
          PopUntil(Scope::DefaultScope, Atom::H1, Atom::H2, Atom::H3, Atom::H4,
1650
1.55k
                   Atom::H5, Atom::H6);
1651
1.55k
          break;
1652
1.16k
        }
1653
804
        case Atom::A:
1654
8.17k
        case Atom::B:
1655
8.37k
        case Atom::BIG:
1656
8.43k
        case Atom::CODE:
1657
8.63k
        case Atom::EM:
1658
8.83k
        case Atom::FONT:
1659
44.0k
        case Atom::I:
1660
44.4k
        case Atom::NOBR:
1661
51.3k
        case Atom::S:
1662
51.3k
        case Atom::SMALL:
1663
51.5k
        case Atom::STRIKE:
1664
51.5k
        case Atom::STRONG:
1665
51.7k
        case Atom::TT:
1666
53.1k
        case Atom::U: {
1667
53.1k
          InBodyEndTagFormatting(token_.atom,
1668
53.1k
                                 token_.atom != Atom::UNKNOWN
1669
53.1k
                                     ? AtomUtil::ToString(token_.atom)
1670
53.1k
                                     : token_.data);
1671
53.1k
          break;
1672
51.7k
        }
1673
596
        case Atom::APPLET:
1674
596
        case Atom::MARQUEE:
1675
596
        case Atom::OBJECT: {
1676
596
          if (PopUntil(Scope::DefaultScope, token_.atom)) {
1677
318
            ClearActiveFormattingElements();
1678
318
          }
1679
596
          break;
1680
596
        }
1681
198
        case Atom::BR: {
1682
198
          token_.token_type = TokenType::START_TAG_TOKEN;
1683
198
          return false;
1684
0
          break;
1685
596
        }
1686
0
        case Atom::TEMPLATE: {
1687
0
          return InHeadIM();
1688
0
          break;
1689
596
        }
1690
19.8k
        default:
1691
19.8k
          InBodyEndTagOther(token_.atom, token_.data);
1692
193k
      }
1693
190k
      break;
1694
193k
    }
1695
1.03M
    case TokenType::COMMENT_TOKEN: {
1696
1.03M
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
1697
1.03M
      node->SetManufactured(token_.is_manufactured);
1698
1.03M
      if (record_node_offsets_) {
1699
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
1700
0
      }
1701
1.03M
      node->data_ = token_.data;
1702
1.03M
      AddChild(node);
1703
1.03M
      break;
1704
193k
    }
1705
12.4k
    case TokenType::ERROR_TOKEN: {
1706
12.4k
      if (template_stack_.size() > 0) {
1707
0
        insertion_mode_ = std::bind(&Parser::InTemplateIM, this);
1708
0
        return false;
1709
12.4k
      } else {
1710
168k
        for (Node* n : open_elements_stack_) {
1711
168k
          switch (n->atom_) {
1712
483
            case Atom::DD:
1713
997
            case Atom::LI:
1714
997
            case Atom::OPTGROUP:
1715
997
            case Atom::OPTION:
1716
1.32k
            case Atom::P:
1717
5.52k
            case Atom::RB:
1718
6.13k
            case Atom::RP:
1719
6.41k
            case Atom::RT:
1720
6.61k
            case Atom::RTC:
1721
7.41k
            case Atom::TBODY:
1722
7.73k
            case Atom::TD:
1723
8.03k
            case Atom::TFOOT:
1724
149k
            case Atom::TH:
1725
150k
            case Atom::THEAD:
1726
150k
            case Atom::TR:
1727
156k
            case Atom::BODY:
1728
162k
            case Atom::HTML:
1729
              // Ignore.
1730
162k
              break;
1731
6.79k
            default:
1732
6.79k
              return true;
1733
168k
          }
1734
168k
        }
1735
12.4k
      }
1736
5.63k
      break;
1737
12.4k
    }
1738
5.63k
    default:
1739
321
      break;
1740
11.8M
  }
1741
1742
10.5M
  return true;
1743
11.8M
}  // NOLINT(readability/fn_size)
1744
// Parser::InBodyIM end.
1745
1746
222k
void Parser::InBodyEndTagFormatting(Atom tag_atom, std::string_view tag_name) {
  // This is the "adoption agency" algorithm, described at
  // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
  //
  // tag_atom is matched against the entries of the list of active formatting
  // elements. tag_name is compared with the current node's data_ and is
  // forwarded to InBodyEndTagOther() when no formatting element is found.

  // TODO: this is a fairly literal line-by-line translation of that algorithm.
  // Once the code successfully parses the comprehensive test suite, we should
  // refactor this code to be more idiomatic.

  // Steps 1-2. If the current node has this tag name and is not in the list
  // of active formatting elements, popping it is all that is needed.
  if (auto current = open_elements_stack_.Top();
      current->data_ == tag_name &&
      active_formatting_elements_stack_.Index(current) == -1) {
    open_elements_stack_.Pop();
    return;
  }

  // Steps 3-5. The outer loop runs at most eight times.
  for (int i = 0; i < 8; ++i) {
    // Step 6. Find the formatting element: the last entry with a matching
    // atom that sits after the last scope marker.
    Node* formatting_element = nullptr;
    for (int j = active_formatting_elements_stack_.size() - 1; j >= 0; --j) {
      Node* candidate = active_formatting_elements_stack_.at(j);
      if (candidate->node_type_ == NodeType::SCOPE_MARKER_NODE) {
        break;
      }
      if (candidate->atom_ == tag_atom) {
        formatting_element = candidate;
        break;
      }
    }

    if (!formatting_element) {
      InBodyEndTagOther(tag_atom, tag_name);
      return;
    }

    // Step 7. Ignore the tag if formatting element is not in the stack of open
    // elements.
    int fe_index = open_elements_stack_.Index(formatting_element);
    if (fe_index == -1) {
      active_formatting_elements_stack_.Remove(formatting_element);
      return;
    }

    // Step 8. Ignore the tag if formatting element is not in the scope.
    if (!ElementInScope(Scope::DefaultScope, tag_atom)) {
      // Ignore the tag.
      return;
    }

    // Step 9. This step is omitted because it's just a parse error but no
    // need to return.

    // Steps 10-11. Find the furthest block: the first special element at or
    // above the formatting element's position in the stack of open elements.
    // The stack is not modified inside this loop, so its size is read once.
    Node* furthest_block = nullptr;
    const int open_count = static_cast<int>(open_elements_stack_.size());
    for (int k = fe_index; k < open_count; ++k) {
      Node* open_element = open_elements_stack_.at(k);
      if (open_element->IsSpecialElement()) {
        furthest_block = open_element;
        break;
      }
    }

    if (!furthest_block) {
      // No furthest block: pop up to and including the formatting element and
      // drop it from the list of active formatting elements.
      Node* e = open_elements_stack_.Pop();
      while (e != formatting_element) {
        e = open_elements_stack_.Pop();
      }
      active_formatting_elements_stack_.Remove(e);
      return;
    }

    // Steps 12-13. Find the common ancestor and bookmark node.
    Node* common_ancestor = open_elements_stack_.at(fe_index - 1);
    auto bookmark = active_formatting_elements_stack_.Index(formatting_element);

    // Step 14. The inner loop. Find the last_node to reparent.
    Node* last_node = furthest_block;
    Node* node = furthest_block;
    int x = open_elements_stack_.Index(node);
    // Step 14.1.
    int inner_loop_counter = 0;
    while (true) {
      // Step 14.2.
      inner_loop_counter++;
      // Step 14.3.
      x--;
      node = open_elements_stack_.at(x);
      // Step 14.4. Go to the next step if node is formatting element.
      if (node == formatting_element) break;

      // Position of node in the list of active formatting elements (-1 when
      // absent). It is looked up once and reused by steps 14.5-14.8 instead
      // of rescanning the list for every step.
      const int afe_index = active_formatting_elements_stack_.Index(node);

      // Step 14.5. Remove node from the list of active formatting elements if
      // inner loop counter is greater than three and node is in the list of
      // active formatting elements.
      if (inner_loop_counter > 3 && afe_index > -1) {
        active_formatting_elements_stack_.Remove(node);
        // If any element of the list of active formatting elements is removed,
        // we need to take care whether bookmark should be decremented or not.
        // This is because the value of bookmark may exceed the size of the
        // list by removing elements from the list.
        if (afe_index <= bookmark) {
          bookmark--;
        }
        continue;
      }

      // Step 14.6. Continue the next inner loop if node is not in the list of
      // active formatting elements.
      if (afe_index == -1) {
        open_elements_stack_.Remove(node);
        continue;
      }

      // Step 14.7. Replace node with a clone in both lists.
      Node* clone = document_->CloneNode(node);
      active_formatting_elements_stack_.Replace(afe_index, clone);
      open_elements_stack_.Replace(open_elements_stack_.Index(node), clone);
      node = clone;

      // Step 14.8. The clone now occupies afe_index, so the bookmark goes
      // right after it.
      if (last_node == furthest_block) {
        bookmark = afe_index + 1;
      }
      // Step 14.9.
      if (last_node->Parent()) {
        last_node = last_node->Parent()->RemoveChild(last_node);
      }
      node->AppendChild(last_node);

      // Step 14.10.
      last_node = node;
    }

    // Step 15. Reparent lastNode to the common ancestor,
    // or for misnested table nodes, to the foster parent.
    if (last_node->Parent()) {
      last_node = last_node->Parent()->RemoveChild(last_node);
    }

    switch (common_ancestor->atom_) {
      case Atom::TABLE:
      case Atom::TBODY:
      case Atom::TFOOT:
      case Atom::THEAD:
      case Atom::TR:
        FosterParent(last_node);
        break;
      default:
        common_ancestor->AppendChild(last_node);
    }

    // Steps 16-18. Reparent nodes from the furthest block's children
    // to a clone of the formatting element.
    Node* clone = document_->CloneNode(formatting_element);
    furthest_block->ReparentChildrenTo(clone);
    furthest_block->AppendChild(clone);

    // Step 19. Fix up the list of active formatting elements.
    int old_loc = active_formatting_elements_stack_.Index(formatting_element);
    if (old_loc != -1 && old_loc < bookmark) {
      // Move the bookmark with the rest of the list.
      bookmark--;
    }

    active_formatting_elements_stack_.Remove(formatting_element);
    active_formatting_elements_stack_.Insert(bookmark, clone);

    // Step 20. Fix up the stack of open elements.
    open_elements_stack_.Remove(formatting_element);
    open_elements_stack_.Insert(open_elements_stack_.Index(furthest_block) + 1,
                                clone);
  }
}  // Parser::InBodyEndTagFormatting.
1920
1921
22.8k
void Parser::InBodyEndTagOther(Atom tag_atom, std::string_view tag_name) {
  // Walks the stack of open elements from the top down. The first element
  // with the same tag as the end tag is popped together with everything above
  // it; the walk is abandoned as soon as a special element is met first.
  int depth = open_elements_stack_.size() - 1;
  while (depth >= 0) {
    Node* candidate = open_elements_stack_.at(depth);

    // Two element nodes have the same tag when their Data strings are equal.
    // Common HTML tags carry a unique, non-zero Atom, so comparing atoms is
    // enough for them; uncommon (custom) tags all share the zero Atom and
    // need the string comparison as well.
    //
    // Taken together this is equivalent to (candidate->data_ == tag_name).
    const bool same_atom = candidate->atom_ == tag_atom;
    if (same_atom &&
        (tag_atom != Atom::UNKNOWN || candidate->data_ == tag_name)) {
      // Pop the match and every element above it.
      open_elements_stack_.Pop(open_elements_stack_.size() - depth);
      return;
    }

    if (candidate->IsSpecialElement()) {
      return;
    }
    --depth;
  }
}  // Parser::InBodyEndTagOther.
1940
1941
// Section 12.2.6.4.8.
1942
42.0k
bool Parser::TextIM() {
1943
42.0k
  switch (token_.token_type) {
1944
1.10k
    case TokenType::ERROR_TOKEN:
1945
1.10k
      open_elements_stack_.Pop();
1946
1.10k
      break;
1947
17.3k
    case TokenType::TEXT_TOKEN: {
1948
17.3k
      std::string_view data_view(token_.data);
1949
17.3k
      Node* node = open_elements_stack_.Top();
1950
17.3k
      if ((node->atom_ == Atom::TEXTAREA) && !node->FirstChild()) {
1951
        // Ignore a newline at the start of a <textarea> block.
1952
0
        if (!data_view.empty() && data_view.front() == '\r') {
1953
0
          data_view.remove_prefix(1);
1954
0
        }
1955
0
        if (!data_view.empty() && data_view.front() == '\n') {
1956
0
          data_view.remove_prefix(1);
1957
0
        }
1958
0
      }
1959
17.3k
      if (data_view.empty()) return true;
1960
17.3k
      AddText(data_view.data());
1961
17.3k
      return true;
1962
17.3k
    }
1963
23.6k
    case TokenType::END_TAG_TOKEN:
1964
23.6k
      open_elements_stack_.Pop();
1965
23.6k
      break;
1966
0
    default:
1967
0
      break;
1968
42.0k
  }
1969
24.7k
  insertion_mode_ = original_insertion_mode_;
1970
24.7k
  original_insertion_mode_ = nullptr;
1971
24.7k
  return token_.token_type == TokenType::END_TAG_TOKEN;
1972
42.0k
}  // Parser::TextIM.
1973
1974
// Section 12.2.6.4.9.
1975
4.66M
bool Parser::InTableIM() {
1976
4.66M
  switch (token_.token_type) {
1977
89.9k
    case TokenType::TEXT_TOKEN: {
1978
89.9k
      Strings::ReplaceAny(&token_.data, Strings::kNullChar, "");
1979
89.9k
      switch (open_elements_stack_.Top()->atom_) {
1980
24.1k
        case Atom::TABLE:
1981
27.8k
        case Atom::TBODY:
1982
31.3k
        case Atom::TFOOT:
1983
31.6k
        case Atom::THEAD:
1984
32.6k
        case Atom::TR: {
1985
          // All whitespace including \x00.
1986
32.6k
          if (Strings::IsAllWhitespaceChars(token_.data,
1987
32.6k
                                            Strings::kWhitespaceOrNull)) {
1988
518
            AddText(token_.data);
1989
518
            return true;
1990
518
          }
1991
32.1k
          break;
1992
32.6k
        }
1993
57.2k
        default:
1994
57.2k
          break;
1995
89.9k
      }
1996
89.4k
      break;
1997
89.9k
    }
1998
4.54M
    case TokenType::START_TAG_TOKEN: {
1999
4.54M
      switch (token_.atom) {
2000
0
        case Atom::CAPTION: {
2001
0
          ClearStackToContext(Scope::TableScope);
2002
0
          active_formatting_elements_stack_.Push(scope_marker_);
2003
0
          AddElement();
2004
0
          insertion_mode_ = std::bind(&Parser::InCaptionIM, this);
2005
0
          return true;
2006
0
        }
2007
4.20k
        case Atom::COLGROUP: {
2008
4.20k
          ClearStackToContext(Scope::TableScope);
2009
4.20k
          AddElement();
2010
4.20k
          insertion_mode_ = std::bind(&Parser::InColumnGroupIM, this);
2011
4.20k
          return true;
2012
0
        }
2013
4.20k
        case Atom::COL: {
2014
4.20k
          ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::COLGROUP,
2015
4.20k
                            AtomUtil::ToString(Atom::COLGROUP));
2016
4.20k
          return false;
2017
0
        }
2018
1.17M
        case Atom::TBODY:
2019
1.18M
        case Atom::TFOOT:
2020
1.18M
        case Atom::THEAD: {
2021
1.18M
          ClearStackToContext(Scope::TableScope);
2022
1.18M
          AddElement();
2023
1.18M
          insertion_mode_ = std::bind(&Parser::InTableBodyIM, this);
2024
1.18M
          return true;
2025
1.18M
        }
2026
587k
        case Atom::TD:
2027
1.17M
        case Atom::TH:
2028
1.17M
        case Atom::TR: {
2029
1.17M
          ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::TBODY,
2030
1.17M
                            AtomUtil::ToString(Atom::TBODY));
2031
1.17M
          return false;
2032
1.17M
        }
2033
100k
        case Atom::TABLE: {
2034
100k
          if (PopUntil(Scope::TableScope, Atom::TABLE)) {
2035
100k
            ResetInsertionMode();
2036
100k
            return false;
2037
100k
          }
2038
          // Ignore the token.
2039
0
          return true;
2040
100k
        }
2041
1
        case Atom::STYLE:
2042
1
        case Atom::SCRIPT:
2043
1
        case Atom::TEMPLATE: {
2044
1
          return InHeadIM();
2045
1
        }
2046
0
        case Atom::INPUT: {
2047
0
          for (auto& attr : token_.attributes) {
2048
0
            if (attr.key == "type" &&
2049
0
                Strings::EqualFold(attr.value, "hidden")) {
2050
0
              AddElement();
2051
0
              open_elements_stack_.Pop();
2052
0
              return true;
2053
0
            }
2054
0
          }
2055
0
          break;
2056
          // Otherwise drop down to the default action.
2057
0
        }
2058
740
        case Atom::FORM: {
2059
740
          if (open_elements_stack_.Contains(Atom::TEMPLATE) || form_) {
2060
            // Ignore the token.
2061
208
            return true;
2062
208
          }
2063
532
          AddElement();
2064
532
          form_ = open_elements_stack_.Pop();
2065
532
          break;
2066
740
        }
2067
0
        case Atom::SELECT: {
2068
0
          ReconstructActiveFormattingElements();
2069
0
          switch (top()->atom_) {
2070
0
            case Atom::TABLE:
2071
0
            case Atom::TBODY:
2072
0
            case Atom::TFOOT:
2073
0
            case Atom::THEAD:
2074
0
            case Atom::TR:
2075
0
              foster_parenting_ = true;
2076
0
              break;
2077
0
            default:
2078
              // Ignore remaining tags.
2079
0
              break;
2080
0
          }
2081
0
          AddElement();
2082
0
          foster_parenting_ = false;
2083
0
          frameset_ok_ = false;
2084
0
          insertion_mode_ = std::bind(&Parser::InSelectInTableIM, this);
2085
0
          return true;
2086
0
        }
2087
2.07M
        default:
2088
          // Ignore remaining tags.
2089
2.07M
          break;
2090
4.54M
      }
2091
2.07M
      break;
2092
4.54M
    }
2093
2.07M
    case TokenType::END_TAG_TOKEN: {
2094
29.3k
      switch (token_.atom) {
2095
17.4k
        case Atom::TABLE:
2096
17.4k
          if (PopUntil(Scope::TableScope, Atom::TABLE)) {
2097
17.4k
            ResetInsertionMode();
2098
17.4k
            return true;
2099
17.4k
          }
2100
          // Ignore the token.
2101
0
          return true;
2102
194
        case Atom::BODY:
2103
194
        case Atom::CAPTION:
2104
393
        case Atom::COL:
2105
393
        case Atom::COLGROUP:
2106
600
        case Atom::HTML:
2107
823
        case Atom::TBODY:
2108
1.05k
        case Atom::TD:
2109
4.24k
        case Atom::TFOOT:
2110
4.44k
        case Atom::TH:
2111
4.71k
        case Atom::THEAD:
2112
4.94k
        case Atom::TR:
2113
          // Ignore the token.
2114
4.94k
          return true;
2115
0
        case Atom::TEMPLATE:
2116
0
          return InHeadIM();
2117
6.96k
        default:
2118
          // Ignore.
2119
6.96k
          break;
2120
29.3k
      }
2121
6.96k
      break;
2122
29.3k
    }
2123
6.96k
    case TokenType::COMMENT_TOKEN: {
2124
1.72k
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
2125
1.72k
      node->SetManufactured(token_.is_manufactured);
2126
1.72k
      if (record_node_offsets_) {
2127
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
2128
0
      }
2129
1.72k
      node->data_ = token_.data;
2130
1.72k
      AddChild(node);
2131
1.72k
      return true;
2132
29.3k
    }
2133
0
    case TokenType::DOCTYPE_TOKEN: {
2134
      // Ignore the token.
2135
0
      return true;
2136
29.3k
    }
2137
1.36k
    case TokenType::ERROR_TOKEN: {
2138
1.36k
      return InBodyIM();
2139
29.3k
    }
2140
0
    default:
2141
0
      break;
2142
4.66M
  }
2143
2144
2.17M
  foster_parenting_ = true;
2145
2.17M
  defer(foster_parenting_ = false;);
2146
2.17M
  return InBodyIM();
2147
4.66M
}  // Parser::InTableIM.
2148
2149
// Section 12.2.6.4.11.
2150
0
bool Parser::InCaptionIM() {
2151
0
  switch (token_.token_type) {
2152
0
    case TokenType::START_TAG_TOKEN: {
2153
0
      switch (token_.atom) {
2154
0
        case Atom::CAPTION:
2155
0
        case Atom::COL:
2156
0
        case Atom::COLGROUP:
2157
0
        case Atom::TBODY:
2158
0
        case Atom::TD:
2159
0
        case Atom::TFOOT:
2160
0
        case Atom::THEAD:
2161
0
        case Atom::TR: {
2162
0
          if (PopUntil(Scope::TableScope, Atom::CAPTION)) {
2163
0
            ClearActiveFormattingElements();
2164
0
            insertion_mode_ = std::bind(&Parser::InTableIM, this);
2165
0
            return false;
2166
0
          }
2167
          // Ignore the token.
2168
0
          return true;
2169
0
        }
2170
0
        case Atom::SELECT: {
2171
0
          ReconstructActiveFormattingElements();
2172
0
          AddElement();
2173
0
          frameset_ok_ = false;
2174
0
          insertion_mode_ = std::bind(&Parser::InSelectInTableIM, this);
2175
0
          return true;
2176
0
        }
2177
0
        default:
2178
0
          break;
2179
0
      }
2180
0
      break;
2181
0
    }
2182
0
    case TokenType::END_TAG_TOKEN: {
2183
0
      switch (token_.atom) {
2184
0
        case Atom::CAPTION: {
2185
0
          if (PopUntil(Scope::TableScope, Atom::CAPTION)) {
2186
0
            ClearActiveFormattingElements();
2187
0
            insertion_mode_ = std::bind(&Parser::InTableIM, this);
2188
0
          }
2189
0
          return true;
2190
0
        }
2191
0
        case Atom::TABLE: {
2192
0
          if (PopUntil(Scope::TableScope, Atom::CAPTION)) {
2193
0
            ClearActiveFormattingElements();
2194
0
            insertion_mode_ = std::bind(&Parser::InTableIM, this);
2195
0
            return false;
2196
0
          }
2197
          // Ignore the token.
2198
0
          return true;
2199
0
        }
2200
0
        case Atom::BODY:
2201
0
        case Atom::COL:
2202
0
        case Atom::COLGROUP:
2203
0
        case Atom::HTML:
2204
0
        case Atom::TBODY:
2205
0
        case Atom::TD:
2206
0
        case Atom::TFOOT:
2207
0
        case Atom::TH:
2208
0
        case Atom::THEAD:
2209
0
        case Atom::TR: {
2210
          // Ignore the token.
2211
0
          return true;
2212
0
        }
2213
0
        default:
2214
0
          break;
2215
0
      }
2216
0
      break;
2217
0
    }
2218
0
    default:
2219
0
      break;
2220
0
  }
2221
2222
0
  return InBodyIM();
2223
0
}  // Parser::InCaptionIM.
2224
2225
// Section 12.2.6.4.12.
2226
10.9k
bool Parser::InColumnGroupIM() {
2227
10.9k
  switch (token_.token_type) {
2228
2.99k
    case TokenType::TEXT_TOKEN: {
2229
2.99k
      std::string s = token_.data;
2230
2.99k
      Strings::TrimLeft(&s);
2231
2.99k
      if (s.size() < token_.data.size()) {
2232
        // Add the initial whitespace to the current node.
2233
2.00k
        AddText(token_.data.substr(0, token_.data.size() - s.size()));
2234
2.00k
        if (s.empty()) return true;
2235
1.10k
        token_.data = s;
2236
1.10k
      }
2237
2.09k
      break;
2238
2.99k
    }
2239
2.09k
    case TokenType::COMMENT_TOKEN: {
2240
942
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
2241
942
      node->SetManufactured(token_.is_manufactured);
2242
942
      if (record_node_offsets_) {
2243
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
2244
0
      }
2245
942
      node->data_ = token_.data;
2246
942
      AddChild(node);
2247
942
      return true;
2248
2.99k
    }
2249
0
    case TokenType::DOCTYPE_TOKEN: {
2250
      // Ignore the token.
2251
0
      return true;
2252
2.99k
    }
2253
6.52k
    case TokenType::START_TAG_TOKEN: {
2254
6.52k
      switch (token_.atom) {
2255
66
        case Atom::HTML: {
2256
66
          return InBodyIM();
2257
0
        }
2258
4.66k
        case Atom::COL: {
2259
4.66k
          AddElement();
2260
4.66k
          open_elements_stack_.Pop();
2261
4.66k
          AcknowledgeSelfClosingTag();
2262
4.66k
          return true;
2263
0
        }
2264
0
        case Atom::TEMPLATE: {
2265
0
          return InHeadIM();
2266
0
        }
2267
1.78k
        default:
2268
1.78k
          break;
2269
6.52k
      }
2270
1.78k
      break;
2271
6.52k
    }
2272
1.78k
    case TokenType::END_TAG_TOKEN: {
2273
424
      switch (token_.atom) {
2274
0
        case Atom::COLGROUP:
2275
0
          if (open_elements_stack_.Top()->atom_ == Atom::COLGROUP) {
2276
0
            open_elements_stack_.Pop();
2277
0
            insertion_mode_ = std::bind(&Parser::InTableIM, this);
2278
0
          }
2279
0
          return true;
2280
208
        case Atom::COL:
2281
          // Ignore the token.
2282
208
          return true;
2283
0
        case Atom::TEMPLATE:
2284
0
          return InHeadIM();
2285
216
        default:
2286
216
          break;
2287
424
      }
2288
216
      break;
2289
424
    }
2290
216
    case TokenType::ERROR_TOKEN: {
2291
101
      return InBodyIM();
2292
424
    }
2293
0
    default:
2294
0
      break;
2295
10.9k
  }
2296
2297
4.10k
  if (open_elements_stack_.Top()->atom_ != Atom::COLGROUP) {
2298
0
    return true;
2299
0
  }
2300
4.10k
  open_elements_stack_.Pop();
2301
4.10k
  insertion_mode_ = std::bind(&Parser::InTableIM, this);
2302
4.10k
  return false;
2303
4.10k
}  // Parser::InColumnGroupIM.
2304
2305
// Section 12.2.6.4.13.
2306
2.47M
bool Parser::InTableBodyIM() {
2307
2.47M
  switch (token_.token_type) {
2308
2.43M
    case TokenType::START_TAG_TOKEN: {
2309
2.43M
      switch (token_.atom) {
2310
1.17M
        case Atom::TR: {
2311
1.17M
          ClearStackToContext(Scope::TableBodyScope);
2312
1.17M
          AddElement();
2313
1.17M
          insertion_mode_ = std::bind(&Parser::InRowIM, this);
2314
1.17M
          return true;
2315
0
        }
2316
588k
        case Atom::TD:
2317
1.17M
        case Atom::TH: {
2318
1.17M
          ParseImpliedToken(TokenType::START_TAG_TOKEN, Atom::TR,
2319
1.17M
                            AtomUtil::ToString(Atom::TR));
2320
1.17M
          return false;
2321
588k
        }
2322
0
        case Atom::CAPTION:
2323
1.86k
        case Atom::COL:
2324
1.86k
        case Atom::COLGROUP:
2325
2.35k
        case Atom::TBODY:
2326
3.27k
        case Atom::TFOOT:
2327
5.06k
        case Atom::THEAD: {
2328
5.06k
          if (PopUntil(Scope::TableScope, Atom::TBODY, Atom::THEAD,
2329
5.06k
                       Atom::TFOOT)) {
2330
5.06k
            insertion_mode_ = std::bind(&Parser::InTableIM, this);
2331
5.06k
            return false;
2332
5.06k
          }
2333
          // Ignore the token.
2334
0
          return true;
2335
5.06k
        }
2336
76.7k
        default:
2337
76.7k
          break;
2338
2.43M
      }
2339
76.7k
      break;
2340
2.43M
    }
2341
76.7k
    case TokenType::END_TAG_TOKEN: {
2342
26.2k
      switch (token_.atom) {
2343
349
        case Atom::TBODY:
2344
3.95k
        case Atom::TFOOT:
2345
4.24k
        case Atom::THEAD: {
2346
4.24k
          if (ElementInScope(Scope::TableScope, token_.atom)) {
2347
3.99k
            ClearStackToContext(Scope::TableBodyScope);
2348
3.99k
            open_elements_stack_.Pop();
2349
3.99k
            insertion_mode_ = std::bind(&Parser::InTableIM, this);
2350
3.99k
          }
2351
4.24k
          return true;
2352
3.95k
        }
2353
16.3k
        case Atom::TABLE: {
2354
16.3k
          if (PopUntil(Scope::TableScope, Atom::TBODY, Atom::THEAD,
2355
16.3k
                       Atom::TFOOT)) {
2356
16.3k
            insertion_mode_ = std::bind(&Parser::InTableIM, this);
2357
16.3k
            return false;
2358
16.3k
          }
2359
          // Ignore the token.
2360
0
          return true;
2361
16.3k
        }
2362
3.80k
        case Atom::BODY:
2363
3.80k
        case Atom::CAPTION:
2364
3.87k
        case Atom::COL:
2365
3.87k
        case Atom::COLGROUP:
2366
4.23k
        case Atom::HTML:
2367
4.43k
        case Atom::TD:
2368
4.62k
        case Atom::TH:
2369
4.87k
        case Atom::TR: {
2370
          // Ignore the token.
2371
4.87k
          return true;
2372
4.62k
        }
2373
727
        default:
2374
727
          break;
2375
26.2k
      }
2376
727
      break;
2377
26.2k
    }
2378
727
    case TokenType::COMMENT_TOKEN: {
2379
712
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
2380
712
      node->SetManufactured(token_.is_manufactured);
2381
712
      if (record_node_offsets_) {
2382
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
2383
0
      }
2384
712
      node->data_ = token_.data;
2385
712
      AddChild(node);
2386
712
      return true;
2387
26.2k
    }
2388
8.10k
    default:
2389
8.10k
      break;
2390
2.47M
  }
2391
2392
85.6k
  return InTableIM();
2393
2.47M
}  // Parser::InTableBodyIM.
2394
2395
// Section 12.2.6.4.14.
2396
2.78M
bool Parser::InRowIM() {
2397
2.78M
  switch (token_.token_type) {
2398
2.74M
    case TokenType::START_TAG_TOKEN: {
2399
2.74M
      switch (token_.atom) {
2400
599k
        case Atom::TD:
2401
1.23M
        case Atom::TH: {
2402
1.23M
          ClearStackToContext(Scope::TableRowScope);
2403
1.23M
          AddElement();
2404
1.23M
          active_formatting_elements_stack_.Push(scope_marker_);
2405
1.23M
          insertion_mode_ = std::bind(&Parser::InCellIM, this);
2406
1.23M
          return true;
2407
599k
        }
2408
0
        case Atom::CAPTION:
2409
1.86k
        case Atom::COL:
2410
1.86k
        case Atom::COLGROUP:
2411
2.07k
        case Atom::TBODY:
2412
2.60k
        case Atom::TFOOT:
2413
3.72k
        case Atom::THEAD:
2414
5.24k
        case Atom::TR: {
2415
5.24k
          if (PopUntil(Scope::TableScope, Atom::TR)) {
2416
5.24k
            insertion_mode_ = std::bind(&Parser::InTableBodyIM, this);
2417
5.24k
            return false;
2418
5.24k
          }
2419
          // Ignore the token.
2420
0
          return true;
2421
0
          break;
2422
5.24k
        }
2423
1.50M
        default:
2424
1.50M
          break;
2425
2.74M
      }
2426
1.50M
      break;
2427
2.74M
    }
2428
1.50M
    case TokenType::END_TAG_TOKEN: {
2429
24.6k
      switch (token_.atom) {
2430
1.16k
        case Atom::TR: {
2431
1.16k
          if (PopUntil(Scope::TableScope, Atom::TR)) {
2432
1.16k
            insertion_mode_ = std::bind(&Parser::InTableBodyIM, this);
2433
1.16k
          }
2434
          // Ignore the token.
2435
1.16k
          return true;
2436
0
        }
2437
16.3k
        case Atom::TABLE: {
2438
16.3k
          if (PopUntil(Scope::TableScope, Atom::TR)) {
2439
16.3k
            insertion_mode_ = std::bind(&Parser::InTableBodyIM, this);
2440
16.3k
            return false;
2441
16.3k
          }
2442
          // Ignore the token.
2443
0
          return true;
2444
16.3k
        }
2445
343
        case Atom::TBODY:
2446
549
        case Atom::TFOOT:
2447
821
        case Atom::THEAD: {
2448
821
          if (ElementInScope(Scope::TableScope, token_.atom)) {
2449
547
            ParseImpliedToken(TokenType::END_TAG_TOKEN, Atom::TR,
2450
547
                              AtomUtil::ToString(Atom::TR));
2451
547
            return false;
2452
547
          }
2453
          // Ignore the token.
2454
274
          return true;
2455
821
        }
2456
194
        case Atom::BODY:
2457
194
        case Atom::CAPTION:
2458
538
        case Atom::COL:
2459
538
        case Atom::COLGROUP:
2460
739
        case Atom::HTML:
2461
1.30k
        case Atom::TD:
2462
1.50k
        case Atom::TH: {
2463
          // Ignore the token.
2464
1.50k
          return true;
2465
1.30k
        }
2466
4.80k
        default:
2467
4.80k
          break;
2468
24.6k
      }
2469
4.80k
      break;
2470
24.6k
    }
2471
19.3k
    default:
2472
19.3k
      break;
2473
2.78M
  }
2474
2475
1.52M
  return InTableIM();
2476
2.78M
}  // Parser::InRowIM.
2477
2478
// Section 12.2.6.4.15.
2479
2.46M
bool Parser::InCellIM() {
2480
2.46M
  switch (token_.token_type) {
2481
2.41M
    case TokenType::START_TAG_TOKEN: {
2482
2.41M
      switch (token_.atom) {
2483
0
        case Atom::CAPTION:
2484
1.68k
        case Atom::COL:
2485
1.68k
        case Atom::COLGROUP:
2486
1.89k
        case Atom::TBODY:
2487
11.9k
        case Atom::TD:
2488
12.4k
        case Atom::TFOOT:
2489
56.9k
        case Atom::TH:
2490
57.9k
        case Atom::THEAD:
2491
58.1k
        case Atom::TR: {
2492
58.1k
          if (PopUntil(Scope::TableScope, Atom::TD, Atom::TH)) {
2493
            // Close the cell and reprocess.
2494
58.1k
            ClearActiveFormattingElements();
2495
58.1k
            insertion_mode_ = std::bind(&Parser::InRowIM, this);
2496
58.1k
            return false;
2497
58.1k
          }
2498
          // Ignore the token.
2499
0
          return true;
2500
58.1k
        }
2501
0
        case Atom::SELECT: {
2502
0
          ReconstructActiveFormattingElements();
2503
0
          AddElement();
2504
0
          frameset_ok_ = false;
2505
0
          insertion_mode_ = std::bind(&Parser::InSelectInTableIM, this);
2506
0
          return true;
2507
58.1k
        }
2508
2.35M
        default:
2509
2.35M
          break;
2510
2.41M
      }
2511
2.35M
      break;
2512
2.41M
    }
2513
2.35M
    case TokenType::END_TAG_TOKEN: {
2514
20.2k
      switch (token_.atom) {
2515
379
        case Atom::TD:
2516
957
        case Atom::TH: {
2517
957
          if (!PopUntil(Scope::TableScope, token_.atom)) {
2518
            // Ignore the token.
2519
278
            return true;
2520
278
          }
2521
679
          ClearActiveFormattingElements();
2522
679
          insertion_mode_ = std::bind(&Parser::InRowIM, this);
2523
679
          return true;
2524
957
        }
2525
194
        case Atom::BODY:
2526
194
        case Atom::CAPTION:
2527
388
        case Atom::COL:
2528
388
        case Atom::COLGROUP:
2529
454
        case Atom::HTML: {
2530
          // Ignore the token.
2531
454
          return true;
2532
388
        }
2533
16.3k
        case Atom::TABLE:
2534
16.6k
        case Atom::TBODY:
2535
17.0k
        case Atom::TFOOT:
2536
17.2k
        case Atom::THEAD:
2537
17.6k
        case Atom::TR: {
2538
17.6k
          if (!ElementInScope(Scope::TableScope, token_.atom)) {
2539
            // Ignore the token.
2540
451
            return true;
2541
451
          }
2542
          // Close the cell and reprocess.
2543
17.2k
          if (PopUntil(Scope::TableScope, Atom::TD, Atom::TH)) {
2544
17.2k
            ClearActiveFormattingElements();
2545
17.2k
          }
2546
17.2k
          insertion_mode_ = std::bind(&Parser::InRowIM, this);
2547
17.2k
          return false;
2548
17.6k
        }
2549
1.15k
        default:
2550
1.15k
          break;
2551
20.2k
      }
2552
1.15k
      break;
2553
20.2k
    }
2554
26.5k
    default:
2555
26.5k
      break;
2556
2.46M
  }
2557
2.38M
  return InBodyIM();
2558
2.46M
}  // Parser::InCellIM.
2559
2560
// Section 12.2.6.4.16.
2561
0
bool Parser::InSelectIM() {
2562
0
  switch (token_.token_type) {
2563
0
    case TokenType::TEXT_TOKEN: {
2564
0
      Strings::ReplaceAny(&token_.data, Strings::kNullChar, "");
2565
0
      AddText(token_.data);
2566
0
      break;
2567
0
    }
2568
0
    case TokenType::START_TAG_TOKEN: {
2569
0
      switch (token_.atom) {
2570
0
        case Atom::HTML: {
2571
0
          return InBodyIM();
2572
0
        }
2573
0
        case Atom::OPTION: {
2574
0
          if (top()->atom_ == Atom::OPTION) {
2575
0
            open_elements_stack_.Pop();
2576
0
          }
2577
0
          AddElement();
2578
0
          break;
2579
0
        }
2580
0
        case Atom::OPTGROUP: {
2581
0
          if (top()->atom_ == Atom::OPTION) {
2582
0
            open_elements_stack_.Pop();
2583
0
          }
2584
0
          if (top()->atom_ == Atom::OPTGROUP) {
2585
0
            open_elements_stack_.Pop();
2586
0
          }
2587
0
          AddElement();
2588
0
          break;
2589
0
        }
2590
0
        case Atom::SELECT: {
2591
0
          if (PopUntil(Scope::SelectScope, Atom::SELECT)) {
2592
0
            ResetInsertionMode();
2593
0
          }
2594
          // Ignore the token.
2595
0
          return true;
2596
0
        }
2597
0
        case Atom::INPUT:
2598
0
        case Atom::KEYGEN:
2599
0
        case Atom::TEXTAREA: {
2600
0
          if (ElementInScope(Scope::SelectScope, Atom::SELECT)) {
2601
0
            ParseImpliedToken(TokenType::END_TAG_TOKEN, Atom::SELECT,
2602
0
                              AtomUtil::ToString(Atom::SELECT));
2603
0
            return false;
2604
0
          }
2605
          // In order to properly ignore <textarea>, we need to change the
2606
          // tokenizer mode.
2607
0
          tokenizer_->NextIsNotRawText();
2608
          // Ignore the token.
2609
0
          return true;
2610
0
        }
2611
0
        case Atom::SCRIPT:
2612
0
        case Atom::TEMPLATE: {
2613
0
          return InHeadIM();
2614
0
        }
2615
0
        case Atom::IFRAME:
2616
0
        case Atom::NOEMBED:
2617
0
        case Atom::NOFRAMES:
2618
0
        case Atom::NOSCRIPT:
2619
0
        case Atom::PLAINTEXT:
2620
0
        case Atom::STYLE:
2621
0
        case Atom::TITLE:
2622
0
        case Atom::XMP: {
2623
          // Don't let the tokenizer go into raw text mode when there are raw
2624
          // tags to be ignored. These tags should be ignored from the tokenizer
2625
          // properly.
2626
0
          tokenizer_->NextIsNotRawText();
2627
          // Ignore the token.
2628
0
          return true;
2629
0
        }
2630
0
        default:
2631
0
          break;
2632
0
      }
2633
0
      break;
2634
0
    }
2635
0
    case TokenType::END_TAG_TOKEN: {
2636
0
      switch (token_.atom) {
2637
0
        case Atom::OPTION: {
2638
0
          if (top()->atom_ == Atom::OPTION) {
2639
0
            open_elements_stack_.Pop();
2640
0
          }
2641
0
          break;
2642
0
        }
2643
0
        case Atom::OPTGROUP: {
2644
0
          int i = open_elements_stack_.size() - 1;
2645
0
          Node* node = open_elements_stack_.at(i);
2646
0
          if (node && node->atom_ == Atom::OPTION) {
2647
0
            i--;
2648
0
          }
2649
0
          node = open_elements_stack_.at(i);
2650
0
          if (node && node->atom_ == Atom::OPTGROUP) {
2651
0
            open_elements_stack_.Pop(open_elements_stack_.size() - i);
2652
0
          }
2653
0
          break;
2654
0
        }
2655
0
        case Atom::SELECT: {
2656
0
          if (!PopUntil(Scope::SelectScope, Atom::SELECT)) {
2657
            // Ignore the token.
2658
0
            return true;
2659
0
          }
2660
0
          ResetInsertionMode();
2661
0
          break;
2662
0
        }
2663
0
        case Atom::TEMPLATE: {
2664
0
          return InHeadIM();
2665
0
        }
2666
0
        default:
2667
0
          break;
2668
0
      }
2669
0
      break;
2670
0
    }
2671
0
    case TokenType::COMMENT_TOKEN: {
2672
0
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
2673
0
      node->SetManufactured(token_.is_manufactured);
2674
0
      if (record_node_offsets_) {
2675
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
2676
0
      }
2677
0
      node->data_ = token_.data;
2678
0
      AddChild(node);
2679
0
      break;
2680
0
    }
2681
0
    case TokenType::DOCTYPE_TOKEN: {
2682
      // Ignore the token.
2683
0
      return true;
2684
0
    }
2685
0
    case TokenType::ERROR_TOKEN: {
2686
0
      return InBodyIM();
2687
0
    }
2688
0
    default:
2689
0
      break;
2690
0
  }
2691
2692
0
  return true;
2693
0
}  // Parser::InSelectIM.
2694
2695
// Section 12.2.6.4.17.
2696
0
bool Parser::InSelectInTableIM() {
2697
0
  switch (token_.token_type) {
2698
0
    case TokenType::START_TAG_TOKEN:
2699
0
    case TokenType::END_TAG_TOKEN: {
2700
0
      switch (token_.atom) {
2701
0
        case Atom::CAPTION:
2702
0
        case Atom::TABLE:
2703
0
        case Atom::TBODY:
2704
0
        case Atom::TFOOT:
2705
0
        case Atom::THEAD:
2706
0
        case Atom::TR:
2707
0
        case Atom::TD:
2708
0
        case Atom::TH: {
2709
0
          if (token_.token_type == TokenType::END_TAG_TOKEN &&
2710
0
              !ElementInScope(Scope::TableScope, token_.atom)) {
2711
            // Ignore the token.
2712
0
            return true;
2713
0
          }
2714
          // This is like p.popUntil(selectScope, a.Select), but it also
2715
          // matches <math select>, not just <select>. Matching the MathML
2716
          // tag is arguably incorrect (conceptually), but it mimics what
2717
          // Chromium does.
2718
0
          for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
2719
0
            if (open_elements_stack_.at(i)->atom_ == Atom::SELECT) {
2720
0
              open_elements_stack_.Pop(open_elements_stack_.size() - i);
2721
0
              break;
2722
0
            }
2723
0
          }
2724
0
          ResetInsertionMode();
2725
0
          return false;
2726
0
        }
2727
0
        default:
2728
0
          break;
2729
0
      }
2730
0
      break;
2731
0
    }
2732
0
    default:
2733
0
      break;
2734
0
  }
2735
2736
0
  return InSelectIM();
2737
0
}  // Parser::InSelectInTableIM.
2738
2739
// Section 12.2.6.4.18.
2740
0
bool Parser::InTemplateIM() {
2741
0
  switch (token_.token_type) {
2742
0
    case TokenType::TEXT_TOKEN:
2743
0
    case TokenType::COMMENT_TOKEN:
2744
0
    case TokenType::DOCTYPE_TOKEN:
2745
0
      return InBodyIM();
2746
0
    case TokenType::START_TAG_TOKEN: {
2747
0
      switch (token_.atom) {
2748
0
        case Atom::BASE:
2749
0
        case Atom::BASEFONT:
2750
0
        case Atom::BGSOUND:
2751
0
        case Atom::LINK:
2752
0
        case Atom::META:
2753
0
        case Atom::NOFRAMES:
2754
0
        case Atom::SCRIPT:
2755
0
        case Atom::STYLE:
2756
0
        case Atom::TEMPLATE:
2757
0
        case Atom::TITLE:
2758
0
          return InHeadIM();
2759
0
        case Atom::CAPTION:
2760
0
        case Atom::COLGROUP:
2761
0
        case Atom::TBODY:
2762
0
        case Atom::TFOOT:
2763
0
        case Atom::THEAD: {
2764
0
          template_stack_.pop_back();
2765
0
          template_stack_.push_back(std::bind(&Parser::InTableIM, this));
2766
0
          insertion_mode_ = std::bind(&Parser::InTableIM, this);
2767
0
          return false;
2768
0
        }
2769
0
        case Atom::COL: {
2770
0
          template_stack_.pop_back();
2771
0
          template_stack_.push_back(std::bind(&Parser::InColumnGroupIM, this));
2772
0
          insertion_mode_ = std::bind(&Parser::InColumnGroupIM, this);
2773
0
          return false;
2774
0
        }
2775
0
        case Atom::TR: {
2776
0
          template_stack_.pop_back();
2777
0
          template_stack_.push_back(std::bind(&Parser::InTableBodyIM, this));
2778
0
          insertion_mode_ = std::bind(&Parser::InTableBodyIM, this);
2779
0
          return false;
2780
0
        }
2781
0
        case Atom::TD:
2782
0
        case Atom::TH: {
2783
0
          template_stack_.pop_back();
2784
0
          template_stack_.push_back(std::bind(&Parser::InRowIM, this));
2785
0
          insertion_mode_ = std::bind(&Parser::InRowIM, this);
2786
0
          return false;
2787
0
        }
2788
0
        default:
2789
0
          template_stack_.pop_back();
2790
0
          template_stack_.push_back(std::bind(&Parser::InBodyIM, this));
2791
0
          insertion_mode_ = std::bind(&Parser::InBodyIM, this);
2792
0
          return false;
2793
0
      }
2794
0
    }
2795
0
    case TokenType::END_TAG_TOKEN: {
2796
0
      switch (token_.atom) {
2797
0
        case Atom::TEMPLATE:
2798
0
          return InHeadIM();
2799
0
        default:
2800
          // Ignore the token.
2801
0
          return true;
2802
0
      }
2803
0
    }
2804
0
    case TokenType::ERROR_TOKEN: {
2805
0
      if (!open_elements_stack_.Contains(Atom::TEMPLATE)) {
2806
        // Ignore the token.
2807
0
        return true;
2808
0
      }
2809
      // TODO: remove this divergence from the HTML5 spec.
2810
      //
2811
      // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
2812
0
      GenerateImpliedEndTags();
2813
0
      for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
2814
0
        Node* node = open_elements_stack_.at(i);
2815
0
        if (node->name_space_.empty() && node->atom_ == Atom::TEMPLATE) {
2816
0
          open_elements_stack_.Pop(open_elements_stack_.size() - i);
2817
0
          break;
2818
0
        }
2819
0
      }
2820
0
      ClearActiveFormattingElements();
2821
0
      template_stack_.pop_back();
2822
0
      ResetInsertionMode();
2823
0
      return false;
2824
0
    }
2825
0
    default:
2826
0
      break;
2827
0
  }
2828
0
  return false;
2829
0
}  // Parser::InTemplateIM.
2830
2831
// Section 12.2.6.4.19.
2832
19.4k
bool Parser::AfterBodyIM() {
2833
19.4k
  switch (token_.token_type) {
2834
24
    case TokenType::ERROR_TOKEN:
2835
      // Stop parsing.
2836
24
      return true;
2837
7.07k
    case TokenType::TEXT_TOKEN:
2838
      // https://www.w3.org/TR/2011/WD-html5-20110113/tokenization.html#parsing-main-afterbody
2839
7.07k
      if (token_.data.find_first_not_of(Strings::kWhitespace) ==
2840
7.07k
          std::string::npos) {
2841
        // It was all whitesapce.
2842
3.60k
        return InBodyIM();
2843
3.60k
      }
2844
3.47k
      break;
2845
3.47k
    case TokenType::START_TAG_TOKEN:
2846
1.81k
      if (token_.atom == Atom::HTML) {
2847
0
        return InBodyIM();
2848
0
      }
2849
1.81k
      break;
2850
4.08k
    case TokenType::END_TAG_TOKEN:
2851
4.08k
      if (token_.atom == Atom::HTML) {
2852
1.84k
        if (!fragment_) {
2853
1.84k
          insertion_mode_ = std::bind(&Parser::AfterAfterBodyIM, this);
2854
1.84k
        }
2855
1.84k
        return true;
2856
1.84k
      }
2857
2.24k
      break;
2858
6.43k
    case TokenType::COMMENT_TOKEN: {
2859
      // The comment is attached to the <html> element.
2860
6.43k
      CHECK(open_elements_stack_.size() > 0 &&
2861
6.43k
            open_elements_stack_.at(0)->atom_ == Atom::HTML)
2862
6.43k
          << "html: bad parser state: <html> element not found, in the "
2863
6.43k
             "after-body insertion mode";
2864
6.43k
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
2865
6.43k
      node->SetManufactured(token_.is_manufactured);
2866
6.43k
      if (record_node_offsets_) {
2867
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
2868
0
      }
2869
6.43k
      node->data_ = token_.data;
2870
6.43k
      open_elements_stack_.at(0)->AppendChild(node);
2871
6.43k
      return true;
2872
4.08k
    }
2873
0
    default:
2874
0
      break;
2875
19.4k
  }
2876
2877
7.52k
  insertion_mode_ = std::bind(&Parser::InBodyIM, this);
2878
7.52k
  return false;
2879
19.4k
}  // Parser::AfterBodyIM.
2880
2881
// Section 12.2.6.4.20.
2882
0
bool Parser::InFramesetIM() {
2883
0
  switch (token_.token_type) {
2884
0
    case TokenType::COMMENT_TOKEN: {
2885
0
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
2886
0
      node->SetManufactured(token_.is_manufactured);
2887
0
      if (record_node_offsets_) {
2888
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
2889
0
      }
2890
0
      node->data_ = token_.data;
2891
0
      AddChild(node);
2892
0
      break;
2893
0
    }
2894
0
    case TokenType::TEXT_TOKEN: {
2895
0
      std::string whitespace_only = ExtractWhitespace(token_.data);
2896
0
      if (!whitespace_only.empty()) AddText(whitespace_only);
2897
0
      break;
2898
0
    }
2899
0
    case TokenType::START_TAG_TOKEN:
2900
0
      switch (token_.atom) {
2901
0
        case Atom::HTML:
2902
0
          return InBodyIM();
2903
0
        case Atom::FRAMESET:
2904
0
          AddElement();
2905
0
          break;
2906
0
        case Atom::FRAME:
2907
0
          AddElement();
2908
0
          open_elements_stack_.Pop();
2909
0
          AcknowledgeSelfClosingTag();
2910
0
          break;
2911
0
        case Atom::NOFRAMES:
2912
0
          return InHeadIM();
2913
0
        default:
2914
0
          break;
2915
0
      }
2916
0
      break;
2917
0
    case TokenType::END_TAG_TOKEN:
2918
0
      switch (token_.atom) {
2919
0
        case Atom::FRAMESET:
2920
0
          if (open_elements_stack_.Top()->atom_ != Atom::HTML) {
2921
0
            open_elements_stack_.Pop();
2922
0
            if (open_elements_stack_.Top()->atom_ != Atom::FRAMESET) {
2923
0
              insertion_mode_ = std::bind(&Parser::AfterFramesetIM, this);
2924
0
              return true;
2925
0
            }
2926
0
          }
2927
0
          break;
2928
0
        default:
2929
0
          break;
2930
0
      }
2931
0
      break;
2932
0
    default:
2933
      // Ignore the token.
2934
0
      break;
2935
0
  }
2936
0
  return true;
2937
0
}  // Parser::InFramesetIM.
2938
2939
// Section 12.2.6.4.21.
2940
0
bool Parser::AfterFramesetIM() {
2941
0
  switch (token_.token_type) {
2942
0
    case TokenType::COMMENT_TOKEN: {
2943
0
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
2944
0
      node->SetManufactured(token_.is_manufactured);
2945
0
      if (record_node_offsets_) {
2946
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
2947
0
      }
2948
0
      node->data_ = token_.data;
2949
0
      AddChild(node);
2950
0
      break;
2951
0
    }
2952
0
    case TokenType::TEXT_TOKEN: {
2953
0
      std::string whitespace_only = ExtractWhitespace(token_.data);
2954
0
      if (!whitespace_only.empty()) AddText(whitespace_only);
2955
0
      break;
2956
0
    }
2957
0
    case TokenType::START_TAG_TOKEN:
2958
0
      switch (token_.atom) {
2959
0
        case Atom::HTML:
2960
0
          return InBodyIM();
2961
0
        case Atom::NOFRAMES:
2962
0
          return InHeadIM();
2963
0
        default:
2964
0
          break;
2965
0
      }
2966
0
      break;
2967
0
    case TokenType::END_TAG_TOKEN:
2968
0
      switch (token_.atom) {
2969
0
        case Atom::HTML:
2970
0
          insertion_mode_ = std::bind(&Parser::AfterAfterFramesetIM, this);
2971
0
          return true;
2972
0
        default:
2973
0
          break;
2974
0
      }
2975
0
      break;
2976
0
    default:
2977
      // Ignore the token.
2978
0
      break;
2979
0
  }
2980
0
  return true;
2981
0
}  // Parser::AfterFramesetIM.
2982
2983
// Section 12.2.6.4.22.
2984
4.06k
bool Parser::AfterAfterBodyIM() {
2985
4.06k
  switch (token_.token_type) {
2986
70
    case TokenType::ERROR_TOKEN:
2987
      // Stop parsing.
2988
70
      return true;
2989
1.21k
    case TokenType::TEXT_TOKEN: {
2990
1.21k
      if (token_.data.find_first_not_of(Strings::kWhitespace) ==
2991
1.21k
          std::string::npos) {
2992
625
        return InBodyIM();
2993
625
      }
2994
589
      break;
2995
1.21k
    }
2996
1.00k
    case TokenType::START_TAG_TOKEN:
2997
1.00k
      if (token_.atom == Atom::HTML) {
2998
796
        return InBodyIM();
2999
796
      }
3000
208
      break;
3001
804
    case TokenType::COMMENT_TOKEN: {
3002
804
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
3003
804
      node->SetManufactured(token_.is_manufactured);
3004
804
      if (record_node_offsets_) {
3005
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
3006
0
      }
3007
804
      node->data_ = token_.data;
3008
804
      document_->root_node_->AppendChild(node);
3009
804
      return true;
3010
1.00k
    }
3011
0
    case TokenType::DOCTYPE_TOKEN:
3012
0
      return InBodyIM();
3013
975
    default:
3014
975
      break;
3015
4.06k
  }
3016
3017
1.77k
  insertion_mode_ = std::bind(&Parser::InBodyIM, this);
3018
1.77k
  return false;
3019
4.06k
}  // Parser::AfterAfterBodyIM.
3020
3021
// Section 12.2.6.4.23.
3022
0
bool Parser::AfterAfterFramesetIM() {
3023
0
  switch (token_.token_type) {
3024
0
    case TokenType::COMMENT_TOKEN: {
3025
0
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
3026
0
      node->SetManufactured(token_.is_manufactured);
3027
0
      if (record_node_offsets_) {
3028
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
3029
0
      }
3030
0
      node->data_ = token_.data;
3031
0
      document_->root_node_->AppendChild(node);
3032
0
      break;
3033
0
    }
3034
0
    case TokenType::TEXT_TOKEN: {
3035
0
      std::string whitespace_only = ExtractWhitespace(token_.data);
3036
0
      if (!whitespace_only.empty()) {
3037
0
        token_.data = whitespace_only;
3038
0
        return InBodyIM();
3039
0
      }
3040
0
      break;
3041
0
    }
3042
0
    case TokenType::START_TAG_TOKEN:
3043
0
      switch (token_.atom) {
3044
0
        case Atom::HTML:
3045
0
          return InBodyIM();
3046
0
        case Atom::NOFRAMES:
3047
0
          return InHeadIM();
3048
0
        default:
3049
0
          break;
3050
0
      }
3051
0
      break;
3052
0
    case TokenType::DOCTYPE_TOKEN:
3053
0
      return InBodyIM();
3054
0
    default:
3055
0
      break;
3056
0
  }
3057
0
  return true;
3058
0
}  // Parser::AfterAfterFramesetIM.
3059
3060
21.9M
// Returns the context node when parsing a fragment with exactly one element
// on the open elements stack and a context node is set; otherwise returns the
// top of the open elements stack.
Node* Parser::AdjustedCurrentNode() {
  if (open_elements_stack_.size() != 1 || !fragment_ || !context_node_) {
    return open_elements_stack_.Top();
  }
  return context_node_;
}
3065
3066
// Section 12.2.6.5.
3067
1.87M
bool Parser::ParseForeignContent() {
3068
1.87M
  switch (token_.token_type) {
3069
32.4k
    case TokenType::TEXT_TOKEN: {
3070
32.4k
      if (frameset_ok_) {
3071
393
        frameset_ok_ = (token_.data.find_first_not_of(
3072
393
                            Strings::kWhitespaceOrNull) == std::string::npos);
3073
393
      }
3074
      // Replaces null char with \ufffd replacement character.
3075
32.4k
      Strings::ReplaceAny(&token_.data, Strings::kNullChar,
3076
32.4k
                          Strings::kNullReplacementChar);
3077
32.4k
      AddText(token_.data);
3078
32.4k
      break;
3079
0
    }
3080
1.52k
    case TokenType::COMMENT_TOKEN: {
3081
1.52k
      Node* node = document_->NewNode(NodeType::COMMENT_NODE);
3082
1.52k
      node->SetManufactured(token_.is_manufactured);
3083
1.52k
      if (record_node_offsets_) {
3084
0
        node->line_col_in_html_src_ = token_.line_col_in_html_src;
3085
0
      }
3086
1.52k
      node->data_ = token_.data;
3087
1.52k
      AddChild(node);
3088
1.52k
      break;
3089
0
    }
3090
1.83M
    case TokenType::START_TAG_TOKEN: {
3091
1.83M
      if (!fragment_) {
3092
1.83M
        auto breaktout_tag = std::find(std::begin(kBreakoutTags),
3093
1.83M
                                       std::end(kBreakoutTags), token_.atom);
3094
1.83M
        bool is_breakout_tag = breaktout_tag != std::end(kBreakoutTags);
3095
3096
1.83M
        if (token_.atom == Atom::FONT) {
3097
1.27k
          for (auto& attr : token_.attributes) {
3098
1.27k
            std::string key = attr.key;
3099
1.27k
            if (key == "color" || key == "face" || key == "size") {
3100
0
              is_breakout_tag = true;
3101
0
              break;
3102
0
            }
3103
1.27k
          }
3104
566
        }
3105
1.83M
        if (is_breakout_tag) {
3106
718k
          for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
3107
718k
            Node* node = open_elements_stack_.at(i);
3108
718k
            if (node->name_space_.empty() || HtmlIntegrationPoint(*node) ||
3109
718k
                MathMLTextIntegrationPoint(*node)) {
3110
706
              open_elements_stack_.Pop(open_elements_stack_.size() - i - 1);
3111
706
              break;
3112
706
            }
3113
718k
          }
3114
706
          return false;
3115
706
        }
3116
1.83M
      }
3117
3118
1.83M
      Node* current = AdjustedCurrentNode();
3119
1.83M
      if (current->name_space_ == "math") {
3120
1.80M
        AdjustMathMLAttributeNames(&token_.attributes);
3121
1.80M
      } else if (current->name_space_ == "svg") {
3122
1.18M
        for (auto [name, adjusted] : kSvgTagNameAdjustments) {
3123
1.18M
          if (name == token_.atom) {
3124
0
            token_.atom = adjusted;
3125
0
          }
3126
1.18M
        }
3127
32.8k
        AdjustSVGAttributeNames(&token_.attributes);
3128
32.8k
      } else {
3129
0
        CHECK(false) << "html: bad parser state: unexpected namespace";
3130
0
      }
3131
3132
1.83M
      AdjustForeignAttributes(&token_.attributes);
3133
1.83M
      auto& ns = current->name_space_;
3134
1.83M
      AddElement();
3135
1.83M
      top()->name_space_ = ns;
3136
1.83M
      if (!ns.empty()) {
3137
        // Don't let the tokenizer go into raw text mode in foreign content.
3138
        // (e.g. in an SVG <title> tag).
3139
1.83M
        tokenizer_->NextIsNotRawText();
3140
1.83M
      }
3141
1.83M
      if (has_self_closing_token_) {
3142
194
        open_elements_stack_.Pop();
3143
194
        AcknowledgeSelfClosingTag();
3144
194
      }
3145
1.83M
      break;
3146
1.83M
    }
3147
4.27k
    case TokenType::END_TAG_TOKEN:
3148
32.2k
      for (int i = open_elements_stack_.size() - 1; i >= 0; --i) {
3149
32.2k
        if (open_elements_stack_.at(i)->name_space_.empty()) {
3150
3.78k
          return insertion_mode_();
3151
3.78k
        }
3152
3153
28.4k
        auto sn = open_elements_stack_.at(i);
3154
28.4k
        auto node_data = sn->atom_ != Atom::UNKNOWN
3155
28.4k
                             ? AtomUtil::ToString(sn->atom_)
3156
28.4k
                             : sn->data_;
3157
28.4k
        auto token_data = token_.atom != Atom::UNKNOWN
3158
28.4k
                              ? AtomUtil::ToString(token_.atom)
3159
28.4k
                              : token_.data;
3160
3161
28.4k
        if (Strings::EqualFold(node_data, token_data)) {
3162
489
          open_elements_stack_.Pop(open_elements_stack_.size() - i);
3163
489
          break;
3164
489
        }
3165
28.4k
      }
3166
489
      return true;
3167
0
    default:
3168
      // Ignore the token.
3169
0
      break;
3170
1.87M
  }
3171
1.87M
  return true;
3172
1.87M
}  // Parser::ParseForeignContent.
3173
3174
// Section 12.2.6.
3175
20.1M
bool Parser::InForeignContent() {
3176
20.1M
  if (open_elements_stack_.size() == 0) return false;
3177
3178
20.1M
  Node* node = AdjustedCurrentNode();
3179
20.1M
  if (node->name_space_.empty()) return false;
3180
1.88M
  Atom token_atom = token_.atom;
3181
1.88M
  TokenType token_type = token_.token_type;
3182
1.88M
  if (MathMLTextIntegrationPoint(*node)) {
3183
2.95k
    if (token_type == TokenType::START_TAG_TOKEN &&
3184
2.95k
        token_atom != Atom::MGLYPH && token_atom != Atom::MALIGNMARK) {
3185
1.19k
      return false;
3186
1.19k
    }
3187
1.76k
    if (token_type == TokenType::TEXT_TOKEN) {
3188
489
      return false;
3189
489
    }
3190
1.76k
  }
3191
3192
1.87M
  if (node->name_space_ == "math" && node->atom_ == Atom::ANNOTATION_XML &&
3193
1.87M
      token_type == TokenType::START_TAG_TOKEN && token_atom == Atom::SVG) {
3194
0
    return false;
3195
0
  }
3196
3197
1.87M
  if (HtmlIntegrationPoint(*node) &&
3198
1.87M
      (token_type == TokenType::START_TAG_TOKEN ||
3199
1.18k
       token_type == TokenType::TEXT_TOKEN)) {
3200
518
    return false;
3201
518
  }
3202
3203
1.87M
  if (token_type == TokenType::ERROR_TOKEN) {
3204
1.09k
    return false;
3205
1.09k
  }
3206
3207
1.87M
  return true;
3208
1.87M
}  // Parser::InForeignContent.
3209
3210
// Section 12.2.6.2.
3211
7.10k
void Parser::ParseGenericRawTextElement() {
3212
7.10k
  AddElement();
3213
7.10k
  original_insertion_mode_ = insertion_mode_;
3214
7.10k
  insertion_mode_ = std::bind(&Parser::TextIM, this);
3215
7.10k
}
3216
3217
void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
3218
2.40M
                               const std::string& data) {
3219
  // Copy original token.
3220
2.40M
  Token real_token = {.token_type = token_.token_type,
3221
2.40M
                      .atom = token_.atom,
3222
2.40M
                      .data = token_.data,
3223
2.40M
                      .line_col_in_html_src = token_.line_col_in_html_src,
3224
2.40M
                      .attributes = token_.attributes};
3225
2.40M
  bool self_closing = has_self_closing_token_;
3226
  // Create implied tokens.
3227
2.40M
  token_ = {.token_type = token_type,
3228
2.40M
            .atom = atom,
3229
2.40M
            .data = data,
3230
            // For reporting purposes implied tokens are assumed to be parsed at
3231
            // the current tag location.
3232
2.40M
            .line_col_in_html_src = token_.line_col_in_html_src,
3233
2.40M
            .attributes = {}};
3234
2.40M
  has_self_closing_token_ = false;
3235
3236
  // Accounting for manufactured tags.
3237
2.40M
  if (token_type == TokenType::START_TAG_TOKEN) {
3238
2.39M
    switch (atom) {
3239
12.4k
      case Atom::HTML:
3240
12.4k
        document_->metadata_.has_manufactured_html = true;
3241
12.4k
        break;
3242
12.4k
      case Atom::HEAD:
3243
12.4k
        document_->metadata_.has_manufactured_head = true;
3244
12.4k
        break;
3245
11.8k
      case Atom::BODY:
3246
11.8k
        document_->metadata_.has_manufactured_body = true;
3247
11.8k
        break;
3248
2.35M
      default:
3249
2.35M
        break;
3250
2.39M
    }
3251
2.39M
  }
3252
3253
2.40M
  ParseCurrentToken();
3254
  // Restore original token.
3255
2.40M
  token_ = {.token_type = real_token.token_type,
3256
2.40M
            .atom = real_token.atom,
3257
2.40M
            .data = real_token.data,
3258
2.40M
            .line_col_in_html_src = token_.line_col_in_html_src,
3259
2.40M
            .attributes = real_token.attributes};
3260
2.40M
  has_self_closing_token_ = self_closing;
3261
2.40M
}  // Parser::ParseImpliedToken.
3262
3263
17.5M
void Parser::ParseCurrentToken() {
3264
17.5M
  if (token_.token_type == TokenType::SELF_CLOSING_TAG_TOKEN) {
3265
783
    has_self_closing_token_ = true;
3266
783
    token_.token_type = TokenType::START_TAG_TOKEN;
3267
783
  }
3268
3269
17.5M
  bool consumed = false;
3270
3271
37.6M
  while (!consumed) {
3272
20.1M
    if (InForeignContent()) {
3273
1.87M
      consumed = ParseForeignContent();
3274
18.2M
    } else {
3275
18.2M
      consumed = insertion_mode_();
3276
18.2M
    }
3277
20.1M
  }
3278
3279
17.5M
  if (has_self_closing_token_) {
3280
    // This is a parse error, but ignore it.
3281
332
    has_self_closing_token_ = false;
3282
332
  }
3283
17.5M
}  // Parser::ParseCurrentToken.
3284
3285
25.6k
// Appends to |node| every attribute of |token| whose key is not already
// present on the node. When |token| repeats a key, only its first occurrence
// is appended.
void Parser::CopyAttributes(Node* node, Token token) const {
  if (token.attributes.empty()) return;

  // Keys already present on the node.
  std::set<std::string> seen_keys;
  for (const Attribute& existing : node->attributes_) {
    seen_keys.insert(existing.key);
  }

  for (const Attribute& incoming : token.attributes) {
    // insert() reports whether the key was new; only new keys are appended.
    const bool is_new_key = seen_keys.insert(incoming.key).second;
    if (is_new_key) {
      node->attributes_.push_back(incoming);
    }
  }
}  // Parser::CopyAttributes.
3298
3299
9.10k
// Stores the href and target attribute values of a <base> element in the
// document metadata (base_url.first and base_url.second respectively). Does
// nothing when |base_node| is not a <base> element node.
void Parser::RecordBaseURLMetadata(Node* base_node) {
  const bool is_base_element =
      base_node->Type() == NodeType::ELEMENT_NODE &&
      base_node->DataAtom() == Atom::BASE;
  if (!is_base_element) return;

  // Attribute names are compared case-insensitively; if a name repeats, the
  // last value wins.
  for (const auto& attribute : base_node->Attributes()) {
    if (Strings::EqualFold(attribute.key, "href")) {
      document_->metadata_.base_url.first = attribute.value;
      continue;
    }
    if (Strings::EqualFold(attribute.key, "target")) {
      document_->metadata_.base_url.second = attribute.value;
    }
  }
}
3311
3312
824
// Records the href of a <link rel="canonical"> element as the document's
// canonical URL. Does nothing when |link_node| is not a <link> element node,
// when rel is not "canonical", or when no non-empty href is found.
void Parser::RecordLinkRelCanonical(Node* link_node) {
  const bool is_link_element =
      link_node->Type() == NodeType::ELEMENT_NODE &&
      link_node->DataAtom() == Atom::LINK;
  if (!is_link_element) return;

  bool has_canonical_rel = false;
  std::string href;
  for (const auto& attribute : link_node->Attributes()) {
    // Keys and the rel value are compared case-insensitively.
    const bool is_canonical_rel =
        Strings::EqualFold(attribute.key, "rel") &&
        Strings::EqualFold(attribute.value, "canonical");
    if (is_canonical_rel) {
      has_canonical_rel = true;
    } else if (Strings::EqualFold(attribute.key, "href")) {
      href = attribute.value;
    }
  }

  if (!has_canonical_rel || href.empty()) return;
  document_->metadata_.canonical_url = href;
}
3330
3331
namespace {
3332
// Returns only whitespace characters in s.
3333
// <space><space>foo<space>bar<space> returns 4 spaces.
3334
0
std::string ExtractWhitespace(const std::string& s) {
3335
0
  std::string only_whitespaces;
3336
0
  std::copy_if(
3337
0
      s.begin(), s.end(),
3338
0
      only_whitespaces.begin(),  // Unused, populated directly in predicate.
3339
0
      [&only_whitespaces](char c) -> bool {
3340
0
        if (Strings::kWhitespace.find(c) != std::string::npos) {
3341
0
          only_whitespaces.push_back(c);
3342
0
        }
3343
0
        return false;
3344
0
      });
3345
0
  return only_whitespaces;
3346
0
}
3347
3348
}  // namespace
3349
3350
}  // namespace htmlparser