Coverage Report

Created: 2025-09-08 06:20

/proc/self/cwd/cpp/htmlparser/tokenizer.cc
Line
Count
Source (jump to first uncovered line)
1
#include "cpp/htmlparser/tokenizer.h"
2
3
#include "absl/flags/flag.h"
4
#include "cpp/htmlparser/atom.h"
5
#include "cpp/htmlparser/atomutil.h"
6
#include "cpp/htmlparser/defer.h"
7
#include "cpp/htmlparser/strings.h"
8
9
// Upper bound on the number of attributes kept for a single tag. ReadTag()
// compares attributes_.size() against this value before saving another
// attribute; the default is 1000.
ABSL_FLAG(std::size_t, htmlparser_max_attributes_per_node, 1000,
          "Protects out of memory errors by dropping insanely large amounts "
          "of attributes per node.");
13
14
namespace htmlparser {
15
16
// Creates a tokenizer over `html`.
//
// Line/column bookkeeping (lines_cols_, current_line_col_, token_line_col_)
// starts at {1, 0}. A non-empty `context_tag` is lowercased and, if its atom
// is listed in kAllowedFragmentContainers, stored in raw_tag_; any other
// value is ignored.
Tokenizer::Tokenizer(std::string_view html, std::string context_tag)
    : buffer_(html) {
  const auto origin = std::make_pair(1, 0);
  lines_cols_.push_back(origin);
  current_line_col_ = origin;
  token_line_col_ = origin;

  if (context_tag.empty()) {
    return;
  }

  Strings::ToLower(&context_tag);
  const auto allowed_begin = kAllowedFragmentContainers.begin();
  const auto allowed_end = kAllowedFragmentContainers.end();
  const bool is_allowed_container =
      std::find(allowed_begin, allowed_end, AtomUtil::ToAtom(context_tag)) !=
      allowed_end;
  if (is_allowed_container) {
    raw_tag_ = context_tag;
  }
}
31
32
189M
inline char Tokenizer::ReadByte() {
33
189M
  if (raw_.end >= buffer_.size()) {
34
12.5k
    eof_ = true;
35
12.5k
    return 0;
36
12.5k
  }
37
38
189M
  char c = buffer_.at(raw_.end++);
39
189M
  current_line_col_.second++;
40
189M
  int multi_byte = Strings::CodePointByteSequenceCount(c);
41
189M
  if (multi_byte > 1) {
42
11.1M
    current_line_col_.second -= (multi_byte - 1);
43
11.1M
  }
44
45
189M
  if (c == '\n' || (c == '\r' &&
46
187M
                    raw_.end < buffer_.size() &&
47
187M
                    buffer_.at(raw_.end) != '\n')) {
48
19.7M
    lines_cols_.back() = current_line_col_;
49
    // Increment line number and reset column number.
50
19.7M
    current_line_col_.first++;
51
19.7M
    current_line_col_.second = 0;
52
19.7M
    lines_cols_.push_back({current_line_col_.first + 1, 0});
53
19.7M
  }
54
55
189M
  return c;
56
189M
}
57
58
47.7M
inline void Tokenizer::UnreadByte() {
59
47.7M
  raw_.end--;
60
47.7M
  if (current_line_col_.first > 1 && current_line_col_.second == 0) {
61
3.75M
    if (lines_cols_.size() > 1) {
62
3.75M
      lines_cols_.pop_back();
63
3.75M
    }
64
3.75M
    current_line_col_ = lines_cols_.back();
65
3.75M
    return;
66
3.75M
  }
67
68
43.9M
  current_line_col_.second--;
69
43.9M
}
70
71
17.7M
void Tokenizer::SkipWhiteSpace() {
72
20.9M
  while (!eof_) {
73
20.9M
    char c = ReadByte();
74
20.9M
    switch (c) {
75
1.01k
      case ' ':
76
699k
      case '\n':
77
3.03M
      case '\r':
78
3.05M
      case '\t':
79
3.11M
      case '\f':
80
3.11M
        break;
81
17.7M
      default:
82
17.7M
        UnreadByte();
83
17.7M
        return;
84
20.9M
    }
85
20.9M
  }
86
17.7M
}
87
88
15.1M
void Tokenizer::SetAllowCDATA(bool allow_cdata) {
89
15.1M
  allow_cdata_ = allow_cdata;
90
15.1M
}
91
92
1.83M
void Tokenizer::NextIsNotRawText() {
93
1.83M
  raw_tag_ = "";
94
1.83M
}
95
96
24.7k
void Tokenizer::ReadRawOrRCDATA() {
97
24.7k
  if (raw_tag_ == "script") {
98
5.39k
    ReadScript();
99
5.39k
    text_is_raw_ = true;
100
5.39k
    raw_tag_ = "";
101
5.39k
    return;
102
5.39k
  }
103
104
1.27M
  while (!eof_) {
105
1.27M
    char c = ReadByte();
106
1.27M
    if (eof_) break;
107
1.27M
    if (c != '<') continue;
108
275k
    c = ReadByte();
109
275k
    if (eof_) break;
110
275k
    if (c != '/') continue;
111
238k
    if (ReadRawEndTag() || eof_) break;
112
238k
  }
113
114
19.3k
  data_.end = raw_.end;
115
  // A textarea's or title's RCDATA can contain escaped entities.
116
19.3k
  text_is_raw_ = raw_tag_ != "textarea" && raw_tag_ != "title";
117
19.3k
  raw_tag_ = "";
118
19.3k
}
119
120
257k
bool Tokenizer::ReadRawEndTag() {
121
1.02M
  for (std::size_t i = 0; i < raw_tag_.size(); ++i) {
122
902k
    char c = ReadByte();
123
902k
    if (eof_) return false;
124
902k
    if (c != raw_tag_.at(i) && c != (raw_tag_.at(i) - ('a' - 'A'))) {
125
135k
      UnreadByte();
126
135k
      return false;
127
135k
    }
128
902k
  }
129
130
121k
  char c = ReadByte();
131
121k
  if (eof_) return false;
132
121k
  switch (c) {
133
696
    case ' ':
134
762
    case '\n':
135
780
    case '\t':
136
814
    case '\f':
137
1.12k
    case '/':
138
24.4k
    case '>':
139
      // The 3 is 2 for the leading "</" plus 1 for the trailing character c.
140
24.4k
      raw_.end -= (3 /* <, /, and > */+ raw_tag_.size());
141
24.4k
      current_line_col_.second -= (3 /* <, /, and > */ + raw_tag_.size());
142
24.4k
      return true;
143
121k
  }
144
97.3k
  UnreadByte();
145
97.3k
  return false;
146
121k
}
147
148
// States of the state machine in Tokenizer::ReadScript(). DONE ends the
// machine. Enumerators are numbered consecutively from 0 (DONE) to 16
// (SCRIPT_DATA_DOUBLE_ESCAPED_END).
enum ScriptDataState {
  DONE,
  SCRIPT_DATA,
  SCRIPT_DATA_LESS_THAN_SIGN,
  SCRIPT_DATA_END_TAG_OPEN,
  SCRIPT_DATA_ESCAPE_START,
  SCRIPT_DATA_ESCAPE_START_DASH,
  SCRIPT_DATA_ESCAPED,
  SCRIPT_DATA_ESCAPED_DASH,
  SCRIPT_DATA_ESCAPED_DASH_DASH,
  SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
  SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
  SCRIPT_DATA_DOUBLE_ESCAPE_START,
  SCRIPT_DATA_DOUBLE_ESCAPED,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
  SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
  SCRIPT_DATA_DOUBLE_ESCAPED_END
};
167
168
5.39k
void Tokenizer::ReadScript() {
169
5.39k
  defer({data_.end = raw_.end;});
170
5.39k
  ScriptDataState state = ScriptDataState::SCRIPT_DATA;
171
24.3M
  while (!eof_ && state != ScriptDataState::DONE) {
172
24.3M
    switch (state) {
173
8.44M
      case ScriptDataState::SCRIPT_DATA: {
174
8.44M
        char c = ReadByte();
175
8.44M
        if (eof_) return;
176
8.44M
        if (c == '<') {
177
2.95M
          state = ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN;
178
5.48M
        } else {
179
5.48M
          state = ScriptDataState::SCRIPT_DATA;
180
5.48M
        }
181
8.44M
        break;
182
8.44M
      }
183
2.95M
      case ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN: {
184
2.95M
        char c = ReadByte();
185
2.95M
        if (eof_) return;
186
2.95M
        if (c == '/') {
187
14.5k
          state = ScriptDataState::SCRIPT_DATA_END_TAG_OPEN;
188
2.94M
        } else if (c == '!') {
189
1.72M
          state = ScriptDataState::SCRIPT_DATA_ESCAPE_START;
190
1.72M
        } else {
191
1.21M
          UnreadByte();
192
1.21M
          state = ScriptDataState::SCRIPT_DATA;
193
1.21M
        }
194
2.95M
        break;
195
2.95M
      }
196
14.5k
      case ScriptDataState::SCRIPT_DATA_END_TAG_OPEN: {
197
14.5k
        if (ReadRawEndTag() || eof_) {
198
1.52k
          return;
199
1.52k
        }
200
13.0k
        state = ScriptDataState::SCRIPT_DATA;
201
13.0k
        break;
202
14.5k
      }
203
1.72M
      case ScriptDataState::SCRIPT_DATA_ESCAPE_START: {
204
1.72M
        char c = ReadByte();
205
1.72M
        if (eof_) return;
206
1.72M
        if (c == '-') {
207
1.71M
          state = ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH;
208
1.71M
        } else {
209
10.6k
          UnreadByte();
210
10.6k
          state = ScriptDataState::SCRIPT_DATA;
211
10.6k
        }
212
1.72M
        break;
213
1.72M
      }
214
1.71M
      case ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH: {
215
1.71M
        char c = ReadByte();
216
1.71M
        if (eof_) return;
217
1.71M
        if (c == '-') {
218
861k
          state = SCRIPT_DATA_ESCAPED_DASH_DASH;
219
861k
        } else {
220
851k
          UnreadByte();
221
851k
          state = ScriptDataState::SCRIPT_DATA;
222
851k
        }
223
1.71M
        break;
224
1.71M
      }
225
4.24M
      case ScriptDataState::SCRIPT_DATA_ESCAPED: {
226
4.24M
        char c = ReadByte();
227
4.24M
        if (eof_) return;
228
4.24M
        if (c == '-') {
229
1.61M
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH;
230
2.62M
        } else if (c == '<') {
231
75.2k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
232
2.55M
        } else {
233
2.55M
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
234
2.55M
        }
235
4.24M
        break;
236
4.24M
      }
237
1.61M
      case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH: {
238
1.61M
        char c = ReadByte();
239
1.61M
        if (eof_) return;
240
1.61M
        if (c == '-') {
241
2.35k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH;
242
1.61M
        } else if (c == '<') {
243
1.60M
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
244
1.60M
        } else {
245
6.94k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
246
6.94k
        }
247
1.61M
        break;
248
1.61M
      }
249
864k
      case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH: {
250
864k
        char c = ReadByte();
251
864k
        if (eof_) return;
252
864k
        if (c == '-') {
253
337
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH;
254
863k
        } else if (c == '<') {
255
6.75k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
256
856k
        } else if (c == '>') {
257
194
          state = ScriptDataState::SCRIPT_DATA;
258
856k
        } else {
259
856k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
260
856k
        }
261
864k
        break;
262
864k
      }
263
1.69M
      case ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: {
264
1.69M
        char c = ReadByte();
265
1.69M
        if (eof_) return;
266
1.69M
        if (c == '/') {
267
3.46k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN;
268
1.68M
        } else if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) {
269
831k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START;
270
856k
        } else {
271
856k
          UnreadByte();
272
856k
          state = ScriptDataState::SCRIPT_DATA;
273
856k
        }
274
1.69M
        break;
275
1.69M
      }
276
3.46k
      case ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN: {
277
3.46k
        if (ReadRawEndTag()) {
278
3.07k
          state = ScriptDataState::DONE;
279
3.07k
        } else {
280
389
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
281
389
        }
282
3.46k
        break;
283
1.69M
      }
284
831k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START: {
285
831k
        UnreadByte();
286
831k
        static std::string script_tag_l = "script";
287
831k
        static std::string script_tag_u = "SCRIPT";
288
5.81M
        for (int8_t i = 0; i < 6 /*script*/; ++i) {
289
4.98M
          char c = ReadByte();
290
4.98M
          if (eof_) return;
291
4.98M
          if (c != script_tag_l[i] && c != script_tag_u[i]) {
292
4.98M
            UnreadByte();
293
4.98M
            state = ScriptDataState::SCRIPT_DATA_ESCAPED;
294
4.98M
          }
295
4.98M
        }
296
831k
        char c = ReadByte();
297
831k
        if (eof_) return;
298
831k
        if (c == ' ' || c == '\n' || c == '\r' || c == '\t'  || c == '\f'
299
831k
            || c == '/' || c == '>') {
300
1.78k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
301
829k
        } else {
302
829k
          UnreadByte();
303
829k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
304
829k
        }
305
831k
        break;
306
831k
      }
307
134k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED: {
308
134k
        char c = ReadByte();
309
134k
        if (eof_) return;
310
134k
        if (c == '-') {
311
46.8k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH;
312
87.5k
        } else if (c == '<') {
313
19.6k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
314
67.8k
        } else {
315
67.8k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
316
67.8k
        }
317
134k
        break;
318
134k
      }
319
46.8k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH: {
320
46.8k
        char c = ReadByte();
321
46.8k
        if (eof_) return;
322
46.8k
        if (c == '-') {
323
9.28k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
324
37.5k
        } else if (c == '<') {
325
27.1k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
326
27.1k
        } else if (c == '>') {
327
592
          state = ScriptDataState::SCRIPT_DATA;
328
9.79k
        } else {
329
9.79k
          state = SCRIPT_DATA_DOUBLE_ESCAPED;
330
9.79k
        }
331
46.8k
        break;
332
46.8k
      }
333
9.53k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: {
334
9.53k
        char c = ReadByte();
335
9.53k
        if (eof_) return;
336
9.52k
        if (c == '-') {
337
253
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
338
9.27k
        } else if (c == '<') {
339
252
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
340
9.01k
        } else if (c == '>') {
341
195
          state = ScriptDataState::SCRIPT_DATA;
342
8.82k
        } else {
343
8.82k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
344
8.82k
        }
345
9.52k
        break;
346
9.53k
      }
347
47.1k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: {
348
47.1k
        char c = ReadByte();
349
47.1k
        if (eof_) return;
350
47.1k
        if (c == '/') {
351
1.04k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END;
352
46.0k
        } else {
353
46.0k
          UnreadByte();
354
46.0k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
355
46.0k
        }
356
47.1k
        break;
357
47.1k
      }
358
1.04k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END: {
359
1.04k
        if (ReadRawEndTag()) {
360
760
          raw_.end += std::string("</script>").size();
361
760
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
362
760
        } else {
363
280
          if (eof_) return;
364
273
          state  = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
365
273
        }
366
1.03k
        break;
367
1.04k
      }
368
1.03k
      default:
369
0
        break;
370
24.3M
    }
371
24.3M
  }
372
5.39k
}
373
374
897
void Tokenizer::ReadComment() {
375
897
  data_.start = raw_.end;
376
897
  defer({
377
897
    if (data_.end < data_.start) {
378
      // It's a comment with no data, like <!-->
379
897
      data_.end = data_.start;
380
897
    }
381
897
  });
382
897
  int dash_count = 2;
383
2.52M
  while (!eof_) {
384
2.52M
    char c = ReadByte();
385
2.52M
    if (eof_) {
386
      // Ignore up to two dashes at EOF.
387
71
      if (dash_count > 2) {
388
5
        dash_count = 2;
389
5
      }
390
71
      data_.end = raw_.end - dash_count;
391
71
      return;
392
71
    }
393
2.52M
    if (c == '-') {
394
585k
      dash_count++;
395
585k
      continue;
396
1.93M
    } else if (c == '>') {
397
759
      if (dash_count >= 2) {
398
563
        data_.end = raw_.end - 3 /* --> */;
399
563
        return;
400
563
      }
401
1.93M
    } else if (c == '!') {
402
319k
      if (dash_count >= 2) {
403
462
        char c = ReadByte();
404
462
        if (eof_) {
405
13
          data_.end = raw_.end;
406
13
          return;
407
13
        }
408
449
        if (c == '>') {
409
250
          data_.end = raw_.end - 4 /* --!> */;
410
250
          return;
411
250
        }
412
449
      }
413
319k
    }
414
1.93M
    dash_count = 0;
415
1.93M
  }
416
897
}
417
418
1.03M
void Tokenizer::ReadUntilCloseAngle() {
419
1.03M
  data_.start = raw_.end;
420
3.26M
  while (!eof_) {
421
3.26M
    char c = ReadByte();
422
3.26M
    if (eof_) {
423
458
      data_.end = raw_.end;
424
458
      return;
425
458
    }
426
3.26M
    if (c == '>') {
427
1.03M
      data_.end = raw_.end - 1 /* ">" */;
428
1.03M
      return;
429
1.03M
    }
430
3.26M
  }
431
1.03M
}
432
433
1.03M
// Reads what follows "<!" and returns the kind of token it turned out to be:
//   - "--"        : a comment (COMMENT_TOKEN),
//   - "DOCTYPE"   : a doctype (DOCTYPE_TOKEN),
//   - "[CDATA["   : character data (TEXT_TOKEN), only when allow_cdata_ is set,
//   - anything else, including premature end of input: a bogus comment that
//     runs to the next '>' (COMMENT_TOKEN).
TokenType Tokenizer::ReadMarkupDeclaration() {
  data_.start = raw_.end;

  const char first = ReadByte();
  if (eof_) {
    data_.end = raw_.end;
    return TokenType::COMMENT_TOKEN;
  }
  const char second = ReadByte();
  if (eof_) {
    data_.end = raw_.end;
    return TokenType::COMMENT_TOKEN;
  }

  if (first == '-' && second == '-') {
    ReadComment();
    return TokenType::COMMENT_TOKEN;
  }

  // Not a comment: give both bytes back and try the other declarations.
  UnreadByte();
  UnreadByte();

  if (ReadDoctype()) {
    return TokenType::DOCTYPE_TOKEN;
  }

  if (allow_cdata_ && ReadCDATA()) {
    convert_null_ = true;
    return TokenType::TEXT_TOKEN;
  }

  // It's a bogus comment.
  ReadUntilCloseAngle();
  return TokenType::COMMENT_TOKEN;
}
464
465
1.03M
bool Tokenizer::ReadDoctype() {
466
1.03M
  token_line_col_ = {current_line_col_.first,
467
1.03M
                     current_line_col_.second - 2 /* <! */};
468
469
1.03M
  static constexpr std::string_view kDoctype = "DOCTYPE";
470
1.03M
  for (std::size_t i = 0; i < kDoctype.size(); ++i) {
471
1.03M
    char c = ReadByte();
472
1.03M
    if (eof_) {
473
12
      data_.end = raw_.end;
474
12
      return false;
475
12
    }
476
1.03M
    if (c != kDoctype.at(i) && c != (kDoctype.at(i) + ('a' - 'A'))) {
477
      // Back up to read the fragment of "DOCTYPE" again.
478
1.03M
      raw_.end = data_.start;
479
1.03M
      return false;
480
1.03M
    }
481
1.03M
  }
482
483
690
  SkipWhiteSpace();
484
690
  if (eof_) {
485
18
    data_.start = raw_.end;
486
18
    data_.end = raw_.end;
487
18
    return true;
488
18
  }
489
490
672
  ReadUntilCloseAngle();
491
672
  return true;
492
690
}
493
494
524
bool Tokenizer::ReadCDATA() {
495
524
  static constexpr std::string_view kCData = "[CDATA[";
496
2.61k
  for (std::size_t i = 0; i < kCData.size(); ++i) {
497
2.36k
    char c = ReadByte();
498
2.36k
    if (eof_) {
499
8
      data_.end = raw_.end;
500
8
      return false;
501
8
    }
502
2.35k
    if (c != kCData[i]) {
503
      // Back up to read the fragment of "[CDATA[" again.
504
265
      data_.end = raw_.start;
505
265
      return false;
506
265
    }
507
2.35k
  }
508
251
  data_.start = raw_.end;
509
251
  int brackets = 0;
510
1.27k
  while (!eof_) {
511
1.27k
    char c = ReadByte();
512
1.27k
    if (eof_) {
513
29
      data_.end = raw_.end;
514
29
      return true;
515
29
    }
516
1.24k
    switch (c) {
517
637
      case ']': {
518
637
        brackets++;
519
637
        break;
520
0
      }
521
416
      case '>': {
522
416
        if (brackets >= 2) {
523
222
          data_.end = raw_.end - 3 /* "]]>" */;
524
222
          return true;
525
222
        }
526
194
        brackets = 0;
527
194
        break;
528
416
      }
529
194
      default:
530
194
        brackets = 0;
531
1.24k
    }
532
1.24k
  }
533
0
  return false;
534
251
}
535
536
template<typename... Args>
537
5.22M
bool Tokenizer::StartTagIn(Args... ss) {
538
5.22M
  std::vector<std::string> argsList{ss...};
539
9.96M
  for (const auto& s : argsList) {
540
9.96M
    if (data_.end - data_.start != s.size()) continue;
541
1.44M
    bool matched = true;
542
3.14M
    for (std::size_t i = 0; i < s.size(); ++i) {
543
3.11M
      char c = buffer_.at(data_.start + i);
544
3.11M
      if ('A' <= c && c <= 'Z') {
545
2.94M
        c += 'a' - 'A';
546
2.94M
      }
547
3.11M
      if (c != s[i]) {
548
1.41M
        matched = false;
549
1.41M
        break;
550
1.41M
      }
551
3.11M
    }
552
1.44M
    if (matched) {
553
25.0k
      return true;
554
25.0k
    }
555
1.44M
  }
556
5.19M
  return false;
557
5.22M
}
bool htmlparser::Tokenizer::StartTagIn<char const*>(char const*)
Line
Count
Source
537
519k
bool Tokenizer::StartTagIn(Args... ss) {
538
519k
  std::vector<std::string> argsList{ss...};
539
519k
  for (const auto& s : argsList) {
540
519k
    if (data_.end - data_.start != s.size()) continue;
541
4.32k
    bool matched = true;
542
17.2k
    for (std::size_t i = 0; i < s.size(); ++i) {
543
13.4k
      char c = buffer_.at(data_.start + i);
544
13.4k
      if ('A' <= c && c <= 'Z') {
545
8.14k
        c += 'a' - 'A';
546
8.14k
      }
547
13.4k
      if (c != s[i]) {
548
567
        matched = false;
549
567
        break;
550
567
      }
551
13.4k
    }
552
4.32k
    if (matched) {
553
3.75k
      return true;
554
3.75k
    }
555
4.32k
  }
556
515k
  return false;
557
519k
}
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*, char const*>(char const*, char const*, char const*)
Line
Count
Source
537
47.3k
bool Tokenizer::StartTagIn(Args... ss) {
538
47.3k
  std::vector<std::string> argsList{ss...};
539
141k
  for (const auto& s : argsList) {
540
141k
    if (data_.end - data_.start != s.size()) continue;
541
80.1k
    bool matched = true;
542
331k
    for (std::size_t i = 0; i < s.size(); ++i) {
543
328k
      char c = buffer_.at(data_.start + i);
544
328k
      if ('A' <= c && c <= 'Z') {
545
298k
        c += 'a' - 'A';
546
298k
      }
547
328k
      if (c != s[i]) {
548
76.9k
        matched = false;
549
76.9k
        break;
550
76.9k
      }
551
328k
    }
552
80.1k
    if (matched) {
553
3.13k
      return true;
554
3.13k
    }
555
80.1k
  }
556
44.2k
  return false;
557
47.3k
}
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*>(char const*, char const*)
Line
Count
Source
537
4.65M
bool Tokenizer::StartTagIn(Args... ss) {
538
4.65M
  std::vector<std::string> argsList{ss...};
539
9.30M
  for (const auto& s : argsList) {
540
9.30M
    if (data_.end - data_.start != s.size()) continue;
541
1.35M
    bool matched = true;
542
2.79M
    for (std::size_t i = 0; i < s.size(); ++i) {
543
2.77M
      char c = buffer_.at(data_.start + i);
544
2.77M
      if ('A' <= c && c <= 'Z') {
545
2.63M
        c += 'a' - 'A';
546
2.63M
      }
547
2.77M
      if (c != s[i]) {
548
1.33M
        matched = false;
549
1.33M
        break;
550
1.33M
      }
551
2.77M
    }
552
1.35M
    if (matched) {
553
18.1k
      return true;
554
18.1k
    }
555
1.35M
  }
556
4.63M
  return false;
557
4.65M
}
558
559
13.1M
// Reads a start tag, assuming the opening '<' and the first byte of the tag
// name have been consumed.
//
// Returns ERROR_TOKEN when the input ends inside the tag,
// SELF_CLOSING_TAG_TOKEN when the byte before the closing '>' is '/', and
// START_TAG_TOKEN otherwise. For iframe, noembed, noframes, noscript,
// plaintext, script, style, textarea, title and xmp the lowercased tag name
// is stored in raw_tag_.
TokenType Tokenizer::ReadStartTag(bool template_mode) {
  token_line_col_ = {current_line_col_.first,
                     current_line_col_.second - 1 /* < */};
  ReadTag(true, template_mode);

  if (eof_) {
    return TokenType::ERROR_TOKEN;
  }

  // Dispatch on the lowercased first letter so that at most a few candidate
  // names have to be compared.
  char initial = buffer_.at(data_.start);
  if ('A' <= initial && initial <= 'Z') {
    initial += 'a' - 'A';
  }

  // Several tags flag the tokenizer's next token as raw.
  bool is_raw_tag = false;
  switch (initial) {
    case 'i':
      is_raw_tag = StartTagIn("iframe");
      break;
    case 'n':
      is_raw_tag = StartTagIn("noembed", "noframes", "noscript");
      break;
    case 'p':
      is_raw_tag = StartTagIn("plaintext");
      break;
    case 's':
      is_raw_tag = StartTagIn("script", "style");
      break;
    case 't':
      is_raw_tag = StartTagIn("textarea", "title");
      break;
    case 'x':
      is_raw_tag = StartTagIn("xmp");
      break;
    default:
      break;
  }

  if (is_raw_tag) {
    const int name_size = data_.end - data_.start;
    raw_tag_ = std::string(buffer_.substr(data_.start, name_size));
    Strings::ToLower(&raw_tag_);
  }

  // Look for a self-closing token like "<br/>".
  const bool self_closing = !eof_ && buffer_[raw_.end - 2] == '/';
  return self_closing ? TokenType::SELF_CLOSING_TAG_TOKEN
                      : TokenType::START_TAG_TOKEN;
}
610
611
13.4M
void Tokenizer::ReadTag(bool save_attr, bool template_mode) {
612
13.4M
  attributes_.clear();
613
13.4M
  n_attributes_returned_ = 0;
614
615
  // Read the tag name and attribute key/value pairs.
616
13.4M
  ReadTagName();
617
13.4M
  SkipWhiteSpace();
618
619
13.4M
  if (eof_) {
620
132
    return;
621
132
  }
622
623
15.6M
  while (!eof_) {
624
15.6M
    char c = ReadByte();
625
15.6M
    if (eof_ || c == '>') {
626
13.4M
      break;
627
13.4M
    }
628
629
    // Undo previous > read.
630
2.18M
    UnreadByte();
631
632
2.18M
    ReadTagAttributeKey(template_mode);
633
2.18M
    ReadTagAttributeValue();
634
    // Save pending_attribute if save_attr and that attribute has a non-empty
635
    // key.
636
2.18M
    if (save_attr &&
637
        // Skip excessive attributes.
638
2.18M
        attributes_.size() < ::absl::GetFlag(
639
2.11M
            FLAGS_htmlparser_max_attributes_per_node) &&
640
2.18M
        std::get<0>(pending_attribute_).start !=
641
669k
        std::get<0>(pending_attribute_).end) {
642
616k
      attributes_.push_back(pending_attribute_);
643
616k
    }
644
2.18M
    SkipWhiteSpace();
645
2.18M
  }
646
13.4M
}
647
648
13.4M
void Tokenizer::ReadTagName() {
649
13.4M
  data_.start = raw_.end - 1;
650
31.7M
  while (!eof_) {
651
31.7M
    char c = ReadByte();
652
31.7M
    if (eof_) {
653
68
      data_.end = raw_.end;
654
68
      return;
655
68
    }
656
31.7M
    switch (c) {
657
31.2k
      case ' ':
658
32.0k
      case '\n':
659
139k
      case '\r':
660
139k
      case '\t':
661
139k
      case '\f':
662
139k
        data_.end = raw_.end - 1;
663
139k
        return;
664
75.8k
      case '/':
665
13.2M
      case '>':
666
13.2M
        UnreadByte();
667
13.2M
        data_.end = raw_.end;
668
13.2M
        return;
669
31.7M
    }
670
31.7M
  }
671
13.4M
}
672
673
// Sets pending_attribute_[0] to the "k" in "<div k=v>".
674
// Precondition: eof_ != true;
675
2.18M
void Tokenizer::ReadTagAttributeKey(bool template_mode) {
676
2.18M
  std::get<0>(pending_attribute_).start = raw_.end;
677
2.18M
  std::get<LineCol>(pending_attribute_) =
678
2.18M
      {current_line_col_.first, current_line_col_.second + 1};
679
680
  // All mustache_ prefixed variables applies to parsing logic for AMP mustache
681
  // templates. See: https://amp.dev/documentation/components/amp-mustache/
682
2.18M
  bool mustache_inside_section_block = false;
683
2.18M
  std::string mustache_section_name = "";
684
685
27.9M
  while (!eof_) {
686
27.9M
    char c = ReadByte();
687
27.9M
    if (eof_) {
688
135
      std::get<0>(pending_attribute_).start = raw_.end;
689
135
      return;
690
135
    }
691
692
    // Template attributes processing.
693
    // Looks for following special syntax.
694
    // {{#section}}...{{/section}}
695
    // {{^section}}...{{/section}}
696
    // {{variable}}
697
27.9M
    if (template_mode) {
698
0
      UnreadByte();
699
0
      UnreadByte();
700
0
      UnreadByte();
701
0
      char c1 = ReadByte();
702
0
      char c2 = ReadByte();
703
0
      c = ReadByte();
704
0
      if (mustache_inside_section_block && c1 == '{' && c2 == '{' && c == '/') {
705
        // Look for closing section name. If not resort to default behavior.
706
        // Reason for this logic is to differentiate between:
707
        // <p {{#mycondition}}class=foo{{/mycondition}} foo=bar> vs.
708
        // <img {{#mycondition}}class=foo />
709
0
        int raw_end = raw_.end;
710
0
        std::string_view close_section =
711
0
            buffer_.substr(raw_.end, mustache_section_name.size());
712
0
        bool section_name_match = close_section == mustache_section_name;
713
0
        if (section_name_match) {
714
0
          raw_.end += mustache_section_name.size();
715
0
          char e1 = ReadByte();
716
0
          char e2 = ReadByte();
717
0
          if (e1 == '}' && e2 == '}') {
718
0
            mustache_inside_section_block = false;
719
0
            continue;
720
0
          } else {
721
0
            raw_.end = raw_end;
722
0
          }
723
0
        }
724
0
      }
725
726
0
      if (c1 == '{' && c2 == '{' && (c == '#' || c == '^')) {
727
0
        auto n = buffer_.find("}}", raw_.end);
728
0
        if (n != std::string_view::npos) {
729
0
          mustache_section_name = buffer_.substr(raw_.end, n - raw_.end);
730
0
          mustache_inside_section_block = true;
731
0
          continue;
732
0
        }
733
0
      }
734
0
    }
735
736
27.9M
    switch (c) {
737
19.1k
      case ' ':
738
142k
      case '\n':
739
167k
      case '\r':
740
168k
      case '\t':
741
212k
      case '\f':
742
1.95M
      case '/': {
743
1.95M
        std::get<0>(pending_attribute_).end = raw_.end - 1;
744
1.95M
        return;
745
212k
      }
746
10.5k
      case '=':
747
222k
      case '>': {
748
222k
        UnreadByte();
749
222k
        std::get<0>(pending_attribute_).end = raw_.end;
750
222k
        return;
751
10.5k
      }
752
27.9M
    }
753
27.9M
  }
754
2.18M
}
755
756
// Sets pending_attribute_.second to the "v" in "<div k=v>".
757
2.18M
void Tokenizer::ReadTagAttributeValue() {
758
2.18M
  std::get<1>(pending_attribute_).start = raw_.end;
759
2.18M
  std::get<1>(pending_attribute_).end = raw_.end;
760
2.18M
  SkipWhiteSpace();
761
2.18M
  if (eof_) {
762
219
    return;
763
219
  }
764
2.18M
  char c = ReadByte();
765
2.18M
  if (eof_) {
766
0
    return;
767
0
  }
768
769
2.18M
  if (c != '=') {
770
2.17M
    UnreadByte();
771
2.17M
    return;
772
2.17M
  }
773
774
10.6k
  SkipWhiteSpace();
775
10.6k
  if (eof_) {
776
14
    return;
777
14
  }
778
779
10.5k
  char quote = ReadByte();
780
10.5k
  if (eof_) {
781
0
    return;
782
0
  }
783
784
10.5k
  switch (quote) {
785
195
    case '>':
786
195
      UnreadByte();
787
195
      return;
788
255
    case '\'':
789
265
    case '"':
790
265
      std::get<1>(pending_attribute_).start = raw_.end;
791
459
      while (!eof_) {
792
459
        c = ReadByte();
793
459
        if (eof_) {
794
12
          std::get<1>(pending_attribute_).end = raw_.end;
795
12
          return;
796
12
        }
797
447
        if (c == quote) {
798
253
          std::get<1>(pending_attribute_).end = raw_.end - 1;
799
253
          return;
800
253
        }
801
447
      }
802
0
      break;
803
10.1k
    default: {
804
10.1k
      std::get<1>(pending_attribute_).start = raw_.end - 1;
805
3.83M
      while (!eof_) {
806
3.83M
        c = ReadByte();
807
3.83M
        if (eof_) {
808
8
          std::get<1>(pending_attribute_).end = raw_.end;
809
8
          return;
810
8
        }
811
3.83M
        switch (c) {
812
1.19k
          case ' ':
813
1.39k
          case '\n':
814
1.68k
          case '\r':
815
1.93k
          case '\t':
816
9.22k
          case '\f':
817
9.22k
            std::get<1>(pending_attribute_).end = raw_.end - 1;
818
9.22k
            return;
819
902
          case '>':
820
902
            UnreadByte();
821
902
            std::get<1>(pending_attribute_).end = raw_.end;
822
902
            return;
823
3.83M
        }
824
3.83M
      }
825
10.1k
    }
826
10.5k
  }
827
10.5k
}
828
829
15.1M
// Scans forward from the end of the previous token, stores the type of the
// next token in token_type_ and returns it.
//
// `template_mode` is forwarded unchanged to ReadStartTag().
//
// Returns ERROR_TOKEN (and sets err_) when called after end of input was
// already reached, and ERROR_TOKEN when the scan reaches end of input without
// having consumed any bytes for this token.
TokenType Tokenizer::Next(bool template_mode) {
  // Stores `type` as the current token type and hands it back.
  const auto emit = [this](TokenType type) {
    token_type_ = type;
    return token_type_;
  };

  // The new token begins where the previous one ended.
  raw_.start = raw_.end;
  data_.start = raw_.end;
  data_.end = raw_.end;
  is_token_manufactured_ = false;

  if (eof_) {
    err_ = true;
    return emit(TokenType::ERROR_TOKEN);
  }

  if (!raw_tag_.empty()) {
    if (raw_tag_ == "plaintext") {
      // Everything up to end of input belongs to the text token.
      while (!eof_) {
        ReadByte();
      }
      data_.end = raw_.end;
      text_is_raw_ = true;
    } else {
      ReadRawOrRCDATA();
    }

    if (data_.end > data_.start) {
      convert_null_ = true;
      return emit(TokenType::TEXT_TOKEN);
    }
  }

  text_is_raw_ = false;
  convert_null_ = false;

  while (!eof_) {
    char c = ReadByte();
    if (eof_) break;
    if (c != '<') continue;

    c = ReadByte();
    if (eof_) break;

    // Decide whether the '<' just read opens a tag, a comment or a doctype.
    // If it opens none of them it is ordinary text: put the second byte back
    // and keep accumulating.
    const bool opens_start_tag = Strings::IsCharAlphabet(c);
    const bool opens_end_tag = !opens_start_tag && c == '/';
    const bool opens_markup =
        !opens_start_tag && !opens_end_tag && (c == '!' || c == '?');
    if (!opens_start_tag && !opens_end_tag && !opens_markup) {
      UnreadByte();
      continue;
    }

    // Text accumulated before the markup is returned first; the markup itself
    // is tokenized by the following call. E.g. "<space><space><mytag>" yields
    // the two spaces now and mytag on the next call.
    const int text_end = raw_.end - 2 /* "<a" */;
    if (raw_.start < text_end) {
      raw_.end = text_end;
      data_.end = text_end;
      // The two bytes given back contain no newline, so only the column
      // needs to be rewound.
      current_line_col_.second -= 2;
      return emit(TokenType::TEXT_TOKEN);
    }

    if (opens_start_tag) {
      return emit(ReadStartTag(template_mode));
    }

    if (opens_end_tag) {
      c = ReadByte();
      if (eof_) break;
      if (c == '>') {
        // "</>" does not generate a token at all. An empty comment is
        // produced so that passthrough clients can still pick up the data
        // through raw_.
        return emit(TokenType::COMMENT_TOKEN);
      }
      if (Strings::IsCharAlphabet(c)) {
        ReadTag(false);
        return emit(eof_ ? TokenType::ERROR_TOKEN : TokenType::END_TAG_TOKEN);
      }
      // "</" followed by anything else is consumed up to '>' as a comment.
      UnreadByte();
      ReadUntilCloseAngle();
      return emit(TokenType::COMMENT_TOKEN);
    }

    // Remaining possibilities: "<!" or "<?".
    if (c == '!') {
      return emit(ReadMarkupDeclaration());
    }
    is_token_manufactured_ = true;
    // The '?' of "<?" is part of the comment text, so give it back first.
    UnreadByte();
    ReadUntilCloseAngle();
    return emit(TokenType::COMMENT_TOKEN);
  }

  // End of input: whatever was consumed for this token is trailing text.
  if (raw_.start < raw_.end) {
    data_.end = raw_.end;
    return emit(TokenType::TEXT_TOKEN);
  }

  return emit(TokenType::ERROR_TOKEN);
}
959
960
0
std::string_view Tokenizer::Raw() {
961
0
  int size = raw_.end - raw_.start;
962
0
  return buffer_.substr(raw_.start, size);
963
0
}
964
965
1.67M
std::string Tokenizer::Text() {
966
1.67M
  switch (token_type_) {
967
628k
    case TokenType::TEXT_TOKEN:
968
1.67M
    case TokenType::COMMENT_TOKEN:
969
1.67M
    case TokenType::DOCTYPE_TOKEN: {
970
1.67M
      int size = data_.end - data_.start;
971
1.67M
      std::string s(buffer_.substr(data_.start, size));
972
1.67M
      data_.start = raw_.end;
973
1.67M
      data_.end = raw_.end;
974
1.67M
      Strings::ConvertNewLines(&s);
975
1.67M
      if (convert_null_ || token_type_ == TokenType::COMMENT_TOKEN) {
976
        // Replace \x00 with \ufffd.
977
1.06M
        Strings::ReplaceAny(&s,
978
1.06M
                            Strings::kNullChar,
979
1.06M
                            Strings::kNullReplacementChar);
980
1.06M
      }
981
1.67M
      if (!text_is_raw_) Strings::UnescapeString(&s, false);
982
1.67M
      return s;
983
1.67M
    }
984
0
    default:
985
0
      break;
986
1.67M
  }
987
988
0
  return "";
989
1.67M
}
990
991
13.4M
std::optional<std::tuple<std::string, bool>> Tokenizer::TagName() {
992
13.4M
  if (data_.start < data_.end) {
993
13.4M
    switch (token_type_) {
994
13.1M
      case TokenType::START_TAG_TOKEN:
995
13.4M
      case TokenType::END_TAG_TOKEN:
996
13.4M
      case TokenType::SELF_CLOSING_TAG_TOKEN: {
997
13.4M
        int size = data_.end - data_.start;
998
13.4M
        std::string s(buffer_.substr(data_.start, size));
999
13.4M
        data_.start = raw_.end;
1000
13.4M
        data_.end = raw_.end;
1001
13.4M
        Strings::ToLower(&s);
1002
13.4M
        return std::make_tuple<std::string, bool>(std::move(s),
1003
13.4M
            n_attributes_returned_ < attributes_.size());
1004
13.4M
      }
1005
0
      default:
1006
0
        break;
1007
13.4M
    }
1008
13.4M
  }
1009
1010
0
  return std::nullopt;
1011
13.4M
}
1012
1013
599k
std::optional<std::tuple<Attribute, bool>> Tokenizer::TagAttr() {
1014
599k
  if (n_attributes_returned_ < attributes_.size()) {
1015
599k
    switch (token_type_) {
1016
593k
      case TokenType::START_TAG_TOKEN:
1017
599k
      case TokenType::SELF_CLOSING_TAG_TOKEN: {
1018
599k
        auto attr = attributes_[n_attributes_returned_];
1019
599k
        n_attributes_returned_++;
1020
599k
        int size = std::get<0>(attr).end - std::get<0>(attr).start;
1021
599k
        std::string key(buffer_.substr(std::get<0>(attr).start, size));
1022
599k
        int value_size = std::get<1>(attr).end - std::get<1>(attr).start;
1023
599k
        std::string val(buffer_.substr(std::get<1>(attr).start, value_size));
1024
599k
        Strings::ToLower(&key);
1025
599k
        Strings::ConvertNewLines(&val);
1026
599k
        Strings::UnescapeString(&val, true);
1027
599k
        return std::make_tuple<Attribute, bool>(
1028
599k
            {.name_space = "",
1029
599k
             .key = std::move(key),
1030
599k
             .value = std::move(val),
1031
599k
             .line_col_in_html_src = std::get<LineCol>(attr)},
1032
599k
            n_attributes_returned_ < attributes_.size());
1033
593k
      }
1034
0
      default:
1035
0
        break;
1036
599k
    }
1037
599k
  }
1038
1039
0
  return std::nullopt;
1040
599k
}
1041
1042
15.1M
// Builds a Token describing the token most recently produced by Next().
//
// - Text, comment and doctype tokens carry their processed text in `data`.
// - Tag tokens carry the tag's atom; `data` holds the tag name only when the
//   name maps to Atom::UNKNOWN. All attributes that TagAttr() has not yet
//   returned are drained into `attributes`.
// - Error tokens carry nothing beyond their type.
//
// token_line_col_ is refreshed for text, comment and doctype tokens and is
// copied into the returned token for every token type.
Token Tokenizer::token() {
  Token t;
  t.token_type = token_type_;
  switch (token_type_) {
    case TokenType::TEXT_TOKEN: {
      t.data = Text();
      int line_number = current_line_col_.first;
      // Start column = current column minus the length of the text.
      int column_number = current_line_col_.second - t.data.size();
      // A negative column means the text started on the previous line:
      // shift to that line, where this text belongs.
      if (column_number < 0) {
        if (lines_cols_.size() > 1) {
          const auto& previous_token_linecol =
              lines_cols_[lines_cols_.size() - 2];
          line_number = previous_token_linecol.first;
          column_number =
              previous_token_linecol.second - abs(column_number) + 1;
        } else {
          column_number = 0;
        }
      }
      token_line_col_ = {line_number, column_number};
      break;
    }
    case TokenType::COMMENT_TOKEN:
    case TokenType::DOCTYPE_TOKEN:
      t.data = Text();
      t.is_manufactured = is_token_manufactured_;
      // NOTE(review): unlike the text case above, no previous-line adjustment
      // is applied here, so the column can become negative when the text is
      // longer than the current column -- confirm this is intended.
      token_line_col_ = {current_line_col_.first,
                         current_line_col_.second - t.data.size()};
      break;
    case TokenType::START_TAG_TOKEN:
    case TokenType::SELF_CLOSING_TAG_TOKEN:
    case TokenType::END_TAG_TOKEN: {
      auto tag_name_value = TagName();
      if (tag_name_value.has_value()) {
        std::string& tag_name = std::get<0>(*tag_name_value);
        const bool has_attributes = std::get<1>(*tag_name_value);
        const Atom atom = AtomUtil::ToAtom(tag_name);
        t.atom = atom;
        if (atom == Atom::UNKNOWN) {
          // Unknown tags keep their textual name. The optional is a local
          // that is not used again, so the string is moved, not copied.
          t.data = std::move(tag_name);
        }
        if (has_attributes) {
          while (true) {
            auto a = TagAttr();
            if (!a.has_value()) break;
            // Read the flag before moving the attribute out of the tuple.
            const bool more_attributes = std::get<bool>(*a);
            t.attributes.push_back(std::move(std::get<Attribute>(*a)));
            if (!more_attributes) break;
          }
        }
      }
      break;
    }
    case TokenType::ERROR_TOKEN:
      // Ignore.
      break;
  }

  t.line_col_in_html_src = token_line_col_;
  return t;
}
1106
1107
}  // namespace htmlparser