Coverage Report

Created: 2025-07-23 06:45

/proc/self/cwd/cpp/htmlparser/tokenizer.cc
Line
Count
Source (jump to first uncovered line)
1
#include "cpp/htmlparser/tokenizer.h"
2
3
#include "absl/flags/flag.h"
4
#include "cpp/htmlparser/atom.h"
5
#include "cpp/htmlparser/atomutil.h"
6
#include "cpp/htmlparser/defer.h"
7
#include "cpp/htmlparser/strings.h"
8
9
ABSL_FLAG(std::size_t, htmlparser_max_attributes_per_node,
10
          1000,
11
          "Protects out of memory errors by dropping insanely large amounts "
12
          "of attributes per node.");
13
14
namespace htmlparser {
15
16
Tokenizer::Tokenizer(std::string_view html, std::string context_tag) :
17
11.5k
    buffer_(html) {
18
11.5k
  lines_cols_.push_back(std::make_pair(1, 0));
19
11.5k
  current_line_col_ = std::make_pair(1, 0);
20
11.5k
  token_line_col_ = std::make_pair(1, 0);
21
11.5k
  if (!context_tag.empty()) {
22
0
    Strings::ToLower(&context_tag);
23
0
    if (std::find(kAllowedFragmentContainers.begin(),
24
0
                  kAllowedFragmentContainers.end(),
25
0
                  AtomUtil::ToAtom(context_tag)) !=
26
0
        kAllowedFragmentContainers.end()) {
27
0
      raw_tag_ = context_tag;
28
0
    }
29
0
  }
30
11.5k
}
31
32
172M
inline char Tokenizer::ReadByte() {
33
172M
  if (raw_.end >= buffer_.size()) {
34
11.5k
    eof_ = true;
35
11.5k
    return 0;
36
11.5k
  }
37
38
172M
  char c = buffer_.at(raw_.end++);
39
172M
  current_line_col_.second++;
40
172M
  int multi_byte = Strings::CodePointByteSequenceCount(c);
41
172M
  if (multi_byte > 1) {
42
12.4M
    current_line_col_.second -= (multi_byte - 1);
43
12.4M
  }
44
45
172M
  if (c == '\n' || (c == '\r' &&
46
172M
                    raw_.end < buffer_.size() &&
47
172M
                    buffer_.at(raw_.end) != '\n')) {
48
14.6M
    lines_cols_.back() = current_line_col_;
49
    // Increment line number and reset column number.
50
14.6M
    current_line_col_.first++;
51
14.6M
    current_line_col_.second = 0;
52
14.6M
    lines_cols_.push_back({current_line_col_.first + 1, 0});
53
14.6M
  }
54
55
172M
  return c;
56
172M
}
57
58
45.6M
inline void Tokenizer::UnreadByte() {
59
45.6M
  raw_.end--;
60
45.6M
  if (current_line_col_.first > 1 && current_line_col_.second == 0) {
61
2.64M
    if (lines_cols_.size() > 1) {
62
2.64M
      lines_cols_.pop_back();
63
2.64M
    }
64
2.64M
    current_line_col_ = lines_cols_.back();
65
2.64M
    return;
66
2.64M
  }
67
68
42.9M
  current_line_col_.second--;
69
42.9M
}
70
71
19.2M
void Tokenizer::SkipWhiteSpace() {
72
23.2M
  while (!eof_) {
73
23.2M
    char c = ReadByte();
74
23.2M
    switch (c) {
75
381
      case ' ':
76
74.5k
      case '\n':
77
3.93M
      case '\r':
78
3.93M
      case '\t':
79
4.03M
      case '\f':
80
4.03M
        break;
81
19.2M
      default:
82
19.2M
        UnreadByte();
83
19.2M
        return;
84
23.2M
    }
85
23.2M
  }
86
19.2M
}
87
88
15.3M
void Tokenizer::SetAllowCDATA(bool allow_cdata) {
89
15.3M
  allow_cdata_ = allow_cdata;
90
15.3M
}
91
92
354k
void Tokenizer::NextIsNotRawText() {
93
354k
  raw_tag_ = "";
94
354k
}
95
96
23.7k
void Tokenizer::ReadRawOrRCDATA() {
97
23.7k
  if (raw_tag_ == "script") {
98
2.65k
    ReadScript();
99
2.65k
    text_is_raw_ = true;
100
2.65k
    raw_tag_ = "";
101
2.65k
    return;
102
2.65k
  }
103
104
404k
  while (!eof_) {
105
404k
    char c = ReadByte();
106
404k
    if (eof_) break;
107
404k
    if (c != '<') continue;
108
99.0k
    c = ReadByte();
109
99.0k
    if (eof_) break;
110
99.0k
    if (c != '/') continue;
111
74.4k
    if (ReadRawEndTag() || eof_) break;
112
74.4k
  }
113
114
21.1k
  data_.end = raw_.end;
115
  // A textarea's or title's RCDATA can contain escaped entities.
116
21.1k
  text_is_raw_ = raw_tag_ != "textarea" && raw_tag_ != "title";
117
21.1k
  raw_tag_ = "";
118
21.1k
}
119
120
114k
bool Tokenizer::ReadRawEndTag() {
121
421k
  for (std::size_t i = 0; i < raw_tag_.size(); ++i) {
122
364k
    char c = ReadByte();
123
364k
    if (eof_) return false;
124
364k
    if (c != raw_tag_.at(i) && c != (raw_tag_.at(i) - ('a' - 'A'))) {
125
57.1k
      UnreadByte();
126
57.1k
      return false;
127
57.1k
    }
128
364k
  }
129
130
57.0k
  char c = ReadByte();
131
57.0k
  if (eof_) return false;
132
57.0k
  switch (c) {
133
1.70k
    case ' ':
134
1.80k
    case '\n':
135
1.82k
    case '\t':
136
1.83k
    case '\f':
137
1.85k
    case '/':
138
25.2k
    case '>':
139
      // The 3 is 2 for the leading "</" plus 1 for the trailing character c.
140
25.2k
      raw_.end -= (3 /* <, /, and > */+ raw_tag_.size());
141
25.2k
      current_line_col_.second -= (3 /* <, /, and > */ + raw_tag_.size());
142
25.2k
      return true;
143
57.0k
  }
144
31.7k
  UnreadByte();
145
31.7k
  return false;
146
57.0k
}
147
148
enum ScriptDataState {
149
  DONE = 0,
150
  SCRIPT_DATA = 1,
151
  SCRIPT_DATA_LESS_THAN_SIGN = 2,
152
  SCRIPT_DATA_END_TAG_OPEN = 3,
153
  SCRIPT_DATA_ESCAPE_START = 4,
154
  SCRIPT_DATA_ESCAPE_START_DASH = 5,
155
  SCRIPT_DATA_ESCAPED = 6,
156
  SCRIPT_DATA_ESCAPED_DASH = 7,
157
  SCRIPT_DATA_ESCAPED_DASH_DASH = 8,
158
  SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 9,
159
  SCRIPT_DATA_ESCAPED_END_TAG_OPEN = 10,
160
  SCRIPT_DATA_DOUBLE_ESCAPE_START = 11,
161
  SCRIPT_DATA_DOUBLE_ESCAPED = 12,
162
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH  = 13,
163
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH  = 14,
164
  SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 15,
165
  SCRIPT_DATA_DOUBLE_ESCAPED_END = 16
166
};
167
168
2.65k
void Tokenizer::ReadScript() {
169
2.65k
  defer({data_.end = raw_.end;});
170
2.65k
  ScriptDataState state = ScriptDataState::SCRIPT_DATA;
171
12.0M
  while (!eof_ && state != ScriptDataState::DONE) {
172
12.0M
    switch (state) {
173
4.92M
      case ScriptDataState::SCRIPT_DATA: {
174
4.92M
        char c = ReadByte();
175
4.92M
        if (eof_) return;
176
4.92M
        if (c == '<') {
177
1.44M
          state = ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN;
178
3.48M
        } else {
179
3.48M
          state = ScriptDataState::SCRIPT_DATA;
180
3.48M
        }
181
4.92M
        break;
182
4.92M
      }
183
1.44M
      case ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN: {
184
1.44M
        char c = ReadByte();
185
1.44M
        if (eof_) return;
186
1.44M
        if (c == '/') {
187
36.0k
          state = ScriptDataState::SCRIPT_DATA_END_TAG_OPEN;
188
1.40M
        } else if (c == '!') {
189
615k
          state = ScriptDataState::SCRIPT_DATA_ESCAPE_START;
190
790k
        } else {
191
790k
          UnreadByte();
192
790k
          state = ScriptDataState::SCRIPT_DATA;
193
790k
        }
194
1.44M
        break;
195
1.44M
      }
196
36.0k
      case ScriptDataState::SCRIPT_DATA_END_TAG_OPEN: {
197
36.0k
        if (ReadRawEndTag() || eof_) {
198
1.51k
          return;
199
1.51k
        }
200
34.5k
        state = ScriptDataState::SCRIPT_DATA;
201
34.5k
        break;
202
36.0k
      }
203
615k
      case ScriptDataState::SCRIPT_DATA_ESCAPE_START: {
204
615k
        char c = ReadByte();
205
615k
        if (eof_) return;
206
615k
        if (c == '-') {
207
613k
          state = ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH;
208
613k
        } else {
209
2.22k
          UnreadByte();
210
2.22k
          state = ScriptDataState::SCRIPT_DATA;
211
2.22k
        }
212
615k
        break;
213
615k
      }
214
613k
      case ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH: {
215
613k
        char c = ReadByte();
216
613k
        if (eof_) return;
217
613k
        if (c == '-') {
218
306k
          state = SCRIPT_DATA_ESCAPED_DASH_DASH;
219
306k
        } else {
220
306k
          UnreadByte();
221
306k
          state = ScriptDataState::SCRIPT_DATA;
222
306k
        }
223
613k
        break;
224
613k
      }
225
1.53M
      case ScriptDataState::SCRIPT_DATA_ESCAPED: {
226
1.53M
        char c = ReadByte();
227
1.53M
        if (eof_) return;
228
1.53M
        if (c == '-') {
229
591k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH;
230
946k
        } else if (c == '<') {
231
23.3k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
232
923k
        } else {
233
923k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
234
923k
        }
235
1.53M
        break;
236
1.53M
      }
237
591k
      case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH: {
238
591k
        char c = ReadByte();
239
591k
        if (eof_) return;
240
591k
        if (c == '-') {
241
2.14k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH;
242
589k
        } else if (c == '<') {
243
586k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
244
586k
        } else {
245
2.70k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
246
2.70k
        }
247
591k
        break;
248
591k
      }
249
309k
      case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH: {
250
309k
        char c = ReadByte();
251
309k
        if (eof_) return;
252
309k
        if (c == '-') {
253
434
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH;
254
309k
        } else if (c == '<') {
255
3.74k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
256
305k
        } else if (c == '>') {
257
66
          state = ScriptDataState::SCRIPT_DATA;
258
305k
        } else {
259
305k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
260
305k
        }
261
309k
        break;
262
309k
      }
263
613k
      case ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: {
264
613k
        char c = ReadByte();
265
613k
        if (eof_) return;
266
613k
        if (c == '/') {
267
508
          state = ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN;
268
613k
        } else if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) {
269
307k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START;
270
307k
        } else {
271
305k
          UnreadByte();
272
305k
          state = ScriptDataState::SCRIPT_DATA;
273
305k
        }
274
613k
        break;
275
613k
      }
276
508
      case ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN: {
277
508
        if (ReadRawEndTag()) {
278
408
          state = ScriptDataState::DONE;
279
408
        } else {
280
100
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
281
100
        }
282
508
        break;
283
613k
      }
284
307k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START: {
285
307k
        UnreadByte();
286
307k
        static std::string script_tag_l = "script";
287
307k
        static std::string script_tag_u = "SCRIPT";
288
2.15M
        for (int8_t i = 0; i < 6 /*script*/; ++i) {
289
1.84M
          char c = ReadByte();
290
1.84M
          if (eof_) return;
291
1.84M
          if (c != script_tag_l[i] && c != script_tag_u[i]) {
292
1.83M
            UnreadByte();
293
1.83M
            state = ScriptDataState::SCRIPT_DATA_ESCAPED;
294
1.83M
          }
295
1.84M
        }
296
307k
        char c = ReadByte();
297
307k
        if (eof_) return;
298
307k
        if (c == ' ' || c == '\n' || c == '\r' || c == '\t'  || c == '\f'
299
307k
            || c == '/' || c == '>') {
300
3.34k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
301
304k
        } else {
302
304k
          UnreadByte();
303
304k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
304
304k
        }
305
307k
        break;
306
307k
      }
307
606k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED: {
308
606k
        char c = ReadByte();
309
606k
        if (eof_) return;
310
606k
        if (c == '-') {
311
204k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH;
312
402k
        } else if (c == '<') {
313
85.7k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
314
316k
        } else {
315
316k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
316
316k
        }
317
606k
        break;
318
606k
      }
319
204k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH: {
320
204k
        char c = ReadByte();
321
204k
        if (eof_) return;
322
204k
        if (c == '-') {
323
38.9k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
324
165k
        } else if (c == '<') {
325
120k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
326
120k
        } else if (c == '>') {
327
567
          state = ScriptDataState::SCRIPT_DATA;
328
44.1k
        } else {
329
44.1k
          state = SCRIPT_DATA_DOUBLE_ESCAPED;
330
44.1k
        }
331
204k
        break;
332
204k
      }
333
39.1k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: {
334
39.1k
        char c = ReadByte();
335
39.1k
        if (eof_) return;
336
39.1k
        if (c == '-') {
337
216
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
338
38.9k
        } else if (c == '<') {
339
1.23k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
340
37.6k
        } else if (c == '>') {
341
66
          state = ScriptDataState::SCRIPT_DATA;
342
37.6k
        } else {
343
37.6k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
344
37.6k
        }
345
39.1k
        break;
346
39.1k
      }
347
207k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: {
348
207k
        char c = ReadByte();
349
207k
        if (eof_) return;
350
207k
        if (c == '/') {
351
3.27k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END;
352
204k
        } else {
353
204k
          UnreadByte();
354
204k
          state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
355
204k
        }
356
207k
        break;
357
207k
      }
358
3.27k
      case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END: {
359
3.27k
        if (ReadRawEndTag()) {
360
2.47k
          raw_.end += std::string("</script>").size();
361
2.47k
          state = ScriptDataState::SCRIPT_DATA_ESCAPED;
362
2.47k
        } else {
363
796
          if (eof_) return;
364
791
          state  = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED;
365
791
        }
366
3.26k
        break;
367
3.27k
      }
368
3.26k
      default:
369
0
        break;
370
12.0M
    }
371
12.0M
  }
372
2.65k
}
373
374
1.25k
void Tokenizer::ReadComment() {
375
1.25k
  data_.start = raw_.end;
376
1.25k
  defer({
377
1.25k
    if (data_.end < data_.start) {
378
      // It's a comment with no data, like <!-->
379
1.25k
      data_.end = data_.start;
380
1.25k
    }
381
1.25k
  });
382
1.25k
  int dash_count = 2;
383
5.29M
  while (!eof_) {
384
5.29M
    char c = ReadByte();
385
5.29M
    if (eof_) {
386
      // Ignore up to two dashes at EOF.
387
79
      if (dash_count > 2) {
388
10
        dash_count = 2;
389
10
      }
390
79
      data_.end = raw_.end - dash_count;
391
79
      return;
392
79
    }
393
5.29M
    if (c == '-') {
394
1.33M
      dash_count++;
395
1.33M
      continue;
396
3.96M
    } else if (c == '>') {
397
7.47k
      if (dash_count >= 2) {
398
333
        data_.end = raw_.end - 3 /* --> */;
399
333
        return;
400
333
      }
401
3.95M
    } else if (c == '!') {
402
726k
      if (dash_count >= 2) {
403
1.04k
        char c = ReadByte();
404
1.04k
        if (eof_) {
405
14
          data_.end = raw_.end;
406
14
          return;
407
14
        }
408
1.02k
        if (c == '>') {
409
828
          data_.end = raw_.end - 4 /* --!> */;
410
828
          return;
411
828
        }
412
1.02k
      }
413
726k
    }
414
3.96M
    dash_count = 0;
415
3.96M
  }
416
1.25k
}
417
418
1.39M
void Tokenizer::ReadUntilCloseAngle() {
419
1.39M
  data_.start = raw_.end;
420
3.88M
  while (!eof_) {
421
3.88M
    char c = ReadByte();
422
3.88M
    if (eof_) {
423
404
      data_.end = raw_.end;
424
404
      return;
425
404
    }
426
3.88M
    if (c == '>') {
427
1.39M
      data_.end = raw_.end - 1 /* ">" */;
428
1.39M
      return;
429
1.39M
    }
430
3.88M
  }
431
1.39M
}
432
433
1.39M
TokenType Tokenizer::ReadMarkupDeclaration() {
434
1.39M
  data_.start = raw_.end;
435
1.39M
  char c[2];
436
4.17M
  for (int i = 0; i < 2; ++i) {
437
2.78M
    c[i] = ReadByte();
438
2.78M
    if (eof_) {
439
45
      data_.end = raw_.end;
440
45
      return TokenType::COMMENT_TOKEN;
441
45
    }
442
2.78M
  }
443
444
1.39M
  if (c[0] == '-' && c[1] == '-') {
445
1.25k
    ReadComment();
446
1.25k
    return TokenType::COMMENT_TOKEN;
447
1.25k
  }
448
449
1.39M
  UnreadByte();
450
1.39M
  UnreadByte();
451
1.39M
  if (ReadDoctype()) {
452
107
    return TokenType::DOCTYPE_TOKEN;
453
107
  }
454
455
1.39M
  if (allow_cdata_ && ReadCDATA()) {
456
64
    convert_null_ = true;
457
64
    return TokenType::TEXT_TOKEN;
458
64
  }
459
460
  // It's a bogus comment.
461
1.39M
  ReadUntilCloseAngle();
462
1.39M
  return TokenType::COMMENT_TOKEN;
463
1.39M
}
464
465
1.39M
bool Tokenizer::ReadDoctype() {
466
1.39M
  token_line_col_ = {current_line_col_.first,
467
1.39M
                     current_line_col_.second - 2 /* <! */};
468
469
1.39M
  static constexpr std::string_view kDoctype = "DOCTYPE";
470
1.39M
  for (std::size_t i = 0; i < kDoctype.size(); ++i) {
471
1.39M
    char c = ReadByte();
472
1.39M
    if (eof_) {
473
5
      data_.end = raw_.end;
474
5
      return false;
475
5
    }
476
1.39M
    if (c != kDoctype.at(i) && c != (kDoctype.at(i) + ('a' - 'A'))) {
477
      // Back up to read the fragment of "DOCTYPE" again.
478
1.39M
      raw_.end = data_.start;
479
1.39M
      return false;
480
1.39M
    }
481
1.39M
  }
482
483
107
  SkipWhiteSpace();
484
107
  if (eof_) {
485
2
    data_.start = raw_.end;
486
2
    data_.end = raw_.end;
487
2
    return true;
488
2
  }
489
490
105
  ReadUntilCloseAngle();
491
105
  return true;
492
107
}
493
494
177
bool Tokenizer::ReadCDATA() {
495
177
  static constexpr std::string_view kCData = "[CDATA[";
496
823
  for (std::size_t i = 0; i < kCData.size(); ++i) {
497
759
    char c = ReadByte();
498
759
    if (eof_) {
499
8
      data_.end = raw_.end;
500
8
      return false;
501
8
    }
502
751
    if (c != kCData[i]) {
503
      // Back up to read the fragment of "[CDATA[" again.
504
105
      data_.end = raw_.start;
505
105
      return false;
506
105
    }
507
751
  }
508
64
  data_.start = raw_.end;
509
64
  int brackets = 0;
510
4.05k
  while (!eof_) {
511
4.05k
    char c = ReadByte();
512
4.05k
    if (eof_) {
513
27
      data_.end = raw_.end;
514
27
      return true;
515
27
    }
516
4.02k
    switch (c) {
517
841
      case ']': {
518
841
        brackets++;
519
841
        break;
520
0
      }
521
1.01k
      case '>': {
522
1.01k
        if (brackets >= 2) {
523
37
          data_.end = raw_.end - 3 /* "]]>" */;
524
37
          return true;
525
37
        }
526
976
        brackets = 0;
527
976
        break;
528
1.01k
      }
529
2.17k
      default:
530
2.17k
        brackets = 0;
531
4.02k
    }
532
4.02k
  }
533
0
  return false;
534
64
}
535
536
template<typename... Args>
537
6.28M
bool Tokenizer::StartTagIn(Args... ss) {
538
6.28M
  std::vector<std::string> argsList{ss...};
539
11.9M
  for (const auto& s : argsList) {
540
11.9M
    if (data_.end - data_.start != s.size()) continue;
541
1.41M
    bool matched = true;
542
3.21M
    for (std::size_t i = 0; i < s.size(); ++i) {
543
3.19M
      char c = buffer_.at(data_.start + i);
544
3.19M
      if ('A' <= c && c <= 'Z') {
545
2.95M
        c += 'a' - 'A';
546
2.95M
      }
547
3.19M
      if (c != s[i]) {
548
1.39M
        matched = false;
549
1.39M
        break;
550
1.39M
      }
551
3.19M
    }
552
1.41M
    if (matched) {
553
24.1k
      return true;
554
24.1k
    }
555
1.41M
  }
556
6.26M
  return false;
557
6.28M
}
bool htmlparser::Tokenizer::StartTagIn<char const*>(char const*)
Line
Count
Source
537
673k
bool Tokenizer::StartTagIn(Args... ss) {
538
673k
  std::vector<std::string> argsList{ss...};
539
673k
  for (const auto& s : argsList) {
540
673k
    if (data_.end - data_.start != s.size()) continue;
541
5.53k
    bool matched = true;
542
21.6k
    for (std::size_t i = 0; i < s.size(); ++i) {
543
16.6k
      char c = buffer_.at(data_.start + i);
544
16.6k
      if ('A' <= c && c <= 'Z') {
545
10.6k
        c += 'a' - 'A';
546
10.6k
      }
547
16.6k
      if (c != s[i]) {
548
469
        matched = false;
549
469
        break;
550
469
      }
551
16.6k
    }
552
5.53k
    if (matched) {
553
5.06k
      return true;
554
5.06k
    }
555
5.53k
  }
556
668k
  return false;
557
673k
}
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*, char const*>(char const*, char const*, char const*)
Line
Count
Source
537
91.2k
bool Tokenizer::StartTagIn(Args... ss) {
538
91.2k
  std::vector<std::string> argsList{ss...};
539
273k
  for (const auto& s : argsList) {
540
273k
    if (data_.end - data_.start != s.size()) continue;
541
156k
    bool matched = true;
542
607k
    for (std::size_t i = 0; i < s.size(); ++i) {
543
606k
      char c = buffer_.at(data_.start + i);
544
606k
      if ('A' <= c && c <= 'Z') {
545
582k
        c += 'a' - 'A';
546
582k
      }
547
606k
      if (c != s[i]) {
548
155k
        matched = false;
549
155k
        break;
550
155k
      }
551
606k
    }
552
156k
    if (matched) {
553
558
      return true;
554
558
    }
555
156k
  }
556
90.7k
  return false;
557
91.2k
}
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*>(char const*, char const*)
Line
Count
Source
537
5.52M
bool Tokenizer::StartTagIn(Args... ss) {
538
5.52M
  std::vector<std::string> argsList{ss...};
539
11.0M
  for (const auto& s : argsList) {
540
11.0M
    if (data_.end - data_.start != s.size()) continue;
541
1.25M
    bool matched = true;
542
2.58M
    for (std::size_t i = 0; i < s.size(); ++i) {
543
2.56M
      char c = buffer_.at(data_.start + i);
544
2.56M
      if ('A' <= c && c <= 'Z') {
545
2.36M
        c += 'a' - 'A';
546
2.36M
      }
547
2.56M
      if (c != s[i]) {
548
1.23M
        matched = false;
549
1.23M
        break;
550
1.23M
      }
551
2.56M
    }
552
1.25M
    if (matched) {
553
18.4k
      return true;
554
18.4k
    }
555
1.25M
  }
556
5.50M
  return false;
557
5.52M
}
558
559
12.9M
TokenType Tokenizer::ReadStartTag(bool template_mode) {
560
12.9M
  token_line_col_ = {current_line_col_.first,
561
12.9M
                     current_line_col_.second - 1 /* < */};
562
12.9M
  ReadTag(true, template_mode);
563
564
12.9M
  if (eof_) {
565
306
    return TokenType::ERROR_TOKEN;
566
306
  }
567
568
  // Several tags flag the tokenizer's next token as raw.
569
12.9M
  bool raw = false;
570
12.9M
  char c = buffer_.at(data_.start);
571
572
  // Lowercase.
573
12.9M
  if ('A' <= c && c <= 'Z') {
574
6.39M
    c += 'a' - 'A';
575
6.39M
  }
576
577
12.9M
  switch (c) {
578
644k
    case 'i':
579
644k
      raw = StartTagIn("iframe");
580
644k
      break;
581
91.2k
    case 'n':
582
91.2k
      raw = StartTagIn("noembed", "noframes", "noscript");
583
91.2k
      break;
584
20.1k
    case 'p':
585
20.1k
      raw = StartTagIn("plaintext");
586
20.1k
      break;
587
3.13M
    case 's':
588
3.13M
      raw = StartTagIn("script", "style");
589
3.13M
      break;
590
2.39M
    case 't':
591
2.39M
      raw = StartTagIn("textarea", "title");
592
2.39M
      break;
593
8.68k
    case 'x':
594
8.68k
      raw = StartTagIn("xmp");
595
12.9M
  }
596
597
12.9M
  if (raw) {
598
24.1k
    int size = data_.end - data_.start;
599
24.1k
    raw_tag_ = std::string(buffer_.substr(data_.start, size));
600
24.1k
    Strings::ToLower(&raw_tag_);
601
24.1k
  }
602
603
  // Look for a self-closing token like "<br/>".
604
12.9M
  if (!eof_ && buffer_[raw_.end - 2] == '/') {
605
10.7k
    return TokenType::SELF_CLOSING_TAG_TOKEN;
606
10.7k
  }
607
608
12.9M
  return TokenType::START_TAG_TOKEN;
609
12.9M
}
610
611
13.3M
void Tokenizer::ReadTag(bool save_attr, bool template_mode) {
612
13.3M
  attributes_.clear();
613
13.3M
  n_attributes_returned_ = 0;
614
615
  // Read the tag name and attribute key/value pairs.
616
13.3M
  ReadTagName();
617
13.3M
  SkipWhiteSpace();
618
619
13.3M
  if (eof_) {
620
136
    return;
621
136
  }
622
623
16.2M
  while (!eof_) {
624
16.2M
    char c = ReadByte();
625
16.2M
    if (eof_ || c == '>') {
626
13.3M
      break;
627
13.3M
    }
628
629
    // Undo previous > read.
630
2.93M
    UnreadByte();
631
632
2.93M
    ReadTagAttributeKey(template_mode);
633
2.93M
    ReadTagAttributeValue();
634
    // Save pending_attribute if save_attr and that attribute has a non-empty
635
    // key.
636
2.93M
    if (save_attr &&
637
        // Skip excessive attributes.
638
2.93M
        attributes_.size() < ::absl::GetFlag(
639
2.81M
            FLAGS_htmlparser_max_attributes_per_node) &&
640
2.93M
        std::get<0>(pending_attribute_).start !=
641
512k
        std::get<0>(pending_attribute_).end) {
642
430k
      attributes_.push_back(pending_attribute_);
643
430k
    }
644
2.93M
    SkipWhiteSpace();
645
2.93M
  }
646
13.3M
}
647
648
13.3M
void Tokenizer::ReadTagName() {
649
13.3M
  data_.start = raw_.end - 1;
650
26.8M
  while (!eof_) {
651
26.8M
    char c = ReadByte();
652
26.8M
    if (eof_) {
653
75
      data_.end = raw_.end;
654
75
      return;
655
75
    }
656
26.8M
    switch (c) {
657
70.2k
      case ' ':
658
72.1k
      case '\n':
659
149k
      case '\r':
660
150k
      case '\t':
661
150k
      case '\f':
662
150k
        data_.end = raw_.end - 1;
663
150k
        return;
664
120k
      case '/':
665
13.1M
      case '>':
666
13.1M
        UnreadByte();
667
13.1M
        data_.end = raw_.end;
668
13.1M
        return;
669
26.8M
    }
670
26.8M
  }
671
13.3M
}
672
673
// Sets pending_attribute_[0] to the "k" in "<div k=v>".
674
// Precondition: eof_ != true;
675
2.93M
void Tokenizer::ReadTagAttributeKey(bool template_mode) {
676
2.93M
  std::get<0>(pending_attribute_).start = raw_.end;
677
2.93M
  std::get<LineCol>(pending_attribute_) =
678
2.93M
      {current_line_col_.first, current_line_col_.second + 1};
679
680
  // All mustache_ prefixed variables applies to parsing logic for AMP mustache
681
  // templates. See: https://amp.dev/documentation/components/amp-mustache/
682
2.93M
  bool mustache_inside_section_block = false;
683
2.93M
  std::string mustache_section_name = "";
684
685
32.5M
  while (!eof_) {
686
32.5M
    char c = ReadByte();
687
32.5M
    if (eof_) {
688
99
      std::get<0>(pending_attribute_).start = raw_.end;
689
99
      return;
690
99
    }
691
692
    // Template attributes processing.
693
    // Looks for following special syntax.
694
    // {{#section}}...{{/section}}
695
    // {{^section}}...{{/section}}
696
    // {{variable}}
697
32.5M
    if (template_mode) {
698
0
      UnreadByte();
699
0
      UnreadByte();
700
0
      UnreadByte();
701
0
      char c1 = ReadByte();
702
0
      char c2 = ReadByte();
703
0
      c = ReadByte();
704
0
      if (mustache_inside_section_block && c1 == '{' && c2 == '{' && c == '/') {
705
        // Look for closing section name. If not resort to default behavior.
706
        // Reason for this logic is to differentiate between:
707
        // <p {{#mycondition}}class=foo{{/mycondition}} foo=bar> vs.
708
        // <img {{#mycondition}}class=foo />
709
0
        int raw_end = raw_.end;
710
0
        std::string_view close_section =
711
0
            buffer_.substr(raw_.end, mustache_section_name.size());
712
0
        bool section_name_match = close_section == mustache_section_name;
713
0
        if (section_name_match) {
714
0
          raw_.end += mustache_section_name.size();
715
0
          char e1 = ReadByte();
716
0
          char e2 = ReadByte();
717
0
          if (e1 == '}' && e2 == '}') {
718
0
            mustache_inside_section_block = false;
719
0
            continue;
720
0
          } else {
721
0
            raw_.end = raw_end;
722
0
          }
723
0
        }
724
0
      }
725
726
0
      if (c1 == '{' && c2 == '{' && (c == '#' || c == '^')) {
727
0
        auto n = buffer_.find("}}", raw_.end);
728
0
        if (n != std::string_view::npos) {
729
0
          mustache_section_name = buffer_.substr(raw_.end, n - raw_.end);
730
0
          mustache_inside_section_block = true;
731
0
          continue;
732
0
        }
733
0
      }
734
0
    }
735
736
32.5M
    switch (c) {
737
49.5k
      case ' ':
738
59.6k
      case '\n':
739
364k
      case '\r':
740
367k
      case '\t':
741
446k
      case '\f':
742
2.66M
      case '/': {
743
2.66M
        std::get<0>(pending_attribute_).end = raw_.end - 1;
744
2.66M
        return;
745
446k
      }
746
19.7k
      case '=':
747
266k
      case '>': {
748
266k
        UnreadByte();
749
266k
        std::get<0>(pending_attribute_).end = raw_.end;
750
266k
        return;
751
19.7k
      }
752
32.5M
    }
753
32.5M
  }
754
2.93M
}
755
756
// Sets pending_attribute_.second to the "v" in "<div k=v>".
757
2.93M
void Tokenizer::ReadTagAttributeValue() {
758
2.93M
  std::get<1>(pending_attribute_).start = raw_.end;
759
2.93M
  std::get<1>(pending_attribute_).end = raw_.end;
760
2.93M
  SkipWhiteSpace();
761
2.93M
  if (eof_) {
762
148
    return;
763
148
  }
764
2.93M
  char c = ReadByte();
765
2.93M
  if (eof_) {
766
0
    return;
767
0
  }
768
769
2.93M
  if (c != '=') {
770
2.91M
    UnreadByte();
771
2.91M
    return;
772
2.91M
  }
773
774
19.7k
  SkipWhiteSpace();
775
19.7k
  if (eof_) {
776
12
    return;
777
12
  }
778
779
19.7k
  char quote = ReadByte();
780
19.7k
  if (eof_) {
781
0
    return;
782
0
  }
783
784
19.7k
  switch (quote) {
785
726
    case '>':
786
726
      UnreadByte();
787
726
      return;
788
245
    case '\'':
789
255
    case '"':
790
255
      std::get<1>(pending_attribute_).start = raw_.end;
791
453
      while (!eof_) {
792
453
        c = ReadByte();
793
453
        if (eof_) {
794
14
          std::get<1>(pending_attribute_).end = raw_.end;
795
14
          return;
796
14
        }
797
439
        if (c == quote) {
798
241
          std::get<1>(pending_attribute_).end = raw_.end - 1;
799
241
          return;
800
241
        }
801
439
      }
802
0
      break;
803
18.8k
    default: {
804
18.8k
      std::get<1>(pending_attribute_).start = raw_.end - 1;
805
2.35M
      while (!eof_) {
806
2.35M
        c = ReadByte();
807
2.35M
        if (eof_) {
808
2
          std::get<1>(pending_attribute_).end = raw_.end;
809
2
          return;
810
2
        }
811
2.35M
        switch (c) {
812
995
          case ' ':
813
1.20k
          case '\n':
814
1.97k
          case '\r':
815
2.42k
          case '\t':
816
7.40k
          case '\f':
817
7.40k
            std::get<1>(pending_attribute_).end = raw_.end - 1;
818
7.40k
            return;
819
11.3k
          case '>':
820
11.3k
            UnreadByte();
821
11.3k
            std::get<1>(pending_attribute_).end = raw_.end;
822
11.3k
            return;
823
2.35M
        }
824
2.35M
      }
825
18.8k
    }
826
19.7k
  }
827
19.7k
}
828
829
15.3M
// Scans forward to the next token and returns its type, which is also stored
// in token_type_.
//
// Text that precedes a tag, comment or doctype is returned as its own
// TEXT_TOKEN first; the markup that follows it is produced by the subsequent
// call. Once the end of input has been reached, ERROR_TOKEN is returned.
//
// template_mode is forwarded unchanged to ReadStartTag().
TokenType Tokenizer::Next(bool template_mode) {
  // The new token begins where the previous one stopped.
  raw_.start = raw_.end;
  data_.start = raw_.end;
  data_.end = raw_.end;
  is_token_manufactured_ = false;

  if (eof_) {
    err_ = true;
    token_type_ = TokenType::ERROR_TOKEN;
    return token_type_;
  }

  if (!raw_tag_.empty()) {
    if (raw_tag_ == "plaintext") {
      // Everything up to the end of input belongs to the text.
      while (!eof_) {
        ReadByte();
      }
      data_.end = raw_.end;
      text_is_raw_ = true;
    } else {
      ReadRawOrRCDATA();
    }

    if (data_.end > data_.start) {
      convert_null_ = true;
      token_type_ = TokenType::TEXT_TOKEN;
      return token_type_;
    }
  }

  text_is_raw_ = false;
  convert_null_ = false;

  while (!eof_) {
    char c = ReadByte();
    if (eof_) break;

    // Anything that is not '<' simply accumulates into the text token.
    if (c != '<') continue;

    c = ReadByte();
    if (eof_) break;

    // Decide whether the '<' just read opens a tag, a comment or a doctype.
    // If it does not, it is part of the accumulated text.
    const bool is_start_tag = Strings::IsCharAlphabet(c);
    const bool is_end_tag = !is_start_tag && c == '/';
    const bool is_comment =
        !is_start_tag && !is_end_tag && (c == '!' || c == '?');
    if (!(is_start_tag || is_end_tag || is_comment)) {
      UnreadByte();
      continue;
    }

    // Text may have accumulated in front of the markup. If so, emit that
    // text now and leave the markup for the next call, e.g.
    // <space><space><mytag> yields the two spaces before mytag.
    const int text_end = raw_.end - 2 /* "<a" */;
    if (raw_.start < text_end) {
      raw_.end = text_end;
      data_.end = text_end;
      // The two bytes given back contain no \n, so only the column moves.
      current_line_col_.second -= 2;
      token_type_ = TokenType::TEXT_TOKEN;
      return token_type_;
    }

    if (is_start_tag) {
      token_type_ = ReadStartTag(template_mode);
      return token_type_;
    }

    if (is_comment) {
      if (c == '!') {
        token_type_ = ReadMarkupDeclaration();
        return token_type_;
      }
      is_token_manufactured_ = true;
      // The '?' of "<?" is part of the comment text.
      UnreadByte();
      ReadUntilCloseAngle();
      token_type_ = TokenType::COMMENT_TOKEN;
      return token_type_;
    }

    // Remaining possibility: "</".
    c = ReadByte();
    if (eof_) break;

    if (c == '>') {
      // "</>" does not generate a tag token. An empty comment is produced
      // instead so that passthrough clients can pick up the data via raw_.
      token_type_ = TokenType::COMMENT_TOKEN;
      return token_type_;
    }

    if (Strings::IsCharAlphabet(c)) {
      ReadTag(false);
      token_type_ =
          eof_ ? TokenType::ERROR_TOKEN : TokenType::END_TAG_TOKEN;
      return token_type_;
    }

    // "</" followed by anything else is treated as a comment.
    UnreadByte();
    ReadUntilCloseAngle();
    token_type_ = TokenType::COMMENT_TOKEN;
    return token_type_;
  }

  // Input ended: flush whatever was read as a final text token.
  if (raw_.start < raw_.end) {
    data_.end = raw_.end;
    token_type_ = TokenType::TEXT_TOKEN;
    return token_type_;
  }

  token_type_ = TokenType::ERROR_TOKEN;
  return token_type_;
}
959
960
0
std::string_view Tokenizer::Raw() {
961
0
  int size = raw_.end - raw_.start;
962
0
  return buffer_.substr(raw_.start, size);
963
0
}
964
965
1.97M
std::string Tokenizer::Text() {
966
1.97M
  switch (token_type_) {
967
570k
    case TokenType::TEXT_TOKEN:
968
1.97M
    case TokenType::COMMENT_TOKEN:
969
1.97M
    case TokenType::DOCTYPE_TOKEN: {
970
1.97M
      int size = data_.end - data_.start;
971
1.97M
      std::string s(buffer_.substr(data_.start, size));
972
1.97M
      data_.start = raw_.end;
973
1.97M
      data_.end = raw_.end;
974
1.97M
      Strings::ConvertNewLines(&s);
975
1.97M
      if (convert_null_ || token_type_ == TokenType::COMMENT_TOKEN) {
976
        // Replace \x00 with \ufffd.
977
1.42M
        Strings::ReplaceAny(&s,
978
1.42M
                            Strings::kNullChar,
979
1.42M
                            Strings::kNullReplacementChar);
980
1.42M
      }
981
1.97M
      if (!text_is_raw_) Strings::UnescapeString(&s, false);
982
1.97M
      return s;
983
1.97M
    }
984
0
    default:
985
0
      break;
986
1.97M
  }
987
988
0
  return "";
989
1.97M
}
990
991
13.3M
std::optional<std::tuple<std::string, bool>> Tokenizer::TagName() {
992
13.3M
  if (data_.start < data_.end) {
993
13.3M
    switch (token_type_) {
994
12.9M
      case TokenType::START_TAG_TOKEN:
995
13.3M
      case TokenType::END_TAG_TOKEN:
996
13.3M
      case TokenType::SELF_CLOSING_TAG_TOKEN: {
997
13.3M
        int size = data_.end - data_.start;
998
13.3M
        std::string s(buffer_.substr(data_.start, size));
999
13.3M
        data_.start = raw_.end;
1000
13.3M
        data_.end = raw_.end;
1001
13.3M
        Strings::ToLower(&s);
1002
13.3M
        return std::make_tuple<std::string, bool>(std::move(s),
1003
13.3M
            n_attributes_returned_ < attributes_.size());
1004
13.3M
      }
1005
0
      default:
1006
0
        break;
1007
13.3M
    }
1008
13.3M
  }
1009
1010
0
  return std::nullopt;
1011
13.3M
}
1012
1013
417k
std::optional<std::tuple<Attribute, bool>> Tokenizer::TagAttr() {
1014
417k
  if (n_attributes_returned_ < attributes_.size()) {
1015
417k
    switch (token_type_) {
1016
409k
      case TokenType::START_TAG_TOKEN:
1017
417k
      case TokenType::SELF_CLOSING_TAG_TOKEN: {
1018
417k
        auto attr = attributes_[n_attributes_returned_];
1019
417k
        n_attributes_returned_++;
1020
417k
        int size = std::get<0>(attr).end - std::get<0>(attr).start;
1021
417k
        std::string key(buffer_.substr(std::get<0>(attr).start, size));
1022
417k
        int value_size = std::get<1>(attr).end - std::get<1>(attr).start;
1023
417k
        std::string val(buffer_.substr(std::get<1>(attr).start, value_size));
1024
417k
        Strings::ToLower(&key);
1025
417k
        Strings::ConvertNewLines(&val);
1026
417k
        Strings::UnescapeString(&val, true);
1027
417k
        return std::make_tuple<Attribute, bool>(
1028
417k
            {.name_space = "",
1029
417k
             .key = std::move(key),
1030
417k
             .value = std::move(val),
1031
417k
             .line_col_in_html_src = std::get<LineCol>(attr)},
1032
417k
            n_attributes_returned_ < attributes_.size());
1033
409k
      }
1034
0
      default:
1035
0
        break;
1036
417k
    }
1037
417k
  }
1038
1039
0
  return std::nullopt;
1040
417k
}
1041
1042
15.3M
// Builds a Token describing the token most recently produced by Next().
//
// Text, comment and doctype tokens carry their text in `data`. Tag tokens
// carry their atom (plus the tag name in `data` when the atom is unknown) and
// all of their attributes. The token's source position is taken from
// token_line_col_, which this function updates for text, comment and doctype
// tokens.
Token Tokenizer::token() {
  Token t;
  t.token_type = token_type_;

  switch (token_type_) {
    case TokenType::TEXT_TOKEN: {
      t.data = Text();
      int line = current_line_col_.first;
      int column = current_line_col_.second - t.data.size();
      if (column < 0) {
        // The text started on the previous line; shift the position there.
        if (lines_cols_.size() > 1) {
          const auto previous = lines_cols_[lines_cols_.size() - 2];
          line = previous.first;
          column = previous.second - abs(column) + 1;
        } else {
          column = 0;
        }
      }
      token_line_col_ = {line, column};
      break;
    }

    case TokenType::COMMENT_TOKEN:
    case TokenType::DOCTYPE_TOKEN: {
      t.data = Text();
      t.is_manufactured = is_token_manufactured_;
      const int column = current_line_col_.second - t.data.size();
      token_line_col_ = {current_line_col_.first, column};
      break;
    }

    case TokenType::START_TAG_TOKEN:
    case TokenType::SELF_CLOSING_TAG_TOKEN:
    case TokenType::END_TAG_TOKEN: {
      auto name_and_flag = TagName();
      if (!name_and_flag.has_value()) break;

      std::string tag_name = std::get<0>(name_and_flag.value());
      const bool has_attributes = std::get<1>(name_and_flag.value());

      // Unknown tags keep their textual name so it is not lost.
      t.atom = AtomUtil::ToAtom(tag_name);
      if (t.atom == Atom::UNKNOWN) {
        t.data = tag_name;
      }

      if (has_attributes) {
        for (;;) {
          auto next = TagAttr();
          if (!next.has_value()) break;
          t.attributes.push_back(std::get<Attribute>(next.value()));
          if (!std::get<bool>(next.value())) break;
        }
      }
      break;
    }

    case TokenType::ERROR_TOKEN:
      // Nothing to fill in.
      break;
  }

  t.line_col_in_html_src = token_line_col_;
  return t;
}
1106
1107
}  // namespace htmlparser