/proc/self/cwd/cpp/htmlparser/tokenizer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include "cpp/htmlparser/tokenizer.h" |
2 | | |
3 | | #include "absl/flags/flag.h" |
4 | | #include "cpp/htmlparser/atom.h" |
5 | | #include "cpp/htmlparser/atomutil.h" |
6 | | #include "cpp/htmlparser/defer.h" |
7 | | #include "cpp/htmlparser/strings.h" |
8 | | |
9 | | ABSL_FLAG(std::size_t, htmlparser_max_attributes_per_node, |
10 | | 1000, |
11 | | "Protects out of memory errors by dropping insanely large amounts " |
12 | | "of attributes per node."); |
13 | | |
14 | | namespace htmlparser { |
15 | | |
16 | | Tokenizer::Tokenizer(std::string_view html, std::string context_tag) : |
17 | 12.5k | buffer_(html) { |
18 | 12.5k | lines_cols_.push_back(std::make_pair(1, 0)); |
19 | 12.5k | current_line_col_ = std::make_pair(1, 0); |
20 | 12.5k | token_line_col_ = std::make_pair(1, 0); |
21 | 12.5k | if (!context_tag.empty()) { |
22 | 0 | Strings::ToLower(&context_tag); |
23 | 0 | if (std::find(kAllowedFragmentContainers.begin(), |
24 | 0 | kAllowedFragmentContainers.end(), |
25 | 0 | AtomUtil::ToAtom(context_tag)) != |
26 | 0 | kAllowedFragmentContainers.end()) { |
27 | 0 | raw_tag_ = context_tag; |
28 | 0 | } |
29 | 0 | } |
30 | 12.5k | } |
31 | | |
32 | 189M | inline char Tokenizer::ReadByte() { |
33 | 189M | if (raw_.end >= buffer_.size()) { |
34 | 12.5k | eof_ = true; |
35 | 12.5k | return 0; |
36 | 12.5k | } |
37 | | |
38 | 189M | char c = buffer_.at(raw_.end++); |
39 | 189M | current_line_col_.second++; |
40 | 189M | int multi_byte = Strings::CodePointByteSequenceCount(c); |
41 | 189M | if (multi_byte > 1) { |
42 | 11.1M | current_line_col_.second -= (multi_byte - 1); |
43 | 11.1M | } |
44 | | |
45 | 189M | if (c == '\n' || (c == '\r' && |
46 | 187M | raw_.end < buffer_.size() && |
47 | 187M | buffer_.at(raw_.end) != '\n')) { |
48 | 19.7M | lines_cols_.back() = current_line_col_; |
49 | | // Increment line number and reset column number. |
50 | 19.7M | current_line_col_.first++; |
51 | 19.7M | current_line_col_.second = 0; |
52 | 19.7M | lines_cols_.push_back({current_line_col_.first + 1, 0}); |
53 | 19.7M | } |
54 | | |
55 | 189M | return c; |
56 | 189M | } |
57 | | |
58 | 47.7M | inline void Tokenizer::UnreadByte() { |
59 | 47.7M | raw_.end--; |
60 | 47.7M | if (current_line_col_.first > 1 && current_line_col_.second == 0) { |
61 | 3.75M | if (lines_cols_.size() > 1) { |
62 | 3.75M | lines_cols_.pop_back(); |
63 | 3.75M | } |
64 | 3.75M | current_line_col_ = lines_cols_.back(); |
65 | 3.75M | return; |
66 | 3.75M | } |
67 | | |
68 | 43.9M | current_line_col_.second--; |
69 | 43.9M | } |
70 | | |
71 | 17.7M | void Tokenizer::SkipWhiteSpace() { |
72 | 20.9M | while (!eof_) { |
73 | 20.9M | char c = ReadByte(); |
74 | 20.9M | switch (c) { |
75 | 1.01k | case ' ': |
76 | 699k | case '\n': |
77 | 3.03M | case '\r': |
78 | 3.05M | case '\t': |
79 | 3.11M | case '\f': |
80 | 3.11M | break; |
81 | 17.7M | default: |
82 | 17.7M | UnreadByte(); |
83 | 17.7M | return; |
84 | 20.9M | } |
85 | 20.9M | } |
86 | 17.7M | } |
87 | | |
88 | 15.1M | void Tokenizer::SetAllowCDATA(bool allow_cdata) { |
89 | 15.1M | allow_cdata_ = allow_cdata; |
90 | 15.1M | } |
91 | | |
92 | 1.83M | void Tokenizer::NextIsNotRawText() { |
93 | 1.83M | raw_tag_ = ""; |
94 | 1.83M | } |
95 | | |
96 | 24.7k | void Tokenizer::ReadRawOrRCDATA() { |
97 | 24.7k | if (raw_tag_ == "script") { |
98 | 5.39k | ReadScript(); |
99 | 5.39k | text_is_raw_ = true; |
100 | 5.39k | raw_tag_ = ""; |
101 | 5.39k | return; |
102 | 5.39k | } |
103 | | |
104 | 1.27M | while (!eof_) { |
105 | 1.27M | char c = ReadByte(); |
106 | 1.27M | if (eof_) break; |
107 | 1.27M | if (c != '<') continue; |
108 | 275k | c = ReadByte(); |
109 | 275k | if (eof_) break; |
110 | 275k | if (c != '/') continue; |
111 | 238k | if (ReadRawEndTag() || eof_) break; |
112 | 238k | } |
113 | | |
114 | 19.3k | data_.end = raw_.end; |
115 | | // A textarea's or title's RCDATA can contain escaped entities. |
116 | 19.3k | text_is_raw_ = raw_tag_ != "textarea" && raw_tag_ != "title"; |
117 | 19.3k | raw_tag_ = ""; |
118 | 19.3k | } |
119 | | |
120 | 257k | bool Tokenizer::ReadRawEndTag() { |
121 | 1.02M | for (std::size_t i = 0; i < raw_tag_.size(); ++i) { |
122 | 902k | char c = ReadByte(); |
123 | 902k | if (eof_) return false; |
124 | 902k | if (c != raw_tag_.at(i) && c != (raw_tag_.at(i) - ('a' - 'A'))) { |
125 | 135k | UnreadByte(); |
126 | 135k | return false; |
127 | 135k | } |
128 | 902k | } |
129 | | |
130 | 121k | char c = ReadByte(); |
131 | 121k | if (eof_) return false; |
132 | 121k | switch (c) { |
133 | 696 | case ' ': |
134 | 762 | case '\n': |
135 | 780 | case '\t': |
136 | 814 | case '\f': |
137 | 1.12k | case '/': |
138 | 24.4k | case '>': |
139 | | // The 3 is 2 for the leading "</" plus 1 for the trailing character c. |
140 | 24.4k | raw_.end -= (3 /* <, /, and > */+ raw_tag_.size()); |
141 | 24.4k | current_line_col_.second -= (3 /* <, /, and > */ + raw_tag_.size()); |
142 | 24.4k | return true; |
143 | 121k | } |
144 | 97.3k | UnreadByte(); |
145 | 97.3k | return false; |
146 | 121k | } |
147 | | |
// States of the state machine ReadScript() uses to scan the content of a
// <script> element. Enumerators are numbered consecutively from DONE = 0 to
// SCRIPT_DATA_DOUBLE_ESCAPED_END = 16.
enum ScriptDataState {
  DONE = 0,  // The matching end tag was found; scanning stops.
  SCRIPT_DATA,
  SCRIPT_DATA_LESS_THAN_SIGN,
  SCRIPT_DATA_END_TAG_OPEN,
  SCRIPT_DATA_ESCAPE_START,
  SCRIPT_DATA_ESCAPE_START_DASH,
  SCRIPT_DATA_ESCAPED,
  SCRIPT_DATA_ESCAPED_DASH,
  SCRIPT_DATA_ESCAPED_DASH_DASH,
  SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
  SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
  SCRIPT_DATA_DOUBLE_ESCAPE_START,
  SCRIPT_DATA_DOUBLE_ESCAPED,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
  SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
  SCRIPT_DATA_DOUBLE_ESCAPED_END
};
167 | | |
168 | 5.39k | void Tokenizer::ReadScript() { |
169 | 5.39k | defer({data_.end = raw_.end;}); |
170 | 5.39k | ScriptDataState state = ScriptDataState::SCRIPT_DATA; |
171 | 24.3M | while (!eof_ && state != ScriptDataState::DONE) { |
172 | 24.3M | switch (state) { |
173 | 8.44M | case ScriptDataState::SCRIPT_DATA: { |
174 | 8.44M | char c = ReadByte(); |
175 | 8.44M | if (eof_) return; |
176 | 8.44M | if (c == '<') { |
177 | 2.95M | state = ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN; |
178 | 5.48M | } else { |
179 | 5.48M | state = ScriptDataState::SCRIPT_DATA; |
180 | 5.48M | } |
181 | 8.44M | break; |
182 | 8.44M | } |
183 | 2.95M | case ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN: { |
184 | 2.95M | char c = ReadByte(); |
185 | 2.95M | if (eof_) return; |
186 | 2.95M | if (c == '/') { |
187 | 14.5k | state = ScriptDataState::SCRIPT_DATA_END_TAG_OPEN; |
188 | 2.94M | } else if (c == '!') { |
189 | 1.72M | state = ScriptDataState::SCRIPT_DATA_ESCAPE_START; |
190 | 1.72M | } else { |
191 | 1.21M | UnreadByte(); |
192 | 1.21M | state = ScriptDataState::SCRIPT_DATA; |
193 | 1.21M | } |
194 | 2.95M | break; |
195 | 2.95M | } |
196 | 14.5k | case ScriptDataState::SCRIPT_DATA_END_TAG_OPEN: { |
197 | 14.5k | if (ReadRawEndTag() || eof_) { |
198 | 1.52k | return; |
199 | 1.52k | } |
200 | 13.0k | state = ScriptDataState::SCRIPT_DATA; |
201 | 13.0k | break; |
202 | 14.5k | } |
203 | 1.72M | case ScriptDataState::SCRIPT_DATA_ESCAPE_START: { |
204 | 1.72M | char c = ReadByte(); |
205 | 1.72M | if (eof_) return; |
206 | 1.72M | if (c == '-') { |
207 | 1.71M | state = ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH; |
208 | 1.71M | } else { |
209 | 10.6k | UnreadByte(); |
210 | 10.6k | state = ScriptDataState::SCRIPT_DATA; |
211 | 10.6k | } |
212 | 1.72M | break; |
213 | 1.72M | } |
214 | 1.71M | case ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH: { |
215 | 1.71M | char c = ReadByte(); |
216 | 1.71M | if (eof_) return; |
217 | 1.71M | if (c == '-') { |
218 | 861k | state = SCRIPT_DATA_ESCAPED_DASH_DASH; |
219 | 861k | } else { |
220 | 851k | UnreadByte(); |
221 | 851k | state = ScriptDataState::SCRIPT_DATA; |
222 | 851k | } |
223 | 1.71M | break; |
224 | 1.71M | } |
225 | 4.24M | case ScriptDataState::SCRIPT_DATA_ESCAPED: { |
226 | 4.24M | char c = ReadByte(); |
227 | 4.24M | if (eof_) return; |
228 | 4.24M | if (c == '-') { |
229 | 1.61M | state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH; |
230 | 2.62M | } else if (c == '<') { |
231 | 75.2k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; |
232 | 2.55M | } else { |
233 | 2.55M | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
234 | 2.55M | } |
235 | 4.24M | break; |
236 | 4.24M | } |
237 | 1.61M | case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH: { |
238 | 1.61M | char c = ReadByte(); |
239 | 1.61M | if (eof_) return; |
240 | 1.61M | if (c == '-') { |
241 | 2.35k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH; |
242 | 1.61M | } else if (c == '<') { |
243 | 1.60M | state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; |
244 | 1.60M | } else { |
245 | 6.94k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
246 | 6.94k | } |
247 | 1.61M | break; |
248 | 1.61M | } |
249 | 864k | case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH: { |
250 | 864k | char c = ReadByte(); |
251 | 864k | if (eof_) return; |
252 | 864k | if (c == '-') { |
253 | 337 | state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH; |
254 | 863k | } else if (c == '<') { |
255 | 6.75k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; |
256 | 856k | } else if (c == '>') { |
257 | 194 | state = ScriptDataState::SCRIPT_DATA; |
258 | 856k | } else { |
259 | 856k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
260 | 856k | } |
261 | 864k | break; |
262 | 864k | } |
263 | 1.69M | case ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: { |
264 | 1.69M | char c = ReadByte(); |
265 | 1.69M | if (eof_) return; |
266 | 1.69M | if (c == '/') { |
267 | 3.46k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN; |
268 | 1.68M | } else if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) { |
269 | 831k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START; |
270 | 856k | } else { |
271 | 856k | UnreadByte(); |
272 | 856k | state = ScriptDataState::SCRIPT_DATA; |
273 | 856k | } |
274 | 1.69M | break; |
275 | 1.69M | } |
276 | 3.46k | case ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN: { |
277 | 3.46k | if (ReadRawEndTag()) { |
278 | 3.07k | state = ScriptDataState::DONE; |
279 | 3.07k | } else { |
280 | 389 | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
281 | 389 | } |
282 | 3.46k | break; |
283 | 1.69M | } |
284 | 831k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START: { |
285 | 831k | UnreadByte(); |
286 | 831k | static std::string script_tag_l = "script"; |
287 | 831k | static std::string script_tag_u = "SCRIPT"; |
288 | 5.81M | for (int8_t i = 0; i < 6 /*script*/; ++i) { |
289 | 4.98M | char c = ReadByte(); |
290 | 4.98M | if (eof_) return; |
291 | 4.98M | if (c != script_tag_l[i] && c != script_tag_u[i]) { |
292 | 4.98M | UnreadByte(); |
293 | 4.98M | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
294 | 4.98M | } |
295 | 4.98M | } |
296 | 831k | char c = ReadByte(); |
297 | 831k | if (eof_) return; |
298 | 831k | if (c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\f' |
299 | 831k | || c == '/' || c == '>') { |
300 | 1.78k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
301 | 829k | } else { |
302 | 829k | UnreadByte(); |
303 | 829k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
304 | 829k | } |
305 | 831k | break; |
306 | 831k | } |
307 | 134k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED: { |
308 | 134k | char c = ReadByte(); |
309 | 134k | if (eof_) return; |
310 | 134k | if (c == '-') { |
311 | 46.8k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH; |
312 | 87.5k | } else if (c == '<') { |
313 | 19.6k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; |
314 | 67.8k | } else { |
315 | 67.8k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
316 | 67.8k | } |
317 | 134k | break; |
318 | 134k | } |
319 | 46.8k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH: { |
320 | 46.8k | char c = ReadByte(); |
321 | 46.8k | if (eof_) return; |
322 | 46.8k | if (c == '-') { |
323 | 9.28k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; |
324 | 37.5k | } else if (c == '<') { |
325 | 27.1k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; |
326 | 27.1k | } else if (c == '>') { |
327 | 592 | state = ScriptDataState::SCRIPT_DATA; |
328 | 9.79k | } else { |
329 | 9.79k | state = SCRIPT_DATA_DOUBLE_ESCAPED; |
330 | 9.79k | } |
331 | 46.8k | break; |
332 | 46.8k | } |
333 | 9.53k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: { |
334 | 9.53k | char c = ReadByte(); |
335 | 9.53k | if (eof_) return; |
336 | 9.52k | if (c == '-') { |
337 | 253 | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; |
338 | 9.27k | } else if (c == '<') { |
339 | 252 | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; |
340 | 9.01k | } else if (c == '>') { |
341 | 195 | state = ScriptDataState::SCRIPT_DATA; |
342 | 8.82k | } else { |
343 | 8.82k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
344 | 8.82k | } |
345 | 9.52k | break; |
346 | 9.53k | } |
347 | 47.1k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: { |
348 | 47.1k | char c = ReadByte(); |
349 | 47.1k | if (eof_) return; |
350 | 47.1k | if (c == '/') { |
351 | 1.04k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END; |
352 | 46.0k | } else { |
353 | 46.0k | UnreadByte(); |
354 | 46.0k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
355 | 46.0k | } |
356 | 47.1k | break; |
357 | 47.1k | } |
358 | 1.04k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END: { |
359 | 1.04k | if (ReadRawEndTag()) { |
360 | 760 | raw_.end += std::string("</script>").size(); |
361 | 760 | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
362 | 760 | } else { |
363 | 280 | if (eof_) return; |
364 | 273 | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
365 | 273 | } |
366 | 1.03k | break; |
367 | 1.04k | } |
368 | 1.03k | default: |
369 | 0 | break; |
370 | 24.3M | } |
371 | 24.3M | } |
372 | 5.39k | } |
373 | | |
374 | 897 | void Tokenizer::ReadComment() { |
375 | 897 | data_.start = raw_.end; |
376 | 897 | defer({ |
377 | 897 | if (data_.end < data_.start) { |
378 | | // It's a comment with no data, like <!--> |
379 | 897 | data_.end = data_.start; |
380 | 897 | } |
381 | 897 | }); |
382 | 897 | int dash_count = 2; |
383 | 2.52M | while (!eof_) { |
384 | 2.52M | char c = ReadByte(); |
385 | 2.52M | if (eof_) { |
386 | | // Ignore up to two dashes at EOF. |
387 | 71 | if (dash_count > 2) { |
388 | 5 | dash_count = 2; |
389 | 5 | } |
390 | 71 | data_.end = raw_.end - dash_count; |
391 | 71 | return; |
392 | 71 | } |
393 | 2.52M | if (c == '-') { |
394 | 585k | dash_count++; |
395 | 585k | continue; |
396 | 1.93M | } else if (c == '>') { |
397 | 759 | if (dash_count >= 2) { |
398 | 563 | data_.end = raw_.end - 3 /* --> */; |
399 | 563 | return; |
400 | 563 | } |
401 | 1.93M | } else if (c == '!') { |
402 | 319k | if (dash_count >= 2) { |
403 | 462 | char c = ReadByte(); |
404 | 462 | if (eof_) { |
405 | 13 | data_.end = raw_.end; |
406 | 13 | return; |
407 | 13 | } |
408 | 449 | if (c == '>') { |
409 | 250 | data_.end = raw_.end - 4 /* --!> */; |
410 | 250 | return; |
411 | 250 | } |
412 | 449 | } |
413 | 319k | } |
414 | 1.93M | dash_count = 0; |
415 | 1.93M | } |
416 | 897 | } |
417 | | |
418 | 1.03M | void Tokenizer::ReadUntilCloseAngle() { |
419 | 1.03M | data_.start = raw_.end; |
420 | 3.26M | while (!eof_) { |
421 | 3.26M | char c = ReadByte(); |
422 | 3.26M | if (eof_) { |
423 | 458 | data_.end = raw_.end; |
424 | 458 | return; |
425 | 458 | } |
426 | 3.26M | if (c == '>') { |
427 | 1.03M | data_.end = raw_.end - 1 /* ">" */; |
428 | 1.03M | return; |
429 | 1.03M | } |
430 | 3.26M | } |
431 | 1.03M | } |
432 | | |
433 | 1.03M | TokenType Tokenizer::ReadMarkupDeclaration() { |
434 | 1.03M | data_.start = raw_.end; |
435 | 1.03M | char c[2]; |
436 | 3.09M | for (int i = 0; i < 2; ++i) { |
437 | 2.06M | c[i] = ReadByte(); |
438 | 2.06M | if (eof_) { |
439 | 69 | data_.end = raw_.end; |
440 | 69 | return TokenType::COMMENT_TOKEN; |
441 | 69 | } |
442 | 2.06M | } |
443 | | |
444 | 1.03M | if (c[0] == '-' && c[1] == '-') { |
445 | 897 | ReadComment(); |
446 | 897 | return TokenType::COMMENT_TOKEN; |
447 | 897 | } |
448 | | |
449 | 1.03M | UnreadByte(); |
450 | 1.03M | UnreadByte(); |
451 | 1.03M | if (ReadDoctype()) { |
452 | 690 | return TokenType::DOCTYPE_TOKEN; |
453 | 690 | } |
454 | | |
455 | 1.03M | if (allow_cdata_ && ReadCDATA()) { |
456 | 251 | convert_null_ = true; |
457 | 251 | return TokenType::TEXT_TOKEN; |
458 | 251 | } |
459 | | |
460 | | // It's a bogus comment. |
461 | 1.02M | ReadUntilCloseAngle(); |
462 | 1.02M | return TokenType::COMMENT_TOKEN; |
463 | 1.03M | } |
464 | | |
465 | 1.03M | bool Tokenizer::ReadDoctype() { |
466 | 1.03M | token_line_col_ = {current_line_col_.first, |
467 | 1.03M | current_line_col_.second - 2 /* <! */}; |
468 | | |
469 | 1.03M | static constexpr std::string_view kDoctype = "DOCTYPE"; |
470 | 1.03M | for (std::size_t i = 0; i < kDoctype.size(); ++i) { |
471 | 1.03M | char c = ReadByte(); |
472 | 1.03M | if (eof_) { |
473 | 12 | data_.end = raw_.end; |
474 | 12 | return false; |
475 | 12 | } |
476 | 1.03M | if (c != kDoctype.at(i) && c != (kDoctype.at(i) + ('a' - 'A'))) { |
477 | | // Back up to read the fragment of "DOCTYPE" again. |
478 | 1.03M | raw_.end = data_.start; |
479 | 1.03M | return false; |
480 | 1.03M | } |
481 | 1.03M | } |
482 | | |
483 | 690 | SkipWhiteSpace(); |
484 | 690 | if (eof_) { |
485 | 18 | data_.start = raw_.end; |
486 | 18 | data_.end = raw_.end; |
487 | 18 | return true; |
488 | 18 | } |
489 | | |
490 | 672 | ReadUntilCloseAngle(); |
491 | 672 | return true; |
492 | 690 | } |
493 | | |
494 | 524 | bool Tokenizer::ReadCDATA() { |
495 | 524 | static constexpr std::string_view kCData = "[CDATA["; |
496 | 2.61k | for (std::size_t i = 0; i < kCData.size(); ++i) { |
497 | 2.36k | char c = ReadByte(); |
498 | 2.36k | if (eof_) { |
499 | 8 | data_.end = raw_.end; |
500 | 8 | return false; |
501 | 8 | } |
502 | 2.35k | if (c != kCData[i]) { |
503 | | // Back up to read the fragment of "[CDATA[" again. |
504 | 265 | data_.end = raw_.start; |
505 | 265 | return false; |
506 | 265 | } |
507 | 2.35k | } |
508 | 251 | data_.start = raw_.end; |
509 | 251 | int brackets = 0; |
510 | 1.27k | while (!eof_) { |
511 | 1.27k | char c = ReadByte(); |
512 | 1.27k | if (eof_) { |
513 | 29 | data_.end = raw_.end; |
514 | 29 | return true; |
515 | 29 | } |
516 | 1.24k | switch (c) { |
517 | 637 | case ']': { |
518 | 637 | brackets++; |
519 | 637 | break; |
520 | 0 | } |
521 | 416 | case '>': { |
522 | 416 | if (brackets >= 2) { |
523 | 222 | data_.end = raw_.end - 3 /* "]]>" */; |
524 | 222 | return true; |
525 | 222 | } |
526 | 194 | brackets = 0; |
527 | 194 | break; |
528 | 416 | } |
529 | 194 | default: |
530 | 194 | brackets = 0; |
531 | 1.24k | } |
532 | 1.24k | } |
533 | 0 | return false; |
534 | 251 | } |
535 | | |
536 | | template<typename... Args> |
537 | 5.22M | bool Tokenizer::StartTagIn(Args... ss) { |
538 | 5.22M | std::vector<std::string> argsList{ss...}; |
539 | 9.96M | for (const auto& s : argsList) { |
540 | 9.96M | if (data_.end - data_.start != s.size()) continue; |
541 | 1.44M | bool matched = true; |
542 | 3.14M | for (std::size_t i = 0; i < s.size(); ++i) { |
543 | 3.11M | char c = buffer_.at(data_.start + i); |
544 | 3.11M | if ('A' <= c && c <= 'Z') { |
545 | 2.94M | c += 'a' - 'A'; |
546 | 2.94M | } |
547 | 3.11M | if (c != s[i]) { |
548 | 1.41M | matched = false; |
549 | 1.41M | break; |
550 | 1.41M | } |
551 | 3.11M | } |
552 | 1.44M | if (matched) { |
553 | 25.0k | return true; |
554 | 25.0k | } |
555 | 1.44M | } |
556 | 5.19M | return false; |
557 | 5.22M | } bool htmlparser::Tokenizer::StartTagIn<char const*>(char const*) Line | Count | Source | 537 | 519k | bool Tokenizer::StartTagIn(Args... ss) { | 538 | 519k | std::vector<std::string> argsList{ss...}; | 539 | 519k | for (const auto& s : argsList) { | 540 | 519k | if (data_.end - data_.start != s.size()) continue; | 541 | 4.32k | bool matched = true; | 542 | 17.2k | for (std::size_t i = 0; i < s.size(); ++i) { | 543 | 13.4k | char c = buffer_.at(data_.start + i); | 544 | 13.4k | if ('A' <= c && c <= 'Z') { | 545 | 8.14k | c += 'a' - 'A'; | 546 | 8.14k | } | 547 | 13.4k | if (c != s[i]) { | 548 | 567 | matched = false; | 549 | 567 | break; | 550 | 567 | } | 551 | 13.4k | } | 552 | 4.32k | if (matched) { | 553 | 3.75k | return true; | 554 | 3.75k | } | 555 | 4.32k | } | 556 | 515k | return false; | 557 | 519k | } |
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*, char const*>(char const*, char const*, char const*) Line | Count | Source | 537 | 47.3k | bool Tokenizer::StartTagIn(Args... ss) { | 538 | 47.3k | std::vector<std::string> argsList{ss...}; | 539 | 141k | for (const auto& s : argsList) { | 540 | 141k | if (data_.end - data_.start != s.size()) continue; | 541 | 80.1k | bool matched = true; | 542 | 331k | for (std::size_t i = 0; i < s.size(); ++i) { | 543 | 328k | char c = buffer_.at(data_.start + i); | 544 | 328k | if ('A' <= c && c <= 'Z') { | 545 | 298k | c += 'a' - 'A'; | 546 | 298k | } | 547 | 328k | if (c != s[i]) { | 548 | 76.9k | matched = false; | 549 | 76.9k | break; | 550 | 76.9k | } | 551 | 328k | } | 552 | 80.1k | if (matched) { | 553 | 3.13k | return true; | 554 | 3.13k | } | 555 | 80.1k | } | 556 | 44.2k | return false; | 557 | 47.3k | } |
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*>(char const*, char const*) Line | Count | Source | 537 | 4.65M | bool Tokenizer::StartTagIn(Args... ss) { | 538 | 4.65M | std::vector<std::string> argsList{ss...}; | 539 | 9.30M | for (const auto& s : argsList) { | 540 | 9.30M | if (data_.end - data_.start != s.size()) continue; | 541 | 1.35M | bool matched = true; | 542 | 2.79M | for (std::size_t i = 0; i < s.size(); ++i) { | 543 | 2.77M | char c = buffer_.at(data_.start + i); | 544 | 2.77M | if ('A' <= c && c <= 'Z') { | 545 | 2.63M | c += 'a' - 'A'; | 546 | 2.63M | } | 547 | 2.77M | if (c != s[i]) { | 548 | 1.33M | matched = false; | 549 | 1.33M | break; | 550 | 1.33M | } | 551 | 2.77M | } | 552 | 1.35M | if (matched) { | 553 | 18.1k | return true; | 554 | 18.1k | } | 555 | 1.35M | } | 556 | 4.63M | return false; | 557 | 4.65M | } |
|
558 | | |
559 | 13.1M | TokenType Tokenizer::ReadStartTag(bool template_mode) { |
560 | 13.1M | token_line_col_ = {current_line_col_.first, |
561 | 13.1M | current_line_col_.second - 1 /* < */}; |
562 | 13.1M | ReadTag(true, template_mode); |
563 | | |
564 | 13.1M | if (eof_) { |
565 | 373 | return TokenType::ERROR_TOKEN; |
566 | 373 | } |
567 | | |
568 | | // Several tags flag the tokenizer's next token as raw. |
569 | 13.1M | bool raw = false; |
570 | 13.1M | char c = buffer_.at(data_.start); |
571 | | |
572 | | // Lowercase. |
573 | 13.1M | if ('A' <= c && c <= 'Z') { |
574 | 4.96M | c += 'a' - 'A'; |
575 | 4.96M | } |
576 | | |
577 | 13.1M | switch (c) { |
578 | 501k | case 'i': |
579 | 501k | raw = StartTagIn("iframe"); |
580 | 501k | break; |
581 | 47.3k | case 'n': |
582 | 47.3k | raw = StartTagIn("noembed", "noframes", "noscript"); |
583 | 47.3k | break; |
584 | 11.1k | case 'p': |
585 | 11.1k | raw = StartTagIn("plaintext"); |
586 | 11.1k | break; |
587 | 1.36M | case 's': |
588 | 1.36M | raw = StartTagIn("script", "style"); |
589 | 1.36M | break; |
590 | 3.29M | case 't': |
591 | 3.29M | raw = StartTagIn("textarea", "title"); |
592 | 3.29M | break; |
593 | 6.09k | case 'x': |
594 | 6.09k | raw = StartTagIn("xmp"); |
595 | 13.1M | } |
596 | | |
597 | 13.1M | if (raw) { |
598 | 25.0k | int size = data_.end - data_.start; |
599 | 25.0k | raw_tag_ = std::string(buffer_.substr(data_.start, size)); |
600 | 25.0k | Strings::ToLower(&raw_tag_); |
601 | 25.0k | } |
602 | | |
603 | | // Look for a self-closing token like "<br/>". |
604 | 13.1M | if (!eof_ && buffer_[raw_.end - 2] == '/') { |
605 | 783 | return TokenType::SELF_CLOSING_TAG_TOKEN; |
606 | 783 | } |
607 | | |
608 | 13.1M | return TokenType::START_TAG_TOKEN; |
609 | 13.1M | } |
610 | | |
611 | 13.4M | void Tokenizer::ReadTag(bool save_attr, bool template_mode) { |
612 | 13.4M | attributes_.clear(); |
613 | 13.4M | n_attributes_returned_ = 0; |
614 | | |
615 | | // Read the tag name and attribute key/value pairs. |
616 | 13.4M | ReadTagName(); |
617 | 13.4M | SkipWhiteSpace(); |
618 | | |
619 | 13.4M | if (eof_) { |
620 | 132 | return; |
621 | 132 | } |
622 | | |
623 | 15.6M | while (!eof_) { |
624 | 15.6M | char c = ReadByte(); |
625 | 15.6M | if (eof_ || c == '>') { |
626 | 13.4M | break; |
627 | 13.4M | } |
628 | | |
629 | | // Undo previous > read. |
630 | 2.18M | UnreadByte(); |
631 | | |
632 | 2.18M | ReadTagAttributeKey(template_mode); |
633 | 2.18M | ReadTagAttributeValue(); |
634 | | // Save pending_attribute if save_attr and that attribute has a non-empty |
635 | | // key. |
636 | 2.18M | if (save_attr && |
637 | | // Skip excessive attributes. |
638 | 2.18M | attributes_.size() < ::absl::GetFlag( |
639 | 2.11M | FLAGS_htmlparser_max_attributes_per_node) && |
640 | 2.18M | std::get<0>(pending_attribute_).start != |
641 | 669k | std::get<0>(pending_attribute_).end) { |
642 | 616k | attributes_.push_back(pending_attribute_); |
643 | 616k | } |
644 | 2.18M | SkipWhiteSpace(); |
645 | 2.18M | } |
646 | 13.4M | } |
647 | | |
648 | 13.4M | void Tokenizer::ReadTagName() { |
649 | 13.4M | data_.start = raw_.end - 1; |
650 | 31.7M | while (!eof_) { |
651 | 31.7M | char c = ReadByte(); |
652 | 31.7M | if (eof_) { |
653 | 68 | data_.end = raw_.end; |
654 | 68 | return; |
655 | 68 | } |
656 | 31.7M | switch (c) { |
657 | 31.2k | case ' ': |
658 | 32.0k | case '\n': |
659 | 139k | case '\r': |
660 | 139k | case '\t': |
661 | 139k | case '\f': |
662 | 139k | data_.end = raw_.end - 1; |
663 | 139k | return; |
664 | 75.8k | case '/': |
665 | 13.2M | case '>': |
666 | 13.2M | UnreadByte(); |
667 | 13.2M | data_.end = raw_.end; |
668 | 13.2M | return; |
669 | 31.7M | } |
670 | 31.7M | } |
671 | 13.4M | } |
672 | | |
673 | | // Sets pending_attribute_[0] to the "k" in "<div k=v>". |
674 | | // Precondition: eof_ != true; |
675 | 2.18M | void Tokenizer::ReadTagAttributeKey(bool template_mode) { |
676 | 2.18M | std::get<0>(pending_attribute_).start = raw_.end; |
677 | 2.18M | std::get<LineCol>(pending_attribute_) = |
678 | 2.18M | {current_line_col_.first, current_line_col_.second + 1}; |
679 | | |
680 | | // All mustache_ prefixed variables applies to parsing logic for AMP mustache |
681 | | // templates. See: https://amp.dev/documentation/components/amp-mustache/ |
682 | 2.18M | bool mustache_inside_section_block = false; |
683 | 2.18M | std::string mustache_section_name = ""; |
684 | | |
685 | 27.9M | while (!eof_) { |
686 | 27.9M | char c = ReadByte(); |
687 | 27.9M | if (eof_) { |
688 | 135 | std::get<0>(pending_attribute_).start = raw_.end; |
689 | 135 | return; |
690 | 135 | } |
691 | | |
692 | | // Template attributes processing. |
693 | | // Looks for following special syntax. |
694 | | // {{#section}}...{{/section}} |
695 | | // {{^section}}...{{/section}} |
696 | | // {{variable}} |
697 | 27.9M | if (template_mode) { |
698 | 0 | UnreadByte(); |
699 | 0 | UnreadByte(); |
700 | 0 | UnreadByte(); |
701 | 0 | char c1 = ReadByte(); |
702 | 0 | char c2 = ReadByte(); |
703 | 0 | c = ReadByte(); |
704 | 0 | if (mustache_inside_section_block && c1 == '{' && c2 == '{' && c == '/') { |
705 | | // Look for closing section name. If not resort to default behavior. |
706 | | // Reason for this logic is to differentiate between: |
707 | | // <p {{#mycondition}}class=foo{{/mycondition}} foo=bar> vs. |
708 | | // <img {{#mycondition}}class=foo /> |
709 | 0 | int raw_end = raw_.end; |
710 | 0 | std::string_view close_section = |
711 | 0 | buffer_.substr(raw_.end, mustache_section_name.size()); |
712 | 0 | bool section_name_match = close_section == mustache_section_name; |
713 | 0 | if (section_name_match) { |
714 | 0 | raw_.end += mustache_section_name.size(); |
715 | 0 | char e1 = ReadByte(); |
716 | 0 | char e2 = ReadByte(); |
717 | 0 | if (e1 == '}' && e2 == '}') { |
718 | 0 | mustache_inside_section_block = false; |
719 | 0 | continue; |
720 | 0 | } else { |
721 | 0 | raw_.end = raw_end; |
722 | 0 | } |
723 | 0 | } |
724 | 0 | } |
725 | | |
726 | 0 | if (c1 == '{' && c2 == '{' && (c == '#' || c == '^')) { |
727 | 0 | auto n = buffer_.find("}}", raw_.end); |
728 | 0 | if (n != std::string_view::npos) { |
729 | 0 | mustache_section_name = buffer_.substr(raw_.end, n - raw_.end); |
730 | 0 | mustache_inside_section_block = true; |
731 | 0 | continue; |
732 | 0 | } |
733 | 0 | } |
734 | 0 | } |
735 | | |
736 | 27.9M | switch (c) { |
737 | 19.1k | case ' ': |
738 | 142k | case '\n': |
739 | 167k | case '\r': |
740 | 168k | case '\t': |
741 | 212k | case '\f': |
742 | 1.95M | case '/': { |
743 | 1.95M | std::get<0>(pending_attribute_).end = raw_.end - 1; |
744 | 1.95M | return; |
745 | 212k | } |
746 | 10.5k | case '=': |
747 | 222k | case '>': { |
748 | 222k | UnreadByte(); |
749 | 222k | std::get<0>(pending_attribute_).end = raw_.end; |
750 | 222k | return; |
751 | 10.5k | } |
752 | 27.9M | } |
753 | 27.9M | } |
754 | 2.18M | } |
755 | | |
756 | | // Sets pending_attribute_.second to the "v" in "<div k=v>". |
757 | 2.18M | void Tokenizer::ReadTagAttributeValue() { |
758 | 2.18M | std::get<1>(pending_attribute_).start = raw_.end; |
759 | 2.18M | std::get<1>(pending_attribute_).end = raw_.end; |
760 | 2.18M | SkipWhiteSpace(); |
761 | 2.18M | if (eof_) { |
762 | 219 | return; |
763 | 219 | } |
764 | 2.18M | char c = ReadByte(); |
765 | 2.18M | if (eof_) { |
766 | 0 | return; |
767 | 0 | } |
768 | | |
769 | 2.18M | if (c != '=') { |
770 | 2.17M | UnreadByte(); |
771 | 2.17M | return; |
772 | 2.17M | } |
773 | | |
774 | 10.6k | SkipWhiteSpace(); |
775 | 10.6k | if (eof_) { |
776 | 14 | return; |
777 | 14 | } |
778 | | |
779 | 10.5k | char quote = ReadByte(); |
780 | 10.5k | if (eof_) { |
781 | 0 | return; |
782 | 0 | } |
783 | | |
784 | 10.5k | switch (quote) { |
785 | 195 | case '>': |
786 | 195 | UnreadByte(); |
787 | 195 | return; |
788 | 255 | case '\'': |
789 | 265 | case '"': |
790 | 265 | std::get<1>(pending_attribute_).start = raw_.end; |
791 | 459 | while (!eof_) { |
792 | 459 | c = ReadByte(); |
793 | 459 | if (eof_) { |
794 | 12 | std::get<1>(pending_attribute_).end = raw_.end; |
795 | 12 | return; |
796 | 12 | } |
797 | 447 | if (c == quote) { |
798 | 253 | std::get<1>(pending_attribute_).end = raw_.end - 1; |
799 | 253 | return; |
800 | 253 | } |
801 | 447 | } |
802 | 0 | break; |
803 | 10.1k | default: { |
804 | 10.1k | std::get<1>(pending_attribute_).start = raw_.end - 1; |
805 | 3.83M | while (!eof_) { |
806 | 3.83M | c = ReadByte(); |
807 | 3.83M | if (eof_) { |
808 | 8 | std::get<1>(pending_attribute_).end = raw_.end; |
809 | 8 | return; |
810 | 8 | } |
811 | 3.83M | switch (c) { |
812 | 1.19k | case ' ': |
813 | 1.39k | case '\n': |
814 | 1.68k | case '\r': |
815 | 1.93k | case '\t': |
816 | 9.22k | case '\f': |
817 | 9.22k | std::get<1>(pending_attribute_).end = raw_.end - 1; |
818 | 9.22k | return; |
819 | 902 | case '>': |
820 | 902 | UnreadByte(); |
821 | 902 | std::get<1>(pending_attribute_).end = raw_.end; |
822 | 902 | return; |
823 | 3.83M | } |
824 | 3.83M | } |
825 | 10.1k | } |
826 | 10.5k | } |
827 | 10.5k | } |
828 | | |
// Advances to the next token and returns its type; the same value is stored
// in token_type_. raw_ and data_ are first reset to start at the end of the
// previous token, and is_token_manufactured_ is cleared.
//
// template_mode is only forwarded to ReadStartTag().
//
// Results produced here:
//  * ERROR_TOKEN: eof_ was already set on entry (err_ is set as well), the
//    scan ended without raw_ advancing, or ReadTag() hit EOF in an end tag.
//  * TEXT_TOKEN: raw_tag_ content with a non-empty data_ span, text that
//    precedes a tag/comment/doctype opener, or text left over at EOF.
//  * The result of ReadStartTag() for "<" + letter and of
//    ReadMarkupDeclaration() for "<!".
//  * END_TAG_TOKEN for "</" + letter; COMMENT_TOKEN for "</>", for "</"
//    followed by any other byte, and for "<?".
829 | 15.1M | TokenType Tokenizer::Next(bool template_mode) { |
830 | 15.1M | raw_.start = raw_.end; |
831 | 15.1M | data_.start = raw_.end; |
832 | 15.1M | data_.end = raw_.end; |
833 | 15.1M | is_token_manufactured_ = false; |
834 | | |
// EOF was reached by an earlier read: flag the error and stop.
835 | 15.1M | if (eof_) { |
836 | 3.98k | err_ = true; |
837 | 3.98k | token_type_ = TokenType::ERROR_TOKEN; |
838 | 3.98k | return token_type_; |
839 | 3.98k | } |
840 | | |
// While raw_tag_ is set, the upcoming bytes are consumed as text: all the way
// to EOF for "plaintext", otherwise by ReadRawOrRCDATA().
841 | 15.1M | if (raw_tag_ != "") { |
842 | 24.7k | if (raw_tag_ == "plaintext") { |
843 | | // Read everything up to EOF. |
844 | 0 | while (!eof_) { |
845 | 0 | ReadByte(); |
846 | 0 | } |
847 | 0 | data_.end = raw_.end; |
848 | 0 | text_is_raw_ = true; |
849 | 24.7k | } else { |
850 | 24.7k | ReadRawOrRCDATA(); |
851 | 24.7k | } |
852 | | |
// Only a non-empty data_ span is reported as text; otherwise fall through to
// the regular scan below.
853 | 24.7k | if (data_.end > data_.start) { |
854 | 17.3k | token_type_ = TokenType::TEXT_TOKEN; |
855 | 17.3k | convert_null_ = true; |
856 | 17.3k | return token_type_; |
857 | 17.3k | } |
858 | 24.7k | } |
859 | | |
860 | 15.0M | text_is_raw_ = false; |
861 | 15.0M | convert_null_ = false; |
862 | | |
// Scan byte by byte for a '<'. Every other byte simply extends the pending
// text, i.e. the range [raw_.start, raw_.end).
863 | 30.6M | while (!eof_) { |
864 | 30.6M | char c = ReadByte(); |
865 | | |
866 | 30.6M | if (eof_) { |
867 | 10.0k | break; |
868 | 10.0k | } |
869 | | |
870 | 30.6M | if (c != '<') { |
871 | 15.3M | continue; |
872 | 15.3M | } |
873 | | |
// Look at the byte that follows the '<' to classify the token.
874 | 15.2M | c = ReadByte(); |
875 | 15.2M | if (eof_) break; |
876 | | |
877 | | // Check if the '<' we have just read is part of a tag, comment or |
878 | | // doctype. If not, it's part of the accumulated text token. |
879 | 15.2M | TokenType token_type; |
880 | 15.2M | if (Strings::IsCharAlphabet(c)) { |
881 | 13.6M | token_type = TokenType::START_TAG_TOKEN; |
882 | 13.6M | } else if (c == '/') { |
883 | 309k | token_type = TokenType::END_TAG_TOKEN; |
884 | 1.30M | } else if (c == '!' || c == '?') { |
885 | 1.14M | token_type = TokenType::COMMENT_TOKEN; |
886 | 1.14M | } else { |
// Not a tag opener: the '<' stays part of the text. The second byte is
// pushed back via UnreadByte() so the loop examines it again.
887 | 167k | UnreadByte(); |
888 | 167k | continue; |
889 | 167k | } |
890 | | |
891 | | // We have a non-text token, but we might have accumulated some text |
892 | | // before that. If so, we return the text first, and return the non text |
893 | | // token on the subsequent call to Next. |
894 | | // |
895 | | // <space><space><mytag>, returns two spaces before processing the mytag |
896 | | // token in the next call. |
// Two bytes ('<' and the one after it) were read since the text ended, so
// raw_.end - 2 is the offset of the '<'. Rewinding raw_.end to it makes the
// next call start at the '<' again; the column counter is rewound to match.
897 | 15.0M | if (int x = raw_.end - 2 /* "<a" */; raw_.start < x) { |
898 | 608k | raw_.end = x; |
899 | 608k | data_.end = x; |
900 | | // We know there is no \n so no line adjustment needed. |
901 | 608k | current_line_col_.second -= 2; |
902 | 608k | token_type_ = TokenType::TEXT_TOKEN; |
903 | 608k | return token_type_; |
904 | 608k | } |
905 | | |
906 | 14.4M | switch (token_type) { |
907 | 13.1M | case TokenType::START_TAG_TOKEN: |
908 | 13.1M | token_type_ = ReadStartTag(template_mode); |
909 | 13.1M | return token_type_; |
910 | 271k | case TokenType::END_TAG_TOKEN: |
911 | 271k | c = ReadByte(); |
// This break only leaves the switch; the enclosing while loop then ends
// because eof_ is set, and the bytes read so far ("</") are reported by the
// trailing-text check after the loop.
912 | 271k | if (eof_) break; |
913 | 271k | if (c == '>') { |
914 | | // "</>" does not generate a token at all. Generate an empty comment |
915 | | // to allow passthrough clients to pick up the data using raw_. |
916 | | // Reset the tokenizer state and start again. |
917 | 12.5k | token_type_ = TokenType::COMMENT_TOKEN; |
918 | 12.5k | return token_type_; |
919 | 12.5k | } |
920 | 258k | if (Strings::IsCharAlphabet(c)) { |
921 | 256k | ReadTag(false); |
922 | 256k | if (eof_) { |
923 | 56 | token_type_ = TokenType::ERROR_TOKEN; |
924 | 255k | } else { |
925 | 255k | token_type_ = TokenType::END_TAG_TOKEN; |
926 | 255k | } |
927 | 256k | return token_type_; |
928 | 256k | } |
// "</" followed by neither '>' nor a letter: push the byte back, let
// ReadUntilCloseAngle() consume the rest and report it as a comment.
929 | 2.65k | UnreadByte(); |
930 | 2.65k | ReadUntilCloseAngle(); |
931 | 2.65k | token_type_ = TokenType::COMMENT_TOKEN; |
932 | 2.65k | return token_type_; |
933 | 1.03M | case TokenType::COMMENT_TOKEN: { |
934 | 1.03M | if (c == '!') { |
935 | 1.03M | token_type_ = ReadMarkupDeclaration(); |
936 | 1.03M | return token_type_; |
937 | 1.03M | } |
// Only '?' reaches this point: the comment is marked as manufactured.
938 | 2.90k | is_token_manufactured_ = true; |
939 | | // <? is part of the comment text. |
940 | 2.90k | UnreadByte(); |
941 | 2.90k | ReadUntilCloseAngle(); |
942 | 2.90k | token_type_ = TokenType::COMMENT_TOKEN; |
943 | 2.90k | return token_type_; |
944 | 1.03M | } |
945 | 0 | default: |
946 | 0 | break; |
947 | 14.4M | } |
948 | 14.4M | } |
949 | | |
// The scan ended (EOF). Anything consumed since raw_.start is returned as a
// final text token; if nothing was consumed, report ERROR_TOKEN.
950 | 10.4k | if (raw_.start < raw_.end) { |
951 | 2.33k | data_.end = raw_.end; |
952 | 2.33k | token_type_ = TokenType::TEXT_TOKEN; |
953 | 2.33k | return token_type_; |
954 | 2.33k | } |
955 | | |
956 | 8.10k | token_type_ = TokenType::ERROR_TOKEN; |
957 | 8.10k | return token_type_; |
958 | 10.4k | } |
959 | | |
// Returns the slice of buffer_ covered by raw_: raw_.end - raw_.start bytes
// starting at offset raw_.start.
960 | 0 | std::string_view Tokenizer::Raw() { |
961 | 0 | int size = raw_.end - raw_.start; |
962 | 0 | return buffer_.substr(raw_.start, size); |
963 | 0 | } |
964 | | |
// Returns the processed text of the current TEXT, COMMENT or DOCTYPE token;
// any other token type yields an empty string.
//
// The bytes spanned by data_ are copied out of buffer_, then data_ is
// collapsed to an empty span at raw_.end, so the text is consumed by this
// call. Processing order: Strings::ConvertNewLines, then null replacement
// (only when convert_null_ is set or the token is a comment), then
// Strings::UnescapeString unless text_is_raw_ is set.
965 | 1.67M | std::string Tokenizer::Text() { |
966 | 1.67M | switch (token_type_) { |
967 | 628k | case TokenType::TEXT_TOKEN: |
968 | 1.67M | case TokenType::COMMENT_TOKEN: |
969 | 1.67M | case TokenType::DOCTYPE_TOKEN: { |
970 | 1.67M | int size = data_.end - data_.start; |
971 | 1.67M | std::string s(buffer_.substr(data_.start, size)); |
// Empty the data_ span now that its contents have been copied into s.
972 | 1.67M | data_.start = raw_.end; |
973 | 1.67M | data_.end = raw_.end; |
974 | 1.67M | Strings::ConvertNewLines(&s); |
975 | 1.67M | if (convert_null_ || token_type_ == TokenType::COMMENT_TOKEN) { |
976 | | // Replace \x00 with \ufffd. |
977 | 1.06M | Strings::ReplaceAny(&s, |
978 | 1.06M | Strings::kNullChar, |
979 | 1.06M | Strings::kNullReplacementChar); |
980 | 1.06M | } |
// Raw text (text_is_raw_) is returned without unescaping.
981 | 1.67M | if (!text_is_raw_) Strings::UnescapeString(&s, false); |
982 | 1.67M | return s; |
983 | 1.67M | } |
984 | 0 | default: |
985 | 0 | break; |
986 | 1.67M | } |
987 | | |
988 | 0 | return ""; |
989 | 1.67M | } |
990 | | |
// Returns the lower-cased name of the current START_TAG, END_TAG or
// SELF_CLOSING_TAG token, together with a flag that is true while attributes
// remain to be fetched (n_attributes_returned_ < attributes_.size()).
// Returns std::nullopt for other token types or when the data_ span is empty.
//
// Side effect: data_ is collapsed to an empty span at raw_.end, so a repeated
// call for the same token returns std::nullopt.
991 | 13.4M | std::optional<std::tuple<std::string, bool>> Tokenizer::TagName() { |
992 | 13.4M | if (data_.start < data_.end) { |
993 | 13.4M | switch (token_type_) { |
994 | 13.1M | case TokenType::START_TAG_TOKEN: |
995 | 13.4M | case TokenType::END_TAG_TOKEN: |
996 | 13.4M | case TokenType::SELF_CLOSING_TAG_TOKEN: { |
997 | 13.4M | int size = data_.end - data_.start; |
998 | 13.4M | std::string s(buffer_.substr(data_.start, size)); |
// Empty the data_ span now that the name has been copied into s.
999 | 13.4M | data_.start = raw_.end; |
1000 | 13.4M | data_.end = raw_.end; |
1001 | 13.4M | Strings::ToLower(&s); |
1002 | 13.4M | return std::make_tuple<std::string, bool>(std::move(s), |
1003 | 13.4M | n_attributes_returned_ < attributes_.size()); |
1004 | 13.4M | } |
1005 | 0 | default: |
1006 | 0 | break; |
1007 | 13.4M | } |
1008 | 13.4M | } |
1009 | | |
1010 | 0 | return std::nullopt; |
1011 | 13.4M | } |
1012 | | |
// Returns the next not-yet-returned attribute of the current START_TAG or
// SELF_CLOSING_TAG token, paired with a flag that is true if further
// attributes remain. Returns std::nullopt once every attribute has been
// returned, or for any other token type (END_TAG_TOKEN included).
//
// The key is lower-cased; the value gets newline conversion and unescaping.
// Each successful call advances n_attributes_returned_.
1013 | 599k | std::optional<std::tuple<Attribute, bool>> Tokenizer::TagAttr() { |
1014 | 599k | if (n_attributes_returned_ < attributes_.size()) { |
1015 | 599k | switch (token_type_) { |
1016 | 593k | case TokenType::START_TAG_TOKEN: |
1017 | 599k | case TokenType::SELF_CLOSING_TAG_TOKEN: { |
// attr is a copy of the stored tuple: element 0 is the key span, element 1
// the value span (both offsets into buffer_), plus a LineCol.
1018 | 599k | auto attr = attributes_[n_attributes_returned_]; |
1019 | 599k | n_attributes_returned_++; |
1020 | 599k | int size = std::get<0>(attr).end - std::get<0>(attr).start; |
1021 | 599k | std::string key(buffer_.substr(std::get<0>(attr).start, size)); |
1022 | 599k | int value_size = std::get<1>(attr).end - std::get<1>(attr).start; |
1023 | 599k | std::string val(buffer_.substr(std::get<1>(attr).start, value_size)); |
1024 | 599k | Strings::ToLower(&key); |
1025 | 599k | Strings::ConvertNewLines(&val); |
// NOTE(review): the second argument is true here but false in Text();
// presumably it selects attribute-value unescaping rules -- confirm in
// Strings::UnescapeString.
1026 | 599k | Strings::UnescapeString(&val, true); |
1027 | 599k | return std::make_tuple<Attribute, bool>( |
1028 | 599k | {.name_space = "", |
1029 | 599k | .key = std::move(key), |
1030 | 599k | .value = std::move(val), |
1031 | 599k | .line_col_in_html_src = std::get<LineCol>(attr)}, |
1032 | 599k | n_attributes_returned_ < attributes_.size()); |
1033 | 593k | } |
1034 | 0 | default: |
1035 | 0 | break; |
1036 | 599k | } |
1037 | 599k | } |
1038 | | |
1039 | 0 | return std::nullopt; |
1040 | 599k | } |
1041 | | |
// Materializes the current token (as classified by token_type_) into a Token
// value: its type, payload and source position.
//
//  * TEXT_TOKEN: data is Text(); the position is estimated by stepping back
//    from the current column by the length of that text.
//  * COMMENT_TOKEN / DOCTYPE_TOKEN: data is Text(); is_manufactured is copied
//    from is_token_manufactured_.
//  * START / SELF_CLOSING / END tag tokens: atom (or, for unknown names,
//    data) is filled from TagName(); attributes are drained through TagAttr().
//  * ERROR_TOKEN: nothing beyond the type is filled in.
//
// In every case line_col_in_html_src is set from token_line_col_. Because
// Text(), TagName() and TagAttr() consume tokenizer state (data_ is emptied,
// n_attributes_returned_ advances), calling this twice for the same token
// does not reproduce the payload.
1042 | 15.1M | Token Tokenizer::token() { |
1043 | 15.1M | Token t; |
1044 | 15.1M | t.token_type = token_type_; |
1045 | 15.1M | switch (token_type_) { |
1046 | 628k | case TokenType::TEXT_TOKEN: { |
1047 | 628k | t.data = Text(); |
1048 | 628k | int line_number = current_line_col_.first; |
// NOTE(review): t.data is the processed text returned by Text() (newline
// conversion / unescaping applied), so its size is only an approximation of
// the columns consumed in the source. The subtraction also mixes
// current_line_col_.second with a size_t; the negative check below relies on
// the result converting back to a negative int -- confirm the member type.
1049 | 628k | int column_number = current_line_col_.second - t.data.size(); |
1050 | | // Shift to previous line, where this text belongs. |
1051 | 628k | if (column_number < 0) { |
// The second-to-last lines_cols_ entry is presumably the line/column recorded
// for the previous line -- verify against ReadByte().
1052 | 306k | if (lines_cols_.size() > 1) { |
1053 | 306k | auto previous_token_linecol = lines_cols_[lines_cols_.size() - 2]; |
1054 | 306k | line_number = previous_token_linecol.first; |
1055 | 306k | column_number = |
1056 | 306k | previous_token_linecol.second - abs(column_number) + 1; |
1057 | 306k | } else { |
// No earlier line is recorded: clamp the column to 0.
1058 | 425 | column_number = 0; |
1059 | 425 | } |
1060 | 306k | } |
1061 | 628k | token_line_col_ = {line_number, column_number}; |
1062 | 628k | break; |
1063 | 0 | } |
1064 | 1.04M | case TokenType::COMMENT_TOKEN: |
1065 | 1.04M | case TokenType::DOCTYPE_TOKEN: |
1066 | 1.04M | t.data = Text(); |
1067 | 1.04M | t.is_manufactured = is_token_manufactured_; |
// NOTE(review): unlike the text branch, nothing guards against this
// subtraction going below zero (e.g. data longer than the current column);
// confirm whether a negative column is acceptable here.
1068 | 1.04M | token_line_col_ = {current_line_col_.first, |
1069 | 1.04M | current_line_col_.second - t.data.size()}; |
1070 | 1.04M | break; |
// Tag tokens: token_line_col_ is not assigned in this branch, so the reported
// position is whatever it already holds (presumably set while the tag was
// read -- not visible here).
1071 | 13.1M | case TokenType::START_TAG_TOKEN: |
1072 | 13.1M | case TokenType::SELF_CLOSING_TAG_TOKEN: |
1073 | 13.4M | case TokenType::END_TAG_TOKEN: { |
1074 | 13.4M | auto tag_name_value = TagName(); |
1075 | 13.4M | if (tag_name_value.has_value()) { |
1076 | 13.4M | std::string tag_name = std::get<0>(tag_name_value.value()); |
1077 | 13.4M | bool has_attributes = std::get<1>(tag_name_value.value()); |
// Known tag names are carried by the atom alone; unknown names keep their
// spelling in t.data.
1078 | 13.4M | Atom atom = AtomUtil::ToAtom(tag_name); |
1079 | 13.4M | if (atom != Atom::UNKNOWN) { |
1080 | 5.49M | t.atom = atom; |
1081 | 7.92M | } else { |
1082 | 7.92M | t.atom = Atom::UNKNOWN; |
1083 | 7.92M | t.data = tag_name; |
1084 | 7.92M | } |
1085 | 13.4M | if (has_attributes) { |
// Drain attributes until TagAttr() returns nothing or reports that no more
// remain.
// NOTE(review): attr is copied out of the optional and copied again by
// push_back; moving it would avoid two Attribute copies per attribute.
1086 | 599k | while (true) { |
1087 | 599k | auto a = TagAttr(); |
1088 | 599k | if (!a.has_value()) break; |
1089 | 599k | auto attr = std::get<Attribute>(a.value()); |
1090 | 599k | bool more_attributes = std::get<bool>(a.value()); |
1091 | 599k | t.attributes.push_back(attr); |
1092 | 599k | if (!more_attributes) break; |
1093 | 599k | } |
1094 | 179k | } |
1095 | 13.4M | } |
1096 | 13.4M | break; |
1097 | 13.1M | } |
1098 | 12.5k | case TokenType::ERROR_TOKEN: |
1099 | | // Ignore. |
1100 | 12.5k | break; |
1101 | 15.1M | } |
1102 | | |
1103 | 15.1M | t.line_col_in_html_src = token_line_col_; |
1104 | 15.1M | return t; |
1105 | 15.1M | } |
1106 | | |
1107 | | } // namespace htmlparser |