/proc/self/cwd/cpp/htmlparser/tokenizer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include "cpp/htmlparser/tokenizer.h" |
2 | | |
3 | | #include "absl/flags/flag.h" |
4 | | #include "cpp/htmlparser/atom.h" |
5 | | #include "cpp/htmlparser/atomutil.h" |
6 | | #include "cpp/htmlparser/defer.h" |
7 | | #include "cpp/htmlparser/strings.h" |
8 | | |
9 | | ABSL_FLAG(std::size_t, htmlparser_max_attributes_per_node, |
10 | | 1000, |
11 | | "Protects out of memory errors by dropping insanely large amounts " |
12 | | "of attributes per node."); |
13 | | |
14 | | namespace htmlparser { |
15 | | |
16 | | Tokenizer::Tokenizer(std::string_view html, std::string context_tag) : |
17 | 11.5k | buffer_(html) { |
18 | 11.5k | lines_cols_.push_back(std::make_pair(1, 0)); |
19 | 11.5k | current_line_col_ = std::make_pair(1, 0); |
20 | 11.5k | token_line_col_ = std::make_pair(1, 0); |
21 | 11.5k | if (!context_tag.empty()) { |
22 | 0 | Strings::ToLower(&context_tag); |
23 | 0 | if (std::find(kAllowedFragmentContainers.begin(), |
24 | 0 | kAllowedFragmentContainers.end(), |
25 | 0 | AtomUtil::ToAtom(context_tag)) != |
26 | 0 | kAllowedFragmentContainers.end()) { |
27 | 0 | raw_tag_ = context_tag; |
28 | 0 | } |
29 | 0 | } |
30 | 11.5k | } |
31 | | |
32 | 172M | inline char Tokenizer::ReadByte() { |
33 | 172M | if (raw_.end >= buffer_.size()) { |
34 | 11.5k | eof_ = true; |
35 | 11.5k | return 0; |
36 | 11.5k | } |
37 | | |
38 | 172M | char c = buffer_.at(raw_.end++); |
39 | 172M | current_line_col_.second++; |
40 | 172M | int multi_byte = Strings::CodePointByteSequenceCount(c); |
41 | 172M | if (multi_byte > 1) { |
42 | 12.4M | current_line_col_.second -= (multi_byte - 1); |
43 | 12.4M | } |
44 | | |
45 | 172M | if (c == '\n' || (c == '\r' && |
46 | 172M | raw_.end < buffer_.size() && |
47 | 172M | buffer_.at(raw_.end) != '\n')) { |
48 | 14.6M | lines_cols_.back() = current_line_col_; |
49 | | // Increment line number and reset column number. |
50 | 14.6M | current_line_col_.first++; |
51 | 14.6M | current_line_col_.second = 0; |
52 | 14.6M | lines_cols_.push_back({current_line_col_.first + 1, 0}); |
53 | 14.6M | } |
54 | | |
55 | 172M | return c; |
56 | 172M | } |
57 | | |
58 | 45.6M | inline void Tokenizer::UnreadByte() { |
59 | 45.6M | raw_.end--; |
60 | 45.6M | if (current_line_col_.first > 1 && current_line_col_.second == 0) { |
61 | 2.64M | if (lines_cols_.size() > 1) { |
62 | 2.64M | lines_cols_.pop_back(); |
63 | 2.64M | } |
64 | 2.64M | current_line_col_ = lines_cols_.back(); |
65 | 2.64M | return; |
66 | 2.64M | } |
67 | | |
68 | 42.9M | current_line_col_.second--; |
69 | 42.9M | } |
70 | | |
71 | 19.2M | void Tokenizer::SkipWhiteSpace() { |
72 | 23.2M | while (!eof_) { |
73 | 23.2M | char c = ReadByte(); |
74 | 23.2M | switch (c) { |
75 | 381 | case ' ': |
76 | 74.5k | case '\n': |
77 | 3.93M | case '\r': |
78 | 3.93M | case '\t': |
79 | 4.03M | case '\f': |
80 | 4.03M | break; |
81 | 19.2M | default: |
82 | 19.2M | UnreadByte(); |
83 | 19.2M | return; |
84 | 23.2M | } |
85 | 23.2M | } |
86 | 19.2M | } |
87 | | |
88 | 15.3M | void Tokenizer::SetAllowCDATA(bool allow_cdata) { |
89 | 15.3M | allow_cdata_ = allow_cdata; |
90 | 15.3M | } |
91 | | |
92 | 354k | void Tokenizer::NextIsNotRawText() { |
93 | 354k | raw_tag_ = ""; |
94 | 354k | } |
95 | | |
96 | 23.7k | void Tokenizer::ReadRawOrRCDATA() { |
97 | 23.7k | if (raw_tag_ == "script") { |
98 | 2.65k | ReadScript(); |
99 | 2.65k | text_is_raw_ = true; |
100 | 2.65k | raw_tag_ = ""; |
101 | 2.65k | return; |
102 | 2.65k | } |
103 | | |
104 | 404k | while (!eof_) { |
105 | 404k | char c = ReadByte(); |
106 | 404k | if (eof_) break; |
107 | 404k | if (c != '<') continue; |
108 | 99.0k | c = ReadByte(); |
109 | 99.0k | if (eof_) break; |
110 | 99.0k | if (c != '/') continue; |
111 | 74.4k | if (ReadRawEndTag() || eof_) break; |
112 | 74.4k | } |
113 | | |
114 | 21.1k | data_.end = raw_.end; |
115 | | // A textarea's or title's RCDATA can contain escaped entities. |
116 | 21.1k | text_is_raw_ = raw_tag_ != "textarea" && raw_tag_ != "title"; |
117 | 21.1k | raw_tag_ = ""; |
118 | 21.1k | } |
119 | | |
120 | 114k | bool Tokenizer::ReadRawEndTag() { |
121 | 421k | for (std::size_t i = 0; i < raw_tag_.size(); ++i) { |
122 | 364k | char c = ReadByte(); |
123 | 364k | if (eof_) return false; |
124 | 364k | if (c != raw_tag_.at(i) && c != (raw_tag_.at(i) - ('a' - 'A'))) { |
125 | 57.1k | UnreadByte(); |
126 | 57.1k | return false; |
127 | 57.1k | } |
128 | 364k | } |
129 | | |
130 | 57.0k | char c = ReadByte(); |
131 | 57.0k | if (eof_) return false; |
132 | 57.0k | switch (c) { |
133 | 1.70k | case ' ': |
134 | 1.80k | case '\n': |
135 | 1.82k | case '\t': |
136 | 1.83k | case '\f': |
137 | 1.85k | case '/': |
138 | 25.2k | case '>': |
139 | | // The 3 is 2 for the leading "</" plus 1 for the trailing character c. |
140 | 25.2k | raw_.end -= (3 /* <, /, and > */+ raw_tag_.size()); |
141 | 25.2k | current_line_col_.second -= (3 /* <, /, and > */ + raw_tag_.size()); |
142 | 25.2k | return true; |
143 | 57.0k | } |
144 | 31.7k | UnreadByte(); |
145 | 31.7k | return false; |
146 | 57.0k | } |
147 | | |
// States of the state machine in Tokenizer::ReadScript(). Enumerators are
// numbered consecutively from 0; the numeric value is noted beside each one.
enum ScriptDataState {
  DONE,                                       // 0
  SCRIPT_DATA,                                // 1
  SCRIPT_DATA_LESS_THAN_SIGN,                 // 2
  SCRIPT_DATA_END_TAG_OPEN,                   // 3
  SCRIPT_DATA_ESCAPE_START,                   // 4
  SCRIPT_DATA_ESCAPE_START_DASH,              // 5
  SCRIPT_DATA_ESCAPED,                        // 6
  SCRIPT_DATA_ESCAPED_DASH,                   // 7
  SCRIPT_DATA_ESCAPED_DASH_DASH,              // 8
  SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,         // 9
  SCRIPT_DATA_ESCAPED_END_TAG_OPEN,           // 10
  SCRIPT_DATA_DOUBLE_ESCAPE_START,            // 11
  SCRIPT_DATA_DOUBLE_ESCAPED,                 // 12
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH,            // 13
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,       // 14
  SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,  // 15
  SCRIPT_DATA_DOUBLE_ESCAPED_END              // 16
};
167 | | |
168 | 2.65k | void Tokenizer::ReadScript() { |
169 | 2.65k | defer({data_.end = raw_.end;}); |
170 | 2.65k | ScriptDataState state = ScriptDataState::SCRIPT_DATA; |
171 | 12.0M | while (!eof_ && state != ScriptDataState::DONE) { |
172 | 12.0M | switch (state) { |
173 | 4.92M | case ScriptDataState::SCRIPT_DATA: { |
174 | 4.92M | char c = ReadByte(); |
175 | 4.92M | if (eof_) return; |
176 | 4.92M | if (c == '<') { |
177 | 1.44M | state = ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN; |
178 | 3.48M | } else { |
179 | 3.48M | state = ScriptDataState::SCRIPT_DATA; |
180 | 3.48M | } |
181 | 4.92M | break; |
182 | 4.92M | } |
183 | 1.44M | case ScriptDataState::SCRIPT_DATA_LESS_THAN_SIGN: { |
184 | 1.44M | char c = ReadByte(); |
185 | 1.44M | if (eof_) return; |
186 | 1.44M | if (c == '/') { |
187 | 36.0k | state = ScriptDataState::SCRIPT_DATA_END_TAG_OPEN; |
188 | 1.40M | } else if (c == '!') { |
189 | 615k | state = ScriptDataState::SCRIPT_DATA_ESCAPE_START; |
190 | 790k | } else { |
191 | 790k | UnreadByte(); |
192 | 790k | state = ScriptDataState::SCRIPT_DATA; |
193 | 790k | } |
194 | 1.44M | break; |
195 | 1.44M | } |
196 | 36.0k | case ScriptDataState::SCRIPT_DATA_END_TAG_OPEN: { |
197 | 36.0k | if (ReadRawEndTag() || eof_) { |
198 | 1.51k | return; |
199 | 1.51k | } |
200 | 34.5k | state = ScriptDataState::SCRIPT_DATA; |
201 | 34.5k | break; |
202 | 36.0k | } |
203 | 615k | case ScriptDataState::SCRIPT_DATA_ESCAPE_START: { |
204 | 615k | char c = ReadByte(); |
205 | 615k | if (eof_) return; |
206 | 615k | if (c == '-') { |
207 | 613k | state = ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH; |
208 | 613k | } else { |
209 | 2.22k | UnreadByte(); |
210 | 2.22k | state = ScriptDataState::SCRIPT_DATA; |
211 | 2.22k | } |
212 | 615k | break; |
213 | 615k | } |
214 | 613k | case ScriptDataState::SCRIPT_DATA_ESCAPE_START_DASH: { |
215 | 613k | char c = ReadByte(); |
216 | 613k | if (eof_) return; |
217 | 613k | if (c == '-') { |
218 | 306k | state = SCRIPT_DATA_ESCAPED_DASH_DASH; |
219 | 306k | } else { |
220 | 306k | UnreadByte(); |
221 | 306k | state = ScriptDataState::SCRIPT_DATA; |
222 | 306k | } |
223 | 613k | break; |
224 | 613k | } |
225 | 1.53M | case ScriptDataState::SCRIPT_DATA_ESCAPED: { |
226 | 1.53M | char c = ReadByte(); |
227 | 1.53M | if (eof_) return; |
228 | 1.53M | if (c == '-') { |
229 | 591k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH; |
230 | 946k | } else if (c == '<') { |
231 | 23.3k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; |
232 | 923k | } else { |
233 | 923k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
234 | 923k | } |
235 | 1.53M | break; |
236 | 1.53M | } |
237 | 591k | case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH: { |
238 | 591k | char c = ReadByte(); |
239 | 591k | if (eof_) return; |
240 | 591k | if (c == '-') { |
241 | 2.14k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH; |
242 | 589k | } else if (c == '<') { |
243 | 586k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; |
244 | 586k | } else { |
245 | 2.70k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
246 | 2.70k | } |
247 | 591k | break; |
248 | 591k | } |
249 | 309k | case ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH: { |
250 | 309k | char c = ReadByte(); |
251 | 309k | if (eof_) return; |
252 | 309k | if (c == '-') { |
253 | 434 | state = ScriptDataState::SCRIPT_DATA_ESCAPED_DASH_DASH; |
254 | 309k | } else if (c == '<') { |
255 | 3.74k | state = ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; |
256 | 305k | } else if (c == '>') { |
257 | 66 | state = ScriptDataState::SCRIPT_DATA; |
258 | 305k | } else { |
259 | 305k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
260 | 305k | } |
261 | 309k | break; |
262 | 309k | } |
263 | 613k | case ScriptDataState::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: { |
264 | 613k | char c = ReadByte(); |
265 | 613k | if (eof_) return; |
266 | 613k | if (c == '/') { |
267 | 508 | state = ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN; |
268 | 613k | } else if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) { |
269 | 307k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START; |
270 | 307k | } else { |
271 | 305k | UnreadByte(); |
272 | 305k | state = ScriptDataState::SCRIPT_DATA; |
273 | 305k | } |
274 | 613k | break; |
275 | 613k | } |
276 | 508 | case ScriptDataState::SCRIPT_DATA_ESCAPED_END_TAG_OPEN: { |
277 | 508 | if (ReadRawEndTag()) { |
278 | 408 | state = ScriptDataState::DONE; |
279 | 408 | } else { |
280 | 100 | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
281 | 100 | } |
282 | 508 | break; |
283 | 613k | } |
284 | 307k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPE_START: { |
285 | 307k | UnreadByte(); |
286 | 307k | static std::string script_tag_l = "script"; |
287 | 307k | static std::string script_tag_u = "SCRIPT"; |
288 | 2.15M | for (int8_t i = 0; i < 6 /*script*/; ++i) { |
289 | 1.84M | char c = ReadByte(); |
290 | 1.84M | if (eof_) return; |
291 | 1.84M | if (c != script_tag_l[i] && c != script_tag_u[i]) { |
292 | 1.83M | UnreadByte(); |
293 | 1.83M | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
294 | 1.83M | } |
295 | 1.84M | } |
296 | 307k | char c = ReadByte(); |
297 | 307k | if (eof_) return; |
298 | 307k | if (c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\f' |
299 | 307k | || c == '/' || c == '>') { |
300 | 3.34k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
301 | 304k | } else { |
302 | 304k | UnreadByte(); |
303 | 304k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
304 | 304k | } |
305 | 307k | break; |
306 | 307k | } |
307 | 606k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED: { |
308 | 606k | char c = ReadByte(); |
309 | 606k | if (eof_) return; |
310 | 606k | if (c == '-') { |
311 | 204k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH; |
312 | 402k | } else if (c == '<') { |
313 | 85.7k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; |
314 | 316k | } else { |
315 | 316k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
316 | 316k | } |
317 | 606k | break; |
318 | 606k | } |
319 | 204k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH: { |
320 | 204k | char c = ReadByte(); |
321 | 204k | if (eof_) return; |
322 | 204k | if (c == '-') { |
323 | 38.9k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; |
324 | 165k | } else if (c == '<') { |
325 | 120k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; |
326 | 120k | } else if (c == '>') { |
327 | 567 | state = ScriptDataState::SCRIPT_DATA; |
328 | 44.1k | } else { |
329 | 44.1k | state = SCRIPT_DATA_DOUBLE_ESCAPED; |
330 | 44.1k | } |
331 | 204k | break; |
332 | 204k | } |
333 | 39.1k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: { |
334 | 39.1k | char c = ReadByte(); |
335 | 39.1k | if (eof_) return; |
336 | 39.1k | if (c == '-') { |
337 | 216 | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; |
338 | 38.9k | } else if (c == '<') { |
339 | 1.23k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; |
340 | 37.6k | } else if (c == '>') { |
341 | 66 | state = ScriptDataState::SCRIPT_DATA; |
342 | 37.6k | } else { |
343 | 37.6k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
344 | 37.6k | } |
345 | 39.1k | break; |
346 | 39.1k | } |
347 | 207k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: { |
348 | 207k | char c = ReadByte(); |
349 | 207k | if (eof_) return; |
350 | 207k | if (c == '/') { |
351 | 3.27k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END; |
352 | 204k | } else { |
353 | 204k | UnreadByte(); |
354 | 204k | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
355 | 204k | } |
356 | 207k | break; |
357 | 207k | } |
358 | 3.27k | case ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED_END: { |
359 | 3.27k | if (ReadRawEndTag()) { |
360 | 2.47k | raw_.end += std::string("</script>").size(); |
361 | 2.47k | state = ScriptDataState::SCRIPT_DATA_ESCAPED; |
362 | 2.47k | } else { |
363 | 796 | if (eof_) return; |
364 | 791 | state = ScriptDataState::SCRIPT_DATA_DOUBLE_ESCAPED; |
365 | 791 | } |
366 | 3.26k | break; |
367 | 3.27k | } |
368 | 3.26k | default: |
369 | 0 | break; |
370 | 12.0M | } |
371 | 12.0M | } |
372 | 2.65k | } |
373 | | |
374 | 1.25k | void Tokenizer::ReadComment() { |
375 | 1.25k | data_.start = raw_.end; |
376 | 1.25k | defer({ |
377 | 1.25k | if (data_.end < data_.start) { |
378 | | // It's a comment with no data, like <!--> |
379 | 1.25k | data_.end = data_.start; |
380 | 1.25k | } |
381 | 1.25k | }); |
382 | 1.25k | int dash_count = 2; |
383 | 5.29M | while (!eof_) { |
384 | 5.29M | char c = ReadByte(); |
385 | 5.29M | if (eof_) { |
386 | | // Ignore up to two dashes at EOF. |
387 | 79 | if (dash_count > 2) { |
388 | 10 | dash_count = 2; |
389 | 10 | } |
390 | 79 | data_.end = raw_.end - dash_count; |
391 | 79 | return; |
392 | 79 | } |
393 | 5.29M | if (c == '-') { |
394 | 1.33M | dash_count++; |
395 | 1.33M | continue; |
396 | 3.96M | } else if (c == '>') { |
397 | 7.47k | if (dash_count >= 2) { |
398 | 333 | data_.end = raw_.end - 3 /* --> */; |
399 | 333 | return; |
400 | 333 | } |
401 | 3.95M | } else if (c == '!') { |
402 | 726k | if (dash_count >= 2) { |
403 | 1.04k | char c = ReadByte(); |
404 | 1.04k | if (eof_) { |
405 | 14 | data_.end = raw_.end; |
406 | 14 | return; |
407 | 14 | } |
408 | 1.02k | if (c == '>') { |
409 | 828 | data_.end = raw_.end - 4 /* --!> */; |
410 | 828 | return; |
411 | 828 | } |
412 | 1.02k | } |
413 | 726k | } |
414 | 3.96M | dash_count = 0; |
415 | 3.96M | } |
416 | 1.25k | } |
417 | | |
418 | 1.39M | void Tokenizer::ReadUntilCloseAngle() { |
419 | 1.39M | data_.start = raw_.end; |
420 | 3.88M | while (!eof_) { |
421 | 3.88M | char c = ReadByte(); |
422 | 3.88M | if (eof_) { |
423 | 404 | data_.end = raw_.end; |
424 | 404 | return; |
425 | 404 | } |
426 | 3.88M | if (c == '>') { |
427 | 1.39M | data_.end = raw_.end - 1 /* ">" */; |
428 | 1.39M | return; |
429 | 1.39M | } |
430 | 3.88M | } |
431 | 1.39M | } |
432 | | |
433 | 1.39M | TokenType Tokenizer::ReadMarkupDeclaration() { |
434 | 1.39M | data_.start = raw_.end; |
435 | 1.39M | char c[2]; |
436 | 4.17M | for (int i = 0; i < 2; ++i) { |
437 | 2.78M | c[i] = ReadByte(); |
438 | 2.78M | if (eof_) { |
439 | 45 | data_.end = raw_.end; |
440 | 45 | return TokenType::COMMENT_TOKEN; |
441 | 45 | } |
442 | 2.78M | } |
443 | | |
444 | 1.39M | if (c[0] == '-' && c[1] == '-') { |
445 | 1.25k | ReadComment(); |
446 | 1.25k | return TokenType::COMMENT_TOKEN; |
447 | 1.25k | } |
448 | | |
449 | 1.39M | UnreadByte(); |
450 | 1.39M | UnreadByte(); |
451 | 1.39M | if (ReadDoctype()) { |
452 | 107 | return TokenType::DOCTYPE_TOKEN; |
453 | 107 | } |
454 | | |
455 | 1.39M | if (allow_cdata_ && ReadCDATA()) { |
456 | 64 | convert_null_ = true; |
457 | 64 | return TokenType::TEXT_TOKEN; |
458 | 64 | } |
459 | | |
460 | | // It's a bogus comment. |
461 | 1.39M | ReadUntilCloseAngle(); |
462 | 1.39M | return TokenType::COMMENT_TOKEN; |
463 | 1.39M | } |
464 | | |
465 | 1.39M | bool Tokenizer::ReadDoctype() { |
466 | 1.39M | token_line_col_ = {current_line_col_.first, |
467 | 1.39M | current_line_col_.second - 2 /* <! */}; |
468 | | |
469 | 1.39M | static constexpr std::string_view kDoctype = "DOCTYPE"; |
470 | 1.39M | for (std::size_t i = 0; i < kDoctype.size(); ++i) { |
471 | 1.39M | char c = ReadByte(); |
472 | 1.39M | if (eof_) { |
473 | 5 | data_.end = raw_.end; |
474 | 5 | return false; |
475 | 5 | } |
476 | 1.39M | if (c != kDoctype.at(i) && c != (kDoctype.at(i) + ('a' - 'A'))) { |
477 | | // Back up to read the fragment of "DOCTYPE" again. |
478 | 1.39M | raw_.end = data_.start; |
479 | 1.39M | return false; |
480 | 1.39M | } |
481 | 1.39M | } |
482 | | |
483 | 107 | SkipWhiteSpace(); |
484 | 107 | if (eof_) { |
485 | 2 | data_.start = raw_.end; |
486 | 2 | data_.end = raw_.end; |
487 | 2 | return true; |
488 | 2 | } |
489 | | |
490 | 105 | ReadUntilCloseAngle(); |
491 | 105 | return true; |
492 | 107 | } |
493 | | |
494 | 177 | bool Tokenizer::ReadCDATA() { |
495 | 177 | static constexpr std::string_view kCData = "[CDATA["; |
496 | 823 | for (std::size_t i = 0; i < kCData.size(); ++i) { |
497 | 759 | char c = ReadByte(); |
498 | 759 | if (eof_) { |
499 | 8 | data_.end = raw_.end; |
500 | 8 | return false; |
501 | 8 | } |
502 | 751 | if (c != kCData[i]) { |
503 | | // Back up to read the fragment of "[CDATA[" again. |
504 | 105 | data_.end = raw_.start; |
505 | 105 | return false; |
506 | 105 | } |
507 | 751 | } |
508 | 64 | data_.start = raw_.end; |
509 | 64 | int brackets = 0; |
510 | 4.05k | while (!eof_) { |
511 | 4.05k | char c = ReadByte(); |
512 | 4.05k | if (eof_) { |
513 | 27 | data_.end = raw_.end; |
514 | 27 | return true; |
515 | 27 | } |
516 | 4.02k | switch (c) { |
517 | 841 | case ']': { |
518 | 841 | brackets++; |
519 | 841 | break; |
520 | 0 | } |
521 | 1.01k | case '>': { |
522 | 1.01k | if (brackets >= 2) { |
523 | 37 | data_.end = raw_.end - 3 /* "]]>" */; |
524 | 37 | return true; |
525 | 37 | } |
526 | 976 | brackets = 0; |
527 | 976 | break; |
528 | 1.01k | } |
529 | 2.17k | default: |
530 | 2.17k | brackets = 0; |
531 | 4.02k | } |
532 | 4.02k | } |
533 | 0 | return false; |
534 | 64 | } |
535 | | |
536 | | template<typename... Args> |
537 | 6.28M | bool Tokenizer::StartTagIn(Args... ss) { |
538 | 6.28M | std::vector<std::string> argsList{ss...}; |
539 | 11.9M | for (const auto& s : argsList) { |
540 | 11.9M | if (data_.end - data_.start != s.size()) continue; |
541 | 1.41M | bool matched = true; |
542 | 3.21M | for (std::size_t i = 0; i < s.size(); ++i) { |
543 | 3.19M | char c = buffer_.at(data_.start + i); |
544 | 3.19M | if ('A' <= c && c <= 'Z') { |
545 | 2.95M | c += 'a' - 'A'; |
546 | 2.95M | } |
547 | 3.19M | if (c != s[i]) { |
548 | 1.39M | matched = false; |
549 | 1.39M | break; |
550 | 1.39M | } |
551 | 3.19M | } |
552 | 1.41M | if (matched) { |
553 | 24.1k | return true; |
554 | 24.1k | } |
555 | 1.41M | } |
556 | 6.26M | return false; |
557 | 6.28M | } bool htmlparser::Tokenizer::StartTagIn<char const*>(char const*) Line | Count | Source | 537 | 673k | bool Tokenizer::StartTagIn(Args... ss) { | 538 | 673k | std::vector<std::string> argsList{ss...}; | 539 | 673k | for (const auto& s : argsList) { | 540 | 673k | if (data_.end - data_.start != s.size()) continue; | 541 | 5.53k | bool matched = true; | 542 | 21.6k | for (std::size_t i = 0; i < s.size(); ++i) { | 543 | 16.6k | char c = buffer_.at(data_.start + i); | 544 | 16.6k | if ('A' <= c && c <= 'Z') { | 545 | 10.6k | c += 'a' - 'A'; | 546 | 10.6k | } | 547 | 16.6k | if (c != s[i]) { | 548 | 469 | matched = false; | 549 | 469 | break; | 550 | 469 | } | 551 | 16.6k | } | 552 | 5.53k | if (matched) { | 553 | 5.06k | return true; | 554 | 5.06k | } | 555 | 5.53k | } | 556 | 668k | return false; | 557 | 673k | } |
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*, char const*>(char const*, char const*, char const*) Line | Count | Source | 537 | 91.2k | bool Tokenizer::StartTagIn(Args... ss) { | 538 | 91.2k | std::vector<std::string> argsList{ss...}; | 539 | 273k | for (const auto& s : argsList) { | 540 | 273k | if (data_.end - data_.start != s.size()) continue; | 541 | 156k | bool matched = true; | 542 | 607k | for (std::size_t i = 0; i < s.size(); ++i) { | 543 | 606k | char c = buffer_.at(data_.start + i); | 544 | 606k | if ('A' <= c && c <= 'Z') { | 545 | 582k | c += 'a' - 'A'; | 546 | 582k | } | 547 | 606k | if (c != s[i]) { | 548 | 155k | matched = false; | 549 | 155k | break; | 550 | 155k | } | 551 | 606k | } | 552 | 156k | if (matched) { | 553 | 558 | return true; | 554 | 558 | } | 555 | 156k | } | 556 | 90.7k | return false; | 557 | 91.2k | } |
bool htmlparser::Tokenizer::StartTagIn<char const*, char const*>(char const*, char const*) Line | Count | Source | 537 | 5.52M | bool Tokenizer::StartTagIn(Args... ss) { | 538 | 5.52M | std::vector<std::string> argsList{ss...}; | 539 | 11.0M | for (const auto& s : argsList) { | 540 | 11.0M | if (data_.end - data_.start != s.size()) continue; | 541 | 1.25M | bool matched = true; | 542 | 2.58M | for (std::size_t i = 0; i < s.size(); ++i) { | 543 | 2.56M | char c = buffer_.at(data_.start + i); | 544 | 2.56M | if ('A' <= c && c <= 'Z') { | 545 | 2.36M | c += 'a' - 'A'; | 546 | 2.36M | } | 547 | 2.56M | if (c != s[i]) { | 548 | 1.23M | matched = false; | 549 | 1.23M | break; | 550 | 1.23M | } | 551 | 2.56M | } | 552 | 1.25M | if (matched) { | 553 | 18.4k | return true; | 554 | 18.4k | } | 555 | 1.25M | } | 556 | 5.50M | return false; | 557 | 5.52M | } |
|
558 | | |
559 | 12.9M | TokenType Tokenizer::ReadStartTag(bool template_mode) { |
560 | 12.9M | token_line_col_ = {current_line_col_.first, |
561 | 12.9M | current_line_col_.second - 1 /* < */}; |
562 | 12.9M | ReadTag(true, template_mode); |
563 | | |
564 | 12.9M | if (eof_) { |
565 | 306 | return TokenType::ERROR_TOKEN; |
566 | 306 | } |
567 | | |
568 | | // Several tags flag the tokenizer's next token as raw. |
569 | 12.9M | bool raw = false; |
570 | 12.9M | char c = buffer_.at(data_.start); |
571 | | |
572 | | // Lowercase. |
573 | 12.9M | if ('A' <= c && c <= 'Z') { |
574 | 6.39M | c += 'a' - 'A'; |
575 | 6.39M | } |
576 | | |
577 | 12.9M | switch (c) { |
578 | 644k | case 'i': |
579 | 644k | raw = StartTagIn("iframe"); |
580 | 644k | break; |
581 | 91.2k | case 'n': |
582 | 91.2k | raw = StartTagIn("noembed", "noframes", "noscript"); |
583 | 91.2k | break; |
584 | 20.1k | case 'p': |
585 | 20.1k | raw = StartTagIn("plaintext"); |
586 | 20.1k | break; |
587 | 3.13M | case 's': |
588 | 3.13M | raw = StartTagIn("script", "style"); |
589 | 3.13M | break; |
590 | 2.39M | case 't': |
591 | 2.39M | raw = StartTagIn("textarea", "title"); |
592 | 2.39M | break; |
593 | 8.68k | case 'x': |
594 | 8.68k | raw = StartTagIn("xmp"); |
595 | 12.9M | } |
596 | | |
597 | 12.9M | if (raw) { |
598 | 24.1k | int size = data_.end - data_.start; |
599 | 24.1k | raw_tag_ = std::string(buffer_.substr(data_.start, size)); |
600 | 24.1k | Strings::ToLower(&raw_tag_); |
601 | 24.1k | } |
602 | | |
603 | | // Look for a self-closing token like "<br/>". |
604 | 12.9M | if (!eof_ && buffer_[raw_.end - 2] == '/') { |
605 | 10.7k | return TokenType::SELF_CLOSING_TAG_TOKEN; |
606 | 10.7k | } |
607 | | |
608 | 12.9M | return TokenType::START_TAG_TOKEN; |
609 | 12.9M | } |
610 | | |
611 | 13.3M | void Tokenizer::ReadTag(bool save_attr, bool template_mode) { |
612 | 13.3M | attributes_.clear(); |
613 | 13.3M | n_attributes_returned_ = 0; |
614 | | |
615 | | // Read the tag name and attribute key/value pairs. |
616 | 13.3M | ReadTagName(); |
617 | 13.3M | SkipWhiteSpace(); |
618 | | |
619 | 13.3M | if (eof_) { |
620 | 136 | return; |
621 | 136 | } |
622 | | |
623 | 16.2M | while (!eof_) { |
624 | 16.2M | char c = ReadByte(); |
625 | 16.2M | if (eof_ || c == '>') { |
626 | 13.3M | break; |
627 | 13.3M | } |
628 | | |
629 | | // Undo previous > read. |
630 | 2.93M | UnreadByte(); |
631 | | |
632 | 2.93M | ReadTagAttributeKey(template_mode); |
633 | 2.93M | ReadTagAttributeValue(); |
634 | | // Save pending_attribute if save_attr and that attribute has a non-empty |
635 | | // key. |
636 | 2.93M | if (save_attr && |
637 | | // Skip excessive attributes. |
638 | 2.93M | attributes_.size() < ::absl::GetFlag( |
639 | 2.81M | FLAGS_htmlparser_max_attributes_per_node) && |
640 | 2.93M | std::get<0>(pending_attribute_).start != |
641 | 512k | std::get<0>(pending_attribute_).end) { |
642 | 430k | attributes_.push_back(pending_attribute_); |
643 | 430k | } |
644 | 2.93M | SkipWhiteSpace(); |
645 | 2.93M | } |
646 | 13.3M | } |
647 | | |
648 | 13.3M | void Tokenizer::ReadTagName() { |
649 | 13.3M | data_.start = raw_.end - 1; |
650 | 26.8M | while (!eof_) { |
651 | 26.8M | char c = ReadByte(); |
652 | 26.8M | if (eof_) { |
653 | 75 | data_.end = raw_.end; |
654 | 75 | return; |
655 | 75 | } |
656 | 26.8M | switch (c) { |
657 | 70.2k | case ' ': |
658 | 72.1k | case '\n': |
659 | 149k | case '\r': |
660 | 150k | case '\t': |
661 | 150k | case '\f': |
662 | 150k | data_.end = raw_.end - 1; |
663 | 150k | return; |
664 | 120k | case '/': |
665 | 13.1M | case '>': |
666 | 13.1M | UnreadByte(); |
667 | 13.1M | data_.end = raw_.end; |
668 | 13.1M | return; |
669 | 26.8M | } |
670 | 26.8M | } |
671 | 13.3M | } |
672 | | |
673 | | // Sets pending_attribute_[0] to the "k" in "<div k=v>". |
674 | | // Precondition: eof_ != true; |
675 | 2.93M | void Tokenizer::ReadTagAttributeKey(bool template_mode) { |
676 | 2.93M | std::get<0>(pending_attribute_).start = raw_.end; |
677 | 2.93M | std::get<LineCol>(pending_attribute_) = |
678 | 2.93M | {current_line_col_.first, current_line_col_.second + 1}; |
679 | | |
680 | | // All mustache_ prefixed variables applies to parsing logic for AMP mustache |
681 | | // templates. See: https://amp.dev/documentation/components/amp-mustache/ |
682 | 2.93M | bool mustache_inside_section_block = false; |
683 | 2.93M | std::string mustache_section_name = ""; |
684 | | |
685 | 32.5M | while (!eof_) { |
686 | 32.5M | char c = ReadByte(); |
687 | 32.5M | if (eof_) { |
688 | 99 | std::get<0>(pending_attribute_).start = raw_.end; |
689 | 99 | return; |
690 | 99 | } |
691 | | |
692 | | // Template attributes processing. |
693 | | // Looks for following special syntax. |
694 | | // {{#section}}...{{/section}} |
695 | | // {{^section}}...{{/section}} |
696 | | // {{variable}} |
697 | 32.5M | if (template_mode) { |
698 | 0 | UnreadByte(); |
699 | 0 | UnreadByte(); |
700 | 0 | UnreadByte(); |
701 | 0 | char c1 = ReadByte(); |
702 | 0 | char c2 = ReadByte(); |
703 | 0 | c = ReadByte(); |
704 | 0 | if (mustache_inside_section_block && c1 == '{' && c2 == '{' && c == '/') { |
705 | | // Look for closing section name. If not resort to default behavior. |
706 | | // Reason for this logic is to differentiate between: |
707 | | // <p {{#mycondition}}class=foo{{/mycondition}} foo=bar> vs. |
708 | | // <img {{#mycondition}}class=foo /> |
709 | 0 | int raw_end = raw_.end; |
710 | 0 | std::string_view close_section = |
711 | 0 | buffer_.substr(raw_.end, mustache_section_name.size()); |
712 | 0 | bool section_name_match = close_section == mustache_section_name; |
713 | 0 | if (section_name_match) { |
714 | 0 | raw_.end += mustache_section_name.size(); |
715 | 0 | char e1 = ReadByte(); |
716 | 0 | char e2 = ReadByte(); |
717 | 0 | if (e1 == '}' && e2 == '}') { |
718 | 0 | mustache_inside_section_block = false; |
719 | 0 | continue; |
720 | 0 | } else { |
721 | 0 | raw_.end = raw_end; |
722 | 0 | } |
723 | 0 | } |
724 | 0 | } |
725 | | |
726 | 0 | if (c1 == '{' && c2 == '{' && (c == '#' || c == '^')) { |
727 | 0 | auto n = buffer_.find("}}", raw_.end); |
728 | 0 | if (n != std::string_view::npos) { |
729 | 0 | mustache_section_name = buffer_.substr(raw_.end, n - raw_.end); |
730 | 0 | mustache_inside_section_block = true; |
731 | 0 | continue; |
732 | 0 | } |
733 | 0 | } |
734 | 0 | } |
735 | | |
736 | 32.5M | switch (c) { |
737 | 49.5k | case ' ': |
738 | 59.6k | case '\n': |
739 | 364k | case '\r': |
740 | 367k | case '\t': |
741 | 446k | case '\f': |
742 | 2.66M | case '/': { |
743 | 2.66M | std::get<0>(pending_attribute_).end = raw_.end - 1; |
744 | 2.66M | return; |
745 | 446k | } |
746 | 19.7k | case '=': |
747 | 266k | case '>': { |
748 | 266k | UnreadByte(); |
749 | 266k | std::get<0>(pending_attribute_).end = raw_.end; |
750 | 266k | return; |
751 | 19.7k | } |
752 | 32.5M | } |
753 | 32.5M | } |
754 | 2.93M | } |
755 | | |
756 | | // Sets pending_attribute_.second to the "v" in "<div k=v>". |
757 | 2.93M | void Tokenizer::ReadTagAttributeValue() { |
758 | 2.93M | std::get<1>(pending_attribute_).start = raw_.end; |
759 | 2.93M | std::get<1>(pending_attribute_).end = raw_.end; |
760 | 2.93M | SkipWhiteSpace(); |
761 | 2.93M | if (eof_) { |
762 | 148 | return; |
763 | 148 | } |
764 | 2.93M | char c = ReadByte(); |
765 | 2.93M | if (eof_) { |
766 | 0 | return; |
767 | 0 | } |
768 | | |
769 | 2.93M | if (c != '=') { |
770 | 2.91M | UnreadByte(); |
771 | 2.91M | return; |
772 | 2.91M | } |
773 | | |
774 | 19.7k | SkipWhiteSpace(); |
775 | 19.7k | if (eof_) { |
776 | 12 | return; |
777 | 12 | } |
778 | | |
779 | 19.7k | char quote = ReadByte(); |
780 | 19.7k | if (eof_) { |
781 | 0 | return; |
782 | 0 | } |
783 | | |
784 | 19.7k | switch (quote) { |
785 | 726 | case '>': |
786 | 726 | UnreadByte(); |
787 | 726 | return; |
788 | 245 | case '\'': |
789 | 255 | case '"': |
790 | 255 | std::get<1>(pending_attribute_).start = raw_.end; |
791 | 453 | while (!eof_) { |
792 | 453 | c = ReadByte(); |
793 | 453 | if (eof_) { |
794 | 14 | std::get<1>(pending_attribute_).end = raw_.end; |
795 | 14 | return; |
796 | 14 | } |
797 | 439 | if (c == quote) { |
798 | 241 | std::get<1>(pending_attribute_).end = raw_.end - 1; |
799 | 241 | return; |
800 | 241 | } |
801 | 439 | } |
802 | 0 | break; |
803 | 18.8k | default: { |
804 | 18.8k | std::get<1>(pending_attribute_).start = raw_.end - 1; |
805 | 2.35M | while (!eof_) { |
806 | 2.35M | c = ReadByte(); |
807 | 2.35M | if (eof_) { |
808 | 2 | std::get<1>(pending_attribute_).end = raw_.end; |
809 | 2 | return; |
810 | 2 | } |
811 | 2.35M | switch (c) { |
812 | 995 | case ' ': |
813 | 1.20k | case '\n': |
814 | 1.97k | case '\r': |
815 | 2.42k | case '\t': |
816 | 7.40k | case '\f': |
817 | 7.40k | std::get<1>(pending_attribute_).end = raw_.end - 1; |
818 | 7.40k | return; |
819 | 11.3k | case '>': |
820 | 11.3k | UnreadByte(); |
821 | 11.3k | std::get<1>(pending_attribute_).end = raw_.end; |
822 | 11.3k | return; |
823 | 2.35M | } |
824 | 2.35M | } |
825 | 18.8k | } |
826 | 19.7k | } |
827 | 19.7k | } |
828 | | |
829 | 15.3M | TokenType Tokenizer::Next(bool template_mode) { |
830 | 15.3M | raw_.start = raw_.end; |
831 | 15.3M | data_.start = raw_.end; |
832 | 15.3M | data_.end = raw_.end; |
833 | 15.3M | is_token_manufactured_ = false; |
834 | | |
835 | 15.3M | if (eof_) { |
836 | 3.72k | err_ = true; |
837 | 3.72k | token_type_ = TokenType::ERROR_TOKEN; |
838 | 3.72k | return token_type_; |
839 | 3.72k | } |
840 | | |
841 | 15.3M | if (raw_tag_ != "") { |
842 | 23.7k | if (raw_tag_ == "plaintext") { |
843 | | // Read everything up to EOF. |
844 | 0 | while (!eof_) { |
845 | 0 | ReadByte(); |
846 | 0 | } |
847 | 0 | data_.end = raw_.end; |
848 | 0 | text_is_raw_ = true; |
849 | 23.7k | } else { |
850 | 23.7k | ReadRawOrRCDATA(); |
851 | 23.7k | } |
852 | | |
853 | 23.7k | if (data_.end > data_.start) { |
854 | 20.4k | token_type_ = TokenType::TEXT_TOKEN; |
855 | 20.4k | convert_null_ = true; |
856 | 20.4k | return token_type_; |
857 | 20.4k | } |
858 | 23.7k | } |
859 | | |
860 | 15.2M | text_is_raw_ = false; |
861 | 15.2M | convert_null_ = false; |
862 | | |
863 | 24.4M | while (!eof_) { |
864 | 24.4M | char c = ReadByte(); |
865 | | |
866 | 24.4M | if (eof_) { |
867 | 9.36k | break; |
868 | 9.36k | } |
869 | | |
870 | 24.4M | if (c != '<') { |
871 | 8.98M | continue; |
872 | 8.98M | } |
873 | | |
874 | 15.4M | c = ReadByte(); |
875 | 15.4M | if (eof_) break; |
876 | | |
877 | | // Check if the '<' we have just read is part of a tag, comment or |
878 | | // doctype. If not, it's part of the accumulated text token. |
879 | 15.4M | TokenType token_type; |
880 | 15.4M | if (Strings::IsCharAlphabet(c)) { |
881 | 13.4M | token_type = TokenType::START_TAG_TOKEN; |
882 | 13.4M | } else if (c == '/') { |
883 | 450k | token_type = TokenType::END_TAG_TOKEN; |
884 | 1.58M | } else if (c == '!' || c == '?') { |
885 | 1.40M | token_type = TokenType::COMMENT_TOKEN; |
886 | 1.40M | } else { |
887 | 174k | UnreadByte(); |
888 | 174k | continue; |
889 | 174k | } |
890 | | |
891 | | // We have a non-text token, but we might have accumulated some text |
892 | | // before that. If so, we return the text first, and return the non text |
893 | | // token on the subsequent call to Next. |
894 | | // |
895 | | // <space><space><mytag>, returns two spaces before processing the mytag |
896 | | // token in the next call. |
897 | 15.2M | if (int x = raw_.end - 2 /* "<a" */; raw_.start < x) { |
898 | 547k | raw_.end = x; |
899 | 547k | data_.end = x; |
900 | | // We know there is no \n so no line adjustment needed. |
901 | 547k | current_line_col_.second -= 2; |
902 | 547k | token_type_ = TokenType::TEXT_TOKEN; |
903 | 547k | return token_type_; |
904 | 547k | } |
905 | | |
906 | 14.7M | switch (token_type) { |
907 | 12.9M | case TokenType::START_TAG_TOKEN: |
908 | 12.9M | token_type_ = ReadStartTag(template_mode); |
909 | 12.9M | return token_type_; |
910 | 414k | case TokenType::END_TAG_TOKEN: |
911 | 414k | c = ReadByte(); |
912 | 414k | if (eof_) break; |
913 | 414k | if (c == '>') { |
914 | | // "</> does not generate a token at all. Generate an empty comment |
915 | | // to allow passthrough clients to pick up the data using raw_. |
916 | | // Reset the tokenizer state and start again. |
917 | 5.01k | token_type_ = TokenType::COMMENT_TOKEN; |
918 | 5.01k | return token_type_; |
919 | 5.01k | } |
920 | 409k | if (Strings::IsCharAlphabet(c)) { |
921 | 406k | ReadTag(false); |
922 | 406k | if (eof_) { |
923 | 52 | token_type_ = TokenType::ERROR_TOKEN; |
924 | 406k | } else { |
925 | 406k | token_type_ = TokenType::END_TAG_TOKEN; |
926 | 406k | } |
927 | 406k | return token_type_; |
928 | 406k | } |
929 | 2.82k | UnreadByte(); |
930 | 2.82k | ReadUntilCloseAngle(); |
931 | 2.82k | token_type_ = TokenType::COMMENT_TOKEN; |
932 | 2.82k | return token_type_; |
933 | 1.39M | case TokenType::COMMENT_TOKEN: { |
934 | 1.39M | if (c == '!') { |
935 | 1.39M | token_type_ = ReadMarkupDeclaration(); |
936 | 1.39M | return token_type_; |
937 | 1.39M | } |
938 | 1.28k | is_token_manufactured_ = true; |
939 | | // <? is part of the comment text. |
940 | 1.28k | UnreadByte(); |
941 | 1.28k | ReadUntilCloseAngle(); |
942 | 1.28k | token_type_ = TokenType::COMMENT_TOKEN; |
943 | 1.28k | return token_type_; |
944 | 1.39M | } |
945 | 0 | default: |
946 | 0 | break; |
947 | 14.7M | } |
948 | 14.7M | } |
949 | | |
950 | 9.66k | if (raw_.start < raw_.end) { |
951 | 2.22k | data_.end = raw_.end; |
952 | 2.22k | token_type_ = TokenType::TEXT_TOKEN; |
953 | 2.22k | return token_type_; |
954 | 2.22k | } |
955 | | |
956 | 7.44k | token_type_ = TokenType::ERROR_TOKEN; |
957 | 7.44k | return token_type_; |
958 | 9.66k | } |
959 | | |
960 | 0 | std::string_view Tokenizer::Raw() { |
961 | 0 | int size = raw_.end - raw_.start; |
962 | 0 | return buffer_.substr(raw_.start, size); |
963 | 0 | } |
964 | | |
965 | 1.97M | std::string Tokenizer::Text() { |
966 | 1.97M | switch (token_type_) { |
967 | 570k | case TokenType::TEXT_TOKEN: |
968 | 1.97M | case TokenType::COMMENT_TOKEN: |
969 | 1.97M | case TokenType::DOCTYPE_TOKEN: { |
970 | 1.97M | int size = data_.end - data_.start; |
971 | 1.97M | std::string s(buffer_.substr(data_.start, size)); |
972 | 1.97M | data_.start = raw_.end; |
973 | 1.97M | data_.end = raw_.end; |
974 | 1.97M | Strings::ConvertNewLines(&s); |
975 | 1.97M | if (convert_null_ || token_type_ == TokenType::COMMENT_TOKEN) { |
976 | | // Replace \x00 with \ufffd. |
977 | 1.42M | Strings::ReplaceAny(&s, |
978 | 1.42M | Strings::kNullChar, |
979 | 1.42M | Strings::kNullReplacementChar); |
980 | 1.42M | } |
981 | 1.97M | if (!text_is_raw_) Strings::UnescapeString(&s, false); |
982 | 1.97M | return s; |
983 | 1.97M | } |
984 | 0 | default: |
985 | 0 | break; |
986 | 1.97M | } |
987 | | |
988 | 0 | return ""; |
989 | 1.97M | } |
990 | | |
991 | 13.3M | std::optional<std::tuple<std::string, bool>> Tokenizer::TagName() { |
992 | 13.3M | if (data_.start < data_.end) { |
993 | 13.3M | switch (token_type_) { |
994 | 12.9M | case TokenType::START_TAG_TOKEN: |
995 | 13.3M | case TokenType::END_TAG_TOKEN: |
996 | 13.3M | case TokenType::SELF_CLOSING_TAG_TOKEN: { |
997 | 13.3M | int size = data_.end - data_.start; |
998 | 13.3M | std::string s(buffer_.substr(data_.start, size)); |
999 | 13.3M | data_.start = raw_.end; |
1000 | 13.3M | data_.end = raw_.end; |
1001 | 13.3M | Strings::ToLower(&s); |
1002 | 13.3M | return std::make_tuple<std::string, bool>(std::move(s), |
1003 | 13.3M | n_attributes_returned_ < attributes_.size()); |
1004 | 13.3M | } |
1005 | 0 | default: |
1006 | 0 | break; |
1007 | 13.3M | } |
1008 | 13.3M | } |
1009 | | |
1010 | 0 | return std::nullopt; |
1011 | 13.3M | } |
1012 | | |
1013 | 417k | std::optional<std::tuple<Attribute, bool>> Tokenizer::TagAttr() { |
1014 | 417k | if (n_attributes_returned_ < attributes_.size()) { |
1015 | 417k | switch (token_type_) { |
1016 | 409k | case TokenType::START_TAG_TOKEN: |
1017 | 417k | case TokenType::SELF_CLOSING_TAG_TOKEN: { |
1018 | 417k | auto attr = attributes_[n_attributes_returned_]; |
1019 | 417k | n_attributes_returned_++; |
1020 | 417k | int size = std::get<0>(attr).end - std::get<0>(attr).start; |
1021 | 417k | std::string key(buffer_.substr(std::get<0>(attr).start, size)); |
1022 | 417k | int value_size = std::get<1>(attr).end - std::get<1>(attr).start; |
1023 | 417k | std::string val(buffer_.substr(std::get<1>(attr).start, value_size)); |
1024 | 417k | Strings::ToLower(&key); |
1025 | 417k | Strings::ConvertNewLines(&val); |
1026 | 417k | Strings::UnescapeString(&val, true); |
1027 | 417k | return std::make_tuple<Attribute, bool>( |
1028 | 417k | {.name_space = "", |
1029 | 417k | .key = std::move(key), |
1030 | 417k | .value = std::move(val), |
1031 | 417k | .line_col_in_html_src = std::get<LineCol>(attr)}, |
1032 | 417k | n_attributes_returned_ < attributes_.size()); |
1033 | 409k | } |
1034 | 0 | default: |
1035 | 0 | break; |
1036 | 417k | } |
1037 | 417k | } |
1038 | | |
1039 | 0 | return std::nullopt; |
1040 | 417k | } |
1041 | | |
1042 | 15.3M | Token Tokenizer::token() { |
1043 | 15.3M | Token t; |
1044 | 15.3M | t.token_type = token_type_; |
1045 | 15.3M | switch (token_type_) { |
1046 | 570k | case TokenType::TEXT_TOKEN: { |
1047 | 570k | t.data = Text(); |
1048 | 570k | int line_number = current_line_col_.first; |
1049 | 570k | int column_number = current_line_col_.second - t.data.size(); |
1050 | | // Shift to previous line, where this text belongs. |
1051 | 570k | if (column_number < 0) { |
1052 | 209k | if (lines_cols_.size() > 1) { |
1053 | 209k | auto previous_token_linecol = lines_cols_[lines_cols_.size() - 2]; |
1054 | 209k | line_number = previous_token_linecol.first; |
1055 | 209k | column_number = |
1056 | 209k | previous_token_linecol.second - abs(column_number) + 1; |
1057 | 209k | } else { |
1058 | 224 | column_number = 0; |
1059 | 224 | } |
1060 | 209k | } |
1061 | 570k | token_line_col_ = {line_number, column_number}; |
1062 | 570k | break; |
1063 | 0 | } |
1064 | 1.40M | case TokenType::COMMENT_TOKEN: |
1065 | 1.40M | case TokenType::DOCTYPE_TOKEN: |
1066 | 1.40M | t.data = Text(); |
1067 | 1.40M | t.is_manufactured = is_token_manufactured_; |
1068 | 1.40M | token_line_col_ = {current_line_col_.first, |
1069 | 1.40M | current_line_col_.second - t.data.size()}; |
1070 | 1.40M | break; |
1071 | 12.9M | case TokenType::START_TAG_TOKEN: |
1072 | 12.9M | case TokenType::SELF_CLOSING_TAG_TOKEN: |
1073 | 13.3M | case TokenType::END_TAG_TOKEN: { |
1074 | 13.3M | auto tag_name_value = TagName(); |
1075 | 13.3M | if (tag_name_value.has_value()) { |
1076 | 13.3M | std::string tag_name = std::get<0>(tag_name_value.value()); |
1077 | 13.3M | bool has_attributes = std::get<1>(tag_name_value.value()); |
1078 | 13.3M | Atom atom = AtomUtil::ToAtom(tag_name); |
1079 | 13.3M | if (atom != Atom::UNKNOWN) { |
1080 | 7.12M | t.atom = atom; |
1081 | 7.12M | } else { |
1082 | 6.20M | t.atom = Atom::UNKNOWN; |
1083 | 6.20M | t.data = tag_name; |
1084 | 6.20M | } |
1085 | 13.3M | if (has_attributes) { |
1086 | 417k | while (true) { |
1087 | 417k | auto a = TagAttr(); |
1088 | 417k | if (!a.has_value()) break; |
1089 | 417k | auto attr = std::get<Attribute>(a.value()); |
1090 | 417k | bool more_attributes = std::get<bool>(a.value()); |
1091 | 417k | t.attributes.push_back(attr); |
1092 | 417k | if (!more_attributes) break; |
1093 | 417k | } |
1094 | 199k | } |
1095 | 13.3M | } |
1096 | 13.3M | break; |
1097 | 12.9M | } |
1098 | 11.5k | case TokenType::ERROR_TOKEN: |
1099 | | // Ignore. |
1100 | 11.5k | break; |
1101 | 15.3M | } |
1102 | | |
1103 | 15.3M | t.line_col_in_html_src = token_line_col_; |
1104 | 15.3M | return t; |
1105 | 15.3M | } |
1106 | | |
1107 | | } // namespace htmlparser |