/src/serenity/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> |
3 | | * Copyright (c) 2022, Linus Groh <linusg@serenityos.org> |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #pragma once |
9 | | |
10 | | #include <AK/Queue.h> |
11 | | #include <AK/StringBuilder.h> |
12 | | #include <AK/StringView.h> |
13 | | #include <AK/Types.h> |
14 | | #include <AK/Utf8View.h> |
15 | | #include <LibJS/Heap/GCPtr.h> |
16 | | #include <LibWeb/Forward.h> |
17 | | #include <LibWeb/HTML/Parser/HTMLToken.h> |
18 | | |
19 | | namespace Web::HTML { |
20 | | |
// X-macro listing every tokenizer state exactly once.
//
// Usage: define __ENUMERATE_TOKENIZER_STATE(state), expand ENUMERATE_TOKENIZER_STATES,
// then #undef __ENUMERATE_TOKENIZER_STATE again. Within this header it is expanded twice:
// once to generate the enumerators of HTMLTokenizer::State and once to generate the
// switch cases of HTMLTokenizer::state_name().
//
// The order of the entries is significant: the enumerators are generated without explicit
// values, so each state's numeric value is its position in this list (Data is 0).
// Append/insert with care if anything depends on those values.
//
// NOTE(review): the names appear to mirror the tokenization states of the HTML Standard;
// confirm against the spec before adding or renaming entries.
21 | | #define ENUMERATE_TOKENIZER_STATES \ |
22 | | __ENUMERATE_TOKENIZER_STATE(Data) \ |
23 | | __ENUMERATE_TOKENIZER_STATE(RCDATA) \ |
24 | | __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \ |
25 | | __ENUMERATE_TOKENIZER_STATE(ScriptData) \ |
26 | | __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \ |
27 | | __ENUMERATE_TOKENIZER_STATE(TagOpen) \ |
28 | | __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \ |
29 | | __ENUMERATE_TOKENIZER_STATE(TagName) \ |
30 | | __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \ |
31 | | __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \ |
32 | | __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \ |
33 | | __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \ |
34 | | __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \ |
35 | | __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \ |
36 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \ |
37 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \ |
38 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \ |
39 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \ |
40 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \ |
41 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \ |
42 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \ |
43 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \ |
44 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \ |
45 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \ |
46 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \ |
47 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \ |
48 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \ |
49 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \ |
50 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \ |
51 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \ |
52 | | __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \ |
53 | | __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \ |
54 | | __ENUMERATE_TOKENIZER_STATE(AttributeName) \ |
55 | | __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \ |
56 | | __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \ |
57 | | __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \ |
58 | | __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \ |
59 | | __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \ |
60 | | __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \ |
61 | | __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \ |
62 | | __ENUMERATE_TOKENIZER_STATE(BogusComment) \ |
63 | | __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \ |
64 | | __ENUMERATE_TOKENIZER_STATE(CommentStart) \ |
65 | | __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \ |
66 | | __ENUMERATE_TOKENIZER_STATE(Comment) \ |
67 | | __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \ |
68 | | __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \ |
69 | | __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \ |
70 | | __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \ |
71 | | __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \ |
72 | | __ENUMERATE_TOKENIZER_STATE(CommentEnd) \ |
73 | | __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \ |
74 | | __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \ |
75 | | __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \ |
76 | | __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \ |
77 | | __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \ |
78 | | __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \ |
79 | | __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \ |
80 | | __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \ |
81 | | __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \ |
82 | | __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \ |
83 | | __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \ |
84 | | __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \ |
85 | | __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \ |
86 | | __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \ |
87 | | __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \ |
88 | | __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \ |
89 | | __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \ |
90 | | __ENUMERATE_TOKENIZER_STATE(CDATASection) \ |
91 | | __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \ |
92 | | __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \ |
93 | | __ENUMERATE_TOKENIZER_STATE(CharacterReference) \ |
94 | | __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \ |
95 | | __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \ |
96 | | __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \ |
97 | | __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \ |
98 | | __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \ |
99 | | __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \ |
100 | | __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \ |
101 | | __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd) |
102 | | |
// Tokenizer for HTML input.
//
// Holds the current state (one of the ENUMERATE_TOKENIZER_STATES entries), the decoded
// input together with a UTF-8 iterator over it, and hands out HTMLToken values through
// next_token(). Only the small accessors are defined inline here; every member that is
// merely declared is defined out of line (not visible in this header).
103 | | class HTMLTokenizer { |
104 | | public: |
// NOTE(review): `encoding` presumably names the encoding used to decode `input` into
// m_decoded_input -- confirm in the out-of-line constructor.
105 | | explicit HTMLTokenizer(); |
106 | | explicit HTMLTokenizer(StringView input, ByteString const& encoding); |
107 | | |
// One enumerator per entry of ENUMERATE_TOKENIZER_STATES, in list order, with implicit values.
108 | | enum class State { |
109 | | #define __ENUMERATE_TOKENIZER_STATE(state) state, |
110 | | ENUMERATE_TOKENIZER_STATES |
111 | | #undef __ENUMERATE_TOKENIZER_STATE |
112 | | }; |
113 | | |
// Argument type for next_token(); callers that pass nothing get No.
// NOTE(review): Yes presumably makes next_token() stop once is_insertion_point_reached()
// holds -- confirm in the implementation.
114 | | enum class StopAtInsertionPoint { |
115 | | No, |
116 | | Yes, |
117 | | }; |
// Returns the next token, or an empty Optional; the conditions for "empty" are decided by
// the out-of-line definition.
118 | | Optional<HTMLToken> next_token(StopAtInsertionPoint = StopAtInsertionPoint::No); |
119 | | |
// Remembers the parser this tokenizer belongs to. Callers must supply a Badge<HTMLParser>.
120 | 0 | void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; } |
121 | | |
// Two ways to change state: the Badge<HTMLParser> overload is defined out of line, while
// the unbadged overload below only assigns m_state and does nothing else.
122 | | void switch_to(Badge<HTMLParser>, State new_state); |
123 | | void switch_to(State new_state) |
124 | 0 | { |
125 | 0 | m_state = new_state; |
126 | 0 | } |
127 | | |
// Plain setter/getter for the m_blocked flag.
128 | 0 | void set_blocked(bool b) { m_blocked = b; } |
129 | 0 | bool is_blocked() const { return m_blocked; } |
130 | | |
// Returns m_decoded_input by value.
131 | 0 | ByteString source() const { return m_decoded_input; } |
132 | | |
133 | | void insert_input_at_insertion_point(StringView input); |
134 | | void insert_eof(); |
135 | | bool is_eof_inserted(); |
136 | | |
// --- Insertion point bookkeeping -------------------------------------------------------
// The insertion point is a (position, defined) pair; see struct InsertionPoint below.
137 | 0 | bool is_insertion_point_defined() const { return m_insertion_point.defined; } |
// True only when an insertion point is defined AND the current iterator's offset within
// m_utf8_view is at or past the recorded position.
138 | | bool is_insertion_point_reached() |
139 | 0 | { |
140 | 0 | return m_insertion_point.defined && m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position; |
141 | 0 | } |
// Clears only the `defined` flag; the stored position is left untouched.
142 | 0 | void undefine_insertion_point() { m_insertion_point.defined = false; } |
// store/restore use a single backup slot (m_old_insertion_point): a second store
// overwrites the first, so these calls do not nest.
143 | 0 | void store_insertion_point() { m_old_insertion_point = m_insertion_point; } |
144 | 0 | void restore_insertion_point() { m_insertion_point = m_old_insertion_point; } |
// Defines the insertion point at the current iterator offset.
145 | | void update_insertion_point() |
146 | 0 | { |
147 | 0 | m_insertion_point.defined = true; |
148 | 0 | m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator); |
149 | 0 | } |
150 | | |
151 | | // This permanently cuts off the tokenizer input stream. |
// Sets m_aborted; nothing in this header ever resets it.
152 | 0 | void abort() { m_aborted = true; } |
153 | | |
154 | | private: |
// Input-cursor and token-building helpers, all defined out of line.
155 | | void skip(size_t count); |
156 | | Optional<u32> next_code_point(); |
157 | | Optional<u32> peek_code_point(size_t offset) const; |
158 | | bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive); |
159 | | void create_new_token(HTMLToken::Type); |
160 | | bool current_end_tag_token_is_appropriate() const; |
161 | | String consume_current_builder(); |
162 | | |
// Maps a State to its enumerator name as a string literal. The cases are generated from
// ENUMERATE_TOKENIZER_STATES via stringification (#state), so the list and the names cannot
// drift apart. A value matching no enumerator falls out of the switch and hits
// VERIFY_NOT_REACHED().
163 | | static char const* state_name(State state) |
164 | 0 | { |
165 | 0 | switch (state) { |
166 | 0 | #define __ENUMERATE_TOKENIZER_STATE(state) \ |
167 | 0 | case State::state: \ |
168 | 0 | return #state; |
169 | 0 | ENUMERATE_TOKENIZER_STATES |
170 | 0 | #undef __ENUMERATE_TOKENIZER_STATE |
171 | 0 | }; |
172 | 0 | VERIFY_NOT_REACHED(); |
173 | 0 | } |
174 | | |
// NOTE(review): the will_* functions look like notification/debug hooks invoked before
// the corresponding action -- confirm in the implementation.
175 | | void will_emit(HTMLToken&); |
176 | | void will_switch_to(State); |
177 | | void will_reconsume_in(State); |
178 | | |
179 | | bool consumed_as_part_of_an_attribute() const; |
180 | | |
181 | | void restore_to(Utf8CodePointIterator const& new_iterator); |
182 | | HTMLToken::Position nth_last_position(size_t n = 0); |
183 | | |
// Assigned by set_parser().
184 | | JS::GCPtr<HTMLParser> m_parser; |
185 | | |
// Both states start out as State::Data.
186 | | State m_state { State::Data }; |
187 | | State m_return_state { State::Data }; |
188 | | |
// Scratch buffer of code points (u32).
189 | | Vector<u32> m_temporary_buffer; |
190 | | |
// The text returned by source().
191 | | ByteString m_decoded_input; |
192 | | |
// `position` is compared against m_utf8_view.iterator_offset(...); it is only meaningful
// while `defined` is true. Default: undefined at position 0.
193 | | struct InsertionPoint { |
194 | | size_t position { 0 }; |
195 | | bool defined { false }; |
196 | | }; |
197 | | InsertionPoint m_insertion_point {}; |
// Backup slot written by store_insertion_point() and read by restore_insertion_point().
198 | | InsertionPoint m_old_insertion_point {}; |
199 | | |
// View plus current/previous iterators used to walk the input.
// NOTE(review): presumably the view is over m_decoded_input -- confirm in the constructor.
200 | | Utf8View m_utf8_view; |
201 | | Utf8CodePointIterator m_utf8_iterator; |
202 | | Utf8CodePointIterator m_prev_utf8_iterator; |
203 | | |
204 | | HTMLToken m_current_token; |
205 | | StringBuilder m_current_builder; |
206 | | |
207 | | Optional<ByteString> m_last_emitted_start_tag_name; |
208 | | |
209 | | bool m_explicit_eof_inserted { false }; |
210 | | bool m_has_emitted_eof { false }; |
211 | | |
212 | | Queue<HTMLToken> m_queued_tokens; |
213 | | |
214 | | u32 m_character_reference_code { 0 }; |
215 | | |
// Backing flag for set_blocked()/is_blocked().
216 | | bool m_blocked { false }; |
217 | | |
// Set (and never cleared in this header) by abort().
218 | | bool m_aborted { false }; |
219 | | |
220 | | Vector<HTMLToken::Position> m_source_positions; |
221 | | }; |
222 | | |
223 | | } |