/src/serenity/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> |
3 | | * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org> |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #pragma once |
9 | | |
10 | | #include <AK/FlyString.h> |
11 | | #include <AK/Function.h> |
12 | | #include <AK/OwnPtr.h> |
13 | | #include <AK/Types.h> |
14 | | #include <AK/Variant.h> |
15 | | #include <AK/Vector.h> |
16 | | |
17 | | namespace Web::HTML { |
18 | | |
19 | | class HTMLTokenizer; |
20 | | |
21 | | class HTMLToken { |
22 | | AK_MAKE_NONCOPYABLE(HTMLToken); |
23 | | AK_MAKE_DEFAULT_MOVABLE(HTMLToken); |
24 | | |
25 | | public: |
26 | | enum class Type : u8 { |
27 | | Invalid, |
28 | | DOCTYPE, |
29 | | StartTag, |
30 | | EndTag, |
31 | | Comment, |
32 | | Character, |
33 | | EndOfFile, |
34 | | }; |
35 | | |
36 | | struct Position { |
37 | | size_t line { 0 }; |
38 | | size_t column { 0 }; |
39 | | size_t byte_offset { 0 }; |
40 | | }; |
41 | | |
42 | | struct Attribute { |
43 | | Optional<FlyString> prefix; |
44 | | FlyString local_name; |
45 | | Optional<FlyString> namespace_; |
46 | | String value; |
47 | | Position name_start_position; |
48 | | Position value_start_position; |
49 | | Position name_end_position; |
50 | | Position value_end_position; |
51 | | }; |
52 | | |
53 | | struct DoctypeData { |
54 | | // NOTE: "Missing" is a distinct state from the empty string. |
55 | | String name; |
56 | | String public_identifier; |
57 | | String system_identifier; |
58 | | bool missing_name { true }; |
59 | | bool missing_public_identifier { true }; |
60 | | bool missing_system_identifier { true }; |
61 | | bool force_quirks { false }; |
62 | | }; |
63 | | |
64 | | static HTMLToken make_character(u32 code_point) |
65 | 0 | { |
66 | 0 | HTMLToken token { Type::Character }; |
67 | 0 | token.set_code_point(code_point); |
68 | 0 | return token; |
69 | 0 | } |
70 | | |
71 | | static HTMLToken make_start_tag(FlyString const& tag_name) |
72 | 0 | { |
73 | 0 | HTMLToken token { Type::StartTag }; |
74 | 0 | token.set_tag_name(tag_name); |
75 | 0 | return token; |
76 | 0 | } |
77 | | |
78 | 0 | HTMLToken() = default; |
79 | | |
80 | | HTMLToken(Type type) |
81 | 0 | : m_type(type) |
82 | 0 | { |
83 | 0 | switch (m_type) { |
84 | 0 | case Type::Character: |
85 | 0 | m_data.set(0u); |
86 | 0 | break; |
87 | 0 | case Type::DOCTYPE: |
88 | 0 | m_data.set(OwnPtr<DoctypeData> {}); |
89 | 0 | break; |
90 | 0 | case Type::StartTag: |
91 | 0 | case Type::EndTag: |
92 | 0 | m_data.set(OwnPtr<Vector<Attribute>>()); |
93 | 0 | break; |
94 | 0 | default: |
95 | 0 | break; |
96 | 0 | } |
97 | 0 | } |
98 | | |
99 | 0 | bool is_doctype() const { return m_type == Type::DOCTYPE; } |
100 | 0 | bool is_start_tag() const { return m_type == Type::StartTag; } |
101 | 0 | bool is_end_tag() const { return m_type == Type::EndTag; } |
102 | 0 | bool is_comment() const { return m_type == Type::Comment; } |
103 | 0 | bool is_character() const { return m_type == Type::Character; } |
104 | 0 | bool is_end_of_file() const { return m_type == Type::EndOfFile; } |
105 | | |
106 | | u32 code_point() const |
107 | 0 | { |
108 | 0 | VERIFY(is_character()); |
109 | 0 | return m_data.get<u32>(); |
110 | 0 | } |
111 | | |
112 | | bool is_parser_whitespace() const |
113 | 0 | { |
114 | | // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not. |
115 | 0 | if (!is_character()) |
116 | 0 | return false; |
117 | 0 | switch (code_point()) { |
118 | 0 | case '\t': |
119 | 0 | case '\n': |
120 | 0 | case '\f': |
121 | 0 | case '\r': |
122 | 0 | case ' ': |
123 | 0 | return true; |
124 | 0 | default: |
125 | 0 | return false; |
126 | 0 | } |
127 | 0 | } |
128 | | |
129 | | void set_code_point(u32 code_point) |
130 | 0 | { |
131 | 0 | VERIFY(is_character()); |
132 | 0 | m_data.get<u32>() = code_point; |
133 | 0 | } |
134 | | |
135 | | String const& comment() const |
136 | 0 | { |
137 | 0 | VERIFY(is_comment()); |
138 | 0 | return m_comment_data; |
139 | 0 | } |
140 | | |
141 | | void set_comment(String comment) |
142 | 0 | { |
143 | 0 | VERIFY(is_comment()); |
144 | 0 | m_comment_data = move(comment); |
145 | 0 | } |
146 | | |
147 | | FlyString const& tag_name() const |
148 | 0 | { |
149 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
150 | 0 | return m_string_data; |
151 | 0 | } |
152 | | |
153 | | void set_tag_name(FlyString name) |
154 | 0 | { |
155 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
156 | 0 | m_string_data = move(name); |
157 | 0 | } |
158 | | |
159 | | bool is_self_closing() const |
160 | 0 | { |
161 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
162 | 0 | return m_tag_self_closing; |
163 | 0 | } |
164 | | |
165 | | void set_self_closing(bool self_closing) |
166 | 0 | { |
167 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
168 | 0 | m_tag_self_closing = self_closing; |
169 | 0 | } |
170 | | |
171 | | bool has_acknowledged_self_closing_flag() const |
172 | 0 | { |
173 | 0 | VERIFY(is_self_closing()); |
174 | 0 | return m_tag_self_closing_acknowledged; |
175 | 0 | } |
176 | | |
177 | | void acknowledge_self_closing_flag_if_set() |
178 | 0 | { |
179 | 0 | if (is_self_closing()) |
180 | 0 | m_tag_self_closing_acknowledged = true; |
181 | 0 | } |
182 | | |
183 | | bool has_attributes() const |
184 | 0 | { |
185 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
186 | 0 | auto* ptr = tag_attributes(); |
187 | 0 | return ptr && !ptr->is_empty(); |
188 | 0 | } |
189 | | |
190 | | size_t attribute_count() const |
191 | 0 | { |
192 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
193 | 0 | if (auto* ptr = tag_attributes()) |
194 | 0 | return ptr->size(); |
195 | 0 | return 0; |
196 | 0 | } |
197 | | |
198 | | void add_attribute(Attribute attribute) |
199 | 0 | { |
200 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
201 | 0 | ensure_tag_attributes().append(move(attribute)); |
202 | 0 | } |
203 | | |
204 | | Attribute const& last_attribute() const |
205 | 0 | { |
206 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
207 | 0 | VERIFY(has_attributes()); |
208 | 0 | return tag_attributes()->last(); |
209 | 0 | } |
210 | | |
211 | | Attribute& last_attribute() |
212 | 0 | { |
213 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
214 | 0 | VERIFY(has_attributes()); |
215 | 0 | return tag_attributes()->last(); |
216 | 0 | } |
217 | | |
218 | | void drop_attributes() |
219 | 0 | { |
220 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
221 | 0 | m_data.get<OwnPtr<Vector<Attribute>>>().clear(); |
222 | 0 | } |
223 | | |
224 | | void for_each_attribute(Function<IterationDecision(Attribute const&)> callback) const |
225 | 0 | { |
226 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
227 | 0 | auto* ptr = tag_attributes(); |
228 | 0 | if (!ptr) |
229 | 0 | return; |
230 | 0 | for (auto& attribute : *ptr) { |
231 | 0 | if (callback(attribute) == IterationDecision::Break) |
232 | 0 | break; |
233 | 0 | } |
234 | 0 | } |
235 | | |
236 | | void for_each_attribute(Function<IterationDecision(Attribute&)> callback) |
237 | 0 | { |
238 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
239 | 0 | auto* ptr = tag_attributes(); |
240 | 0 | if (!ptr) |
241 | 0 | return; |
242 | 0 | for (auto& attribute : *ptr) { |
243 | 0 | if (callback(attribute) == IterationDecision::Break) |
244 | 0 | break; |
245 | 0 | } |
246 | 0 | } |
247 | | |
248 | | Optional<String> attribute(FlyString const& attribute_name) const |
249 | 0 | { |
250 | 0 | if (auto result = raw_attribute(attribute_name); result.has_value()) |
251 | 0 | return result->value; |
252 | 0 | return {}; |
253 | 0 | } |
254 | | |
255 | | Optional<Attribute const&> raw_attribute(FlyString const& attribute_name) const |
256 | 0 | { |
257 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
258 | | |
259 | 0 | auto* ptr = tag_attributes(); |
260 | 0 | if (!ptr) |
261 | 0 | return {}; |
262 | 0 | for (auto const& attribute : *ptr) { |
263 | 0 | if (attribute_name == attribute.local_name) |
264 | 0 | return attribute; |
265 | 0 | } |
266 | 0 | return {}; |
267 | 0 | } |
268 | | |
269 | | bool has_attribute(FlyString const& attribute_name) const |
270 | 0 | { |
271 | 0 | return attribute(attribute_name).has_value(); |
272 | 0 | } |
273 | | |
274 | | void adjust_tag_name(FlyString const& old_name, FlyString const& new_name) |
275 | 0 | { |
276 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
277 | 0 | if (old_name == tag_name()) |
278 | 0 | set_tag_name(new_name); |
279 | 0 | } |
280 | | |
281 | | void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name) |
282 | 0 | { |
283 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
284 | 0 | for_each_attribute([&](Attribute& attribute) { |
285 | 0 | if (old_name == attribute.local_name) |
286 | 0 | attribute.local_name = new_name; |
287 | 0 | return IterationDecision::Continue; |
288 | 0 | }); |
289 | 0 | } |
290 | | |
291 | | void adjust_foreign_attribute(FlyString const& old_name, Optional<FlyString> const& prefix, FlyString const& local_name, Optional<FlyString> const& namespace_) |
292 | 0 | { |
293 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
294 | 0 | for_each_attribute([&](Attribute& attribute) { |
295 | 0 | if (old_name == attribute.local_name) { |
296 | 0 | attribute.prefix = prefix; |
297 | 0 | attribute.local_name = local_name; |
298 | 0 | attribute.namespace_ = namespace_; |
299 | 0 | } |
300 | 0 | return IterationDecision::Continue; |
301 | 0 | }); |
302 | 0 | } |
303 | | |
304 | | DoctypeData const& doctype_data() const |
305 | 0 | { |
306 | 0 | VERIFY(is_doctype()); |
307 | 0 | auto* ptr = m_data.get<OwnPtr<DoctypeData>>().ptr(); |
308 | 0 | VERIFY(ptr); |
309 | 0 | return *ptr; |
310 | 0 | } |
311 | | |
312 | | DoctypeData& ensure_doctype_data() |
313 | 0 | { |
314 | 0 | VERIFY(is_doctype()); |
315 | 0 | auto& ptr = m_data.get<OwnPtr<DoctypeData>>(); |
316 | 0 | if (!ptr) |
317 | 0 | ptr = make<DoctypeData>(); |
318 | 0 | return *ptr; |
319 | 0 | } |
320 | | |
321 | 0 | Type type() const { return m_type; } |
322 | | |
323 | | String to_string() const; |
324 | | |
325 | 0 | Position const& start_position() const { return m_start_position; } |
326 | 0 | Position const& end_position() const { return m_end_position; } |
327 | | |
328 | 0 | void set_start_position(Badge<HTMLTokenizer>, Position start_position) { m_start_position = start_position; } |
329 | 0 | void set_end_position(Badge<HTMLTokenizer>, Position end_position) { m_end_position = end_position; } |
330 | | |
331 | | void normalize_attributes(); |
332 | | |
333 | | private: |
334 | | Vector<Attribute> const* tag_attributes() const |
335 | 0 | { |
336 | 0 | return m_data.get<OwnPtr<Vector<Attribute>>>().ptr(); |
337 | 0 | } |
338 | | |
339 | | Vector<Attribute>* tag_attributes() |
340 | 0 | { |
341 | 0 | return m_data.get<OwnPtr<Vector<Attribute>>>().ptr(); |
342 | 0 | } |
343 | | |
344 | | Vector<Attribute>& ensure_tag_attributes() |
345 | 0 | { |
346 | 0 | VERIFY(is_start_tag() || is_end_tag()); |
347 | 0 | auto& ptr = m_data.get<OwnPtr<Vector<Attribute>>>(); |
348 | 0 | if (!ptr) |
349 | 0 | ptr = make<Vector<Attribute>>(); |
350 | 0 | return *ptr; |
351 | 0 | } |
352 | | |
353 | | Type m_type { Type::Invalid }; |
354 | | |
355 | | // Type::StartTag and Type::EndTag |
356 | | bool m_tag_self_closing { false }; |
357 | | bool m_tag_self_closing_acknowledged { false }; |
358 | | |
359 | | // Type::StartTag and Type::EndTag (tag name) |
360 | | FlyString m_string_data; |
361 | | |
362 | | // Type::Comment (comment data) |
363 | | String m_comment_data; |
364 | | |
365 | | Variant<Empty, u32, OwnPtr<DoctypeData>, OwnPtr<Vector<Attribute>>> m_data {}; |
366 | | |
367 | | Position m_start_position; |
368 | | Position m_end_position; |
369 | | }; |
370 | | |
371 | | } |