Coverage Report

Created: 2025-08-28 06:26

/src/serenity/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
3
 * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
4
 *
5
 * SPDX-License-Identifier: BSD-2-Clause
6
 */
7
8
#pragma once
9
10
#include <AK/FlyString.h>
11
#include <AK/Function.h>
12
#include <AK/OwnPtr.h>
13
#include <AK/Types.h>
14
#include <AK/Variant.h>
15
#include <AK/Vector.h>
16
17
namespace Web::HTML {
18
19
class HTMLTokenizer;
20
21
class HTMLToken {
22
    AK_MAKE_NONCOPYABLE(HTMLToken);
23
    AK_MAKE_DEFAULT_MOVABLE(HTMLToken);
24
25
public:
26
    enum class Type : u8 {
27
        Invalid,
28
        DOCTYPE,
29
        StartTag,
30
        EndTag,
31
        Comment,
32
        Character,
33
        EndOfFile,
34
    };
35
36
    struct Position {
37
        size_t line { 0 };
38
        size_t column { 0 };
39
        size_t byte_offset { 0 };
40
    };
41
42
    struct Attribute {
43
        Optional<FlyString> prefix;
44
        FlyString local_name;
45
        Optional<FlyString> namespace_;
46
        String value;
47
        Position name_start_position;
48
        Position value_start_position;
49
        Position name_end_position;
50
        Position value_end_position;
51
    };
52
53
    struct DoctypeData {
54
        // NOTE: "Missing" is a distinct state from the empty string.
55
        String name;
56
        String public_identifier;
57
        String system_identifier;
58
        bool missing_name { true };
59
        bool missing_public_identifier { true };
60
        bool missing_system_identifier { true };
61
        bool force_quirks { false };
62
    };
63
64
    static HTMLToken make_character(u32 code_point)
65
0
    {
66
0
        HTMLToken token { Type::Character };
67
0
        token.set_code_point(code_point);
68
0
        return token;
69
0
    }
70
71
    static HTMLToken make_start_tag(FlyString const& tag_name)
72
0
    {
73
0
        HTMLToken token { Type::StartTag };
74
0
        token.set_tag_name(tag_name);
75
0
        return token;
76
0
    }
77
78
0
    HTMLToken() = default;
79
80
    HTMLToken(Type type)
81
0
        : m_type(type)
82
0
    {
83
0
        switch (m_type) {
84
0
        case Type::Character:
85
0
            m_data.set(0u);
86
0
            break;
87
0
        case Type::DOCTYPE:
88
0
            m_data.set(OwnPtr<DoctypeData> {});
89
0
            break;
90
0
        case Type::StartTag:
91
0
        case Type::EndTag:
92
0
            m_data.set(OwnPtr<Vector<Attribute>>());
93
0
            break;
94
0
        default:
95
0
            break;
96
0
        }
97
0
    }
98
99
0
    bool is_doctype() const { return m_type == Type::DOCTYPE; }
100
0
    bool is_start_tag() const { return m_type == Type::StartTag; }
101
0
    bool is_end_tag() const { return m_type == Type::EndTag; }
102
0
    bool is_comment() const { return m_type == Type::Comment; }
103
0
    bool is_character() const { return m_type == Type::Character; }
104
0
    bool is_end_of_file() const { return m_type == Type::EndOfFile; }
105
106
    u32 code_point() const
107
0
    {
108
0
        VERIFY(is_character());
109
0
        return m_data.get<u32>();
110
0
    }
111
112
    bool is_parser_whitespace() const
113
0
    {
114
        // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
115
0
        if (!is_character())
116
0
            return false;
117
0
        switch (code_point()) {
118
0
        case '\t':
119
0
        case '\n':
120
0
        case '\f':
121
0
        case '\r':
122
0
        case ' ':
123
0
            return true;
124
0
        default:
125
0
            return false;
126
0
        }
127
0
    }
128
129
    void set_code_point(u32 code_point)
130
0
    {
131
0
        VERIFY(is_character());
132
0
        m_data.get<u32>() = code_point;
133
0
    }
134
135
    String const& comment() const
136
0
    {
137
0
        VERIFY(is_comment());
138
0
        return m_comment_data;
139
0
    }
140
141
    void set_comment(String comment)
142
0
    {
143
0
        VERIFY(is_comment());
144
0
        m_comment_data = move(comment);
145
0
    }
146
147
    FlyString const& tag_name() const
148
0
    {
149
0
        VERIFY(is_start_tag() || is_end_tag());
150
0
        return m_string_data;
151
0
    }
152
153
    void set_tag_name(FlyString name)
154
0
    {
155
0
        VERIFY(is_start_tag() || is_end_tag());
156
0
        m_string_data = move(name);
157
0
    }
158
159
    bool is_self_closing() const
160
0
    {
161
0
        VERIFY(is_start_tag() || is_end_tag());
162
0
        return m_tag_self_closing;
163
0
    }
164
165
    void set_self_closing(bool self_closing)
166
0
    {
167
0
        VERIFY(is_start_tag() || is_end_tag());
168
0
        m_tag_self_closing = self_closing;
169
0
    }
170
171
    bool has_acknowledged_self_closing_flag() const
172
0
    {
173
0
        VERIFY(is_self_closing());
174
0
        return m_tag_self_closing_acknowledged;
175
0
    }
176
177
    void acknowledge_self_closing_flag_if_set()
178
0
    {
179
0
        if (is_self_closing())
180
0
            m_tag_self_closing_acknowledged = true;
181
0
    }
182
183
    bool has_attributes() const
184
0
    {
185
0
        VERIFY(is_start_tag() || is_end_tag());
186
0
        auto* ptr = tag_attributes();
187
0
        return ptr && !ptr->is_empty();
188
0
    }
189
190
    size_t attribute_count() const
191
0
    {
192
0
        VERIFY(is_start_tag() || is_end_tag());
193
0
        if (auto* ptr = tag_attributes())
194
0
            return ptr->size();
195
0
        return 0;
196
0
    }
197
198
    void add_attribute(Attribute attribute)
199
0
    {
200
0
        VERIFY(is_start_tag() || is_end_tag());
201
0
        ensure_tag_attributes().append(move(attribute));
202
0
    }
203
204
    Attribute const& last_attribute() const
205
0
    {
206
0
        VERIFY(is_start_tag() || is_end_tag());
207
0
        VERIFY(has_attributes());
208
0
        return tag_attributes()->last();
209
0
    }
210
211
    Attribute& last_attribute()
212
0
    {
213
0
        VERIFY(is_start_tag() || is_end_tag());
214
0
        VERIFY(has_attributes());
215
0
        return tag_attributes()->last();
216
0
    }
217
218
    void drop_attributes()
219
0
    {
220
0
        VERIFY(is_start_tag() || is_end_tag());
221
0
        m_data.get<OwnPtr<Vector<Attribute>>>().clear();
222
0
    }
223
224
    void for_each_attribute(Function<IterationDecision(Attribute const&)> callback) const
225
0
    {
226
0
        VERIFY(is_start_tag() || is_end_tag());
227
0
        auto* ptr = tag_attributes();
228
0
        if (!ptr)
229
0
            return;
230
0
        for (auto& attribute : *ptr) {
231
0
            if (callback(attribute) == IterationDecision::Break)
232
0
                break;
233
0
        }
234
0
    }
235
236
    void for_each_attribute(Function<IterationDecision(Attribute&)> callback)
237
0
    {
238
0
        VERIFY(is_start_tag() || is_end_tag());
239
0
        auto* ptr = tag_attributes();
240
0
        if (!ptr)
241
0
            return;
242
0
        for (auto& attribute : *ptr) {
243
0
            if (callback(attribute) == IterationDecision::Break)
244
0
                break;
245
0
        }
246
0
    }
247
248
    Optional<String> attribute(FlyString const& attribute_name) const
249
0
    {
250
0
        if (auto result = raw_attribute(attribute_name); result.has_value())
251
0
            return result->value;
252
0
        return {};
253
0
    }
254
255
    Optional<Attribute const&> raw_attribute(FlyString const& attribute_name) const
256
0
    {
257
0
        VERIFY(is_start_tag() || is_end_tag());
258
259
0
        auto* ptr = tag_attributes();
260
0
        if (!ptr)
261
0
            return {};
262
0
        for (auto const& attribute : *ptr) {
263
0
            if (attribute_name == attribute.local_name)
264
0
                return attribute;
265
0
        }
266
0
        return {};
267
0
    }
268
269
    bool has_attribute(FlyString const& attribute_name) const
270
0
    {
271
0
        return attribute(attribute_name).has_value();
272
0
    }
273
274
    void adjust_tag_name(FlyString const& old_name, FlyString const& new_name)
275
0
    {
276
0
        VERIFY(is_start_tag() || is_end_tag());
277
0
        if (old_name == tag_name())
278
0
            set_tag_name(new_name);
279
0
    }
280
281
    void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
282
0
    {
283
0
        VERIFY(is_start_tag() || is_end_tag());
284
0
        for_each_attribute([&](Attribute& attribute) {
285
0
            if (old_name == attribute.local_name)
286
0
                attribute.local_name = new_name;
287
0
            return IterationDecision::Continue;
288
0
        });
289
0
    }
290
291
    void adjust_foreign_attribute(FlyString const& old_name, Optional<FlyString> const& prefix, FlyString const& local_name, Optional<FlyString> const& namespace_)
292
0
    {
293
0
        VERIFY(is_start_tag() || is_end_tag());
294
0
        for_each_attribute([&](Attribute& attribute) {
295
0
            if (old_name == attribute.local_name) {
296
0
                attribute.prefix = prefix;
297
0
                attribute.local_name = local_name;
298
0
                attribute.namespace_ = namespace_;
299
0
            }
300
0
            return IterationDecision::Continue;
301
0
        });
302
0
    }
303
304
    DoctypeData const& doctype_data() const
305
0
    {
306
0
        VERIFY(is_doctype());
307
0
        auto* ptr = m_data.get<OwnPtr<DoctypeData>>().ptr();
308
0
        VERIFY(ptr);
309
0
        return *ptr;
310
0
    }
311
312
    DoctypeData& ensure_doctype_data()
313
0
    {
314
0
        VERIFY(is_doctype());
315
0
        auto& ptr = m_data.get<OwnPtr<DoctypeData>>();
316
0
        if (!ptr)
317
0
            ptr = make<DoctypeData>();
318
0
        return *ptr;
319
0
    }
320
321
0
    Type type() const { return m_type; }
322
323
    String to_string() const;
324
325
0
    Position const& start_position() const { return m_start_position; }
326
0
    Position const& end_position() const { return m_end_position; }
327
328
0
    void set_start_position(Badge<HTMLTokenizer>, Position start_position) { m_start_position = start_position; }
329
0
    void set_end_position(Badge<HTMLTokenizer>, Position end_position) { m_end_position = end_position; }
330
331
    void normalize_attributes();
332
333
private:
334
    Vector<Attribute> const* tag_attributes() const
335
0
    {
336
0
        return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
337
0
    }
338
339
    Vector<Attribute>* tag_attributes()
340
0
    {
341
0
        return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
342
0
    }
343
344
    Vector<Attribute>& ensure_tag_attributes()
345
0
    {
346
0
        VERIFY(is_start_tag() || is_end_tag());
347
0
        auto& ptr = m_data.get<OwnPtr<Vector<Attribute>>>();
348
0
        if (!ptr)
349
0
            ptr = make<Vector<Attribute>>();
350
0
        return *ptr;
351
0
    }
352
353
    Type m_type { Type::Invalid };
354
355
    // Type::StartTag and Type::EndTag
356
    bool m_tag_self_closing { false };
357
    bool m_tag_self_closing_acknowledged { false };
358
359
    // Type::StartTag and Type::EndTag (tag name)
360
    FlyString m_string_data;
361
362
    // Type::Comment (comment data)
363
    String m_comment_data;
364
365
    Variant<Empty, u32, OwnPtr<DoctypeData>, OwnPtr<Vector<Attribute>>> m_data {};
366
367
    Position m_start_position;
368
    Position m_end_position;
369
};
370
371
}