Coverage Report

Created: 2025-11-16 07:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
3
 * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
4
 *
5
 * SPDX-License-Identifier: BSD-2-Clause
6
 */
7
8
#pragma once
9
10
#include <AK/Queue.h>
11
#include <AK/StringBuilder.h>
12
#include <AK/StringView.h>
13
#include <AK/Types.h>
14
#include <AK/Utf8View.h>
15
#include <LibJS/Heap/GCPtr.h>
16
#include <LibWeb/Forward.h>
17
#include <LibWeb/HTML/Parser/HTMLToken.h>
18
19
namespace Web::HTML {
20
21
// X-macro list of every tokenizer state name.
// Usage: define __ENUMERATE_TOKENIZER_STATE(state) to the text wanted per entry,
// expand ENUMERATE_TOKENIZER_STATES, then #undef __ENUMERATE_TOKENIZER_STATE.
// HTMLTokenizer expands this list twice: once to declare the enumerators of
// HTMLTokenizer::State and once to build the switch in HTMLTokenizer::state_name().
// The order of the entries is the order of the generated enumerators, so
// reordering entries changes the underlying values of State.
#define ENUMERATE_TOKENIZER_STATES                                        \
22
    __ENUMERATE_TOKENIZER_STATE(Data)                                     \
23
    __ENUMERATE_TOKENIZER_STATE(RCDATA)                                   \
24
    __ENUMERATE_TOKENIZER_STATE(RAWTEXT)                                  \
25
    __ENUMERATE_TOKENIZER_STATE(ScriptData)                               \
26
    __ENUMERATE_TOKENIZER_STATE(PLAINTEXT)                                \
27
    __ENUMERATE_TOKENIZER_STATE(TagOpen)                                  \
28
    __ENUMERATE_TOKENIZER_STATE(EndTagOpen)                               \
29
    __ENUMERATE_TOKENIZER_STATE(TagName)                                  \
30
    __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign)                       \
31
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen)                         \
32
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName)                         \
33
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign)                      \
34
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen)                        \
35
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName)                        \
36
    __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign)                   \
37
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen)                     \
38
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName)                     \
39
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart)                    \
40
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash)                \
41
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped)                        \
42
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash)                    \
43
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash)                \
44
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign)            \
45
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen)              \
46
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName)              \
47
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart)              \
48
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped)                  \
49
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash)              \
50
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash)          \
51
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign)      \
52
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd)                \
53
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName)                      \
54
    __ENUMERATE_TOKENIZER_STATE(AttributeName)                            \
55
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeName)                       \
56
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue)                     \
57
    __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted)               \
58
    __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted)               \
59
    __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted)                   \
60
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted)                \
61
    __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag)                      \
62
    __ENUMERATE_TOKENIZER_STATE(BogusComment)                             \
63
    __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen)                    \
64
    __ENUMERATE_TOKENIZER_STATE(CommentStart)                             \
65
    __ENUMERATE_TOKENIZER_STATE(CommentStartDash)                         \
66
    __ENUMERATE_TOKENIZER_STATE(Comment)                                  \
67
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign)                      \
68
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang)                  \
69
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash)              \
70
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash)          \
71
    __ENUMERATE_TOKENIZER_STATE(CommentEndDash)                           \
72
    __ENUMERATE_TOKENIZER_STATE(CommentEnd)                               \
73
    __ENUMERATE_TOKENIZER_STATE(CommentEndBang)                           \
74
    __ENUMERATE_TOKENIZER_STATE(DOCTYPE)                                  \
75
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName)                        \
76
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEName)                              \
77
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName)                         \
78
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword)                \
79
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier)            \
80
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted)      \
81
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted)      \
82
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier)             \
83
    __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
84
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword)                \
85
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier)            \
86
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted)      \
87
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted)      \
88
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier)             \
89
    __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE)                             \
90
    __ENUMERATE_TOKENIZER_STATE(CDATASection)                             \
91
    __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket)                      \
92
    __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd)                          \
93
    __ENUMERATE_TOKENIZER_STATE(CharacterReference)                       \
94
    __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference)                  \
95
    __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand)                       \
96
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference)                \
97
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart)       \
98
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart)           \
99
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference)            \
100
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference)                \
101
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
102
103
class HTMLTokenizer {
104
public:
105
    explicit HTMLTokenizer();
106
    explicit HTMLTokenizer(StringView input, ByteString const& encoding);
107
108
    enum class State {
109
#define __ENUMERATE_TOKENIZER_STATE(state) state,
110
        ENUMERATE_TOKENIZER_STATES
111
#undef __ENUMERATE_TOKENIZER_STATE
112
    };
113
114
    enum class StopAtInsertionPoint {
115
        No,
116
        Yes,
117
    };
118
    Optional<HTMLToken> next_token(StopAtInsertionPoint = StopAtInsertionPoint::No);
119
120
0
    void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; }
121
122
    void switch_to(Badge<HTMLParser>, State new_state);
123
    void switch_to(State new_state)
124
0
    {
125
0
        m_state = new_state;
126
0
    }
127
128
0
    void set_blocked(bool b) { m_blocked = b; }
129
0
    bool is_blocked() const { return m_blocked; }
130
131
0
    ByteString source() const { return m_decoded_input; }
132
133
    void insert_input_at_insertion_point(StringView input);
134
    void insert_eof();
135
    bool is_eof_inserted();
136
137
0
    bool is_insertion_point_defined() const { return m_insertion_point.defined; }
138
    bool is_insertion_point_reached()
139
0
    {
140
0
        return m_insertion_point.defined && m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position;
141
0
    }
142
0
    void undefine_insertion_point() { m_insertion_point.defined = false; }
143
0
    void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
144
0
    void restore_insertion_point() { m_insertion_point = m_old_insertion_point; }
145
    void update_insertion_point()
146
0
    {
147
0
        m_insertion_point.defined = true;
148
0
        m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
149
0
    }
150
151
    // This permanently cuts off the tokenizer input stream.
152
0
    void abort() { m_aborted = true; }
153
154
private:
155
    void skip(size_t count);
156
    Optional<u32> next_code_point();
157
    Optional<u32> peek_code_point(size_t offset) const;
158
    bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
159
    void create_new_token(HTMLToken::Type);
160
    bool current_end_tag_token_is_appropriate() const;
161
    String consume_current_builder();
162
163
    static char const* state_name(State state)
164
0
    {
165
0
        switch (state) {
166
0
#define __ENUMERATE_TOKENIZER_STATE(state) \
167
0
    case State::state:                     \
168
0
        return #state;
169
0
            ENUMERATE_TOKENIZER_STATES
170
0
#undef __ENUMERATE_TOKENIZER_STATE
171
0
        };
172
0
        VERIFY_NOT_REACHED();
173
0
    }
174
175
    void will_emit(HTMLToken&);
176
    void will_switch_to(State);
177
    void will_reconsume_in(State);
178
179
    bool consumed_as_part_of_an_attribute() const;
180
181
    void restore_to(Utf8CodePointIterator const& new_iterator);
182
    HTMLToken::Position nth_last_position(size_t n = 0);
183
184
    JS::GCPtr<HTMLParser> m_parser;
185
186
    State m_state { State::Data };
187
    State m_return_state { State::Data };
188
189
    Vector<u32> m_temporary_buffer;
190
191
    ByteString m_decoded_input;
192
193
    struct InsertionPoint {
194
        size_t position { 0 };
195
        bool defined { false };
196
    };
197
    InsertionPoint m_insertion_point {};
198
    InsertionPoint m_old_insertion_point {};
199
200
    Utf8View m_utf8_view;
201
    Utf8CodePointIterator m_utf8_iterator;
202
    Utf8CodePointIterator m_prev_utf8_iterator;
203
204
    HTMLToken m_current_token;
205
    StringBuilder m_current_builder;
206
207
    Optional<ByteString> m_last_emitted_start_tag_name;
208
209
    bool m_explicit_eof_inserted { false };
210
    bool m_has_emitted_eof { false };
211
212
    Queue<HTMLToken> m_queued_tokens;
213
214
    u32 m_character_reference_code { 0 };
215
216
    bool m_blocked { false };
217
218
    bool m_aborted { false };
219
220
    Vector<HTMLToken::Position> m_source_positions;
221
};
222
223
}