Coverage Report

Created: 2025-11-16 07:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibXML/Parser/Parser.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <LibXML/DOM/Document.h>
8
#include <LibXML/Parser/Parser.h>
9
10
struct Range {
11
    consteval Range(u32 start, u32 end)
12
        : start(start)
13
        , end(end)
14
0
    {
15
0
    }
16
17
    u32 start;
18
    u32 end;
19
};
20
21
template<auto... ranges>
22
struct ranges_for_search {
23
    auto contains(u32 value) const
24
151M
    {
25
3.11G
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
151M
    }
ranges_for_search<Range{1u, 8u}, Range{11u, 12u}, Range{14u, 31u}, Range{127u, 132u}, Range{134u, 159u}>::contains(unsigned int) const
Line
Count
Source
24
2.13M
    {
25
17.0M
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
2.13M
    }
ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}>::contains(unsigned int) const
Line
Count
Source
24
54.8M
    {
25
623M
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
54.8M
    }
ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}, Range{45u, 45u}, Range{46u, 46u}, Range{48u, 57u}, Range{183u, 183u}, Range{768u, 879u}, Range{8255u, 8256u}>::contains(unsigned int) const
Line
Count
Source
24
86.7M
    {
25
2.31G
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
86.7M
    }
ranges_for_search<Range{1u, 55295u}, Range{57344u, 65533u}, Range{65536u, 1114111u}>::contains(unsigned int) const
Line
Count
Source
24
14.6k
    {
25
30.0k
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
14.6k
    }
ranges_for_search<Range{48u, 57u}, Range{97u, 102u}, Range{65u, 70u}>::contains(unsigned int) const
Line
Count
Source
24
2.07M
    {
25
6.19M
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
2.07M
    }
ranges_for_search<Range{48u, 57u}>::contains(unsigned int) const
Line
Count
Source
24
2.25M
    {
25
2.25M
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
2.25M
    }
ranges_for_search<Range{32u, 32u}, Range{13u, 13u}, Range{10u, 10u}, Range{45u, 45u}, Range{39u, 39u}, Range{40u, 40u}, Range{41u, 41u}, Range{43u, 43u}, Range{44u, 44u}, Range{46u, 46u}, Range{47u, 47u}, Range{58u, 58u}, Range{61u, 61u}, Range{63u, 63u}, Range{59u, 59u}, Range{33u, 33u}, Range{42u, 42u}, Range{35u, 35u}, Range{64u, 64u}, Range{36u, 36u}, Range{95u, 95u}, Range{37u, 37u}, Range{97u, 122u}, Range{65u, 90u}, Range{48u, 57u}>::contains(unsigned int) const
Line
Count
Source
24
3.78M
    {
25
150M
        return ((value >= ranges.start && value <= ranges.end) || ...);
26
3.78M
    }
27
28
    bool operator()(u32 value) const
29
10.4M
    {
30
10.4M
        return contains(value);
31
10.4M
    }
ranges_for_search<Range{1u, 8u}, Range{11u, 12u}, Range{14u, 31u}, Range{127u, 132u}, Range{134u, 159u}>::operator()(unsigned int) const
Line
Count
Source
29
2.13M
    {
30
2.13M
        return contains(value);
31
2.13M
    }
ranges_for_search<Range{48u, 57u}, Range{97u, 102u}, Range{65u, 70u}>::operator()(unsigned int) const
Line
Count
Source
29
2.07M
    {
30
2.07M
        return contains(value);
31
2.07M
    }
ranges_for_search<Range{48u, 57u}>::operator()(unsigned int) const
Line
Count
Source
29
2.25M
    {
30
2.25M
        return contains(value);
31
2.25M
    }
ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}, Range{45u, 45u}, Range{46u, 46u}, Range{48u, 57u}, Range{183u, 183u}, Range{768u, 879u}, Range{8255u, 8256u}>::operator()(unsigned int) const
Line
Count
Source
29
4.03M
    {
30
4.03M
        return contains(value);
31
4.03M
    }
32
33
    template<auto... ranges_to_include>
34
    consteval auto with() const
35
    {
36
        return ranges_for_search<ranges..., ranges_to_include...>();
37
    }
38
39
    template<auto... ranges_to_include>
40
    consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
41
    {
42
        return ranges_for_search<ranges..., ranges_to_include...>();
43
    }
44
};
45
46
template<size_t Count, typename Element>
47
struct StringSet {
48
    consteval StringSet(Element const (&entries)[Count])
49
    {
50
        for (size_t i = 0; i < Count - 1; ++i)
51
            elements[i] = entries[i];
52
    }
53
54
    consteval auto operator[](size_t i) const { return elements[i]; }
55
56
    Element elements[Count - 1];
57
};
58
59
template<StringSet chars>
60
consteval static auto set_to_search()
61
{
62
    return ([&]<auto... Ix>(IndexSequence<Ix...>) {
63
        return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
64
    }(MakeIndexSequence<array_size(chars.elements)>()));
65
}
66
67
namespace XML {
68
69
size_t Parser::s_debug_indent_level { 0 };
70
71
void Parser::append_node(NonnullOwnPtr<Node> node)
72
7.08M
{
73
7.08M
    if (m_entered_node) {
74
7.08M
        auto& entered_element = m_entered_node->content.get<Node::Element>();
75
7.08M
        entered_element.children.append(move(node));
76
7.08M
        enter_node(*entered_element.children.last());
77
7.08M
    } else {
78
2.49k
        m_root_node = move(node);
79
2.49k
        enter_node(*m_root_node);
80
2.49k
    }
81
7.08M
}
82
83
void Parser::append_text(StringView text, LineTrackingLexer::Position position)
84
7.92M
{
85
7.92M
    if (m_listener) {
86
0
        m_listener->text(text);
87
0
        return;
88
0
    }
89
90
7.92M
    if (!m_entered_node) {
91
0
        Node::Text node;
92
0
        node.builder.append(text);
93
0
        m_root_node = make<Node>(position, move(node));
94
0
        return;
95
0
    }
96
97
7.92M
    m_entered_node->content.visit(
98
7.92M
        [&](Node::Element& node) {
99
7.92M
            if (!node.children.is_empty()) {
100
3.84M
                auto* text_node = node.children.last()->content.get_pointer<Node::Text>();
101
3.84M
                if (text_node) {
102
652k
                    text_node->builder.append(text);
103
652k
                    return;
104
652k
                }
105
3.84M
            }
106
7.26M
            Node::Text text_node;
107
7.26M
            text_node.builder.append(text);
108
7.26M
            node.children.append(make<Node>(position, move(text_node), m_entered_node));
109
7.26M
        },
110
7.92M
        [&](auto&) {
111
            // Can't enter a text or comment node.
112
0
            VERIFY_NOT_REACHED();
113
0
        });
Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_text(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Text>(XML::Node::Text&) const
Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_text(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Comment>(XML::Node::Comment&) const
114
7.92M
}
115
116
void Parser::append_comment(StringView text, LineTrackingLexer::Position position)
117
0
{
118
0
    if (m_listener) {
119
0
        m_listener->comment(text);
120
0
        return;
121
0
    }
122
123
    // If there's no node to attach this to, drop it on the floor.
124
    // This can happen to comments in the prolog.
125
0
    if (!m_entered_node)
126
0
        return;
127
128
0
    m_entered_node->content.visit(
129
0
        [&](Node::Element& node) {
130
0
            node.children.append(make<Node>(position, Node::Comment { text }, m_entered_node));
131
0
        },
132
0
        [&](auto&) {
133
            // Can't enter a text or comment node.
134
0
            VERIFY_NOT_REACHED();
135
0
        });
Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_comment(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Text>(XML::Node::Text&) const
Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_comment(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Comment>(XML::Node::Comment&) const
136
0
}
137
138
void Parser::enter_node(Node& node)
139
7.08M
{
140
7.08M
    if (m_listener) {
141
0
        auto& element = node.content.get<Node::Element>();
142
0
        m_listener->element_start(element.name, element.attributes);
143
0
    }
144
145
7.08M
    if (&node != m_root_node.ptr())
146
7.08M
        node.parent = m_entered_node;
147
7.08M
    m_entered_node = &node;
148
7.08M
}
149
150
void Parser::leave_node()
151
7.08M
{
152
7.08M
    if (m_listener) {
153
0
        auto& element = m_entered_node->content.get<Node::Element>();
154
0
        m_listener->element_end(element.name);
155
0
    }
156
157
7.08M
    m_entered_node = m_entered_node->parent;
158
7.08M
}
159
160
ErrorOr<Document, ParseError> Parser::parse()
161
8.69k
{
162
8.69k
    if (auto result = parse_internal(); result.is_error()) {
163
8.59k
        if (m_parse_errors.is_empty())
164
1.02k
            return result.release_error();
165
7.57k
        return m_parse_errors.take_first();
166
8.59k
    }
167
108
    return Document {
168
108
        m_root_node.release_nonnull(),
169
108
        move(m_doctype),
170
108
        move(m_processing_instructions),
171
108
        m_version,
172
108
    };
173
8.69k
}
174
175
ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
176
0
{
177
0
    m_listener = &listener;
178
0
    ScopeGuard unset_listener { [this] { m_listener = nullptr; } };
179
0
    m_listener->set_source(m_source);
180
0
    m_listener->document_start();
181
0
    auto result = parse_internal();
182
0
    if (result.is_error())
183
0
        m_listener->error(result.error());
184
0
    m_listener->document_end();
185
0
    if (m_doctype.has_value()) {
186
0
        m_listener->set_doctype(m_doctype.release_value());
187
0
    }
188
0
    m_root_node.clear();
189
0
    return result;
190
0
}
191
192
// 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
193
ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
194
113M
{
195
113M
    auto rollback = rollback_point();
196
113M
    auto rule = enter_rule();
197
198
    // S ::= (#x20 | #x9 | #xD | #xA)+
199
113M
    auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv));
200
113M
    if (required == Required::Yes && matched.is_empty())
201
8.27M
        return parse_error(m_lexer.current_position(), Expectation { "whitespace"sv });
202
203
105M
    rollback.disarm();
204
105M
    return {};
205
113M
}
206
207
// 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
208
constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>();
209
210
// 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
211
ErrorOr<void, ParseError> Parser::parse_internal()
212
8.69k
{
213
8.69k
    auto rule = enter_rule();
214
215
    // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
216
8.69k
    TRY(parse_prolog());
217
8.69k
    TRY(parse_element());
218
671
    while (true) {
219
671
        if (auto result = parse_misc(); result.is_error())
220
190
            break;
221
671
    }
222
223
190
    auto matched_source = m_source.substring_view(0, m_lexer.tell());
224
190
    if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) {
225
55
        return parse_error(
226
55
            m_lexer.position_for(it.index()),
227
55
            ByteString::formatted("Invalid character #{:x} used in document", *it));
228
55
    }
229
230
135
    if (!m_lexer.is_eof())
231
27
        return parse_error(m_lexer.current_position(), ByteString { "Garbage after document"sv });
232
233
108
    return {};
234
135
}
235
236
ErrorOr<void, ParseError> Parser::expect(StringView expected)
237
108M
{
238
108M
    auto rollback = rollback_point();
239
240
108M
    if (!m_lexer.consume_specific(expected)) {
241
36.9M
        if (m_options.treat_errors_as_fatal)
242
36.9M
            return parse_error(m_lexer.current_position(), ByteString::formatted("Expected '{}'", expected));
243
36.9M
    }
244
245
71.7M
    rollback.disarm();
246
71.7M
    return {};
247
108M
}
248
249
template<typename Pred>
250
requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
251
17.1M
{
252
17.1M
    auto rollback = rollback_point();
253
17.1M
    auto start = m_lexer.tell();
254
17.1M
    if (!m_lexer.next_is(predicate)) {
255
24.2k
        if (m_options.treat_errors_as_fatal)
256
24.2k
            return parse_error(m_lexer.current_position(), Expectation { description });
257
24.2k
    }
258
259
17.1M
    m_lexer.ignore();
260
17.1M
    rollback.disarm();
261
17.1M
    return m_source.substring_view(start, m_lexer.tell() - start);
262
17.1M
}
263
264
template<typename Pred>
265
requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description, bool allow_empty)
266
153k
{
267
153k
    auto rollback = rollback_point();
268
153k
    auto start = m_lexer.tell();
269
27.0M
    while (m_lexer.next_is(predicate)) {
270
26.8M
        if (m_lexer.is_eof())
271
54
            break;
272
26.8M
        m_lexer.ignore();
273
26.8M
    }
274
275
153k
    if (m_lexer.tell() == start && !allow_empty) {
276
1.39k
        if (m_options.treat_errors_as_fatal) {
277
1.39k
            return parse_error(m_lexer.current_position(), Expectation { description });
278
1.39k
        }
279
1.39k
    }
280
281
152k
    rollback.disarm();
282
152k
    return m_source.substring_view(start, m_lexer.tell() - start);
283
153k
}
_ZN3XML6Parser11expect_manyI17ranges_for_searchIJXtl5RangeLj48ELj57EEEXtlS3_Lj97ELj102EEEXtlS3_Lj65ELj70EEEEEQ23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS6_10StringViewENS_10ParseErrorEEES5_S8_b
Line
Count
Source
266
7.85k
{
267
7.85k
    auto rollback = rollback_point();
268
7.85k
    auto start = m_lexer.tell();
269
2.07M
    while (m_lexer.next_is(predicate)) {
270
2.06M
        if (m_lexer.is_eof())
271
0
            break;
272
2.06M
        m_lexer.ignore();
273
2.06M
    }
274
275
7.85k
    if (m_lexer.tell() == start && !allow_empty) {
276
884
        if (m_options.treat_errors_as_fatal) {
277
884
            return parse_error(m_lexer.current_position(), Expectation { description });
278
884
        }
279
884
    }
280
281
6.96k
    rollback.disarm();
282
6.96k
    return m_source.substring_view(start, m_lexer.tell() - start);
283
7.85k
}
_ZN3XML6Parser11expect_manyI17ranges_for_searchIJXtl5RangeLj48ELj57EEEEEQ23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS6_10StringViewENS_10ParseErrorEEES5_S8_b
Line
Count
Source
266
8.84k
{
267
8.84k
    auto rollback = rollback_point();
268
8.84k
    auto start = m_lexer.tell();
269
2.25M
    while (m_lexer.next_is(predicate)) {
270
2.24M
        if (m_lexer.is_eof())
271
0
            break;
272
2.24M
        m_lexer.ignore();
273
2.24M
    }
274
275
8.84k
    if (m_lexer.tell() == start && !allow_empty) {
276
453
        if (m_options.treat_errors_as_fatal) {
277
453
            return parse_error(m_lexer.current_position(), Expectation { description });
278
453
        }
279
453
    }
280
281
8.39k
    rollback.disarm();
282
8.39k
    return m_source.substring_view(start, m_lexer.tell() - start);
283
8.84k
}
_ZN3XML6Parser11expect_manyI17ranges_for_searchIJXtl5RangeLj58ELj58EEEXtlS3_Lj65ELj90EEEXtlS3_Lj95ELj95EEEXtlS3_Lj97ELj122EEEXtlS3_Lj192ELj214EEEXtlS3_Lj216ELj246EEEXtlS3_Lj248ELj767EEEXtlS3_Lj880ELj893EEEXtlS3_Lj895ELj8191EEEXtlS3_Lj8204ELj8205EEEXtlS3_Lj8304ELj8591EEEXtlS3_Lj11264ELj12271EEEXtlS3_Lj12289ELj55295EEEXtlS3_Lj63744ELj64975EEEXtlS3_Lj65008ELj65533EEEXtlS3_Lj65536ELj983039EEEXtlS3_Lj45ELj45EEEXtlS3_Lj46ELj46EEEXtlS3_Lj48ELj57EEEXtlS3_Lj183ELj183EEEXtlS3_Lj768ELj879EEEXtlS3_Lj8255ELj8256EEEEEQ23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS6_10StringViewENS_10ParseErrorEEES5_S8_b
Line
Count
Source
266
98.7k
{
267
98.7k
    auto rollback = rollback_point();
268
98.7k
    auto start = m_lexer.tell();
269
4.03M
    while (m_lexer.next_is(predicate)) {
270
3.93M
        if (m_lexer.is_eof())
271
0
            break;
272
3.93M
        m_lexer.ignore();
273
3.93M
    }
274
275
98.7k
    if (m_lexer.tell() == start && !allow_empty) {
276
60
        if (m_options.treat_errors_as_fatal) {
277
60
            return parse_error(m_lexer.current_position(), Expectation { description });
278
60
        }
279
60
    }
280
281
98.6k
    rollback.disarm();
282
98.6k
    return m_source.substring_view(start, m_lexer.tell() - start);
283
98.7k
}
Parser.cpp:_ZN3XML6Parser11expect_manyIZNS0_23parse_public_id_literalEvE3$_0Q23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS4_10StringViewENS_10ParseErrorEEES3_S6_b
Line
Count
Source
266
14.2k
{
267
14.2k
    auto rollback = rollback_point();
268
14.2k
    auto start = m_lexer.tell();
269
3.78M
    while (m_lexer.next_is(predicate)) {
270
3.77M
        if (m_lexer.is_eof())
271
0
            break;
272
3.77M
        m_lexer.ignore();
273
3.77M
    }
274
275
14.2k
    if (m_lexer.tell() == start && !allow_empty) {
276
0
        if (m_options.treat_errors_as_fatal) {
277
0
            return parse_error(m_lexer.current_position(), Expectation { description });
278
0
        }
279
0
    }
280
281
14.2k
    rollback.disarm();
282
14.2k
    return m_source.substring_view(start, m_lexer.tell() - start);
283
14.2k
}
_ZN3XML6Parser11expect_manyIZN2AK13is_not_any_ofENS2_10StringViewEEUlT_E_Q23IsCallableWithArgumentsIS4_bcEEENS2_7ErrorOrIS3_NS_10ParseErrorEEES4_S3_b
Line
Count
Source
266
23.9k
{
267
23.9k
    auto rollback = rollback_point();
268
23.9k
    auto start = m_lexer.tell();
269
14.8M
    while (m_lexer.next_is(predicate)) {
270
14.8M
        if (m_lexer.is_eof())
271
54
            break;
272
14.8M
        m_lexer.ignore();
273
14.8M
    }
274
275
23.9k
    if (m_lexer.tell() == start && !allow_empty) {
276
0
        if (m_options.treat_errors_as_fatal) {
277
0
            return parse_error(m_lexer.current_position(), Expectation { description });
278
0
        }
279
0
    }
280
281
23.9k
    rollback.disarm();
282
23.9k
    return m_source.substring_view(start, m_lexer.tell() - start);
283
23.9k
}
284
285
// 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
286
ErrorOr<void, ParseError> Parser::parse_prolog()
287
8.69k
{
288
8.69k
    auto rollback = rollback_point();
289
8.69k
    auto rule = enter_rule();
290
291
    // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
292
    // The following is valid in XML 1.0.
293
    // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
294
8.69k
    if (auto result = parse_xml_decl(); result.is_error()) {
295
8.56k
        m_version = Version::Version10;
296
8.56k
        m_in_compatibility_mode = true;
297
8.56k
    }
298
8.69k
    auto accept = accept_rule();
299
300
12.8k
    while (true) {
301
12.8k
        if (auto result = parse_misc(); result.is_error())
302
8.69k
            break;
303
12.8k
    }
304
305
8.69k
    if (auto result = parse_doctype_decl(); !result.is_error()) {
306
1.66k
        while (true) {
307
1.66k
            if (auto result = parse_misc(); result.is_error())
308
1.45k
                break;
309
1.66k
        }
310
1.45k
    }
311
312
8.69k
    rollback.disarm();
313
8.69k
    return {};
314
8.69k
}
315
316
// 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
317
ErrorOr<void, ParseError> Parser::parse_xml_decl()
318
8.69k
{
319
8.69k
    auto rollback = rollback_point();
320
8.69k
    auto rule = enter_rule();
321
322
    // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
323
324
8.69k
    TRY(expect("<?xml"sv));
325
404
    auto accept = accept_rule();
326
327
404
    TRY(parse_version_info());
328
250
    (void)parse_encoding_decl();
329
250
    (void)parse_standalone_document_decl();
330
250
    TRY(skip_whitespace());
331
250
    TRY(expect("?>"sv));
332
333
129
    rollback.disarm();
334
129
    return {};
335
250
}
336
337
// 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
338
ErrorOr<void, ParseError> Parser::parse_version_info()
339
404
{
340
404
    auto rollback = rollback_point();
341
404
    auto rule = enter_rule();
342
343
    // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
344
404
    TRY(skip_whitespace(Required::Yes));
345
400
    TRY(expect("version"sv));
346
397
    auto accept = accept_rule();
347
348
397
    TRY(parse_eq());
349
393
    TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
350
392
    m_lexer.retreat();
351
352
392
    auto version_string = m_lexer.consume_quoted_string();
353
392
    if (version_string == "1.0") {
354
        // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
355
194
        m_version = Version::Version10;
356
194
        m_in_compatibility_mode = true;
357
198
    } else {
358
198
        if (version_string != "1.1" && m_options.treat_errors_as_fatal)
359
142
            return parse_error(m_lexer.current_position(), ByteString::formatted("Expected '1.1', found '{}'", version_string));
360
198
    }
361
362
250
    m_version = Version::Version11;
363
250
    rollback.disarm();
364
250
    return {};
365
392
}
366
367
// 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
368
ErrorOr<void, ParseError> Parser::parse_eq()
369
16.5M
{
370
16.5M
    auto rollback = rollback_point();
371
16.5M
    auto rule = enter_rule();
372
373
    // Eq ::= S? '=' S?
374
16.5M
    auto accept = accept_rule();
375
16.5M
    TRY(skip_whitespace());
376
16.5M
    TRY(expect("="sv));
377
16.4M
    TRY(skip_whitespace());
378
16.4M
    rollback.disarm();
379
16.4M
    return {};
380
16.4M
}
381
382
// 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
383
ErrorOr<void, ParseError> Parser::parse_encoding_decl()
384
250
{
385
250
    auto rollback = rollback_point();
386
250
    auto rule = enter_rule();
387
388
    // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
389
250
    TRY(skip_whitespace(Required::Yes));
390
233
    TRY(expect("encoding"sv));
391
149
    auto accept = accept_rule();
392
393
149
    TRY(parse_eq());
394
148
    TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
395
147
    m_lexer.retreat();
396
397
    // FIXME: Actually do something with this encoding.
398
147
    m_encoding = m_lexer.consume_quoted_string();
399
400
147
    rollback.disarm();
401
147
    return {};
402
148
}
403
404
// 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
405
ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
406
250
{
407
250
    auto rollback = rollback_point();
408
250
    auto rule = enter_rule();
409
410
    // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
411
250
    TRY(skip_whitespace(Required::Yes));
412
109
    TRY(expect("standalone"sv));
413
70
    auto accept = accept_rule();
414
415
70
    TRY(parse_eq());
416
68
    TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
417
66
    m_lexer.retreat();
418
419
66
    auto value = m_lexer.consume_quoted_string();
420
66
    if (!value.is_one_of("yes", "no"))
421
64
        return parse_error(m_lexer.position_for(m_lexer.tell() - value.length()), Expectation { "one of 'yes' or 'no'"sv });
422
423
2
    m_standalone = value == "yes";
424
425
2
    rollback.disarm();
426
2
    return {};
427
66
}
428
429
// 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
430
ErrorOr<void, ParseError> Parser::parse_misc()
431
15.1k
{
432
15.1k
    auto rollback = rollback_point();
433
15.1k
    auto rule = enter_rule();
434
435
    // Misc ::= Comment | PI | S
436
15.1k
    if (auto result = parse_comment(); !result.is_error()) {
437
260
        rollback.disarm();
438
260
        return {};
439
260
    }
440
441
14.9k
    if (auto result = parse_processing_instruction(); !result.is_error()) {
442
3.09k
        rollback.disarm();
443
3.09k
        return {};
444
3.09k
    }
445
446
11.8k
    if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
447
1.46k
        rollback.disarm();
448
1.46k
        return {};
449
1.46k
    }
450
451
10.3k
    return parse_error(m_lexer.current_position(), Expectation { "a match for 'Misc'"sv });
452
11.8k
}
453
454
// 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
455
ErrorOr<void, ParseError> Parser::parse_comment()
456
4.35M
{
457
4.35M
    auto rollback = rollback_point();
458
4.35M
    auto rule = enter_rule();
459
460
    // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
461
4.35M
    auto comment_start = m_lexer.tell();
462
4.35M
    TRY(expect("<!--"sv));
463
9.63k
    auto accept = accept_rule();
464
465
9.63k
    bool last_seen_a_dash = false;
466
    // FIXME: This should disallow surrogate blocks
467
21.7M
    auto text = m_lexer.consume_while([&](auto ch) {
468
21.7M
        if (ch != '-') {
469
21.7M
            last_seen_a_dash = false;
470
21.7M
            return true;
471
21.7M
        }
472
473
27.1k
        if (last_seen_a_dash)
474
9.15k
            return false;
475
476
18.0k
        last_seen_a_dash = true;
477
18.0k
        return true;
478
27.1k
    });
479
480
9.63k
    if (last_seen_a_dash) {
481
9.17k
        m_lexer.retreat();
482
9.17k
        text = text.substring_view(0, text.length() - 1);
483
9.17k
    }
484
485
9.63k
    TRY(expect("-->"sv));
486
487
9.09k
    if (m_options.preserve_comments)
488
0
        append_comment(text, m_lexer.position_for(comment_start));
489
490
9.09k
    rollback.disarm();
491
9.09k
    return {};
492
9.63k
}
493
494
// 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
495
ErrorOr<void, ParseError> Parser::parse_processing_instruction()
496
4.45M
{
497
4.45M
    auto rollback = rollback_point();
498
4.45M
    auto rule = enter_rule();
499
500
    // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
501
4.45M
    TRY(expect("<?"sv));
502
101k
    auto accept = accept_rule();
503
504
101k
    auto target = TRY(parse_processing_instruction_target());
505
100k
    ByteString data;
506
100k
    if (auto result = skip_whitespace(Required::Yes); !result.is_error())
507
56.4k
        data = m_lexer.consume_until("?>");
508
100k
    TRY(expect("?>"sv));
509
510
97.8k
    m_processing_instructions.set(target, data);
511
97.8k
    rollback.disarm();
512
97.8k
    return {};
513
100k
}
514
515
// 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
516
ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
517
101k
{
518
101k
    auto rollback = rollback_point();
519
101k
    auto rule = enter_rule();
520
521
    // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
522
101k
    auto target = TRY(parse_name());
523
101k
    auto accept = accept_rule();
524
525
101k
    if (target.equals_ignoring_ascii_case("xml"sv) && m_options.treat_errors_as_fatal) {
526
956
        return parse_error(
527
956
            m_lexer.position_for(m_lexer.tell() - target.length()),
528
956
            ByteString { "Use of the reserved 'xml' name for processing instruction target name is disallowed"sv });
529
956
    }
530
531
100k
    rollback.disarm();
532
100k
    return target;
533
101k
}
534
535
// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
536
constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {};
537
538
// NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
539
constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>();
540
541
// 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
542
ErrorOr<Name, ParseError> Parser::parse_name()
543
56.4M
{
544
56.4M
    auto rollback = rollback_point();
545
56.4M
    auto rule = enter_rule();
546
547
    // Name ::= NameStartChar (NameChar)*
548
549
    // FIXME: This is a hacky workaround to read code points instead of bytes.
550
    // Replace this once we have a unicode-aware lexer.
551
56.4M
    auto start = m_lexer.tell();
552
56.4M
    StringView remaining = m_lexer.input().substring_view(start);
553
56.4M
    Utf8View view { remaining };
554
56.4M
    auto code_points = view.begin();
555
56.4M
    if (code_points.done() || !s_name_start_characters.contains(*code_points)) {
556
13.5M
        if (m_options.treat_errors_as_fatal)
557
13.5M
            return parse_error(m_lexer.current_position(), Expectation { "a NameStartChar"sv });
558
13.5M
    }
559
560
42.8M
    m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
561
42.8M
    ++code_points;
562
563
42.8M
    auto accept = accept_rule();
564
565
83.2M
    while (!code_points.done() && s_name_characters.contains(*code_points)) {
566
40.3M
        m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
567
40.3M
        ++code_points;
568
40.3M
    }
569
570
42.8M
    rollback.disarm();
571
42.8M
    return remaining.substring_view(0, m_lexer.tell() - start);
572
56.4M
}
573
574
// 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
575
ErrorOr<void, ParseError> Parser::parse_doctype_decl()
576
8.69k
{
577
8.69k
    auto rollback = rollback_point();
578
8.69k
    auto rule = enter_rule();
579
8.69k
    Doctype doctype;
580
581
    // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
582
8.69k
    TRY(expect("<!DOCTYPE"sv));
583
5.64k
    auto accept = accept_rule();
584
585
5.64k
    TRY(skip_whitespace(Required::Yes));
586
5.64k
    doctype.type = TRY(parse_name());
587
5.64k
    if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
588
883
        if (auto id_result = parse_external_id(); !id_result.is_error()) {
589
67
            doctype.external_id = id_result.release_value();
590
67
            if (m_options.resolve_external_resource) {
591
0
                auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
592
0
                if (!resource_result.is_error()) {
593
0
                    auto declarations = TRY(resource_result.release_value().visit(
594
0
                        [&](ByteString resolved_source) -> ErrorOr<Vector<MarkupDeclaration>, ParseError> {
595
0
                            TemporaryChange source { m_source, resolved_source.view() };
596
0
                            TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) };
597
0
                            auto declarations = TRY(parse_external_subset());
598
0
                            if (!m_lexer.is_eof()) {
599
0
                                return parse_error(
600
0
                                    m_lexer.current_position(),
601
0
                                    ByteString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
602
0
                            }
603
0
                            return declarations;
604
0
                        },
605
0
                        [&](Vector<MarkupDeclaration> declarations) -> ErrorOr<Vector<MarkupDeclaration>, ParseError> {
606
0
                            return declarations;
607
0
                        }));
608
0
                    doctype.markup_declarations.extend(move(declarations));
609
0
                }
610
0
            }
611
67
        }
612
883
    }
613
11.2k
    TRY(skip_whitespace(Required::No));
614
11.2k
    if (m_lexer.consume_specific('[')) {
615
5.30k
        auto internal_subset = TRY(parse_internal_subset());
616
5.30k
        TRY(expect("]"sv));
617
1.96k
        TRY(skip_whitespace());
618
1.96k
        doctype.markup_declarations.extend(internal_subset);
619
1.96k
    }
620
621
11.2k
    TRY(expect(">"sv));
622
623
1.45k
    rollback.disarm();
624
1.45k
    m_doctype = move(doctype);
625
1.45k
    return {};
626
2.30k
}
627
628
// 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
629
ErrorOr<void, ParseError> Parser::parse_element()
630
7.68M
{
631
7.68M
    auto rollback = rollback_point();
632
7.68M
    auto rule = enter_rule();
633
634
    // element ::= EmptyElemTag
635
    //           | STag content ETag
636
7.68M
    if (auto result = parse_empty_element_tag(); !result.is_error()) {
637
3.00M
        append_node(result.release_value());
638
3.00M
        leave_node();
639
3.00M
        rollback.disarm();
640
3.00M
        return {};
641
3.00M
    }
642
643
4.67M
    auto accept = accept_rule();
644
4.67M
    auto start_tag = TRY(parse_start_tag());
645
4.07M
    auto& node = *start_tag;
646
4.07M
    auto& tag = node.content.get<Node::Element>();
647
4.07M
    append_node(move(start_tag));
648
4.07M
    ScopeGuard quit {
649
4.07M
        [&] {
650
4.07M
            leave_node();
651
4.07M
        }
652
4.07M
    };
653
654
4.07M
    TRY(parse_content());
655
656
4.07M
    auto tag_location = m_lexer.tell();
657
4.07M
    auto closing_name = TRY(parse_end_tag());
658
659
    // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
660
12.3k
    if (m_options.treat_errors_as_fatal && closing_name != tag.name)
661
4.04k
        return parse_error(m_lexer.position_for(tag_location), ByteString { "Invalid closing tag"sv });
662
663
8.33k
    rollback.disarm();
664
8.33k
    return {};
665
12.3k
}
666
667
// 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
668
ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
669
7.68M
{
670
7.68M
    auto rollback = rollback_point();
671
7.68M
    auto rule = enter_rule();
672
673
    // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
674
7.68M
    auto tag_start = m_lexer.tell();
675
7.68M
    TRY(expect("<"sv));
676
677
7.41M
    auto name = TRY(parse_name());
678
7.24M
    HashMap<Name, ByteString> attributes;
679
680
16.8M
    while (true) {
681
16.8M
        if (auto result = skip_whitespace(Required::Yes); result.is_error())
682
4.10M
            break;
683
684
12.7M
        if (auto result = parse_attribute(); !result.is_error()) {
685
9.57M
            auto attribute = result.release_value();
686
9.57M
            attributes.set(move(attribute.name), move(attribute.value));
687
9.57M
        } else {
688
3.14M
            break;
689
3.14M
        }
690
12.7M
    }
691
692
7.24M
    TRY(skip_whitespace());
693
7.24M
    TRY(expect("/>"sv));
694
695
3.00M
    auto accept = accept_rule();
696
697
3.00M
    rollback.disarm();
698
3.00M
    return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} });
699
7.24M
}
700
701
// 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
702
ErrorOr<Attribute, ParseError> Parser::parse_attribute()
703
19.5M
{
704
19.5M
    auto rollback = rollback_point();
705
19.5M
    auto rule = enter_rule();
706
707
    // Attribute ::= Name Eq AttValue
708
19.5M
    auto name = TRY(parse_name());
709
16.5M
    auto accept = accept_rule();
710
711
16.5M
    TRY(parse_eq());
712
16.4M
    auto value = TRY(parse_attribute_value());
713
714
16.2M
    rollback.disarm();
715
16.2M
    return Attribute {
716
16.2M
        move(name),
717
16.2M
        move(value),
718
16.2M
    };
719
16.4M
}
720
721
// 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
722
ErrorOr<ByteString, ParseError> Parser::parse_attribute_value()
723
17.0M
{
724
17.0M
    auto rollback = rollback_point();
725
17.0M
    auto rule = enter_rule();
726
727
    // AttValue ::= '"' ([^<&"] | Reference)* '"'
728
    //            | "'" ([^<&'] | Reference)* "'"
729
17.0M
    auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
730
17.0M
    auto accept = accept_rule();
731
732
17.0M
    auto text = TRY(parse_attribute_value_inner(quote));
733
16.9M
    TRY(expect(quote));
734
735
16.7M
    rollback.disarm();
736
16.7M
    return text;
737
16.9M
}
738
739
// Collects the body of an attribute value up to (but not including) any character in `disallow`,
// or up to the end of the input. References are expanded inline; a raw '<' is rejected.
ErrorOr<ByteString, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
{
    StringBuilder value;

    while (!m_lexer.next_is(is_any_of(disallow)) && !m_lexer.is_eof()) {
        // Not allowed, return a nice error to make it easier to debug.
        if (m_lexer.next_is('<'))
            return parse_error(m_lexer.current_position(), ByteString { "Unescaped '<' not allowed in attribute values"sv });

        // Ordinary character: copy it through unchanged.
        if (!m_lexer.next_is('&')) {
            value.append(m_lexer.consume());
            continue;
        }

        // A reference is either a character reference (already text) or an entity reference to resolve.
        auto reference = TRY(parse_reference());
        if (auto* character = reference.get_pointer<ByteString>()) {
            value.append(*character);
            continue;
        }
        value.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
    }

    return value.to_byte_string();
}
763
764
// Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
765
constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>();
766
767
// 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
768
ErrorOr<Variant<Parser::EntityReference, ByteString>, ParseError> Parser::parse_reference()
769
4.69M
{
770
4.69M
    auto rollback = rollback_point();
771
4.69M
    auto rule = enter_rule();
772
    // Reference ::= EntityRef | CharRef
773
774
    // 4.1.68. EntityRef
775
    // EntityRef ::= '&' Name ';'
776
777
    // 4.1.66. CharRef
778
    // CharRef ::= '&#' [0-9]+ ';'
779
    //           | '&#x' [0-9a-fA-F]+ ';'
780
781
4.69M
    auto reference_start = m_lexer.tell();
782
4.69M
    TRY(expect("&"sv));
783
287k
    auto accept = accept_rule();
784
785
287k
    auto name_result = parse_name();
786
287k
    if (name_result.is_error()) {
787
18.0k
        TRY(expect("#"sv));
788
16.6k
        Optional<u32> code_point;
789
16.6k
        if (m_lexer.consume_specific('x')) {
790
7.85k
            auto hex = TRY(expect_many(
791
6.96k
                ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
792
6.96k
                "any of [0-9a-fA-F]"sv));
793
6.96k
            code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
794
8.84k
        } else {
795
8.84k
            auto decimal = TRY(expect_many(
796
8.39k
                ranges_for_search<Range('0', '9')>(),
797
8.39k
                "any of [0-9]"sv));
798
8.39k
            code_point = decimal.to_number<u32>();
799
8.39k
        }
800
801
16.6k
        if (!code_point.has_value() || !s_characters.contains(*code_point))
802
1.66k
            return parse_error(m_lexer.position_for(reference_start), ByteString { "Invalid character reference"sv });
803
804
15.3k
        TRY(expect(";"sv));
805
806
12.8k
        StringBuilder builder;
807
12.8k
        builder.append_code_point(*code_point);
808
809
12.8k
        rollback.disarm();
810
12.8k
        return builder.to_byte_string();
811
13.6k
    }
812
813
269k
    auto name = name_result.release_value();
814
269k
    TRY(expect(";"sv));
815
816
268k
    rollback.disarm();
817
268k
    return EntityReference { move(name) };
818
269k
}
819
820
// 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
821
ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
822
4.67M
{
823
4.67M
    auto rollback = rollback_point();
824
4.67M
    auto rule = enter_rule();
825
826
    // STag ::= '<' Name (S Attribute)* S? '>'
827
4.67M
    auto tag_start = m_lexer.tell();
828
4.67M
    TRY(expect("<"sv));
829
4.40M
    auto accept = accept_rule();
830
831
4.40M
    auto name = TRY(parse_name());
832
4.23M
    HashMap<Name, ByteString> attributes;
833
834
10.9M
    while (true) {
835
10.9M
        if (auto result = skip_whitespace(Required::Yes); result.is_error())
836
4.09M
            break;
837
838
6.80M
        if (auto result = parse_attribute(); !result.is_error()) {
839
6.66M
            auto attribute = result.release_value();
840
6.66M
            attributes.set(move(attribute.name), move(attribute.value));
841
6.66M
        } else {
842
140k
            break;
843
140k
        }
844
6.80M
    }
845
846
4.23M
    TRY(skip_whitespace());
847
4.23M
    TRY(expect(">"sv));
848
849
4.07M
    rollback.disarm();
850
4.07M
    return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} });
851
4.23M
}
852
853
// 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
854
ErrorOr<Name, ParseError> Parser::parse_end_tag()
855
4.07M
{
856
4.07M
    auto rollback = rollback_point();
857
4.07M
    auto rule = enter_rule();
858
859
    // ETag ::= '</' Name S? '>'
860
4.07M
    TRY(expect("</"sv));
861
13.2k
    auto accept = accept_rule();
862
863
13.2k
    auto name = TRY(parse_name());
864
12.9k
    TRY(skip_whitespace());
865
12.9k
    TRY(expect(">"sv));
866
867
12.3k
    rollback.disarm();
868
12.3k
    return name;
869
12.9k
}
870
871
// 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
872
ErrorOr<void, ParseError> Parser::parse_content()
873
4.31M
{
874
4.31M
    auto rollback = rollback_point();
875
4.31M
    auto rule = enter_rule();
876
4.31M
    auto accept = accept_rule();
877
878
    // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
879
4.31M
    auto content_start = m_lexer.tell();
880
4.31M
    if (auto result = parse_char_data(); !result.is_error())
881
4.31M
        append_text(result.release_value(), m_lexer.position_for(content_start));
882
883
7.67M
    while (true) {
884
7.67M
        auto node_start = m_lexer.tell();
885
886
7.67M
        if (auto result = parse_element(); !result.is_error())
887
3.01M
            goto try_char_data;
888
4.65M
        if (auto result = parse_reference(); !result.is_error()) {
889
250k
            auto reference = result.release_value();
890
250k
            auto reference_offset = m_lexer.position_for(node_start);
891
250k
            if (auto char_reference = reference.get_pointer<ByteString>())
892
7.85k
                append_text(*char_reference, reference_offset);
893
242k
            else
894
242k
                append_text(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content)), reference_offset);
895
250k
            goto try_char_data;
896
250k
        }
897
4.40M
        if (auto result = parse_cdata_section(); !result.is_error()) {
898
1.09k
            if (m_options.preserve_cdata)
899
1.09k
                append_text(result.release_value(), m_lexer.position_for(node_start));
900
1.09k
            goto try_char_data;
901
1.09k
        }
902
4.40M
        if (auto result = parse_processing_instruction(); !result.is_error())
903
91.7k
            goto try_char_data;
904
4.31M
        if (auto result = parse_comment(); !result.is_error())
905
4.82k
            goto try_char_data;
906
907
4.30M
        break;
908
909
4.30M
    try_char_data:;
910
3.36M
        if (auto result = parse_char_data(); !result.is_error())
911
3.36M
            append_text(result.release_value(), m_lexer.position_for(node_start));
912
3.36M
    }
913
914
4.30M
    rollback.disarm();
915
4.30M
    return {};
916
4.31M
}
917
918
// 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
919
ErrorOr<StringView, ParseError> Parser::parse_char_data()
920
7.67M
{
921
7.67M
    auto rollback = rollback_point();
922
7.67M
    auto rule = enter_rule();
923
924
    // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
925
7.67M
    auto cend_state = 0; // 1: ], 2: ], 3: >
926
1.09G
    auto text = m_lexer.consume_while([&](auto ch) {
927
1.09G
        if (ch == '<' || ch == '&' || cend_state == 3)
928
7.66M
            return false;
929
1.08G
        switch (cend_state) {
930
1.08G
        case 0:
931
1.08G
        case 1:
932
1.08G
            if (ch == ']')
933
86.5k
                cend_state++;
934
1.08G
            else
935
1.08G
                cend_state = 0;
936
1.08G
            return true;
937
10.5k
        case 2:
938
10.5k
            if (ch == '>') {
939
225
                cend_state++;
940
225
                return true;
941
225
            }
942
10.3k
            cend_state = 0;
943
10.3k
            return true;
944
0
        default:
945
0
            VERIFY_NOT_REACHED();
946
1.08G
        }
947
1.08G
    });
948
7.67M
    if (cend_state == 3) {
949
225
        m_lexer.retreat(3);
950
225
        text = text.substring_view(0, text.length() - 3);
951
225
    }
952
953
7.67M
    rollback.disarm();
954
7.67M
    return text;
955
7.67M
}
956
957
// 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
958
ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
959
5.30k
{
960
5.30k
    auto rollback = rollback_point();
961
5.30k
    auto rule = enter_rule();
962
5.30k
    Vector<MarkupDeclaration> declarations;
963
964
    // intSubset ::= (markupdecl | DeclSep)*
965
260k
    while (true) {
966
260k
        if (auto result = parse_markup_declaration(); !result.is_error()) {
967
235k
            auto maybe_declaration = result.release_value();
968
235k
            if (maybe_declaration.has_value())
969
228k
                declarations.append(maybe_declaration.release_value());
970
235k
            continue;
971
235k
        }
972
24.2k
        if (auto result = parse_declaration_separator(); !result.is_error()) {
973
            // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
974
            // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
975
18.9k
            auto maybe_replacement_text = result.release_value();
976
18.9k
            if (maybe_replacement_text.has_value()) {
977
1.51k
                TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() };
978
1.51k
                TemporaryChange lexer { m_lexer, LineTrackingLexer { m_source } };
979
980
1.51k
                auto contained_declarations = TRY(parse_external_subset_declaration());
981
1.51k
                declarations.extend(move(contained_declarations));
982
1.51k
            }
983
18.9k
            continue;
984
18.9k
        }
985
5.30k
        break;
986
24.2k
    }
987
988
5.30k
    rollback.disarm();
989
5.30k
    return declarations;
990
5.30k
}
991
992
// 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
993
ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
994
261k
{
995
261k
    auto rollback = rollback_point();
996
261k
    auto rule = enter_rule();
997
998
    // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
999
261k
    if (auto result = parse_element_declaration(); !result.is_error()) {
1000
71.4k
        rollback.disarm();
1001
71.4k
        return MarkupDeclaration { result.release_value() };
1002
71.4k
    }
1003
190k
    if (auto result = parse_attribute_list_declaration(); !result.is_error()) {
1004
29.1k
        rollback.disarm();
1005
29.1k
        return MarkupDeclaration { result.release_value() };
1006
29.1k
    }
1007
160k
    if (auto result = parse_entity_declaration(); !result.is_error()) {
1008
112k
        rollback.disarm();
1009
112k
        return MarkupDeclaration { result.release_value() };
1010
112k
    }
1011
48.8k
    if (auto result = parse_notation_declaration(); !result.is_error()) {
1012
16.1k
        rollback.disarm();
1013
16.1k
        return MarkupDeclaration { result.release_value() };
1014
16.1k
    }
1015
32.6k
    if (auto result = parse_processing_instruction(); !result.is_error()) {
1016
2.96k
        rollback.disarm();
1017
2.96k
        return Optional<MarkupDeclaration> {};
1018
2.96k
    }
1019
29.7k
    if (auto result = parse_comment(); !result.is_error()) {
1020
4.00k
        rollback.disarm();
1021
4.00k
        return Optional<MarkupDeclaration> {};
1022
4.00k
    }
1023
1024
25.7k
    return parse_error(m_lexer.current_position(), Expectation { "one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment"sv });
1025
29.7k
}
1026
1027
// 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
1028
ErrorOr<Optional<ByteString>, ParseError> Parser::parse_declaration_separator()
1029
25.7k
{
1030
25.7k
    auto rollback = rollback_point();
1031
25.7k
    auto rule = enter_rule();
1032
1033
    // DeclSep ::= PEReference | S
1034
25.7k
    if (auto name = parse_parameter_entity_reference(); !name.is_error()) {
1035
1.51k
        rollback.disarm();
1036
        // FIXME: Resolve this PEReference.
1037
1.51k
        return "";
1038
1.51k
    }
1039
1040
24.2k
    if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
1041
17.3k
        rollback.disarm();
1042
17.3k
        return Optional<ByteString> {};
1043
17.3k
    }
1044
1045
6.82k
    return parse_error(m_lexer.current_position(), Expectation { "either whitespace, or a PEReference"sv });
1046
24.2k
}
1047
1048
// 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
1049
ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
1050
31.2k
{
1051
31.2k
    auto rollback = rollback_point();
1052
31.2k
    auto rule = enter_rule();
1053
1054
    // PEReference ::= '%' Name ';'
1055
31.2k
    TRY(expect("%"sv));
1056
7.11k
    auto accept = accept_rule();
1057
1058
7.11k
    auto name = TRY(parse_name());
1059
7.09k
    TRY(expect(";"sv));
1060
1061
7.05k
    rollback.disarm();
1062
7.05k
    return name;
1063
7.09k
}
1064
1065
// 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
1066
ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
1067
261k
{
1068
261k
    auto rollback = rollback_point();
1069
261k
    auto rule = enter_rule();
1070
1071
    // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
1072
    //        but the grammar does not allow that, figure this out.
1073
    // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
1074
261k
    TRY(expect("<!ELEMENT"sv));
1075
72.5k
    auto accept = accept_rule();
1076
1077
72.5k
    TRY(skip_whitespace(Required::Yes));
1078
72.5k
    auto name = TRY(parse_name());
1079
72.5k
    TRY(skip_whitespace(Required::Yes));
1080
72.5k
    auto spec = TRY(parse_content_spec());
1081
71.5k
    TRY(expect(">"sv));
1082
1083
71.4k
    rollback.disarm();
1084
71.4k
    return ElementDeclaration {
1085
71.4k
        move(name),
1086
71.4k
        move(spec),
1087
71.4k
    };
1088
71.5k
}
1089
1090
// 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
1091
ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
1092
190k
{
1093
190k
    auto rollback = rollback_point();
1094
190k
    auto rule = enter_rule();
1095
190k
    AttributeListDeclaration declaration;
1096
1097
    // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
1098
190k
    TRY(expect("<!ATTLIST"sv));
1099
30.2k
    auto accept = accept_rule();
1100
1101
30.2k
    TRY(skip_whitespace(Required::Yes));
1102
30.2k
    declaration.type = TRY(parse_name());
1103
1104
584k
    while (true) {
1105
584k
        if (auto result = parse_attribute_definition(); !result.is_error())
1106
554k
            declaration.attributes.append(result.release_value());
1107
30.2k
        else
1108
30.2k
            break;
1109
584k
    }
1110
1111
30.2k
    TRY(skip_whitespace());
1112
30.2k
    TRY(expect(">"sv));
1113
1114
29.1k
    rollback.disarm();
1115
29.1k
    return declaration;
1116
30.2k
}
1117
1118
// 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef
1119
ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
1120
584k
{
1121
584k
    auto rollback = rollback_point();
1122
584k
    auto rule = enter_rule();
1123
584k
    Optional<AttributeListDeclaration::Type> type;
1124
584k
    Optional<AttributeListDeclaration::Default> default_;
1125
1126
    // AttDef ::= S Name S AttType S DefaultDecl
1127
584k
    TRY(skip_whitespace(Required::Yes));
1128
574k
    auto name = TRY(parse_name());
1129
555k
    auto accept = accept_rule();
1130
1131
555k
    TRY(skip_whitespace(Required::Yes));
1132
1133
    // AttType ::= StringType | TokenizedType | EnumeratedType
1134
    // StringType ::= 'CDATA'
1135
    // TokenizedType ::= 'ID'
1136
    //                  | 'IDREF'
1137
    //                  | 'IDREFS'
1138
    //                  | 'ENTITY'
1139
    //                  | 'ENTITIES'
1140
    //                  | 'NMTOKEN'
1141
    //                  | 'NMTOKENS'
1142
    // EnumeratedType ::= NotationType | Enumeration
1143
    // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
1144
    // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1145
555k
    if (m_lexer.consume_specific("CDATA"sv)) {
1146
5.17k
        type = AttributeListDeclaration::StringType::CData;
1147
550k
    } else if (m_lexer.consume_specific("IDREFS"sv)) {
1148
2.90k
        type = AttributeListDeclaration::TokenizedType::IDRefs;
1149
547k
    } else if (m_lexer.consume_specific("IDREF"sv)) {
1150
1.21k
        type = AttributeListDeclaration::TokenizedType::IDRef;
1151
546k
    } else if (m_lexer.consume_specific("ID"sv)) {
1152
524k
        type = AttributeListDeclaration::TokenizedType::ID;
1153
524k
    } else if (m_lexer.consume_specific("ENTITIES"sv)) {
1154
454
        type = AttributeListDeclaration::TokenizedType::Entities;
1155
21.2k
    } else if (m_lexer.consume_specific("ENTITY"sv)) {
1156
244
        type = AttributeListDeclaration::TokenizedType::Entity;
1157
20.9k
    } else if (m_lexer.consume_specific("NMTOKENS"sv)) {
1158
194
        type = AttributeListDeclaration::TokenizedType::NMTokens;
1159
20.7k
    } else if (m_lexer.consume_specific("NMTOKEN"sv)) {
1160
260
        type = AttributeListDeclaration::TokenizedType::NMToken;
1161
20.5k
    } else if (m_lexer.consume_specific("NOTATION"sv)) {
1162
2.98k
        HashTable<Name> names;
1163
2.98k
        TRY(skip_whitespace(Required::Yes));
1164
2.98k
        TRY(expect("("sv));
1165
2.98k
        TRY(skip_whitespace());
1166
2.98k
        names.set(TRY(parse_name()));
1167
3.92k
        while (true) {
1168
3.92k
            TRY(skip_whitespace());
1169
3.92k
            if (auto result = expect("|"sv); result.is_error())
1170
2.97k
                break;
1171
3.92k
            TRY(skip_whitespace());
1172
1.89k
            names.set(TRY(parse_name()));
1173
941
        }
1174
5.95k
        TRY(skip_whitespace());
1175
5.95k
        TRY(expect(")"sv));
1176
2.96k
        type = AttributeListDeclaration::NotationType { move(names) };
1177
17.5k
    } else {
1178
17.5k
        HashTable<ByteString> names;
1179
17.5k
        TRY(expect("("sv));
1180
17.3k
        TRY(skip_whitespace());
1181
17.3k
        names.set(TRY(parse_nm_token()));
1182
98.6k
        while (true) {
1183
98.6k
            TRY(skip_whitespace());
1184
98.6k
            if (auto result = expect("|"sv); result.is_error())
1185
17.3k
                break;
1186
162k
            TRY(skip_whitespace());
1187
162k
            names.set(TRY(parse_nm_token()));
1188
81.3k
        }
1189
34.6k
        TRY(skip_whitespace());
1190
34.6k
        TRY(expect(")"sv));
1191
17.1k
        type = AttributeListDeclaration::Enumeration { move(names) };
1192
17.1k
    }
1193
1194
1.10M
    TRY(skip_whitespace(Required::Yes));
1195
1196
    // DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1197
    //               | (('#FIXED' S)? AttValue)
1198
1.10M
    if (m_lexer.consume_specific("#REQUIRED"sv)) {
1199
5.99k
        default_ = AttributeListDeclaration::Required {};
1200
548k
    } else if (m_lexer.consume_specific("#IMPLIED"sv)) {
1201
8.46k
        default_ = AttributeListDeclaration::Implied {};
1202
540k
    } else {
1203
540k
        bool fixed = false;
1204
540k
        if (m_lexer.consume_specific("#FIXED"sv)) {
1205
1.20k
            TRY(skip_whitespace(Required::Yes));
1206
1.20k
            fixed = true;
1207
1.20k
        }
1208
540k
        auto value = TRY(parse_attribute_value());
1209
540k
        if (fixed)
1210
1.20k
            default_ = AttributeListDeclaration::Fixed { move(value) };
1211
538k
        else
1212
538k
            default_ = AttributeListDeclaration::DefaultValue { move(value) };
1213
540k
    }
1214
1215
1.10M
    rollback.disarm();
1216
554k
    return AttributeListDeclaration::Definition {
1217
554k
        move(name),
1218
554k
        type.release_value(),
1219
554k
        default_.release_value(),
1220
554k
    };
1221
1.10M
}
1222
1223
// 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
1224
ErrorOr<StringView, ParseError> Parser::parse_nm_token()
1225
98.7k
{
1226
98.7k
    auto rollback = rollback_point();
1227
98.7k
    auto rule = enter_rule();
1228
1229
    // Nmtoken ::= (NameChar)+
1230
98.7k
    auto token = TRY(expect_many(s_name_characters, "a NameChar"sv));
1231
1232
98.6k
    rollback.disarm();
1233
98.6k
    return token;
1234
98.7k
}
1235
1236
// 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
1237
ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
1238
48.8k
{
1239
48.8k
    auto rollback = rollback_point();
1240
48.8k
    auto rule = enter_rule();
1241
48.8k
    Variant<ExternalID, PublicID, Empty> notation;
1242
1243
    // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
1244
48.8k
    TRY(expect("<!NOTATION"sv));
1245
16.3k
    auto accept = accept_rule();
1246
1247
16.3k
    TRY(skip_whitespace(Required::Yes));
1248
16.3k
    auto name = TRY(parse_name());
1249
16.3k
    TRY(skip_whitespace(Required::Yes));
1250
1251
16.3k
    if (auto result = parse_external_id(); !result.is_error())
1252
11.8k
        notation = result.release_value();
1253
4.48k
    else
1254
4.48k
        notation = TRY(parse_public_id());
1255
1256
16.3k
    TRY(expect(">"sv));
1257
1258
16.1k
    rollback.disarm();
1259
16.1k
    return NotationDeclaration {
1260
16.1k
        move(name),
1261
16.1k
        move(notation).downcast<ExternalID, PublicID>(),
1262
16.1k
    };
1263
16.2k
}
1264
1265
// 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec
1266
ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
1267
72.5k
{
1268
72.5k
    auto rollback = rollback_point();
1269
72.5k
    auto rule = enter_rule();
1270
72.5k
    Optional<ElementDeclaration::ContentSpec> content_spec;
1271
1272
    // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
1273
72.5k
    if (m_lexer.consume_specific("EMPTY"sv)) {
1274
1.68k
        content_spec = ElementDeclaration::Empty {};
1275
70.8k
    } else if (m_lexer.consume_specific("ANY"sv)) {
1276
997
        content_spec = ElementDeclaration::Any {};
1277
69.8k
    } else {
1278
69.8k
        TRY(expect("("sv));
1279
69.7k
        TRY(skip_whitespace());
1280
69.7k
        if (m_lexer.consume_specific("#PCDATA"sv)) {
1281
62.8k
            HashTable<Name> names;
1282
            // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
1283
            //         | '(' S? '#PCDATA' S? ')'
1284
62.8k
            TRY(skip_whitespace());
1285
62.8k
            if (m_lexer.consume_specific(")*"sv)) {
1286
102
                content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
1287
62.7k
            } else if (m_lexer.consume_specific(')')) {
1288
62.1k
                content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
1289
62.1k
            } else {
1290
6.67k
                while (true) {
1291
6.67k
                    TRY(skip_whitespace());
1292
6.67k
                    if (!m_lexer.consume_specific('|'))
1293
530
                        break;
1294
12.2k
                    TRY(skip_whitespace());
1295
12.2k
                    if (auto result = parse_name(); !result.is_error())
1296
6.10k
                        names.set(result.release_value());
1297
38
                    else
1298
38
                        return parse_error(m_lexer.current_position(), Expectation { "a Name"sv });
1299
12.2k
                }
1300
1.06k
                TRY(skip_whitespace());
1301
1.06k
                TRY(expect(")*"sv));
1302
433
                content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
1303
433
            }
1304
62.8k
        } else {
1305
700k
            while (!m_lexer.next_is('('))
1306
693k
                m_lexer.retreat();
1307
            // children ::= (choice | seq) ('?' | '*' | '+')?
1308
            //   cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1309
            //   choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
1310
            //   seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
1311
6.95k
            Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice;
1312
6.95k
            Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
1313
1314
23.8M
            auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> {
1315
23.8M
                if (auto result = parse_name(); !result.is_error())
1316
13.6M
                    return result.release_value();
1317
10.1M
                if (auto result = parse_choice(); !result.is_error())
1318
17.5k
                    return result.release_value();
1319
10.1M
                return TRY(parse_sequence());
1320
10.1M
            };
1321
14.4M
            auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier {
1322
14.4M
                ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
1323
14.4M
                if (m_lexer.consume_specific('?'))
1324
17.3k
                    qualifier = ElementDeclaration::Children::Qualifier::Optional;
1325
14.4M
                else if (m_lexer.consume_specific('*'))
1326
2.83k
                    qualifier = ElementDeclaration::Children::Qualifier::Any;
1327
14.4M
                else if (m_lexer.consume_specific('+'))
1328
1.07k
                    qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
1329
14.4M
                return qualifier;
1330
14.4M
            };
1331
23.8M
            auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> {
1332
23.8M
                auto sub_entry = TRY(parse_cp_init());
1333
14.0M
                auto qualifier = parse_qualifier();
1334
14.0M
                return ElementDeclaration::Children::Entry {
1335
14.0M
                    move(sub_entry),
1336
14.0M
                    qualifier,
1337
14.0M
                };
1338
23.8M
            };
1339
10.1M
            parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
1340
10.1M
                auto rollback = rollback_point();
1341
10.1M
                auto rule = enter_rule();
1342
1343
10.1M
                TRY(expect("("sv));
1344
8.56M
                auto accept = accept_rule();
1345
1346
8.56M
                TRY(skip_whitespace());
1347
8.56M
                Vector<ElementDeclaration::Children::Entry> choices;
1348
8.56M
                choices.append(TRY(parse_cp()));
1349
6.61M
                while (true) {
1350
6.61M
                    TRY(skip_whitespace());
1351
6.61M
                    if (!m_lexer.consume_specific('|'))
1352
5.31M
                        break;
1353
6.61M
                    TRY(skip_whitespace());
1354
2.60M
                    choices.append(TRY(parse_cp()));
1355
161k
                }
1356
1357
6.45M
                TRY(expect(")"sv));
1358
1359
5.71M
                if (choices.size() < 2)
1360
388k
                    return parse_error(m_lexer.current_position(), Expectation { "more than one choice"sv });
1361
1362
407k
                TRY(skip_whitespace());
1363
18.8k
                auto qualifier = parse_qualifier();
1364
1365
18.8k
                rollback.disarm();
1366
18.8k
                return ElementDeclaration::Children::Choice {
1367
18.8k
                    move(choices),
1368
18.8k
                    qualifier,
1369
18.8k
                };
1370
18.8k
            };
1371
10.1M
            parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
1372
10.1M
                auto rollback = rollback_point();
1373
10.1M
                auto rule = enter_rule();
1374
1375
10.1M
                TRY(expect("("sv));
1376
8.54M
                auto accept = accept_rule();
1377
1378
8.54M
                TRY(skip_whitespace());
1379
8.54M
                Vector<ElementDeclaration::Children::Entry> entries;
1380
8.54M
                entries.append(TRY(parse_cp()));
1381
7.45M
                while (true) {
1382
7.45M
                    TRY(skip_whitespace());
1383
7.45M
                    if (!m_lexer.consume_specific(','))
1384
2.07M
                        break;
1385
10.7M
                    TRY(skip_whitespace());
1386
10.7M
                    entries.append(TRY(parse_cp()));
1387
1.02M
                }
1388
1389
6.43M
                TRY(expect(")"sv));
1390
1391
2.46M
                TRY(skip_whitespace());
1392
388k
                auto qualifier = parse_qualifier();
1393
1394
388k
                rollback.disarm();
1395
388k
                return ElementDeclaration::Children::Sequence {
1396
388k
                    move(entries),
1397
388k
                    qualifier,
1398
388k
                };
1399
2.46M
            };
1400
6.95k
            if (auto result = parse_choice(); !result.is_error()) {
1401
1.24k
                auto qualifier = parse_qualifier();
1402
1.24k
                content_spec = ElementDeclaration::Children {
1403
1.24k
                    result.release_value(),
1404
1.24k
                    qualifier,
1405
1.24k
                };
1406
5.71k
            } else {
1407
5.71k
                auto sequence = TRY(parse_sequence());
1408
4.94k
                auto qualifier = parse_qualifier();
1409
4.94k
                content_spec = ElementDeclaration::Children {
1410
4.94k
                    move(sequence),
1411
4.94k
                    qualifier,
1412
4.94k
                };
1413
4.94k
            }
1414
6.95k
        }
1415
69.7k
    }
1416
1417
71.5k
    rollback.disarm();
1418
71.5k
    return content_spec.release_value();
1419
72.5k
}
1420
1421
// 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
1422
ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
1423
1.51k
{
1424
1.51k
    auto rollback = rollback_point();
1425
1.51k
    auto rule = enter_rule();
1426
1.51k
    Vector<MarkupDeclaration> declarations;
1427
1428
    // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
1429
1.51k
    while (true) {
1430
1.51k
        if (auto result = parse_markup_declaration(); !result.is_error()) {
1431
0
            if (result.value().has_value())
1432
0
                declarations.append(result.release_value().release_value());
1433
0
            continue;
1434
0
        }
1435
1436
        // FIXME: conditionalSect
1437
1438
1.51k
        if (auto result = parse_declaration_separator(); !result.is_error())
1439
0
            continue;
1440
1441
1.51k
        break;
1442
1.51k
    }
1443
1444
1.51k
    rollback.disarm();
1445
1.51k
    return declarations;
1446
1.51k
}
1447
1448
// 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
1449
ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
1450
160k
{
1451
    // EntityDecl ::= GEDecl | PEDecl
1452
160k
    if (auto result = parse_general_entity_declaration(); !result.is_error())
1453
104k
        return result;
1454
1455
56.2k
    return parse_parameter_entity_declaration();
1456
160k
}
1457
1458
// 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
1459
ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
1460
160k
{
1461
160k
    auto rollback = rollback_point();
1462
160k
    auto rule = enter_rule();
1463
160k
    Variant<ByteString, EntityDefinition, Empty> definition;
1464
1465
    // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
1466
160k
    TRY(expect("<!ENTITY"sv));
1467
112k
    auto accept = accept_rule();
1468
1469
112k
    TRY(skip_whitespace(Required::Yes));
1470
112k
    auto name = TRY(parse_name());
1471
105k
    TRY(skip_whitespace(Required::Yes));
1472
    // EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1473
104k
    if (auto result = parse_entity_value(); !result.is_error()) {
1474
94.3k
        definition = result.release_value();
1475
94.3k
    } else {
1476
10.6k
        auto external_id = TRY(parse_external_id());
1477
10.3k
        Optional<Name> notation;
1478
10.3k
        if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
1479
6.90k
            notation = notation_result.release_value();
1480
1481
10.3k
        definition = EntityDefinition {
1482
10.3k
            move(external_id),
1483
10.3k
            move(notation),
1484
10.3k
        };
1485
10.3k
    }
1486
1487
209k
    TRY(skip_whitespace());
1488
209k
    TRY(expect(">"sv));
1489
1490
104k
    rollback.disarm();
1491
104k
    return GEDeclaration {
1492
104k
        move(name),
1493
104k
        move(definition).downcast<ByteString, EntityDefinition>(),
1494
104k
    };
1495
209k
}
1496
1497
// 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
1498
ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
1499
56.2k
{
1500
56.2k
    auto rollback = rollback_point();
1501
56.2k
    auto rule = enter_rule();
1502
1503
56.2k
    Variant<ByteString, ExternalID, Empty> definition;
1504
    // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
1505
56.2k
    TRY(expect("<!ENTITY"sv));
1506
7.83k
    auto accept = accept_rule();
1507
1508
7.83k
    TRY(skip_whitespace(Required::Yes));
1509
7.82k
    TRY(expect("%"sv));
1510
7.48k
    TRY(skip_whitespace(Required::Yes));
1511
7.48k
    auto name = TRY(parse_name());
1512
7.48k
    TRY(skip_whitespace(Required::Yes));
1513
    // PEDef ::= EntityValue | ExternalID
1514
7.47k
    if (auto result = parse_entity_value(); !result.is_error())
1515
5.89k
        definition = result.release_value();
1516
1.58k
    else
1517
1.58k
        definition = TRY(parse_external_id());
1518
1519
14.8k
    TRY(skip_whitespace());
1520
14.8k
    TRY(expect(">"sv));
1521
1522
7.40k
    rollback.disarm();
1523
7.40k
    return PEDeclaration {
1524
7.40k
        move(name),
1525
7.40k
        move(definition).downcast<ByteString, ExternalID>(),
1526
7.40k
    };
1527
14.8k
}
1528
1529
// 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
1530
ErrorOr<PublicID, ParseError> Parser::parse_public_id()
1531
4.48k
{
1532
4.48k
    auto rollback = rollback_point();
1533
4.48k
    auto rule = enter_rule();
1534
1535
    // PublicID ::= 'PUBLIC' S PubidLiteral
1536
4.48k
    TRY(expect("PUBLIC"sv));
1537
4.44k
    auto accept = accept_rule();
1538
1539
4.44k
    TRY(skip_whitespace(Required::Yes));
1540
4.43k
    auto text = TRY(parse_public_id_literal());
1541
1542
4.31k
    rollback.disarm();
1543
4.31k
    return PublicID {
1544
4.31k
        text,
1545
4.31k
    };
1546
4.43k
}
1547
1548
// Character set consulted by parse_public_id_literal() for PubidChar: space (0x20), CR (0x0d),
// LF (0x0a), the punctuation listed in the string below, and the ranges a-z, A-Z and 0-9.
constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>());
1549
1550
// 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
1551
ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
1552
14.3k
{
1553
14.3k
    auto rollback = rollback_point();
1554
14.3k
    auto rule = enter_rule();
1555
1556
    // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1557
14.3k
    auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
1558
14.2k
    auto accept = accept_rule();
1559
1560
14.2k
    auto id = TRY(expect_many(
1561
14.2k
        [q = quote[0]](auto x) {
1562
14.2k
            return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x);
1563
14.2k
        },
1564
14.2k
        "a PubidChar"sv,
1565
14.2k
        true));
1566
14.2k
    TRY(expect(quote));
1567
1568
13.8k
    rollback.disarm();
1569
13.8k
    return id;
1570
14.2k
}
1571
1572
// 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
1573
ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
1574
23.9k
{
1575
23.9k
    auto rollback = rollback_point();
1576
23.9k
    auto rule = enter_rule();
1577
1578
    // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1579
23.9k
    auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
1580
23.9k
    auto accept = accept_rule();
1581
1582
23.9k
    auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv, true));
1583
23.9k
    TRY(expect(quote));
1584
1585
23.8k
    rollback.disarm();
1586
23.8k
    return id;
1587
23.9k
}
1588
1589
// 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
1590
ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
1591
29.4k
{
1592
29.4k
    auto rollback = rollback_point();
1593
29.4k
    auto rule = enter_rule();
1594
1595
    // ExternalID ::= 'SYSTEM' S SystemLiteral
1596
    //              | 'PUBLIC' S PubidLiteral S SystemLiteral
1597
29.4k
    Optional<PublicID> public_id;
1598
29.4k
    SystemID system_id;
1599
1600
29.4k
    if (m_lexer.consume_specific("SYSTEM"sv)) {
1601
18.7k
        auto accept = accept_rule();
1602
18.7k
        TRY(skip_whitespace(Required::Yes));
1603
18.7k
        system_id = SystemID { TRY(parse_system_id_literal()) };
1604
18.7k
    } else {
1605
10.6k
        TRY(expect("PUBLIC"sv));
1606
9.86k
        auto accept = accept_rule();
1607
1608
9.86k
        TRY(skip_whitespace(Required::Yes));
1609
9.86k
        public_id = PublicID { TRY(parse_public_id_literal()) };
1610
9.52k
        TRY(skip_whitespace(Required::Yes));
1611
5.21k
        system_id = SystemID { TRY(parse_system_id_literal()) };
1612
5.18k
    }
1613
1614
29.4k
    rollback.disarm();
1615
23.8k
    return ExternalID {
1616
23.8k
        move(public_id),
1617
23.8k
        move(system_id),
1618
23.8k
    };
1619
29.4k
}
1620
1621
// 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
1622
ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
1623
10.3k
{
1624
10.3k
    auto rollback = rollback_point();
1625
10.3k
    auto rule = enter_rule();
1626
1627
    // NDataDecl ::= S 'NDATA' S Name
1628
10.3k
    TRY(skip_whitespace(Required::Yes));
1629
8.49k
    auto accept = accept_rule();
1630
1631
8.49k
    TRY(expect("NDATA"sv));
1632
6.90k
    TRY(skip_whitespace(Required::Yes));
1633
6.90k
    auto name = TRY(parse_name());
1634
1635
6.90k
    rollback.disarm();
1636
6.90k
    return name;
1637
6.90k
}
1638
1639
// 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
1640
ErrorOr<ByteString, ParseError> Parser::parse_entity_value()
1641
112k
{
1642
112k
    auto rollback = rollback_point();
1643
112k
    auto rule = enter_rule();
1644
112k
    StringBuilder builder;
1645
1646
    // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
1647
    //               |  "'" ([^%&'] | PEReference | Reference)* "'"
1648
112k
    auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
1649
100k
    auto accept = accept_rule();
1650
1651
9.37M
    while (true) {
1652
9.37M
        if (m_lexer.is_eof())
1653
88
            break;
1654
9.37M
        if (m_lexer.next_is(quote))
1655
100k
            break;
1656
9.27M
        if (m_lexer.next_is('%')) {
1657
5.57k
            auto start = m_lexer.tell();
1658
            // FIXME: Resolve this PEReference.
1659
5.57k
            TRY(parse_parameter_entity_reference());
1660
5.53k
            builder.append(m_source.substring_view(start, m_lexer.tell() - start));
1661
5.53k
            continue;
1662
5.57k
        }
1663
9.26M
        if (m_lexer.next_is('&')) {
1664
2.26k
            auto reference = TRY(parse_reference());
1665
2.24k
            if (auto char_reference = reference.get_pointer<ByteString>())
1666
1.75k
                builder.append(*char_reference);
1667
491
            else
1668
491
                builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
1669
2.24k
            continue;
1670
2.24k
        }
1671
9.26M
        builder.append(m_lexer.consume());
1672
9.26M
    }
1673
100k
    TRY(expect(quote));
1674
1675
100k
    rollback.disarm();
1676
100k
    return builder.to_byte_string();
1677
100k
}
1678
1679
// 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
1680
ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
1681
4.40M
{
1682
4.40M
    auto rollback = rollback_point();
1683
4.40M
    auto rule = enter_rule();
1684
1685
    // CDSect ::= CDStart CData CDEnd
1686
    // CDStart ::= '<![CDATA['
1687
    // CData ::= (Char* - (Char* ']]>' Char*))
1688
    // CDEnd ::= ']]>'
1689
4.40M
    TRY(expect("<![CDATA["sv));
1690
1.64k
    auto accept = accept_rule();
1691
1692
1.64k
    auto section_start = m_lexer.tell();
1693
1.79M
    while (!m_lexer.next_is("]]>")) {
1694
1.79M
        if (m_lexer.is_eof())
1695
548
            break;
1696
1.78M
        m_lexer.ignore();
1697
1.78M
    }
1698
1.64k
    auto section_end = m_lexer.tell();
1699
1.64k
    TRY(expect("]]>"sv));
1700
1701
1.09k
    rollback.disarm();
1702
1.09k
    return m_source.substring_view(section_start, section_end - section_start);
1703
1.64k
}
1704
1705
// 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
1706
ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
1707
0
{
1708
0
    auto rollback = rollback_point();
1709
0
    auto rule = enter_rule();
1710
1711
    // extSubset ::= TextDecl? extSubsetDecl
1712
0
    (void)parse_text_declaration();
1713
0
    auto result = TRY(parse_external_subset_declaration());
1714
1715
0
    rollback.disarm();
1716
0
    return result;
1717
0
}
1718
1719
// 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
1720
ErrorOr<void, ParseError> Parser::parse_text_declaration()
1721
0
{
1722
0
    auto rollback = rollback_point();
1723
0
    auto rule = enter_rule();
1724
1725
    // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
1726
0
    TRY(expect("<?xml"sv));
1727
0
    auto accept = accept_rule();
1728
1729
0
    (void)parse_version_info();
1730
0
    TRY(parse_encoding_decl());
1731
0
    TRY(skip_whitespace());
1732
0
    TRY(expect("?>"sv));
1733
1734
0
    rollback.disarm();
1735
0
    return {};
1736
0
}
1737
1738
// Resolves a general entity reference.
// The replacement text is taken from the matching general entity declaration in
// the doctype (if any); without one, the predefined entities amp, lt, gt, apos
// and quot are answered directly. The replacement text is then parsed with a
// temporary source and lexer: for AttributeValue the parsed attribute text is
// returned, for Content the content is parsed and an empty string is returned.
// A reference whose name is already being resolved is rejected as recursive.
ErrorOr<ByteString, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
{
    // Names currently being resolved; used to detect recursive definitions.
    // NOTE(review): this is a function-local static, so it is shared by every call
    // to this function -- confirm that is acceptable when several parsers are in use.
    static HashTable<Name> reference_lookup {};
    if (reference_lookup.contains(reference.name))
        return parse_error(m_lexer.current_position(), ByteString::formatted("Invalid recursive definition for '{}'", reference.name));

    reference_lookup.set(reference.name);
    ScopeGuard remove_lookup {
        [&] {
            reference_lookup.remove(reference.name);
        }
    };

    Optional<ByteString> resolved;
    if (m_doctype.has_value()) {
        // FIXME: Split these up and resolve them ahead of time.
        for (auto& declaration : m_doctype->markup_declarations) {
            // Only general entity declarations carrying the referenced name are of interest.
            auto* entity = declaration.get_pointer<EntityDeclaration>();
            if (!entity)
                continue;
            auto* general_entity = entity->get_pointer<GEDeclaration>();
            if (!general_entity || general_entity->name != reference.name)
                continue;

            TRY(general_entity->definition.visit(
                [&](ByteString const& literal_text) -> ErrorOr<void, ParseError> {
                    resolved = literal_text;
                    return {};
                },
                [&](EntityDefinition const& external) -> ErrorOr<void, ParseError> {
                    if (placement == ReferencePlacement::AttributeValue)
                        return parse_error(m_lexer.current_position(), ByteString::formatted("Attribute references external entity '{}'", reference.name));

                    if (external.notation.has_value())
                        return parse_error(m_lexer.position_for(0), ByteString::formatted("Entity reference to unparsed entity '{}'", reference.name));

                    if (!m_options.resolve_external_resource)
                        return parse_error(m_lexer.position_for(0), ByteString::formatted("Failed to resolve external entity '{}'", reference.name));

                    auto resource = m_options.resolve_external_resource(external.id.system_id, external.id.public_id);
                    if (resource.is_error())
                        return parse_error(m_lexer.position_for(0), ByteString::formatted("Failed to resolve external entity '{}': {}", reference.name, resource.error()));

                    if (!resource.value().has<ByteString>())
                        return parse_error(m_lexer.position_for(0), ByteString::formatted("Failed to resolve external entity '{}': Resource is of the wrong type", reference.name));

                    resolved = resource.release_value().get<ByteString>();
                    return {};
                }));
            // The first declaration with a matching name wins.
            break;
        }
    }

    if (!resolved.has_value()) {
        // No declaration found: fall back to the predefined entities.
        if (reference.name == "amp")
            return "&";
        if (reference.name == "lt")
            return "<";
        if (reference.name == "gt")
            return ">";
        if (reference.name == "apos")
            return "'";
        if (reference.name == "quot")
            return "\"";
        return parse_error(m_lexer.position_for(0), ByteString::formatted("Reference to undeclared entity '{}'", reference.name));
    }

    // Parse the replacement text with a temporary source and lexer.
    StringView resolved_source = *resolved;
    TemporaryChange source { m_source, resolved_source };
    TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) };

    if (placement == ReferencePlacement::AttributeValue)
        return TRY(parse_attribute_value_inner(""sv));

    if (placement == ReferencePlacement::Content) {
        TRY(parse_content());
        return "";
    }

    VERIFY_NOT_REACHED();
}
1819
1820
}