/src/serenity/Userland/Libraries/LibXML/Parser/Parser.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <LibXML/DOM/Document.h> |
8 | | #include <LibXML/Parser/Parser.h> |
9 | | |
10 | | struct Range { |
11 | | consteval Range(u32 start, u32 end) |
12 | | : start(start) |
13 | | , end(end) |
14 | 0 | { |
15 | 0 | } |
16 | | |
17 | | u32 start; |
18 | | u32 end; |
19 | | }; |
20 | | |
21 | | template<auto... ranges> |
22 | | struct ranges_for_search { |
23 | | auto contains(u32 value) const |
24 | 151M | { |
25 | 3.11G | return ((value >= ranges.start && value <= ranges.end) || ...); |
26 | 151M | } ranges_for_search<Range{1u, 8u}, Range{11u, 12u}, Range{14u, 31u}, Range{127u, 132u}, Range{134u, 159u}>::contains(unsigned int) constLine | Count | Source | 24 | 2.13M | { | 25 | 17.0M | return ((value >= ranges.start && value <= ranges.end) || ...); | 26 | 2.13M | } |
ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}>::contains(unsigned int) constLine | Count | Source | 24 | 54.8M | { | 25 | 623M | return ((value >= ranges.start && value <= ranges.end) || ...); | 26 | 54.8M | } |
ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}, Range{45u, 45u}, Range{46u, 46u}, Range{48u, 57u}, Range{183u, 183u}, Range{768u, 879u}, Range{8255u, 8256u}>::contains(unsigned int) constLine | Count | Source | 24 | 86.7M | { | 25 | 2.31G | return ((value >= ranges.start && value <= ranges.end) || ...); | 26 | 86.7M | } |
ranges_for_search<Range{1u, 55295u}, Range{57344u, 65533u}, Range{65536u, 1114111u}>::contains(unsigned int) constLine | Count | Source | 24 | 14.6k | { | 25 | 30.0k | return ((value >= ranges.start && value <= ranges.end) || ...); | 26 | 14.6k | } |
ranges_for_search<Range{48u, 57u}, Range{97u, 102u}, Range{65u, 70u}>::contains(unsigned int) constLine | Count | Source | 24 | 2.07M | { | 25 | 6.19M | return ((value >= ranges.start && value <= ranges.end) || ...); | 26 | 2.07M | } |
ranges_for_search<Range{48u, 57u}>::contains(unsigned int) constLine | Count | Source | 24 | 2.25M | { | 25 | 2.25M | return ((value >= ranges.start && value <= ranges.end) || ...); | 26 | 2.25M | } |
ranges_for_search<Range{32u, 32u}, Range{13u, 13u}, Range{10u, 10u}, Range{45u, 45u}, Range{39u, 39u}, Range{40u, 40u}, Range{41u, 41u}, Range{43u, 43u}, Range{44u, 44u}, Range{46u, 46u}, Range{47u, 47u}, Range{58u, 58u}, Range{61u, 61u}, Range{63u, 63u}, Range{59u, 59u}, Range{33u, 33u}, Range{42u, 42u}, Range{35u, 35u}, Range{64u, 64u}, Range{36u, 36u}, Range{95u, 95u}, Range{37u, 37u}, Range{97u, 122u}, Range{65u, 90u}, Range{48u, 57u}>::contains(unsigned int) constLine | Count | Source | 24 | 3.78M | { | 25 | 150M | return ((value >= ranges.start && value <= ranges.end) || ...); | 26 | 3.78M | } |
|
27 | | |
28 | | bool operator()(u32 value) const |
29 | 10.4M | { |
30 | 10.4M | return contains(value); |
31 | 10.4M | } ranges_for_search<Range{1u, 8u}, Range{11u, 12u}, Range{14u, 31u}, Range{127u, 132u}, Range{134u, 159u}>::operator()(unsigned int) constLine | Count | Source | 29 | 2.13M | { | 30 | 2.13M | return contains(value); | 31 | 2.13M | } |
ranges_for_search<Range{48u, 57u}, Range{97u, 102u}, Range{65u, 70u}>::operator()(unsigned int) constLine | Count | Source | 29 | 2.07M | { | 30 | 2.07M | return contains(value); | 31 | 2.07M | } |
ranges_for_search<Range{48u, 57u}>::operator()(unsigned int) constLine | Count | Source | 29 | 2.25M | { | 30 | 2.25M | return contains(value); | 31 | 2.25M | } |
ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}, Range{45u, 45u}, Range{46u, 46u}, Range{48u, 57u}, Range{183u, 183u}, Range{768u, 879u}, Range{8255u, 8256u}>::operator()(unsigned int) constLine | Count | Source | 29 | 4.03M | { | 30 | 4.03M | return contains(value); | 31 | 4.03M | } |
|
32 | | |
33 | | template<auto... ranges_to_include> |
34 | | consteval auto with() const |
35 | | { |
36 | | return ranges_for_search<ranges..., ranges_to_include...>(); |
37 | | } |
38 | | |
39 | | template<auto... ranges_to_include> |
40 | | consteval auto unify(ranges_for_search<ranges_to_include...> const&) const |
41 | | { |
42 | | return ranges_for_search<ranges..., ranges_to_include...>(); |
43 | | } |
44 | | }; |
45 | | |
46 | | template<size_t Count, typename Element> |
47 | | struct StringSet { |
48 | | consteval StringSet(Element const (&entries)[Count]) |
49 | | { |
50 | | for (size_t i = 0; i < Count - 1; ++i) |
51 | | elements[i] = entries[i]; |
52 | | } |
53 | | |
54 | | consteval auto operator[](size_t i) const { return elements[i]; } |
55 | | |
56 | | Element elements[Count - 1]; |
57 | | }; |
58 | | |
59 | | template<StringSet chars> |
60 | | consteval static auto set_to_search() |
61 | | { |
62 | | return ([&]<auto... Ix>(IndexSequence<Ix...>) { |
63 | | return ranges_for_search<Range(chars[Ix], chars[Ix])...>(); |
64 | | }(MakeIndexSequence<array_size(chars.elements)>())); |
65 | | } |
66 | | |
67 | | namespace XML { |
68 | | |
69 | | size_t Parser::s_debug_indent_level { 0 }; |
70 | | |
71 | | void Parser::append_node(NonnullOwnPtr<Node> node) |
72 | 7.08M | { |
73 | 7.08M | if (m_entered_node) { |
74 | 7.08M | auto& entered_element = m_entered_node->content.get<Node::Element>(); |
75 | 7.08M | entered_element.children.append(move(node)); |
76 | 7.08M | enter_node(*entered_element.children.last()); |
77 | 7.08M | } else { |
78 | 2.49k | m_root_node = move(node); |
79 | 2.49k | enter_node(*m_root_node); |
80 | 2.49k | } |
81 | 7.08M | } |
82 | | |
83 | | void Parser::append_text(StringView text, LineTrackingLexer::Position position) |
84 | 7.92M | { |
85 | 7.92M | if (m_listener) { |
86 | 0 | m_listener->text(text); |
87 | 0 | return; |
88 | 0 | } |
89 | | |
90 | 7.92M | if (!m_entered_node) { |
91 | 0 | Node::Text node; |
92 | 0 | node.builder.append(text); |
93 | 0 | m_root_node = make<Node>(position, move(node)); |
94 | 0 | return; |
95 | 0 | } |
96 | | |
97 | 7.92M | m_entered_node->content.visit( |
98 | 7.92M | [&](Node::Element& node) { |
99 | 7.92M | if (!node.children.is_empty()) { |
100 | 3.84M | auto* text_node = node.children.last()->content.get_pointer<Node::Text>(); |
101 | 3.84M | if (text_node) { |
102 | 652k | text_node->builder.append(text); |
103 | 652k | return; |
104 | 652k | } |
105 | 3.84M | } |
106 | 7.26M | Node::Text text_node; |
107 | 7.26M | text_node.builder.append(text); |
108 | 7.26M | node.children.append(make<Node>(position, move(text_node), m_entered_node)); |
109 | 7.26M | }, |
110 | 7.92M | [&](auto&) { |
111 | | // Can't enter a text or comment node. |
112 | 0 | VERIFY_NOT_REACHED(); |
113 | 0 | }); Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_text(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Text>(XML::Node::Text&) const Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_text(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Comment>(XML::Node::Comment&) const |
114 | 7.92M | } |
115 | | |
116 | | void Parser::append_comment(StringView text, LineTrackingLexer::Position position) |
117 | 0 | { |
118 | 0 | if (m_listener) { |
119 | 0 | m_listener->comment(text); |
120 | 0 | return; |
121 | 0 | } |
122 | | |
123 | | // If there's no node to attach this to, drop it on the floor. |
124 | | // This can happen to comments in the prolog. |
125 | 0 | if (!m_entered_node) |
126 | 0 | return; |
127 | | |
128 | 0 | m_entered_node->content.visit( |
129 | 0 | [&](Node::Element& node) { |
130 | 0 | node.children.append(make<Node>(position, Node::Comment { text }, m_entered_node)); |
131 | 0 | }, |
132 | 0 | [&](auto&) { |
133 | | // Can't enter a text or comment node. |
134 | 0 | VERIFY_NOT_REACHED(); |
135 | 0 | }); Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_comment(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Text>(XML::Node::Text&) const Unexecuted instantiation: Parser.cpp:auto XML::Parser::append_comment(AK::StringView, AK::LineTrackingLexer::Position)::$_1::operator()<XML::Node::Comment>(XML::Node::Comment&) const |
136 | 0 | } |
137 | | |
138 | | void Parser::enter_node(Node& node) |
139 | 7.08M | { |
140 | 7.08M | if (m_listener) { |
141 | 0 | auto& element = node.content.get<Node::Element>(); |
142 | 0 | m_listener->element_start(element.name, element.attributes); |
143 | 0 | } |
144 | | |
145 | 7.08M | if (&node != m_root_node.ptr()) |
146 | 7.08M | node.parent = m_entered_node; |
147 | 7.08M | m_entered_node = &node; |
148 | 7.08M | } |
149 | | |
150 | | void Parser::leave_node() |
151 | 7.08M | { |
152 | 7.08M | if (m_listener) { |
153 | 0 | auto& element = m_entered_node->content.get<Node::Element>(); |
154 | 0 | m_listener->element_end(element.name); |
155 | 0 | } |
156 | | |
157 | 7.08M | m_entered_node = m_entered_node->parent; |
158 | 7.08M | } |
159 | | |
160 | | ErrorOr<Document, ParseError> Parser::parse() |
161 | 8.69k | { |
162 | 8.69k | if (auto result = parse_internal(); result.is_error()) { |
163 | 8.59k | if (m_parse_errors.is_empty()) |
164 | 1.02k | return result.release_error(); |
165 | 7.57k | return m_parse_errors.take_first(); |
166 | 8.59k | } |
167 | 108 | return Document { |
168 | 108 | m_root_node.release_nonnull(), |
169 | 108 | move(m_doctype), |
170 | 108 | move(m_processing_instructions), |
171 | 108 | m_version, |
172 | 108 | }; |
173 | 8.69k | } |
174 | | |
175 | | ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener) |
176 | 0 | { |
177 | 0 | m_listener = &listener; |
178 | 0 | ScopeGuard unset_listener { [this] { m_listener = nullptr; } }; |
179 | 0 | m_listener->set_source(m_source); |
180 | 0 | m_listener->document_start(); |
181 | 0 | auto result = parse_internal(); |
182 | 0 | if (result.is_error()) |
183 | 0 | m_listener->error(result.error()); |
184 | 0 | m_listener->document_end(); |
185 | 0 | if (m_doctype.has_value()) { |
186 | 0 | m_listener->set_doctype(m_doctype.release_value()); |
187 | 0 | } |
188 | 0 | m_root_node.clear(); |
189 | 0 | return result; |
190 | 0 | } |
191 | | |
192 | | // 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S |
193 | | ErrorOr<void, ParseError> Parser::skip_whitespace(Required required) |
194 | 113M | { |
195 | 113M | auto rollback = rollback_point(); |
196 | 113M | auto rule = enter_rule(); |
197 | | |
198 | | // S ::= (#x20 | #x9 | #xD | #xA)+ |
199 | 113M | auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv)); |
200 | 113M | if (required == Required::Yes && matched.is_empty()) |
201 | 8.27M | return parse_error(m_lexer.current_position(), Expectation { "whitespace"sv }); |
202 | | |
203 | 105M | rollback.disarm(); |
204 | 105M | return {}; |
205 | 113M | } |
206 | | |
207 | | // 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar |
208 | | constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>(); |
209 | | |
210 | | // 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed |
211 | | ErrorOr<void, ParseError> Parser::parse_internal() |
212 | 8.69k | { |
213 | 8.69k | auto rule = enter_rule(); |
214 | | |
215 | | // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) |
216 | 8.69k | TRY(parse_prolog()); |
217 | 8.69k | TRY(parse_element()); |
218 | 671 | while (true) { |
219 | 671 | if (auto result = parse_misc(); result.is_error()) |
220 | 190 | break; |
221 | 671 | } |
222 | | |
223 | 190 | auto matched_source = m_source.substring_view(0, m_lexer.tell()); |
224 | 190 | if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) { |
225 | 55 | return parse_error( |
226 | 55 | m_lexer.position_for(it.index()), |
227 | 55 | ByteString::formatted("Invalid character #{:x} used in document", *it)); |
228 | 55 | } |
229 | | |
230 | 135 | if (!m_lexer.is_eof()) |
231 | 27 | return parse_error(m_lexer.current_position(), ByteString { "Garbage after document"sv }); |
232 | | |
233 | 108 | return {}; |
234 | 135 | } |
235 | | |
236 | | ErrorOr<void, ParseError> Parser::expect(StringView expected) |
237 | 108M | { |
238 | 108M | auto rollback = rollback_point(); |
239 | | |
240 | 108M | if (!m_lexer.consume_specific(expected)) { |
241 | 36.9M | if (m_options.treat_errors_as_fatal) |
242 | 36.9M | return parse_error(m_lexer.current_position(), ByteString::formatted("Expected '{}'", expected)); |
243 | 36.9M | } |
244 | | |
245 | 71.7M | rollback.disarm(); |
246 | 71.7M | return {}; |
247 | 108M | } |
248 | | |
249 | | template<typename Pred> |
250 | | requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description) |
251 | 17.1M | { |
252 | 17.1M | auto rollback = rollback_point(); |
253 | 17.1M | auto start = m_lexer.tell(); |
254 | 17.1M | if (!m_lexer.next_is(predicate)) { |
255 | 24.2k | if (m_options.treat_errors_as_fatal) |
256 | 24.2k | return parse_error(m_lexer.current_position(), Expectation { description }); |
257 | 24.2k | } |
258 | | |
259 | 17.1M | m_lexer.ignore(); |
260 | 17.1M | rollback.disarm(); |
261 | 17.1M | return m_source.substring_view(start, m_lexer.tell() - start); |
262 | 17.1M | } |
263 | | |
264 | | template<typename Pred> |
265 | | requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description, bool allow_empty) |
266 | 153k | { |
267 | 153k | auto rollback = rollback_point(); |
268 | 153k | auto start = m_lexer.tell(); |
269 | 27.0M | while (m_lexer.next_is(predicate)) { |
270 | 26.8M | if (m_lexer.is_eof()) |
271 | 54 | break; |
272 | 26.8M | m_lexer.ignore(); |
273 | 26.8M | } |
274 | | |
275 | 153k | if (m_lexer.tell() == start && !allow_empty) { |
276 | 1.39k | if (m_options.treat_errors_as_fatal) { |
277 | 1.39k | return parse_error(m_lexer.current_position(), Expectation { description }); |
278 | 1.39k | } |
279 | 1.39k | } |
280 | | |
281 | 152k | rollback.disarm(); |
282 | 152k | return m_source.substring_view(start, m_lexer.tell() - start); |
283 | 153k | } _ZN3XML6Parser11expect_manyI17ranges_for_searchIJXtl5RangeLj48ELj57EEEXtlS3_Lj97ELj102EEEXtlS3_Lj65ELj70EEEEEQ23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS6_10StringViewENS_10ParseErrorEEES5_S8_b Line | Count | Source | 266 | 7.85k | { | 267 | 7.85k | auto rollback = rollback_point(); | 268 | 7.85k | auto start = m_lexer.tell(); | 269 | 2.07M | while (m_lexer.next_is(predicate)) { | 270 | 2.06M | if (m_lexer.is_eof()) | 271 | 0 | break; | 272 | 2.06M | m_lexer.ignore(); | 273 | 2.06M | } | 274 | | | 275 | 7.85k | if (m_lexer.tell() == start && !allow_empty) { | 276 | 884 | if (m_options.treat_errors_as_fatal) { | 277 | 884 | return parse_error(m_lexer.current_position(), Expectation { description }); | 278 | 884 | } | 279 | 884 | } | 280 | | | 281 | 6.96k | rollback.disarm(); | 282 | 6.96k | return m_source.substring_view(start, m_lexer.tell() - start); | 283 | 7.85k | } |
_ZN3XML6Parser11expect_manyI17ranges_for_searchIJXtl5RangeLj48ELj57EEEEEQ23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS6_10StringViewENS_10ParseErrorEEES5_S8_b Line | Count | Source | 266 | 8.84k | { | 267 | 8.84k | auto rollback = rollback_point(); | 268 | 8.84k | auto start = m_lexer.tell(); | 269 | 2.25M | while (m_lexer.next_is(predicate)) { | 270 | 2.24M | if (m_lexer.is_eof()) | 271 | 0 | break; | 272 | 2.24M | m_lexer.ignore(); | 273 | 2.24M | } | 274 | | | 275 | 8.84k | if (m_lexer.tell() == start && !allow_empty) { | 276 | 453 | if (m_options.treat_errors_as_fatal) { | 277 | 453 | return parse_error(m_lexer.current_position(), Expectation { description }); | 278 | 453 | } | 279 | 453 | } | 280 | | | 281 | 8.39k | rollback.disarm(); | 282 | 8.39k | return m_source.substring_view(start, m_lexer.tell() - start); | 283 | 8.84k | } |
_ZN3XML6Parser11expect_manyI17ranges_for_searchIJXtl5RangeLj58ELj58EEEXtlS3_Lj65ELj90EEEXtlS3_Lj95ELj95EEEXtlS3_Lj97ELj122EEEXtlS3_Lj192ELj214EEEXtlS3_Lj216ELj246EEEXtlS3_Lj248ELj767EEEXtlS3_Lj880ELj893EEEXtlS3_Lj895ELj8191EEEXtlS3_Lj8204ELj8205EEEXtlS3_Lj8304ELj8591EEEXtlS3_Lj11264ELj12271EEEXtlS3_Lj12289ELj55295EEEXtlS3_Lj63744ELj64975EEEXtlS3_Lj65008ELj65533EEEXtlS3_Lj65536ELj983039EEEXtlS3_Lj45ELj45EEEXtlS3_Lj46ELj46EEEXtlS3_Lj48ELj57EEEXtlS3_Lj183ELj183EEEXtlS3_Lj768ELj879EEEXtlS3_Lj8255ELj8256EEEEEQ23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS6_10StringViewENS_10ParseErrorEEES5_S8_b Line | Count | Source | 266 | 98.7k | { | 267 | 98.7k | auto rollback = rollback_point(); | 268 | 98.7k | auto start = m_lexer.tell(); | 269 | 4.03M | while (m_lexer.next_is(predicate)) { | 270 | 3.93M | if (m_lexer.is_eof()) | 271 | 0 | break; | 272 | 3.93M | m_lexer.ignore(); | 273 | 3.93M | } | 274 | | | 275 | 98.7k | if (m_lexer.tell() == start && !allow_empty) { | 276 | 60 | if (m_options.treat_errors_as_fatal) { | 277 | 60 | return parse_error(m_lexer.current_position(), Expectation { description }); | 278 | 60 | } | 279 | 60 | } | 280 | | | 281 | 98.6k | rollback.disarm(); | 282 | 98.6k | return m_source.substring_view(start, m_lexer.tell() - start); | 283 | 98.7k | } |
Parser.cpp:_ZN3XML6Parser11expect_manyIZNS0_23parse_public_id_literalEvE3$_0Q23IsCallableWithArgumentsIT_bcEEEN2AK7ErrorOrINS4_10StringViewENS_10ParseErrorEEES3_S6_b Line | Count | Source | 266 | 14.2k | { | 267 | 14.2k | auto rollback = rollback_point(); | 268 | 14.2k | auto start = m_lexer.tell(); | 269 | 3.78M | while (m_lexer.next_is(predicate)) { | 270 | 3.77M | if (m_lexer.is_eof()) | 271 | 0 | break; | 272 | 3.77M | m_lexer.ignore(); | 273 | 3.77M | } | 274 | | | 275 | 14.2k | if (m_lexer.tell() == start && !allow_empty) { | 276 | 0 | if (m_options.treat_errors_as_fatal) { | 277 | 0 | return parse_error(m_lexer.current_position(), Expectation { description }); | 278 | 0 | } | 279 | 0 | } | 280 | | | 281 | 14.2k | rollback.disarm(); | 282 | 14.2k | return m_source.substring_view(start, m_lexer.tell() - start); | 283 | 14.2k | } |
_ZN3XML6Parser11expect_manyIZN2AK13is_not_any_ofENS2_10StringViewEEUlT_E_Q23IsCallableWithArgumentsIS4_bcEEENS2_7ErrorOrIS3_NS_10ParseErrorEEES4_S3_b Line | Count | Source | 266 | 23.9k | { | 267 | 23.9k | auto rollback = rollback_point(); | 268 | 23.9k | auto start = m_lexer.tell(); | 269 | 14.8M | while (m_lexer.next_is(predicate)) { | 270 | 14.8M | if (m_lexer.is_eof()) | 271 | 54 | break; | 272 | 14.8M | m_lexer.ignore(); | 273 | 14.8M | } | 274 | | | 275 | 23.9k | if (m_lexer.tell() == start && !allow_empty) { | 276 | 0 | if (m_options.treat_errors_as_fatal) { | 277 | 0 | return parse_error(m_lexer.current_position(), Expectation { description }); | 278 | 0 | } | 279 | 0 | } | 280 | | | 281 | 23.9k | rollback.disarm(); | 282 | 23.9k | return m_source.substring_view(start, m_lexer.tell() - start); | 283 | 23.9k | } |
|
284 | | |
285 | | // 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog |
286 | | ErrorOr<void, ParseError> Parser::parse_prolog() |
287 | 8.69k | { |
288 | 8.69k | auto rollback = rollback_point(); |
289 | 8.69k | auto rule = enter_rule(); |
290 | | |
291 | | // prolog ::= XMLDecl Misc* (doctypedecl Misc*)? |
292 | | // The following is valid in XML 1.0. |
293 | | // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? |
294 | 8.69k | if (auto result = parse_xml_decl(); result.is_error()) { |
295 | 8.56k | m_version = Version::Version10; |
296 | 8.56k | m_in_compatibility_mode = true; |
297 | 8.56k | } |
298 | 8.69k | auto accept = accept_rule(); |
299 | | |
300 | 12.8k | while (true) { |
301 | 12.8k | if (auto result = parse_misc(); result.is_error()) |
302 | 8.69k | break; |
303 | 12.8k | } |
304 | | |
305 | 8.69k | if (auto result = parse_doctype_decl(); !result.is_error()) { |
306 | 1.66k | while (true) { |
307 | 1.66k | if (auto result = parse_misc(); result.is_error()) |
308 | 1.45k | break; |
309 | 1.66k | } |
310 | 1.45k | } |
311 | | |
312 | 8.69k | rollback.disarm(); |
313 | 8.69k | return {}; |
314 | 8.69k | } |
315 | | |
316 | | // 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl |
317 | | ErrorOr<void, ParseError> Parser::parse_xml_decl() |
318 | 8.69k | { |
319 | 8.69k | auto rollback = rollback_point(); |
320 | 8.69k | auto rule = enter_rule(); |
321 | | |
322 | | // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' |
323 | | |
324 | 8.69k | TRY(expect("<?xml"sv)); |
325 | 404 | auto accept = accept_rule(); |
326 | | |
327 | 404 | TRY(parse_version_info()); |
328 | 250 | (void)parse_encoding_decl(); |
329 | 250 | (void)parse_standalone_document_decl(); |
330 | 250 | TRY(skip_whitespace()); |
331 | 250 | TRY(expect("?>"sv)); |
332 | | |
333 | 129 | rollback.disarm(); |
334 | 129 | return {}; |
335 | 250 | } |
336 | | |
337 | | // 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo |
338 | | ErrorOr<void, ParseError> Parser::parse_version_info() |
339 | 404 | { |
340 | 404 | auto rollback = rollback_point(); |
341 | 404 | auto rule = enter_rule(); |
342 | | |
343 | | // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') |
344 | 404 | TRY(skip_whitespace(Required::Yes)); |
345 | 400 | TRY(expect("version"sv)); |
346 | 397 | auto accept = accept_rule(); |
347 | | |
348 | 397 | TRY(parse_eq()); |
349 | 393 | TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); |
350 | 392 | m_lexer.retreat(); |
351 | | |
352 | 392 | auto version_string = m_lexer.consume_quoted_string(); |
353 | 392 | if (version_string == "1.0") { |
354 | | // FIXME: Compatibility mode, figure out which rules are different in XML 1.0. |
355 | 194 | m_version = Version::Version10; |
356 | 194 | m_in_compatibility_mode = true; |
357 | 198 | } else { |
358 | 198 | if (version_string != "1.1" && m_options.treat_errors_as_fatal) |
359 | 142 | return parse_error(m_lexer.current_position(), ByteString::formatted("Expected '1.1', found '{}'", version_string)); |
360 | 198 | } |
361 | | |
362 | 250 | m_version = Version::Version11; |
363 | 250 | rollback.disarm(); |
364 | 250 | return {}; |
365 | 392 | } |
366 | | |
367 | | // 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq |
368 | | ErrorOr<void, ParseError> Parser::parse_eq() |
369 | 16.5M | { |
370 | 16.5M | auto rollback = rollback_point(); |
371 | 16.5M | auto rule = enter_rule(); |
372 | | |
373 | | // Eq ::= S? '=' S? |
374 | 16.5M | auto accept = accept_rule(); |
375 | 16.5M | TRY(skip_whitespace()); |
376 | 16.5M | TRY(expect("="sv)); |
377 | 16.4M | TRY(skip_whitespace()); |
378 | 16.4M | rollback.disarm(); |
379 | 16.4M | return {}; |
380 | 16.4M | } |
381 | | |
382 | | // 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl |
383 | | ErrorOr<void, ParseError> Parser::parse_encoding_decl() |
384 | 250 | { |
385 | 250 | auto rollback = rollback_point(); |
386 | 250 | auto rule = enter_rule(); |
387 | | |
388 | | // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) |
389 | 250 | TRY(skip_whitespace(Required::Yes)); |
390 | 233 | TRY(expect("encoding"sv)); |
391 | 149 | auto accept = accept_rule(); |
392 | | |
393 | 149 | TRY(parse_eq()); |
394 | 148 | TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); |
395 | 147 | m_lexer.retreat(); |
396 | | |
397 | | // FIXME: Actually do something with this encoding. |
398 | 147 | m_encoding = m_lexer.consume_quoted_string(); |
399 | | |
400 | 147 | rollback.disarm(); |
401 | 147 | return {}; |
402 | 148 | } |
403 | | |
404 | | // 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd |
405 | | ErrorOr<void, ParseError> Parser::parse_standalone_document_decl() |
406 | 250 | { |
407 | 250 | auto rollback = rollback_point(); |
408 | 250 | auto rule = enter_rule(); |
409 | | |
410 | | // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) |
411 | 250 | TRY(skip_whitespace(Required::Yes)); |
412 | 109 | TRY(expect("standalone"sv)); |
413 | 70 | auto accept = accept_rule(); |
414 | | |
415 | 70 | TRY(parse_eq()); |
416 | 68 | TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); |
417 | 66 | m_lexer.retreat(); |
418 | | |
419 | 66 | auto value = m_lexer.consume_quoted_string(); |
420 | 66 | if (!value.is_one_of("yes", "no")) |
421 | 64 | return parse_error(m_lexer.position_for(m_lexer.tell() - value.length()), Expectation { "one of 'yes' or 'no'"sv }); |
422 | | |
423 | 2 | m_standalone = value == "yes"; |
424 | | |
425 | 2 | rollback.disarm(); |
426 | 2 | return {}; |
427 | 66 | } |
428 | | |
429 | | // 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc |
430 | | ErrorOr<void, ParseError> Parser::parse_misc() |
431 | 15.1k | { |
432 | 15.1k | auto rollback = rollback_point(); |
433 | 15.1k | auto rule = enter_rule(); |
434 | | |
435 | | // Misc ::= Comment | PI | S |
436 | 15.1k | if (auto result = parse_comment(); !result.is_error()) { |
437 | 260 | rollback.disarm(); |
438 | 260 | return {}; |
439 | 260 | } |
440 | | |
441 | 14.9k | if (auto result = parse_processing_instruction(); !result.is_error()) { |
442 | 3.09k | rollback.disarm(); |
443 | 3.09k | return {}; |
444 | 3.09k | } |
445 | | |
446 | 11.8k | if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { |
447 | 1.46k | rollback.disarm(); |
448 | 1.46k | return {}; |
449 | 1.46k | } |
450 | | |
451 | 10.3k | return parse_error(m_lexer.current_position(), Expectation { "a match for 'Misc'"sv }); |
452 | 11.8k | } |
453 | | |
454 | | // 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment |
455 | | ErrorOr<void, ParseError> Parser::parse_comment() |
456 | 4.35M | { |
457 | 4.35M | auto rollback = rollback_point(); |
458 | 4.35M | auto rule = enter_rule(); |
459 | | |
460 | | // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' |
461 | 4.35M | auto comment_start = m_lexer.tell(); |
462 | 4.35M | TRY(expect("<!--"sv)); |
463 | 9.63k | auto accept = accept_rule(); |
464 | | |
465 | 9.63k | bool last_seen_a_dash = false; |
466 | | // FIXME: This should disallow surrogate blocks |
467 | 21.7M | auto text = m_lexer.consume_while([&](auto ch) { |
468 | 21.7M | if (ch != '-') { |
469 | 21.7M | last_seen_a_dash = false; |
470 | 21.7M | return true; |
471 | 21.7M | } |
472 | | |
473 | 27.1k | if (last_seen_a_dash) |
474 | 9.15k | return false; |
475 | | |
476 | 18.0k | last_seen_a_dash = true; |
477 | 18.0k | return true; |
478 | 27.1k | }); |
479 | | |
480 | 9.63k | if (last_seen_a_dash) { |
481 | 9.17k | m_lexer.retreat(); |
482 | 9.17k | text = text.substring_view(0, text.length() - 1); |
483 | 9.17k | } |
484 | | |
485 | 9.63k | TRY(expect("-->"sv)); |
486 | | |
487 | 9.09k | if (m_options.preserve_comments) |
488 | 0 | append_comment(text, m_lexer.position_for(comment_start)); |
489 | | |
490 | 9.09k | rollback.disarm(); |
491 | 9.09k | return {}; |
492 | 9.63k | } |
493 | | |
494 | | // 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI |
495 | | ErrorOr<void, ParseError> Parser::parse_processing_instruction() |
496 | 4.45M | { |
497 | 4.45M | auto rollback = rollback_point(); |
498 | 4.45M | auto rule = enter_rule(); |
499 | | |
500 | | // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' |
501 | 4.45M | TRY(expect("<?"sv)); |
502 | 101k | auto accept = accept_rule(); |
503 | | |
504 | 101k | auto target = TRY(parse_processing_instruction_target()); |
505 | 100k | ByteString data; |
506 | 100k | if (auto result = skip_whitespace(Required::Yes); !result.is_error()) |
507 | 56.4k | data = m_lexer.consume_until("?>"); |
508 | 100k | TRY(expect("?>"sv)); |
509 | | |
510 | 97.8k | m_processing_instructions.set(target, data); |
511 | 97.8k | rollback.disarm(); |
512 | 97.8k | return {}; |
513 | 100k | } |
514 | | |
515 | | // 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget |
516 | | ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target() |
517 | 101k | { |
518 | 101k | auto rollback = rollback_point(); |
519 | 101k | auto rule = enter_rule(); |
520 | | |
521 | | // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) |
522 | 101k | auto target = TRY(parse_name()); |
523 | 101k | auto accept = accept_rule(); |
524 | | |
525 | 101k | if (target.equals_ignoring_ascii_case("xml"sv) && m_options.treat_errors_as_fatal) { |
526 | 956 | return parse_error( |
527 | 956 | m_lexer.position_for(m_lexer.tell() - target.length()), |
528 | 956 | ByteString { "Use of the reserved 'xml' name for processing instruction target name is disallowed"sv }); |
529 | 956 | } |
530 | | |
531 | 100k | rollback.disarm(); |
532 | 100k | return target; |
533 | 101k | } |
534 | | |
535 | | // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] |
536 | | constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {}; |
537 | | |
538 | | // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] |
539 | | constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>(); |
540 | | |
541 | | // 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name |
542 | | ErrorOr<Name, ParseError> Parser::parse_name() |
543 | 56.4M | { |
544 | 56.4M | auto rollback = rollback_point(); |
545 | 56.4M | auto rule = enter_rule(); |
546 | | |
547 | | // Name ::= NameStartChar (NameChar)* |
548 | | |
549 | | // FIXME: This is a hacky workaround to read code points instead of bytes. |
550 | | // Replace this once we have a unicode-aware lexer. |
551 | 56.4M | auto start = m_lexer.tell(); |
552 | 56.4M | StringView remaining = m_lexer.input().substring_view(start); |
553 | 56.4M | Utf8View view { remaining }; |
554 | 56.4M | auto code_points = view.begin(); |
555 | 56.4M | if (code_points.done() || !s_name_start_characters.contains(*code_points)) { |
556 | 13.5M | if (m_options.treat_errors_as_fatal) |
557 | 13.5M | return parse_error(m_lexer.current_position(), Expectation { "a NameStartChar"sv }); |
558 | 13.5M | } |
559 | | |
560 | 42.8M | m_lexer.ignore(code_points.underlying_code_point_length_in_bytes()); |
561 | 42.8M | ++code_points; |
562 | | |
563 | 42.8M | auto accept = accept_rule(); |
564 | | |
565 | 83.2M | while (!code_points.done() && s_name_characters.contains(*code_points)) { |
566 | 40.3M | m_lexer.ignore(code_points.underlying_code_point_length_in_bytes()); |
567 | 40.3M | ++code_points; |
568 | 40.3M | } |
569 | | |
570 | 42.8M | rollback.disarm(); |
571 | 42.8M | return remaining.substring_view(0, m_lexer.tell() - start); |
572 | 56.4M | } |
573 | | |
574 | | // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl |
575 | | ErrorOr<void, ParseError> Parser::parse_doctype_decl() |
576 | 8.69k | { |
577 | 8.69k | auto rollback = rollback_point(); |
578 | 8.69k | auto rule = enter_rule(); |
579 | 8.69k | Doctype doctype; |
580 | | |
581 | | // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' |
582 | 8.69k | TRY(expect("<!DOCTYPE"sv)); |
583 | 5.64k | auto accept = accept_rule(); |
584 | | |
585 | 5.64k | TRY(skip_whitespace(Required::Yes)); |
586 | 5.64k | doctype.type = TRY(parse_name()); |
587 | 5.64k | if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { |
588 | 883 | if (auto id_result = parse_external_id(); !id_result.is_error()) { |
589 | 67 | doctype.external_id = id_result.release_value(); |
590 | 67 | if (m_options.resolve_external_resource) { |
591 | 0 | auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id); |
592 | 0 | if (!resource_result.is_error()) { |
593 | 0 | auto declarations = TRY(resource_result.release_value().visit( |
594 | 0 | [&](ByteString resolved_source) -> ErrorOr<Vector<MarkupDeclaration>, ParseError> { |
595 | 0 | TemporaryChange source { m_source, resolved_source.view() }; |
596 | 0 | TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) }; |
597 | 0 | auto declarations = TRY(parse_external_subset()); |
598 | 0 | if (!m_lexer.is_eof()) { |
599 | 0 | return parse_error( |
600 | 0 | m_lexer.current_position(), |
601 | 0 | ByteString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal)); |
602 | 0 | } |
603 | 0 | return declarations; |
604 | 0 | }, |
605 | 0 | [&](Vector<MarkupDeclaration> declarations) -> ErrorOr<Vector<MarkupDeclaration>, ParseError> { |
606 | 0 | return declarations; |
607 | 0 | })); |
608 | 0 | doctype.markup_declarations.extend(move(declarations)); |
609 | 0 | } |
610 | 0 | } |
611 | 67 | } |
612 | 883 | } |
613 | 11.2k | TRY(skip_whitespace(Required::No)); |
614 | 11.2k | if (m_lexer.consume_specific('[')) { |
615 | 5.30k | auto internal_subset = TRY(parse_internal_subset()); |
616 | 5.30k | TRY(expect("]"sv)); |
617 | 1.96k | TRY(skip_whitespace()); |
618 | 1.96k | doctype.markup_declarations.extend(internal_subset); |
619 | 1.96k | } |
620 | | |
621 | 11.2k | TRY(expect(">"sv)); |
622 | | |
623 | 1.45k | rollback.disarm(); |
624 | 1.45k | m_doctype = move(doctype); |
625 | 1.45k | return {}; |
626 | 2.30k | } |
627 | | |
628 | | // 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element |
629 | | ErrorOr<void, ParseError> Parser::parse_element() |
630 | 7.68M | { |
631 | 7.68M | auto rollback = rollback_point(); |
632 | 7.68M | auto rule = enter_rule(); |
633 | | |
634 | | // element ::= EmptyElemTag |
635 | | // | STag content ETag |
636 | 7.68M | if (auto result = parse_empty_element_tag(); !result.is_error()) { |
637 | 3.00M | append_node(result.release_value()); |
638 | 3.00M | leave_node(); |
639 | 3.00M | rollback.disarm(); |
640 | 3.00M | return {}; |
641 | 3.00M | } |
642 | | |
643 | 4.67M | auto accept = accept_rule(); |
644 | 4.67M | auto start_tag = TRY(parse_start_tag()); |
645 | 4.07M | auto& node = *start_tag; |
646 | 4.07M | auto& tag = node.content.get<Node::Element>(); |
647 | 4.07M | append_node(move(start_tag)); |
648 | 4.07M | ScopeGuard quit { |
649 | 4.07M | [&] { |
650 | 4.07M | leave_node(); |
651 | 4.07M | } |
652 | 4.07M | }; |
653 | | |
654 | 4.07M | TRY(parse_content()); |
655 | | |
656 | 4.07M | auto tag_location = m_lexer.tell(); |
657 | 4.07M | auto closing_name = TRY(parse_end_tag()); |
658 | | |
659 | | // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag. |
660 | 12.3k | if (m_options.treat_errors_as_fatal && closing_name != tag.name) |
661 | 4.04k | return parse_error(m_lexer.position_for(tag_location), ByteString { "Invalid closing tag"sv }); |
662 | | |
663 | 8.33k | rollback.disarm(); |
664 | 8.33k | return {}; |
665 | 12.3k | } |
666 | | |
667 | | // 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag |
668 | | ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag() |
669 | 7.68M | { |
670 | 7.68M | auto rollback = rollback_point(); |
671 | 7.68M | auto rule = enter_rule(); |
672 | | |
673 | | // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' |
674 | 7.68M | auto tag_start = m_lexer.tell(); |
675 | 7.68M | TRY(expect("<"sv)); |
676 | | |
677 | 7.41M | auto name = TRY(parse_name()); |
678 | 7.24M | HashMap<Name, ByteString> attributes; |
679 | | |
680 | 16.8M | while (true) { |
681 | 16.8M | if (auto result = skip_whitespace(Required::Yes); result.is_error()) |
682 | 4.10M | break; |
683 | | |
684 | 12.7M | if (auto result = parse_attribute(); !result.is_error()) { |
685 | 9.57M | auto attribute = result.release_value(); |
686 | 9.57M | attributes.set(move(attribute.name), move(attribute.value)); |
687 | 9.57M | } else { |
688 | 3.14M | break; |
689 | 3.14M | } |
690 | 12.7M | } |
691 | | |
692 | 7.24M | TRY(skip_whitespace()); |
693 | 7.24M | TRY(expect("/>"sv)); |
694 | | |
695 | 3.00M | auto accept = accept_rule(); |
696 | | |
697 | 3.00M | rollback.disarm(); |
698 | 3.00M | return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} }); |
699 | 7.24M | } |
700 | | |
701 | | // 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute |
702 | | ErrorOr<Attribute, ParseError> Parser::parse_attribute() |
703 | 19.5M | { |
704 | 19.5M | auto rollback = rollback_point(); |
705 | 19.5M | auto rule = enter_rule(); |
706 | | |
707 | | // Attribute ::= Name Eq AttValue |
708 | 19.5M | auto name = TRY(parse_name()); |
709 | 16.5M | auto accept = accept_rule(); |
710 | | |
711 | 16.5M | TRY(parse_eq()); |
712 | 16.4M | auto value = TRY(parse_attribute_value()); |
713 | | |
714 | 16.2M | rollback.disarm(); |
715 | 16.2M | return Attribute { |
716 | 16.2M | move(name), |
717 | 16.2M | move(value), |
718 | 16.2M | }; |
719 | 16.4M | } |
720 | | |
721 | | // 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue |
722 | | ErrorOr<ByteString, ParseError> Parser::parse_attribute_value() |
723 | 17.0M | { |
724 | 17.0M | auto rollback = rollback_point(); |
725 | 17.0M | auto rule = enter_rule(); |
726 | | |
727 | | // AttValue ::= '"' ([^<&"] | Reference)* '"' |
728 | | // | "'" ([^<&'] | Reference)* "'" |
729 | 17.0M | auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); |
730 | 17.0M | auto accept = accept_rule(); |
731 | | |
732 | 17.0M | auto text = TRY(parse_attribute_value_inner(quote)); |
733 | 16.9M | TRY(expect(quote)); |
734 | | |
735 | 16.7M | rollback.disarm(); |
736 | 16.7M | return text; |
737 | 16.9M | } |
738 | | |
739 | | ErrorOr<ByteString, ParseError> Parser::parse_attribute_value_inner(StringView disallow) |
740 | 17.0M | { |
741 | 17.0M | StringBuilder builder; |
742 | 180M | while (true) { |
743 | 180M | if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof()) |
744 | 16.9M | break; |
745 | | |
746 | 163M | if (m_lexer.next_is('<')) { |
747 | | // Not allowed, return a nice error to make it easier to debug. |
748 | 17.7k | return parse_error(m_lexer.current_position(), ByteString { "Unescaped '<' not allowed in attribute values"sv }); |
749 | 17.7k | } |
750 | | |
751 | 163M | if (m_lexer.next_is('&')) { |
752 | 33.8k | auto reference = TRY(parse_reference()); |
753 | 28.6k | if (auto* char_reference = reference.get_pointer<ByteString>()) |
754 | 3.26k | builder.append(*char_reference); |
755 | 25.4k | else |
756 | 25.4k | builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue))); |
757 | 163M | } else { |
758 | 163M | builder.append(m_lexer.consume()); |
759 | 163M | } |
760 | 163M | } |
761 | 16.9M | return builder.to_byte_string(); |
762 | 17.0M | } |
763 | | |
764 | | // Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] |
765 | | constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>(); |
766 | | |
767 | | // 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference |
768 | | ErrorOr<Variant<Parser::EntityReference, ByteString>, ParseError> Parser::parse_reference() |
769 | 4.69M | { |
770 | 4.69M | auto rollback = rollback_point(); |
771 | 4.69M | auto rule = enter_rule(); |
772 | | // Reference ::= EntityRef | CharRef |
773 | | |
774 | | // 4.1.68. EntityRef |
775 | | // EntityRef ::= '&' Name ';' |
776 | | |
777 | | // 4.1.66. CharRef |
778 | | // CharRef ::= '&#' [0-9]+ ';' |
779 | | // | '&#x' [0-9a-fA-F]+ ';' |
780 | | |
781 | 4.69M | auto reference_start = m_lexer.tell(); |
782 | 4.69M | TRY(expect("&"sv)); |
783 | 287k | auto accept = accept_rule(); |
784 | | |
785 | 287k | auto name_result = parse_name(); |
786 | 287k | if (name_result.is_error()) { |
787 | 18.0k | TRY(expect("#"sv)); |
788 | 16.6k | Optional<u32> code_point; |
789 | 16.6k | if (m_lexer.consume_specific('x')) { |
790 | 7.85k | auto hex = TRY(expect_many( |
791 | 6.96k | ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(), |
792 | 6.96k | "any of [0-9a-fA-F]"sv)); |
793 | 6.96k | code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex); |
794 | 8.84k | } else { |
795 | 8.84k | auto decimal = TRY(expect_many( |
796 | 8.39k | ranges_for_search<Range('0', '9')>(), |
797 | 8.39k | "any of [0-9]"sv)); |
798 | 8.39k | code_point = decimal.to_number<u32>(); |
799 | 8.39k | } |
800 | | |
801 | 16.6k | if (!code_point.has_value() || !s_characters.contains(*code_point)) |
802 | 1.66k | return parse_error(m_lexer.position_for(reference_start), ByteString { "Invalid character reference"sv }); |
803 | | |
804 | 15.3k | TRY(expect(";"sv)); |
805 | | |
806 | 12.8k | StringBuilder builder; |
807 | 12.8k | builder.append_code_point(*code_point); |
808 | | |
809 | 12.8k | rollback.disarm(); |
810 | 12.8k | return builder.to_byte_string(); |
811 | 13.6k | } |
812 | | |
813 | 269k | auto name = name_result.release_value(); |
814 | 269k | TRY(expect(";"sv)); |
815 | | |
816 | 268k | rollback.disarm(); |
817 | 268k | return EntityReference { move(name) }; |
818 | 269k | } |
819 | | |
820 | | // 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag |
821 | | ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag() |
822 | 4.67M | { |
823 | 4.67M | auto rollback = rollback_point(); |
824 | 4.67M | auto rule = enter_rule(); |
825 | | |
826 | | // STag ::= '<' Name (S Attribute)* S? '>' |
827 | 4.67M | auto tag_start = m_lexer.tell(); |
828 | 4.67M | TRY(expect("<"sv)); |
829 | 4.40M | auto accept = accept_rule(); |
830 | | |
831 | 4.40M | auto name = TRY(parse_name()); |
832 | 4.23M | HashMap<Name, ByteString> attributes; |
833 | | |
834 | 10.9M | while (true) { |
835 | 10.9M | if (auto result = skip_whitespace(Required::Yes); result.is_error()) |
836 | 4.09M | break; |
837 | | |
838 | 6.80M | if (auto result = parse_attribute(); !result.is_error()) { |
839 | 6.66M | auto attribute = result.release_value(); |
840 | 6.66M | attributes.set(move(attribute.name), move(attribute.value)); |
841 | 6.66M | } else { |
842 | 140k | break; |
843 | 140k | } |
844 | 6.80M | } |
845 | | |
846 | 4.23M | TRY(skip_whitespace()); |
847 | 4.23M | TRY(expect(">"sv)); |
848 | | |
849 | 4.07M | rollback.disarm(); |
850 | 4.07M | return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} }); |
851 | 4.23M | } |
852 | | |
853 | | // 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag |
854 | | ErrorOr<Name, ParseError> Parser::parse_end_tag() |
855 | 4.07M | { |
856 | 4.07M | auto rollback = rollback_point(); |
857 | 4.07M | auto rule = enter_rule(); |
858 | | |
859 | | // ETag ::= '</' Name S? '>' |
860 | 4.07M | TRY(expect("</"sv)); |
861 | 13.2k | auto accept = accept_rule(); |
862 | | |
863 | 13.2k | auto name = TRY(parse_name()); |
864 | 12.9k | TRY(skip_whitespace()); |
865 | 12.9k | TRY(expect(">"sv)); |
866 | | |
867 | 12.3k | rollback.disarm(); |
868 | 12.3k | return name; |
869 | 12.9k | } |
870 | | |
871 | | // 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content |
872 | | ErrorOr<void, ParseError> Parser::parse_content() |
873 | 4.31M | { |
874 | 4.31M | auto rollback = rollback_point(); |
875 | 4.31M | auto rule = enter_rule(); |
876 | 4.31M | auto accept = accept_rule(); |
877 | | |
878 | | // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* |
879 | 4.31M | auto content_start = m_lexer.tell(); |
880 | 4.31M | if (auto result = parse_char_data(); !result.is_error()) |
881 | 4.31M | append_text(result.release_value(), m_lexer.position_for(content_start)); |
882 | | |
883 | 7.67M | while (true) { |
884 | 7.67M | auto node_start = m_lexer.tell(); |
885 | | |
886 | 7.67M | if (auto result = parse_element(); !result.is_error()) |
887 | 3.01M | goto try_char_data; |
888 | 4.65M | if (auto result = parse_reference(); !result.is_error()) { |
889 | 250k | auto reference = result.release_value(); |
890 | 250k | auto reference_offset = m_lexer.position_for(node_start); |
891 | 250k | if (auto char_reference = reference.get_pointer<ByteString>()) |
892 | 7.85k | append_text(*char_reference, reference_offset); |
893 | 242k | else |
894 | 242k | append_text(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content)), reference_offset); |
895 | 250k | goto try_char_data; |
896 | 250k | } |
897 | 4.40M | if (auto result = parse_cdata_section(); !result.is_error()) { |
898 | 1.09k | if (m_options.preserve_cdata) |
899 | 1.09k | append_text(result.release_value(), m_lexer.position_for(node_start)); |
900 | 1.09k | goto try_char_data; |
901 | 1.09k | } |
902 | 4.40M | if (auto result = parse_processing_instruction(); !result.is_error()) |
903 | 91.7k | goto try_char_data; |
904 | 4.31M | if (auto result = parse_comment(); !result.is_error()) |
905 | 4.82k | goto try_char_data; |
906 | | |
907 | 4.30M | break; |
908 | | |
909 | 4.30M | try_char_data:; |
910 | 3.36M | if (auto result = parse_char_data(); !result.is_error()) |
911 | 3.36M | append_text(result.release_value(), m_lexer.position_for(node_start)); |
912 | 3.36M | } |
913 | | |
914 | 4.30M | rollback.disarm(); |
915 | 4.30M | return {}; |
916 | 4.31M | } |
917 | | |
918 | | // 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData |
919 | | ErrorOr<StringView, ParseError> Parser::parse_char_data() |
920 | 7.67M | { |
921 | 7.67M | auto rollback = rollback_point(); |
922 | 7.67M | auto rule = enter_rule(); |
923 | | |
924 | | // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) |
925 | 7.67M | auto cend_state = 0; // 1: ], 2: ], 3: > |
926 | 1.09G | auto text = m_lexer.consume_while([&](auto ch) { |
927 | 1.09G | if (ch == '<' || ch == '&' || cend_state == 3) |
928 | 7.66M | return false; |
929 | 1.08G | switch (cend_state) { |
930 | 1.08G | case 0: |
931 | 1.08G | case 1: |
932 | 1.08G | if (ch == ']') |
933 | 86.5k | cend_state++; |
934 | 1.08G | else |
935 | 1.08G | cend_state = 0; |
936 | 1.08G | return true; |
937 | 10.5k | case 2: |
938 | 10.5k | if (ch == '>') { |
939 | 225 | cend_state++; |
940 | 225 | return true; |
941 | 225 | } |
942 | 10.3k | cend_state = 0; |
943 | 10.3k | return true; |
944 | 0 | default: |
945 | 0 | VERIFY_NOT_REACHED(); |
946 | 1.08G | } |
947 | 1.08G | }); |
948 | 7.67M | if (cend_state == 3) { |
949 | 225 | m_lexer.retreat(3); |
950 | 225 | text = text.substring_view(0, text.length() - 3); |
951 | 225 | } |
952 | | |
953 | 7.67M | rollback.disarm(); |
954 | 7.67M | return text; |
955 | 7.67M | } |
956 | | |
957 | | // 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset |
958 | | ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset() |
959 | 5.30k | { |
960 | 5.30k | auto rollback = rollback_point(); |
961 | 5.30k | auto rule = enter_rule(); |
962 | 5.30k | Vector<MarkupDeclaration> declarations; |
963 | | |
964 | | // intSubset ::= (markupdecl | DeclSep)* |
965 | 260k | while (true) { |
966 | 260k | if (auto result = parse_markup_declaration(); !result.is_error()) { |
967 | 235k | auto maybe_declaration = result.release_value(); |
968 | 235k | if (maybe_declaration.has_value()) |
969 | 228k | declarations.append(maybe_declaration.release_value()); |
970 | 235k | continue; |
971 | 235k | } |
972 | 24.2k | if (auto result = parse_declaration_separator(); !result.is_error()) { |
973 | | // The markup declarations may be made up in whole or in part of the replacement text of parameter entities. |
974 | | // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl. |
975 | 18.9k | auto maybe_replacement_text = result.release_value(); |
976 | 18.9k | if (maybe_replacement_text.has_value()) { |
977 | 1.51k | TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() }; |
978 | 1.51k | TemporaryChange lexer { m_lexer, LineTrackingLexer { m_source } }; |
979 | | |
980 | 1.51k | auto contained_declarations = TRY(parse_external_subset_declaration()); |
981 | 1.51k | declarations.extend(move(contained_declarations)); |
982 | 1.51k | } |
983 | 18.9k | continue; |
984 | 18.9k | } |
985 | 5.30k | break; |
986 | 24.2k | } |
987 | | |
988 | 5.30k | rollback.disarm(); |
989 | 5.30k | return declarations; |
990 | 5.30k | } |
991 | | |
992 | | // 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl |
993 | | ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration() |
994 | 261k | { |
995 | 261k | auto rollback = rollback_point(); |
996 | 261k | auto rule = enter_rule(); |
997 | | |
998 | | // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment |
999 | 261k | if (auto result = parse_element_declaration(); !result.is_error()) { |
1000 | 71.4k | rollback.disarm(); |
1001 | 71.4k | return MarkupDeclaration { result.release_value() }; |
1002 | 71.4k | } |
1003 | 190k | if (auto result = parse_attribute_list_declaration(); !result.is_error()) { |
1004 | 29.1k | rollback.disarm(); |
1005 | 29.1k | return MarkupDeclaration { result.release_value() }; |
1006 | 29.1k | } |
1007 | 160k | if (auto result = parse_entity_declaration(); !result.is_error()) { |
1008 | 112k | rollback.disarm(); |
1009 | 112k | return MarkupDeclaration { result.release_value() }; |
1010 | 112k | } |
1011 | 48.8k | if (auto result = parse_notation_declaration(); !result.is_error()) { |
1012 | 16.1k | rollback.disarm(); |
1013 | 16.1k | return MarkupDeclaration { result.release_value() }; |
1014 | 16.1k | } |
1015 | 32.6k | if (auto result = parse_processing_instruction(); !result.is_error()) { |
1016 | 2.96k | rollback.disarm(); |
1017 | 2.96k | return Optional<MarkupDeclaration> {}; |
1018 | 2.96k | } |
1019 | 29.7k | if (auto result = parse_comment(); !result.is_error()) { |
1020 | 4.00k | rollback.disarm(); |
1021 | 4.00k | return Optional<MarkupDeclaration> {}; |
1022 | 4.00k | } |
1023 | | |
1024 | 25.7k | return parse_error(m_lexer.current_position(), Expectation { "one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment"sv }); |
1025 | 29.7k | } |
1026 | | |
1027 | | // 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep |
1028 | | ErrorOr<Optional<ByteString>, ParseError> Parser::parse_declaration_separator() |
1029 | 25.7k | { |
1030 | 25.7k | auto rollback = rollback_point(); |
1031 | 25.7k | auto rule = enter_rule(); |
1032 | | |
1033 | | // DeclSep ::= PEReference | S |
1034 | 25.7k | if (auto name = parse_parameter_entity_reference(); !name.is_error()) { |
1035 | 1.51k | rollback.disarm(); |
1036 | | // FIXME: Resolve this PEReference. |
1037 | 1.51k | return ""; |
1038 | 1.51k | } |
1039 | | |
1040 | 24.2k | if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { |
1041 | 17.3k | rollback.disarm(); |
1042 | 17.3k | return Optional<ByteString> {}; |
1043 | 17.3k | } |
1044 | | |
1045 | 6.82k | return parse_error(m_lexer.current_position(), Expectation { "either whitespace, or a PEReference"sv }); |
1046 | 24.2k | } |
1047 | | |
1048 | | // 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference |
1049 | | ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference() |
1050 | 31.2k | { |
1051 | 31.2k | auto rollback = rollback_point(); |
1052 | 31.2k | auto rule = enter_rule(); |
1053 | | |
1054 | | // PEReference ::= '%' Name ';' |
1055 | 31.2k | TRY(expect("%"sv)); |
1056 | 7.11k | auto accept = accept_rule(); |
1057 | | |
1058 | 7.11k | auto name = TRY(parse_name()); |
1059 | 7.09k | TRY(expect(";"sv)); |
1060 | | |
1061 | 7.05k | rollback.disarm(); |
1062 | 7.05k | return name; |
1063 | 7.09k | } |
1064 | | |
1065 | | // 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl |
1066 | | ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration() |
1067 | 261k | { |
1068 | 261k | auto rollback = rollback_point(); |
1069 | 261k | auto rule = enter_rule(); |
1070 | | |
1071 | | // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences, |
1072 | | // but the grammar does not allow that, figure this out. |
1073 | | // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' |
1074 | 261k | TRY(expect("<!ELEMENT"sv)); |
1075 | 72.5k | auto accept = accept_rule(); |
1076 | | |
1077 | 72.5k | TRY(skip_whitespace(Required::Yes)); |
1078 | 72.5k | auto name = TRY(parse_name()); |
1079 | 72.5k | TRY(skip_whitespace(Required::Yes)); |
1080 | 72.5k | auto spec = TRY(parse_content_spec()); |
1081 | 71.5k | TRY(expect(">"sv)); |
1082 | | |
1083 | 71.4k | rollback.disarm(); |
1084 | 71.4k | return ElementDeclaration { |
1085 | 71.4k | move(name), |
1086 | 71.4k | move(spec), |
1087 | 71.4k | }; |
1088 | 71.5k | } |
1089 | | |
1090 | | // 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl |
1091 | | ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration() |
1092 | 190k | { |
1093 | 190k | auto rollback = rollback_point(); |
1094 | 190k | auto rule = enter_rule(); |
1095 | 190k | AttributeListDeclaration declaration; |
1096 | | |
1097 | | // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' |
1098 | 190k | TRY(expect("<!ATTLIST"sv)); |
1099 | 30.2k | auto accept = accept_rule(); |
1100 | | |
1101 | 30.2k | TRY(skip_whitespace(Required::Yes)); |
1102 | 30.2k | declaration.type = TRY(parse_name()); |
1103 | | |
1104 | 584k | while (true) { |
1105 | 584k | if (auto result = parse_attribute_definition(); !result.is_error()) |
1106 | 554k | declaration.attributes.append(result.release_value()); |
1107 | 30.2k | else |
1108 | 30.2k | break; |
1109 | 584k | } |
1110 | | |
1111 | 30.2k | TRY(skip_whitespace()); |
1112 | 30.2k | TRY(expect(">"sv)); |
1113 | | |
1114 | 29.1k | rollback.disarm(); |
1115 | 29.1k | return declaration; |
1116 | 30.2k | } |
1117 | | |
1118 | | // 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef |
1119 | | ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition() |
1120 | 584k | { |
1121 | 584k | auto rollback = rollback_point(); |
1122 | 584k | auto rule = enter_rule(); |
1123 | 584k | Optional<AttributeListDeclaration::Type> type; |
1124 | 584k | Optional<AttributeListDeclaration::Default> default_; |
1125 | | |
1126 | | // AttDef ::= S Name S AttType S DefaultDecl |
1127 | 584k | TRY(skip_whitespace(Required::Yes)); |
1128 | 574k | auto name = TRY(parse_name()); |
1129 | 555k | auto accept = accept_rule(); |
1130 | | |
1131 | 555k | TRY(skip_whitespace(Required::Yes)); |
1132 | | |
1133 | | // AttType ::= StringType | TokenizedType | EnumeratedType |
1134 | | // StringType ::= 'CDATA' |
1135 | | // TokenizedType ::= 'ID' |
1136 | | // | 'IDREF' |
1137 | | // | 'IDREFS' |
1138 | | // | 'ENTITY' |
1139 | | // | 'ENTITIES' |
1140 | | // | 'NMTOKEN' |
1141 | | // | 'NMTOKENS' |
1142 | | // EnumeratedType ::= NotationType | Enumeration |
1143 | | // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' |
1144 | | // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' |
1145 | 555k | if (m_lexer.consume_specific("CDATA"sv)) { |
1146 | 5.17k | type = AttributeListDeclaration::StringType::CData; |
1147 | 550k | } else if (m_lexer.consume_specific("IDREFS"sv)) { |
1148 | 2.90k | type = AttributeListDeclaration::TokenizedType::IDRefs; |
1149 | 547k | } else if (m_lexer.consume_specific("IDREF"sv)) { |
1150 | 1.21k | type = AttributeListDeclaration::TokenizedType::IDRef; |
1151 | 546k | } else if (m_lexer.consume_specific("ID"sv)) { |
1152 | 524k | type = AttributeListDeclaration::TokenizedType::ID; |
1153 | 524k | } else if (m_lexer.consume_specific("ENTITIES"sv)) { |
1154 | 454 | type = AttributeListDeclaration::TokenizedType::Entities; |
1155 | 21.2k | } else if (m_lexer.consume_specific("ENTITY"sv)) { |
1156 | 244 | type = AttributeListDeclaration::TokenizedType::Entity; |
1157 | 20.9k | } else if (m_lexer.consume_specific("NMTOKENS"sv)) { |
1158 | 194 | type = AttributeListDeclaration::TokenizedType::NMTokens; |
1159 | 20.7k | } else if (m_lexer.consume_specific("NMTOKEN"sv)) { |
1160 | 260 | type = AttributeListDeclaration::TokenizedType::NMToken; |
1161 | 20.5k | } else if (m_lexer.consume_specific("NOTATION"sv)) { |
1162 | 2.98k | HashTable<Name> names; |
1163 | 2.98k | TRY(skip_whitespace(Required::Yes)); |
1164 | 2.98k | TRY(expect("("sv)); |
1165 | 2.98k | TRY(skip_whitespace()); |
1166 | 2.98k | names.set(TRY(parse_name())); |
1167 | 3.92k | while (true) { |
1168 | 3.92k | TRY(skip_whitespace()); |
1169 | 3.92k | if (auto result = expect("|"sv); result.is_error()) |
1170 | 2.97k | break; |
1171 | 3.92k | TRY(skip_whitespace()); |
1172 | 1.89k | names.set(TRY(parse_name())); |
1173 | 941 | } |
1174 | 5.95k | TRY(skip_whitespace()); |
1175 | 5.95k | TRY(expect(")"sv)); |
1176 | 2.96k | type = AttributeListDeclaration::NotationType { move(names) }; |
1177 | 17.5k | } else { |
1178 | 17.5k | HashTable<ByteString> names; |
1179 | 17.5k | TRY(expect("("sv)); |
1180 | 17.3k | TRY(skip_whitespace()); |
1181 | 17.3k | names.set(TRY(parse_nm_token())); |
1182 | 98.6k | while (true) { |
1183 | 98.6k | TRY(skip_whitespace()); |
1184 | 98.6k | if (auto result = expect("|"sv); result.is_error()) |
1185 | 17.3k | break; |
1186 | 162k | TRY(skip_whitespace()); |
1187 | 162k | names.set(TRY(parse_nm_token())); |
1188 | 81.3k | } |
1189 | 34.6k | TRY(skip_whitespace()); |
1190 | 34.6k | TRY(expect(")"sv)); |
1191 | 17.1k | type = AttributeListDeclaration::Enumeration { move(names) }; |
1192 | 17.1k | } |
1193 | | |
1194 | 1.10M | TRY(skip_whitespace(Required::Yes)); |
1195 | | |
1196 | | // DefaultDecl ::= '#REQUIRED' | '#IMPLIED' |
1197 | | // | (('#FIXED' S)? AttValue) |
1198 | 1.10M | if (m_lexer.consume_specific("#REQUIRED"sv)) { |
1199 | 5.99k | default_ = AttributeListDeclaration::Required {}; |
1200 | 548k | } else if (m_lexer.consume_specific("#IMPLIED"sv)) { |
1201 | 8.46k | default_ = AttributeListDeclaration::Implied {}; |
1202 | 540k | } else { |
1203 | 540k | bool fixed = false; |
1204 | 540k | if (m_lexer.consume_specific("#FIXED"sv)) { |
1205 | 1.20k | TRY(skip_whitespace(Required::Yes)); |
1206 | 1.20k | fixed = true; |
1207 | 1.20k | } |
1208 | 540k | auto value = TRY(parse_attribute_value()); |
1209 | 540k | if (fixed) |
1210 | 1.20k | default_ = AttributeListDeclaration::Fixed { move(value) }; |
1211 | 538k | else |
1212 | 538k | default_ = AttributeListDeclaration::DefaultValue { move(value) }; |
1213 | 540k | } |
1214 | | |
1215 | 1.10M | rollback.disarm(); |
1216 | 554k | return AttributeListDeclaration::Definition { |
1217 | 554k | move(name), |
1218 | 554k | type.release_value(), |
1219 | 554k | default_.release_value(), |
1220 | 554k | }; |
1221 | 1.10M | } |
1222 | | |
1223 | | // 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken |
1224 | | ErrorOr<StringView, ParseError> Parser::parse_nm_token() |
1225 | 98.7k | { |
1226 | 98.7k | auto rollback = rollback_point(); |
1227 | 98.7k | auto rule = enter_rule(); |
1228 | | |
1229 | | // Nmtoken ::= (NameChar)+ |
1230 | 98.7k | auto token = TRY(expect_many(s_name_characters, "a NameChar"sv)); |
1231 | | |
1232 | 98.6k | rollback.disarm(); |
1233 | 98.6k | return token; |
1234 | 98.7k | } |
1235 | | |
1236 | | // 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations |
1237 | | ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration() |
1238 | 48.8k | { |
1239 | 48.8k | auto rollback = rollback_point(); |
1240 | 48.8k | auto rule = enter_rule(); |
1241 | 48.8k | Variant<ExternalID, PublicID, Empty> notation; |
1242 | | |
1243 | | // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' |
1244 | 48.8k | TRY(expect("<!NOTATION"sv)); |
1245 | 16.3k | auto accept = accept_rule(); |
1246 | | |
1247 | 16.3k | TRY(skip_whitespace(Required::Yes)); |
1248 | 16.3k | auto name = TRY(parse_name()); |
1249 | 16.3k | TRY(skip_whitespace(Required::Yes)); |
1250 | | |
1251 | 16.3k | if (auto result = parse_external_id(); !result.is_error()) |
1252 | 11.8k | notation = result.release_value(); |
1253 | 4.48k | else |
1254 | 4.48k | notation = TRY(parse_public_id()); |
1255 | | |
1256 | 16.3k | TRY(expect(">"sv)); |
1257 | | |
1258 | 16.1k | rollback.disarm(); |
1259 | 16.1k | return NotationDeclaration { |
1260 | 16.1k | move(name), |
1261 | 16.1k | move(notation).downcast<ExternalID, PublicID>(), |
1262 | 16.1k | }; |
1263 | 16.2k | } |
1264 | | |
1265 | | // 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec |
1266 | | ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec() |
1267 | 72.5k | { |
1268 | 72.5k | auto rollback = rollback_point(); |
1269 | 72.5k | auto rule = enter_rule(); |
1270 | 72.5k | Optional<ElementDeclaration::ContentSpec> content_spec; |
1271 | | |
1272 | | // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children |
1273 | 72.5k | if (m_lexer.consume_specific("EMPTY"sv)) { |
1274 | 1.68k | content_spec = ElementDeclaration::Empty {}; |
1275 | 70.8k | } else if (m_lexer.consume_specific("ANY"sv)) { |
1276 | 997 | content_spec = ElementDeclaration::Any {}; |
1277 | 69.8k | } else { |
1278 | 69.8k | TRY(expect("("sv)); |
1279 | 69.7k | TRY(skip_whitespace()); |
1280 | 69.7k | if (m_lexer.consume_specific("#PCDATA"sv)) { |
1281 | 62.8k | HashTable<Name> names; |
1282 | | // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' |
1283 | | // | '(' S? '#PCDATA' S? ')' |
1284 | 62.8k | TRY(skip_whitespace()); |
1285 | 62.8k | if (m_lexer.consume_specific(")*"sv)) { |
1286 | 102 | content_spec = ElementDeclaration::Mixed { .types = {}, .many = true }; |
1287 | 62.7k | } else if (m_lexer.consume_specific(')')) { |
1288 | 62.1k | content_spec = ElementDeclaration::Mixed { .types = {}, .many = false }; |
1289 | 62.1k | } else { |
1290 | 6.67k | while (true) { |
1291 | 6.67k | TRY(skip_whitespace()); |
1292 | 6.67k | if (!m_lexer.consume_specific('|')) |
1293 | 530 | break; |
1294 | 12.2k | TRY(skip_whitespace()); |
1295 | 12.2k | if (auto result = parse_name(); !result.is_error()) |
1296 | 6.10k | names.set(result.release_value()); |
1297 | 38 | else |
1298 | 38 | return parse_error(m_lexer.current_position(), Expectation { "a Name"sv }); |
1299 | 12.2k | } |
1300 | 1.06k | TRY(skip_whitespace()); |
1301 | 1.06k | TRY(expect(")*"sv)); |
1302 | 433 | content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true }; |
1303 | 433 | } |
1304 | 62.8k | } else { |
1305 | 700k | while (!m_lexer.next_is('(')) |
1306 | 693k | m_lexer.retreat(); |
1307 | | // children ::= (choice | seq) ('?' | '*' | '+')? |
1308 | | // cp ::= (Name | choice | seq) ('?' | '*' | '+')? |
1309 | | // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' |
1310 | | // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' |
1311 | 6.95k | Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice; |
1312 | 6.95k | Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence; |
1313 | | |
1314 | 23.8M | auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> { |
1315 | 23.8M | if (auto result = parse_name(); !result.is_error()) |
1316 | 13.6M | return result.release_value(); |
1317 | 10.1M | if (auto result = parse_choice(); !result.is_error()) |
1318 | 17.5k | return result.release_value(); |
1319 | 10.1M | return TRY(parse_sequence()); |
1320 | 10.1M | }; |
1321 | 14.4M | auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier { |
1322 | 14.4M | ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce }; |
1323 | 14.4M | if (m_lexer.consume_specific('?')) |
1324 | 17.3k | qualifier = ElementDeclaration::Children::Qualifier::Optional; |
1325 | 14.4M | else if (m_lexer.consume_specific('*')) |
1326 | 2.83k | qualifier = ElementDeclaration::Children::Qualifier::Any; |
1327 | 14.4M | else if (m_lexer.consume_specific('+')) |
1328 | 1.07k | qualifier = ElementDeclaration::Children::Qualifier::OneOrMore; |
1329 | 14.4M | return qualifier; |
1330 | 14.4M | }; |
1331 | 23.8M | auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> { |
1332 | 23.8M | auto sub_entry = TRY(parse_cp_init()); |
1333 | 14.0M | auto qualifier = parse_qualifier(); |
1334 | 14.0M | return ElementDeclaration::Children::Entry { |
1335 | 14.0M | move(sub_entry), |
1336 | 14.0M | qualifier, |
1337 | 14.0M | }; |
1338 | 23.8M | }; |
1339 | 10.1M | parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> { |
1340 | 10.1M | auto rollback = rollback_point(); |
1341 | 10.1M | auto rule = enter_rule(); |
1342 | | |
1343 | 10.1M | TRY(expect("("sv)); |
1344 | 8.56M | auto accept = accept_rule(); |
1345 | | |
1346 | 8.56M | TRY(skip_whitespace()); |
1347 | 8.56M | Vector<ElementDeclaration::Children::Entry> choices; |
1348 | 8.56M | choices.append(TRY(parse_cp())); |
1349 | 6.61M | while (true) { |
1350 | 6.61M | TRY(skip_whitespace()); |
1351 | 6.61M | if (!m_lexer.consume_specific('|')) |
1352 | 5.31M | break; |
1353 | 6.61M | TRY(skip_whitespace()); |
1354 | 2.60M | choices.append(TRY(parse_cp())); |
1355 | 161k | } |
1356 | | |
1357 | 6.45M | TRY(expect(")"sv)); |
1358 | | |
1359 | 5.71M | if (choices.size() < 2) |
1360 | 388k | return parse_error(m_lexer.current_position(), Expectation { "more than one choice"sv }); |
1361 | | |
1362 | 407k | TRY(skip_whitespace()); |
1363 | 18.8k | auto qualifier = parse_qualifier(); |
1364 | | |
1365 | 18.8k | rollback.disarm(); |
1366 | 18.8k | return ElementDeclaration::Children::Choice { |
1367 | 18.8k | move(choices), |
1368 | 18.8k | qualifier, |
1369 | 18.8k | }; |
1370 | 18.8k | }; |
1371 | 10.1M | parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> { |
1372 | 10.1M | auto rollback = rollback_point(); |
1373 | 10.1M | auto rule = enter_rule(); |
1374 | | |
1375 | 10.1M | TRY(expect("("sv)); |
1376 | 8.54M | auto accept = accept_rule(); |
1377 | | |
1378 | 8.54M | TRY(skip_whitespace()); |
1379 | 8.54M | Vector<ElementDeclaration::Children::Entry> entries; |
1380 | 8.54M | entries.append(TRY(parse_cp())); |
1381 | 7.45M | while (true) { |
1382 | 7.45M | TRY(skip_whitespace()); |
1383 | 7.45M | if (!m_lexer.consume_specific(',')) |
1384 | 2.07M | break; |
1385 | 10.7M | TRY(skip_whitespace()); |
1386 | 10.7M | entries.append(TRY(parse_cp())); |
1387 | 1.02M | } |
1388 | | |
1389 | 6.43M | TRY(expect(")"sv)); |
1390 | | |
1391 | 2.46M | TRY(skip_whitespace()); |
1392 | 388k | auto qualifier = parse_qualifier(); |
1393 | | |
1394 | 388k | rollback.disarm(); |
1395 | 388k | return ElementDeclaration::Children::Sequence { |
1396 | 388k | move(entries), |
1397 | 388k | qualifier, |
1398 | 388k | }; |
1399 | 2.46M | }; |
1400 | 6.95k | if (auto result = parse_choice(); !result.is_error()) { |
1401 | 1.24k | auto qualifier = parse_qualifier(); |
1402 | 1.24k | content_spec = ElementDeclaration::Children { |
1403 | 1.24k | result.release_value(), |
1404 | 1.24k | qualifier, |
1405 | 1.24k | }; |
1406 | 5.71k | } else { |
1407 | 5.71k | auto sequence = TRY(parse_sequence()); |
1408 | 4.94k | auto qualifier = parse_qualifier(); |
1409 | 4.94k | content_spec = ElementDeclaration::Children { |
1410 | 4.94k | move(sequence), |
1411 | 4.94k | qualifier, |
1412 | 4.94k | }; |
1413 | 4.94k | } |
1414 | 6.95k | } |
1415 | 69.7k | } |
1416 | | |
1417 | 71.5k | rollback.disarm(); |
1418 | 71.5k | return content_spec.release_value(); |
1419 | 72.5k | } |
1420 | | |
1421 | | // 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl |
1422 | | ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration() |
1423 | 1.51k | { |
1424 | 1.51k | auto rollback = rollback_point(); |
1425 | 1.51k | auto rule = enter_rule(); |
1426 | 1.51k | Vector<MarkupDeclaration> declarations; |
1427 | | |
1428 | | // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )* |
1429 | 1.51k | while (true) { |
1430 | 1.51k | if (auto result = parse_markup_declaration(); !result.is_error()) { |
1431 | 0 | if (result.value().has_value()) |
1432 | 0 | declarations.append(result.release_value().release_value()); |
1433 | 0 | continue; |
1434 | 0 | } |
1435 | | |
1436 | | // FIXME: conditionalSect |
1437 | | |
1438 | 1.51k | if (auto result = parse_declaration_separator(); !result.is_error()) |
1439 | 0 | continue; |
1440 | | |
1441 | 1.51k | break; |
1442 | 1.51k | } |
1443 | | |
1444 | 1.51k | rollback.disarm(); |
1445 | 1.51k | return declarations; |
1446 | 1.51k | } |
1447 | | |
1448 | | // 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl |
1449 | | ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration() |
1450 | 160k | { |
1451 | | // EntityDecl ::= GEDecl | PEDecl |
1452 | 160k | if (auto result = parse_general_entity_declaration(); !result.is_error()) |
1453 | 104k | return result; |
1454 | | |
1455 | 56.2k | return parse_parameter_entity_declaration(); |
1456 | 160k | } |
1457 | | |
1458 | | // 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl |
1459 | | ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration() |
1460 | 160k | { |
1461 | 160k | auto rollback = rollback_point(); |
1462 | 160k | auto rule = enter_rule(); |
1463 | 160k | Variant<ByteString, EntityDefinition, Empty> definition; |
1464 | | |
1465 | | // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' |
1466 | 160k | TRY(expect("<!ENTITY"sv)); |
1467 | 112k | auto accept = accept_rule(); |
1468 | | |
1469 | 112k | TRY(skip_whitespace(Required::Yes)); |
1470 | 112k | auto name = TRY(parse_name()); |
1471 | 105k | TRY(skip_whitespace(Required::Yes)); |
1472 | | // EntityDef ::= EntityValue | (ExternalID NDataDecl?) |
1473 | 104k | if (auto result = parse_entity_value(); !result.is_error()) { |
1474 | 94.3k | definition = result.release_value(); |
1475 | 94.3k | } else { |
1476 | 10.6k | auto external_id = TRY(parse_external_id()); |
1477 | 10.3k | Optional<Name> notation; |
1478 | 10.3k | if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error()) |
1479 | 6.90k | notation = notation_result.release_value(); |
1480 | | |
1481 | 10.3k | definition = EntityDefinition { |
1482 | 10.3k | move(external_id), |
1483 | 10.3k | move(notation), |
1484 | 10.3k | }; |
1485 | 10.3k | } |
1486 | | |
1487 | 209k | TRY(skip_whitespace()); |
1488 | 209k | TRY(expect(">"sv)); |
1489 | | |
1490 | 104k | rollback.disarm(); |
1491 | 104k | return GEDeclaration { |
1492 | 104k | move(name), |
1493 | 104k | move(definition).downcast<ByteString, EntityDefinition>(), |
1494 | 104k | }; |
1495 | 209k | } |
1496 | | |
1497 | | // 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl |
1498 | | ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration() |
1499 | 56.2k | { |
1500 | 56.2k | auto rollback = rollback_point(); |
1501 | 56.2k | auto rule = enter_rule(); |
1502 | | |
1503 | 56.2k | Variant<ByteString, ExternalID, Empty> definition; |
1504 | | // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' |
1505 | 56.2k | TRY(expect("<!ENTITY"sv)); |
1506 | 7.83k | auto accept = accept_rule(); |
1507 | | |
1508 | 7.83k | TRY(skip_whitespace(Required::Yes)); |
1509 | 7.82k | TRY(expect("%"sv)); |
1510 | 7.48k | TRY(skip_whitespace(Required::Yes)); |
1511 | 7.48k | auto name = TRY(parse_name()); |
1512 | 7.48k | TRY(skip_whitespace(Required::Yes)); |
1513 | | // PEDef ::= EntityValue | ExternalID |
1514 | 7.47k | if (auto result = parse_entity_value(); !result.is_error()) |
1515 | 5.89k | definition = result.release_value(); |
1516 | 1.58k | else |
1517 | 1.58k | definition = TRY(parse_external_id()); |
1518 | | |
1519 | 14.8k | TRY(skip_whitespace()); |
1520 | 14.8k | TRY(expect(">"sv)); |
1521 | | |
1522 | 7.40k | rollback.disarm(); |
1523 | 7.40k | return PEDeclaration { |
1524 | 7.40k | move(name), |
1525 | 7.40k | move(definition).downcast<ByteString, ExternalID>(), |
1526 | 7.40k | }; |
1527 | 14.8k | } |
1528 | | |
1529 | | // 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID |
1530 | | ErrorOr<PublicID, ParseError> Parser::parse_public_id() |
1531 | 4.48k | { |
1532 | 4.48k | auto rollback = rollback_point(); |
1533 | 4.48k | auto rule = enter_rule(); |
1534 | | |
1535 | | // PublicID ::= 'PUBLIC' S PubidLiteral |
1536 | 4.48k | TRY(expect("PUBLIC"sv)); |
1537 | 4.44k | auto accept = accept_rule(); |
1538 | | |
1539 | 4.44k | TRY(skip_whitespace(Required::Yes)); |
1540 | 4.43k | auto text = TRY(parse_public_id_literal()); |
1541 | | |
1542 | 4.31k | rollback.disarm(); |
1543 | 4.31k | return PublicID { |
1544 | 4.31k | text, |
1545 | 4.31k | }; |
1546 | 4.43k | } |
1547 | | |
1548 | | constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>()); |
1549 | | |
1550 | | // 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral |
1551 | | ErrorOr<StringView, ParseError> Parser::parse_public_id_literal() |
1552 | 14.3k | { |
1553 | 14.3k | auto rollback = rollback_point(); |
1554 | 14.3k | auto rule = enter_rule(); |
1555 | | |
1556 | | // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" |
1557 | 14.3k | auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); |
1558 | 14.2k | auto accept = accept_rule(); |
1559 | | |
1560 | 14.2k | auto id = TRY(expect_many( |
1561 | 14.2k | [q = quote[0]](auto x) { |
1562 | 14.2k | return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x); |
1563 | 14.2k | }, |
1564 | 14.2k | "a PubidChar"sv, |
1565 | 14.2k | true)); |
1566 | 14.2k | TRY(expect(quote)); |
1567 | | |
1568 | 13.8k | rollback.disarm(); |
1569 | 13.8k | return id; |
1570 | 14.2k | } |
1571 | | |
1572 | | // 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral |
1573 | | ErrorOr<StringView, ParseError> Parser::parse_system_id_literal() |
1574 | 23.9k | { |
1575 | 23.9k | auto rollback = rollback_point(); |
1576 | 23.9k | auto rule = enter_rule(); |
1577 | | |
1578 | | // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") |
1579 | 23.9k | auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); |
1580 | 23.9k | auto accept = accept_rule(); |
1581 | | |
1582 | 23.9k | auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv, true)); |
1583 | 23.9k | TRY(expect(quote)); |
1584 | | |
1585 | 23.8k | rollback.disarm(); |
1586 | 23.8k | return id; |
1587 | 23.9k | } |
1588 | | |
1589 | | // 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID |
1590 | | ErrorOr<ExternalID, ParseError> Parser::parse_external_id() |
1591 | 29.4k | { |
1592 | 29.4k | auto rollback = rollback_point(); |
1593 | 29.4k | auto rule = enter_rule(); |
1594 | | |
1595 | | // ExternalID ::= 'SYSTEM' S SystemLiteral |
1596 | | // | 'PUBLIC' S PubidLiteral S SystemLiteral |
1597 | 29.4k | Optional<PublicID> public_id; |
1598 | 29.4k | SystemID system_id; |
1599 | | |
1600 | 29.4k | if (m_lexer.consume_specific("SYSTEM"sv)) { |
1601 | 18.7k | auto accept = accept_rule(); |
1602 | 18.7k | TRY(skip_whitespace(Required::Yes)); |
1603 | 18.7k | system_id = SystemID { TRY(parse_system_id_literal()) }; |
1604 | 18.7k | } else { |
1605 | 10.6k | TRY(expect("PUBLIC"sv)); |
1606 | 9.86k | auto accept = accept_rule(); |
1607 | | |
1608 | 9.86k | TRY(skip_whitespace(Required::Yes)); |
1609 | 9.86k | public_id = PublicID { TRY(parse_public_id_literal()) }; |
1610 | 9.52k | TRY(skip_whitespace(Required::Yes)); |
1611 | 5.21k | system_id = SystemID { TRY(parse_system_id_literal()) }; |
1612 | 5.18k | } |
1613 | | |
1614 | 29.4k | rollback.disarm(); |
1615 | 23.8k | return ExternalID { |
1616 | 23.8k | move(public_id), |
1617 | 23.8k | move(system_id), |
1618 | 23.8k | }; |
1619 | 29.4k | } |
1620 | | |
1621 | | // 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl |
1622 | | ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration() |
1623 | 10.3k | { |
1624 | 10.3k | auto rollback = rollback_point(); |
1625 | 10.3k | auto rule = enter_rule(); |
1626 | | |
1627 | | // NDataDecl ::= S 'NDATA' S Name |
1628 | 10.3k | TRY(skip_whitespace(Required::Yes)); |
1629 | 8.49k | auto accept = accept_rule(); |
1630 | | |
1631 | 8.49k | TRY(expect("NDATA"sv)); |
1632 | 6.90k | TRY(skip_whitespace(Required::Yes)); |
1633 | 6.90k | auto name = TRY(parse_name()); |
1634 | | |
1635 | 6.90k | rollback.disarm(); |
1636 | 6.90k | return name; |
1637 | 6.90k | } |
1638 | | |
1639 | | // 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue |
1640 | | ErrorOr<ByteString, ParseError> Parser::parse_entity_value() |
1641 | 112k | { |
1642 | 112k | auto rollback = rollback_point(); |
1643 | 112k | auto rule = enter_rule(); |
1644 | 112k | StringBuilder builder; |
1645 | | |
1646 | | // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' |
1647 | | // | "'" ([^%&'] | PEReference | Reference)* "'" |
1648 | 112k | auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); |
1649 | 100k | auto accept = accept_rule(); |
1650 | | |
1651 | 9.37M | while (true) { |
1652 | 9.37M | if (m_lexer.is_eof()) |
1653 | 88 | break; |
1654 | 9.37M | if (m_lexer.next_is(quote)) |
1655 | 100k | break; |
1656 | 9.27M | if (m_lexer.next_is('%')) { |
1657 | 5.57k | auto start = m_lexer.tell(); |
1658 | | // FIXME: Resolve this PEReference. |
1659 | 5.57k | TRY(parse_parameter_entity_reference()); |
1660 | 5.53k | builder.append(m_source.substring_view(start, m_lexer.tell() - start)); |
1661 | 5.53k | continue; |
1662 | 5.57k | } |
1663 | 9.26M | if (m_lexer.next_is('&')) { |
1664 | 2.26k | auto reference = TRY(parse_reference()); |
1665 | 2.24k | if (auto char_reference = reference.get_pointer<ByteString>()) |
1666 | 1.75k | builder.append(*char_reference); |
1667 | 491 | else |
1668 | 491 | builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue))); |
1669 | 2.24k | continue; |
1670 | 2.24k | } |
1671 | 9.26M | builder.append(m_lexer.consume()); |
1672 | 9.26M | } |
1673 | 100k | TRY(expect(quote)); |
1674 | | |
1675 | 100k | rollback.disarm(); |
1676 | 100k | return builder.to_byte_string(); |
1677 | 100k | } |
1678 | | |
1679 | | // 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect |
1680 | | ErrorOr<StringView, ParseError> Parser::parse_cdata_section() |
1681 | 4.40M | { |
1682 | 4.40M | auto rollback = rollback_point(); |
1683 | 4.40M | auto rule = enter_rule(); |
1684 | | |
1685 | | // CDSect ::= CDStart CData CDEnd |
1686 | | // CDStart ::= '<![CDATA[' |
1687 | | // CData ::= (Char* - (Char* ']]>' Char*)) |
1688 | | // CDEnd ::= ']]>' |
1689 | 4.40M | TRY(expect("<![CDATA["sv)); |
1690 | 1.64k | auto accept = accept_rule(); |
1691 | | |
1692 | 1.64k | auto section_start = m_lexer.tell(); |
1693 | 1.79M | while (!m_lexer.next_is("]]>")) { |
1694 | 1.79M | if (m_lexer.is_eof()) |
1695 | 548 | break; |
1696 | 1.78M | m_lexer.ignore(); |
1697 | 1.78M | } |
1698 | 1.64k | auto section_end = m_lexer.tell(); |
1699 | 1.64k | TRY(expect("]]>"sv)); |
1700 | | |
1701 | 1.09k | rollback.disarm(); |
1702 | 1.09k | return m_source.substring_view(section_start, section_end - section_start); |
1703 | 1.64k | } |
1704 | | |
1705 | | // 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset |
1706 | | ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset() |
1707 | 0 | { |
1708 | 0 | auto rollback = rollback_point(); |
1709 | 0 | auto rule = enter_rule(); |
1710 | | |
1711 | | // extSubset ::= TextDecl? extSubsetDecl |
1712 | 0 | (void)parse_text_declaration(); |
1713 | 0 | auto result = TRY(parse_external_subset_declaration()); |
1714 | |
|
1715 | 0 | rollback.disarm(); |
1716 | 0 | return result; |
1717 | 0 | } |
1718 | | |
1719 | | // 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl |
1720 | | ErrorOr<void, ParseError> Parser::parse_text_declaration() |
1721 | 0 | { |
1722 | 0 | auto rollback = rollback_point(); |
1723 | 0 | auto rule = enter_rule(); |
1724 | | |
1725 | | // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' |
1726 | 0 | TRY(expect("<?xml"sv)); |
1727 | 0 | auto accept = accept_rule(); |
1728 | |
|
1729 | 0 | (void)parse_version_info(); |
1730 | 0 | TRY(parse_encoding_decl()); |
1731 | 0 | TRY(skip_whitespace()); |
1732 | 0 | TRY(expect("?>"sv)); |
1733 | |
|
1734 | 0 | rollback.disarm(); |
1735 | 0 | return {}; |
1736 | 0 | } |
1737 | | |
1738 | | ErrorOr<ByteString, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement) |
1739 | 268k | { |
1740 | 268k | static HashTable<Name> reference_lookup {}; |
1741 | 268k | if (reference_lookup.contains(reference.name)) |
1742 | 15.1k | return parse_error(m_lexer.current_position(), ByteString::formatted("Invalid recursive definition for '{}'", reference.name)); |
1743 | | |
1744 | 253k | reference_lookup.set(reference.name); |
1745 | 253k | ScopeGuard remove_lookup { |
1746 | 253k | [&] { |
1747 | 253k | reference_lookup.remove(reference.name); |
1748 | 253k | } |
1749 | 253k | }; |
1750 | | |
1751 | 253k | Optional<ByteString> resolved; |
1752 | 253k | if (m_doctype.has_value()) { |
1753 | | // FIXME: Split these up and resolve them ahead of time. |
1754 | 275k | for (auto& declaration : m_doctype->markup_declarations) { |
1755 | 275k | auto entity = declaration.get_pointer<EntityDeclaration>(); |
1756 | 275k | if (!entity) |
1757 | 16.9k | continue; |
1758 | 258k | auto ge_declaration = entity->get_pointer<GEDeclaration>(); |
1759 | 258k | if (!ge_declaration) |
1760 | 1.35k | continue; |
1761 | 256k | if (ge_declaration->name != reference.name) |
1762 | 10.6k | continue; |
1763 | 256k | TRY(ge_declaration->definition.visit( |
1764 | 246k | [&](ByteString const& definition) -> ErrorOr<void, ParseError> { |
1765 | 246k | resolved = definition; |
1766 | 246k | return {}; |
1767 | 246k | }, |
1768 | 246k | [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> { |
1769 | 246k | if (placement == ReferencePlacement::AttributeValue) |
1770 | 246k | return parse_error(m_lexer.current_position(), ByteString::formatted("Attribute references external entity '{}'", reference.name)); |
1771 | | |
1772 | 246k | if (definition.notation.has_value()) |
1773 | 246k | return parse_error(m_lexer.position_for(0), ByteString::formatted("Entity reference to unparsed entity '{}'", reference.name)); |
1774 | | |
1775 | 246k | if (!m_options.resolve_external_resource) |
1776 | 246k | return parse_error(m_lexer.position_for(0), ByteString::formatted("Failed to resolve external entity '{}'", reference.name)); |
1777 | | |
1778 | 246k | auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id); |
1779 | 246k | if (result.is_error()) |
1780 | 246k | return parse_error(m_lexer.position_for(0), ByteString::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error())); |
1781 | | |
1782 | 246k | if (!result.value().has<ByteString>()) |
1783 | 246k | return parse_error(m_lexer.position_for(0), ByteString::formatted("Failed to resolve external entity '{}': Resource is of the wrong type", reference.name)); |
1784 | | |
1785 | 246k | resolved = result.release_value().get<ByteString>(); |
1786 | 246k | return {}; |
1787 | 246k | })); |
1788 | 246k | break; |
1789 | 246k | } |
1790 | 251k | } |
1791 | | |
1792 | 253k | if (!resolved.has_value()) { |
1793 | 7.23k | if (reference.name == "amp") |
1794 | 3.81k | return "&"; |
1795 | 3.42k | if (reference.name == "lt") |
1796 | 727 | return "<"; |
1797 | 2.69k | if (reference.name == "gt") |
1798 | 695 | return ">"; |
1799 | 2.00k | if (reference.name == "apos") |
1800 | 211 | return "'"; |
1801 | 1.79k | if (reference.name == "quot") |
1802 | 331 | return "\""; |
1803 | 1.46k | return parse_error(m_lexer.position_for(0), ByteString::formatted("Reference to undeclared entity '{}'", reference.name)); |
1804 | 1.79k | } |
1805 | | |
1806 | 246k | StringView resolved_source = *resolved; |
1807 | 246k | TemporaryChange source { m_source, resolved_source }; |
1808 | 246k | TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) }; |
1809 | 246k | switch (placement) { |
1810 | 10.3k | case ReferencePlacement::AttributeValue: |
1811 | 10.3k | return TRY(parse_attribute_value_inner(""sv)); |
1812 | 235k | case ReferencePlacement::Content: |
1813 | 235k | TRY(parse_content()); |
1814 | 235k | return ""; |
1815 | 0 | default: |
1816 | 0 | VERIFY_NOT_REACHED(); |
1817 | 246k | } |
1818 | 246k | } |
1819 | | |
1820 | | } |