/src/serenity/Userland/Libraries/LibXML/Parser/Parser.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #pragma once |
8 | | |
9 | | #include <AK/ByteString.h> |
10 | | #include <AK/Debug.h> |
11 | | #include <AK/Function.h> |
12 | | #include <AK/GenericLexer.h> |
13 | | #include <AK/HashMap.h> |
14 | | #include <AK/OwnPtr.h> |
15 | | #include <AK/SourceLocation.h> |
16 | | #include <AK/TemporaryChange.h> |
17 | | #include <LibXML/DOM/Document.h> |
18 | | #include <LibXML/DOM/DocumentTypeDeclaration.h> |
19 | | #include <LibXML/DOM/Node.h> |
20 | | #include <LibXML/Forward.h> |
21 | | |
22 | | namespace XML { |
23 | | |
24 | | struct Expectation { |
25 | | StringView expected; |
26 | | }; |
27 | | |
28 | | struct ParseError { |
29 | | LineTrackingLexer::Position position {}; |
30 | | Variant<ByteString, Expectation> error; |
31 | | }; |
32 | | |
33 | | struct Listener { |
34 | 0 | virtual ~Listener() { } |
35 | | |
36 | 0 | virtual void set_source(ByteString) { } |
37 | 0 | virtual void set_doctype(XML::Doctype) { } |
38 | 0 | virtual void document_start() { } |
39 | 0 | virtual void document_end() { } |
40 | 0 | virtual void element_start(Name const&, HashMap<Name, ByteString> const&) { } |
41 | 0 | virtual void element_end(Name const&) { } |
42 | 0 | virtual void text(StringView) { } |
43 | 0 | virtual void comment(StringView) { } |
44 | 0 | virtual void error(ParseError const&) { } |
45 | | }; |
46 | | |
47 | | class Parser { |
48 | | public: |
49 | | struct Options { |
50 | | bool preserve_cdata { true }; |
51 | | bool preserve_comments { false }; |
52 | | bool treat_errors_as_fatal { true }; |
53 | | Function<ErrorOr<Variant<ByteString, Vector<MarkupDeclaration>>>(SystemID const&, Optional<PublicID> const&)> resolve_external_resource {}; |
54 | | }; |
55 | | |
56 | | Parser(StringView source, Options options) |
57 | | : m_source(source) |
58 | | , m_lexer(source) |
59 | | , m_options(move(options)) |
60 | 0 | { |
61 | 0 | } |
62 | | |
63 | | explicit Parser(StringView source) |
64 | 8.57k | : m_source(source) |
65 | 8.57k | , m_lexer(source) |
66 | 8.57k | { |
67 | 8.57k | } |
68 | | |
69 | | ErrorOr<Document, ParseError> parse(); |
70 | | ErrorOr<void, ParseError> parse_with_listener(Listener&); |
71 | | |
72 | 0 | Vector<ParseError> const& parse_error_causes() const { return m_parse_errors; } |
73 | | |
74 | | ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset(); |
75 | | |
76 | | private: |
77 | | struct EntityReference { |
78 | | Name name; |
79 | | }; |
80 | | |
81 | | ErrorOr<void, ParseError> parse_internal(); |
82 | | void append_node(NonnullOwnPtr<Node>); |
83 | | void append_text(StringView, LineTrackingLexer::Position); |
84 | | void append_comment(StringView, LineTrackingLexer::Position); |
85 | | void enter_node(Node&); |
86 | | void leave_node(); |
87 | | |
88 | | enum class ReferencePlacement { |
89 | | AttributeValue, |
90 | | Content, |
91 | | }; |
92 | | ErrorOr<ByteString, ParseError> resolve_reference(EntityReference const&, ReferencePlacement); |
93 | | |
94 | | enum class Required { |
95 | | No, |
96 | | Yes, |
97 | | }; |
98 | | ErrorOr<void, ParseError> skip_whitespace(Required = Required::No); |
99 | | |
100 | | ErrorOr<void, ParseError> parse_prolog(); |
101 | | ErrorOr<void, ParseError> parse_element(); |
102 | | ErrorOr<void, ParseError> parse_misc(); |
103 | | ErrorOr<void, ParseError> parse_xml_decl(); |
104 | | ErrorOr<void, ParseError> parse_doctype_decl(); |
105 | | ErrorOr<void, ParseError> parse_version_info(); |
106 | | ErrorOr<void, ParseError> parse_encoding_decl(); |
107 | | ErrorOr<void, ParseError> parse_standalone_document_decl(); |
108 | | ErrorOr<void, ParseError> parse_eq(); |
109 | | ErrorOr<void, ParseError> parse_comment(); |
110 | | ErrorOr<void, ParseError> parse_processing_instruction(); |
111 | | ErrorOr<Name, ParseError> parse_processing_instruction_target(); |
112 | | ErrorOr<Name, ParseError> parse_name(); |
113 | | ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_empty_element_tag(); |
114 | | ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_start_tag(); |
115 | | ErrorOr<Name, ParseError> parse_end_tag(); |
116 | | ErrorOr<void, ParseError> parse_content(); |
117 | | ErrorOr<Attribute, ParseError> parse_attribute(); |
118 | | ErrorOr<ByteString, ParseError> parse_attribute_value(); |
119 | | ErrorOr<Variant<EntityReference, ByteString>, ParseError> parse_reference(); |
120 | | ErrorOr<StringView, ParseError> parse_char_data(); |
121 | | ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_internal_subset(); |
122 | | ErrorOr<Optional<MarkupDeclaration>, ParseError> parse_markup_declaration(); |
123 | | ErrorOr<Optional<ByteString>, ParseError> parse_declaration_separator(); |
124 | | ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset_declaration(); |
125 | | ErrorOr<ElementDeclaration, ParseError> parse_element_declaration(); |
126 | | ErrorOr<AttributeListDeclaration, ParseError> parse_attribute_list_declaration(); |
127 | | ErrorOr<EntityDeclaration, ParseError> parse_entity_declaration(); |
128 | | ErrorOr<NotationDeclaration, ParseError> parse_notation_declaration(); |
129 | | ErrorOr<Name, ParseError> parse_parameter_entity_reference(); |
130 | | ErrorOr<ElementDeclaration::ContentSpec, ParseError> parse_content_spec(); |
131 | | ErrorOr<AttributeListDeclaration::Definition, ParseError> parse_attribute_definition(); |
132 | | ErrorOr<StringView, ParseError> parse_nm_token(); |
133 | | ErrorOr<EntityDeclaration, ParseError> parse_general_entity_declaration(); |
134 | | ErrorOr<EntityDeclaration, ParseError> parse_parameter_entity_declaration(); |
135 | | ErrorOr<PublicID, ParseError> parse_public_id(); |
136 | | ErrorOr<SystemID, ParseError> parse_system_id(); |
137 | | ErrorOr<ExternalID, ParseError> parse_external_id(); |
138 | | ErrorOr<ByteString, ParseError> parse_entity_value(); |
139 | | ErrorOr<Name, ParseError> parse_notation_data_declaration(); |
140 | | ErrorOr<StringView, ParseError> parse_public_id_literal(); |
141 | | ErrorOr<StringView, ParseError> parse_system_id_literal(); |
142 | | ErrorOr<StringView, ParseError> parse_cdata_section(); |
143 | | ErrorOr<ByteString, ParseError> parse_attribute_value_inner(StringView disallow); |
144 | | ErrorOr<void, ParseError> parse_text_declaration(); |
145 | | |
146 | | ErrorOr<void, ParseError> expect(StringView); |
147 | | template<typename Pred> |
148 | | requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> expect(Pred, StringView description); |
149 | | template<typename Pred> |
150 | | requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> expect_many(Pred, StringView description, bool allow_empty = false); |
151 | | |
152 | | static size_t s_debug_indent_level; |
153 | | [[nodiscard]] auto rollback_point(SourceLocation location = SourceLocation::current()) |
154 | 256M | { |
155 | 256M | return ArmedScopeGuard { |
156 | 256M | [this, position = m_lexer.tell(), location] { |
157 | 67.5M | m_lexer.retreat(m_lexer.tell() - position); |
158 | 67.5M | (void)location; |
159 | 67.5M | dbgln_if(XML_PARSER_DEBUG, "{:->{}}FAIL @ {} -- \x1b[31m{}\x1b[0m", " ", s_debug_indent_level * 2, location, m_lexer.remaining().substring_view(0, min(16, m_lexer.tell_remaining())).replace("\n"sv, "\\n"sv, ReplaceMode::All)); |
160 | 67.5M | } |
161 | 256M | }; |
162 | 256M | } |
163 | | |
164 | | [[nodiscard]] auto accept_rule() |
165 | 77.0M | { |
166 | 77.0M | return TemporaryChange { m_current_rule.accept, true }; |
167 | 77.0M | } |
168 | | [[nodiscard]] auto enter_rule(SourceLocation location = SourceLocation::current()) |
169 | 177M | { |
170 | 177M | dbgln_if(XML_PARSER_DEBUG, "{:->{}}Enter {}", " ", s_debug_indent_level * 2, location); |
171 | 177M | ++s_debug_indent_level; |
172 | 177M | auto rule = m_current_rule; |
173 | 177M | m_current_rule = { location.function_name(), false }; |
174 | 177M | return ScopeGuard { |
175 | 177M | [location, rule, this] { |
176 | 177M | m_current_rule = rule; |
177 | 177M | --s_debug_indent_level; |
178 | 177M | (void)location; |
179 | 177M | dbgln_if(XML_PARSER_DEBUG, "{:->{}}Leave {}", " ", s_debug_indent_level * 2, location); |
180 | 177M | } |
181 | 177M | }; |
182 | 177M | } |
183 | | |
184 | | template<typename... Ts> |
185 | | ParseError parse_error(Ts&&... args) |
186 | 35.7M | { |
187 | 35.7M | auto error = ParseError { forward<Ts>(args)... }; |
188 | 35.7M | if (m_current_rule.accept) { |
189 | 2.20M | auto rule_name = m_current_rule.rule.value_or("<?>"sv); |
190 | 2.20M | if (rule_name.starts_with("parse_"sv)) |
191 | 207k | rule_name = rule_name.substring_view(6); |
192 | | |
193 | 2.20M | auto error_string = error.error.visit( |
194 | 2.20M | [](ByteString const& error) -> ByteString { return error; },Unexecuted instantiation: XML::Parser::parse_error<AK::LineTrackingLexer::Position, XML::Expectation>(AK::LineTrackingLexer::Position&&, XML::Expectation&&)::{lambda(AK::ByteString const&)#1}::operator()(AK::ByteString const&) constXML::Parser::parse_error<AK::LineTrackingLexer::Position, AK::ByteString>(AK::LineTrackingLexer::Position&&, AK::ByteString&&)::{lambda(AK::ByteString const&)#1}::operator()(AK::ByteString const&) constLine | Count | Source | 194 | 1.70M | [](ByteString const& error) -> ByteString { return error; }, |
|
195 | 2.20M | [](XML::Expectation const& expectation) -> ByteString { return ByteString::formatted("Expected {}", expectation.expected); });XML::Parser::parse_error<AK::LineTrackingLexer::Position, XML::Expectation>(AK::LineTrackingLexer::Position&&, XML::Expectation&&)::{lambda(XML::Expectation const&)#1}::operator()(XML::Expectation const&) constLine | Count | Source | 195 | 504k | [](XML::Expectation const& expectation) -> ByteString { return ByteString::formatted("Expected {}", expectation.expected); }); |
Unexecuted instantiation: XML::Parser::parse_error<AK::LineTrackingLexer::Position, AK::ByteString>(AK::LineTrackingLexer::Position&&, AK::ByteString&&)::{lambda(XML::Expectation const&)#1}::operator()(XML::Expectation const&) const |
196 | 2.20M | m_parse_errors.append({ |
197 | 2.20M | error.position, |
198 | 2.20M | ByteString::formatted("{}: {}", rule_name, error_string), |
199 | 2.20M | }); |
200 | 2.20M | } |
201 | 35.7M | return error; |
202 | 35.7M | } XML::ParseError XML::Parser::parse_error<AK::LineTrackingLexer::Position, XML::Expectation>(AK::LineTrackingLexer::Position&&, XML::Expectation&&) Line | Count | Source | 186 | 11.7M | { | 187 | 11.7M | auto error = ParseError { forward<Ts>(args)... }; | 188 | 11.7M | if (m_current_rule.accept) { | 189 | 504k | auto rule_name = m_current_rule.rule.value_or("<?>"sv); | 190 | 504k | if (rule_name.starts_with("parse_"sv)) | 191 | 1.65k | rule_name = rule_name.substring_view(6); | 192 | | | 193 | 504k | auto error_string = error.error.visit( | 194 | 504k | [](ByteString const& error) -> ByteString { return error; }, | 195 | 504k | [](XML::Expectation const& expectation) -> ByteString { return ByteString::formatted("Expected {}", expectation.expected); }); | 196 | 504k | m_parse_errors.append({ | 197 | 504k | error.position, | 198 | 504k | ByteString::formatted("{}: {}", rule_name, error_string), | 199 | 504k | }); | 200 | 504k | } | 201 | 11.7M | return error; | 202 | 11.7M | } |
XML::ParseError XML::Parser::parse_error<AK::LineTrackingLexer::Position, AK::ByteString>(AK::LineTrackingLexer::Position&&, AK::ByteString&&) Line | Count | Source | 186 | 24.0M | { | 187 | 24.0M | auto error = ParseError { forward<Ts>(args)... }; | 188 | 24.0M | if (m_current_rule.accept) { | 189 | 1.70M | auto rule_name = m_current_rule.rule.value_or("<?>"sv); | 190 | 1.70M | if (rule_name.starts_with("parse_"sv)) | 191 | 205k | rule_name = rule_name.substring_view(6); | 192 | | | 193 | 1.70M | auto error_string = error.error.visit( | 194 | 1.70M | [](ByteString const& error) -> ByteString { return error; }, | 195 | 1.70M | [](XML::Expectation const& expectation) -> ByteString { return ByteString::formatted("Expected {}", expectation.expected); }); | 196 | 1.70M | m_parse_errors.append({ | 197 | 1.70M | error.position, | 198 | 1.70M | ByteString::formatted("{}: {}", rule_name, error_string), | 199 | 1.70M | }); | 200 | 1.70M | } | 201 | 24.0M | return error; | 202 | 24.0M | } |
|
203 | | |
204 | | StringView m_source; |
205 | | LineTrackingLexer m_lexer; |
206 | | Options m_options; |
207 | | Listener* m_listener { nullptr }; |
208 | | |
209 | | OwnPtr<Node> m_root_node; |
210 | | Node* m_entered_node { nullptr }; |
211 | | Version m_version { Version::Version11 }; |
212 | | bool m_in_compatibility_mode { false }; |
213 | | ByteString m_encoding; |
214 | | bool m_standalone { false }; |
215 | | HashMap<Name, ByteString> m_processing_instructions; |
216 | | struct AcceptedRule { |
217 | | Optional<StringView> rule {}; |
218 | | bool accept { false }; |
219 | | } m_current_rule {}; |
220 | | |
221 | | Vector<ParseError> m_parse_errors; |
222 | | |
223 | | Optional<Doctype> m_doctype; |
224 | | }; |
225 | | } |
226 | | |
227 | | template<> |
228 | | struct AK::Formatter<XML::ParseError> : public AK::Formatter<FormatString> { |
229 | | ErrorOr<void> format(FormatBuilder& builder, XML::ParseError const& error) |
230 | 0 | { |
231 | 0 | auto error_string = error.error.visit( |
232 | 0 | [](ByteString const& error) -> ByteString { return error; }, |
233 | 0 | [](XML::Expectation const& expectation) -> ByteString { return ByteString::formatted("Expected {}", expectation.expected); }); |
234 | 0 | return Formatter<FormatString>::format(builder, "{} at line: {}, col: {} (offset {})"sv, error_string, error.position.line, error.position.column, error.position.offset); |
235 | 0 | } |
236 | | }; |