/src/serenity/Userland/Libraries/LibRegex/RegexParser.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #pragma once |
8 | | |
9 | | #include "RegexByteCode.h" |
10 | | #include "RegexError.h" |
11 | | #include "RegexLexer.h" |
12 | | #include "RegexOptions.h" |
13 | | |
14 | | #include <AK/Forward.h> |
15 | | #include <AK/StringBuilder.h> |
16 | | #include <AK/Types.h> |
17 | | #include <AK/Vector.h> |
18 | | #include <LibUnicode/Forward.h> |
19 | | |
20 | | namespace regex { |
21 | | |
22 | | class PosixExtendedParser; |
23 | | class PosixBasicParser; |
24 | | class ECMA262Parser; |
25 | | |
26 | | template<typename T> |
27 | | struct GenericParserTraits { |
28 | | using OptionsType = T; |
29 | | }; |
30 | | |
31 | | template<typename T> |
32 | | struct ParserTraits : public GenericParserTraits<T> { |
33 | | }; |
34 | | |
35 | | template<> |
36 | | struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> { |
37 | | }; |
38 | | |
39 | | template<> |
40 | | struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> { |
41 | | }; |
42 | | |
43 | | template<> |
44 | | struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> { |
45 | | }; |
46 | | |
47 | | class Parser { |
48 | | public: |
49 | | struct Result { |
50 | | ByteCode bytecode; |
51 | | size_t capture_groups_count; |
52 | | size_t named_capture_groups_count; |
53 | | size_t match_length_minimum; |
54 | | Error error; |
55 | | Token error_token; |
56 | | Vector<FlyString> capture_groups; |
57 | | AllOptions options; |
58 | | }; |
59 | | |
60 | | explicit Parser(Lexer& lexer) |
61 | | : m_parser_state(lexer) |
62 | 0 | { |
63 | 0 | } |
64 | | |
65 | | Parser(Lexer& lexer, AllOptions regex_options) |
66 | | : m_parser_state(lexer, regex_options) |
67 | 18 | { |
68 | 18 | } |
69 | | |
70 | 18 | virtual ~Parser() = default; |
71 | | |
72 | | Result parse(Optional<AllOptions> regex_options = {}); |
73 | 882 | bool has_error() const { return m_parser_state.error != Error::NoError; } |
74 | 0 | Error error() const { return m_parser_state.error; } |
75 | 0 | AllOptions options() const { return m_parser_state.regex_options; } |
76 | | |
77 | | protected: |
78 | | virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0; |
79 | | |
80 | | ALWAYS_INLINE bool match(TokenType type) const; |
81 | | ALWAYS_INLINE bool match(char ch) const; |
82 | | ALWAYS_INLINE bool match_ordinary_characters(); |
83 | | ALWAYS_INLINE Token consume(); |
84 | | ALWAYS_INLINE Token consume(TokenType type, Error error); |
85 | | ALWAYS_INLINE bool consume(String const&); |
86 | | ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode); |
87 | | ALWAYS_INLINE bool try_skip(StringView); |
88 | | ALWAYS_INLINE bool lookahead_any(StringView); |
89 | | ALWAYS_INLINE unsigned char skip(); |
90 | | ALWAYS_INLINE void back(size_t = 1); |
91 | | ALWAYS_INLINE void reset(); |
92 | | ALWAYS_INLINE bool done() const; |
93 | | ALWAYS_INLINE bool set_error(Error error); |
94 | | |
95 | | struct NamedCaptureGroup { |
96 | | size_t group_index { 0 }; |
97 | | size_t minimum_length { 0 }; |
98 | | }; |
99 | | |
100 | | struct ParserState { |
101 | | Lexer& lexer; |
102 | | Token current_token; |
103 | | Error error = Error::NoError; |
104 | | Token error_token { TokenType::Eof, 0, StringView(nullptr) }; |
105 | | ByteCode bytecode; |
106 | | size_t capture_groups_count { 0 }; |
107 | | size_t named_capture_groups_count { 0 }; |
108 | | size_t match_length_minimum { 0 }; |
109 | | size_t repetition_mark_count { 0 }; |
110 | | AllOptions regex_options; |
111 | | HashMap<int, size_t> capture_group_minimum_lengths; |
112 | | HashMap<FlyString, NamedCaptureGroup> named_capture_groups; |
113 | | |
114 | | explicit ParserState(Lexer& lexer) |
115 | | : lexer(lexer) |
116 | | , current_token(lexer.next()) |
117 | 0 | { |
118 | 0 | } |
119 | | explicit ParserState(Lexer& lexer, AllOptions regex_options) |
120 | | : lexer(lexer) |
121 | | , current_token(lexer.next()) |
122 | | , regex_options(regex_options) |
123 | 18 | { |
124 | 18 | } |
125 | | }; |
126 | | |
127 | | ParserState m_parser_state; |
128 | | }; |
129 | | |
130 | | class AbstractPosixParser : public Parser { |
131 | | protected: |
132 | | explicit AbstractPosixParser(Lexer& lexer) |
133 | | : Parser(lexer) |
134 | 0 | { |
135 | 0 | } |
136 | | |
137 | | AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options) |
138 | | : Parser(lexer, regex_options.value_or({})) |
139 | 0 | { |
140 | 0 | } |
141 | | |
142 | | ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&); |
143 | | }; |
144 | | |
145 | | class PosixBasicParser final : public AbstractPosixParser { |
146 | | public: |
147 | | explicit PosixBasicParser(Lexer& lexer) |
148 | | : AbstractPosixParser(lexer) |
149 | 0 | { |
150 | 0 | } |
151 | | |
152 | | PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options) |
153 | | : AbstractPosixParser(lexer, regex_options.value_or({})) |
154 | 0 | { |
155 | 0 | } |
156 | | |
157 | | ~PosixBasicParser() = default; |
158 | | |
159 | | private: |
160 | | bool parse_internal(ByteCode&, size_t&) override; |
161 | | |
162 | | bool parse_root(ByteCode&, size_t&); |
163 | | bool parse_re_expression(ByteCode&, size_t&); |
164 | | bool parse_simple_re(ByteCode&, size_t&); |
165 | | bool parse_nonduplicating_re(ByteCode&, size_t&); |
166 | | bool parse_one_char_or_collation_element(ByteCode&, size_t&); |
167 | | |
168 | | constexpr static size_t number_of_addressable_capture_groups = 9; |
169 | | size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 }; |
170 | | bool m_capture_group_seen[number_of_addressable_capture_groups] { false }; |
171 | | size_t m_current_capture_group_depth { 0 }; |
172 | | }; |
173 | | |
174 | | class PosixExtendedParser final : public AbstractPosixParser { |
175 | | constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline); |
176 | | |
177 | | public: |
178 | | explicit PosixExtendedParser(Lexer& lexer) |
179 | | : AbstractPosixParser(lexer, default_options) |
180 | 0 | { |
181 | 0 | } |
182 | | |
183 | | PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options) |
184 | | : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value()) |
185 | 0 | { |
186 | 0 | } |
187 | | |
188 | | ~PosixExtendedParser() = default; |
189 | | |
190 | | private: |
191 | | ALWAYS_INLINE bool match_repetition_symbol(); |
192 | | |
193 | | bool parse_internal(ByteCode&, size_t&) override; |
194 | | |
195 | | bool parse_root(ByteCode&, size_t&); |
196 | | ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&); |
197 | | ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&); |
198 | | ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&); |
199 | | }; |
200 | | |
201 | | class ECMA262Parser final : public Parser { |
202 | | constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline); |
203 | | |
204 | | public: |
205 | | explicit ECMA262Parser(Lexer& lexer) |
206 | | : Parser(lexer, default_options) |
207 | 0 | { |
208 | 0 | m_capture_groups_in_scope.empend(); |
209 | 0 | } |
210 | | |
211 | | ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options) |
212 | | : Parser(lexer, regex_options.value_or({}) | default_options.value()) |
213 | 18 | { |
214 | 18 | m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended); |
215 | 18 | m_capture_groups_in_scope.empend(); |
216 | 18 | } |
217 | | |
218 | 18 | ~ECMA262Parser() = default; |
219 | | |
220 | | private: |
221 | | bool parse_internal(ByteCode&, size_t&) override; |
222 | | |
223 | | enum class ReadDigitsInitialZeroState { |
224 | | Allow, |
225 | | Disallow, |
226 | | }; |
227 | | StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1); |
228 | | Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1); |
229 | | FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false); |
230 | | |
231 | | struct Script { |
232 | | Unicode::Script script {}; |
233 | | bool is_extension { false }; |
234 | | }; |
235 | | using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>; |
236 | | Optional<PropertyEscape> read_unicode_property_escape(); |
237 | | |
238 | | bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named); |
239 | | bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named); |
240 | | bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named); |
241 | | bool parse_term(ByteCode&, size_t&, bool unicode, bool named); |
242 | | bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named); |
243 | | bool parse_atom(ByteCode&, size_t&, bool unicode, bool named); |
244 | | bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named); |
245 | | bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max); |
246 | | bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named); |
247 | | bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named); |
248 | | bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named); |
249 | | Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false); |
250 | | bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode); |
251 | | bool parse_unicode_property_escape(PropertyEscape& property, bool& negated); |
252 | | |
253 | | // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers) |
254 | | bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named); |
255 | | bool parse_extended_atom(ByteCode&, size_t&, bool named); |
256 | | bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named); |
257 | | bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false. |
258 | | bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length); |
259 | | Optional<u8> parse_legacy_octal_escape(); |
260 | | |
261 | | size_t ensure_total_number_of_capturing_parenthesis(); |
262 | | |
263 | 856 | void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); } |
264 | | |
265 | | void exit_capture_group_scope() |
266 | 59 | { |
267 | 59 | auto last = m_capture_groups_in_scope.take_last(); |
268 | 59 | m_capture_groups_in_scope.last().extend(move(last)); |
269 | 59 | } |
270 | | |
271 | | void clear_all_capture_groups_in_scope(ByteCode& stack) |
272 | 59 | { |
273 | 59 | for (auto& index : m_capture_groups_in_scope.last()) |
274 | 0 | stack.insert_bytecode_clear_capture_group(index); |
275 | 59 | }; |
276 | | |
277 | | // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs |
278 | | // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always |
279 | | // parse it twice, so we'll just do so when it's actually needed. |
280 | | // Most patterns should have no need to ever populate this field. |
281 | | Optional<size_t> m_total_number_of_capturing_parenthesis; |
282 | | |
283 | | // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag. |
284 | | bool m_should_use_browser_extended_grammar { false }; |
285 | | |
286 | | // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it, |
287 | | // by requiring that (...)+ only contain the matches for the last iteration. |
288 | | // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed. |
289 | | Vector<Vector<size_t>> m_capture_groups_in_scope; |
290 | | }; |
291 | | |
292 | | using PosixExtended = PosixExtendedParser; |
293 | | using PosixBasic = PosixBasicParser; |
294 | | using ECMA262 = ECMA262Parser; |
295 | | |
296 | | } |
297 | | |
298 | | using regex::ECMA262; |
299 | | using regex::PosixBasic; |
300 | | using regex::PosixExtended; |