/src/serenity/Userland/Libraries/LibRegex/RegexParser.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> |
3 | | * Copyright (c) 2020-2021, the SerenityOS developers. |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #include "RegexParser.h" |
9 | | #include "RegexDebug.h" |
10 | | #include <AK/AnyOf.h> |
11 | | #include <AK/CharacterTypes.h> |
12 | | #include <AK/GenericLexer.h> |
13 | | #include <AK/ScopeGuard.h> |
14 | | #include <AK/String.h> |
15 | | #include <AK/StringBuilder.h> |
16 | | #include <AK/StringUtils.h> |
17 | | #include <AK/TemporaryChange.h> |
18 | | #include <AK/Utf16View.h> |
19 | | #include <LibUnicode/CharacterTypes.h> |
20 | | |
21 | | namespace regex { |
22 | | |
23 | | static constexpr size_t s_maximum_repetition_count = 1024 * 1024; |
24 | | static constexpr u64 s_ecma262_maximum_repetition_count = (1ull << 53) - 1; |
25 | | static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv; |
26 | | static constexpr auto s_decimal_characters = "0123456789"sv; |
27 | | |
28 | | static constexpr StringView identity_escape_characters(bool unicode, bool browser_extended) |
29 | 0 | { |
30 | 0 | if (unicode) |
31 | 0 | return "^$\\.*+?()[]{}|/"sv; |
32 | 0 | if (browser_extended) |
33 | 0 | return "^$\\.*+?()[|"sv; |
34 | 0 | return "^$\\.*+?()[]{}|"sv; |
35 | 0 | } |
36 | | |
37 | | ALWAYS_INLINE bool Parser::set_error(Error error) |
38 | 20.4k | { |
39 | 20.4k | if (m_parser_state.error == Error::NoError) { |
40 | 1.73k | m_parser_state.error = error; |
41 | 1.73k | m_parser_state.error_token = m_parser_state.current_token; |
42 | 1.73k | } |
43 | 20.4k | return false; // always return false, that eases the API usage (return set_error(...)) :^) |
44 | 20.4k | } |
45 | | |
46 | | ALWAYS_INLINE bool Parser::done() const |
47 | 54.4M | { |
48 | 54.4M | return match(TokenType::Eof); |
49 | 54.4M | } |
50 | | |
51 | | ALWAYS_INLINE bool Parser::match(TokenType type) const |
52 | 263M | { |
53 | 263M | return m_parser_state.current_token.type() == type; |
54 | 263M | } |
55 | | |
56 | | ALWAYS_INLINE bool Parser::match(char ch) const |
57 | 0 | { |
58 | 0 | return m_parser_state.current_token.type() == TokenType::Char && m_parser_state.current_token.value().length() == 1 && m_parser_state.current_token.value()[0] == ch; |
59 | 0 | } |
60 | | |
61 | | ALWAYS_INLINE Token Parser::consume() |
62 | 18.1M | { |
63 | 18.1M | auto old_token = m_parser_state.current_token; |
64 | 18.1M | m_parser_state.current_token = m_parser_state.lexer.next(); |
65 | 18.1M | return old_token; |
66 | 18.1M | } |
67 | | |
68 | | ALWAYS_INLINE Token Parser::consume(TokenType type, Error error) |
69 | 771k | { |
70 | 771k | if (m_parser_state.current_token.type() != type) { |
71 | 13.5k | set_error(error); |
72 | 13.5k | dbgln_if(REGEX_DEBUG, "[PARSER] Error: Unexpected token {}. Expected: {}", m_parser_state.current_token.name(), Token::name(type)); |
73 | 13.5k | } |
74 | 771k | return consume(); |
75 | 771k | } |
76 | | |
77 | | ALWAYS_INLINE bool Parser::consume(String const& str) |
78 | 844k | { |
79 | 844k | size_t potentially_go_back { 1 }; |
80 | 1.31M | for (auto ch : str) { |
81 | 1.31M | if (match(TokenType::Char)) { |
82 | 1.31M | if (m_parser_state.current_token.value()[0] != ch) { |
83 | 748k | m_parser_state.lexer.back(potentially_go_back); |
84 | 748k | m_parser_state.current_token = m_parser_state.lexer.next(); |
85 | 748k | return false; |
86 | 748k | } |
87 | 1.31M | } else { |
88 | 888 | m_parser_state.lexer.back(potentially_go_back); |
89 | 888 | m_parser_state.current_token = m_parser_state.lexer.next(); |
90 | 888 | return false; |
91 | 888 | } |
92 | 563k | consume(TokenType::Char, Error::NoError); |
93 | 563k | ++potentially_go_back; |
94 | 563k | } |
95 | 94.2k | return true; |
96 | 844k | } |
97 | | |
98 | | ALWAYS_INLINE Optional<u32> Parser::consume_escaped_code_point(bool unicode) |
99 | 0 | { |
100 | 0 | if (match(TokenType::LeftCurly) && !unicode) { |
101 | | // In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u'). |
102 | 0 | return static_cast<u32>('u'); |
103 | 0 | } |
104 | | |
105 | 0 | m_parser_state.lexer.retreat(2 + !done()); // Go back to just before '\u' (+1 char, because we will have consumed an extra character) |
106 | |
|
107 | 0 | if (auto code_point_or_error = m_parser_state.lexer.consume_escaped_code_point(unicode); !code_point_or_error.is_error()) { |
108 | 0 | m_parser_state.current_token = m_parser_state.lexer.next(); |
109 | 0 | return code_point_or_error.value(); |
110 | 0 | } |
111 | | |
112 | 0 | if (!unicode) { |
113 | | // '\u' is allowed in non-unicode mode, just matches 'u'. |
114 | 0 | return static_cast<u32>('u'); |
115 | 0 | } |
116 | | |
117 | 0 | set_error(Error::InvalidPattern); |
118 | 0 | return {}; |
119 | 0 | } |
120 | | |
121 | | ALWAYS_INLINE bool Parser::try_skip(StringView str) |
122 | 140M | { |
123 | 140M | if (str.starts_with(m_parser_state.current_token.value())) |
124 | 252k | str = str.substring_view(m_parser_state.current_token.value().length(), str.length() - m_parser_state.current_token.value().length()); |
125 | 140M | else |
126 | 140M | return false; |
127 | | |
128 | 252k | size_t potentially_go_back { 0 }; |
129 | 252k | for (auto ch : str) { |
130 | 148k | if (!m_parser_state.lexer.consume_specific(ch)) { |
131 | 113k | m_parser_state.lexer.back(potentially_go_back); |
132 | 113k | return false; |
133 | 113k | } |
134 | 34.9k | ++potentially_go_back; |
135 | 34.9k | } |
136 | | |
137 | 138k | m_parser_state.current_token = m_parser_state.lexer.next(); |
138 | 138k | return true; |
139 | 252k | } |
140 | | |
141 | | ALWAYS_INLINE bool Parser::lookahead_any(StringView str) |
142 | 0 | { |
143 | 0 | return AK::any_of(str, [this](auto ch) { return match(ch); }); |
144 | 0 | } |
145 | | |
146 | | ALWAYS_INLINE unsigned char Parser::skip() |
147 | 33.8M | { |
148 | 33.8M | unsigned char ch; |
149 | 33.8M | if (m_parser_state.current_token.value().length() == 1) { |
150 | 31.8M | ch = m_parser_state.current_token.value()[0]; |
151 | 31.8M | } else { |
152 | 2.01M | m_parser_state.lexer.back(m_parser_state.current_token.value().length()); |
153 | 2.01M | ch = m_parser_state.lexer.consume(); |
154 | 2.01M | } |
155 | | |
156 | 33.8M | m_parser_state.current_token = m_parser_state.lexer.next(); |
157 | 33.8M | return ch; |
158 | 33.8M | } |
159 | | |
160 | | ALWAYS_INLINE void Parser::back(size_t count) |
161 | 113k | { |
162 | 113k | m_parser_state.lexer.back(count); |
163 | 113k | m_parser_state.current_token = m_parser_state.lexer.next(); |
164 | 113k | } |
165 | | |
166 | | ALWAYS_INLINE void Parser::reset() |
167 | 4.20k | { |
168 | 4.20k | m_parser_state.bytecode.clear(); |
169 | 4.20k | m_parser_state.lexer.reset(); |
170 | 4.20k | m_parser_state.current_token = m_parser_state.lexer.next(); |
171 | 4.20k | m_parser_state.error = Error::NoError; |
172 | 4.20k | m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) }; |
173 | 4.20k | m_parser_state.capture_group_minimum_lengths.clear(); |
174 | 4.20k | m_parser_state.capture_groups_count = 0; |
175 | 4.20k | m_parser_state.named_capture_groups_count = 0; |
176 | 4.20k | m_parser_state.named_capture_groups.clear(); |
177 | 4.20k | } |
178 | | |
179 | | Parser::Result Parser::parse(Optional<AllOptions> regex_options) |
180 | 4.20k | { |
181 | 4.20k | reset(); |
182 | 4.20k | if (regex_options.has_value()) |
183 | 0 | m_parser_state.regex_options = regex_options.value(); |
184 | 4.20k | if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) |
185 | 2.70k | consume(TokenType::Eof, Error::InvalidPattern); |
186 | 1.49k | else |
187 | 1.49k | set_error(Error::InvalidPattern); |
188 | | |
189 | 4.20k | dbgln_if(REGEX_DEBUG, "[PARSER] Produced bytecode with {} entries (opcodes + arguments)", m_parser_state.bytecode.size()); |
190 | 4.20k | return { |
191 | 4.20k | move(m_parser_state.bytecode), |
192 | 4.20k | move(m_parser_state.capture_groups_count), |
193 | 4.20k | move(m_parser_state.named_capture_groups_count), |
194 | 4.20k | move(m_parser_state.match_length_minimum), |
195 | 4.20k | move(m_parser_state.error), |
196 | 4.20k | move(m_parser_state.error_token), |
197 | 4.20k | m_parser_state.named_capture_groups.keys(), |
198 | 4.20k | m_parser_state.regex_options, |
199 | 4.20k | }; |
200 | 4.20k | } |
201 | | |
202 | | ALWAYS_INLINE bool Parser::match_ordinary_characters() |
203 | 0 | { |
204 | | // NOTE: This method must not be called during bracket and repetition parsing! |
205 | | // FIXME: Add assertion for that? |
206 | 0 | auto type = m_parser_state.current_token.type(); |
207 | 0 | return (type == TokenType::Char |
208 | 0 | || type == TokenType::Comma |
209 | 0 | || type == TokenType::Slash |
210 | 0 | || type == TokenType::EqualSign |
211 | 0 | || type == TokenType::HyphenMinus |
212 | 0 | || type == TokenType::Colon); |
213 | 0 | } |
214 | | |
215 | | // ============================= |
216 | | // Abstract Posix Parser |
217 | | // ============================= |
218 | | |
219 | | ALWAYS_INLINE bool AbstractPosixParser::parse_bracket_expression(Vector<CompareTypeAndValuePair>& values, size_t& match_length_minimum) |
220 | 8.53k | { |
221 | 38.1M | for (; !done();) { |
222 | 38.1M | if (match(TokenType::HyphenMinus)) { |
223 | 3.50M | consume(); |
224 | | |
225 | 3.50M | if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { |
226 | | // first in the bracket expression |
227 | 1.93k | values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' }); |
228 | 3.50M | } else if (match(TokenType::RightBracket)) { |
229 | | // Last in the bracket expression |
230 | 2.44k | values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' }); |
231 | 3.50M | } else if (values.last().type == CharacterCompareType::Char) { |
232 | 3.50M | values.append({ CharacterCompareType::RangeExpressionDummy, 0 }); |
233 | | |
234 | 3.50M | if (done()) |
235 | 16 | return set_error(Error::MismatchingBracket); |
236 | | |
237 | 3.50M | if (match(TokenType::HyphenMinus)) { |
238 | 40.8k | consume(); |
239 | | // Valid range, add ordinary character |
240 | 40.8k | values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' }); |
241 | 40.8k | } |
242 | 3.50M | } else { |
243 | 16 | return set_error(Error::InvalidRange); |
244 | 16 | } |
245 | | |
246 | 34.6M | } else if (match(TokenType::Circumflex)) { |
247 | 603k | auto t = consume(); |
248 | | |
249 | 603k | if (values.is_empty()) |
250 | 3.03k | values.append({ CharacterCompareType::Inverse, 0 }); |
251 | 600k | else |
252 | 600k | values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() }); |
253 | | |
254 | 33.9M | } else if (match(TokenType::LeftBracket)) { |
255 | 98.7k | consume(); |
256 | | |
257 | 98.7k | if (match(TokenType::Period)) { |
258 | 3.29k | consume(); |
259 | | |
260 | | // FIXME: Parse collating element, this is needed when we have locale support |
261 | | // This could have impact on length parameter, I guess. |
262 | 3.29k | set_error(Error::InvalidCollationElement); |
263 | | |
264 | 3.29k | consume(TokenType::Period, Error::InvalidCollationElement); |
265 | 3.29k | consume(TokenType::RightBracket, Error::MismatchingBracket); |
266 | | |
267 | 95.4k | } else if (match(TokenType::EqualSign)) { |
268 | 979 | consume(); |
269 | | // FIXME: Parse collating element, this is needed when we have locale support |
270 | | // This could have impact on length parameter, I guess. |
271 | 979 | set_error(Error::InvalidCollationElement); |
272 | | |
273 | 979 | consume(TokenType::EqualSign, Error::InvalidCollationElement); |
274 | 979 | consume(TokenType::RightBracket, Error::MismatchingBracket); |
275 | | |
276 | 94.4k | } else if (match(TokenType::Colon)) { |
277 | 94.4k | consume(); |
278 | | |
279 | 94.4k | CharClass ch_class; |
280 | | // parse character class |
281 | 94.4k | if (match(TokenType::Char)) { |
282 | 94.4k | if (consume("alnum")) |
283 | 230 | ch_class = CharClass::Alnum; |
284 | 94.2k | else if (consume("alpha")) |
285 | 233 | ch_class = CharClass::Alpha; |
286 | 94.0k | else if (consume("blank")) |
287 | 236 | ch_class = CharClass::Blank; |
288 | 93.7k | else if (consume("cntrl")) |
289 | 303 | ch_class = CharClass::Cntrl; |
290 | 93.4k | else if (consume("digit")) |
291 | 231 | ch_class = CharClass::Digit; |
292 | 93.2k | else if (consume("graph")) |
293 | 248 | ch_class = CharClass::Graph; |
294 | 92.9k | else if (consume("lower")) |
295 | 246 | ch_class = CharClass::Lower; |
296 | 92.7k | else if (consume("print")) |
297 | 243 | ch_class = CharClass::Print; |
298 | 92.4k | else if (consume("punct")) |
299 | 91.2k | ch_class = CharClass::Punct; |
300 | 1.25k | else if (consume("space")) |
301 | 231 | ch_class = CharClass::Space; |
302 | 1.02k | else if (consume("upper")) |
303 | 513 | ch_class = CharClass::Upper; |
304 | 507 | else if (consume("xdigit")) |
305 | 320 | ch_class = CharClass::Xdigit; |
306 | 187 | else |
307 | 187 | return set_error(Error::InvalidCharacterClass); |
308 | | |
309 | 94.2k | values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class }); |
310 | | |
311 | 94.2k | } else |
312 | 9 | return set_error(Error::InvalidCharacterClass); |
313 | | |
314 | | // FIXME: we do not support locale specific character classes until locales are implemented |
315 | | |
316 | 94.2k | consume(TokenType::Colon, Error::InvalidCharacterClass); |
317 | 94.2k | consume(TokenType::RightBracket, Error::MismatchingBracket); |
318 | 94.2k | } else { |
319 | 23 | return set_error(Error::MismatchingBracket); |
320 | 23 | } |
321 | | |
322 | 33.9M | } else if (match(TokenType::RightBracket)) { |
323 | | |
324 | 8.46k | if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { |
325 | | // handle bracket as ordinary character |
326 | 629 | values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() }); |
327 | 7.83k | } else { |
328 | | // closing bracket expression |
329 | 7.83k | break; |
330 | 7.83k | } |
331 | 33.8M | } else { |
332 | 33.8M | values.append({ CharacterCompareType::Char, (ByteCodeValueType)skip() }); |
333 | 33.8M | } |
334 | | |
335 | | // check if range expression has to be completed... |
336 | 38.1M | if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) { |
337 | 3.50M | if (values.last().type != CharacterCompareType::Char) |
338 | 3 | return set_error(Error::InvalidRange); |
339 | | |
340 | 3.50M | auto value2 = values.take_last(); |
341 | 3.50M | values.take_last(); // RangeExpressionDummy |
342 | 3.50M | auto value1 = values.take_last(); |
343 | | |
344 | 3.50M | values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) }); |
345 | 3.50M | } |
346 | 38.1M | } |
347 | | |
348 | 8.28k | if (!values.is_empty()) { |
349 | 8.20k | match_length_minimum = 1; |
350 | 8.20k | if (values.first().type == CharacterCompareType::Inverse) |
351 | 3.02k | match_length_minimum = 0; |
352 | 8.20k | } |
353 | | |
354 | 8.28k | return true; |
355 | 8.53k | } |
356 | | |
357 | | // ============================= |
358 | | // PosixBasic Parser |
359 | | // ============================= |
360 | | |
361 | | bool PosixBasicParser::parse_internal(ByteCode& stack, size_t& match_length_minimum) |
362 | 4.20k | { |
363 | 4.20k | return parse_root(stack, match_length_minimum); |
364 | 4.20k | } |
365 | | |
366 | | bool PosixBasicParser::parse_root(ByteCode& bytecode, size_t& match_length_minimum) |
367 | 4.20k | { |
368 | | // basic_reg_exp : L_ANCHOR? RE_expression R_ANCHOR? |
369 | 4.20k | if (match(TokenType::Circumflex)) { |
370 | 21 | consume(); |
371 | 21 | bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin); |
372 | 21 | } |
373 | | |
374 | 4.20k | if (!parse_re_expression(bytecode, match_length_minimum)) |
375 | 1.49k | return false; |
376 | | |
377 | 2.70k | if (match(TokenType::Dollar)) { |
378 | 0 | consume(); |
379 | 0 | bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd); |
380 | 0 | } |
381 | | |
382 | 2.70k | return !has_error(); |
383 | 4.20k | } |
384 | | |
385 | | bool PosixBasicParser::parse_re_expression(ByteCode& bytecode, size_t& match_length_minimum) |
386 | 40.1k | { |
387 | | // RE_expression : RE_expression? simple_RE |
388 | 12.8M | while (!done()) { |
389 | 12.8M | if (!parse_simple_re(bytecode, match_length_minimum)) |
390 | 37.4k | break; |
391 | 12.8M | } |
392 | | |
393 | 40.1k | return !has_error(); |
394 | 40.1k | } |
395 | | |
396 | | bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_minimum) |
397 | 12.8M | { |
398 | | // simple_RE : nondupl_RE RE_dupl_symbol? |
399 | 12.8M | ByteCode simple_re_bytecode; |
400 | 12.8M | size_t re_match_length_minimum = 0; |
401 | 12.8M | if (!parse_nonduplicating_re(simple_re_bytecode, re_match_length_minimum)) |
402 | 36.9k | return false; |
403 | | |
404 | | // RE_dupl_symbol : '*' | Back_open_brace DUP_COUNT (',' DUP_COUNT?)? Back_close_brace |
405 | 12.7M | if (match(TokenType::Asterisk)) { |
406 | 30.0k | consume(); |
407 | 30.0k | ByteCode::transform_bytecode_repetition_any(simple_re_bytecode, true); |
408 | 12.7M | } else if (try_skip("\\{")) { |
409 | 62.8k | auto read_number = [&]() -> Optional<size_t> { |
410 | 62.8k | if (!match(TokenType::Char)) |
411 | 34 | return {}; |
412 | 62.8k | size_t value = 0; |
413 | 137k | while (match(TokenType::Char)) { |
414 | 107k | auto c = m_parser_state.current_token.value().substring_view(0, 1); |
415 | 107k | auto c_value = c.to_uint(); |
416 | 107k | if (!c_value.has_value()) |
417 | 33.5k | break; |
418 | 74.2k | value *= 10; |
419 | 74.2k | value += *c_value; |
420 | 74.2k | consume(); |
421 | 74.2k | } |
422 | 62.8k | return value; |
423 | 62.8k | }; |
424 | | |
425 | 33.6k | size_t min_limit; |
426 | 33.6k | Optional<size_t> max_limit; |
427 | | |
428 | 33.6k | if (auto limit = read_number(); !limit.has_value()) |
429 | 31 | return set_error(Error::InvalidRepetitionMarker); |
430 | 33.5k | else |
431 | 33.5k | min_limit = *limit; |
432 | | |
433 | 33.5k | if (match(TokenType::Comma)) { |
434 | 29.2k | consume(); |
435 | 29.2k | max_limit = read_number(); |
436 | 29.2k | } |
437 | | |
438 | 33.5k | if (!try_skip("\\}")) |
439 | 122 | return set_error(Error::MismatchingBrace); |
440 | | |
441 | 33.4k | if (max_limit.value_or(min_limit) < min_limit) |
442 | 34 | return set_error(Error::InvalidBraceContent); |
443 | | |
444 | 33.4k | if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count)) |
445 | 303 | return set_error(Error::InvalidBraceContent); |
446 | | |
447 | 33.1k | auto min_repetition_mark_id = m_parser_state.repetition_mark_count++; |
448 | 33.1k | auto max_repetition_mark_id = m_parser_state.repetition_mark_count++; |
449 | 33.1k | ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, min_repetition_mark_id, max_repetition_mark_id, true); |
450 | 33.1k | match_length_minimum += re_match_length_minimum * min_limit; |
451 | 12.7M | } else { |
452 | 12.7M | match_length_minimum += re_match_length_minimum; |
453 | 12.7M | } |
454 | | |
455 | 12.7M | bytecode.extend(move(simple_re_bytecode)); |
456 | 12.7M | return true; |
457 | 12.7M | } |
458 | | |
459 | | bool PosixBasicParser::parse_nonduplicating_re(ByteCode& bytecode, size_t& match_length_minimum) |
460 | 12.8M | { |
461 | | // nondupl_RE : one_char_or_coll_elem_RE | Back_open_paren RE_expression Back_close_paren | BACKREF |
462 | 12.8M | if (try_skip("\\(")) { |
463 | 35.9k | TemporaryChange change { m_current_capture_group_depth, m_current_capture_group_depth + 1 }; |
464 | | // Max number of addressable capture groups is 10, let's just be lenient |
465 | | // and accept 20; anything past that is probably a silly pattern anyway. |
466 | 35.9k | if (m_current_capture_group_depth > 20) |
467 | 6 | return set_error(Error::InvalidPattern); |
468 | 35.9k | ByteCode capture_bytecode; |
469 | 35.9k | size_t capture_length_minimum = 0; |
470 | 35.9k | auto capture_group_index = ++m_parser_state.capture_groups_count; |
471 | | |
472 | 35.9k | if (!parse_re_expression(capture_bytecode, capture_length_minimum)) |
473 | 1.46k | return false; |
474 | | |
475 | 34.4k | if (!try_skip("\\)")) |
476 | 235 | return set_error(Error::MismatchingParen); |
477 | | |
478 | 34.2k | match_length_minimum += capture_length_minimum; |
479 | 34.2k | if (capture_group_index <= number_of_addressable_capture_groups) { |
480 | 7.45k | m_capture_group_minimum_lengths[capture_group_index - 1] = capture_length_minimum; |
481 | 7.45k | m_capture_group_seen[capture_group_index - 1] = true; |
482 | 7.45k | bytecode.insert_bytecode_group_capture_left(capture_group_index); |
483 | 7.45k | } |
484 | | |
485 | 34.2k | bytecode.extend(capture_bytecode); |
486 | | |
487 | 34.2k | if (capture_group_index <= number_of_addressable_capture_groups) |
488 | 7.45k | bytecode.insert_bytecode_group_capture_right(capture_group_index); |
489 | 34.2k | return true; |
490 | 34.4k | } |
491 | | |
492 | 127M | for (size_t i = 1; i < 10; ++i) { |
493 | 115M | char backref_name[2] { '\\', '0' }; |
494 | 115M | backref_name[1] += i; |
495 | 115M | if (try_skip({ backref_name, 2 })) { |
496 | 1.47k | if (!m_capture_group_seen[i - 1]) |
497 | 39 | return set_error(Error::InvalidNumber); |
498 | 1.43k | match_length_minimum += m_capture_group_minimum_lengths[i - 1]; |
499 | 1.43k | bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)i } }); |
500 | 1.43k | return true; |
501 | 1.47k | } |
502 | 115M | } |
503 | | |
504 | 12.7M | return parse_one_char_or_collation_element(bytecode, match_length_minimum); |
505 | 12.7M | } |
506 | | |
507 | | bool PosixBasicParser::parse_one_char_or_collation_element(ByteCode& bytecode, size_t& match_length_minimum) |
508 | 12.7M | { |
509 | | // one_char_or_coll_elem_RE : ORD_CHAR | QUOTED_CHAR | '.' | bracket_expression |
510 | 12.7M | if (match(TokenType::Period)) { |
511 | 3.26k | consume(); |
512 | 3.26k | bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } }); |
513 | 3.26k | match_length_minimum += 1; |
514 | 3.26k | return true; |
515 | 3.26k | } |
516 | | |
517 | | // Dollars are special if at the end of a pattern. |
518 | 12.7M | if (match(TokenType::Dollar)) { |
519 | 113k | consume(); |
520 | | |
521 | | // If we are at the end of a pattern, emit an end check instruction. |
522 | 113k | if (match(TokenType::Eof)) { |
523 | 48 | bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd); |
524 | 48 | return true; |
525 | 48 | } |
526 | | |
527 | | // We are not at the end of the string, so we should roll back and continue as normal. |
528 | 113k | back(2); |
529 | 113k | } |
530 | | |
531 | | // None of these are special in BRE. |
532 | 12.7M | if (match(TokenType::Char) || match(TokenType::Questionmark) || match(TokenType::RightParen) || match(TokenType::HyphenMinus) |
533 | 12.7M | || match(TokenType::Circumflex) || match(TokenType::RightCurly) || match(TokenType::Comma) || match(TokenType::Colon) |
534 | 12.7M | || match(TokenType::Dollar) || match(TokenType::EqualSign) || match(TokenType::LeftCurly) || match(TokenType::LeftParen) |
535 | 12.7M | || match(TokenType::Pipe) || match(TokenType::Slash) || match(TokenType::RightBracket) || match(TokenType::RightParen)) { |
536 | | |
537 | 12.7M | auto ch = consume().value()[0]; |
538 | 12.7M | bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } }); |
539 | 12.7M | match_length_minimum += 1; |
540 | 12.7M | return true; |
541 | 12.7M | } |
542 | | |
543 | 47.7k | if (match(TokenType::EscapeSequence)) { |
544 | 39.1k | if (m_parser_state.current_token.value().is_one_of("\\)"sv, "\\}"sv, "\\("sv, "\\{"sv)) |
545 | 34.4k | return false; |
546 | 4.69k | auto ch = consume().value()[1]; |
547 | 4.69k | bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } }); |
548 | 4.69k | match_length_minimum += 1; |
549 | 4.69k | return true; |
550 | 39.1k | } |
551 | | |
552 | 8.56k | Vector<CompareTypeAndValuePair> values; |
553 | 8.56k | size_t bracket_minimum_length = 0; |
554 | | |
555 | 8.56k | if (match(TokenType::LeftBracket)) { |
556 | 8.53k | consume(); |
557 | 8.53k | if (!AbstractPosixParser::parse_bracket_expression(values, bracket_minimum_length)) |
558 | 254 | return false; |
559 | | |
560 | 8.28k | consume(TokenType::RightBracket, Error::MismatchingBracket); |
561 | | |
562 | 8.28k | if (!has_error()) |
563 | 7.83k | bytecode.insert_bytecode_compare_values(move(values)); |
564 | 8.28k | match_length_minimum += bracket_minimum_length; |
565 | 8.28k | return !has_error(); |
566 | 8.53k | } |
567 | | |
568 | 25 | return set_error(Error::InvalidPattern); |
569 | 8.56k | } |
570 | | |
571 | | // ============================= |
572 | | // PosixExtended Parser |
573 | | // ============================= |
574 | | |
575 | | bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum) |
576 | 0 | { |
577 | 0 | return parse_root(stack, match_length_minimum); |
578 | 0 | } |
579 | | |
580 | | ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol() |
581 | 0 | { |
582 | 0 | auto type = m_parser_state.current_token.type(); |
583 | 0 | return (type == TokenType::Asterisk |
584 | 0 | || type == TokenType::Plus |
585 | 0 | || type == TokenType::Questionmark |
586 | 0 | || type == TokenType::LeftCurly); |
587 | 0 | } |
588 | | |
589 | | ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum) |
590 | 0 | { |
591 | 0 | if (match(TokenType::LeftCurly)) { |
592 | 0 | consume(); |
593 | |
|
594 | 0 | StringBuilder number_builder; |
595 | |
|
596 | 0 | while (match(TokenType::Char)) { |
597 | 0 | number_builder.append(consume().value()); |
598 | 0 | } |
599 | |
|
600 | 0 | auto maybe_minimum = number_builder.build().to_uint(); |
601 | 0 | if (!maybe_minimum.has_value()) |
602 | 0 | return set_error(Error::InvalidBraceContent); |
603 | | |
604 | 0 | auto minimum = maybe_minimum.value(); |
605 | 0 | match_length_minimum *= minimum; |
606 | |
|
607 | 0 | if (minimum > s_maximum_repetition_count) |
608 | 0 | return set_error(Error::InvalidBraceContent); |
609 | | |
610 | 0 | if (match(TokenType::Comma)) { |
611 | 0 | consume(); |
612 | 0 | } else { |
613 | 0 | auto repetition_mark_id = m_parser_state.repetition_mark_count++; |
614 | |
|
615 | 0 | ByteCode bytecode; |
616 | 0 | bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id); |
617 | 0 | bytecode_to_repeat = move(bytecode); |
618 | |
|
619 | 0 | consume(TokenType::RightCurly, Error::MismatchingBrace); |
620 | 0 | return !has_error(); |
621 | 0 | } |
622 | | |
623 | 0 | Optional<u32> maybe_maximum {}; |
624 | 0 | number_builder.clear(); |
625 | 0 | while (match(TokenType::Char)) { |
626 | 0 | number_builder.append(consume().value()); |
627 | 0 | } |
628 | 0 | if (!number_builder.is_empty()) { |
629 | 0 | auto value = number_builder.build().to_uint(); |
630 | 0 | if (!value.has_value() || minimum > value.value() || *value > s_maximum_repetition_count) |
631 | 0 | return set_error(Error::InvalidBraceContent); |
632 | | |
633 | 0 | maybe_maximum = value.value(); |
634 | 0 | } |
635 | | |
636 | 0 | auto min_repetition_mark_id = m_parser_state.repetition_mark_count++; |
637 | 0 | auto max_repetition_mark_id = m_parser_state.repetition_mark_count++; |
638 | 0 | ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, min_repetition_mark_id, max_repetition_mark_id); |
639 | |
|
640 | 0 | consume(TokenType::RightCurly, Error::MismatchingBrace); |
641 | 0 | return !has_error(); |
642 | 0 | } |
643 | 0 | if (match(TokenType::Plus)) { |
644 | 0 | consume(); |
645 | |
|
646 | 0 | bool nongreedy = match(TokenType::Questionmark); |
647 | 0 | if (nongreedy) |
648 | 0 | consume(); |
649 | | |
650 | | // Note: don't touch match_length_minimum, it's already correct |
651 | 0 | ByteCode::transform_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy); |
652 | 0 | return !has_error(); |
653 | 0 | } |
654 | 0 | if (match(TokenType::Asterisk)) { |
655 | 0 | consume(); |
656 | 0 | match_length_minimum = 0; |
657 | |
|
658 | 0 | bool nongreedy = match(TokenType::Questionmark); |
659 | 0 | if (nongreedy) |
660 | 0 | consume(); |
661 | |
|
662 | 0 | ByteCode::transform_bytecode_repetition_any(bytecode_to_repeat, !nongreedy); |
663 | |
|
664 | 0 | return !has_error(); |
665 | 0 | } |
666 | 0 | if (match(TokenType::Questionmark)) { |
667 | 0 | consume(); |
668 | 0 | match_length_minimum = 0; |
669 | |
|
670 | 0 | bool nongreedy = match(TokenType::Questionmark); |
671 | 0 | if (nongreedy) |
672 | 0 | consume(); |
673 | |
|
674 | 0 | ByteCode::transform_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy); |
675 | 0 | return !has_error(); |
676 | 0 | } |
677 | | |
678 | 0 | return false; |
679 | 0 | } |
680 | | |
681 | | ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum) |
682 | 0 | { |
683 | 0 | Vector<CompareTypeAndValuePair> values; |
684 | 0 | if (!AbstractPosixParser::parse_bracket_expression(values, match_length_minimum)) |
685 | 0 | return false; |
686 | | |
687 | 0 | if (!has_error()) |
688 | 0 | stack.insert_bytecode_compare_values(move(values)); |
689 | |
|
690 | 0 | return !has_error(); |
691 | 0 | } |
692 | | |
693 | | ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum) |
694 | 0 | { |
695 | 0 | ByteCode bytecode; |
696 | 0 | size_t length = 0; |
697 | 0 | bool should_parse_repetition_symbol { false }; |
698 | |
|
699 | 0 | for (;;) { |
700 | 0 | if (match_ordinary_characters()) { |
701 | 0 | Token start_token = m_parser_state.current_token; |
702 | 0 | Token last_token = m_parser_state.current_token; |
703 | 0 | for (;;) { |
704 | 0 | if (!match_ordinary_characters()) |
705 | 0 | break; |
706 | 0 | ++length; |
707 | 0 | last_token = consume(); |
708 | 0 | } |
709 | |
|
710 | 0 | if (length > 1) { |
711 | | // last character is inserted into 'bytecode' for duplication symbol handling |
712 | 0 | auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0); |
713 | 0 | stack.insert_bytecode_compare_string({ start_token.value().characters_without_null_termination(), new_length }); |
714 | 0 | } |
715 | |
|
716 | 0 | if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol |
717 | 0 | bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } }); |
718 | |
|
719 | 0 | should_parse_repetition_symbol = true; |
720 | 0 | break; |
721 | 0 | } |
722 | | |
723 | 0 | if (match_repetition_symbol()) |
724 | 0 | return set_error(Error::InvalidRepetitionMarker); |
725 | | |
726 | 0 | if (match(TokenType::Period)) { |
727 | 0 | length = 1; |
728 | 0 | consume(); |
729 | 0 | bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } }); |
730 | 0 | should_parse_repetition_symbol = true; |
731 | 0 | break; |
732 | 0 | } |
733 | | |
734 | 0 | if (match(TokenType::EscapeSequence)) { |
735 | 0 | length = 1; |
736 | 0 | Token t = consume(); |
737 | 0 | dbgln_if(REGEX_DEBUG, "[PARSER] EscapeSequence with substring {}", t.value()); |
738 | |
|
739 | 0 | bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } }); |
740 | 0 | should_parse_repetition_symbol = true; |
741 | 0 | break; |
742 | 0 | } |
743 | | |
744 | 0 | if (match(TokenType::LeftBracket)) { |
745 | 0 | consume(); |
746 | |
|
747 | 0 | ByteCode sub_ops; |
748 | 0 | if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size()) |
749 | 0 | return set_error(Error::InvalidBracketContent); |
750 | | |
751 | 0 | bytecode.extend(move(sub_ops)); |
752 | |
|
753 | 0 | consume(TokenType::RightBracket, Error::MismatchingBracket); |
754 | 0 | should_parse_repetition_symbol = true; |
755 | 0 | break; |
756 | 0 | } |
757 | | |
758 | 0 | if (match(TokenType::RightBracket)) { |
759 | 0 | return set_error(Error::MismatchingBracket); |
760 | 0 | } |
761 | | |
762 | 0 | if (match(TokenType::RightCurly)) { |
763 | 0 | return set_error(Error::MismatchingBrace); |
764 | 0 | } |
765 | | |
766 | 0 | if (match(TokenType::Circumflex)) { |
767 | 0 | consume(); |
768 | 0 | bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin); |
769 | 0 | break; |
770 | 0 | } |
771 | | |
772 | 0 | if (match(TokenType::Dollar)) { |
773 | 0 | consume(); |
774 | 0 | bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd); |
775 | 0 | break; |
776 | 0 | } |
777 | | |
778 | 0 | if (match(TokenType::RightParen)) |
779 | 0 | return false; |
780 | | |
781 | 0 | if (match(TokenType::LeftParen)) { |
782 | 0 | enum GroupMode { |
783 | 0 | Normal, |
784 | 0 | Lookahead, |
785 | 0 | NegativeLookahead, |
786 | 0 | Lookbehind, |
787 | 0 | NegativeLookbehind, |
788 | 0 | } group_mode { Normal }; |
789 | 0 | consume(); |
790 | 0 | Optional<StringView> capture_group_name; |
791 | 0 | bool prevent_capture_group = false; |
792 | 0 | if (match(TokenType::Questionmark)) { |
793 | 0 | consume(); |
794 | |
|
795 | 0 | if (match(TokenType::Colon)) { |
796 | 0 | consume(); |
797 | 0 | prevent_capture_group = true; |
798 | 0 | } else if (consume("<")) { // named capturing group |
799 | |
|
800 | 0 | Token start_token = m_parser_state.current_token; |
801 | 0 | Token last_token = m_parser_state.current_token; |
802 | 0 | size_t capture_group_name_length = 0; |
803 | 0 | for (;;) { |
804 | 0 | if (!match_ordinary_characters()) |
805 | 0 | return set_error(Error::InvalidNameForCaptureGroup); |
806 | 0 | if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') { |
807 | 0 | consume(); |
808 | 0 | break; |
809 | 0 | } |
810 | 0 | ++capture_group_name_length; |
811 | 0 | last_token = consume(); |
812 | 0 | } |
813 | 0 | capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length); |
814 | 0 | ++m_parser_state.named_capture_groups_count; |
815 | |
|
816 | 0 | } else if (match(TokenType::EqualSign)) { // positive lookahead |
817 | 0 | consume(); |
818 | 0 | group_mode = Lookahead; |
819 | 0 | } else if (consume("!")) { // negative lookahead |
820 | 0 | group_mode = NegativeLookahead; |
821 | 0 | } else if (consume("<")) { |
822 | 0 | if (match(TokenType::EqualSign)) { // positive lookbehind |
823 | 0 | consume(); |
824 | 0 | group_mode = Lookbehind; |
825 | 0 | } |
826 | 0 | if (consume("!")) // negative lookbehind |
827 | 0 | group_mode = NegativeLookbehind; |
828 | 0 | } else { |
829 | 0 | return set_error(Error::InvalidRepetitionMarker); |
830 | 0 | } |
831 | 0 | } |
832 | | |
833 | 0 | auto current_capture_group = m_parser_state.capture_groups_count; |
834 | 0 | if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { |
835 | 0 | bytecode.insert_bytecode_group_capture_left(current_capture_group); |
836 | 0 | m_parser_state.capture_groups_count++; |
837 | 0 | } |
838 | |
|
839 | 0 | ByteCode capture_group_bytecode; |
840 | |
|
841 | 0 | if (!parse_root(capture_group_bytecode, length)) |
842 | 0 | return set_error(Error::InvalidPattern); |
843 | | |
844 | 0 | switch (group_mode) { |
845 | 0 | case Normal: |
846 | 0 | bytecode.extend(move(capture_group_bytecode)); |
847 | 0 | break; |
848 | 0 | case Lookahead: |
849 | 0 | bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::LookAhead, length); |
850 | 0 | break; |
851 | 0 | case NegativeLookahead: |
852 | 0 | bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::NegatedLookAhead, length); |
853 | 0 | break; |
854 | 0 | case Lookbehind: |
855 | 0 | bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::LookBehind, length); |
856 | 0 | break; |
857 | 0 | case NegativeLookbehind: |
858 | 0 | bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::NegatedLookBehind, length); |
859 | 0 | break; |
860 | 0 | } |
861 | | |
862 | 0 | consume(TokenType::RightParen, Error::MismatchingParen); |
863 | |
|
864 | 0 | if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { |
865 | 0 | if (capture_group_name.has_value()) |
866 | 0 | bytecode.insert_bytecode_group_capture_right(current_capture_group, capture_group_name.value()); |
867 | 0 | else |
868 | 0 | bytecode.insert_bytecode_group_capture_right(current_capture_group); |
869 | 0 | } |
870 | 0 | should_parse_repetition_symbol = true; |
871 | 0 | break; |
872 | 0 | } |
873 | | |
874 | 0 | return false; |
875 | 0 | } |
876 | | |
877 | 0 | if (match_repetition_symbol()) { |
878 | 0 | if (should_parse_repetition_symbol) |
879 | 0 | parse_repetition_symbol(bytecode, length); |
880 | 0 | else |
881 | 0 | return set_error(Error::InvalidRepetitionMarker); |
882 | 0 | } |
883 | | |
884 | 0 | stack.extend(move(bytecode)); |
885 | 0 | match_length_minimum += length; |
886 | |
|
887 | 0 | return true; |
888 | 0 | } |
889 | | |
890 | | bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum) |
891 | 0 | { |
892 | 0 | ByteCode bytecode_left; |
893 | 0 | size_t match_length_minimum_left { 0 }; |
894 | |
|
895 | 0 | if (match_repetition_symbol()) |
896 | 0 | return set_error(Error::InvalidRepetitionMarker); |
897 | | |
898 | 0 | for (;;) { |
899 | 0 | if (!parse_sub_expression(bytecode_left, match_length_minimum_left)) |
900 | 0 | break; |
901 | | |
902 | 0 | if (match(TokenType::Pipe)) { |
903 | 0 | consume(); |
904 | |
|
905 | 0 | ByteCode bytecode_right; |
906 | 0 | size_t match_length_minimum_right { 0 }; |
907 | |
|
908 | 0 | if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty()) |
909 | 0 | return set_error(Error::InvalidPattern); |
910 | | |
911 | 0 | ByteCode new_bytecode; |
912 | 0 | new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right)); |
913 | 0 | bytecode_left = move(new_bytecode); |
914 | 0 | match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left); |
915 | 0 | } |
916 | 0 | } |
917 | | |
918 | 0 | if (bytecode_left.is_empty()) |
919 | 0 | set_error(Error::EmptySubExpression); |
920 | |
|
921 | 0 | stack.extend(move(bytecode_left)); |
922 | 0 | match_length_minimum = match_length_minimum_left; |
923 | 0 | return !has_error(); |
924 | 0 | } |
925 | | |
926 | | // ============================= |
927 | | // ECMA262 Parser |
928 | | // ============================= |
929 | | |
930 | | bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum) |
931 | 0 | { |
932 | 0 | if (m_parser_state.regex_options.has_flag_set(AllFlags::Unicode)) { |
933 | 0 | return parse_pattern(stack, match_length_minimum, true, true); |
934 | 0 | } |
935 | | |
936 | 0 | ByteCode new_stack; |
937 | 0 | size_t new_match_length = 0; |
938 | 0 | auto res = parse_pattern(new_stack, new_match_length, false, false); |
939 | 0 | if (m_parser_state.named_capture_groups_count > 0) { |
940 | 0 | reset(); |
941 | 0 | return parse_pattern(stack, match_length_minimum, false, true); |
942 | 0 | } |
943 | | |
944 | 0 | if (!res) |
945 | 0 | return false; |
946 | | |
947 | 0 | stack.extend(new_stack); |
948 | 0 | match_length_minimum = new_match_length; |
949 | 0 | return res; |
950 | 0 | } |
951 | | |
952 | | bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) |
953 | 0 | { |
954 | 0 | return parse_disjunction(stack, match_length_minimum, unicode, named); |
955 | 0 | } |
956 | | |
957 | | bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) |
958 | 0 | { |
959 | 0 | size_t total_match_length_minimum = NumericLimits<size_t>::max(); |
960 | 0 | Vector<ByteCode> alternatives; |
961 | 0 | while (true) { |
962 | 0 | ByteCode alternative_stack; |
963 | 0 | size_t alternative_minimum_length = 0; |
964 | 0 | auto alt_ok = parse_alternative(alternative_stack, alternative_minimum_length, unicode, named); |
965 | 0 | if (!alt_ok) |
966 | 0 | return false; |
967 | | |
968 | 0 | alternatives.append(move(alternative_stack)); |
969 | 0 | total_match_length_minimum = min(alternative_minimum_length, total_match_length_minimum); |
970 | |
|
971 | 0 | if (!match(TokenType::Pipe)) |
972 | 0 | break; |
973 | 0 | consume(); |
974 | 0 | } |
975 | | |
976 | 0 | Optimizer::append_alternation(stack, alternatives.span()); |
977 | 0 | match_length_minimum = total_match_length_minimum; |
978 | 0 | return true; |
979 | 0 | } |
980 | | |
981 | | bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) |
982 | 0 | { |
983 | 0 | for (;;) { |
984 | 0 | if (match(TokenType::Eof)) |
985 | 0 | return true; |
986 | | |
987 | 0 | if (parse_term(stack, match_length_minimum, unicode, named)) |
988 | 0 | continue; |
989 | | |
990 | 0 | return !has_error(); |
991 | 0 | } |
992 | 0 | } |
993 | | |
994 | | bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) |
995 | 0 | { |
996 | 0 | if (parse_assertion(stack, match_length_minimum, unicode, named)) |
997 | 0 | return true; |
998 | | |
999 | 0 | ByteCode atom_stack; |
1000 | 0 | size_t minimum_atom_length = 0; |
1001 | 0 | auto parse_with_quantifier = [&] { |
1002 | 0 | bool did_parse_one = false; |
1003 | 0 | if (m_should_use_browser_extended_grammar) |
1004 | 0 | did_parse_one = parse_extended_atom(atom_stack, minimum_atom_length, named); |
1005 | |
|
1006 | 0 | if (!did_parse_one) |
1007 | 0 | did_parse_one = parse_atom(atom_stack, minimum_atom_length, unicode, named); |
1008 | |
|
1009 | 0 | if (!did_parse_one) |
1010 | 0 | return false; |
1011 | | |
1012 | 0 | VERIFY(did_parse_one); |
1013 | 0 | return parse_quantifier(atom_stack, minimum_atom_length, unicode, named); |
1014 | 0 | }; |
1015 | |
|
1016 | 0 | if (!parse_with_quantifier()) |
1017 | 0 | return false; |
1018 | | |
1019 | 0 | stack.extend(move(atom_stack)); |
1020 | 0 | match_length_minimum += minimum_atom_length; |
1021 | 0 | return true; |
1022 | 0 | } |
1023 | | |
1024 | | bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named) |
1025 | 0 | { |
1026 | 0 | if (match(TokenType::Circumflex)) { |
1027 | 0 | consume(); |
1028 | 0 | stack.empend((ByteCodeValueType)OpCodeId::CheckBegin); |
1029 | 0 | return true; |
1030 | 0 | } |
1031 | | |
1032 | 0 | if (match(TokenType::Dollar)) { |
1033 | 0 | consume(); |
1034 | 0 | stack.empend((ByteCodeValueType)OpCodeId::CheckEnd); |
1035 | 0 | return true; |
1036 | 0 | } |
1037 | | |
1038 | 0 | if (try_skip("\\b")) { |
1039 | 0 | stack.insert_bytecode_check_boundary(BoundaryCheckType::Word); |
1040 | 0 | return true; |
1041 | 0 | } |
1042 | | |
1043 | 0 | if (try_skip("\\B")) { |
1044 | 0 | stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord); |
1045 | 0 | return true; |
1046 | 0 | } |
1047 | | |
1048 | 0 | if (match(TokenType::LeftParen)) { |
1049 | 0 | if (!try_skip("(?")) |
1050 | 0 | return false; |
1051 | | |
1052 | 0 | if (done()) { |
1053 | 0 | set_error(Error::InvalidCaptureGroup); |
1054 | 0 | return false; |
1055 | 0 | } |
1056 | | |
1057 | 0 | ByteCode assertion_stack; |
1058 | 0 | size_t length_dummy = 0; |
1059 | |
|
1060 | 0 | bool should_parse_forward_assertion = m_should_use_browser_extended_grammar ? unicode : true; |
1061 | 0 | if (should_parse_forward_assertion && try_skip("=")) { |
1062 | 0 | if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named)) |
1063 | 0 | return false; |
1064 | 0 | stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead); |
1065 | 0 | return true; |
1066 | 0 | } |
1067 | 0 | if (should_parse_forward_assertion && try_skip("!")) { |
1068 | 0 | enter_capture_group_scope(); |
1069 | 0 | ScopeGuard quit_scope { |
1070 | 0 | [this] { |
1071 | 0 | exit_capture_group_scope(); |
1072 | 0 | } |
1073 | 0 | }; |
1074 | 0 | if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named)) |
1075 | 0 | return false; |
1076 | 0 | stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead); |
1077 | 0 | clear_all_capture_groups_in_scope(stack); |
1078 | |
|
1079 | 0 | return true; |
1080 | 0 | } |
1081 | 0 | if (m_should_use_browser_extended_grammar) { |
1082 | 0 | if (!unicode) { |
1083 | 0 | if (parse_quantifiable_assertion(assertion_stack, match_length_minimum, named)) { |
1084 | 0 | if (!parse_quantifier(assertion_stack, match_length_minimum, unicode, named)) |
1085 | 0 | return false; |
1086 | | |
1087 | 0 | stack.extend(move(assertion_stack)); |
1088 | 0 | return true; |
1089 | 0 | } |
1090 | 0 | } |
1091 | 0 | } |
1092 | 0 | if (try_skip("<=")) { |
1093 | 0 | if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named)) |
1094 | 0 | return false; |
1095 | | // FIXME: Somehow ensure that this assertion regexp has a fixed length. |
1096 | 0 | stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy); |
1097 | 0 | return true; |
1098 | 0 | } |
1099 | 0 | if (try_skip("<!")) { |
1100 | 0 | enter_capture_group_scope(); |
1101 | 0 | ScopeGuard quit_scope { |
1102 | 0 | [this] { |
1103 | 0 | exit_capture_group_scope(); |
1104 | 0 | } |
1105 | 0 | }; |
1106 | 0 | if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named)) |
1107 | 0 | return false; |
1108 | 0 | stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy); |
1109 | 0 | clear_all_capture_groups_in_scope(stack); |
1110 | 0 | return true; |
1111 | 0 | } |
1112 | | |
1113 | | // If none of these matched, put the '(?' back. |
1114 | 0 | m_parser_state.lexer.back(3); |
1115 | 0 | m_parser_state.current_token = m_parser_state.lexer.next(); |
1116 | 0 | return false; |
1117 | 0 | } |
1118 | | |
1119 | 0 | return false; |
1120 | 0 | } |
1121 | | |
1122 | | bool ECMA262Parser::parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named) |
1123 | 0 | { |
1124 | 0 | auto disjunction_ok = parse_disjunction(bytecode_stack, length, unicode, named); |
1125 | 0 | if (!disjunction_ok) |
1126 | 0 | return false; |
1127 | 0 | consume(TokenType::RightParen, Error::MismatchingParen); |
1128 | 0 | return true; |
1129 | 0 | } |
1130 | | |
1131 | | bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool named) |
1132 | 0 | { |
1133 | 0 | VERIFY(m_should_use_browser_extended_grammar); |
1134 | 0 | ByteCode assertion_stack; |
1135 | 0 | size_t match_length_minimum = 0; |
1136 | |
|
1137 | 0 | if (try_skip("=")) { |
1138 | 0 | if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named)) |
1139 | 0 | return false; |
1140 | | |
1141 | 0 | stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead); |
1142 | 0 | return true; |
1143 | 0 | } |
1144 | 0 | if (try_skip("!")) { |
1145 | 0 | enter_capture_group_scope(); |
1146 | 0 | ScopeGuard quit_scope { |
1147 | 0 | [this] { |
1148 | 0 | exit_capture_group_scope(); |
1149 | 0 | } |
1150 | 0 | }; |
1151 | 0 | if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named)) |
1152 | 0 | return false; |
1153 | | |
1154 | 0 | stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead); |
1155 | 0 | clear_all_capture_groups_in_scope(stack); |
1156 | 0 | return true; |
1157 | 0 | } |
1158 | | |
1159 | 0 | return false; |
1160 | 0 | } |
1161 | | |
1162 | | StringView ECMA262Parser::read_digits_as_string(ReadDigitsInitialZeroState initial_zero, bool hex, int max_count, int min_count) |
1163 | 0 | { |
1164 | 0 | if (!match(TokenType::Char)) |
1165 | 0 | return {}; |
1166 | | |
1167 | 0 | if (initial_zero == ReadDigitsInitialZeroState::Disallow && m_parser_state.current_token.value() == "0") |
1168 | 0 | return {}; |
1169 | | |
1170 | 0 | int count = 0; |
1171 | 0 | size_t offset = 0; |
1172 | 0 | auto start_token = m_parser_state.current_token; |
1173 | 0 | while (match(TokenType::Char)) { |
1174 | 0 | auto const c = m_parser_state.current_token.value(); |
1175 | |
|
1176 | 0 | if (max_count > 0 && count >= max_count) |
1177 | 0 | break; |
1178 | | |
1179 | 0 | if (hex && !AK::StringUtils::convert_to_uint_from_hex(c).has_value()) |
1180 | 0 | break; |
1181 | 0 | if (!hex && !c.to_uint().has_value()) |
1182 | 0 | break; |
1183 | | |
1184 | 0 | offset += consume().value().length(); |
1185 | 0 | ++count; |
1186 | 0 | } |
1187 | |
|
1188 | 0 | if (count < min_count) |
1189 | 0 | return {}; |
1190 | | |
1191 | 0 | return StringView { start_token.value().characters_without_null_termination(), offset }; |
1192 | 0 | } |
1193 | | |
1194 | | Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, bool hex, int max_count, int min_count) |
1195 | 0 | { |
1196 | 0 | auto str = read_digits_as_string(initial_zero, hex, max_count, min_count); |
1197 | 0 | if (str.is_empty()) |
1198 | 0 | return {}; |
1199 | 0 | if (hex) |
1200 | 0 | return AK::StringUtils::convert_to_uint_from_hex(str); |
1201 | 0 | return str.to_uint(); |
1202 | 0 | } |
1203 | | |
1204 | | bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool) |
1205 | 0 | { |
1206 | 0 | enum class Repetition { |
1207 | 0 | OneOrMore, |
1208 | 0 | ZeroOrMore, |
1209 | 0 | Optional, |
1210 | 0 | Explicit, |
1211 | 0 | None, |
1212 | 0 | } repetition_mark { Repetition::None }; |
1213 | |
|
1214 | 0 | bool ungreedy = false; |
1215 | 0 | Optional<u64> repeat_min, repeat_max; |
1216 | |
|
1217 | 0 | if (match(TokenType::Asterisk)) { |
1218 | 0 | consume(); |
1219 | 0 | repetition_mark = Repetition::ZeroOrMore; |
1220 | 0 | } else if (match(TokenType::Plus)) { |
1221 | 0 | consume(); |
1222 | 0 | repetition_mark = Repetition::OneOrMore; |
1223 | 0 | } else if (match(TokenType::Questionmark)) { |
1224 | 0 | consume(); |
1225 | 0 | repetition_mark = Repetition::Optional; |
1226 | 0 | } else if (match(TokenType::LeftCurly)) { |
1227 | 0 | repetition_mark = Repetition::Explicit; |
1228 | 0 | if (!parse_interval_quantifier(repeat_min, repeat_max)) { |
1229 | 0 | if (unicode) { |
1230 | | // Invalid interval quantifiers are disallowed in Unicode mod - they must be escaped with '\{'. |
1231 | 0 | set_error(Error::InvalidPattern); |
1232 | 0 | } |
1233 | 0 | return !has_error(); |
1234 | 0 | } |
1235 | 0 | } else { |
1236 | 0 | return true; |
1237 | 0 | } |
1238 | | |
1239 | 0 | if (match(TokenType::Questionmark)) { |
1240 | 0 | consume(); |
1241 | 0 | ungreedy = true; |
1242 | 0 | } |
1243 | |
|
1244 | 0 | switch (repetition_mark) { |
1245 | 0 | case Repetition::OneOrMore: |
1246 | 0 | ByteCode::transform_bytecode_repetition_min_one(stack, !ungreedy); |
1247 | 0 | break; |
1248 | 0 | case Repetition::ZeroOrMore: |
1249 | 0 | ByteCode::transform_bytecode_repetition_any(stack, !ungreedy); |
1250 | 0 | match_length_minimum = 0; |
1251 | 0 | break; |
1252 | 0 | case Repetition::Optional: |
1253 | 0 | ByteCode::transform_bytecode_repetition_zero_or_one(stack, !ungreedy); |
1254 | 0 | match_length_minimum = 0; |
1255 | 0 | break; |
1256 | 0 | case Repetition::Explicit: { |
1257 | 0 | auto min_repetition_mark_id = m_parser_state.repetition_mark_count++; |
1258 | 0 | auto max_repetition_mark_id = m_parser_state.repetition_mark_count++; |
1259 | 0 | ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, min_repetition_mark_id, max_repetition_mark_id, !ungreedy); |
1260 | 0 | match_length_minimum *= repeat_min.value(); |
1261 | 0 | break; |
1262 | 0 | } |
1263 | 0 | case Repetition::None: |
1264 | 0 | VERIFY_NOT_REACHED(); |
1265 | 0 | } |
1266 | | |
1267 | 0 | return true; |
1268 | 0 | } |
1269 | | |
1270 | | bool ECMA262Parser::parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max) |
1271 | 0 | { |
1272 | 0 | VERIFY(match(TokenType::LeftCurly)); |
1273 | 0 | consume(); |
1274 | 0 | auto chars_consumed = 1; |
1275 | |
|
1276 | 0 | auto low_bound_string = read_digits_as_string(); |
1277 | 0 | chars_consumed += low_bound_string.length(); |
1278 | |
|
1279 | 0 | auto low_bound = low_bound_string.to_uint<u64>(); |
1280 | |
|
1281 | 0 | if (!low_bound.has_value()) { |
1282 | 0 | if (!m_should_use_browser_extended_grammar && done()) |
1283 | 0 | return set_error(Error::MismatchingBrace); |
1284 | | |
1285 | 0 | back(chars_consumed + !done()); |
1286 | 0 | return false; |
1287 | 0 | } |
1288 | | |
1289 | 0 | repeat_min = low_bound.value(); |
1290 | |
|
1291 | 0 | if (match(TokenType::Comma)) { |
1292 | 0 | consume(); |
1293 | 0 | ++chars_consumed; |
1294 | 0 | auto high_bound_string = read_digits_as_string(); |
1295 | 0 | auto high_bound = high_bound_string.to_uint<u64>(); |
1296 | 0 | if (high_bound.has_value()) { |
1297 | 0 | repeat_max = high_bound.value(); |
1298 | 0 | chars_consumed += high_bound_string.length(); |
1299 | 0 | } |
1300 | 0 | } else { |
1301 | 0 | repeat_max = repeat_min; |
1302 | 0 | } |
1303 | |
|
1304 | 0 | if (!match(TokenType::RightCurly)) { |
1305 | 0 | if (!m_should_use_browser_extended_grammar && done()) |
1306 | 0 | return set_error(Error::MismatchingBrace); |
1307 | | |
1308 | 0 | back(chars_consumed + !done()); |
1309 | 0 | return false; |
1310 | 0 | } |
1311 | | |
1312 | 0 | consume(); |
1313 | 0 | ++chars_consumed; |
1314 | |
|
1315 | 0 | if (repeat_max.has_value()) { |
1316 | 0 | if (repeat_min.value() > repeat_max.value()) |
1317 | 0 | set_error(Error::InvalidBraceContent); |
1318 | 0 | } |
1319 | |
|
1320 | 0 | if ((*repeat_min > s_ecma262_maximum_repetition_count) || (repeat_max.has_value() && (*repeat_max > s_ecma262_maximum_repetition_count))) |
1321 | 0 | return set_error(Error::InvalidBraceContent); |
1322 | | |
1323 | 0 | return true; |
1324 | 0 | } |
1325 | | |
1326 | | bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) |
1327 | 0 | { |
1328 | 0 | if (match(TokenType::EscapeSequence)) { |
1329 | | // Also part of AtomEscape. |
1330 | 0 | auto token = consume(); |
1331 | 0 | match_length_minimum += 1; |
1332 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[1] } }); |
1333 | 0 | return true; |
1334 | 0 | } |
1335 | 0 | if (try_skip("\\")) { |
1336 | | // AtomEscape. |
1337 | 0 | return parse_atom_escape(stack, match_length_minimum, unicode, named); |
1338 | 0 | } |
1339 | | |
1340 | 0 | if (match(TokenType::LeftBracket)) { |
1341 | | // Character class. |
1342 | 0 | return parse_character_class(stack, match_length_minimum, unicode, named); |
1343 | 0 | } |
1344 | | |
1345 | 0 | if (match(TokenType::LeftParen)) { |
1346 | | // Non-capturing group, or a capture group. |
1347 | 0 | return parse_capture_group(stack, match_length_minimum, unicode, named); |
1348 | 0 | } |
1349 | | |
1350 | 0 | if (match(TokenType::Period)) { |
1351 | 0 | consume(); |
1352 | 0 | match_length_minimum += 1; |
1353 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } }); |
1354 | 0 | return true; |
1355 | 0 | } |
1356 | | |
1357 | 0 | if (match(TokenType::Circumflex) || match(TokenType::Dollar) || match(TokenType::RightParen) |
1358 | 0 | || match(TokenType::Pipe) || match(TokenType::Plus) || match(TokenType::Asterisk) |
1359 | 0 | || match(TokenType::Questionmark)) { |
1360 | |
|
1361 | 0 | return false; |
1362 | 0 | } |
1363 | | |
1364 | 0 | if (match(TokenType::RightBracket) || match(TokenType::RightCurly) || match(TokenType::LeftCurly)) { |
1365 | 0 | if (unicode) |
1366 | 0 | return set_error(Error::InvalidPattern); |
1367 | | |
1368 | 0 | if (m_should_use_browser_extended_grammar) { |
1369 | 0 | auto token = consume(); |
1370 | 0 | match_length_minimum += 1; |
1371 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } }); |
1372 | 0 | return true; |
1373 | 0 | } |
1374 | 0 | return false; |
1375 | 0 | } |
1376 | | |
1377 | 0 | if (match_ordinary_characters()) { |
1378 | 0 | auto token = consume().value(); |
1379 | 0 | match_length_minimum += 1; |
1380 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token[0] } }); |
1381 | 0 | return true; |
1382 | 0 | } |
1383 | | |
1384 | 0 | set_error(Error::InvalidPattern); |
1385 | 0 | return false; |
1386 | 0 | } |
1387 | | |
1388 | | bool ECMA262Parser::parse_extended_atom(ByteCode&, size_t&, bool) |
1389 | 0 | { |
1390 | | // Note: This includes only rules *not* present in parse_atom() |
1391 | 0 | VERIFY(m_should_use_browser_extended_grammar); |
1392 | | |
1393 | 0 | return parse_invalid_braced_quantifier(); // FAIL FAIL FAIL |
1394 | 0 | } |
1395 | | |
1396 | | bool ECMA262Parser::parse_invalid_braced_quantifier() |
1397 | 0 | { |
1398 | 0 | if (!match(TokenType::LeftCurly)) |
1399 | 0 | return false; |
1400 | 0 | consume(); |
1401 | 0 | size_t chars_consumed = 1; |
1402 | 0 | auto low_bound = read_digits_as_string(); |
1403 | 0 | StringView high_bound; |
1404 | |
|
1405 | 0 | if (low_bound.is_empty()) { |
1406 | 0 | back(chars_consumed + !done()); |
1407 | 0 | return false; |
1408 | 0 | } |
1409 | 0 | chars_consumed += low_bound.length(); |
1410 | 0 | if (match(TokenType::Comma)) { |
1411 | 0 | consume(); |
1412 | 0 | ++chars_consumed; |
1413 | |
|
1414 | 0 | high_bound = read_digits_as_string(); |
1415 | 0 | chars_consumed += high_bound.length(); |
1416 | 0 | } |
1417 | |
|
1418 | 0 | if (!match(TokenType::RightCurly)) { |
1419 | 0 | back(chars_consumed + !done()); |
1420 | 0 | return false; |
1421 | 0 | } |
1422 | | |
1423 | 0 | consume(); |
1424 | 0 | set_error(Error::InvalidPattern); |
1425 | 0 | return true; |
1426 | 0 | } |
1427 | | |
1428 | | bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) |
1429 | 0 | { |
1430 | 0 | if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) { |
1431 | 0 | if (auto escape = escape_str.to_uint(); escape.has_value()) { |
1432 | | // See if this is a "back"-reference (we've already parsed the group it refers to) |
1433 | 0 | auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value()); |
1434 | 0 | if (maybe_length.has_value()) { |
1435 | 0 | match_length_minimum += maybe_length.value(); |
1436 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } }); |
1437 | 0 | return true; |
1438 | 0 | } |
1439 | | // It's not a pattern seen before, so we have to see if it's a valid reference to a future group. |
1440 | 0 | if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) { |
1441 | | // This refers to a future group, and it will _always_ be matching an empty string |
1442 | | // So just match nothing and move on. |
1443 | 0 | return true; |
1444 | 0 | } |
1445 | 0 | if (!m_should_use_browser_extended_grammar) { |
1446 | 0 | set_error(Error::InvalidNumber); |
1447 | 0 | return false; |
1448 | 0 | } |
1449 | 0 | } |
1450 | | |
1451 | | // If not, put the characters back. |
1452 | 0 | back(escape_str.length()); |
1453 | 0 | } |
1454 | | |
1455 | | // CharacterEscape > ControlEscape |
1456 | 0 | if (try_skip("f")) { |
1457 | 0 | match_length_minimum += 1; |
1458 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } }); |
1459 | 0 | return true; |
1460 | 0 | } |
1461 | | |
1462 | 0 | if (try_skip("n")) { |
1463 | 0 | match_length_minimum += 1; |
1464 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } }); |
1465 | 0 | return true; |
1466 | 0 | } |
1467 | | |
1468 | 0 | if (try_skip("r")) { |
1469 | 0 | match_length_minimum += 1; |
1470 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } }); |
1471 | 0 | return true; |
1472 | 0 | } |
1473 | | |
1474 | 0 | if (try_skip("t")) { |
1475 | 0 | match_length_minimum += 1; |
1476 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } }); |
1477 | 0 | return true; |
1478 | 0 | } |
1479 | | |
1480 | 0 | if (try_skip("v")) { |
1481 | 0 | match_length_minimum += 1; |
1482 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } }); |
1483 | 0 | return true; |
1484 | 0 | } |
1485 | | |
1486 | | // CharacterEscape > ControlLetter |
1487 | 0 | if (try_skip("c")) { |
1488 | 0 | for (auto c : s_alphabetic_characters) { |
1489 | 0 | if (try_skip({ &c, 1 })) { |
1490 | 0 | match_length_minimum += 1; |
1491 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } }); |
1492 | 0 | return true; |
1493 | 0 | } |
1494 | 0 | } |
1495 | | |
1496 | 0 | if (unicode) { |
1497 | 0 | set_error(Error::InvalidPattern); |
1498 | 0 | return false; |
1499 | 0 | } |
1500 | | |
1501 | 0 | if (m_should_use_browser_extended_grammar) { |
1502 | 0 | back(1 + !done()); |
1503 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } }); |
1504 | 0 | match_length_minimum += 1; |
1505 | 0 | return true; |
1506 | 0 | } |
1507 | | |
1508 | | // Allow '\c' in non-unicode mode, just matches 'c'. |
1509 | 0 | match_length_minimum += 1; |
1510 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } }); |
1511 | 0 | return true; |
1512 | 0 | } |
1513 | | |
1514 | | // '\0' |
1515 | 0 | if (try_skip("0")) { |
1516 | 0 | if (!lookahead_any(s_decimal_characters)) { |
1517 | 0 | match_length_minimum += 1; |
1518 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } }); |
1519 | 0 | return true; |
1520 | 0 | } |
1521 | | |
1522 | 0 | back(); |
1523 | 0 | } |
1524 | | |
1525 | | // LegacyOctalEscapeSequence |
1526 | 0 | if (m_should_use_browser_extended_grammar) { |
1527 | 0 | if (!unicode) { |
1528 | 0 | if (auto escape = parse_legacy_octal_escape(); escape.has_value()) { |
1529 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } }); |
1530 | 0 | match_length_minimum += 1; |
1531 | 0 | return true; |
1532 | 0 | } |
1533 | 0 | } |
1534 | 0 | } |
1535 | | |
1536 | | // HexEscape |
1537 | 0 | if (try_skip("x")) { |
1538 | 0 | if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) { |
1539 | 0 | match_length_minimum += 1; |
1540 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } }); |
1541 | 0 | return true; |
1542 | 0 | } |
1543 | 0 | if (!unicode) { |
1544 | | // '\x' is allowed in non-unicode mode, just matches 'x'. |
1545 | 0 | match_length_minimum += 1; |
1546 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } }); |
1547 | 0 | return true; |
1548 | 0 | } |
1549 | | |
1550 | 0 | set_error(Error::InvalidPattern); |
1551 | 0 | return false; |
1552 | 0 | } |
1553 | | |
1554 | 0 | if (try_skip("u")) { |
1555 | 0 | if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) { |
1556 | 0 | match_length_minimum += 1; |
1557 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); |
1558 | 0 | return true; |
1559 | 0 | } |
1560 | | |
1561 | 0 | return false; |
1562 | 0 | } |
1563 | | |
1564 | | // IdentityEscape |
1565 | 0 | for (auto ch : identity_escape_characters(unicode, m_should_use_browser_extended_grammar)) { |
1566 | 0 | if (try_skip({ &ch, 1 })) { |
1567 | 0 | match_length_minimum += 1; |
1568 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } }); |
1569 | 0 | return true; |
1570 | 0 | } |
1571 | 0 | } |
1572 | | |
1573 | 0 | if (unicode) { |
1574 | 0 | if (try_skip("/")) { |
1575 | 0 | match_length_minimum += 1; |
1576 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } }); |
1577 | 0 | return true; |
1578 | 0 | } |
1579 | 0 | } |
1580 | | |
1581 | 0 | if (named && try_skip("k")) { |
1582 | 0 | auto name = read_capture_group_specifier(true); |
1583 | 0 | if (name.is_empty()) { |
1584 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
1585 | 0 | return false; |
1586 | 0 | } |
1587 | 0 | auto maybe_capture_group = m_parser_state.named_capture_groups.get(name); |
1588 | 0 | if (!maybe_capture_group.has_value()) { |
1589 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
1590 | 0 | return false; |
1591 | 0 | } |
1592 | 0 | match_length_minimum += maybe_capture_group->minimum_length; |
1593 | |
|
1594 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group->group_index } }); |
1595 | 0 | return true; |
1596 | 0 | } |
1597 | | |
1598 | 0 | if (unicode) { |
1599 | 0 | PropertyEscape property {}; |
1600 | 0 | bool negated = false; |
1601 | |
|
1602 | 0 | if (parse_unicode_property_escape(property, negated)) { |
1603 | 0 | Vector<CompareTypeAndValuePair> compares; |
1604 | 0 | if (negated) |
1605 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); |
1606 | 0 | property.visit( |
1607 | 0 | [&](Unicode::Property property) { |
1608 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property }); |
1609 | 0 | }, |
1610 | 0 | [&](Unicode::GeneralCategory general_category) { |
1611 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); |
1612 | 0 | }, |
1613 | 0 | [&](Script script) { |
1614 | 0 | if (script.is_extension) |
1615 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script }); |
1616 | 0 | else |
1617 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script }); |
1618 | 0 | }, |
1619 | 0 | [](Empty&) { VERIFY_NOT_REACHED(); }); |
1620 | 0 | stack.insert_bytecode_compare_values(move(compares)); |
1621 | 0 | match_length_minimum += 1; |
1622 | 0 | return true; |
1623 | 0 | } |
1624 | 0 | } |
1625 | | |
1626 | 0 | if (done()) |
1627 | 0 | return set_error(Error::InvalidTrailingEscape); |
1628 | | |
1629 | 0 | bool negate = false; |
1630 | 0 | auto ch = parse_character_class_escape(negate); |
1631 | 0 | if (!ch.has_value()) { |
1632 | 0 | if (!unicode) { |
1633 | | // Allow all SourceCharacter's as escapes here. |
1634 | 0 | auto token = consume(); |
1635 | 0 | match_length_minimum += 1; |
1636 | 0 | stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } }); |
1637 | 0 | return true; |
1638 | 0 | } |
1639 | | |
1640 | 0 | set_error(Error::InvalidCharacterClass); |
1641 | 0 | return false; |
1642 | 0 | } |
1643 | | |
1644 | 0 | Vector<CompareTypeAndValuePair> compares; |
1645 | 0 | if (negate) |
1646 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); |
1647 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)ch.value() }); |
1648 | 0 | match_length_minimum += 1; |
1649 | 0 | stack.insert_bytecode_compare_values(move(compares)); |
1650 | 0 | return true; |
1651 | 0 | } |
1652 | | Optional<u8> ECMA262Parser::parse_legacy_octal_escape() |
1653 | 0 | { |
1654 | 0 | constexpr auto all_octal_digits = "01234567"; |
1655 | 0 | auto read_octal_digit = [&](auto start, auto end, bool should_ensure_no_following_octal_digit) -> Optional<u8> { |
1656 | 0 | for (char c = '0' + start; c <= '0' + end; ++c) { |
1657 | 0 | if (try_skip({ &c, 1 })) { |
1658 | 0 | if (!should_ensure_no_following_octal_digit || !lookahead_any(all_octal_digits)) |
1659 | 0 | return c - '0'; |
1660 | 0 | back(2); |
1661 | 0 | return {}; |
1662 | 0 | } |
1663 | 0 | } |
1664 | 0 | return {}; |
1665 | 0 | }; |
1666 | | |
1667 | | // OctalDigit(1) |
1668 | 0 | if (auto digit = read_octal_digit(0, 7, true); digit.has_value()) { |
1669 | 0 | return digit.value(); |
1670 | 0 | } |
1671 | | |
1672 | | // OctalDigit(2) |
1673 | 0 | if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) { |
1674 | 0 | if (auto right_digit = read_octal_digit(0, 7, true); right_digit.has_value()) { |
1675 | 0 | return left_digit.value() * 8 + right_digit.value(); |
1676 | 0 | } |
1677 | | |
1678 | 0 | back(2); |
1679 | 0 | } |
1680 | | |
1681 | | // OctalDigit(2) |
1682 | 0 | if (auto left_digit = read_octal_digit(4, 7, false); left_digit.has_value()) { |
1683 | 0 | if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) { |
1684 | 0 | return left_digit.value() * 8 + right_digit.value(); |
1685 | 0 | } |
1686 | | |
1687 | 0 | back(2); |
1688 | 0 | } |
1689 | | |
1690 | | // OctalDigit(3) |
1691 | 0 | if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) { |
1692 | 0 | size_t chars_consumed = 1; |
1693 | 0 | if (auto mid_digit = read_octal_digit(0, 7, false); mid_digit.has_value()) { |
1694 | 0 | ++chars_consumed; |
1695 | 0 | if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) { |
1696 | 0 | return left_digit.value() * 64 + mid_digit.value() * 8 + right_digit.value(); |
1697 | 0 | } |
1698 | 0 | } |
1699 | | |
1700 | 0 | back(chars_consumed); |
1701 | 0 | } |
1702 | | |
1703 | 0 | return {}; |
1704 | 0 | } |
1705 | | |
1706 | | Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash) |
1707 | 0 | { |
1708 | 0 | if (expect_backslash && !try_skip("\\")) |
1709 | 0 | return {}; |
1710 | | |
1711 | | // CharacterClassEscape |
1712 | 0 | CharClass ch_class; |
1713 | 0 | if (try_skip("d")) { |
1714 | 0 | ch_class = CharClass::Digit; |
1715 | 0 | } else if (try_skip("D")) { |
1716 | 0 | ch_class = CharClass::Digit; |
1717 | 0 | negate = true; |
1718 | 0 | } else if (try_skip("s")) { |
1719 | 0 | ch_class = CharClass::Space; |
1720 | 0 | } else if (try_skip("S")) { |
1721 | 0 | ch_class = CharClass::Space; |
1722 | 0 | negate = true; |
1723 | 0 | } else if (try_skip("w")) { |
1724 | 0 | ch_class = CharClass::Word; |
1725 | 0 | } else if (try_skip("W")) { |
1726 | 0 | ch_class = CharClass::Word; |
1727 | 0 | negate = true; |
1728 | 0 | } else { |
1729 | 0 | return {}; |
1730 | 0 | } |
1731 | | |
1732 | 0 | return ch_class; |
1733 | 0 | } |
1734 | | |
1735 | | bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool) |
1736 | 0 | { |
1737 | 0 | consume(TokenType::LeftBracket, Error::InvalidPattern); |
1738 | |
|
1739 | 0 | Vector<CompareTypeAndValuePair> compares; |
1740 | |
|
1741 | 0 | if (match(TokenType::Circumflex)) { |
1742 | | // Negated charclass |
1743 | 0 | consume(); |
1744 | 0 | compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); |
1745 | 0 | } |
1746 | |
|
1747 | 0 | if (match(TokenType::RightBracket)) { |
1748 | 0 | consume(); |
1749 | | // Should only have at most an 'Inverse' |
1750 | 0 | VERIFY(compares.size() <= 1); |
1751 | 0 | stack.insert_bytecode_compare_values(move(compares)); |
1752 | 0 | return true; |
1753 | 0 | } |
1754 | | |
1755 | 0 | if (!parse_nonempty_class_ranges(compares, unicode)) |
1756 | 0 | return false; |
1757 | | |
1758 | 0 | match_length_minimum += 1; |
1759 | 0 | stack.insert_bytecode_compare_values(move(compares)); |
1760 | 0 | return true; |
1761 | 0 | } |
1762 | | |
1763 | | struct CharClassRangeElement { |
1764 | | union { |
1765 | | CharClass character_class; |
1766 | | u32 code_point { 0 }; |
1767 | | Unicode::Property property; |
1768 | | Unicode::GeneralCategory general_category; |
1769 | | Unicode::Script script; |
1770 | | }; |
1771 | | |
1772 | | bool is_negated { false }; |
1773 | | bool is_character_class { false }; |
1774 | | bool is_property { false }; |
1775 | | bool is_general_category { false }; |
1776 | | bool is_script { false }; |
1777 | | bool is_script_extension { false }; |
1778 | | }; |
1779 | | |
1780 | | bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode) |
1781 | 0 | { |
1782 | 0 | auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> { |
1783 | 0 | if (match(TokenType::EscapeSequence)) { |
1784 | 0 | auto token = consume().value(); |
1785 | 0 | return { CharClassRangeElement { .code_point = (u32)token[1], .is_character_class = false } }; |
1786 | 0 | } |
1787 | | |
1788 | 0 | if (try_skip("\\")) { |
1789 | 0 | if (done()) { |
1790 | 0 | set_error(Error::InvalidTrailingEscape); |
1791 | 0 | return {}; |
1792 | 0 | } |
1793 | | |
1794 | 0 | if (try_skip("f")) |
1795 | 0 | return { CharClassRangeElement { .code_point = '\f', .is_character_class = false } }; |
1796 | 0 | if (try_skip("n")) |
1797 | 0 | return { CharClassRangeElement { .code_point = '\n', .is_character_class = false } }; |
1798 | 0 | if (try_skip("r")) |
1799 | 0 | return { CharClassRangeElement { .code_point = '\r', .is_character_class = false } }; |
1800 | 0 | if (try_skip("t")) |
1801 | 0 | return { CharClassRangeElement { .code_point = '\t', .is_character_class = false } }; |
1802 | 0 | if (try_skip("v")) |
1803 | 0 | return { CharClassRangeElement { .code_point = '\v', .is_character_class = false } }; |
1804 | 0 | if (try_skip("b")) |
1805 | 0 | return { CharClassRangeElement { .code_point = '\b', .is_character_class = false } }; |
1806 | 0 | if (try_skip("/")) |
1807 | 0 | return { CharClassRangeElement { .code_point = '/', .is_character_class = false } }; |
1808 | | |
1809 | | // CharacterEscape > ControlLetter |
1810 | 0 | if (try_skip("c")) { |
1811 | 0 | for (auto c : s_alphabetic_characters) { |
1812 | 0 | if (try_skip({ &c, 1 })) { |
1813 | 0 | return { CharClassRangeElement { .code_point = (u32)(c % 32), .is_character_class = false } }; |
1814 | 0 | } |
1815 | 0 | } |
1816 | | |
1817 | 0 | if (unicode) { |
1818 | 0 | set_error(Error::InvalidPattern); |
1819 | 0 | return {}; |
1820 | 0 | } |
1821 | | |
1822 | 0 | if (m_should_use_browser_extended_grammar) { |
1823 | 0 | for (auto c = '0'; c <= '9'; ++c) { |
1824 | 0 | if (try_skip({ &c, 1 })) |
1825 | 0 | return { CharClassRangeElement { .code_point = (u32)(c % 32), .is_character_class = false } }; |
1826 | 0 | } |
1827 | 0 | if (try_skip("_")) |
1828 | 0 | return { CharClassRangeElement { .code_point = (u32)('_' % 32), .is_character_class = false } }; |
1829 | | |
1830 | 0 | back(1 + !done()); |
1831 | 0 | return { CharClassRangeElement { .code_point = '\\', .is_character_class = false } }; |
1832 | 0 | } |
1833 | 0 | } |
1834 | | |
1835 | | // '\0' |
1836 | 0 | if (try_skip("0")) { |
1837 | 0 | if (!lookahead_any(s_decimal_characters)) |
1838 | 0 | return { CharClassRangeElement { .code_point = 0, .is_character_class = false } }; |
1839 | 0 | back(); |
1840 | 0 | } |
1841 | | |
1842 | | // LegacyOctalEscapeSequence |
1843 | 0 | if (m_should_use_browser_extended_grammar && !unicode) { |
1844 | 0 | if (auto escape = parse_legacy_octal_escape(); escape.has_value()) |
1845 | 0 | return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } }; |
1846 | 0 | } |
1847 | | |
1848 | | // HexEscape |
1849 | 0 | if (try_skip("x")) { |
1850 | 0 | if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) { |
1851 | 0 | return { CharClassRangeElement { .code_point = hex_escape.value(), .is_character_class = false } }; |
1852 | 0 | } else if (!unicode) { |
1853 | | // '\x' is allowed in non-unicode mode, just matches 'x'. |
1854 | 0 | return { CharClassRangeElement { .code_point = 'x', .is_character_class = false } }; |
1855 | 0 | } else { |
1856 | 0 | set_error(Error::InvalidPattern); |
1857 | 0 | return {}; |
1858 | 0 | } |
1859 | 0 | } |
1860 | | |
1861 | 0 | if (try_skip("u")) { |
1862 | 0 | if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) { |
1863 | | // FIXME: While code point ranges are supported, code point matches as "Char" are not! |
1864 | 0 | return { CharClassRangeElement { .code_point = code_point.value(), .is_character_class = false } }; |
1865 | 0 | } |
1866 | 0 | return {}; |
1867 | 0 | } |
1868 | | |
1869 | | // IdentityEscape |
1870 | 0 | for (auto ch : identity_escape_characters(unicode, m_should_use_browser_extended_grammar)) { |
1871 | 0 | if (try_skip({ &ch, 1 })) |
1872 | 0 | return { CharClassRangeElement { .code_point = (u32)ch, .is_character_class = false } }; |
1873 | 0 | } |
1874 | | |
1875 | 0 | if (unicode) { |
1876 | 0 | if (try_skip("-")) |
1877 | 0 | return { CharClassRangeElement { .code_point = '-', .is_character_class = false } }; |
1878 | | |
1879 | 0 | PropertyEscape property {}; |
1880 | 0 | bool negated = false; |
1881 | 0 | if (parse_unicode_property_escape(property, negated)) { |
1882 | 0 | return property.visit( |
1883 | 0 | [&](Unicode::Property property) { |
1884 | 0 | return CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property = true }; |
1885 | 0 | }, |
1886 | 0 | [&](Unicode::GeneralCategory general_category) { |
1887 | 0 | return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true }; |
1888 | 0 | }, |
1889 | 0 | [&](Script script) { |
1890 | 0 | if (script.is_extension) |
1891 | 0 | return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script_extension = true }; |
1892 | | |
1893 | 0 | return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script = true }; |
1894 | 0 | }, |
1895 | 0 | [](Empty&) -> CharClassRangeElement { VERIFY_NOT_REACHED(); }); |
1896 | 0 | } |
1897 | 0 | } |
1898 | | |
1899 | 0 | if (try_skip("d")) |
1900 | 0 | return { CharClassRangeElement { .character_class = CharClass::Digit, .is_character_class = true } }; |
1901 | 0 | if (try_skip("s")) |
1902 | 0 | return { CharClassRangeElement { .character_class = CharClass::Space, .is_character_class = true } }; |
1903 | 0 | if (try_skip("w")) |
1904 | 0 | return { CharClassRangeElement { .character_class = CharClass::Word, .is_character_class = true } }; |
1905 | 0 | if (try_skip("D")) |
1906 | 0 | return { CharClassRangeElement { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } }; |
1907 | 0 | if (try_skip("S")) |
1908 | 0 | return { CharClassRangeElement { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } }; |
1909 | 0 | if (try_skip("W")) |
1910 | 0 | return { CharClassRangeElement { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } }; |
1911 | | |
1912 | 0 | if (!unicode) { |
1913 | | // Any unrecognised escape is allowed in non-unicode mode. |
1914 | 0 | return { CharClassRangeElement { .code_point = (u32)skip(), .is_character_class = false } }; |
1915 | 0 | } |
1916 | | |
1917 | 0 | set_error(Error::InvalidPattern); |
1918 | 0 | return {}; |
1919 | 0 | } |
1920 | | |
1921 | 0 | if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus)) |
1922 | 0 | return {}; |
1923 | | |
1924 | | // Allow any (other) SourceCharacter. |
1925 | 0 | return { CharClassRangeElement { .code_point = (u32)skip(), .is_character_class = false } }; |
1926 | 0 | }; |
1927 | 0 | auto read_class_atom = [&]() -> Optional<CharClassRangeElement> { |
1928 | 0 | if (match(TokenType::HyphenMinus)) { |
1929 | 0 | consume(); |
1930 | 0 | return { CharClassRangeElement { .code_point = '-', .is_character_class = false } }; |
1931 | 0 | } |
1932 | | |
1933 | 0 | return read_class_atom_no_dash(); |
1934 | 0 | }; |
1935 | |
|
1936 | 0 | auto empend_atom = [&](auto& atom) { |
1937 | 0 | if (atom.is_character_class) { |
1938 | 0 | if (atom.is_negated) |
1939 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 }); |
1940 | |
|
1941 | 0 | if (atom.is_property) |
1942 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) }); |
1943 | 0 | else if (atom.is_general_category) |
1944 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) }); |
1945 | 0 | else if (atom.is_script) |
1946 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) }); |
1947 | 0 | else if (atom.is_script_extension) |
1948 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)(atom.script) }); |
1949 | 0 | else |
1950 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class }); |
1951 | 0 | } else { |
1952 | 0 | VERIFY(!atom.is_negated); |
1953 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, atom.code_point }); |
1954 | 0 | } |
1955 | 0 | }; |
1956 | |
|
1957 | 0 | while (!match(TokenType::RightBracket)) { |
1958 | 0 | if (match(TokenType::Eof)) { |
1959 | 0 | set_error(Error::MismatchingBracket); |
1960 | 0 | return false; |
1961 | 0 | } |
1962 | | |
1963 | 0 | auto first_atom = read_class_atom(); |
1964 | 0 | if (!first_atom.has_value()) |
1965 | 0 | return false; |
1966 | | |
1967 | 0 | if (match(TokenType::HyphenMinus)) { |
1968 | 0 | consume(); |
1969 | 0 | if (match(TokenType::RightBracket)) { |
1970 | | // Allow '-' as the last element in a charclass, even after an atom. |
1971 | 0 | m_parser_state.lexer.back(2); // -] |
1972 | 0 | m_parser_state.current_token = m_parser_state.lexer.next(); |
1973 | 0 | goto read_as_single_atom; |
1974 | 0 | } |
1975 | 0 | auto second_atom = read_class_atom(); |
1976 | 0 | if (!second_atom.has_value()) |
1977 | 0 | return false; |
1978 | | |
1979 | 0 | if (first_atom.value().is_character_class || second_atom.value().is_character_class) { |
1980 | 0 | if (m_should_use_browser_extended_grammar) { |
1981 | 0 | if (unicode) { |
1982 | 0 | set_error(Error::InvalidRange); |
1983 | 0 | return false; |
1984 | 0 | } |
1985 | | |
1986 | | // CharacterRangeOrUnion > !Unicode > CharClass |
1987 | 0 | empend_atom(*first_atom); |
1988 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)'-' }); |
1989 | 0 | empend_atom(*second_atom); |
1990 | 0 | continue; |
1991 | 0 | } |
1992 | | |
1993 | 0 | set_error(Error::InvalidRange); |
1994 | 0 | return false; |
1995 | 0 | } |
1996 | | |
1997 | 0 | if (first_atom.value().code_point > second_atom.value().code_point) { |
1998 | 0 | set_error(Error::InvalidRange); |
1999 | 0 | return false; |
2000 | 0 | } |
2001 | | |
2002 | 0 | VERIFY(!first_atom.value().is_negated); |
2003 | 0 | VERIFY(!second_atom.value().is_negated); |
2004 | | |
2005 | 0 | ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point } }); |
2006 | 0 | continue; |
2007 | 0 | } |
2008 | | |
2009 | 0 | read_as_single_atom:; |
2010 | |
|
2011 | 0 | auto atom = first_atom.value(); |
2012 | 0 | empend_atom(atom); |
2013 | 0 | } |
2014 | | |
2015 | 0 | consume(TokenType::RightBracket, Error::MismatchingBracket); |
2016 | |
|
2017 | 0 | return true; |
2018 | 0 | } |
2019 | | |
2020 | | bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated) |
2021 | 0 | { |
2022 | 0 | negated = false; |
2023 | |
|
2024 | 0 | if (try_skip("p")) |
2025 | 0 | negated = false; |
2026 | 0 | else if (try_skip("P")) |
2027 | 0 | negated = true; |
2028 | 0 | else |
2029 | 0 | return false; |
2030 | | |
2031 | 0 | auto parsed_property = read_unicode_property_escape(); |
2032 | 0 | if (!parsed_property.has_value()) { |
2033 | 0 | set_error(Error::InvalidNameForProperty); |
2034 | 0 | return false; |
2035 | 0 | } |
2036 | | |
2037 | 0 | property = move(*parsed_property); |
2038 | |
|
2039 | 0 | return property.visit( |
2040 | 0 | [this](Unicode::Property property) { |
2041 | 0 | if (!Unicode::is_ecma262_property(property)) { |
2042 | 0 | set_error(Error::InvalidNameForProperty); |
2043 | 0 | return false; |
2044 | 0 | } |
2045 | 0 | return true; |
2046 | 0 | }, |
2047 | 0 | [](Unicode::GeneralCategory) { return true; }, |
2048 | 0 | [](Script) { return true; }, |
2049 | 0 | [](Empty&) -> bool { VERIFY_NOT_REACHED(); }); |
2050 | 0 | } |
2051 | | |
2052 | | FlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket) |
2053 | 0 | { |
2054 | 0 | static auto id_start_category = Unicode::property_from_string("ID_Start"sv); |
2055 | 0 | static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); |
2056 | 0 | static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD; |
2057 | 0 | constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C }; |
2058 | 0 | constexpr const u32 ZERO_WIDTH_JOINER { 0x200D }; |
2059 | |
|
2060 | 0 | if (take_starting_angle_bracket && !consume("<")) |
2061 | 0 | return {}; |
2062 | | |
2063 | 0 | StringBuilder builder; |
2064 | |
|
2065 | 0 | auto consume_code_point = [&] { |
2066 | 0 | Utf8View utf_8_view { m_parser_state.lexer.source().substring_view(m_parser_state.lexer.tell() - 1) }; |
2067 | 0 | if (utf_8_view.is_empty()) |
2068 | 0 | return REPLACEMENT_CHARACTER; |
2069 | 0 | u32 code_point = *utf_8_view.begin(); |
2070 | 0 | auto characters = utf_8_view.byte_offset_of(1); |
2071 | |
|
2072 | 0 | while (characters-- > 0) |
2073 | 0 | consume(); |
2074 | |
|
2075 | 0 | return code_point; |
2076 | 0 | }; |
2077 | |
|
2078 | 0 | { |
2079 | | // The first character is limited to: https://tc39.es/ecma262/#prod-RegExpIdentifierStart |
2080 | | // RegExpIdentifierStart[UnicodeMode] :: |
2081 | | // IdentifierStartChar |
2082 | | // \ RegExpUnicodeEscapeSequence[+UnicodeMode] |
2083 | | // [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate |
2084 | |
|
2085 | 0 | auto code_point = consume_code_point(); |
2086 | |
|
2087 | 0 | if (code_point == '\\' && match('u')) { |
2088 | 0 | consume(); |
2089 | |
|
2090 | 0 | if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) { |
2091 | 0 | code_point = *maybe_code_point; |
2092 | 0 | } else { |
2093 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2094 | 0 | return {}; |
2095 | 0 | } |
2096 | 0 | } |
2097 | | |
2098 | 0 | if (is_ascii(code_point)) { |
2099 | | // The only valid ID_Start unicode characters in ascii are the letters. |
2100 | 0 | if (!is_ascii_alpha(code_point) && code_point != '$' && code_point != '_') { |
2101 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2102 | 0 | return {}; |
2103 | 0 | } |
2104 | 0 | } else if (id_start_category.has_value() && !Unicode::code_point_has_property(code_point, *id_start_category)) { |
2105 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2106 | 0 | return {}; |
2107 | 0 | } |
2108 | 0 | builder.append_code_point(code_point); |
2109 | 0 | } |
2110 | | |
2111 | 0 | bool hit_end = false; |
2112 | | |
2113 | | // Any following characters are limited to: |
2114 | | // RegExpIdentifierPart[UnicodeMode] :: |
2115 | | // IdentifierPartChar |
2116 | | // \ RegExpUnicodeEscapeSequence[+UnicodeMode] |
2117 | | // [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate |
2118 | |
|
2119 | 0 | while (match(TokenType::Char) || match(TokenType::Dollar) || match(TokenType::LeftCurly) || match(TokenType::RightCurly)) { |
2120 | 0 | auto code_point = consume_code_point(); |
2121 | |
|
2122 | 0 | if (code_point == '>') { |
2123 | 0 | hit_end = true; |
2124 | 0 | break; |
2125 | 0 | } |
2126 | | |
2127 | 0 | if (code_point == '\\') { |
2128 | 0 | if (!try_skip("u")) { |
2129 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2130 | 0 | return {}; |
2131 | 0 | } |
2132 | 0 | if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) { |
2133 | 0 | code_point = *maybe_code_point; |
2134 | 0 | } else { |
2135 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2136 | 0 | return {}; |
2137 | 0 | } |
2138 | 0 | } |
2139 | | |
2140 | 0 | if (is_ascii(code_point)) { |
2141 | | // The only valid ID_Continue unicode characters in ascii are the letters and numbers. |
2142 | 0 | if (!is_ascii_alphanumeric(code_point) && code_point != '$' && code_point != '_') { |
2143 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2144 | 0 | return {}; |
2145 | 0 | } |
2146 | 0 | } else if (code_point != ZERO_WIDTH_JOINER && code_point != ZERO_WIDTH_NON_JOINER) { |
2147 | 0 | if (id_continue_category.has_value() && !Unicode::code_point_has_property(code_point, *id_continue_category)) { |
2148 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2149 | 0 | return {}; |
2150 | 0 | } |
2151 | 0 | } |
2152 | 0 | builder.append_code_point(code_point); |
2153 | 0 | } |
2154 | | |
2155 | 0 | FlyString name = builder.build(); |
2156 | 0 | if (!hit_end || name.is_empty()) |
2157 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2158 | |
|
2159 | 0 | return name; |
2160 | 0 | } |
2161 | | |
2162 | | Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_escape() |
2163 | 0 | { |
2164 | 0 | consume(TokenType::LeftCurly, Error::InvalidPattern); |
2165 | | |
2166 | | // Note: clang-format is disabled here because it doesn't handle templated lambdas yet. |
2167 | | // clang-format off |
2168 | 0 | auto read_until = [&]<typename... Ts>(Ts&&... terminators) { |
2169 | 0 | auto start_token = m_parser_state.current_token; |
2170 | 0 | size_t offset = 0; |
2171 | |
|
2172 | 0 | while (match(TokenType::Char)) { |
2173 | 0 | if (m_parser_state.current_token.value().is_one_of(forward<Ts>(terminators)...)) |
2174 | 0 | break; |
2175 | 0 | offset += consume().value().length(); |
2176 | 0 | } |
2177 | |
|
2178 | 0 | return StringView { start_token.value().characters_without_null_termination(), offset }; |
2179 | 0 | }; Unexecuted instantiation: RegexParser.cpp:auto regex::ECMA262Parser::read_unicode_property_escape()::$_10::operator()<AK::StringView, AK::StringView>(AK::StringView&&, AK::StringView&&) const Unexecuted instantiation: RegexParser.cpp:auto regex::ECMA262Parser::read_unicode_property_escape()::$_10::operator()<AK::StringView>(AK::StringView&&) const |
2180 | | // clang-format on |
2181 | |
|
2182 | 0 | StringView property_type; |
2183 | 0 | StringView property_name = read_until("="sv, "}"sv); |
2184 | |
|
2185 | 0 | if (try_skip("="sv)) { |
2186 | 0 | if (property_name.is_empty()) |
2187 | 0 | return {}; |
2188 | 0 | property_type = property_name; |
2189 | 0 | property_name = read_until("}"sv); |
2190 | 0 | } |
2191 | | |
2192 | 0 | consume(TokenType::RightCurly, Error::InvalidPattern); |
2193 | |
|
2194 | 0 | if (property_type.is_empty()) { |
2195 | 0 | if (auto property = Unicode::property_from_string(property_name); property.has_value()) |
2196 | 0 | return { *property }; |
2197 | 0 | if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value()) |
2198 | 0 | return { *general_category }; |
2199 | 0 | } else if ((property_type == "General_Category"sv) || (property_type == "gc"sv)) { |
2200 | 0 | if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value()) |
2201 | 0 | return { *general_category }; |
2202 | 0 | } else if ((property_type == "Script"sv) || (property_type == "sc"sv)) { |
2203 | 0 | if (auto script = Unicode::script_from_string(property_name); script.has_value()) |
2204 | 0 | return Script { *script, false }; |
2205 | 0 | } else if ((property_type == "Script_Extensions"sv) || (property_type == "scx"sv)) { |
2206 | 0 | if (auto script = Unicode::script_from_string(property_name); script.has_value()) |
2207 | 0 | return Script { *script, true }; |
2208 | 0 | } |
2209 | | |
2210 | 0 | return {}; |
2211 | 0 | } |
2212 | | |
2213 | | bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) |
2214 | 0 | { |
2215 | 0 | consume(TokenType::LeftParen, Error::InvalidPattern); |
2216 | |
|
2217 | 0 | auto register_capture_group_in_current_scope = [&](auto identifier) { |
2218 | 0 | m_capture_groups_in_scope.last().empend(identifier); |
2219 | 0 | }; |
2220 | |
|
2221 | 0 | if (match(TokenType::Questionmark)) { |
2222 | | // Non-capturing group or group with specifier. |
2223 | 0 | consume(); |
2224 | |
|
2225 | 0 | if (match(TokenType::Colon)) { |
2226 | 0 | consume(); |
2227 | 0 | ByteCode noncapture_group_bytecode; |
2228 | 0 | size_t length = 0; |
2229 | |
|
2230 | 0 | enter_capture_group_scope(); |
2231 | 0 | if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named)) |
2232 | 0 | return set_error(Error::InvalidPattern); |
2233 | 0 | clear_all_capture_groups_in_scope(stack); |
2234 | 0 | exit_capture_group_scope(); |
2235 | |
|
2236 | 0 | consume(TokenType::RightParen, Error::MismatchingParen); |
2237 | |
|
2238 | 0 | stack.extend(move(noncapture_group_bytecode)); |
2239 | 0 | match_length_minimum += length; |
2240 | 0 | return true; |
2241 | 0 | } |
2242 | | |
2243 | 0 | if (consume("<")) { |
2244 | 0 | ++m_parser_state.named_capture_groups_count; |
2245 | 0 | auto group_index = ++m_parser_state.capture_groups_count; // Named capture groups count as normal capture groups too. |
2246 | 0 | auto name = read_capture_group_specifier(); |
2247 | |
|
2248 | 0 | if (name.is_empty()) { |
2249 | 0 | set_error(Error::InvalidNameForCaptureGroup); |
2250 | 0 | return false; |
2251 | 0 | } |
2252 | | |
2253 | 0 | if (m_parser_state.named_capture_groups.contains(name)) { |
2254 | 0 | set_error(Error::DuplicateNamedCapture); |
2255 | 0 | return false; |
2256 | 0 | } |
2257 | | |
2258 | 0 | ByteCode capture_group_bytecode; |
2259 | 0 | size_t length = 0; |
2260 | 0 | enter_capture_group_scope(); |
2261 | 0 | if (!parse_disjunction(capture_group_bytecode, length, unicode, named)) |
2262 | 0 | return set_error(Error::InvalidPattern); |
2263 | 0 | clear_all_capture_groups_in_scope(stack); |
2264 | 0 | exit_capture_group_scope(); |
2265 | |
|
2266 | 0 | register_capture_group_in_current_scope(group_index); |
2267 | |
|
2268 | 0 | consume(TokenType::RightParen, Error::MismatchingParen); |
2269 | |
|
2270 | 0 | stack.insert_bytecode_group_capture_left(group_index); |
2271 | 0 | stack.extend(move(capture_group_bytecode)); |
2272 | 0 | stack.insert_bytecode_group_capture_right(group_index, name.view()); |
2273 | |
|
2274 | 0 | match_length_minimum += length; |
2275 | |
|
2276 | 0 | m_parser_state.capture_group_minimum_lengths.set(group_index, length); |
2277 | 0 | m_parser_state.named_capture_groups.set(name, { group_index, length }); |
2278 | 0 | return true; |
2279 | 0 | } |
2280 | | |
2281 | 0 | set_error(Error::InvalidCaptureGroup); |
2282 | 0 | return false; |
2283 | 0 | } |
2284 | | |
2285 | 0 | auto group_index = ++m_parser_state.capture_groups_count; |
2286 | 0 | enter_capture_group_scope(); |
2287 | |
|
2288 | 0 | ByteCode capture_group_bytecode; |
2289 | 0 | size_t length = 0; |
2290 | |
|
2291 | 0 | if (!parse_disjunction(capture_group_bytecode, length, unicode, named)) |
2292 | 0 | return set_error(Error::InvalidPattern); |
2293 | | |
2294 | 0 | clear_all_capture_groups_in_scope(stack); |
2295 | 0 | exit_capture_group_scope(); |
2296 | |
|
2297 | 0 | register_capture_group_in_current_scope(group_index); |
2298 | |
|
2299 | 0 | stack.insert_bytecode_group_capture_left(group_index); |
2300 | 0 | stack.extend(move(capture_group_bytecode)); |
2301 | |
|
2302 | 0 | m_parser_state.capture_group_minimum_lengths.set(group_index, length); |
2303 | |
|
2304 | 0 | consume(TokenType::RightParen, Error::MismatchingParen); |
2305 | |
|
2306 | 0 | stack.insert_bytecode_group_capture_right(group_index); |
2307 | |
|
2308 | 0 | match_length_minimum += length; |
2309 | |
|
2310 | 0 | return true; |
2311 | 0 | } |
2312 | | |
2313 | | size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis() |
2314 | 0 | { |
2315 | 0 | if (m_total_number_of_capturing_parenthesis.has_value()) |
2316 | 0 | return m_total_number_of_capturing_parenthesis.value(); |
2317 | | |
2318 | 0 | GenericLexer lexer { m_parser_state.lexer.source() }; |
2319 | 0 | size_t count = 0; |
2320 | 0 | while (!lexer.is_eof()) { |
2321 | 0 | switch (lexer.peek()) { |
2322 | 0 | case '\\': |
2323 | 0 | lexer.consume(2); |
2324 | 0 | continue; |
2325 | 0 | case '[': |
2326 | 0 | while (!lexer.is_eof()) { |
2327 | 0 | if (lexer.consume_specific('\\')) |
2328 | 0 | lexer.consume(); |
2329 | 0 | else if (lexer.consume_specific(']')) |
2330 | 0 | break; |
2331 | 0 | lexer.consume(); |
2332 | 0 | } |
2333 | 0 | break; |
2334 | 0 | case '(': |
2335 | 0 | lexer.consume(); |
2336 | 0 | if (lexer.consume_specific('?')) { |
2337 | | // non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<' |
2338 | 0 | if (!lexer.consume_specific('<')) |
2339 | 0 | break; |
2340 | | |
2341 | 0 | if (lexer.next_is(is_any_of("=!"))) |
2342 | 0 | break; |
2343 | | |
2344 | 0 | ++count; |
2345 | 0 | } else { |
2346 | 0 | ++count; |
2347 | 0 | } |
2348 | 0 | break; |
2349 | 0 | default: |
2350 | 0 | lexer.consume(); |
2351 | 0 | break; |
2352 | 0 | } |
2353 | 0 | } |
2354 | | |
2355 | 0 | m_total_number_of_capturing_parenthesis = count; |
2356 | 0 | return count; |
2357 | 0 | } |
2358 | | } |