Coverage Report

Created: 2022-05-20 06:17

/src/serenity/Userland/Libraries/LibRegex/RegexParser.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3
 * Copyright (c) 2020-2021, the SerenityOS developers.
4
 *
5
 * SPDX-License-Identifier: BSD-2-Clause
6
 */
7
8
#include "RegexParser.h"
9
#include "RegexDebug.h"
10
#include <AK/AnyOf.h>
11
#include <AK/CharacterTypes.h>
12
#include <AK/GenericLexer.h>
13
#include <AK/ScopeGuard.h>
14
#include <AK/String.h>
15
#include <AK/StringBuilder.h>
16
#include <AK/StringUtils.h>
17
#include <AK/TemporaryChange.h>
18
#include <AK/Utf16View.h>
19
#include <LibUnicode/CharacterTypes.h>
20
21
namespace regex {
22
23
static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
24
static constexpr u64 s_ecma262_maximum_repetition_count = (1ull << 53) - 1;
25
static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
26
static constexpr auto s_decimal_characters = "0123456789"sv;
27
28
static constexpr StringView identity_escape_characters(bool unicode, bool browser_extended)
29
0
{
30
0
    if (unicode)
31
0
        return "^$\\.*+?()[]{}|/"sv;
32
0
    if (browser_extended)
33
0
        return "^$\\.*+?()[|"sv;
34
0
    return "^$\\.*+?()[]{}|"sv;
35
0
}
36
37
ALWAYS_INLINE bool Parser::set_error(Error error)
38
20.4k
{
39
20.4k
    if (m_parser_state.error == Error::NoError) {
40
1.73k
        m_parser_state.error = error;
41
1.73k
        m_parser_state.error_token = m_parser_state.current_token;
42
1.73k
    }
43
20.4k
    return false; // always return false, that eases the API usage (return set_error(...)) :^)
44
20.4k
}
45
46
ALWAYS_INLINE bool Parser::done() const
47
54.4M
{
48
54.4M
    return match(TokenType::Eof);
49
54.4M
}
50
51
ALWAYS_INLINE bool Parser::match(TokenType type) const
52
263M
{
53
263M
    return m_parser_state.current_token.type() == type;
54
263M
}
55
56
ALWAYS_INLINE bool Parser::match(char ch) const
57
0
{
58
0
    return m_parser_state.current_token.type() == TokenType::Char && m_parser_state.current_token.value().length() == 1 && m_parser_state.current_token.value()[0] == ch;
59
0
}
60
61
ALWAYS_INLINE Token Parser::consume()
62
18.1M
{
63
18.1M
    auto old_token = m_parser_state.current_token;
64
18.1M
    m_parser_state.current_token = m_parser_state.lexer.next();
65
18.1M
    return old_token;
66
18.1M
}
67
68
ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
69
771k
{
70
771k
    if (m_parser_state.current_token.type() != type) {
71
13.5k
        set_error(error);
72
13.5k
        dbgln_if(REGEX_DEBUG, "[PARSER] Error: Unexpected token {}. Expected: {}", m_parser_state.current_token.name(), Token::name(type));
73
13.5k
    }
74
771k
    return consume();
75
771k
}
76
77
ALWAYS_INLINE bool Parser::consume(String const& str)
78
844k
{
79
844k
    size_t potentially_go_back { 1 };
80
1.31M
    for (auto ch : str) {
81
1.31M
        if (match(TokenType::Char)) {
82
1.31M
            if (m_parser_state.current_token.value()[0] != ch) {
83
748k
                m_parser_state.lexer.back(potentially_go_back);
84
748k
                m_parser_state.current_token = m_parser_state.lexer.next();
85
748k
                return false;
86
748k
            }
87
1.31M
        } else {
88
888
            m_parser_state.lexer.back(potentially_go_back);
89
888
            m_parser_state.current_token = m_parser_state.lexer.next();
90
888
            return false;
91
888
        }
92
563k
        consume(TokenType::Char, Error::NoError);
93
563k
        ++potentially_go_back;
94
563k
    }
95
94.2k
    return true;
96
844k
}
97
98
ALWAYS_INLINE Optional<u32> Parser::consume_escaped_code_point(bool unicode)
99
0
{
100
0
    if (match(TokenType::LeftCurly) && !unicode) {
101
        // In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u').
102
0
        return static_cast<u32>('u');
103
0
    }
104
105
0
    m_parser_state.lexer.retreat(2 + !done()); // Go back to just before '\u' (+1 char, because we will have consumed an extra character)
106
107
0
    if (auto code_point_or_error = m_parser_state.lexer.consume_escaped_code_point(unicode); !code_point_or_error.is_error()) {
108
0
        m_parser_state.current_token = m_parser_state.lexer.next();
109
0
        return code_point_or_error.value();
110
0
    }
111
112
0
    if (!unicode) {
113
        // '\u' is allowed in non-unicode mode, just matches 'u'.
114
0
        return static_cast<u32>('u');
115
0
    }
116
117
0
    set_error(Error::InvalidPattern);
118
0
    return {};
119
0
}
120
121
ALWAYS_INLINE bool Parser::try_skip(StringView str)
122
140M
{
123
140M
    if (str.starts_with(m_parser_state.current_token.value()))
124
252k
        str = str.substring_view(m_parser_state.current_token.value().length(), str.length() - m_parser_state.current_token.value().length());
125
140M
    else
126
140M
        return false;
127
128
252k
    size_t potentially_go_back { 0 };
129
252k
    for (auto ch : str) {
130
148k
        if (!m_parser_state.lexer.consume_specific(ch)) {
131
113k
            m_parser_state.lexer.back(potentially_go_back);
132
113k
            return false;
133
113k
        }
134
34.9k
        ++potentially_go_back;
135
34.9k
    }
136
137
138k
    m_parser_state.current_token = m_parser_state.lexer.next();
138
138k
    return true;
139
252k
}
140
141
ALWAYS_INLINE bool Parser::lookahead_any(StringView str)
142
0
{
143
0
    return AK::any_of(str, [this](auto ch) { return match(ch); });
144
0
}
145
146
ALWAYS_INLINE unsigned char Parser::skip()
147
33.8M
{
148
33.8M
    unsigned char ch;
149
33.8M
    if (m_parser_state.current_token.value().length() == 1) {
150
31.8M
        ch = m_parser_state.current_token.value()[0];
151
31.8M
    } else {
152
2.01M
        m_parser_state.lexer.back(m_parser_state.current_token.value().length());
153
2.01M
        ch = m_parser_state.lexer.consume();
154
2.01M
    }
155
156
33.8M
    m_parser_state.current_token = m_parser_state.lexer.next();
157
33.8M
    return ch;
158
33.8M
}
159
160
ALWAYS_INLINE void Parser::back(size_t count)
161
113k
{
162
113k
    m_parser_state.lexer.back(count);
163
113k
    m_parser_state.current_token = m_parser_state.lexer.next();
164
113k
}
165
166
ALWAYS_INLINE void Parser::reset()
167
4.20k
{
168
4.20k
    m_parser_state.bytecode.clear();
169
4.20k
    m_parser_state.lexer.reset();
170
4.20k
    m_parser_state.current_token = m_parser_state.lexer.next();
171
4.20k
    m_parser_state.error = Error::NoError;
172
4.20k
    m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
173
4.20k
    m_parser_state.capture_group_minimum_lengths.clear();
174
4.20k
    m_parser_state.capture_groups_count = 0;
175
4.20k
    m_parser_state.named_capture_groups_count = 0;
176
4.20k
    m_parser_state.named_capture_groups.clear();
177
4.20k
}
178
179
Parser::Result Parser::parse(Optional<AllOptions> regex_options)
180
4.20k
{
181
4.20k
    reset();
182
4.20k
    if (regex_options.has_value())
183
0
        m_parser_state.regex_options = regex_options.value();
184
4.20k
    if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
185
2.70k
        consume(TokenType::Eof, Error::InvalidPattern);
186
1.49k
    else
187
1.49k
        set_error(Error::InvalidPattern);
188
189
4.20k
    dbgln_if(REGEX_DEBUG, "[PARSER] Produced bytecode with {} entries (opcodes + arguments)", m_parser_state.bytecode.size());
190
4.20k
    return {
191
4.20k
        move(m_parser_state.bytecode),
192
4.20k
        move(m_parser_state.capture_groups_count),
193
4.20k
        move(m_parser_state.named_capture_groups_count),
194
4.20k
        move(m_parser_state.match_length_minimum),
195
4.20k
        move(m_parser_state.error),
196
4.20k
        move(m_parser_state.error_token),
197
4.20k
        m_parser_state.named_capture_groups.keys(),
198
4.20k
        m_parser_state.regex_options,
199
4.20k
    };
200
4.20k
}
201
202
ALWAYS_INLINE bool Parser::match_ordinary_characters()
203
0
{
204
    // NOTE: This method must not be called during bracket and repetition parsing!
205
    // FIXME: Add assertion for that?
206
0
    auto type = m_parser_state.current_token.type();
207
0
    return (type == TokenType::Char
208
0
        || type == TokenType::Comma
209
0
        || type == TokenType::Slash
210
0
        || type == TokenType::EqualSign
211
0
        || type == TokenType::HyphenMinus
212
0
        || type == TokenType::Colon);
213
0
}
214
215
// =============================
216
// Abstract Posix Parser
217
// =============================
218
219
ALWAYS_INLINE bool AbstractPosixParser::parse_bracket_expression(Vector<CompareTypeAndValuePair>& values, size_t& match_length_minimum)
220
8.53k
{
221
38.1M
    for (; !done();) {
222
38.1M
        if (match(TokenType::HyphenMinus)) {
223
3.50M
            consume();
224
225
3.50M
            if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
226
                // first in the bracket expression
227
1.93k
                values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
228
3.50M
            } else if (match(TokenType::RightBracket)) {
229
                // Last in the bracket expression
230
2.44k
                values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
231
3.50M
            } else if (values.last().type == CharacterCompareType::Char) {
232
3.50M
                values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
233
234
3.50M
                if (done())
235
16
                    return set_error(Error::MismatchingBracket);
236
237
3.50M
                if (match(TokenType::HyphenMinus)) {
238
40.8k
                    consume();
239
                    // Valid range, add ordinary character
240
40.8k
                    values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
241
40.8k
                }
242
3.50M
            } else {
243
16
                return set_error(Error::InvalidRange);
244
16
            }
245
246
34.6M
        } else if (match(TokenType::Circumflex)) {
247
603k
            auto t = consume();
248
249
603k
            if (values.is_empty())
250
3.03k
                values.append({ CharacterCompareType::Inverse, 0 });
251
600k
            else
252
600k
                values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
253
254
33.9M
        } else if (match(TokenType::LeftBracket)) {
255
98.7k
            consume();
256
257
98.7k
            if (match(TokenType::Period)) {
258
3.29k
                consume();
259
260
                // FIXME: Parse collating element, this is needed when we have locale support
261
                //        This could have impact on length parameter, I guess.
262
3.29k
                set_error(Error::InvalidCollationElement);
263
264
3.29k
                consume(TokenType::Period, Error::InvalidCollationElement);
265
3.29k
                consume(TokenType::RightBracket, Error::MismatchingBracket);
266
267
95.4k
            } else if (match(TokenType::EqualSign)) {
268
979
                consume();
269
                // FIXME: Parse collating element, this is needed when we have locale support
270
                //        This could have impact on length parameter, I guess.
271
979
                set_error(Error::InvalidCollationElement);
272
273
979
                consume(TokenType::EqualSign, Error::InvalidCollationElement);
274
979
                consume(TokenType::RightBracket, Error::MismatchingBracket);
275
276
94.4k
            } else if (match(TokenType::Colon)) {
277
94.4k
                consume();
278
279
94.4k
                CharClass ch_class;
280
                // parse character class
281
94.4k
                if (match(TokenType::Char)) {
282
94.4k
                    if (consume("alnum"))
283
230
                        ch_class = CharClass::Alnum;
284
94.2k
                    else if (consume("alpha"))
285
233
                        ch_class = CharClass::Alpha;
286
94.0k
                    else if (consume("blank"))
287
236
                        ch_class = CharClass::Blank;
288
93.7k
                    else if (consume("cntrl"))
289
303
                        ch_class = CharClass::Cntrl;
290
93.4k
                    else if (consume("digit"))
291
231
                        ch_class = CharClass::Digit;
292
93.2k
                    else if (consume("graph"))
293
248
                        ch_class = CharClass::Graph;
294
92.9k
                    else if (consume("lower"))
295
246
                        ch_class = CharClass::Lower;
296
92.7k
                    else if (consume("print"))
297
243
                        ch_class = CharClass::Print;
298
92.4k
                    else if (consume("punct"))
299
91.2k
                        ch_class = CharClass::Punct;
300
1.25k
                    else if (consume("space"))
301
231
                        ch_class = CharClass::Space;
302
1.02k
                    else if (consume("upper"))
303
513
                        ch_class = CharClass::Upper;
304
507
                    else if (consume("xdigit"))
305
320
                        ch_class = CharClass::Xdigit;
306
187
                    else
307
187
                        return set_error(Error::InvalidCharacterClass);
308
309
94.2k
                    values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
310
311
94.2k
                } else
312
9
                    return set_error(Error::InvalidCharacterClass);
313
314
                // FIXME: we do not support locale specific character classes until locales are implemented
315
316
94.2k
                consume(TokenType::Colon, Error::InvalidCharacterClass);
317
94.2k
                consume(TokenType::RightBracket, Error::MismatchingBracket);
318
94.2k
            } else {
319
23
                return set_error(Error::MismatchingBracket);
320
23
            }
321
322
33.9M
        } else if (match(TokenType::RightBracket)) {
323
324
8.46k
            if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
325
                // handle bracket as ordinary character
326
629
                values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
327
7.83k
            } else {
328
                // closing bracket expression
329
7.83k
                break;
330
7.83k
            }
331
33.8M
        } else {
332
33.8M
            values.append({ CharacterCompareType::Char, (ByteCodeValueType)skip() });
333
33.8M
        }
334
335
        // check if range expression has to be completed...
336
38.1M
        if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
337
3.50M
            if (values.last().type != CharacterCompareType::Char)
338
3
                return set_error(Error::InvalidRange);
339
340
3.50M
            auto value2 = values.take_last();
341
3.50M
            values.take_last(); // RangeExpressionDummy
342
3.50M
            auto value1 = values.take_last();
343
344
3.50M
            values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) });
345
3.50M
        }
346
38.1M
    }
347
348
8.28k
    if (!values.is_empty()) {
349
8.20k
        match_length_minimum = 1;
350
8.20k
        if (values.first().type == CharacterCompareType::Inverse)
351
3.02k
            match_length_minimum = 0;
352
8.20k
    }
353
354
8.28k
    return true;
355
8.53k
}
356
357
// =============================
358
// PosixBasic Parser
359
// =============================
360
361
bool PosixBasicParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
362
4.20k
{
363
4.20k
    return parse_root(stack, match_length_minimum);
364
4.20k
}
365
366
bool PosixBasicParser::parse_root(ByteCode& bytecode, size_t& match_length_minimum)
367
4.20k
{
368
    // basic_reg_exp : L_ANCHOR? RE_expression R_ANCHOR?
369
4.20k
    if (match(TokenType::Circumflex)) {
370
21
        consume();
371
21
        bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
372
21
    }
373
374
4.20k
    if (!parse_re_expression(bytecode, match_length_minimum))
375
1.49k
        return false;
376
377
2.70k
    if (match(TokenType::Dollar)) {
378
0
        consume();
379
0
        bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
380
0
    }
381
382
2.70k
    return !has_error();
383
4.20k
}
384
385
bool PosixBasicParser::parse_re_expression(ByteCode& bytecode, size_t& match_length_minimum)
386
40.1k
{
387
    // RE_expression : RE_expression? simple_RE
388
12.8M
    while (!done()) {
389
12.8M
        if (!parse_simple_re(bytecode, match_length_minimum))
390
37.4k
            break;
391
12.8M
    }
392
393
40.1k
    return !has_error();
394
40.1k
}
395
396
bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_minimum)
397
12.8M
{
398
    // simple_RE : nondupl_RE RE_dupl_symbol?
399
12.8M
    ByteCode simple_re_bytecode;
400
12.8M
    size_t re_match_length_minimum = 0;
401
12.8M
    if (!parse_nonduplicating_re(simple_re_bytecode, re_match_length_minimum))
402
36.9k
        return false;
403
404
    // RE_dupl_symbol : '*' | Back_open_brace DUP_COUNT (',' DUP_COUNT?)? Back_close_brace
405
12.7M
    if (match(TokenType::Asterisk)) {
406
30.0k
        consume();
407
30.0k
        ByteCode::transform_bytecode_repetition_any(simple_re_bytecode, true);
408
12.7M
    } else if (try_skip("\\{")) {
409
62.8k
        auto read_number = [&]() -> Optional<size_t> {
410
62.8k
            if (!match(TokenType::Char))
411
34
                return {};
412
62.8k
            size_t value = 0;
413
137k
            while (match(TokenType::Char)) {
414
107k
                auto c = m_parser_state.current_token.value().substring_view(0, 1);
415
107k
                auto c_value = c.to_uint();
416
107k
                if (!c_value.has_value())
417
33.5k
                    break;
418
74.2k
                value *= 10;
419
74.2k
                value += *c_value;
420
74.2k
                consume();
421
74.2k
            }
422
62.8k
            return value;
423
62.8k
        };
424
425
33.6k
        size_t min_limit;
426
33.6k
        Optional<size_t> max_limit;
427
428
33.6k
        if (auto limit = read_number(); !limit.has_value())
429
31
            return set_error(Error::InvalidRepetitionMarker);
430
33.5k
        else
431
33.5k
            min_limit = *limit;
432
433
33.5k
        if (match(TokenType::Comma)) {
434
29.2k
            consume();
435
29.2k
            max_limit = read_number();
436
29.2k
        }
437
438
33.5k
        if (!try_skip("\\}"))
439
122
            return set_error(Error::MismatchingBrace);
440
441
33.4k
        if (max_limit.value_or(min_limit) < min_limit)
442
34
            return set_error(Error::InvalidBraceContent);
443
444
33.4k
        if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count))
445
303
            return set_error(Error::InvalidBraceContent);
446
447
33.1k
        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
448
33.1k
        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
449
33.1k
        ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, min_repetition_mark_id, max_repetition_mark_id, true);
450
33.1k
        match_length_minimum += re_match_length_minimum * min_limit;
451
12.7M
    } else {
452
12.7M
        match_length_minimum += re_match_length_minimum;
453
12.7M
    }
454
455
12.7M
    bytecode.extend(move(simple_re_bytecode));
456
12.7M
    return true;
457
12.7M
}
458
459
bool PosixBasicParser::parse_nonduplicating_re(ByteCode& bytecode, size_t& match_length_minimum)
460
12.8M
{
461
    // nondupl_RE : one_char_or_coll_elem_RE | Back_open_paren RE_expression Back_close_paren | BACKREF
462
12.8M
    if (try_skip("\\(")) {
463
35.9k
        TemporaryChange change { m_current_capture_group_depth, m_current_capture_group_depth + 1 };
464
        // Max number of addressable capture groups is 10, let's just be lenient
465
        // and accept 20; anything past that is probably a silly pattern anyway.
466
35.9k
        if (m_current_capture_group_depth > 20)
467
6
            return set_error(Error::InvalidPattern);
468
35.9k
        ByteCode capture_bytecode;
469
35.9k
        size_t capture_length_minimum = 0;
470
35.9k
        auto capture_group_index = ++m_parser_state.capture_groups_count;
471
472
35.9k
        if (!parse_re_expression(capture_bytecode, capture_length_minimum))
473
1.46k
            return false;
474
475
34.4k
        if (!try_skip("\\)"))
476
235
            return set_error(Error::MismatchingParen);
477
478
34.2k
        match_length_minimum += capture_length_minimum;
479
34.2k
        if (capture_group_index <= number_of_addressable_capture_groups) {
480
7.45k
            m_capture_group_minimum_lengths[capture_group_index - 1] = capture_length_minimum;
481
7.45k
            m_capture_group_seen[capture_group_index - 1] = true;
482
7.45k
            bytecode.insert_bytecode_group_capture_left(capture_group_index);
483
7.45k
        }
484
485
34.2k
        bytecode.extend(capture_bytecode);
486
487
34.2k
        if (capture_group_index <= number_of_addressable_capture_groups)
488
7.45k
            bytecode.insert_bytecode_group_capture_right(capture_group_index);
489
34.2k
        return true;
490
34.4k
    }
491
492
127M
    for (size_t i = 1; i < 10; ++i) {
493
115M
        char backref_name[2] { '\\', '0' };
494
115M
        backref_name[1] += i;
495
115M
        if (try_skip({ backref_name, 2 })) {
496
1.47k
            if (!m_capture_group_seen[i - 1])
497
39
                return set_error(Error::InvalidNumber);
498
1.43k
            match_length_minimum += m_capture_group_minimum_lengths[i - 1];
499
1.43k
            bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)i } });
500
1.43k
            return true;
501
1.47k
        }
502
115M
    }
503
504
12.7M
    return parse_one_char_or_collation_element(bytecode, match_length_minimum);
505
12.7M
}
506
507
bool PosixBasicParser::parse_one_char_or_collation_element(ByteCode& bytecode, size_t& match_length_minimum)
508
12.7M
{
509
    // one_char_or_coll_elem_RE : ORD_CHAR | QUOTED_CHAR | '.' | bracket_expression
510
12.7M
    if (match(TokenType::Period)) {
511
3.26k
        consume();
512
3.26k
        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
513
3.26k
        match_length_minimum += 1;
514
3.26k
        return true;
515
3.26k
    }
516
517
    // Dollars are special if at the end of a pattern.
518
12.7M
    if (match(TokenType::Dollar)) {
519
113k
        consume();
520
521
        // If we are at the end of a pattern, emit an end check instruction.
522
113k
        if (match(TokenType::Eof)) {
523
48
            bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
524
48
            return true;
525
48
        }
526
527
        // We are not at the end of the string, so we should roll back and continue as normal.
528
113k
        back(2);
529
113k
    }
530
531
    // None of these are special in BRE.
532
12.7M
    if (match(TokenType::Char) || match(TokenType::Questionmark) || match(TokenType::RightParen) || match(TokenType::HyphenMinus)
533
12.7M
        || match(TokenType::Circumflex) || match(TokenType::RightCurly) || match(TokenType::Comma) || match(TokenType::Colon)
534
12.7M
        || match(TokenType::Dollar) || match(TokenType::EqualSign) || match(TokenType::LeftCurly) || match(TokenType::LeftParen)
535
12.7M
        || match(TokenType::Pipe) || match(TokenType::Slash) || match(TokenType::RightBracket) || match(TokenType::RightParen)) {
536
537
12.7M
        auto ch = consume().value()[0];
538
12.7M
        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
539
12.7M
        match_length_minimum += 1;
540
12.7M
        return true;
541
12.7M
    }
542
543
47.7k
    if (match(TokenType::EscapeSequence)) {
544
39.1k
        if (m_parser_state.current_token.value().is_one_of("\\)"sv, "\\}"sv, "\\("sv, "\\{"sv))
545
34.4k
            return false;
546
4.69k
        auto ch = consume().value()[1];
547
4.69k
        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
548
4.69k
        match_length_minimum += 1;
549
4.69k
        return true;
550
39.1k
    }
551
552
8.56k
    Vector<CompareTypeAndValuePair> values;
553
8.56k
    size_t bracket_minimum_length = 0;
554
555
8.56k
    if (match(TokenType::LeftBracket)) {
556
8.53k
        consume();
557
8.53k
        if (!AbstractPosixParser::parse_bracket_expression(values, bracket_minimum_length))
558
254
            return false;
559
560
8.28k
        consume(TokenType::RightBracket, Error::MismatchingBracket);
561
562
8.28k
        if (!has_error())
563
7.83k
            bytecode.insert_bytecode_compare_values(move(values));
564
8.28k
        match_length_minimum += bracket_minimum_length;
565
8.28k
        return !has_error();
566
8.53k
    }
567
568
25
    return set_error(Error::InvalidPattern);
569
8.56k
}
570
571
// =============================
572
// PosixExtended Parser
573
// =============================
574
575
bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
576
0
{
577
0
    return parse_root(stack, match_length_minimum);
578
0
}
579
580
ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
581
0
{
582
0
    auto type = m_parser_state.current_token.type();
583
0
    return (type == TokenType::Asterisk
584
0
        || type == TokenType::Plus
585
0
        || type == TokenType::Questionmark
586
0
        || type == TokenType::LeftCurly);
587
0
}
588
589
ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
590
0
{
591
0
    if (match(TokenType::LeftCurly)) {
592
0
        consume();
593
594
0
        StringBuilder number_builder;
595
596
0
        while (match(TokenType::Char)) {
597
0
            number_builder.append(consume().value());
598
0
        }
599
600
0
        auto maybe_minimum = number_builder.build().to_uint();
601
0
        if (!maybe_minimum.has_value())
602
0
            return set_error(Error::InvalidBraceContent);
603
604
0
        auto minimum = maybe_minimum.value();
605
0
        match_length_minimum *= minimum;
606
607
0
        if (minimum > s_maximum_repetition_count)
608
0
            return set_error(Error::InvalidBraceContent);
609
610
0
        if (match(TokenType::Comma)) {
611
0
            consume();
612
0
        } else {
613
0
            auto repetition_mark_id = m_parser_state.repetition_mark_count++;
614
615
0
            ByteCode bytecode;
616
0
            bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id);
617
0
            bytecode_to_repeat = move(bytecode);
618
619
0
            consume(TokenType::RightCurly, Error::MismatchingBrace);
620
0
            return !has_error();
621
0
        }
622
623
0
        Optional<u32> maybe_maximum {};
624
0
        number_builder.clear();
625
0
        while (match(TokenType::Char)) {
626
0
            number_builder.append(consume().value());
627
0
        }
628
0
        if (!number_builder.is_empty()) {
629
0
            auto value = number_builder.build().to_uint();
630
0
            if (!value.has_value() || minimum > value.value() || *value > s_maximum_repetition_count)
631
0
                return set_error(Error::InvalidBraceContent);
632
633
0
            maybe_maximum = value.value();
634
0
        }
635
636
0
        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
637
0
        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
638
0
        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, min_repetition_mark_id, max_repetition_mark_id);
639
640
0
        consume(TokenType::RightCurly, Error::MismatchingBrace);
641
0
        return !has_error();
642
0
    }
643
0
    if (match(TokenType::Plus)) {
644
0
        consume();
645
646
0
        bool nongreedy = match(TokenType::Questionmark);
647
0
        if (nongreedy)
648
0
            consume();
649
650
        // Note: don't touch match_length_minimum, it's already correct
651
0
        ByteCode::transform_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy);
652
0
        return !has_error();
653
0
    }
654
0
    if (match(TokenType::Asterisk)) {
655
0
        consume();
656
0
        match_length_minimum = 0;
657
658
0
        bool nongreedy = match(TokenType::Questionmark);
659
0
        if (nongreedy)
660
0
            consume();
661
662
0
        ByteCode::transform_bytecode_repetition_any(bytecode_to_repeat, !nongreedy);
663
664
0
        return !has_error();
665
0
    }
666
0
    if (match(TokenType::Questionmark)) {
667
0
        consume();
668
0
        match_length_minimum = 0;
669
670
0
        bool nongreedy = match(TokenType::Questionmark);
671
0
        if (nongreedy)
672
0
            consume();
673
674
0
        ByteCode::transform_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy);
675
0
        return !has_error();
676
0
    }
677
678
0
    return false;
679
0
}
680
681
ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
682
0
{
683
0
    Vector<CompareTypeAndValuePair> values;
684
0
    if (!AbstractPosixParser::parse_bracket_expression(values, match_length_minimum))
685
0
        return false;
686
687
0
    if (!has_error())
688
0
        stack.insert_bytecode_compare_values(move(values));
689
690
0
    return !has_error();
691
0
}
692
693
ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
694
0
{
695
0
    ByteCode bytecode;
696
0
    size_t length = 0;
697
0
    bool should_parse_repetition_symbol { false };
698
699
0
    for (;;) {
700
0
        if (match_ordinary_characters()) {
701
0
            Token start_token = m_parser_state.current_token;
702
0
            Token last_token = m_parser_state.current_token;
703
0
            for (;;) {
704
0
                if (!match_ordinary_characters())
705
0
                    break;
706
0
                ++length;
707
0
                last_token = consume();
708
0
            }
709
710
0
            if (length > 1) {
711
                // last character is inserted into 'bytecode' for duplication symbol handling
712
0
                auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
713
0
                stack.insert_bytecode_compare_string({ start_token.value().characters_without_null_termination(), new_length });
714
0
            }
715
716
0
            if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
717
0
                bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
718
719
0
            should_parse_repetition_symbol = true;
720
0
            break;
721
0
        }
722
723
0
        if (match_repetition_symbol())
724
0
            return set_error(Error::InvalidRepetitionMarker);
725
726
0
        if (match(TokenType::Period)) {
727
0
            length = 1;
728
0
            consume();
729
0
            bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
730
0
            should_parse_repetition_symbol = true;
731
0
            break;
732
0
        }
733
734
0
        if (match(TokenType::EscapeSequence)) {
735
0
            length = 1;
736
0
            Token t = consume();
737
0
            dbgln_if(REGEX_DEBUG, "[PARSER] EscapeSequence with substring {}", t.value());
738
739
0
            bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
740
0
            should_parse_repetition_symbol = true;
741
0
            break;
742
0
        }
743
744
0
        if (match(TokenType::LeftBracket)) {
745
0
            consume();
746
747
0
            ByteCode sub_ops;
748
0
            if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
749
0
                return set_error(Error::InvalidBracketContent);
750
751
0
            bytecode.extend(move(sub_ops));
752
753
0
            consume(TokenType::RightBracket, Error::MismatchingBracket);
754
0
            should_parse_repetition_symbol = true;
755
0
            break;
756
0
        }
757
758
0
        if (match(TokenType::RightBracket)) {
759
0
            return set_error(Error::MismatchingBracket);
760
0
        }
761
762
0
        if (match(TokenType::RightCurly)) {
763
0
            return set_error(Error::MismatchingBrace);
764
0
        }
765
766
0
        if (match(TokenType::Circumflex)) {
767
0
            consume();
768
0
            bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
769
0
            break;
770
0
        }
771
772
0
        if (match(TokenType::Dollar)) {
773
0
            consume();
774
0
            bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
775
0
            break;
776
0
        }
777
778
0
        if (match(TokenType::RightParen))
779
0
            return false;
780
781
0
        if (match(TokenType::LeftParen)) {
782
0
            enum GroupMode {
783
0
                Normal,
784
0
                Lookahead,
785
0
                NegativeLookahead,
786
0
                Lookbehind,
787
0
                NegativeLookbehind,
788
0
            } group_mode { Normal };
789
0
            consume();
790
0
            Optional<StringView> capture_group_name;
791
0
            bool prevent_capture_group = false;
792
0
            if (match(TokenType::Questionmark)) {
793
0
                consume();
794
795
0
                if (match(TokenType::Colon)) {
796
0
                    consume();
797
0
                    prevent_capture_group = true;
798
0
                } else if (consume("<")) { // named capturing group
799
800
0
                    Token start_token = m_parser_state.current_token;
801
0
                    Token last_token = m_parser_state.current_token;
802
0
                    size_t capture_group_name_length = 0;
803
0
                    for (;;) {
804
0
                        if (!match_ordinary_characters())
805
0
                            return set_error(Error::InvalidNameForCaptureGroup);
806
0
                        if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
807
0
                            consume();
808
0
                            break;
809
0
                        }
810
0
                        ++capture_group_name_length;
811
0
                        last_token = consume();
812
0
                    }
813
0
                    capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
814
0
                    ++m_parser_state.named_capture_groups_count;
815
816
0
                } else if (match(TokenType::EqualSign)) { // positive lookahead
817
0
                    consume();
818
0
                    group_mode = Lookahead;
819
0
                } else if (consume("!")) { // negative lookahead
820
0
                    group_mode = NegativeLookahead;
821
0
                } else if (consume("<")) {
822
0
                    if (match(TokenType::EqualSign)) { // positive lookbehind
823
0
                        consume();
824
0
                        group_mode = Lookbehind;
825
0
                    }
826
0
                    if (consume("!")) // negative lookbehind
827
0
                        group_mode = NegativeLookbehind;
828
0
                } else {
829
0
                    return set_error(Error::InvalidRepetitionMarker);
830
0
                }
831
0
            }
832
833
0
            auto current_capture_group = m_parser_state.capture_groups_count;
834
0
            if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
835
0
                bytecode.insert_bytecode_group_capture_left(current_capture_group);
836
0
                m_parser_state.capture_groups_count++;
837
0
            }
838
839
0
            ByteCode capture_group_bytecode;
840
841
0
            if (!parse_root(capture_group_bytecode, length))
842
0
                return set_error(Error::InvalidPattern);
843
844
0
            switch (group_mode) {
845
0
            case Normal:
846
0
                bytecode.extend(move(capture_group_bytecode));
847
0
                break;
848
0
            case Lookahead:
849
0
                bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::LookAhead, length);
850
0
                break;
851
0
            case NegativeLookahead:
852
0
                bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::NegatedLookAhead, length);
853
0
                break;
854
0
            case Lookbehind:
855
0
                bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::LookBehind, length);
856
0
                break;
857
0
            case NegativeLookbehind:
858
0
                bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::NegatedLookBehind, length);
859
0
                break;
860
0
            }
861
862
0
            consume(TokenType::RightParen, Error::MismatchingParen);
863
864
0
            if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
865
0
                if (capture_group_name.has_value())
866
0
                    bytecode.insert_bytecode_group_capture_right(current_capture_group, capture_group_name.value());
867
0
                else
868
0
                    bytecode.insert_bytecode_group_capture_right(current_capture_group);
869
0
            }
870
0
            should_parse_repetition_symbol = true;
871
0
            break;
872
0
        }
873
874
0
        return false;
875
0
    }
876
877
0
    if (match_repetition_symbol()) {
878
0
        if (should_parse_repetition_symbol)
879
0
            parse_repetition_symbol(bytecode, length);
880
0
        else
881
0
            return set_error(Error::InvalidRepetitionMarker);
882
0
    }
883
884
0
    stack.extend(move(bytecode));
885
0
    match_length_minimum += length;
886
887
0
    return true;
888
0
}
889
890
bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
891
0
{
892
0
    ByteCode bytecode_left;
893
0
    size_t match_length_minimum_left { 0 };
894
895
0
    if (match_repetition_symbol())
896
0
        return set_error(Error::InvalidRepetitionMarker);
897
898
0
    for (;;) {
899
0
        if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
900
0
            break;
901
902
0
        if (match(TokenType::Pipe)) {
903
0
            consume();
904
905
0
            ByteCode bytecode_right;
906
0
            size_t match_length_minimum_right { 0 };
907
908
0
            if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
909
0
                return set_error(Error::InvalidPattern);
910
911
0
            ByteCode new_bytecode;
912
0
            new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
913
0
            bytecode_left = move(new_bytecode);
914
0
            match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
915
0
        }
916
0
    }
917
918
0
    if (bytecode_left.is_empty())
919
0
        set_error(Error::EmptySubExpression);
920
921
0
    stack.extend(move(bytecode_left));
922
0
    match_length_minimum = match_length_minimum_left;
923
0
    return !has_error();
924
0
}
925
926
// =============================
927
// ECMA262 Parser
928
// =============================
929
930
bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
931
0
{
932
0
    if (m_parser_state.regex_options.has_flag_set(AllFlags::Unicode)) {
933
0
        return parse_pattern(stack, match_length_minimum, true, true);
934
0
    }
935
936
0
    ByteCode new_stack;
937
0
    size_t new_match_length = 0;
938
0
    auto res = parse_pattern(new_stack, new_match_length, false, false);
939
0
    if (m_parser_state.named_capture_groups_count > 0) {
940
0
        reset();
941
0
        return parse_pattern(stack, match_length_minimum, false, true);
942
0
    }
943
944
0
    if (!res)
945
0
        return false;
946
947
0
    stack.extend(new_stack);
948
0
    match_length_minimum = new_match_length;
949
0
    return res;
950
0
}
951
952
bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
953
0
{
954
0
    return parse_disjunction(stack, match_length_minimum, unicode, named);
955
0
}
956
957
bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
958
0
{
959
0
    size_t total_match_length_minimum = NumericLimits<size_t>::max();
960
0
    Vector<ByteCode> alternatives;
961
0
    while (true) {
962
0
        ByteCode alternative_stack;
963
0
        size_t alternative_minimum_length = 0;
964
0
        auto alt_ok = parse_alternative(alternative_stack, alternative_minimum_length, unicode, named);
965
0
        if (!alt_ok)
966
0
            return false;
967
968
0
        alternatives.append(move(alternative_stack));
969
0
        total_match_length_minimum = min(alternative_minimum_length, total_match_length_minimum);
970
971
0
        if (!match(TokenType::Pipe))
972
0
            break;
973
0
        consume();
974
0
    }
975
976
0
    Optimizer::append_alternation(stack, alternatives.span());
977
0
    match_length_minimum = total_match_length_minimum;
978
0
    return true;
979
0
}
980
981
bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
982
0
{
983
0
    for (;;) {
984
0
        if (match(TokenType::Eof))
985
0
            return true;
986
987
0
        if (parse_term(stack, match_length_minimum, unicode, named))
988
0
            continue;
989
990
0
        return !has_error();
991
0
    }
992
0
}
993
994
bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
995
0
{
996
0
    if (parse_assertion(stack, match_length_minimum, unicode, named))
997
0
        return true;
998
999
0
    ByteCode atom_stack;
1000
0
    size_t minimum_atom_length = 0;
1001
0
    auto parse_with_quantifier = [&] {
1002
0
        bool did_parse_one = false;
1003
0
        if (m_should_use_browser_extended_grammar)
1004
0
            did_parse_one = parse_extended_atom(atom_stack, minimum_atom_length, named);
1005
1006
0
        if (!did_parse_one)
1007
0
            did_parse_one = parse_atom(atom_stack, minimum_atom_length, unicode, named);
1008
1009
0
        if (!did_parse_one)
1010
0
            return false;
1011
1012
0
        VERIFY(did_parse_one);
1013
0
        return parse_quantifier(atom_stack, minimum_atom_length, unicode, named);
1014
0
    };
1015
1016
0
    if (!parse_with_quantifier())
1017
0
        return false;
1018
1019
0
    stack.extend(move(atom_stack));
1020
0
    match_length_minimum += minimum_atom_length;
1021
0
    return true;
1022
0
}
1023
1024
bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
1025
0
{
1026
0
    if (match(TokenType::Circumflex)) {
1027
0
        consume();
1028
0
        stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
1029
0
        return true;
1030
0
    }
1031
1032
0
    if (match(TokenType::Dollar)) {
1033
0
        consume();
1034
0
        stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
1035
0
        return true;
1036
0
    }
1037
1038
0
    if (try_skip("\\b")) {
1039
0
        stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
1040
0
        return true;
1041
0
    }
1042
1043
0
    if (try_skip("\\B")) {
1044
0
        stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
1045
0
        return true;
1046
0
    }
1047
1048
0
    if (match(TokenType::LeftParen)) {
1049
0
        if (!try_skip("(?"))
1050
0
            return false;
1051
1052
0
        if (done()) {
1053
0
            set_error(Error::InvalidCaptureGroup);
1054
0
            return false;
1055
0
        }
1056
1057
0
        ByteCode assertion_stack;
1058
0
        size_t length_dummy = 0;
1059
1060
0
        bool should_parse_forward_assertion = m_should_use_browser_extended_grammar ? unicode : true;
1061
0
        if (should_parse_forward_assertion && try_skip("=")) {
1062
0
            if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
1063
0
                return false;
1064
0
            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
1065
0
            return true;
1066
0
        }
1067
0
        if (should_parse_forward_assertion && try_skip("!")) {
1068
0
            enter_capture_group_scope();
1069
0
            ScopeGuard quit_scope {
1070
0
                [this] {
1071
0
                    exit_capture_group_scope();
1072
0
                }
1073
0
            };
1074
0
            if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
1075
0
                return false;
1076
0
            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
1077
0
            clear_all_capture_groups_in_scope(stack);
1078
1079
0
            return true;
1080
0
        }
1081
0
        if (m_should_use_browser_extended_grammar) {
1082
0
            if (!unicode) {
1083
0
                if (parse_quantifiable_assertion(assertion_stack, match_length_minimum, named)) {
1084
0
                    if (!parse_quantifier(assertion_stack, match_length_minimum, unicode, named))
1085
0
                        return false;
1086
1087
0
                    stack.extend(move(assertion_stack));
1088
0
                    return true;
1089
0
                }
1090
0
            }
1091
0
        }
1092
0
        if (try_skip("<=")) {
1093
0
            if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
1094
0
                return false;
1095
            // FIXME: Somehow ensure that this assertion regexp has a fixed length.
1096
0
            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy);
1097
0
            return true;
1098
0
        }
1099
0
        if (try_skip("<!")) {
1100
0
            enter_capture_group_scope();
1101
0
            ScopeGuard quit_scope {
1102
0
                [this] {
1103
0
                    exit_capture_group_scope();
1104
0
                }
1105
0
            };
1106
0
            if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
1107
0
                return false;
1108
0
            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
1109
0
            clear_all_capture_groups_in_scope(stack);
1110
0
            return true;
1111
0
        }
1112
1113
        // If none of these matched, put the '(?' back.
1114
0
        m_parser_state.lexer.back(3);
1115
0
        m_parser_state.current_token = m_parser_state.lexer.next();
1116
0
        return false;
1117
0
    }
1118
1119
0
    return false;
1120
0
}
1121
1122
bool ECMA262Parser::parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named)
1123
0
{
1124
0
    auto disjunction_ok = parse_disjunction(bytecode_stack, length, unicode, named);
1125
0
    if (!disjunction_ok)
1126
0
        return false;
1127
0
    consume(TokenType::RightParen, Error::MismatchingParen);
1128
0
    return true;
1129
0
}
1130
1131
bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool named)
1132
0
{
1133
0
    VERIFY(m_should_use_browser_extended_grammar);
1134
0
    ByteCode assertion_stack;
1135
0
    size_t match_length_minimum = 0;
1136
1137
0
    if (try_skip("=")) {
1138
0
        if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
1139
0
            return false;
1140
1141
0
        stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
1142
0
        return true;
1143
0
    }
1144
0
    if (try_skip("!")) {
1145
0
        enter_capture_group_scope();
1146
0
        ScopeGuard quit_scope {
1147
0
            [this] {
1148
0
                exit_capture_group_scope();
1149
0
            }
1150
0
        };
1151
0
        if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
1152
0
            return false;
1153
1154
0
        stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
1155
0
        clear_all_capture_groups_in_scope(stack);
1156
0
        return true;
1157
0
    }
1158
1159
0
    return false;
1160
0
}
1161
1162
StringView ECMA262Parser::read_digits_as_string(ReadDigitsInitialZeroState initial_zero, bool hex, int max_count, int min_count)
1163
0
{
1164
0
    if (!match(TokenType::Char))
1165
0
        return {};
1166
1167
0
    if (initial_zero == ReadDigitsInitialZeroState::Disallow && m_parser_state.current_token.value() == "0")
1168
0
        return {};
1169
1170
0
    int count = 0;
1171
0
    size_t offset = 0;
1172
0
    auto start_token = m_parser_state.current_token;
1173
0
    while (match(TokenType::Char)) {
1174
0
        auto const c = m_parser_state.current_token.value();
1175
1176
0
        if (max_count > 0 && count >= max_count)
1177
0
            break;
1178
1179
0
        if (hex && !AK::StringUtils::convert_to_uint_from_hex(c).has_value())
1180
0
            break;
1181
0
        if (!hex && !c.to_uint().has_value())
1182
0
            break;
1183
1184
0
        offset += consume().value().length();
1185
0
        ++count;
1186
0
    }
1187
1188
0
    if (count < min_count)
1189
0
        return {};
1190
1191
0
    return StringView { start_token.value().characters_without_null_termination(), offset };
1192
0
}
1193
1194
Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, bool hex, int max_count, int min_count)
1195
0
{
1196
0
    auto str = read_digits_as_string(initial_zero, hex, max_count, min_count);
1197
0
    if (str.is_empty())
1198
0
        return {};
1199
0
    if (hex)
1200
0
        return AK::StringUtils::convert_to_uint_from_hex(str);
1201
0
    return str.to_uint();
1202
0
}
1203
1204
bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
1205
0
{
1206
0
    enum class Repetition {
1207
0
        OneOrMore,
1208
0
        ZeroOrMore,
1209
0
        Optional,
1210
0
        Explicit,
1211
0
        None,
1212
0
    } repetition_mark { Repetition::None };
1213
1214
0
    bool ungreedy = false;
1215
0
    Optional<u64> repeat_min, repeat_max;
1216
1217
0
    if (match(TokenType::Asterisk)) {
1218
0
        consume();
1219
0
        repetition_mark = Repetition::ZeroOrMore;
1220
0
    } else if (match(TokenType::Plus)) {
1221
0
        consume();
1222
0
        repetition_mark = Repetition::OneOrMore;
1223
0
    } else if (match(TokenType::Questionmark)) {
1224
0
        consume();
1225
0
        repetition_mark = Repetition::Optional;
1226
0
    } else if (match(TokenType::LeftCurly)) {
1227
0
        repetition_mark = Repetition::Explicit;
1228
0
        if (!parse_interval_quantifier(repeat_min, repeat_max)) {
1229
0
            if (unicode) {
1230
                // Invalid interval quantifiers are disallowed in Unicode mod - they must be escaped with '\{'.
1231
0
                set_error(Error::InvalidPattern);
1232
0
            }
1233
0
            return !has_error();
1234
0
        }
1235
0
    } else {
1236
0
        return true;
1237
0
    }
1238
1239
0
    if (match(TokenType::Questionmark)) {
1240
0
        consume();
1241
0
        ungreedy = true;
1242
0
    }
1243
1244
0
    switch (repetition_mark) {
1245
0
    case Repetition::OneOrMore:
1246
0
        ByteCode::transform_bytecode_repetition_min_one(stack, !ungreedy);
1247
0
        break;
1248
0
    case Repetition::ZeroOrMore:
1249
0
        ByteCode::transform_bytecode_repetition_any(stack, !ungreedy);
1250
0
        match_length_minimum = 0;
1251
0
        break;
1252
0
    case Repetition::Optional:
1253
0
        ByteCode::transform_bytecode_repetition_zero_or_one(stack, !ungreedy);
1254
0
        match_length_minimum = 0;
1255
0
        break;
1256
0
    case Repetition::Explicit: {
1257
0
        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
1258
0
        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
1259
0
        ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, min_repetition_mark_id, max_repetition_mark_id, !ungreedy);
1260
0
        match_length_minimum *= repeat_min.value();
1261
0
        break;
1262
0
    }
1263
0
    case Repetition::None:
1264
0
        VERIFY_NOT_REACHED();
1265
0
    }
1266
1267
0
    return true;
1268
0
}
1269
1270
bool ECMA262Parser::parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max)
1271
0
{
1272
0
    VERIFY(match(TokenType::LeftCurly));
1273
0
    consume();
1274
0
    auto chars_consumed = 1;
1275
1276
0
    auto low_bound_string = read_digits_as_string();
1277
0
    chars_consumed += low_bound_string.length();
1278
1279
0
    auto low_bound = low_bound_string.to_uint<u64>();
1280
1281
0
    if (!low_bound.has_value()) {
1282
0
        if (!m_should_use_browser_extended_grammar && done())
1283
0
            return set_error(Error::MismatchingBrace);
1284
1285
0
        back(chars_consumed + !done());
1286
0
        return false;
1287
0
    }
1288
1289
0
    repeat_min = low_bound.value();
1290
1291
0
    if (match(TokenType::Comma)) {
1292
0
        consume();
1293
0
        ++chars_consumed;
1294
0
        auto high_bound_string = read_digits_as_string();
1295
0
        auto high_bound = high_bound_string.to_uint<u64>();
1296
0
        if (high_bound.has_value()) {
1297
0
            repeat_max = high_bound.value();
1298
0
            chars_consumed += high_bound_string.length();
1299
0
        }
1300
0
    } else {
1301
0
        repeat_max = repeat_min;
1302
0
    }
1303
1304
0
    if (!match(TokenType::RightCurly)) {
1305
0
        if (!m_should_use_browser_extended_grammar && done())
1306
0
            return set_error(Error::MismatchingBrace);
1307
1308
0
        back(chars_consumed + !done());
1309
0
        return false;
1310
0
    }
1311
1312
0
    consume();
1313
0
    ++chars_consumed;
1314
1315
0
    if (repeat_max.has_value()) {
1316
0
        if (repeat_min.value() > repeat_max.value())
1317
0
            set_error(Error::InvalidBraceContent);
1318
0
    }
1319
1320
0
    if ((*repeat_min > s_ecma262_maximum_repetition_count) || (repeat_max.has_value() && (*repeat_max > s_ecma262_maximum_repetition_count)))
1321
0
        return set_error(Error::InvalidBraceContent);
1322
1323
0
    return true;
1324
0
}
1325
1326
bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
1327
0
{
1328
0
    if (match(TokenType::EscapeSequence)) {
1329
        // Also part of AtomEscape.
1330
0
        auto token = consume();
1331
0
        match_length_minimum += 1;
1332
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[1] } });
1333
0
        return true;
1334
0
    }
1335
0
    if (try_skip("\\")) {
1336
        // AtomEscape.
1337
0
        return parse_atom_escape(stack, match_length_minimum, unicode, named);
1338
0
    }
1339
1340
0
    if (match(TokenType::LeftBracket)) {
1341
        // Character class.
1342
0
        return parse_character_class(stack, match_length_minimum, unicode, named);
1343
0
    }
1344
1345
0
    if (match(TokenType::LeftParen)) {
1346
        // Non-capturing group, or a capture group.
1347
0
        return parse_capture_group(stack, match_length_minimum, unicode, named);
1348
0
    }
1349
1350
0
    if (match(TokenType::Period)) {
1351
0
        consume();
1352
0
        match_length_minimum += 1;
1353
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
1354
0
        return true;
1355
0
    }
1356
1357
0
    if (match(TokenType::Circumflex) || match(TokenType::Dollar) || match(TokenType::RightParen)
1358
0
        || match(TokenType::Pipe) || match(TokenType::Plus) || match(TokenType::Asterisk)
1359
0
        || match(TokenType::Questionmark)) {
1360
1361
0
        return false;
1362
0
    }
1363
1364
0
    if (match(TokenType::RightBracket) || match(TokenType::RightCurly) || match(TokenType::LeftCurly)) {
1365
0
        if (unicode)
1366
0
            return set_error(Error::InvalidPattern);
1367
1368
0
        if (m_should_use_browser_extended_grammar) {
1369
0
            auto token = consume();
1370
0
            match_length_minimum += 1;
1371
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } });
1372
0
            return true;
1373
0
        }
1374
0
        return false;
1375
0
    }
1376
1377
0
    if (match_ordinary_characters()) {
1378
0
        auto token = consume().value();
1379
0
        match_length_minimum += 1;
1380
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token[0] } });
1381
0
        return true;
1382
0
    }
1383
1384
0
    set_error(Error::InvalidPattern);
1385
0
    return false;
1386
0
}
1387
1388
bool ECMA262Parser::parse_extended_atom(ByteCode&, size_t&, bool)
1389
0
{
1390
    // Note: This includes only rules *not* present in parse_atom()
1391
0
    VERIFY(m_should_use_browser_extended_grammar);
1392
1393
0
    return parse_invalid_braced_quantifier(); // FAIL FAIL FAIL
1394
0
}
1395
1396
bool ECMA262Parser::parse_invalid_braced_quantifier()
1397
0
{
1398
0
    if (!match(TokenType::LeftCurly))
1399
0
        return false;
1400
0
    consume();
1401
0
    size_t chars_consumed = 1;
1402
0
    auto low_bound = read_digits_as_string();
1403
0
    StringView high_bound;
1404
1405
0
    if (low_bound.is_empty()) {
1406
0
        back(chars_consumed + !done());
1407
0
        return false;
1408
0
    }
1409
0
    chars_consumed += low_bound.length();
1410
0
    if (match(TokenType::Comma)) {
1411
0
        consume();
1412
0
        ++chars_consumed;
1413
1414
0
        high_bound = read_digits_as_string();
1415
0
        chars_consumed += high_bound.length();
1416
0
    }
1417
1418
0
    if (!match(TokenType::RightCurly)) {
1419
0
        back(chars_consumed + !done());
1420
0
        return false;
1421
0
    }
1422
1423
0
    consume();
1424
0
    set_error(Error::InvalidPattern);
1425
0
    return true;
1426
0
}
1427
1428
bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
1429
0
{
1430
0
    if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
1431
0
        if (auto escape = escape_str.to_uint(); escape.has_value()) {
1432
            // See if this is a "back"-reference (we've already parsed the group it refers to)
1433
0
            auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
1434
0
            if (maybe_length.has_value()) {
1435
0
                match_length_minimum += maybe_length.value();
1436
0
                stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
1437
0
                return true;
1438
0
            }
1439
            // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
1440
0
            if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
1441
                // This refers to a future group, and it will _always_ be matching an empty string
1442
                // So just match nothing and move on.
1443
0
                return true;
1444
0
            }
1445
0
            if (!m_should_use_browser_extended_grammar) {
1446
0
                set_error(Error::InvalidNumber);
1447
0
                return false;
1448
0
            }
1449
0
        }
1450
1451
        // If not, put the characters back.
1452
0
        back(escape_str.length());
1453
0
    }
1454
1455
    // CharacterEscape > ControlEscape
1456
0
    if (try_skip("f")) {
1457
0
        match_length_minimum += 1;
1458
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
1459
0
        return true;
1460
0
    }
1461
1462
0
    if (try_skip("n")) {
1463
0
        match_length_minimum += 1;
1464
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
1465
0
        return true;
1466
0
    }
1467
1468
0
    if (try_skip("r")) {
1469
0
        match_length_minimum += 1;
1470
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
1471
0
        return true;
1472
0
    }
1473
1474
0
    if (try_skip("t")) {
1475
0
        match_length_minimum += 1;
1476
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
1477
0
        return true;
1478
0
    }
1479
1480
0
    if (try_skip("v")) {
1481
0
        match_length_minimum += 1;
1482
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
1483
0
        return true;
1484
0
    }
1485
1486
    // CharacterEscape > ControlLetter
1487
0
    if (try_skip("c")) {
1488
0
        for (auto c : s_alphabetic_characters) {
1489
0
            if (try_skip({ &c, 1 })) {
1490
0
                match_length_minimum += 1;
1491
0
                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
1492
0
                return true;
1493
0
            }
1494
0
        }
1495
1496
0
        if (unicode) {
1497
0
            set_error(Error::InvalidPattern);
1498
0
            return false;
1499
0
        }
1500
1501
0
        if (m_should_use_browser_extended_grammar) {
1502
0
            back(1 + !done());
1503
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
1504
0
            match_length_minimum += 1;
1505
0
            return true;
1506
0
        }
1507
1508
        // Allow '\c' in non-unicode mode, just matches 'c'.
1509
0
        match_length_minimum += 1;
1510
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
1511
0
        return true;
1512
0
    }
1513
1514
    // '\0'
1515
0
    if (try_skip("0")) {
1516
0
        if (!lookahead_any(s_decimal_characters)) {
1517
0
            match_length_minimum += 1;
1518
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
1519
0
            return true;
1520
0
        }
1521
1522
0
        back();
1523
0
    }
1524
1525
    // LegacyOctalEscapeSequence
1526
0
    if (m_should_use_browser_extended_grammar) {
1527
0
        if (!unicode) {
1528
0
            if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
1529
0
                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } });
1530
0
                match_length_minimum += 1;
1531
0
                return true;
1532
0
            }
1533
0
        }
1534
0
    }
1535
1536
    // HexEscape
1537
0
    if (try_skip("x")) {
1538
0
        if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
1539
0
            match_length_minimum += 1;
1540
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
1541
0
            return true;
1542
0
        }
1543
0
        if (!unicode) {
1544
            // '\x' is allowed in non-unicode mode, just matches 'x'.
1545
0
            match_length_minimum += 1;
1546
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
1547
0
            return true;
1548
0
        }
1549
1550
0
        set_error(Error::InvalidPattern);
1551
0
        return false;
1552
0
    }
1553
1554
0
    if (try_skip("u")) {
1555
0
        if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) {
1556
0
            match_length_minimum += 1;
1557
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
1558
0
            return true;
1559
0
        }
1560
1561
0
        return false;
1562
0
    }
1563
1564
    // IdentityEscape
1565
0
    for (auto ch : identity_escape_characters(unicode, m_should_use_browser_extended_grammar)) {
1566
0
        if (try_skip({ &ch, 1 })) {
1567
0
            match_length_minimum += 1;
1568
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
1569
0
            return true;
1570
0
        }
1571
0
    }
1572
1573
0
    if (unicode) {
1574
0
        if (try_skip("/")) {
1575
0
            match_length_minimum += 1;
1576
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
1577
0
            return true;
1578
0
        }
1579
0
    }
1580
1581
0
    if (named && try_skip("k")) {
1582
0
        auto name = read_capture_group_specifier(true);
1583
0
        if (name.is_empty()) {
1584
0
            set_error(Error::InvalidNameForCaptureGroup);
1585
0
            return false;
1586
0
        }
1587
0
        auto maybe_capture_group = m_parser_state.named_capture_groups.get(name);
1588
0
        if (!maybe_capture_group.has_value()) {
1589
0
            set_error(Error::InvalidNameForCaptureGroup);
1590
0
            return false;
1591
0
        }
1592
0
        match_length_minimum += maybe_capture_group->minimum_length;
1593
1594
0
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group->group_index } });
1595
0
        return true;
1596
0
    }
1597
1598
0
    if (unicode) {
1599
0
        PropertyEscape property {};
1600
0
        bool negated = false;
1601
1602
0
        if (parse_unicode_property_escape(property, negated)) {
1603
0
            Vector<CompareTypeAndValuePair> compares;
1604
0
            if (negated)
1605
0
                compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
1606
0
            property.visit(
1607
0
                [&](Unicode::Property property) {
1608
0
                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property });
1609
0
                },
1610
0
                [&](Unicode::GeneralCategory general_category) {
1611
0
                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
1612
0
                },
1613
0
                [&](Script script) {
1614
0
                    if (script.is_extension)
1615
0
                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
1616
0
                    else
1617
0
                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
1618
0
                },
1619
0
                [](Empty&) { VERIFY_NOT_REACHED(); });
1620
0
            stack.insert_bytecode_compare_values(move(compares));
1621
0
            match_length_minimum += 1;
1622
0
            return true;
1623
0
        }
1624
0
    }
1625
1626
0
    if (done())
1627
0
        return set_error(Error::InvalidTrailingEscape);
1628
1629
0
    bool negate = false;
1630
0
    auto ch = parse_character_class_escape(negate);
1631
0
    if (!ch.has_value()) {
1632
0
        if (!unicode) {
1633
            // Allow all SourceCharacter's as escapes here.
1634
0
            auto token = consume();
1635
0
            match_length_minimum += 1;
1636
0
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } });
1637
0
            return true;
1638
0
        }
1639
1640
0
        set_error(Error::InvalidCharacterClass);
1641
0
        return false;
1642
0
    }
1643
1644
0
    Vector<CompareTypeAndValuePair> compares;
1645
0
    if (negate)
1646
0
        compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
1647
0
    compares.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)ch.value() });
1648
0
    match_length_minimum += 1;
1649
0
    stack.insert_bytecode_compare_values(move(compares));
1650
0
    return true;
1651
0
}
1652
Optional<u8> ECMA262Parser::parse_legacy_octal_escape()
1653
0
{
1654
0
    constexpr auto all_octal_digits = "01234567";
1655
0
    auto read_octal_digit = [&](auto start, auto end, bool should_ensure_no_following_octal_digit) -> Optional<u8> {
1656
0
        for (char c = '0' + start; c <= '0' + end; ++c) {
1657
0
            if (try_skip({ &c, 1 })) {
1658
0
                if (!should_ensure_no_following_octal_digit || !lookahead_any(all_octal_digits))
1659
0
                    return c - '0';
1660
0
                back(2);
1661
0
                return {};
1662
0
            }
1663
0
        }
1664
0
        return {};
1665
0
    };
1666
1667
    // OctalDigit(1)
1668
0
    if (auto digit = read_octal_digit(0, 7, true); digit.has_value()) {
1669
0
        return digit.value();
1670
0
    }
1671
1672
    // OctalDigit(2)
1673
0
    if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
1674
0
        if (auto right_digit = read_octal_digit(0, 7, true); right_digit.has_value()) {
1675
0
            return left_digit.value() * 8 + right_digit.value();
1676
0
        }
1677
1678
0
        back(2);
1679
0
    }
1680
1681
    // OctalDigit(2)
1682
0
    if (auto left_digit = read_octal_digit(4, 7, false); left_digit.has_value()) {
1683
0
        if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
1684
0
            return left_digit.value() * 8 + right_digit.value();
1685
0
        }
1686
1687
0
        back(2);
1688
0
    }
1689
1690
    // OctalDigit(3)
1691
0
    if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
1692
0
        size_t chars_consumed = 1;
1693
0
        if (auto mid_digit = read_octal_digit(0, 7, false); mid_digit.has_value()) {
1694
0
            ++chars_consumed;
1695
0
            if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
1696
0
                return left_digit.value() * 64 + mid_digit.value() * 8 + right_digit.value();
1697
0
            }
1698
0
        }
1699
1700
0
        back(chars_consumed);
1701
0
    }
1702
1703
0
    return {};
1704
0
}
1705
1706
Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
1707
0
{
1708
0
    if (expect_backslash && !try_skip("\\"))
1709
0
        return {};
1710
1711
    // CharacterClassEscape
1712
0
    CharClass ch_class;
1713
0
    if (try_skip("d")) {
1714
0
        ch_class = CharClass::Digit;
1715
0
    } else if (try_skip("D")) {
1716
0
        ch_class = CharClass::Digit;
1717
0
        negate = true;
1718
0
    } else if (try_skip("s")) {
1719
0
        ch_class = CharClass::Space;
1720
0
    } else if (try_skip("S")) {
1721
0
        ch_class = CharClass::Space;
1722
0
        negate = true;
1723
0
    } else if (try_skip("w")) {
1724
0
        ch_class = CharClass::Word;
1725
0
    } else if (try_skip("W")) {
1726
0
        ch_class = CharClass::Word;
1727
0
        negate = true;
1728
0
    } else {
1729
0
        return {};
1730
0
    }
1731
1732
0
    return ch_class;
1733
0
}
1734
1735
bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
1736
0
{
1737
0
    consume(TokenType::LeftBracket, Error::InvalidPattern);
1738
1739
0
    Vector<CompareTypeAndValuePair> compares;
1740
1741
0
    if (match(TokenType::Circumflex)) {
1742
        // Negated charclass
1743
0
        consume();
1744
0
        compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
1745
0
    }
1746
1747
0
    if (match(TokenType::RightBracket)) {
1748
0
        consume();
1749
        // Should only have at most an 'Inverse'
1750
0
        VERIFY(compares.size() <= 1);
1751
0
        stack.insert_bytecode_compare_values(move(compares));
1752
0
        return true;
1753
0
    }
1754
1755
0
    if (!parse_nonempty_class_ranges(compares, unicode))
1756
0
        return false;
1757
1758
0
    match_length_minimum += 1;
1759
0
    stack.insert_bytecode_compare_values(move(compares));
1760
0
    return true;
1761
0
}
1762
1763
struct CharClassRangeElement {
1764
    union {
1765
        CharClass character_class;
1766
        u32 code_point { 0 };
1767
        Unicode::Property property;
1768
        Unicode::GeneralCategory general_category;
1769
        Unicode::Script script;
1770
    };
1771
1772
    bool is_negated { false };
1773
    bool is_character_class { false };
1774
    bool is_property { false };
1775
    bool is_general_category { false };
1776
    bool is_script { false };
1777
    bool is_script_extension { false };
1778
};
1779
1780
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
1781
0
{
1782
0
    auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
1783
0
        if (match(TokenType::EscapeSequence)) {
1784
0
            auto token = consume().value();
1785
0
            return { CharClassRangeElement { .code_point = (u32)token[1], .is_character_class = false } };
1786
0
        }
1787
1788
0
        if (try_skip("\\")) {
1789
0
            if (done()) {
1790
0
                set_error(Error::InvalidTrailingEscape);
1791
0
                return {};
1792
0
            }
1793
1794
0
            if (try_skip("f"))
1795
0
                return { CharClassRangeElement { .code_point = '\f', .is_character_class = false } };
1796
0
            if (try_skip("n"))
1797
0
                return { CharClassRangeElement { .code_point = '\n', .is_character_class = false } };
1798
0
            if (try_skip("r"))
1799
0
                return { CharClassRangeElement { .code_point = '\r', .is_character_class = false } };
1800
0
            if (try_skip("t"))
1801
0
                return { CharClassRangeElement { .code_point = '\t', .is_character_class = false } };
1802
0
            if (try_skip("v"))
1803
0
                return { CharClassRangeElement { .code_point = '\v', .is_character_class = false } };
1804
0
            if (try_skip("b"))
1805
0
                return { CharClassRangeElement { .code_point = '\b', .is_character_class = false } };
1806
0
            if (try_skip("/"))
1807
0
                return { CharClassRangeElement { .code_point = '/', .is_character_class = false } };
1808
1809
            // CharacterEscape > ControlLetter
1810
0
            if (try_skip("c")) {
1811
0
                for (auto c : s_alphabetic_characters) {
1812
0
                    if (try_skip({ &c, 1 })) {
1813
0
                        return { CharClassRangeElement { .code_point = (u32)(c % 32), .is_character_class = false } };
1814
0
                    }
1815
0
                }
1816
1817
0
                if (unicode) {
1818
0
                    set_error(Error::InvalidPattern);
1819
0
                    return {};
1820
0
                }
1821
1822
0
                if (m_should_use_browser_extended_grammar) {
1823
0
                    for (auto c = '0'; c <= '9'; ++c) {
1824
0
                        if (try_skip({ &c, 1 }))
1825
0
                            return { CharClassRangeElement { .code_point = (u32)(c % 32), .is_character_class = false } };
1826
0
                    }
1827
0
                    if (try_skip("_"))
1828
0
                        return { CharClassRangeElement { .code_point = (u32)('_' % 32), .is_character_class = false } };
1829
1830
0
                    back(1 + !done());
1831
0
                    return { CharClassRangeElement { .code_point = '\\', .is_character_class = false } };
1832
0
                }
1833
0
            }
1834
1835
            // '\0'
1836
0
            if (try_skip("0")) {
1837
0
                if (!lookahead_any(s_decimal_characters))
1838
0
                    return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
1839
0
                back();
1840
0
            }
1841
1842
            // LegacyOctalEscapeSequence
1843
0
            if (m_should_use_browser_extended_grammar && !unicode) {
1844
0
                if (auto escape = parse_legacy_octal_escape(); escape.has_value())
1845
0
                    return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } };
1846
0
            }
1847
1848
            // HexEscape
1849
0
            if (try_skip("x")) {
1850
0
                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
1851
0
                    return { CharClassRangeElement { .code_point = hex_escape.value(), .is_character_class = false } };
1852
0
                } else if (!unicode) {
1853
                    // '\x' is allowed in non-unicode mode, just matches 'x'.
1854
0
                    return { CharClassRangeElement { .code_point = 'x', .is_character_class = false } };
1855
0
                } else {
1856
0
                    set_error(Error::InvalidPattern);
1857
0
                    return {};
1858
0
                }
1859
0
            }
1860
1861
0
            if (try_skip("u")) {
1862
0
                if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) {
1863
                    // FIXME: While code point ranges are supported, code point matches as "Char" are not!
1864
0
                    return { CharClassRangeElement { .code_point = code_point.value(), .is_character_class = false } };
1865
0
                }
1866
0
                return {};
1867
0
            }
1868
1869
            // IdentityEscape
1870
0
            for (auto ch : identity_escape_characters(unicode, m_should_use_browser_extended_grammar)) {
1871
0
                if (try_skip({ &ch, 1 }))
1872
0
                    return { CharClassRangeElement { .code_point = (u32)ch, .is_character_class = false } };
1873
0
            }
1874
1875
0
            if (unicode) {
1876
0
                if (try_skip("-"))
1877
0
                    return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
1878
1879
0
                PropertyEscape property {};
1880
0
                bool negated = false;
1881
0
                if (parse_unicode_property_escape(property, negated)) {
1882
0
                    return property.visit(
1883
0
                        [&](Unicode::Property property) {
1884
0
                            return CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property = true };
1885
0
                        },
1886
0
                        [&](Unicode::GeneralCategory general_category) {
1887
0
                            return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
1888
0
                        },
1889
0
                        [&](Script script) {
1890
0
                            if (script.is_extension)
1891
0
                                return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script_extension = true };
1892
1893
0
                            return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script = true };
1894
0
                        },
1895
0
                        [](Empty&) -> CharClassRangeElement { VERIFY_NOT_REACHED(); });
1896
0
                }
1897
0
            }
1898
1899
0
            if (try_skip("d"))
1900
0
                return { CharClassRangeElement { .character_class = CharClass::Digit, .is_character_class = true } };
1901
0
            if (try_skip("s"))
1902
0
                return { CharClassRangeElement { .character_class = CharClass::Space, .is_character_class = true } };
1903
0
            if (try_skip("w"))
1904
0
                return { CharClassRangeElement { .character_class = CharClass::Word, .is_character_class = true } };
1905
0
            if (try_skip("D"))
1906
0
                return { CharClassRangeElement { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
1907
0
            if (try_skip("S"))
1908
0
                return { CharClassRangeElement { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
1909
0
            if (try_skip("W"))
1910
0
                return { CharClassRangeElement { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
1911
1912
0
            if (!unicode) {
1913
                // Any unrecognised escape is allowed in non-unicode mode.
1914
0
                return { CharClassRangeElement { .code_point = (u32)skip(), .is_character_class = false } };
1915
0
            }
1916
1917
0
            set_error(Error::InvalidPattern);
1918
0
            return {};
1919
0
        }
1920
1921
0
        if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
1922
0
            return {};
1923
1924
        // Allow any (other) SourceCharacter.
1925
0
        return { CharClassRangeElement { .code_point = (u32)skip(), .is_character_class = false } };
1926
0
    };
1927
0
    auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
1928
0
        if (match(TokenType::HyphenMinus)) {
1929
0
            consume();
1930
0
            return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
1931
0
        }
1932
1933
0
        return read_class_atom_no_dash();
1934
0
    };
1935
1936
0
    auto empend_atom = [&](auto& atom) {
1937
0
        if (atom.is_character_class) {
1938
0
            if (atom.is_negated)
1939
0
                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
1940
1941
0
            if (atom.is_property)
1942
0
                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
1943
0
            else if (atom.is_general_category)
1944
0
                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
1945
0
            else if (atom.is_script)
1946
0
                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
1947
0
            else if (atom.is_script_extension)
1948
0
                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)(atom.script) });
1949
0
            else
1950
0
                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
1951
0
        } else {
1952
0
            VERIFY(!atom.is_negated);
1953
0
            ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, atom.code_point });
1954
0
        }
1955
0
    };
1956
1957
0
    while (!match(TokenType::RightBracket)) {
1958
0
        if (match(TokenType::Eof)) {
1959
0
            set_error(Error::MismatchingBracket);
1960
0
            return false;
1961
0
        }
1962
1963
0
        auto first_atom = read_class_atom();
1964
0
        if (!first_atom.has_value())
1965
0
            return false;
1966
1967
0
        if (match(TokenType::HyphenMinus)) {
1968
0
            consume();
1969
0
            if (match(TokenType::RightBracket)) {
1970
                //  Allow '-' as the last element in a charclass, even after an atom.
1971
0
                m_parser_state.lexer.back(2); // -]
1972
0
                m_parser_state.current_token = m_parser_state.lexer.next();
1973
0
                goto read_as_single_atom;
1974
0
            }
1975
0
            auto second_atom = read_class_atom();
1976
0
            if (!second_atom.has_value())
1977
0
                return false;
1978
1979
0
            if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
1980
0
                if (m_should_use_browser_extended_grammar) {
1981
0
                    if (unicode) {
1982
0
                        set_error(Error::InvalidRange);
1983
0
                        return false;
1984
0
                    }
1985
1986
                    // CharacterRangeOrUnion > !Unicode > CharClass
1987
0
                    empend_atom(*first_atom);
1988
0
                    ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)'-' });
1989
0
                    empend_atom(*second_atom);
1990
0
                    continue;
1991
0
                }
1992
1993
0
                set_error(Error::InvalidRange);
1994
0
                return false;
1995
0
            }
1996
1997
0
            if (first_atom.value().code_point > second_atom.value().code_point) {
1998
0
                set_error(Error::InvalidRange);
1999
0
                return false;
2000
0
            }
2001
2002
0
            VERIFY(!first_atom.value().is_negated);
2003
0
            VERIFY(!second_atom.value().is_negated);
2004
2005
0
            ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point } });
2006
0
            continue;
2007
0
        }
2008
2009
0
    read_as_single_atom:;
2010
2011
0
        auto atom = first_atom.value();
2012
0
        empend_atom(atom);
2013
0
    }
2014
2015
0
    consume(TokenType::RightBracket, Error::MismatchingBracket);
2016
2017
0
    return true;
2018
0
}
2019
2020
bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated)
2021
0
{
2022
0
    negated = false;
2023
2024
0
    if (try_skip("p"))
2025
0
        negated = false;
2026
0
    else if (try_skip("P"))
2027
0
        negated = true;
2028
0
    else
2029
0
        return false;
2030
2031
0
    auto parsed_property = read_unicode_property_escape();
2032
0
    if (!parsed_property.has_value()) {
2033
0
        set_error(Error::InvalidNameForProperty);
2034
0
        return false;
2035
0
    }
2036
2037
0
    property = move(*parsed_property);
2038
2039
0
    return property.visit(
2040
0
        [this](Unicode::Property property) {
2041
0
            if (!Unicode::is_ecma262_property(property)) {
2042
0
                set_error(Error::InvalidNameForProperty);
2043
0
                return false;
2044
0
            }
2045
0
            return true;
2046
0
        },
2047
0
        [](Unicode::GeneralCategory) { return true; },
2048
0
        [](Script) { return true; },
2049
0
        [](Empty&) -> bool { VERIFY_NOT_REACHED(); });
2050
0
}
2051
2052
FlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
2053
0
{
2054
0
    static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
2055
0
    static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
2056
0
    static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD;
2057
0
    constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C };
2058
0
    constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
2059
2060
0
    if (take_starting_angle_bracket && !consume("<"))
2061
0
        return {};
2062
2063
0
    StringBuilder builder;
2064
2065
0
    auto consume_code_point = [&] {
2066
0
        Utf8View utf_8_view { m_parser_state.lexer.source().substring_view(m_parser_state.lexer.tell() - 1) };
2067
0
        if (utf_8_view.is_empty())
2068
0
            return REPLACEMENT_CHARACTER;
2069
0
        u32 code_point = *utf_8_view.begin();
2070
0
        auto characters = utf_8_view.byte_offset_of(1);
2071
2072
0
        while (characters-- > 0)
2073
0
            consume();
2074
2075
0
        return code_point;
2076
0
    };
2077
2078
0
    {
2079
        // The first character is limited to: https://tc39.es/ecma262/#prod-RegExpIdentifierStart
2080
        //  RegExpIdentifierStart[UnicodeMode] ::
2081
        //      IdentifierStartChar
2082
        //      \ RegExpUnicodeEscapeSequence[+UnicodeMode]
2083
        //      [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
2084
2085
0
        auto code_point = consume_code_point();
2086
2087
0
        if (code_point == '\\' && match('u')) {
2088
0
            consume();
2089
2090
0
            if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) {
2091
0
                code_point = *maybe_code_point;
2092
0
            } else {
2093
0
                set_error(Error::InvalidNameForCaptureGroup);
2094
0
                return {};
2095
0
            }
2096
0
        }
2097
2098
0
        if (is_ascii(code_point)) {
2099
            // The only valid ID_Start unicode characters in ascii are the letters.
2100
0
            if (!is_ascii_alpha(code_point) && code_point != '$' && code_point != '_') {
2101
0
                set_error(Error::InvalidNameForCaptureGroup);
2102
0
                return {};
2103
0
            }
2104
0
        } else if (id_start_category.has_value() && !Unicode::code_point_has_property(code_point, *id_start_category)) {
2105
0
            set_error(Error::InvalidNameForCaptureGroup);
2106
0
            return {};
2107
0
        }
2108
0
        builder.append_code_point(code_point);
2109
0
    }
2110
2111
0
    bool hit_end = false;
2112
2113
    // Any following characters are limited to:
2114
    //  RegExpIdentifierPart[UnicodeMode] ::
2115
    //      IdentifierPartChar
2116
    //      \ RegExpUnicodeEscapeSequence[+UnicodeMode]
2117
    //      [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
2118
2119
0
    while (match(TokenType::Char) || match(TokenType::Dollar) || match(TokenType::LeftCurly) || match(TokenType::RightCurly)) {
2120
0
        auto code_point = consume_code_point();
2121
2122
0
        if (code_point == '>') {
2123
0
            hit_end = true;
2124
0
            break;
2125
0
        }
2126
2127
0
        if (code_point == '\\') {
2128
0
            if (!try_skip("u")) {
2129
0
                set_error(Error::InvalidNameForCaptureGroup);
2130
0
                return {};
2131
0
            }
2132
0
            if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) {
2133
0
                code_point = *maybe_code_point;
2134
0
            } else {
2135
0
                set_error(Error::InvalidNameForCaptureGroup);
2136
0
                return {};
2137
0
            }
2138
0
        }
2139
2140
0
        if (is_ascii(code_point)) {
2141
            // The only valid ID_Continue unicode characters in ascii are the letters and numbers.
2142
0
            if (!is_ascii_alphanumeric(code_point) && code_point != '$' && code_point != '_') {
2143
0
                set_error(Error::InvalidNameForCaptureGroup);
2144
0
                return {};
2145
0
            }
2146
0
        } else if (code_point != ZERO_WIDTH_JOINER && code_point != ZERO_WIDTH_NON_JOINER) {
2147
0
            if (id_continue_category.has_value() && !Unicode::code_point_has_property(code_point, *id_continue_category)) {
2148
0
                set_error(Error::InvalidNameForCaptureGroup);
2149
0
                return {};
2150
0
            }
2151
0
        }
2152
0
        builder.append_code_point(code_point);
2153
0
    }
2154
2155
0
    FlyString name = builder.build();
2156
0
    if (!hit_end || name.is_empty())
2157
0
        set_error(Error::InvalidNameForCaptureGroup);
2158
2159
0
    return name;
2160
0
}
2161
2162
Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_escape()
2163
0
{
2164
0
    consume(TokenType::LeftCurly, Error::InvalidPattern);
2165
2166
    // Note: clang-format is disabled here because it doesn't handle templated lambdas yet.
2167
    // clang-format off
2168
0
    auto read_until = [&]<typename... Ts>(Ts&&... terminators) {
2169
0
        auto start_token = m_parser_state.current_token;
2170
0
        size_t offset = 0;
2171
2172
0
        while (match(TokenType::Char)) {
2173
0
            if (m_parser_state.current_token.value().is_one_of(forward<Ts>(terminators)...))
2174
0
                break;
2175
0
            offset += consume().value().length();
2176
0
        }
2177
2178
0
        return StringView { start_token.value().characters_without_null_termination(), offset };
2179
0
    };
Unexecuted instantiation: RegexParser.cpp:auto regex::ECMA262Parser::read_unicode_property_escape()::$_10::operator()<AK::StringView, AK::StringView>(AK::StringView&&, AK::StringView&&) const
Unexecuted instantiation: RegexParser.cpp:auto regex::ECMA262Parser::read_unicode_property_escape()::$_10::operator()<AK::StringView>(AK::StringView&&) const
2180
    // clang-format on
2181
2182
0
    StringView property_type;
2183
0
    StringView property_name = read_until("="sv, "}"sv);
2184
2185
0
    if (try_skip("="sv)) {
2186
0
        if (property_name.is_empty())
2187
0
            return {};
2188
0
        property_type = property_name;
2189
0
        property_name = read_until("}"sv);
2190
0
    }
2191
2192
0
    consume(TokenType::RightCurly, Error::InvalidPattern);
2193
2194
0
    if (property_type.is_empty()) {
2195
0
        if (auto property = Unicode::property_from_string(property_name); property.has_value())
2196
0
            return { *property };
2197
0
        if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
2198
0
            return { *general_category };
2199
0
    } else if ((property_type == "General_Category"sv) || (property_type == "gc"sv)) {
2200
0
        if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
2201
0
            return { *general_category };
2202
0
    } else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
2203
0
        if (auto script = Unicode::script_from_string(property_name); script.has_value())
2204
0
            return Script { *script, false };
2205
0
    } else if ((property_type == "Script_Extensions"sv) || (property_type == "scx"sv)) {
2206
0
        if (auto script = Unicode::script_from_string(property_name); script.has_value())
2207
0
            return Script { *script, true };
2208
0
    }
2209
2210
0
    return {};
2211
0
}
2212
2213
bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
2214
0
{
2215
0
    consume(TokenType::LeftParen, Error::InvalidPattern);
2216
2217
0
    auto register_capture_group_in_current_scope = [&](auto identifier) {
2218
0
        m_capture_groups_in_scope.last().empend(identifier);
2219
0
    };
2220
2221
0
    if (match(TokenType::Questionmark)) {
2222
        // Non-capturing group or group with specifier.
2223
0
        consume();
2224
2225
0
        if (match(TokenType::Colon)) {
2226
0
            consume();
2227
0
            ByteCode noncapture_group_bytecode;
2228
0
            size_t length = 0;
2229
2230
0
            enter_capture_group_scope();
2231
0
            if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
2232
0
                return set_error(Error::InvalidPattern);
2233
0
            clear_all_capture_groups_in_scope(stack);
2234
0
            exit_capture_group_scope();
2235
2236
0
            consume(TokenType::RightParen, Error::MismatchingParen);
2237
2238
0
            stack.extend(move(noncapture_group_bytecode));
2239
0
            match_length_minimum += length;
2240
0
            return true;
2241
0
        }
2242
2243
0
        if (consume("<")) {
2244
0
            ++m_parser_state.named_capture_groups_count;
2245
0
            auto group_index = ++m_parser_state.capture_groups_count; // Named capture groups count as normal capture groups too.
2246
0
            auto name = read_capture_group_specifier();
2247
2248
0
            if (name.is_empty()) {
2249
0
                set_error(Error::InvalidNameForCaptureGroup);
2250
0
                return false;
2251
0
            }
2252
2253
0
            if (m_parser_state.named_capture_groups.contains(name)) {
2254
0
                set_error(Error::DuplicateNamedCapture);
2255
0
                return false;
2256
0
            }
2257
2258
0
            ByteCode capture_group_bytecode;
2259
0
            size_t length = 0;
2260
0
            enter_capture_group_scope();
2261
0
            if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
2262
0
                return set_error(Error::InvalidPattern);
2263
0
            clear_all_capture_groups_in_scope(stack);
2264
0
            exit_capture_group_scope();
2265
2266
0
            register_capture_group_in_current_scope(group_index);
2267
2268
0
            consume(TokenType::RightParen, Error::MismatchingParen);
2269
2270
0
            stack.insert_bytecode_group_capture_left(group_index);
2271
0
            stack.extend(move(capture_group_bytecode));
2272
0
            stack.insert_bytecode_group_capture_right(group_index, name.view());
2273
2274
0
            match_length_minimum += length;
2275
2276
0
            m_parser_state.capture_group_minimum_lengths.set(group_index, length);
2277
0
            m_parser_state.named_capture_groups.set(name, { group_index, length });
2278
0
            return true;
2279
0
        }
2280
2281
0
        set_error(Error::InvalidCaptureGroup);
2282
0
        return false;
2283
0
    }
2284
2285
0
    auto group_index = ++m_parser_state.capture_groups_count;
2286
0
    enter_capture_group_scope();
2287
2288
0
    ByteCode capture_group_bytecode;
2289
0
    size_t length = 0;
2290
2291
0
    if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
2292
0
        return set_error(Error::InvalidPattern);
2293
2294
0
    clear_all_capture_groups_in_scope(stack);
2295
0
    exit_capture_group_scope();
2296
2297
0
    register_capture_group_in_current_scope(group_index);
2298
2299
0
    stack.insert_bytecode_group_capture_left(group_index);
2300
0
    stack.extend(move(capture_group_bytecode));
2301
2302
0
    m_parser_state.capture_group_minimum_lengths.set(group_index, length);
2303
2304
0
    consume(TokenType::RightParen, Error::MismatchingParen);
2305
2306
0
    stack.insert_bytecode_group_capture_right(group_index);
2307
2308
0
    match_length_minimum += length;
2309
2310
0
    return true;
2311
0
}
2312
2313
size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
2314
0
{
2315
0
    if (m_total_number_of_capturing_parenthesis.has_value())
2316
0
        return m_total_number_of_capturing_parenthesis.value();
2317
2318
0
    GenericLexer lexer { m_parser_state.lexer.source() };
2319
0
    size_t count = 0;
2320
0
    while (!lexer.is_eof()) {
2321
0
        switch (lexer.peek()) {
2322
0
        case '\\':
2323
0
            lexer.consume(2);
2324
0
            continue;
2325
0
        case '[':
2326
0
            while (!lexer.is_eof()) {
2327
0
                if (lexer.consume_specific('\\'))
2328
0
                    lexer.consume();
2329
0
                else if (lexer.consume_specific(']'))
2330
0
                    break;
2331
0
                lexer.consume();
2332
0
            }
2333
0
            break;
2334
0
        case '(':
2335
0
            lexer.consume();
2336
0
            if (lexer.consume_specific('?')) {
2337
                // non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<'
2338
0
                if (!lexer.consume_specific('<'))
2339
0
                    break;
2340
2341
0
                if (lexer.next_is(is_any_of("=!")))
2342
0
                    break;
2343
2344
0
                ++count;
2345
0
            } else {
2346
0
                ++count;
2347
0
            }
2348
0
            break;
2349
0
        default:
2350
0
            lexer.consume();
2351
0
            break;
2352
0
        }
2353
0
    }
2354
2355
0
    m_total_number_of_capturing_parenthesis = count;
2356
0
    return count;
2357
0
}
2358
}