Coverage Report

Created: 2022-05-20 06:25

/src/serenity/Userland/Libraries/LibRegex/RegexParser.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#pragma once
8
9
#include "RegexByteCode.h"
10
#include "RegexError.h"
11
#include "RegexLexer.h"
12
#include "RegexOptions.h"
13
14
#include <AK/Forward.h>
15
#include <AK/StringBuilder.h>
16
#include <AK/Types.h>
17
#include <AK/Vector.h>
18
#include <LibUnicode/Forward.h>
19
20
namespace regex {
21
22
class PosixExtendedParser;
23
class PosixBasicParser;
24
class ECMA262Parser;
25
26
template<typename T>
27
struct GenericParserTraits {
28
    using OptionsType = T;
29
};
30
31
template<typename T>
32
struct ParserTraits : public GenericParserTraits<T> {
33
};
34
35
template<>
36
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
37
};
38
39
template<>
40
struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
41
};
42
43
template<>
44
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
45
};
46
47
class Parser {
48
public:
49
    struct Result {
50
        ByteCode bytecode;
51
        size_t capture_groups_count;
52
        size_t named_capture_groups_count;
53
        size_t match_length_minimum;
54
        Error error;
55
        Token error_token;
56
        Vector<FlyString> capture_groups;
57
        AllOptions options;
58
    };
59
60
    explicit Parser(Lexer& lexer)
61
        : m_parser_state(lexer)
62
0
    {
63
0
    }
64
65
    Parser(Lexer& lexer, AllOptions regex_options)
66
        : m_parser_state(lexer, regex_options)
67
18
    {
68
18
    }
69
70
18
    virtual ~Parser() = default;
71
72
    Result parse(Optional<AllOptions> regex_options = {});
73
882
    bool has_error() const { return m_parser_state.error != Error::NoError; }
74
0
    Error error() const { return m_parser_state.error; }
75
0
    AllOptions options() const { return m_parser_state.regex_options; }
76
77
protected:
78
    virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
79
80
    ALWAYS_INLINE bool match(TokenType type) const;
81
    ALWAYS_INLINE bool match(char ch) const;
82
    ALWAYS_INLINE bool match_ordinary_characters();
83
    ALWAYS_INLINE Token consume();
84
    ALWAYS_INLINE Token consume(TokenType type, Error error);
85
    ALWAYS_INLINE bool consume(String const&);
86
    ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
87
    ALWAYS_INLINE bool try_skip(StringView);
88
    ALWAYS_INLINE bool lookahead_any(StringView);
89
    ALWAYS_INLINE unsigned char skip();
90
    ALWAYS_INLINE void back(size_t = 1);
91
    ALWAYS_INLINE void reset();
92
    ALWAYS_INLINE bool done() const;
93
    ALWAYS_INLINE bool set_error(Error error);
94
95
    struct NamedCaptureGroup {
96
        size_t group_index { 0 };
97
        size_t minimum_length { 0 };
98
    };
99
100
    struct ParserState {
101
        Lexer& lexer;
102
        Token current_token;
103
        Error error = Error::NoError;
104
        Token error_token { TokenType::Eof, 0, StringView(nullptr) };
105
        ByteCode bytecode;
106
        size_t capture_groups_count { 0 };
107
        size_t named_capture_groups_count { 0 };
108
        size_t match_length_minimum { 0 };
109
        size_t repetition_mark_count { 0 };
110
        AllOptions regex_options;
111
        HashMap<int, size_t> capture_group_minimum_lengths;
112
        HashMap<FlyString, NamedCaptureGroup> named_capture_groups;
113
114
        explicit ParserState(Lexer& lexer)
115
            : lexer(lexer)
116
            , current_token(lexer.next())
117
0
        {
118
0
        }
119
        explicit ParserState(Lexer& lexer, AllOptions regex_options)
120
            : lexer(lexer)
121
            , current_token(lexer.next())
122
            , regex_options(regex_options)
123
18
        {
124
18
        }
125
    };
126
127
    ParserState m_parser_state;
128
};
129
130
class AbstractPosixParser : public Parser {
131
protected:
132
    explicit AbstractPosixParser(Lexer& lexer)
133
        : Parser(lexer)
134
0
    {
135
0
    }
136
137
    AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
138
        : Parser(lexer, regex_options.value_or({}))
139
0
    {
140
0
    }
141
142
    ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
143
};
144
145
class PosixBasicParser final : public AbstractPosixParser {
146
public:
147
    explicit PosixBasicParser(Lexer& lexer)
148
        : AbstractPosixParser(lexer)
149
0
    {
150
0
    }
151
152
    PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
153
        : AbstractPosixParser(lexer, regex_options.value_or({}))
154
0
    {
155
0
    }
156
157
    ~PosixBasicParser() = default;
158
159
private:
160
    bool parse_internal(ByteCode&, size_t&) override;
161
162
    bool parse_root(ByteCode&, size_t&);
163
    bool parse_re_expression(ByteCode&, size_t&);
164
    bool parse_simple_re(ByteCode&, size_t&);
165
    bool parse_nonduplicating_re(ByteCode&, size_t&);
166
    bool parse_one_char_or_collation_element(ByteCode&, size_t&);
167
168
    constexpr static size_t number_of_addressable_capture_groups = 9;
169
    size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
170
    bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
171
    size_t m_current_capture_group_depth { 0 };
172
};
173
174
class PosixExtendedParser final : public AbstractPosixParser {
175
    constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
176
177
public:
178
    explicit PosixExtendedParser(Lexer& lexer)
179
        : AbstractPosixParser(lexer, default_options)
180
0
    {
181
0
    }
182
183
    PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
184
        : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
185
0
    {
186
0
    }
187
188
    ~PosixExtendedParser() = default;
189
190
private:
191
    ALWAYS_INLINE bool match_repetition_symbol();
192
193
    bool parse_internal(ByteCode&, size_t&) override;
194
195
    bool parse_root(ByteCode&, size_t&);
196
    ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
197
    ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
198
    ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
199
};
200
201
class ECMA262Parser final : public Parser {
202
    constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
203
204
public:
205
    explicit ECMA262Parser(Lexer& lexer)
206
        : Parser(lexer, default_options)
207
0
    {
208
0
        m_capture_groups_in_scope.empend();
209
0
    }
210
211
    ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
212
        : Parser(lexer, regex_options.value_or({}) | default_options.value())
213
18
    {
214
18
        m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
215
18
        m_capture_groups_in_scope.empend();
216
18
    }
217
218
18
    ~ECMA262Parser() = default;
219
220
private:
221
    bool parse_internal(ByteCode&, size_t&) override;
222
223
    enum class ReadDigitsInitialZeroState {
224
        Allow,
225
        Disallow,
226
    };
227
    StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
228
    Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
229
    FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
230
231
    struct Script {
232
        Unicode::Script script {};
233
        bool is_extension { false };
234
    };
235
    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;
236
    Optional<PropertyEscape> read_unicode_property_escape();
237
238
    bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
239
    bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
240
    bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
241
    bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
242
    bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
243
    bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
244
    bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
245
    bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
246
    bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
247
    bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
248
    bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
249
    Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
250
    bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
251
    bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
252
253
    // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
254
    bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
255
    bool parse_extended_atom(ByteCode&, size_t&, bool named);
256
    bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
257
    bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
258
    bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
259
    Optional<u8> parse_legacy_octal_escape();
260
261
    size_t ensure_total_number_of_capturing_parenthesis();
262
263
856
    void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
264
265
    void exit_capture_group_scope()
266
59
    {
267
59
        auto last = m_capture_groups_in_scope.take_last();
268
59
        m_capture_groups_in_scope.last().extend(move(last));
269
59
    }
270
271
    void clear_all_capture_groups_in_scope(ByteCode& stack)
272
59
    {
273
59
        for (auto& index : m_capture_groups_in_scope.last())
274
0
            stack.insert_bytecode_clear_capture_group(index);
275
59
    };
276
277
    // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
278
    // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
279
    // parse it twice, so we'll just do so when it's actually needed.
280
    // Most patterns should have no need to ever populate this field.
281
    Optional<size_t> m_total_number_of_capturing_parenthesis;
282
283
    // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
284
    bool m_should_use_browser_extended_grammar { false };
285
286
    // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
287
    // by requiring that (...)+ only contain the matches for the last iteration.
288
    // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
289
    Vector<Vector<size_t>> m_capture_groups_in_scope;
290
};
291
292
using PosixExtended = PosixExtendedParser;
293
using PosixBasic = PosixBasicParser;
294
using ECMA262 = ECMA262Parser;
295
296
}
297
298
using regex::ECMA262;
299
using regex::PosixBasic;
300
using regex::PosixExtended;