/src/serenity/Userland/Libraries/LibRegex/RegexParser.h

Source (jump to first uncovered line)
/*
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include "RegexByteCode.h"
#include "RegexError.h"
#include "RegexLexer.h"
#include "RegexOptions.h"

#include <AK/Forward.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h>

namespace regex {

class PosixExtendedParser;
class PosixBasicParser;
class ECMA262Parser;

template<typename T>
struct GenericParserTraits {
    using OptionsType = T;
};

template<typename T>
struct ParserTraits : public GenericParserTraits<T> {
};

template<>
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
};

template<>
struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
};

template<>
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
};

class Parser {
public:
    struct Result {
        ByteCode bytecode;
        size_t capture_groups_count;
        size_t named_capture_groups_count;
        size_t match_length_minimum;
        Error error;
        Token error_token;
        Vector<FlyString> capture_groups;
        AllOptions options;
    };

    explicit Parser(Lexer& lexer)
        : m_parser_state(lexer)
    {
    }

    Parser(Lexer& lexer, AllOptions regex_options)
        : m_parser_state(lexer, regex_options)
    {
    }

    virtual ~Parser() = default;

    Result parse(Optional<AllOptions> regex_options = {});
    bool has_error() const { return m_parser_state.error != Error::NoError; }
    Error error() const { return m_parser_state.error; }
    AllOptions options() const { return m_parser_state.regex_options; }

protected:
    virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;

    ALWAYS_INLINE bool match(TokenType type) const;
    ALWAYS_INLINE bool match(char ch) const;
    ALWAYS_INLINE bool match_ordinary_characters();
    ALWAYS_INLINE Token consume();
    ALWAYS_INLINE Token consume(TokenType type, Error error);
    ALWAYS_INLINE bool consume(String const&);
    ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
    ALWAYS_INLINE bool try_skip(StringView);
    ALWAYS_INLINE bool lookahead_any(StringView);
    ALWAYS_INLINE unsigned char skip();
    ALWAYS_INLINE void back(size_t = 1);
    ALWAYS_INLINE void reset();
    ALWAYS_INLINE bool done() const;
    ALWAYS_INLINE bool set_error(Error error);

    struct NamedCaptureGroup {
        size_t group_index { 0 };
        size_t minimum_length { 0 };
    };

    struct ParserState {
        Lexer& lexer;
        Token current_token;
        Error error = Error::NoError;
        Token error_token { TokenType::Eof, 0, StringView(nullptr) };
        ByteCode bytecode;
        size_t capture_groups_count { 0 };
        size_t named_capture_groups_count { 0 };
        size_t match_length_minimum { 0 };
        size_t repetition_mark_count { 0 };
        AllOptions regex_options;
        HashMap<int, size_t> capture_group_minimum_lengths;
        HashMap<FlyString, NamedCaptureGroup> named_capture_groups;

        explicit ParserState(Lexer& lexer)
            : lexer(lexer)
            , current_token(lexer.next())
        {
        }
        explicit ParserState(Lexer& lexer, AllOptions regex_options)
            : lexer(lexer)
            , current_token(lexer.next())
            , regex_options(regex_options)
        {
        }
    };

    ParserState m_parser_state;
};

class AbstractPosixParser : public Parser {
protected:
    explicit AbstractPosixParser(Lexer& lexer)
        : Parser(lexer)
    {
    }

    AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
        : Parser(lexer, regex_options.value_or({}))
    {
    }

    ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
};

class PosixBasicParser final : public AbstractPosixParser {
public:
    explicit PosixBasicParser(Lexer& lexer)
        : AbstractPosixParser(lexer)
    {
    }

    PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
        : AbstractPosixParser(lexer, regex_options.value_or({}))
    {
    }

    ~PosixBasicParser() = default;

private:
    bool parse_internal(ByteCode&, size_t&) override;

    bool parse_root(ByteCode&, size_t&);
    bool parse_re_expression(ByteCode&, size_t&);
    bool parse_simple_re(ByteCode&, size_t&);
    bool parse_nonduplicating_re(ByteCode&, size_t&);
    bool parse_one_char_or_collation_element(ByteCode&, size_t&);

    constexpr static size_t number_of_addressable_capture_groups = 9;
    size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
    bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
    size_t m_current_capture_group_depth { 0 };
};

class PosixExtendedParser final : public AbstractPosixParser {
    constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);

public:
    explicit PosixExtendedParser(Lexer& lexer)
        : AbstractPosixParser(lexer, default_options)
    {
    }

    PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
        : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
    {
    }

    ~PosixExtendedParser() = default;

private:
    ALWAYS_INLINE bool match_repetition_symbol();

    bool parse_internal(ByteCode&, size_t&) override;

    bool parse_root(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
};

class ECMA262Parser final : public Parser {
    constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);

public:
    explicit ECMA262Parser(Lexer& lexer)
        : Parser(lexer, default_options)
    {
        m_capture_groups_in_scope.empend();
    }

    ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
        : Parser(lexer, regex_options.value_or({}) | default_options.value())
    {
        m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
        m_capture_groups_in_scope.empend();
    }

    ~ECMA262Parser() = default;

private:
    bool parse_internal(ByteCode&, size_t&) override;

    enum class ReadDigitsInitialZeroState {
        Allow,
        Disallow,
    };
    StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
    Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
    FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);

    struct Script {
        Unicode::Script script {};
        bool is_extension { false };
    };
    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;
    Optional<PropertyEscape> read_unicode_property_escape();

    bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
    bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
    Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
    bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
    bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);

    // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
    bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
    bool parse_extended_atom(ByteCode&, size_t&, bool named);
    bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
    bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
    bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
    Optional<u8> parse_legacy_octal_escape();

    size_t ensure_total_number_of_capturing_parenthesis();

    void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }

    void exit_capture_group_scope()
    {
        auto last = m_capture_groups_in_scope.take_last();
        m_capture_groups_in_scope.last().extend(move(last));
    }

    void clear_all_capture_groups_in_scope(ByteCode& stack)
    {
        for (auto& index : m_capture_groups_in_scope.last())
            stack.insert_bytecode_clear_capture_group(index);
    };

    // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
    // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
    // parse it twice, so we'll just do so when it's actually needed.
    // Most patterns should have no need to ever populate this field.
    Optional<size_t> m_total_number_of_capturing_parenthesis;

    // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
    bool m_should_use_browser_extended_grammar { false };

    // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
    // by requiring that (...)+ only contain the matches for the last iteration.
    // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
    Vector<Vector<size_t>> m_capture_groups_in_scope;
};

using PosixExtended = PosixExtendedParser;
using PosixBasic = PosixBasicParser;
using ECMA262 = ECMA262Parser;

}

using regex::ECMA262;
using regex::PosixBasic;
using regex::PosixExtended;

Coverage Report

Created: 2022-05-20 06:25

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3		*
4		* SPDX-License-Identifier: BSD-2-Clause
5		*/
6
7		#pragma once
8
9		#include "RegexByteCode.h"
10		#include "RegexError.h"
11		#include "RegexLexer.h"
12		#include "RegexOptions.h"
13
14		#include <AK/Forward.h>
15		#include <AK/StringBuilder.h>
16		#include <AK/Types.h>
17		#include <AK/Vector.h>
18		#include <LibUnicode/Forward.h>
19
20		namespace regex {
21
22		class PosixExtendedParser;
23		class PosixBasicParser;
24		class ECMA262Parser;
25
26		template<typename T>
27		struct GenericParserTraits {
28		using OptionsType = T;
29		};
30
31		template<typename T>
32		struct ParserTraits : public GenericParserTraits<T> {
33		};
34
35		template<>
36		struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
37		};
38
39		template<>
40		struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
41		};
42
43		template<>
44		struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
45		};
46
47		class Parser {
48		public:
49		struct Result {
50		ByteCode bytecode;
51		size_t capture_groups_count;
52		size_t named_capture_groups_count;
53		size_t match_length_minimum;
54		Error error;
55		Token error_token;
56		Vector<FlyString> capture_groups;
57		AllOptions options;
58		};
59
60		explicit Parser(Lexer& lexer)
61		: m_parser_state(lexer)
62	0	{
63	0	}
64
65		Parser(Lexer& lexer, AllOptions regex_options)
66		: m_parser_state(lexer, regex_options)
67	18	{
68	18	}
69
70	18	virtual ~Parser() = default;
71
72		Result parse(Optional<AllOptions> regex_options = {});
73	882	bool has_error() const { return m_parser_state.error != Error::NoError; }
74	0	Error error() const { return m_parser_state.error; }
75	0	AllOptions options() const { return m_parser_state.regex_options; }
76
77		protected:
78		virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
79
80		ALWAYS_INLINE bool match(TokenType type) const;
81		ALWAYS_INLINE bool match(char ch) const;
82		ALWAYS_INLINE bool match_ordinary_characters();
83		ALWAYS_INLINE Token consume();
84		ALWAYS_INLINE Token consume(TokenType type, Error error);
85		ALWAYS_INLINE bool consume(String const&);
86		ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
87		ALWAYS_INLINE bool try_skip(StringView);
88		ALWAYS_INLINE bool lookahead_any(StringView);
89		ALWAYS_INLINE unsigned char skip();
90		ALWAYS_INLINE void back(size_t = 1);
91		ALWAYS_INLINE void reset();
92		ALWAYS_INLINE bool done() const;
93		ALWAYS_INLINE bool set_error(Error error);
94
95		struct NamedCaptureGroup {
96		size_t group_index { 0 };
97		size_t minimum_length { 0 };
98		};
99
100		struct ParserState {
101		Lexer& lexer;
102		Token current_token;
103		Error error = Error::NoError;
104		Token error_token { TokenType::Eof, 0, StringView(nullptr) };
105		ByteCode bytecode;
106		size_t capture_groups_count { 0 };
107		size_t named_capture_groups_count { 0 };
108		size_t match_length_minimum { 0 };
109		size_t repetition_mark_count { 0 };
110		AllOptions regex_options;
111		HashMap<int, size_t> capture_group_minimum_lengths;
112		HashMap<FlyString, NamedCaptureGroup> named_capture_groups;
113
114		explicit ParserState(Lexer& lexer)
115		: lexer(lexer)
116		, current_token(lexer.next())
117	0	{
118	0	}
119		explicit ParserState(Lexer& lexer, AllOptions regex_options)
120		: lexer(lexer)
121		, current_token(lexer.next())
122		, regex_options(regex_options)
123	18	{
124	18	}
125		};
126
127		ParserState m_parser_state;
128		};
129
130		class AbstractPosixParser : public Parser {
131		protected:
132		explicit AbstractPosixParser(Lexer& lexer)
133		: Parser(lexer)
134	0	{
135	0	}
136
137		AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
138		: Parser(lexer, regex_options.value_or({}))
139	0	{
140	0	}
141
142		ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
143		};
144
145		class PosixBasicParser final : public AbstractPosixParser {
146		public:
147		explicit PosixBasicParser(Lexer& lexer)
148		: AbstractPosixParser(lexer)
149	0	{
150	0	}
151
152		PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
153		: AbstractPosixParser(lexer, regex_options.value_or({}))
154	0	{
155	0	}
156
157		~PosixBasicParser() = default;
158
159		private:
160		bool parse_internal(ByteCode&, size_t&) override;
161
162		bool parse_root(ByteCode&, size_t&);
163		bool parse_re_expression(ByteCode&, size_t&);
164		bool parse_simple_re(ByteCode&, size_t&);
165		bool parse_nonduplicating_re(ByteCode&, size_t&);
166		bool parse_one_char_or_collation_element(ByteCode&, size_t&);
167
168		constexpr static size_t number_of_addressable_capture_groups = 9;
169		size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
170		bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
171		size_t m_current_capture_group_depth { 0 };
172		};
173
174		class PosixExtendedParser final : public AbstractPosixParser {
175		constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) \| static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
176
177		public:
178		explicit PosixExtendedParser(Lexer& lexer)
179		: AbstractPosixParser(lexer, default_options)
180	0	{
181	0	}
182
183		PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
184		: AbstractPosixParser(lexer, regex_options.value_or({}) \| default_options.value())
185	0	{
186	0	}
187
188		~PosixExtendedParser() = default;
189
190		private:
191		ALWAYS_INLINE bool match_repetition_symbol();
192
193		bool parse_internal(ByteCode&, size_t&) override;
194
195		bool parse_root(ByteCode&, size_t&);
196		ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
197		ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
198		ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
199		};
200
201		class ECMA262Parser final : public Parser {
202		constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
203
204		public:
205		explicit ECMA262Parser(Lexer& lexer)
206		: Parser(lexer, default_options)
207	0	{
208	0	m_capture_groups_in_scope.empend();
209	0	}
210
211		ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
212		: Parser(lexer, regex_options.value_or({}) \| default_options.value())
213	18	{
214	18	m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
215	18	m_capture_groups_in_scope.empend();
216	18	}
217
218	18	~ECMA262Parser() = default;
219
220		private:
221		bool parse_internal(ByteCode&, size_t&) override;
222
223		enum class ReadDigitsInitialZeroState {
224		Allow,
225		Disallow,
226		};
227		StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
228		Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
229		FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
230
231		struct Script {
232		Unicode::Script script {};
233		bool is_extension { false };
234		};
235		using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;
236		Optional<PropertyEscape> read_unicode_property_escape();
237
238		bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
239		bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
240		bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
241		bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
242		bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
243		bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
244		bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
245		bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
246		bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
247		bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
248		bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
249		Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
250		bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
251		bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
252
253		// Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
254		bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
255		bool parse_extended_atom(ByteCode&, size_t&, bool named);
256		bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
257		bool parse_invalid_braced_quantifier(); // Note: This function either parses and fails, or doesn't parse anything and returns false.
258		bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
259		Optional<u8> parse_legacy_octal_escape();
260
261		size_t ensure_total_number_of_capturing_parenthesis();
262
263	856	void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
264
265		void exit_capture_group_scope()
266	59	{
267	59	auto last = m_capture_groups_in_scope.take_last();
268	59	m_capture_groups_in_scope.last().extend(move(last));
269	59	}
270
271		void clear_all_capture_groups_in_scope(ByteCode& stack)
272	59	{
273	59	for (auto& index : m_capture_groups_in_scope.last())
274	0	stack.insert_bytecode_clear_capture_group(index);
275	59	};
276
277		// ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
278		// always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
279		// parse it twice, so we'll just do so when it's actually needed.
280		// Most patterns should have no need to ever populate this field.
281		Optional<size_t> m_total_number_of_capturing_parenthesis;
282
283		// Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
284		bool m_should_use_browser_extended_grammar { false };
285
286		// ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
287		// by requiring that (...)+ only contain the matches for the last iteration.
288		// To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
289		Vector<Vector<size_t>> m_capture_groups_in_scope;
290		};
291
292		using PosixExtended = PosixExtendedParser;
293		using PosixBasic = PosixBasicParser;
294		using ECMA262 = ECMA262Parser;
295
296		}
297
298		using regex::ECMA262;
299		using regex::PosixBasic;
300		using regex::PosixExtended;