Coverage Report

Created: 2025-11-02 07:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibRegex/RegexMatcher.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/BumpAllocator.h>
8
#include <AK/ByteString.h>
9
#include <AK/Debug.h>
10
#include <AK/StringBuilder.h>
11
#include <LibRegex/RegexMatcher.h>
12
#include <LibRegex/RegexParser.h>
13
14
#if REGEX_DEBUG
15
#    include <LibRegex/RegexDebug.h>
16
#endif
17
18
namespace regex {
19
20
#if REGEX_DEBUG
21
static RegexDebug s_regex_dbg(stderr);
22
#endif
23
24
template<class Parser>
25
regex::Parser::Result Regex<Parser>::parse_pattern(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options)
26
0
{
27
0
    regex::Lexer lexer(pattern);
28
29
0
    Parser parser(lexer, regex_options);
30
0
    return parser.parse();
31
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::parse_pattern(AK::StringView, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::parse_pattern(AK::StringView, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::parse_pattern(AK::StringView, regex::RegexOptions<regex::ECMAScriptFlags>)
32
33
template<class Parser>
34
Regex<Parser>::Regex(ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options)
35
7.81k
    : pattern_value(move(pattern))
36
7.81k
{
37
7.81k
    regex::Lexer lexer(pattern_value);
38
39
7.81k
    Parser parser(lexer, regex_options);
40
7.81k
    parser_result = parser.parse();
41
42
7.81k
    run_optimization_passes();
43
7.81k
    if (parser_result.error == regex::Error::NoError)
44
5.04k
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
7.81k
}
regex::Regex<regex::PosixBasicParser>::Regex(AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Line
Count
Source
35
4.82k
    : pattern_value(move(pattern))
36
4.82k
{
37
4.82k
    regex::Lexer lexer(pattern_value);
38
39
4.82k
    Parser parser(lexer, regex_options);
40
4.82k
    parser_result = parser.parse();
41
42
4.82k
    run_optimization_passes();
43
4.82k
    if (parser_result.error == regex::Error::NoError)
44
3.26k
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
4.82k
}
regex::Regex<regex::PosixExtendedParser>::Regex(AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Line
Count
Source
35
686
    : pattern_value(move(pattern))
36
686
{
37
686
    regex::Lexer lexer(pattern_value);
38
39
686
    Parser parser(lexer, regex_options);
40
686
    parser_result = parser.parse();
41
42
686
    run_optimization_passes();
43
686
    if (parser_result.error == regex::Error::NoError)
44
269
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
686
}
regex::Regex<regex::ECMA262Parser>::Regex(AK::ByteString, regex::RegexOptions<regex::ECMAScriptFlags>)
Line
Count
Source
35
2.30k
    : pattern_value(move(pattern))
36
2.30k
{
37
2.30k
    regex::Lexer lexer(pattern_value);
38
39
2.30k
    Parser parser(lexer, regex_options);
40
2.30k
    parser_result = parser.parse();
41
42
2.30k
    run_optimization_passes();
43
2.30k
    if (parser_result.error == regex::Error::NoError)
44
1.50k
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
2.30k
}
46
47
template<class Parser>
48
Regex<Parser>::Regex(regex::Parser::Result parse_result, ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options)
49
0
    : pattern_value(move(pattern))
50
0
    , parser_result(move(parse_result))
51
0
{
52
0
    run_optimization_passes();
53
0
    if (parser_result.error == regex::Error::NoError)
54
0
        matcher = make<Matcher<Parser>>(this, regex_options | static_cast<decltype(regex_options.value())>(parse_result.options.value()));
55
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::Regex(regex::Parser::Result, AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::Regex(regex::Parser::Result, AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::Regex(regex::Parser::Result, AK::ByteString, regex::RegexOptions<regex::ECMAScriptFlags>)
56
57
template<class Parser>
58
Regex<Parser>::Regex(Regex&& regex)
59
5.28k
    : pattern_value(move(regex.pattern_value))
60
5.28k
    , parser_result(move(regex.parser_result))
61
5.28k
    , matcher(move(regex.matcher))
62
5.28k
    , start_offset(regex.start_offset)
63
5.28k
{
64
5.28k
    if (matcher)
65
4.49k
        matcher->reset_pattern({}, this);
66
5.28k
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::Regex(regex::Regex<regex::PosixBasicParser>&&)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::Regex(regex::Regex<regex::PosixExtendedParser>&&)
regex::Regex<regex::ECMA262Parser>::Regex(regex::Regex<regex::ECMA262Parser>&&)
Line
Count
Source
59
5.28k
    : pattern_value(move(regex.pattern_value))
60
5.28k
    , parser_result(move(regex.parser_result))
61
5.28k
    , matcher(move(regex.matcher))
62
5.28k
    , start_offset(regex.start_offset)
63
5.28k
{
64
5.28k
    if (matcher)
65
4.49k
        matcher->reset_pattern({}, this);
66
5.28k
}
67
68
template<class Parser>
69
Regex<Parser>& Regex<Parser>::operator=(Regex&& regex)
70
0
{
71
0
    pattern_value = move(regex.pattern_value);
72
0
    parser_result = move(regex.parser_result);
73
0
    matcher = move(regex.matcher);
74
0
    if (matcher)
75
0
        matcher->reset_pattern({}, this);
76
0
    start_offset = regex.start_offset;
77
0
    return *this;
78
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::operator=(regex::Regex<regex::PosixBasicParser>&&)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::operator=(regex::Regex<regex::PosixExtendedParser>&&)
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::operator=(regex::Regex<regex::ECMA262Parser>&&)
79
80
template<class Parser>
81
typename ParserTraits<Parser>::OptionsType Regex<Parser>::options() const
82
0
{
83
0
    if (!matcher || parser_result.error != Error::NoError)
84
0
        return {};
85
86
0
    return matcher->options();
87
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::options() const
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::options() const
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::options() const
88
89
template<class Parser>
90
ByteString Regex<Parser>::error_string(Optional<ByteString> message) const
91
0
{
92
0
    StringBuilder eb;
93
0
    eb.append("Error during parsing of regular expression:\n"sv);
94
0
    eb.appendff("    {}\n    ", pattern_value);
95
0
    for (size_t i = 0; i < parser_result.error_token.position(); ++i)
96
0
        eb.append(' ');
97
98
0
    eb.appendff("^---- {}", message.value_or(get_error_string(parser_result.error)));
99
0
    return eb.to_byte_string();
100
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::error_string(AK::Optional<AK::ByteString>) const
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::error_string(AK::Optional<AK::ByteString>) const
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::error_string(AK::Optional<AK::ByteString>) const
101
102
template<typename Parser>
103
RegexResult Matcher<Parser>::match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
104
34.3M
{
105
34.3M
    AllOptions options = m_regex_options | regex_options.value_or({}).value();
106
107
34.3M
    if constexpr (!IsSame<Parser, ECMA262>) {
108
0
        if (options.has_flag_set(AllFlags::Multiline))
109
0
            return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
110
0
    }
111
112
0
    Vector<RegexStringView> views;
113
34.3M
    views.append(view);
114
34.3M
    return match(views, regex_options);
115
34.3M
}
Unexecuted instantiation: regex::Matcher<regex::PosixBasicParser>::match(regex::RegexStringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
Unexecuted instantiation: regex::Matcher<regex::PosixExtendedParser>::match(regex::RegexStringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
regex::Matcher<regex::ECMA262Parser>::match(regex::RegexStringView, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const
Line
Count
Source
104
34.3M
{
105
34.3M
    AllOptions options = m_regex_options | regex_options.value_or({}).value();
106
107
    if constexpr (!IsSame<Parser, ECMA262>) {
108
        if (options.has_flag_set(AllFlags::Multiline))
109
            return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
110
    }
111
112
34.3M
    Vector<RegexStringView> views;
113
34.3M
    views.append(view);
114
34.3M
    return match(views, regex_options);
115
34.3M
}
116
117
template<typename Parser>
118
RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
119
34.3M
{
120
    // If the pattern *itself* isn't stateful, reset any changes to start_offset.
121
34.3M
    if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful))
122
34.3M
        m_pattern->start_offset = 0;
123
124
34.3M
    size_t match_count { 0 };
125
126
34.3M
    MatchInput input;
127
34.3M
    MatchState state;
128
34.3M
    size_t operations = 0;
129
130
34.3M
    input.regex_options = m_regex_options | regex_options.value_or({}).value();
131
34.3M
    input.start_offset = m_pattern->start_offset;
132
34.3M
    size_t lines_to_skip = 0;
133
134
34.3M
    bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
135
34.3M
    for (auto const& view : views)
136
34.3M
        const_cast<RegexStringView&>(view).set_unicode(unicode);
137
138
34.3M
    if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
139
0
        if (views.size() > 1 && input.start_offset > views.first().length()) {
140
0
            dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
141
0
            for (auto const& view : views) {
142
0
                if (input.start_offset < view.length() + 1)
143
0
                    break;
144
0
                ++lines_to_skip;
145
0
                input.start_offset -= view.length() + 1;
146
0
                input.global_offset += view.length() + 1;
147
0
            }
148
0
            dbgln_if(REGEX_DEBUG, "Ended with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
149
0
        }
150
0
    }
151
152
34.3M
    if (c_match_preallocation_count) {
153
0
        state.matches.ensure_capacity(c_match_preallocation_count);
154
0
        state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
155
0
        auto& capture_groups_count = m_pattern->parser_result.capture_groups_count;
156
157
0
        for (size_t j = 0; j < c_match_preallocation_count; ++j) {
158
0
            state.matches.empend();
159
0
            state.capture_group_matches.empend();
160
0
            state.capture_group_matches.mutable_at(j).ensure_capacity(capture_groups_count);
161
0
            for (size_t k = 0; k < capture_groups_count; ++k)
162
0
                state.capture_group_matches.mutable_at(j).unchecked_append({});
163
0
        }
164
0
    }
165
166
34.3M
    auto append_match = [](auto& input, auto& state, auto& start_position) {
167
1.07M
        if (state.matches.size() == input.match_index)
168
1.07M
            state.matches.empend();
169
170
1.07M
        VERIFY(start_position + state.string_position - start_position <= input.view.length());
171
1.07M
        if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
172
0
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_byte_string(), input.line, start_position, input.global_offset + start_position };
173
1.07M
        } else { // let the view point to the original string ...
174
1.07M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
175
1.07M
        }
176
1.07M
    };
Unexecuted instantiation: auto regex::Matcher<regex::PosixBasicParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&, auto:2&, auto:3&)#1}::operator()<regex::MatchInput, regex::MatchState, unsigned long>(regex::MatchInput&, regex::MatchState&, unsigned long&) const
Unexecuted instantiation: auto regex::Matcher<regex::PosixExtendedParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&, auto:2&, auto:3&)#1}::operator()<regex::MatchInput, regex::MatchState, unsigned long>(regex::MatchInput&, regex::MatchState&, unsigned long&) const
auto regex::Matcher<regex::ECMA262Parser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const::{lambda(auto:1&, auto:2&, auto:3&)#1}::operator()<regex::MatchInput, regex::MatchState, unsigned long>(regex::MatchInput&, regex::MatchState&, unsigned long&) const
Line
Count
Source
166
1.07M
    auto append_match = [](auto& input, auto& state, auto& start_position) {
167
1.07M
        if (state.matches.size() == input.match_index)
168
1.07M
            state.matches.empend();
169
170
1.07M
        VERIFY(start_position + state.string_position - start_position <= input.view.length());
171
1.07M
        if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
172
0
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_byte_string(), input.line, start_position, input.global_offset + start_position };
173
1.07M
        } else { // let the view point to the original string ...
174
1.07M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
175
1.07M
        }
176
1.07M
    };
177
178
#if REGEX_DEBUG
179
    s_regex_dbg.print_header();
180
#endif
181
182
34.3M
    bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline);
183
34.3M
    if (input.regex_options.has_flag_set(AllFlags::Sticky))
184
0
        continue_search = false;
185
186
34.3M
    auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch);
187
188
34.3M
    for (auto const& view : views) {
189
34.3M
        if (lines_to_skip != 0) {
190
0
            ++input.line;
191
0
            --lines_to_skip;
192
0
            continue;
193
0
        }
194
34.3M
        input.view = view;
195
34.3M
        dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view);
196
197
34.3M
        auto view_length = view.length();
198
34.3M
        size_t view_index = m_pattern->start_offset;
199
34.3M
        state.string_position = view_index;
200
34.3M
        state.string_position_in_code_units = view_index;
201
34.3M
        bool succeeded = false;
202
203
34.3M
        if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
204
            // Run the code until it tries to consume something.
205
            // This allows non-consuming code to run on empty strings, for instance
206
            // e.g. "Exit"
207
0
            size_t temp_operations = operations;
208
209
0
            input.column = match_count;
210
0
            input.match_index = match_count;
211
212
0
            state.string_position = view_index;
213
0
            state.string_position_in_code_units = view_index;
214
0
            state.instruction_position = 0;
215
0
            state.repetition_marks.clear();
216
217
0
            auto success = execute(input, state, temp_operations);
218
            // This success is acceptable only if it doesn't read anything from the input (input length is 0).
219
0
            if (success && (state.string_position <= view_index)) {
220
0
                operations = temp_operations;
221
0
                if (!match_count) {
222
                    // Nothing was *actually* matched, so append an empty match.
223
0
                    append_match(input, state, view_index);
224
0
                    ++match_count;
225
226
                    // This prevents a regex pattern like ".*" from matching the empty string
227
                    // multiple times, once in this block and once in the following for loop.
228
0
                    if (view_index == 0 && view_length == 0)
229
0
                        ++view_index;
230
0
                }
231
0
            }
232
0
        }
233
234
34.3M
        for (; view_index <= view_length; ++view_index) {
235
34.3M
            if (view_index == view_length && input.regex_options.has_flag_set(AllFlags::Multiline))
236
0
                break;
237
238
34.3M
            auto& match_length_minimum = m_pattern->parser_result.match_length_minimum;
239
            // FIXME: More performant would be to know the remaining minimum string
240
            //        length needed to match from the current position onwards within
241
            //        the vm. Add new OpCode for MinMatchLengthFromSp with the value of
242
            //        the remaining string length from the current path. The value though
243
            //        has to be filled in reverse. That implies a second run over bytecode
244
            //        after generation has finished.
245
34.3M
            if (match_length_minimum && match_length_minimum > view_length - view_index)
246
29.7M
                break;
247
248
4.59M
            input.column = match_count;
249
4.59M
            input.match_index = match_count;
250
251
4.59M
            state.string_position = view_index;
252
4.59M
            state.string_position_in_code_units = view_index;
253
4.59M
            state.instruction_position = 0;
254
4.59M
            state.repetition_marks.clear();
255
256
4.59M
            auto success = execute(input, state, operations);
257
4.59M
            if (success) {
258
1.07M
                succeeded = true;
259
260
1.07M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
261
0
                    if (!continue_search)
262
0
                        break;
263
0
                    continue;
264
0
                }
265
1.07M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) {
266
0
                    if (!continue_search)
267
0
                        break;
268
0
                    continue;
269
0
                }
270
271
1.07M
                dbgln_if(REGEX_DEBUG, "state.string_position={}, view_index={}", state.string_position, view_index);
272
1.07M
                dbgln_if(REGEX_DEBUG, "[match] Found a match (length={}): '{}'", state.string_position - view_index, input.view.substring_view(view_index, state.string_position - view_index));
273
274
1.07M
                ++match_count;
275
276
1.07M
                if (continue_search) {
277
0
                    append_match(input, state, view_index);
278
279
0
                    bool has_zero_length = state.string_position == view_index;
280
0
                    view_index = state.string_position - (has_zero_length ? 0 : 1);
281
0
                    if (single_match_only)
282
0
                        break;
283
0
                    continue;
284
0
                }
285
1.07M
                if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
286
0
                    append_match(input, state, view_index);
287
0
                    break;
288
0
                }
289
1.07M
                if (state.string_position < view_length) {
290
0
                    return { false, 0, {}, {}, {}, operations };
291
0
                }
292
293
1.07M
                append_match(input, state, view_index);
294
1.07M
                break;
295
1.07M
            }
296
297
3.52M
            if (!continue_search)
298
3.52M
                break;
299
3.52M
        }
300
301
34.3M
        ++input.line;
302
34.3M
        input.global_offset += view.length() + 1; // +1 includes the line break character
303
304
34.3M
        if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
305
0
            m_pattern->start_offset = state.string_position;
306
307
34.3M
        if (succeeded && !continue_search)
308
1.07M
            break;
309
34.3M
    }
310
311
34.3M
    RegexResult result {
312
34.3M
        match_count != 0,
313
34.3M
        match_count,
314
34.3M
        move(state.matches).release(),
315
34.3M
        move(state.capture_group_matches).release(),
316
34.3M
        operations,
317
34.3M
        m_pattern->parser_result.capture_groups_count,
318
34.3M
        m_pattern->parser_result.named_capture_groups_count,
319
34.3M
    };
320
321
34.3M
    if (match_count) {
322
        // Make sure there are as many capture matches as there are actual matches.
323
1.07M
        if (result.capture_group_matches.size() < match_count)
324
0
            result.capture_group_matches.resize(match_count);
325
1.07M
        for (auto& matches : result.capture_group_matches)
326
1.07M
            matches.resize(m_pattern->parser_result.capture_groups_count + 1);
327
1.07M
        if (!input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) {
328
1.07M
            for (auto& matches : result.capture_group_matches)
329
5.27M
                matches.remove_all_matching([](auto& match) { return match.view.is_null(); });
Unexecuted instantiation: auto regex::Matcher<regex::PosixBasicParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&)#1}::operator()<regex::Match>(regex::Match&) const
Unexecuted instantiation: auto regex::Matcher<regex::PosixExtendedParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&)#1}::operator()<regex::Match>(regex::Match&) const
auto regex::Matcher<regex::ECMA262Parser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const::{lambda(auto:1&)#1}::operator()<regex::Match>(regex::Match&) const
Line
Count
Source
329
5.27M
                matches.remove_all_matching([](auto& match) { return match.view.is_null(); });
330
1.07M
        }
331
33.2M
    } else {
332
33.2M
        result.capture_group_matches.clear_with_capacity();
333
33.2M
    }
334
335
34.3M
    return result;
336
34.3M
}
Unexecuted instantiation: regex::Matcher<regex::PosixBasicParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
Unexecuted instantiation: regex::Matcher<regex::PosixExtendedParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
regex::Matcher<regex::ECMA262Parser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const
Line
Count
Source
119
34.3M
{
120
    // If the pattern *itself* isn't stateful, reset any changes to start_offset.
121
34.3M
    if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful))
122
34.3M
        m_pattern->start_offset = 0;
123
124
34.3M
    size_t match_count { 0 };
125
126
34.3M
    MatchInput input;
127
34.3M
    MatchState state;
128
34.3M
    size_t operations = 0;
129
130
34.3M
    input.regex_options = m_regex_options | regex_options.value_or({}).value();
131
34.3M
    input.start_offset = m_pattern->start_offset;
132
34.3M
    size_t lines_to_skip = 0;
133
134
34.3M
    bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
135
34.3M
    for (auto const& view : views)
136
34.3M
        const_cast<RegexStringView&>(view).set_unicode(unicode);
137
138
34.3M
    if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
139
0
        if (views.size() > 1 && input.start_offset > views.first().length()) {
140
0
            dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
141
0
            for (auto const& view : views) {
142
0
                if (input.start_offset < view.length() + 1)
143
0
                    break;
144
0
                ++lines_to_skip;
145
0
                input.start_offset -= view.length() + 1;
146
0
                input.global_offset += view.length() + 1;
147
0
            }
148
0
            dbgln_if(REGEX_DEBUG, "Ended with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
149
0
        }
150
0
    }
151
152
34.3M
    if (c_match_preallocation_count) {
153
0
        state.matches.ensure_capacity(c_match_preallocation_count);
154
0
        state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
155
0
        auto& capture_groups_count = m_pattern->parser_result.capture_groups_count;
156
157
0
        for (size_t j = 0; j < c_match_preallocation_count; ++j) {
158
0
            state.matches.empend();
159
0
            state.capture_group_matches.empend();
160
0
            state.capture_group_matches.mutable_at(j).ensure_capacity(capture_groups_count);
161
0
            for (size_t k = 0; k < capture_groups_count; ++k)
162
0
                state.capture_group_matches.mutable_at(j).unchecked_append({});
163
0
        }
164
0
    }
165
166
34.3M
    auto append_match = [](auto& input, auto& state, auto& start_position) {
167
34.3M
        if (state.matches.size() == input.match_index)
168
34.3M
            state.matches.empend();
169
170
34.3M
        VERIFY(start_position + state.string_position - start_position <= input.view.length());
171
34.3M
        if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
172
34.3M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_byte_string(), input.line, start_position, input.global_offset + start_position };
173
34.3M
        } else { // let the view point to the original string ...
174
34.3M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
175
34.3M
        }
176
34.3M
    };
177
178
#if REGEX_DEBUG
179
    s_regex_dbg.print_header();
180
#endif
181
182
34.3M
    bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline);
183
34.3M
    if (input.regex_options.has_flag_set(AllFlags::Sticky))
184
0
        continue_search = false;
185
186
34.3M
    auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch);
187
188
34.3M
    for (auto const& view : views) {
189
34.3M
        if (lines_to_skip != 0) {
190
0
            ++input.line;
191
0
            --lines_to_skip;
192
0
            continue;
193
0
        }
194
34.3M
        input.view = view;
195
34.3M
        dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view);
196
197
34.3M
        auto view_length = view.length();
198
34.3M
        size_t view_index = m_pattern->start_offset;
199
34.3M
        state.string_position = view_index;
200
34.3M
        state.string_position_in_code_units = view_index;
201
34.3M
        bool succeeded = false;
202
203
34.3M
        if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
204
            // Run the code until it tries to consume something.
205
            // This allows non-consuming code to run on empty strings, for instance
206
            // e.g. "Exit"
207
0
            size_t temp_operations = operations;
208
209
0
            input.column = match_count;
210
0
            input.match_index = match_count;
211
212
0
            state.string_position = view_index;
213
0
            state.string_position_in_code_units = view_index;
214
0
            state.instruction_position = 0;
215
0
            state.repetition_marks.clear();
216
217
0
            auto success = execute(input, state, temp_operations);
218
            // This success is acceptable only if it doesn't read anything from the input (input length is 0).
219
0
            if (success && (state.string_position <= view_index)) {
220
0
                operations = temp_operations;
221
0
                if (!match_count) {
222
                    // Nothing was *actually* matched, so append an empty match.
223
0
                    append_match(input, state, view_index);
224
0
                    ++match_count;
225
226
                    // This prevents a regex pattern like ".*" from matching the empty string
227
                    // multiple times, once in this block and once in the following for loop.
228
0
                    if (view_index == 0 && view_length == 0)
229
0
                        ++view_index;
230
0
                }
231
0
            }
232
0
        }
233
234
34.3M
        for (; view_index <= view_length; ++view_index) {
235
34.3M
            if (view_index == view_length && input.regex_options.has_flag_set(AllFlags::Multiline))
236
0
                break;
237
238
34.3M
            auto& match_length_minimum = m_pattern->parser_result.match_length_minimum;
239
            // FIXME: More performant would be to know the remaining minimum string
240
            //        length needed to match from the current position onwards within
241
            //        the vm. Add new OpCode for MinMatchLengthFromSp with the value of
242
            //        the remaining string length from the current path. The value though
243
            //        has to be filled in reverse. That implies a second run over bytecode
244
            //        after generation has finished.
245
34.3M
            if (match_length_minimum && match_length_minimum > view_length - view_index)
246
29.7M
                break;
247
248
4.59M
            input.column = match_count;
249
4.59M
            input.match_index = match_count;
250
251
4.59M
            state.string_position = view_index;
252
4.59M
            state.string_position_in_code_units = view_index;
253
4.59M
            state.instruction_position = 0;
254
4.59M
            state.repetition_marks.clear();
255
256
4.59M
            auto success = execute(input, state, operations);
257
4.59M
            if (success) {
258
1.07M
                succeeded = true;
259
260
1.07M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
261
0
                    if (!continue_search)
262
0
                        break;
263
0
                    continue;
264
0
                }
265
1.07M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) {
266
0
                    if (!continue_search)
267
0
                        break;
268
0
                    continue;
269
0
                }
270
271
1.07M
                dbgln_if(REGEX_DEBUG, "state.string_position={}, view_index={}", state.string_position, view_index);
272
1.07M
                dbgln_if(REGEX_DEBUG, "[match] Found a match (length={}): '{}'", state.string_position - view_index, input.view.substring_view(view_index, state.string_position - view_index));
273
274
1.07M
                ++match_count;
275
276
1.07M
                if (continue_search) {
277
0
                    append_match(input, state, view_index);
278
279
0
                    bool has_zero_length = state.string_position == view_index;
280
0
                    view_index = state.string_position - (has_zero_length ? 0 : 1);
281
0
                    if (single_match_only)
282
0
                        break;
283
0
                    continue;
284
0
                }
285
1.07M
                if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
286
0
                    append_match(input, state, view_index);
287
0
                    break;
288
0
                }
289
1.07M
                if (state.string_position < view_length) {
290
0
                    return { false, 0, {}, {}, {}, operations };
291
0
                }
292
293
1.07M
                append_match(input, state, view_index);
294
1.07M
                break;
295
1.07M
            }
296
297
3.52M
            if (!continue_search)
298
3.52M
                break;
299
3.52M
        }
300
301
34.3M
        ++input.line;
302
34.3M
        input.global_offset += view.length() + 1; // +1 includes the line break character
303
304
34.3M
        if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
305
0
            m_pattern->start_offset = state.string_position;
306
307
34.3M
        if (succeeded && !continue_search)
308
1.07M
            break;
309
34.3M
    }
310
311
34.3M
    RegexResult result {
312
34.3M
        match_count != 0,
313
34.3M
        match_count,
314
34.3M
        move(state.matches).release(),
315
34.3M
        move(state.capture_group_matches).release(),
316
34.3M
        operations,
317
34.3M
        m_pattern->parser_result.capture_groups_count,
318
34.3M
        m_pattern->parser_result.named_capture_groups_count,
319
34.3M
    };
320
321
34.3M
    if (match_count) {
322
        // Make sure there are as many capture matches as there are actual matches.
323
1.07M
        if (result.capture_group_matches.size() < match_count)
324
0
            result.capture_group_matches.resize(match_count);
325
1.07M
        for (auto& matches : result.capture_group_matches)
326
1.07M
            matches.resize(m_pattern->parser_result.capture_groups_count + 1);
327
1.07M
        if (!input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) {
328
1.07M
            for (auto& matches : result.capture_group_matches)
329
1.07M
                matches.remove_all_matching([](auto& match) { return match.view.is_null(); });
330
1.07M
        }
331
33.2M
    } else {
332
33.2M
        result.capture_group_matches.clear_with_capacity();
333
33.2M
    }
334
335
34.3M
    return result;
336
34.3M
}
337
338
template<typename T>
339
class BumpAllocatedLinkedList {
340
public:
341
4.59M
    BumpAllocatedLinkedList() = default;
342
343
    ALWAYS_INLINE void append(T value)
344
60.9M
    {
345
60.9M
        auto node_ptr = m_allocator.allocate(move(value));
346
60.9M
        VERIFY(node_ptr);
347
348
60.9M
        if (!m_first) {
349
10.9M
            m_first = node_ptr;
350
10.9M
            m_last = node_ptr;
351
10.9M
            return;
352
10.9M
        }
353
354
50.0M
        node_ptr->previous = m_last;
355
50.0M
        m_last->next = node_ptr;
356
50.0M
        m_last = node_ptr;
357
50.0M
    }
358
359
    ALWAYS_INLINE T take_last()
360
21.6M
    {
361
21.6M
        VERIFY(m_last);
362
21.6M
        T value = move(m_last->value);
363
21.6M
        if (m_last == m_first) {
364
10.9M
            m_last = nullptr;
365
10.9M
            m_first = nullptr;
366
10.9M
        } else {
367
10.7M
            m_last = m_last->previous;
368
10.7M
            m_last->next = nullptr;
369
10.7M
        }
370
21.6M
        return value;
371
21.6M
    }
372
373
    ALWAYS_INLINE T& last()
374
114M
    {
375
114M
        return m_last->value;
376
114M
    }
377
378
    ALWAYS_INLINE bool is_empty() const
379
25.2M
    {
380
25.2M
        return m_first == nullptr;
381
25.2M
    }
382
383
55.2k
    auto reverse_begin() { return ReverseIterator(m_last); }
384
321M
    auto reverse_end() { return ReverseIterator(); }
385
386
private:
387
    struct Node {
388
        T value;
389
        Node* next { nullptr };
390
        Node* previous { nullptr };
391
    };
392
393
    struct ReverseIterator {
394
        ReverseIterator() = default;
395
        explicit ReverseIterator(Node* node)
396
55.2k
            : m_node(node)
397
55.2k
        {
398
55.2k
        }
399
400
321M
        T* operator->() { return &m_node->value; }
401
223
        T& operator*() { return m_node->value; }
402
321M
        bool operator==(ReverseIterator const& it) const { return m_node == it.m_node; }
403
        ReverseIterator& operator++()
404
321M
        {
405
321M
            if (m_node)
406
321M
                m_node = m_node->previous;
407
321M
            return *this;
408
321M
        }
409
410
    private:
411
        Node* m_node;
412
    };
413
414
    UniformBumpAllocator<Node, true, 2 * MiB> m_allocator;
415
    Node* m_first { nullptr };
416
    Node* m_last { nullptr };
417
};
418
419
template<class Parser>
420
bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t& operations) const
421
4.59M
{
422
4.59M
    if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) {
423
        // Yay, we can do a simple substring search!
424
0
        auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value();
425
0
        if (needle.length() + state.string_position > input.view.length())
426
0
            return false;
427
428
0
        auto haystack = input.view.string_view().substring_view(state.string_position);
429
0
        if (input.regex_options.has_flag_set(AllFlags::Insensitive)) {
430
0
            if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle))
431
0
                return false;
432
0
        } else {
433
0
            if (!haystack.starts_with(needle))
434
0
                return false;
435
0
        }
436
437
0
        state.string_position += needle.length();
438
0
        state.string_position_in_code_units += needle.length();
439
0
        return true;
440
0
    }
441
442
4.59M
    BumpAllocatedLinkedList<MatchState> states_to_try_next;
443
#if REGEX_DEBUG
444
    size_t recursion_level = 0;
445
#endif
446
447
4.59M
    auto& bytecode = m_pattern->parser_result.bytecode;
448
449
278M
    for (;;) {
450
278M
        auto& opcode = bytecode.get_opcode(state);
451
278M
        ++operations;
452
453
#if REGEX_DEBUG
454
        s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false);
455
#endif
456
457
278M
        ExecutionResult result;
458
278M
        if (input.fail_counter > 0) {
459
0
            --input.fail_counter;
460
0
            result = ExecutionResult::Failed_ExecuteLowPrioForks;
461
278M
        } else {
462
278M
            result = opcode.execute(input, state);
463
278M
        }
464
465
#if REGEX_DEBUG
466
        s_regex_dbg.print_result(opcode, bytecode, input, state, result);
467
#endif
468
469
278M
        state.instruction_position += opcode.size();
470
471
278M
        switch (result) {
472
54.0M
        case ExecutionResult::Fork_PrioLow: {
473
54.0M
            bool found = false;
474
54.0M
            if (input.fork_to_replace.has_value()) {
475
321M
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
476
321M
                    if (it->initiating_fork == input.fork_to_replace.value()) {
477
223
                        (*it) = state;
478
223
                        it->instruction_position = state.fork_at_position;
479
223
                        it->initiating_fork = *input.fork_to_replace;
480
223
                        found = true;
481
223
                        break;
482
223
                    }
483
321M
                }
484
55.2k
                input.fork_to_replace.clear();
485
55.2k
            }
486
54.0M
            if (!found) {
487
54.0M
                states_to_try_next.append(state);
488
54.0M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
489
54.0M
                states_to_try_next.last().instruction_position = state.fork_at_position;
490
54.0M
            }
491
54.0M
            continue;
492
0
        }
493
6.94M
        case ExecutionResult::Fork_PrioHigh: {
494
6.94M
            bool found = false;
495
6.94M
            if (input.fork_to_replace.has_value()) {
496
0
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
497
0
                    if (it->initiating_fork == input.fork_to_replace.value()) {
498
0
                        (*it) = state;
499
0
                        it->initiating_fork = *input.fork_to_replace;
500
0
                        found = true;
501
0
                        break;
502
0
                    }
503
0
                }
504
0
                input.fork_to_replace.clear();
505
0
            }
506
6.94M
            if (!found) {
507
6.94M
                states_to_try_next.append(state);
508
6.94M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
509
6.94M
            }
510
6.94M
            state.instruction_position = state.fork_at_position;
511
#if REGEX_DEBUG
512
            ++recursion_level;
513
#endif
514
6.94M
            continue;
515
0
        }
516
190M
        case ExecutionResult::Continue:
517
190M
            continue;
518
1.07M
        case ExecutionResult::Succeeded:
519
1.07M
            return true;
520
0
        case ExecutionResult::Failed:
521
0
            if (!states_to_try_next.is_empty()) {
522
0
                state = states_to_try_next.take_last();
523
0
                continue;
524
0
            }
525
0
            return false;
526
25.2M
        case ExecutionResult::Failed_ExecuteLowPrioForks: {
527
25.2M
            if (states_to_try_next.is_empty()) {
528
3.52M
                return false;
529
3.52M
            }
530
21.6M
            state = states_to_try_next.take_last();
531
#if REGEX_DEBUG
532
            ++recursion_level;
533
#endif
534
21.6M
            continue;
535
25.2M
        }
536
278M
        }
537
278M
    }
538
539
0
    VERIFY_NOT_REACHED();
540
0
}
Unexecuted instantiation: regex::Matcher<regex::PosixBasicParser>::execute(regex::MatchInput const&, regex::MatchState&, unsigned long&) const
Unexecuted instantiation: regex::Matcher<regex::PosixExtendedParser>::execute(regex::MatchInput const&, regex::MatchState&, unsigned long&) const
regex::Matcher<regex::ECMA262Parser>::execute(regex::MatchInput const&, regex::MatchState&, unsigned long&) const
Line
Count
Source
421
4.59M
{
422
4.59M
    if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) {
423
        // Yay, we can do a simple substring search!
424
0
        auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value();
425
0
        if (needle.length() + state.string_position > input.view.length())
426
0
            return false;
427
428
0
        auto haystack = input.view.string_view().substring_view(state.string_position);
429
0
        if (input.regex_options.has_flag_set(AllFlags::Insensitive)) {
430
0
            if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle))
431
0
                return false;
432
0
        } else {
433
0
            if (!haystack.starts_with(needle))
434
0
                return false;
435
0
        }
436
437
0
        state.string_position += needle.length();
438
0
        state.string_position_in_code_units += needle.length();
439
0
        return true;
440
0
    }
441
442
4.59M
    BumpAllocatedLinkedList<MatchState> states_to_try_next;
443
#if REGEX_DEBUG
444
    size_t recursion_level = 0;
445
#endif
446
447
4.59M
    auto& bytecode = m_pattern->parser_result.bytecode;
448
449
278M
    for (;;) {
450
278M
        auto& opcode = bytecode.get_opcode(state);
451
278M
        ++operations;
452
453
#if REGEX_DEBUG
454
        s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false);
455
#endif
456
457
278M
        ExecutionResult result;
458
278M
        if (input.fail_counter > 0) {
459
0
            --input.fail_counter;
460
0
            result = ExecutionResult::Failed_ExecuteLowPrioForks;
461
278M
        } else {
462
278M
            result = opcode.execute(input, state);
463
278M
        }
464
465
#if REGEX_DEBUG
466
        s_regex_dbg.print_result(opcode, bytecode, input, state, result);
467
#endif
468
469
278M
        state.instruction_position += opcode.size();
470
471
278M
        switch (result) {
472
54.0M
        case ExecutionResult::Fork_PrioLow: {
473
54.0M
            bool found = false;
474
54.0M
            if (input.fork_to_replace.has_value()) {
475
321M
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
476
321M
                    if (it->initiating_fork == input.fork_to_replace.value()) {
477
223
                        (*it) = state;
478
223
                        it->instruction_position = state.fork_at_position;
479
223
                        it->initiating_fork = *input.fork_to_replace;
480
223
                        found = true;
481
223
                        break;
482
223
                    }
483
321M
                }
484
55.2k
                input.fork_to_replace.clear();
485
55.2k
            }
486
54.0M
            if (!found) {
487
54.0M
                states_to_try_next.append(state);
488
54.0M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
489
54.0M
                states_to_try_next.last().instruction_position = state.fork_at_position;
490
54.0M
            }
491
54.0M
            continue;
492
0
        }
493
6.94M
        case ExecutionResult::Fork_PrioHigh: {
494
6.94M
            bool found = false;
495
6.94M
            if (input.fork_to_replace.has_value()) {
496
0
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
497
0
                    if (it->initiating_fork == input.fork_to_replace.value()) {
498
0
                        (*it) = state;
499
0
                        it->initiating_fork = *input.fork_to_replace;
500
0
                        found = true;
501
0
                        break;
502
0
                    }
503
0
                }
504
0
                input.fork_to_replace.clear();
505
0
            }
506
6.94M
            if (!found) {
507
6.94M
                states_to_try_next.append(state);
508
6.94M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
509
6.94M
            }
510
6.94M
            state.instruction_position = state.fork_at_position;
511
#if REGEX_DEBUG
512
            ++recursion_level;
513
#endif
514
6.94M
            continue;
515
0
        }
516
190M
        case ExecutionResult::Continue:
517
190M
            continue;
518
1.07M
        case ExecutionResult::Succeeded:
519
1.07M
            return true;
520
0
        case ExecutionResult::Failed:
521
0
            if (!states_to_try_next.is_empty()) {
522
0
                state = states_to_try_next.take_last();
523
0
                continue;
524
0
            }
525
0
            return false;
526
25.2M
        case ExecutionResult::Failed_ExecuteLowPrioForks: {
527
25.2M
            if (states_to_try_next.is_empty()) {
528
3.52M
                return false;
529
3.52M
            }
530
21.6M
            state = states_to_try_next.take_last();
531
#if REGEX_DEBUG
532
            ++recursion_level;
533
#endif
534
21.6M
            continue;
535
25.2M
        }
536
278M
        }
537
278M
    }
538
539
0
    VERIFY_NOT_REACHED();
540
0
}
541
542
template class Matcher<PosixBasicParser>;
543
template class Regex<PosixBasicParser>;
544
545
template class Matcher<PosixExtendedParser>;
546
template class Regex<PosixExtendedParser>;
547
548
template class Matcher<ECMA262Parser>;
549
template class Regex<ECMA262Parser>;
550
}