Coverage Report

Created: 2025-12-18 07:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibRegex/RegexMatcher.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/BumpAllocator.h>
8
#include <AK/ByteString.h>
9
#include <AK/Debug.h>
10
#include <AK/StringBuilder.h>
11
#include <LibRegex/RegexMatcher.h>
12
#include <LibRegex/RegexParser.h>
13
14
#if REGEX_DEBUG
15
#    include <LibRegex/RegexDebug.h>
16
#endif
17
18
namespace regex {
19
20
#if REGEX_DEBUG
21
static RegexDebug s_regex_dbg(stderr);
22
#endif
23
24
template<class Parser>
25
regex::Parser::Result Regex<Parser>::parse_pattern(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options)
26
0
{
27
0
    regex::Lexer lexer(pattern);
28
29
0
    Parser parser(lexer, regex_options);
30
0
    return parser.parse();
31
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::parse_pattern(AK::StringView, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::parse_pattern(AK::StringView, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::parse_pattern(AK::StringView, regex::RegexOptions<regex::ECMAScriptFlags>)
32
33
template<class Parser>
34
Regex<Parser>::Regex(ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options)
35
7.52k
    : pattern_value(move(pattern))
36
7.52k
{
37
7.52k
    regex::Lexer lexer(pattern_value);
38
39
7.52k
    Parser parser(lexer, regex_options);
40
7.52k
    parser_result = parser.parse();
41
42
7.52k
    run_optimization_passes();
43
7.52k
    if (parser_result.error == regex::Error::NoError)
44
4.95k
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
7.52k
}
regex::Regex<regex::PosixBasicParser>::Regex(AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Line
Count
Source
35
4.85k
    : pattern_value(move(pattern))
36
4.85k
{
37
4.85k
    regex::Lexer lexer(pattern_value);
38
39
4.85k
    Parser parser(lexer, regex_options);
40
4.85k
    parser_result = parser.parse();
41
42
4.85k
    run_optimization_passes();
43
4.85k
    if (parser_result.error == regex::Error::NoError)
44
3.30k
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
4.85k
}
regex::Regex<regex::PosixExtendedParser>::Regex(AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Line
Count
Source
35
630
    : pattern_value(move(pattern))
36
630
{
37
630
    regex::Lexer lexer(pattern_value);
38
39
630
    Parser parser(lexer, regex_options);
40
630
    parser_result = parser.parse();
41
42
630
    run_optimization_passes();
43
630
    if (parser_result.error == regex::Error::NoError)
44
260
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
630
}
regex::Regex<regex::ECMA262Parser>::Regex(AK::ByteString, regex::RegexOptions<regex::ECMAScriptFlags>)
Line
Count
Source
35
2.04k
    : pattern_value(move(pattern))
36
2.04k
{
37
2.04k
    regex::Lexer lexer(pattern_value);
38
39
2.04k
    Parser parser(lexer, regex_options);
40
2.04k
    parser_result = parser.parse();
41
42
2.04k
    run_optimization_passes();
43
2.04k
    if (parser_result.error == regex::Error::NoError)
44
1.38k
        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
45
2.04k
}
46
47
template<class Parser>
48
Regex<Parser>::Regex(regex::Parser::Result parse_result, ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options)
49
0
    : pattern_value(move(pattern))
50
0
    , parser_result(move(parse_result))
51
0
{
52
0
    run_optimization_passes();
53
0
    if (parser_result.error == regex::Error::NoError)
54
0
        matcher = make<Matcher<Parser>>(this, regex_options | static_cast<decltype(regex_options.value())>(parse_result.options.value()));
55
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::Regex(regex::Parser::Result, AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::Regex(regex::Parser::Result, AK::ByteString, regex::RegexOptions<regex::PosixFlags>)
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::Regex(regex::Parser::Result, AK::ByteString, regex::RegexOptions<regex::ECMAScriptFlags>)
56
57
template<class Parser>
58
Regex<Parser>::Regex(Regex&& regex)
59
4.79k
    : pattern_value(move(regex.pattern_value))
60
4.79k
    , parser_result(move(regex.parser_result))
61
4.79k
    , matcher(move(regex.matcher))
62
4.79k
    , start_offset(regex.start_offset)
63
4.79k
{
64
4.79k
    if (matcher)
65
4.14k
        matcher->reset_pattern({}, this);
66
4.79k
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::Regex(regex::Regex<regex::PosixBasicParser>&&)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::Regex(regex::Regex<regex::PosixExtendedParser>&&)
regex::Regex<regex::ECMA262Parser>::Regex(regex::Regex<regex::ECMA262Parser>&&)
Line
Count
Source
59
4.79k
    : pattern_value(move(regex.pattern_value))
60
4.79k
    , parser_result(move(regex.parser_result))
61
4.79k
    , matcher(move(regex.matcher))
62
4.79k
    , start_offset(regex.start_offset)
63
4.79k
{
64
4.79k
    if (matcher)
65
4.14k
        matcher->reset_pattern({}, this);
66
4.79k
}
67
68
template<class Parser>
69
Regex<Parser>& Regex<Parser>::operator=(Regex&& regex)
70
0
{
71
0
    pattern_value = move(regex.pattern_value);
72
0
    parser_result = move(regex.parser_result);
73
0
    matcher = move(regex.matcher);
74
0
    if (matcher)
75
0
        matcher->reset_pattern({}, this);
76
0
    start_offset = regex.start_offset;
77
0
    return *this;
78
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::operator=(regex::Regex<regex::PosixBasicParser>&&)
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::operator=(regex::Regex<regex::PosixExtendedParser>&&)
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::operator=(regex::Regex<regex::ECMA262Parser>&&)
79
80
template<class Parser>
81
typename ParserTraits<Parser>::OptionsType Regex<Parser>::options() const
82
0
{
83
0
    if (!matcher || parser_result.error != Error::NoError)
84
0
        return {};
85
86
0
    return matcher->options();
87
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::options() const
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::options() const
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::options() const
88
89
template<class Parser>
90
ByteString Regex<Parser>::error_string(Optional<ByteString> message) const
91
0
{
92
0
    StringBuilder eb;
93
0
    eb.append("Error during parsing of regular expression:\n"sv);
94
0
    eb.appendff("    {}\n    ", pattern_value);
95
0
    for (size_t i = 0; i < parser_result.error_token.position(); ++i)
96
0
        eb.append(' ');
97
98
0
    eb.appendff("^---- {}", message.value_or(get_error_string(parser_result.error)));
99
0
    return eb.to_byte_string();
100
0
}
Unexecuted instantiation: regex::Regex<regex::PosixBasicParser>::error_string(AK::Optional<AK::ByteString>) const
Unexecuted instantiation: regex::Regex<regex::PosixExtendedParser>::error_string(AK::Optional<AK::ByteString>) const
Unexecuted instantiation: regex::Regex<regex::ECMA262Parser>::error_string(AK::Optional<AK::ByteString>) const
101
102
template<typename Parser>
103
RegexResult Matcher<Parser>::match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
104
33.4M
{
105
33.4M
    AllOptions options = m_regex_options | regex_options.value_or({}).value();
106
107
33.4M
    if constexpr (!IsSame<Parser, ECMA262>) {
108
0
        if (options.has_flag_set(AllFlags::Multiline))
109
0
            return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
110
0
    }
111
112
0
    Vector<RegexStringView> views;
113
33.4M
    views.append(view);
114
33.4M
    return match(views, regex_options);
115
33.4M
}
Unexecuted instantiation: regex::Matcher<regex::PosixBasicParser>::match(regex::RegexStringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
Unexecuted instantiation: regex::Matcher<regex::PosixExtendedParser>::match(regex::RegexStringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
regex::Matcher<regex::ECMA262Parser>::match(regex::RegexStringView, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const
Line
Count
Source
104
33.4M
{
105
33.4M
    AllOptions options = m_regex_options | regex_options.value_or({}).value();
106
107
    if constexpr (!IsSame<Parser, ECMA262>) {
108
        if (options.has_flag_set(AllFlags::Multiline))
109
            return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
110
    }
111
112
33.4M
    Vector<RegexStringView> views;
113
33.4M
    views.append(view);
114
33.4M
    return match(views, regex_options);
115
33.4M
}
116
117
template<typename Parser>
118
RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
119
33.4M
{
120
    // If the pattern *itself* isn't stateful, reset any changes to start_offset.
121
33.4M
    if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful))
122
33.4M
        m_pattern->start_offset = 0;
123
124
33.4M
    size_t match_count { 0 };
125
126
33.4M
    MatchInput input;
127
33.4M
    MatchState state;
128
33.4M
    size_t operations = 0;
129
130
33.4M
    input.regex_options = m_regex_options | regex_options.value_or({}).value();
131
33.4M
    input.start_offset = m_pattern->start_offset;
132
33.4M
    size_t lines_to_skip = 0;
133
134
33.4M
    bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
135
33.4M
    for (auto const& view : views)
136
33.4M
        const_cast<RegexStringView&>(view).set_unicode(unicode);
137
138
33.4M
    if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
139
0
        if (views.size() > 1 && input.start_offset > views.first().length()) {
140
0
            dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
141
0
            for (auto const& view : views) {
142
0
                if (input.start_offset < view.length() + 1)
143
0
                    break;
144
0
                ++lines_to_skip;
145
0
                input.start_offset -= view.length() + 1;
146
0
                input.global_offset += view.length() + 1;
147
0
            }
148
0
            dbgln_if(REGEX_DEBUG, "Ended with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
149
0
        }
150
0
    }
151
152
33.4M
    if (c_match_preallocation_count) {
153
0
        state.matches.ensure_capacity(c_match_preallocation_count);
154
0
        state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
155
0
        auto& capture_groups_count = m_pattern->parser_result.capture_groups_count;
156
157
0
        for (size_t j = 0; j < c_match_preallocation_count; ++j) {
158
0
            state.matches.empend();
159
0
            state.capture_group_matches.empend();
160
0
            state.capture_group_matches.mutable_at(j).ensure_capacity(capture_groups_count);
161
0
            for (size_t k = 0; k < capture_groups_count; ++k)
162
0
                state.capture_group_matches.mutable_at(j).unchecked_append({});
163
0
        }
164
0
    }
165
166
33.4M
    auto append_match = [](auto& input, auto& state, auto& start_position) {
167
1.01M
        if (state.matches.size() == input.match_index)
168
1.01M
            state.matches.empend();
169
170
1.01M
        VERIFY(start_position + state.string_position - start_position <= input.view.length());
171
1.01M
        if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
172
0
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_byte_string(), input.line, start_position, input.global_offset + start_position };
173
1.01M
        } else { // let the view point to the original string ...
174
1.01M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
175
1.01M
        }
176
1.01M
    };
Unexecuted instantiation: auto regex::Matcher<regex::PosixBasicParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&, auto:2&, auto:3&)#1}::operator()<regex::MatchInput, regex::MatchState, unsigned long>(regex::MatchInput&, regex::MatchState&, unsigned long&) const
Unexecuted instantiation: auto regex::Matcher<regex::PosixExtendedParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&, auto:2&, auto:3&)#1}::operator()<regex::MatchInput, regex::MatchState, unsigned long>(regex::MatchInput&, regex::MatchState&, unsigned long&) const
auto regex::Matcher<regex::ECMA262Parser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const::{lambda(auto:1&, auto:2&, auto:3&)#1}::operator()<regex::MatchInput, regex::MatchState, unsigned long>(regex::MatchInput&, regex::MatchState&, unsigned long&) const
Line
Count
Source
166
1.01M
    auto append_match = [](auto& input, auto& state, auto& start_position) {
167
1.01M
        if (state.matches.size() == input.match_index)
168
1.01M
            state.matches.empend();
169
170
1.01M
        VERIFY(start_position + state.string_position - start_position <= input.view.length());
171
1.01M
        if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
172
0
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_byte_string(), input.line, start_position, input.global_offset + start_position };
173
1.01M
        } else { // let the view point to the original string ...
174
1.01M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
175
1.01M
        }
176
1.01M
    };
177
178
#if REGEX_DEBUG
179
    s_regex_dbg.print_header();
180
#endif
181
182
33.4M
    bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline);
183
33.4M
    if (input.regex_options.has_flag_set(AllFlags::Sticky))
184
0
        continue_search = false;
185
186
33.4M
    auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch);
187
188
33.4M
    for (auto const& view : views) {
189
33.4M
        if (lines_to_skip != 0) {
190
0
            ++input.line;
191
0
            --lines_to_skip;
192
0
            continue;
193
0
        }
194
33.4M
        input.view = view;
195
33.4M
        dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view);
196
197
33.4M
        auto view_length = view.length();
198
33.4M
        size_t view_index = m_pattern->start_offset;
199
33.4M
        state.string_position = view_index;
200
33.4M
        state.string_position_in_code_units = view_index;
201
33.4M
        bool succeeded = false;
202
203
33.4M
        if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
204
            // Run the code until it tries to consume something.
205
            // This allows non-consuming code to run on empty strings, for instance
206
            // e.g. "Exit"
207
0
            size_t temp_operations = operations;
208
209
0
            input.column = match_count;
210
0
            input.match_index = match_count;
211
212
0
            state.string_position = view_index;
213
0
            state.string_position_in_code_units = view_index;
214
0
            state.instruction_position = 0;
215
0
            state.repetition_marks.clear();
216
217
0
            auto success = execute(input, state, temp_operations);
218
            // This success is acceptable only if it doesn't read anything from the input (input length is 0).
219
0
            if (success && (state.string_position <= view_index)) {
220
0
                operations = temp_operations;
221
0
                if (!match_count) {
222
                    // Nothing was *actually* matched, so append an empty match.
223
0
                    append_match(input, state, view_index);
224
0
                    ++match_count;
225
226
                    // This prevents a regex pattern like ".*" from matching the empty string
227
                    // multiple times, once in this block and once in the following for loop.
228
0
                    if (view_index == 0 && view_length == 0)
229
0
                        ++view_index;
230
0
                }
231
0
            }
232
0
        }
233
234
33.4M
        for (; view_index <= view_length; ++view_index) {
235
33.4M
            if (view_index == view_length && input.regex_options.has_flag_set(AllFlags::Multiline))
236
0
                break;
237
238
33.4M
            auto& match_length_minimum = m_pattern->parser_result.match_length_minimum;
239
            // FIXME: More performant would be to know the remaining minimum string
240
            //        length needed to match from the current position onwards within
241
            //        the vm. Add new OpCode for MinMatchLengthFromSp with the value of
242
            //        the remaining string length from the current path. The value though
243
            //        has to be filled in reverse. That implies a second run over bytecode
244
            //        after generation has finished.
245
33.4M
            if (match_length_minimum && match_length_minimum > view_length - view_index)
246
28.5M
                break;
247
248
4.82M
            input.column = match_count;
249
4.82M
            input.match_index = match_count;
250
251
4.82M
            state.string_position = view_index;
252
4.82M
            state.string_position_in_code_units = view_index;
253
4.82M
            state.instruction_position = 0;
254
4.82M
            state.repetition_marks.clear();
255
256
4.82M
            auto success = execute(input, state, operations);
257
4.82M
            if (success) {
258
1.01M
                succeeded = true;
259
260
1.01M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
261
0
                    if (!continue_search)
262
0
                        break;
263
0
                    continue;
264
0
                }
265
1.01M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) {
266
0
                    if (!continue_search)
267
0
                        break;
268
0
                    continue;
269
0
                }
270
271
1.01M
                dbgln_if(REGEX_DEBUG, "state.string_position={}, view_index={}", state.string_position, view_index);
272
1.01M
                dbgln_if(REGEX_DEBUG, "[match] Found a match (length={}): '{}'", state.string_position - view_index, input.view.substring_view(view_index, state.string_position - view_index));
273
274
1.01M
                ++match_count;
275
276
1.01M
                if (continue_search) {
277
0
                    append_match(input, state, view_index);
278
279
0
                    bool has_zero_length = state.string_position == view_index;
280
0
                    view_index = state.string_position - (has_zero_length ? 0 : 1);
281
0
                    if (single_match_only)
282
0
                        break;
283
0
                    continue;
284
0
                }
285
1.01M
                if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
286
0
                    append_match(input, state, view_index);
287
0
                    break;
288
0
                }
289
1.01M
                if (state.string_position < view_length) {
290
0
                    return { false, 0, {}, {}, {}, operations };
291
0
                }
292
293
1.01M
                append_match(input, state, view_index);
294
1.01M
                break;
295
1.01M
            }
296
297
3.80M
            if (!continue_search)
298
3.80M
                break;
299
3.80M
        }
300
301
33.4M
        ++input.line;
302
33.4M
        input.global_offset += view.length() + 1; // +1 includes the line break character
303
304
33.4M
        if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
305
0
            m_pattern->start_offset = state.string_position;
306
307
33.4M
        if (succeeded && !continue_search)
308
1.01M
            break;
309
33.4M
    }
310
311
33.4M
    RegexResult result {
312
33.4M
        match_count != 0,
313
33.4M
        match_count,
314
33.4M
        move(state.matches).release(),
315
33.4M
        move(state.capture_group_matches).release(),
316
33.4M
        operations,
317
33.4M
        m_pattern->parser_result.capture_groups_count,
318
33.4M
        m_pattern->parser_result.named_capture_groups_count,
319
33.4M
    };
320
321
33.4M
    if (match_count) {
322
        // Make sure there are as many capture matches as there are actual matches.
323
1.01M
        if (result.capture_group_matches.size() < match_count)
324
0
            result.capture_group_matches.resize(match_count);
325
1.01M
        for (auto& matches : result.capture_group_matches)
326
1.01M
            matches.resize(m_pattern->parser_result.capture_groups_count + 1);
327
1.01M
        if (!input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) {
328
1.01M
            for (auto& matches : result.capture_group_matches)
329
4.91M
                matches.remove_all_matching([](auto& match) { return match.view.is_null(); });
Unexecuted instantiation: auto regex::Matcher<regex::PosixBasicParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&)#1}::operator()<regex::Match>(regex::Match&) const
Unexecuted instantiation: auto regex::Matcher<regex::PosixExtendedParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1&)#1}::operator()<regex::Match>(regex::Match&) const
auto regex::Matcher<regex::ECMA262Parser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const::{lambda(auto:1&)#1}::operator()<regex::Match>(regex::Match&) const
Line
Count
Source
329
4.91M
                matches.remove_all_matching([](auto& match) { return match.view.is_null(); });
330
1.01M
        }
331
32.4M
    } else {
332
32.4M
        result.capture_group_matches.clear_with_capacity();
333
32.4M
    }
334
335
33.4M
    return result;
336
33.4M
}
Unexecuted instantiation: regex::Matcher<regex::PosixBasicParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
Unexecuted instantiation: regex::Matcher<regex::PosixExtendedParser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const
regex::Matcher<regex::ECMA262Parser>::match(AK::Vector<regex::RegexStringView, 0ul> const&, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const
Line
Count
Source
119
33.4M
{
120
    // If the pattern *itself* isn't stateful, reset any changes to start_offset.
121
33.4M
    if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful))
122
33.4M
        m_pattern->start_offset = 0;
123
124
33.4M
    size_t match_count { 0 };
125
126
33.4M
    MatchInput input;
127
33.4M
    MatchState state;
128
33.4M
    size_t operations = 0;
129
130
33.4M
    input.regex_options = m_regex_options | regex_options.value_or({}).value();
131
33.4M
    input.start_offset = m_pattern->start_offset;
132
33.4M
    size_t lines_to_skip = 0;
133
134
33.4M
    bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
135
33.4M
    for (auto const& view : views)
136
33.4M
        const_cast<RegexStringView&>(view).set_unicode(unicode);
137
138
33.4M
    if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
139
0
        if (views.size() > 1 && input.start_offset > views.first().length()) {
140
0
            dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
141
0
            for (auto const& view : views) {
142
0
                if (input.start_offset < view.length() + 1)
143
0
                    break;
144
0
                ++lines_to_skip;
145
0
                input.start_offset -= view.length() + 1;
146
0
                input.global_offset += view.length() + 1;
147
0
            }
148
0
            dbgln_if(REGEX_DEBUG, "Ended with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
149
0
        }
150
0
    }
151
152
33.4M
    if (c_match_preallocation_count) {
153
0
        state.matches.ensure_capacity(c_match_preallocation_count);
154
0
        state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
155
0
        auto& capture_groups_count = m_pattern->parser_result.capture_groups_count;
156
157
0
        for (size_t j = 0; j < c_match_preallocation_count; ++j) {
158
0
            state.matches.empend();
159
0
            state.capture_group_matches.empend();
160
0
            state.capture_group_matches.mutable_at(j).ensure_capacity(capture_groups_count);
161
0
            for (size_t k = 0; k < capture_groups_count; ++k)
162
0
                state.capture_group_matches.mutable_at(j).unchecked_append({});
163
0
        }
164
0
    }
165
166
33.4M
    auto append_match = [](auto& input, auto& state, auto& start_position) {
167
33.4M
        if (state.matches.size() == input.match_index)
168
33.4M
            state.matches.empend();
169
170
33.4M
        VERIFY(start_position + state.string_position - start_position <= input.view.length());
171
33.4M
        if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
172
33.4M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_byte_string(), input.line, start_position, input.global_offset + start_position };
173
33.4M
        } else { // let the view point to the original string ...
174
33.4M
            state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
175
33.4M
        }
176
33.4M
    };
177
178
#if REGEX_DEBUG
179
    s_regex_dbg.print_header();
180
#endif
181
182
33.4M
    bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline);
183
33.4M
    if (input.regex_options.has_flag_set(AllFlags::Sticky))
184
0
        continue_search = false;
185
186
33.4M
    auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch);
187
188
33.4M
    for (auto const& view : views) {
189
33.4M
        if (lines_to_skip != 0) {
190
0
            ++input.line;
191
0
            --lines_to_skip;
192
0
            continue;
193
0
        }
194
33.4M
        input.view = view;
195
33.4M
        dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view);
196
197
33.4M
        auto view_length = view.length();
198
33.4M
        size_t view_index = m_pattern->start_offset;
199
33.4M
        state.string_position = view_index;
200
33.4M
        state.string_position_in_code_units = view_index;
201
33.4M
        bool succeeded = false;
202
203
33.4M
        if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
204
            // Run the code until it tries to consume something.
205
            // This allows non-consuming code to run on empty strings, for instance
206
            // e.g. "Exit"
207
0
            size_t temp_operations = operations;
208
209
0
            input.column = match_count;
210
0
            input.match_index = match_count;
211
212
0
            state.string_position = view_index;
213
0
            state.string_position_in_code_units = view_index;
214
0
            state.instruction_position = 0;
215
0
            state.repetition_marks.clear();
216
217
0
            auto success = execute(input, state, temp_operations);
218
            // This success is acceptable only if it doesn't read anything from the input (input length is 0).
219
0
            if (success && (state.string_position <= view_index)) {
220
0
                operations = temp_operations;
221
0
                if (!match_count) {
222
                    // Nothing was *actually* matched, so append an empty match.
223
0
                    append_match(input, state, view_index);
224
0
                    ++match_count;
225
226
                    // This prevents a regex pattern like ".*" from matching the empty string
227
                    // multiple times, once in this block and once in the following for loop.
228
0
                    if (view_index == 0 && view_length == 0)
229
0
                        ++view_index;
230
0
                }
231
0
            }
232
0
        }
233
234
33.4M
        for (; view_index <= view_length; ++view_index) {
235
33.4M
            if (view_index == view_length && input.regex_options.has_flag_set(AllFlags::Multiline))
236
0
                break;
237
238
33.4M
            auto& match_length_minimum = m_pattern->parser_result.match_length_minimum;
239
            // FIXME: More performant would be to know the remaining minimum string
240
            //        length needed to match from the current position onwards within
241
            //        the vm. Add new OpCode for MinMatchLengthFromSp with the value of
242
            //        the remaining string length from the current path. The value though
243
            //        has to be filled in reverse. That implies a second run over bytecode
244
            //        after generation has finished.
245
33.4M
            if (match_length_minimum && match_length_minimum > view_length - view_index)
246
28.5M
                break;
247
248
4.82M
            input.column = match_count;
249
4.82M
            input.match_index = match_count;
250
251
4.82M
            state.string_position = view_index;
252
4.82M
            state.string_position_in_code_units = view_index;
253
4.82M
            state.instruction_position = 0;
254
4.82M
            state.repetition_marks.clear();
255
256
4.82M
            auto success = execute(input, state, operations);
257
4.82M
            if (success) {
258
1.01M
                succeeded = true;
259
260
1.01M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
261
0
                    if (!continue_search)
262
0
                        break;
263
0
                    continue;
264
0
                }
265
1.01M
                if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) {
266
0
                    if (!continue_search)
267
0
                        break;
268
0
                    continue;
269
0
                }
270
271
1.01M
                dbgln_if(REGEX_DEBUG, "state.string_position={}, view_index={}", state.string_position, view_index);
272
1.01M
                dbgln_if(REGEX_DEBUG, "[match] Found a match (length={}): '{}'", state.string_position - view_index, input.view.substring_view(view_index, state.string_position - view_index));
273
274
1.01M
                ++match_count;
275
276
1.01M
                if (continue_search) {
277
0
                    append_match(input, state, view_index);
278
279
0
                    bool has_zero_length = state.string_position == view_index;
280
0
                    view_index = state.string_position - (has_zero_length ? 0 : 1);
281
0
                    if (single_match_only)
282
0
                        break;
283
0
                    continue;
284
0
                }
285
1.01M
                if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
286
0
                    append_match(input, state, view_index);
287
0
                    break;
288
0
                }
289
1.01M
                if (state.string_position < view_length) {
290
0
                    return { false, 0, {}, {}, {}, operations };
291
0
                }
292
293
1.01M
                append_match(input, state, view_index);
294
1.01M
                break;
295
1.01M
            }
296
297
3.80M
            if (!continue_search)
298
3.80M
                break;
299
3.80M
        }
300
301
33.4M
        ++input.line;
302
33.4M
        input.global_offset += view.length() + 1; // +1 includes the line break character
303
304
33.4M
        if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
305
0
            m_pattern->start_offset = state.string_position;
306
307
33.4M
        if (succeeded && !continue_search)
308
1.01M
            break;
309
33.4M
    }
310
311
33.4M
    RegexResult result {
312
33.4M
        match_count != 0,
313
33.4M
        match_count,
314
33.4M
        move(state.matches).release(),
315
33.4M
        move(state.capture_group_matches).release(),
316
33.4M
        operations,
317
33.4M
        m_pattern->parser_result.capture_groups_count,
318
33.4M
        m_pattern->parser_result.named_capture_groups_count,
319
33.4M
    };
320
321
33.4M
    if (match_count) {
322
        // Make sure there are as many capture matches as there are actual matches.
323
1.01M
        if (result.capture_group_matches.size() < match_count)
324
0
            result.capture_group_matches.resize(match_count);
325
1.01M
        for (auto& matches : result.capture_group_matches)
326
1.01M
            matches.resize(m_pattern->parser_result.capture_groups_count + 1);
327
1.01M
        if (!input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) {
328
1.01M
            for (auto& matches : result.capture_group_matches)
329
1.01M
                matches.remove_all_matching([](auto& match) { return match.view.is_null(); });
330
1.01M
        }
331
32.4M
    } else {
332
32.4M
        result.capture_group_matches.clear_with_capacity();
333
32.4M
    }
334
335
33.4M
    return result;
336
33.4M
}
337
338
template<typename T>
339
class BumpAllocatedLinkedList {
340
public:
341
4.82M
    BumpAllocatedLinkedList() = default;
342
343
    ALWAYS_INLINE void append(T value)
344
59.0M
    {
345
59.0M
        auto node_ptr = m_allocator.allocate(move(value));
346
59.0M
        VERIFY(node_ptr);
347
348
59.0M
        if (!m_first) {
349
10.7M
            m_first = node_ptr;
350
10.7M
            m_last = node_ptr;
351
10.7M
            return;
352
10.7M
        }
353
354
48.3M
        node_ptr->previous = m_last;
355
48.3M
        m_last->next = node_ptr;
356
48.3M
        m_last = node_ptr;
357
48.3M
    }
358
359
    ALWAYS_INLINE T take_last()
360
20.8M
    {
361
20.8M
        VERIFY(m_last);
362
20.8M
        T value = move(m_last->value);
363
20.8M
        if (m_last == m_first) {
364
10.6M
            m_last = nullptr;
365
10.6M
            m_first = nullptr;
366
10.6M
        } else {
367
10.2M
            m_last = m_last->previous;
368
10.2M
            m_last->next = nullptr;
369
10.2M
        }
370
20.8M
        return value;
371
20.8M
    }
372
373
    ALWAYS_INLINE T& last()
374
111M
    {
375
111M
        return m_last->value;
376
111M
    }
377
378
    ALWAYS_INLINE bool is_empty() const
379
24.7M
    {
380
24.7M
        return m_first == nullptr;
381
24.7M
    }
382
383
12.5k
    auto reverse_begin() { return ReverseIterator(m_last); }
384
14.9M
    auto reverse_end() { return ReverseIterator(); }
385
386
private:
387
    struct Node {
388
        T value;
389
        Node* next { nullptr };
390
        Node* previous { nullptr };
391
    };
392
393
    struct ReverseIterator {
394
        ReverseIterator() = default;
395
        explicit ReverseIterator(Node* node)
396
12.5k
            : m_node(node)
397
12.5k
        {
398
12.5k
        }
399
400
14.9M
        T* operator->() { return &m_node->value; }
401
221
        T& operator*() { return m_node->value; }
402
14.9M
        bool operator==(ReverseIterator const& it) const { return m_node == it.m_node; }
403
        ReverseIterator& operator++()
404
14.9M
        {
405
14.9M
            if (m_node)
406
14.9M
                m_node = m_node->previous;
407
14.9M
            return *this;
408
14.9M
        }
409
410
    private:
411
        Node* m_node;
412
    };
413
414
    UniformBumpAllocator<Node, true, 2 * MiB> m_allocator;
415
    Node* m_first { nullptr };
416
    Node* m_last { nullptr };
417
};
418
419
template<class Parser>
420
bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t& operations) const
421
4.82M
{
422
4.82M
    if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) {
423
        // Yay, we can do a simple substring search!
424
0
        auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value();
425
0
        if (needle.length() + state.string_position > input.view.length())
426
0
            return false;
427
428
0
        auto haystack = input.view.string_view().substring_view(state.string_position);
429
0
        if (input.regex_options.has_flag_set(AllFlags::Insensitive)) {
430
0
            if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle))
431
0
                return false;
432
0
        } else {
433
0
            if (!haystack.starts_with(needle))
434
0
                return false;
435
0
        }
436
437
0
        state.string_position += needle.length();
438
0
        state.string_position_in_code_units += needle.length();
439
0
        return true;
440
0
    }
441
442
4.82M
    BumpAllocatedLinkedList<MatchState> states_to_try_next;
443
#if REGEX_DEBUG
444
    size_t recursion_level = 0;
445
#endif
446
447
4.82M
    auto& bytecode = m_pattern->parser_result.bytecode;
448
449
272M
    for (;;) {
450
272M
        auto& opcode = bytecode.get_opcode(state);
451
272M
        ++operations;
452
453
#if REGEX_DEBUG
454
        s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false);
455
#endif
456
457
272M
        ExecutionResult result;
458
272M
        if (input.fail_counter > 0) {
459
0
            --input.fail_counter;
460
0
            result = ExecutionResult::Failed_ExecuteLowPrioForks;
461
272M
        } else {
462
272M
            result = opcode.execute(input, state);
463
272M
        }
464
465
#if REGEX_DEBUG
466
        s_regex_dbg.print_result(opcode, bytecode, input, state, result);
467
#endif
468
469
272M
        state.instruction_position += opcode.size();
470
471
272M
        switch (result) {
472
52.7M
        case ExecutionResult::Fork_PrioLow: {
473
52.7M
            bool found = false;
474
52.7M
            if (input.fork_to_replace.has_value()) {
475
14.9M
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
476
14.9M
                    if (it->initiating_fork == input.fork_to_replace.value()) {
477
221
                        (*it) = state;
478
221
                        it->instruction_position = state.fork_at_position;
479
221
                        it->initiating_fork = *input.fork_to_replace;
480
221
                        found = true;
481
221
                        break;
482
221
                    }
483
14.9M
                }
484
12.5k
                input.fork_to_replace.clear();
485
12.5k
            }
486
52.7M
            if (!found) {
487
52.7M
                states_to_try_next.append(state);
488
52.7M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
489
52.7M
                states_to_try_next.last().instruction_position = state.fork_at_position;
490
52.7M
            }
491
52.7M
            continue;
492
0
        }
493
6.29M
        case ExecutionResult::Fork_PrioHigh: {
494
6.29M
            bool found = false;
495
6.29M
            if (input.fork_to_replace.has_value()) {
496
0
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
497
0
                    if (it->initiating_fork == input.fork_to_replace.value()) {
498
0
                        (*it) = state;
499
0
                        it->initiating_fork = *input.fork_to_replace;
500
0
                        found = true;
501
0
                        break;
502
0
                    }
503
0
                }
504
0
                input.fork_to_replace.clear();
505
0
            }
506
6.29M
            if (!found) {
507
6.29M
                states_to_try_next.append(state);
508
6.29M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
509
6.29M
            }
510
6.29M
            state.instruction_position = state.fork_at_position;
511
#if REGEX_DEBUG
512
            ++recursion_level;
513
#endif
514
6.29M
            continue;
515
0
        }
516
187M
        case ExecutionResult::Continue:
517
187M
            continue;
518
1.01M
        case ExecutionResult::Succeeded:
519
1.01M
            return true;
520
0
        case ExecutionResult::Failed:
521
0
            if (!states_to_try_next.is_empty()) {
522
0
                state = states_to_try_next.take_last();
523
0
                continue;
524
0
            }
525
0
            return false;
526
24.7M
        case ExecutionResult::Failed_ExecuteLowPrioForks: {
527
24.7M
            if (states_to_try_next.is_empty()) {
528
3.80M
                return false;
529
3.80M
            }
530
20.8M
            state = states_to_try_next.take_last();
531
#if REGEX_DEBUG
532
            ++recursion_level;
533
#endif
534
20.8M
            continue;
535
24.7M
        }
536
272M
        }
537
272M
    }
538
539
0
    VERIFY_NOT_REACHED();
540
0
}
Unexecuted instantiation: regex::Matcher<regex::PosixBasicParser>::execute(regex::MatchInput const&, regex::MatchState&, unsigned long&) const
Unexecuted instantiation: regex::Matcher<regex::PosixExtendedParser>::execute(regex::MatchInput const&, regex::MatchState&, unsigned long&) const
regex::Matcher<regex::ECMA262Parser>::execute(regex::MatchInput const&, regex::MatchState&, unsigned long&) const
Line
Count
Source
421
4.82M
{
422
4.82M
    if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) {
423
        // Yay, we can do a simple substring search!
424
0
        auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value();
425
0
        if (needle.length() + state.string_position > input.view.length())
426
0
            return false;
427
428
0
        auto haystack = input.view.string_view().substring_view(state.string_position);
429
0
        if (input.regex_options.has_flag_set(AllFlags::Insensitive)) {
430
0
            if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle))
431
0
                return false;
432
0
        } else {
433
0
            if (!haystack.starts_with(needle))
434
0
                return false;
435
0
        }
436
437
0
        state.string_position += needle.length();
438
0
        state.string_position_in_code_units += needle.length();
439
0
        return true;
440
0
    }
441
442
4.82M
    BumpAllocatedLinkedList<MatchState> states_to_try_next;
443
#if REGEX_DEBUG
444
    size_t recursion_level = 0;
445
#endif
446
447
4.82M
    auto& bytecode = m_pattern->parser_result.bytecode;
448
449
272M
    for (;;) {
450
272M
        auto& opcode = bytecode.get_opcode(state);
451
272M
        ++operations;
452
453
#if REGEX_DEBUG
454
        s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false);
455
#endif
456
457
272M
        ExecutionResult result;
458
272M
        if (input.fail_counter > 0) {
459
0
            --input.fail_counter;
460
0
            result = ExecutionResult::Failed_ExecuteLowPrioForks;
461
272M
        } else {
462
272M
            result = opcode.execute(input, state);
463
272M
        }
464
465
#if REGEX_DEBUG
466
        s_regex_dbg.print_result(opcode, bytecode, input, state, result);
467
#endif
468
469
272M
        state.instruction_position += opcode.size();
470
471
272M
        switch (result) {
472
52.7M
        case ExecutionResult::Fork_PrioLow: {
473
52.7M
            bool found = false;
474
52.7M
            if (input.fork_to_replace.has_value()) {
475
14.9M
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
476
14.9M
                    if (it->initiating_fork == input.fork_to_replace.value()) {
477
221
                        (*it) = state;
478
221
                        it->instruction_position = state.fork_at_position;
479
221
                        it->initiating_fork = *input.fork_to_replace;
480
221
                        found = true;
481
221
                        break;
482
221
                    }
483
14.9M
                }
484
12.5k
                input.fork_to_replace.clear();
485
12.5k
            }
486
52.7M
            if (!found) {
487
52.7M
                states_to_try_next.append(state);
488
52.7M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
489
52.7M
                states_to_try_next.last().instruction_position = state.fork_at_position;
490
52.7M
            }
491
52.7M
            continue;
492
0
        }
493
6.29M
        case ExecutionResult::Fork_PrioHigh: {
494
6.29M
            bool found = false;
495
6.29M
            if (input.fork_to_replace.has_value()) {
496
0
                for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) {
497
0
                    if (it->initiating_fork == input.fork_to_replace.value()) {
498
0
                        (*it) = state;
499
0
                        it->initiating_fork = *input.fork_to_replace;
500
0
                        found = true;
501
0
                        break;
502
0
                    }
503
0
                }
504
0
                input.fork_to_replace.clear();
505
0
            }
506
6.29M
            if (!found) {
507
6.29M
                states_to_try_next.append(state);
508
6.29M
                states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size();
509
6.29M
            }
510
6.29M
            state.instruction_position = state.fork_at_position;
511
#if REGEX_DEBUG
512
            ++recursion_level;
513
#endif
514
6.29M
            continue;
515
0
        }
516
187M
        case ExecutionResult::Continue:
517
187M
            continue;
518
1.01M
        case ExecutionResult::Succeeded:
519
1.01M
            return true;
520
0
        case ExecutionResult::Failed:
521
0
            if (!states_to_try_next.is_empty()) {
522
0
                state = states_to_try_next.take_last();
523
0
                continue;
524
0
            }
525
0
            return false;
526
24.7M
        case ExecutionResult::Failed_ExecuteLowPrioForks: {
527
24.7M
            if (states_to_try_next.is_empty()) {
528
3.80M
                return false;
529
3.80M
            }
530
20.8M
            state = states_to_try_next.take_last();
531
#if REGEX_DEBUG
532
            ++recursion_level;
533
#endif
534
20.8M
            continue;
535
24.7M
        }
536
272M
        }
537
272M
    }
538
539
0
    VERIFY_NOT_REACHED();
540
0
}
541
542
template class Matcher<PosixBasicParser>;
543
template class Regex<PosixBasicParser>;
544
545
template class Matcher<PosixExtendedParser>;
546
template class Regex<PosixExtendedParser>;
547
548
template class Matcher<ECMA262Parser>;
549
template class Regex<ECMA262Parser>;
550
}