Coverage Report

Created: 2026-02-14 08:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibRegex/RegexLexer.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include "RegexLexer.h"
8
#include <AK/Assertions.h>
9
#include <AK/Debug.h>
10
#include <AK/Format.h>
11
#include <stdio.h>
12
13
namespace regex {
14
15
// Maps a token type to its name as a C string.
//
// The name is the stringified enumerator (`#token_name`) for every entry that
// ENUMERATE_REGEX_TOKENS (defined outside this file) expands to. A value that
// matches none of those entries trips VERIFY_NOT_REACHED().
char const* Token::name(TokenType const type)
{
    switch (type) {
#define __ENUMERATE_REGEX_TOKEN(token_name) \
    case TokenType::token_name:             \
        return #token_name;
        ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
    default:
        break;
    }

    // No enumerator matched.
    VERIFY_NOT_REACHED();
    return "<Unknown>";
}
28
29
char const* Token::name() const
30
0
{
31
0
    return name(m_type);
32
0
}
33
34
// Default constructor: hands a default-constructed StringView to the
// GenericLexer base class.
Lexer::Lexer()
    : GenericLexer(StringView())
{
}
38
39
Lexer::Lexer(StringView const source)
40
7.36k
    : GenericLexer(source)
41
7.36k
{
42
7.36k
}
43
44
void Lexer::back(size_t offset)
45
22.4M
{
46
22.4M
    if (offset == m_index + 1)
47
0
        offset = m_index; // 'position == 0' occurs twice.
48
49
22.4M
    VERIFY(offset <= m_index);
50
22.4M
    if (!offset)
51
17.7M
        return;
52
4.73M
    m_index -= offset;
53
4.73M
    m_previous_position = (m_index > 0) ? m_index - 1 : 0;
54
4.73M
}
55
56
char Lexer::consume()
57
300M
{
58
300M
    m_previous_position = m_index;
59
300M
    return GenericLexer::consume();
60
300M
}
61
62
void Lexer::reset()
63
7.57k
{
64
7.57k
    m_index = 0;
65
7.57k
    m_current_token = { TokenType::Eof, 0, {} };
66
7.57k
    m_previous_position = 0;
67
7.57k
}
68
69
// Lexes and returns the next token.
//
// The punctuation characters handled in the switch below each get their own
// TokenType; every other character becomes TokenType::Char. A backslash
// followed by one of the characters accepted by `escape_length` is emitted as
// one two-character TokenType::EscapeSequence; any other backslash is emitted
// on its own as a Char. When m_index has reached the end of the input, an Eof
// token positioned at m_index is returned and m_current_token is left as is.
Token Lexer::next()
{
    // Stores a one-character token of `type` at the current index in
    // m_current_token, then consumes that character.
    auto single_char_token = [&](auto type) -> Token& {
        m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1));
        consume();
        return m_current_token;
    };

    // Consumes `length` characters and stores them in m_current_token as a
    // single token of `type`. The token ends at m_previous_position, i.e. at
    // the index the last consume() call recorded.
    auto multi_char_token = [&](auto type, size_t length) -> Token& {
        size_t const start = m_index;
        for (size_t consumed = 0; consumed < length; ++consumed)
            consume();

        size_t const token_length = m_previous_position - start + 1;
        VERIFY(start + token_length <= m_input.length());
        m_current_token = Token(type, start, m_input.substring_view(start, token_length));
        return m_current_token;
    };

    // Length (backslash included) of the escape sequence starting at the
    // current index, or 0 if the character after the backslash is not one of
    // the escapable characters listed here.
    auto escape_length = [&]() -> size_t {
        switch (peek(1)) {
        case '^':
        case '.':
        case '[':
        case ']':
        case '$':
        case '(':
        case ')':
        case '|':
        case '*':
        case '+':
        case '?':
        case '{':
        case '\\':
            return 2;
        default:
            dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
            return 0;
        }
    };

    if (m_index >= m_input.length())
        return Token(TokenType::Eof, m_index, {});

    switch (peek()) {
    case '(':
        return single_char_token(TokenType::LeftParen);
    case ')':
        return single_char_token(TokenType::RightParen);
    case '{':
        return single_char_token(TokenType::LeftCurly);
    case '}':
        return single_char_token(TokenType::RightCurly);
    case '[':
        return single_char_token(TokenType::LeftBracket);
    case ']':
        return single_char_token(TokenType::RightBracket);
    case '.':
        return single_char_token(TokenType::Period);
    case '*':
        return single_char_token(TokenType::Asterisk);
    case '+':
        return single_char_token(TokenType::Plus);
    case '$':
        return single_char_token(TokenType::Dollar);
    case '^':
        return single_char_token(TokenType::Circumflex);
    case '|':
        return single_char_token(TokenType::Pipe);
    case '?':
        return single_char_token(TokenType::Questionmark);
    case ',':
        return single_char_token(TokenType::Comma);
    case '/':
        return single_char_token(TokenType::Slash);
    case '=':
        return single_char_token(TokenType::EqualSign);
    case ':':
        return single_char_token(TokenType::Colon);
    case '-':
        return single_char_token(TokenType::HyphenMinus);
    case '\\':
        if (size_t const length = escape_length(); length > 0)
            return multi_char_token(TokenType::EscapeSequence, length);
        // Not a recognized escape: the backslash is emitted as a plain Char.
        return single_char_token(TokenType::Char);
    default:
        return single_char_token(TokenType::Char);
    }
}
183
184
}