Coverage Report

Created: 2025-09-05 06:52

/src/serenity/Userland/Libraries/LibRegex/RegexLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include "RegexLexer.h"
8
#include <AK/Assertions.h>
9
#include <AK/Debug.h>
10
#include <AK/Format.h>
11
#include <stdio.h>
12
13
namespace regex {
14
15
char const* Token::name(TokenType const type)
16
0
{
17
0
    switch (type) {
18
0
#define __ENUMERATE_REGEX_TOKEN(x) \
19
0
    case TokenType::x:             \
20
0
        return #x;
21
0
        ENUMERATE_REGEX_TOKENS
22
0
#undef __ENUMERATE_REGEX_TOKEN
23
0
    default:
24
0
        VERIFY_NOT_REACHED();
25
0
        return "<Unknown>";
26
0
    }
27
0
}
28
29
char const* Token::name() const
30
0
{
31
0
    return name(m_type);
32
0
}
33
34
Lexer::Lexer()
35
0
    : GenericLexer(StringView {})
36
0
{
37
0
}
38
39
Lexer::Lexer(StringView const source)
40
9.04k
    : GenericLexer(source)
41
9.04k
{
42
9.04k
}
43
44
void Lexer::back(size_t offset)
45
34.8M
{
46
34.8M
    if (offset == m_index + 1)
47
0
        offset = m_index; // 'position == 0' occurs twice.
48
49
34.8M
    VERIFY(offset <= m_index);
50
34.8M
    if (!offset)
51
29.7M
        return;
52
5.18M
    m_index -= offset;
53
5.18M
    m_previous_position = (m_index > 0) ? m_index - 1 : 0;
54
5.18M
}
55
56
char Lexer::consume()
57
366M
{
58
366M
    m_previous_position = m_index;
59
366M
    return GenericLexer::consume();
60
366M
}
61
62
void Lexer::reset()
63
9.75k
{
64
9.75k
    m_index = 0;
65
9.75k
    m_current_token = { TokenType::Eof, 0, {} };
66
9.75k
    m_previous_position = 0;
67
9.75k
}
68
69
Token Lexer::next()
70
355M
{
71
355M
    size_t token_start_position;
72
73
355M
    auto begin_token = [&] {
74
8.42M
        token_start_position = m_index;
75
8.42M
    };
76
77
355M
    auto commit_token = [&](auto type) -> Token& {
78
8.42M
        VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_input.length());
79
8.42M
        auto substring = m_input.substring_view(token_start_position, m_previous_position - token_start_position + 1);
80
8.42M
        m_current_token = Token(type, token_start_position, substring);
81
8.42M
        return m_current_token;
82
8.42M
    };
83
84
355M
    auto emit_token = [&](auto type) -> Token& {
85
346M
        m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1));
86
346M
        consume();
87
346M
        return m_current_token;
88
346M
    };
89
90
355M
    auto match_escape_sequence = [&]() -> size_t {
91
29.9M
        switch (peek(1)) {
92
419
        case '^':
93
1.26k
        case '.':
94
1.51M
        case '[':
95
1.51M
        case ']':
96
5.12M
        case '$':
97
5.64M
        case '(':
98
6.19M
        case ')':
99
6.19M
        case '|':
100
6.21M
        case '*':
101
6.21M
        case '+':
102
6.21M
        case '?':
103
6.54M
        case '{':
104
8.42M
        case '\\':
105
8.42M
            return 2;
106
21.5M
        default:
107
21.5M
            dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
108
21.5M
            return 0;
109
29.9M
        }
110
29.9M
    };
111
112
355M
    while (m_index < m_input.length()) {
113
355M
        auto ch = peek();
114
355M
        if (ch == '(')
115
15.3M
            return emit_token(TokenType::LeftParen);
116
117
339M
        if (ch == ')')
118
2.69M
            return emit_token(TokenType::RightParen);
119
120
336M
        if (ch == '{')
121
380k
            return emit_token(TokenType::LeftCurly);
122
123
336M
        if (ch == '}')
124
896k
            return emit_token(TokenType::RightCurly);
125
126
335M
        if (ch == '[')
127
1.06M
            return emit_token(TokenType::LeftBracket);
128
129
334M
        if (ch == ']')
130
251k
            return emit_token(TokenType::RightBracket);
131
132
334M
        if (ch == '.')
133
7.03M
            return emit_token(TokenType::Period);
134
135
327M
        if (ch == '*')
136
1.36M
            return emit_token(TokenType::Asterisk);
137
138
325M
        if (ch == '+')
139
1.16M
            return emit_token(TokenType::Plus);
140
141
324M
        if (ch == '$')
142
3.13M
            return emit_token(TokenType::Dollar);
143
144
321M
        if (ch == '^')
145
3.56M
            return emit_token(TokenType::Circumflex);
146
147
318M
        if (ch == '|')
148
20.1M
            return emit_token(TokenType::Pipe);
149
150
297M
        if (ch == '?')
151
2.57M
            return emit_token(TokenType::Questionmark);
152
153
295M
        if (ch == ',')
154
2.18M
            return emit_token(TokenType::Comma);
155
156
293M
        if (ch == '/')
157
1.01M
            return emit_token(TokenType::Slash);
158
159
292M
        if (ch == '=')
160
225k
            return emit_token(TokenType::EqualSign);
161
162
291M
        if (ch == ':')
163
839k
            return emit_token(TokenType::Colon);
164
165
291M
        if (ch == '-')
166
8.35M
            return emit_token(TokenType::HyphenMinus);
167
168
282M
        if (ch == '\\') {
169
29.9M
            size_t escape = match_escape_sequence();
170
29.9M
            if (escape > 0) {
171
8.42M
                begin_token();
172
25.2M
                for (size_t i = 0; i < escape; ++i)
173
16.8M
                    consume();
174
8.42M
                return commit_token(TokenType::EscapeSequence);
175
8.42M
            }
176
29.9M
        }
177
178
274M
        return emit_token(TokenType::Char);
179
282M
    }
180
181
24.8k
    return Token(TokenType::Eof, m_index, {});
182
355M
}
183
184
}