Coverage Report

Created: 2026-05-16 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibRegex/RegexLexer.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include "RegexLexer.h"
8
#include <AK/Assertions.h>
9
#include <AK/Debug.h>
10
#include <AK/Format.h>
11
#include <stdio.h>
12
13
namespace regex {
14
15
// Maps a TokenType enumerator to its spelling as a C string.
// Every enumerator listed by ENUMERATE_REGEX_TOKENS gets a case that returns its stringified name;
// any other value trips VERIFY_NOT_REACHED().
char const* Token::name(TokenType const type)
{
    switch (type) {
#define __ENUMERATE_REGEX_TOKEN(token_name) \
    case TokenType::token_name:             \
        return #token_name;
        ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
    default:
        break;
    }

    // Only reached for a value that is not one of the enumerated token types.
    VERIFY_NOT_REACHED();
    return "<Unknown>";
}
28
29
char const* Token::name() const
30
0
{
31
0
    return name(m_type);
32
0
}
33
34
// Default constructor: initializes the GenericLexer base with an empty StringView.
Lexer::Lexer()
    : GenericLexer(StringView())
{
}
38
39
// Constructs a lexer over `source` by handing the view to the GenericLexer base.
Lexer::Lexer(StringView const source)
    : GenericLexer(source)
{
}
43
44
// Moves the cursor `offset` characters backwards.
//
// An offset of exactly m_index + 1 is clamped to m_index (the original author's note:
// 'position == 0' occurs twice). Any other offset larger than m_index fails the VERIFY.
// An offset of zero leaves all state untouched.
void Lexer::back(size_t offset)
{
    bool const one_past_start = (offset == m_index + 1);
    if (one_past_start)
        offset = m_index;

    VERIFY(offset <= m_index);
    if (offset == 0)
        return;

    m_index -= offset;
    // The previous position trails the cursor by one, but never goes below zero.
    m_previous_position = (m_index == 0) ? 0 : m_index - 1;
}
55
56
char Lexer::consume()
57
350M
{
58
350M
    m_previous_position = m_index;
59
350M
    return GenericLexer::consume();
60
350M
}
61
62
void Lexer::reset()
63
10.7k
{
64
10.7k
    m_index = 0;
65
10.7k
    m_current_token = { TokenType::Eof, 0, {} };
66
10.7k
    m_previous_position = 0;
67
10.7k
}
68
69
// Produces the next token from the input and advances the cursor past it.
//
// - A character that has a dedicated TokenType yields that one-character token.
// - A backslash followed by one of ^ . [ ] $ ( ) | * + ? { or another backslash
//   yields a two-character EscapeSequence token.
// - Any other character, including a backslash that does not start one of those
//   escapes, yields a one-character Char token.
// - When the cursor is at or beyond the end of the input, an Eof token positioned
//   at the cursor is returned; m_current_token is left unchanged in that case.
Token Lexer::next()
{
    if (m_index >= m_input.length())
        return Token(TokenType::Eof, m_index, {});

    // Stores a one-character token of the given kind taken at the cursor, then steps over it.
    auto emit_single = [&](TokenType kind) -> Token& {
        m_current_token = Token(kind, m_index, m_input.substring_view(m_index, 1));
        consume();
        return m_current_token;
    };

    // Inspects the character after the cursor: 2 if it forms a recognized escape, 0 otherwise.
    auto recognized_escape_length = [&]() -> size_t {
        switch (peek(1)) {
        case '^':
        case '.':
        case '[':
        case ']':
        case '$':
        case '(':
        case ')':
        case '|':
        case '*':
        case '+':
        case '?':
        case '{':
        case '\\':
            return 2;
        default:
            dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
            return 0;
        }
    };

    switch (peek()) {
    case '(':
        return emit_single(TokenType::LeftParen);
    case ')':
        return emit_single(TokenType::RightParen);
    case '{':
        return emit_single(TokenType::LeftCurly);
    case '}':
        return emit_single(TokenType::RightCurly);
    case '[':
        return emit_single(TokenType::LeftBracket);
    case ']':
        return emit_single(TokenType::RightBracket);
    case '.':
        return emit_single(TokenType::Period);
    case '*':
        return emit_single(TokenType::Asterisk);
    case '+':
        return emit_single(TokenType::Plus);
    case '$':
        return emit_single(TokenType::Dollar);
    case '^':
        return emit_single(TokenType::Circumflex);
    case '|':
        return emit_single(TokenType::Pipe);
    case '?':
        return emit_single(TokenType::Questionmark);
    case ',':
        return emit_single(TokenType::Comma);
    case '/':
        return emit_single(TokenType::Slash);
    case '=':
        return emit_single(TokenType::EqualSign);
    case ':':
        return emit_single(TokenType::Colon);
    case '-':
        return emit_single(TokenType::HyphenMinus);
    case '\\': {
        auto const escape_length = recognized_escape_length();
        if (escape_length == 0)
            break; // Not a recognized escape: the backslash itself becomes a Char token below.

        auto const token_start = m_index;
        for (size_t consumed = 0; consumed < escape_length; ++consumed)
            consume();

        // After consuming, m_previous_position is the index of the token's last character.
        VERIFY(m_previous_position + 1 <= m_input.length());
        auto const token_text = m_input.substring_view(token_start, m_previous_position - token_start + 1);
        m_current_token = Token(TokenType::EscapeSequence, token_start, token_text);
        return m_current_token;
    }
    default:
        break;
    }

    return emit_single(TokenType::Char);
}
183
184
}