/src/serenity/Userland/Libraries/LibRegex/RegexLexer.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include "RegexLexer.h" |
8 | | #include <AK/Assertions.h> |
9 | | #include <AK/Debug.h> |
10 | | #include <AK/Format.h> |
11 | | #include <stdio.h> |
12 | | |
13 | | namespace regex { |
14 | | |
15 | | char const* Token::name(TokenType const type) |
16 | 0 | { |
17 | 0 | switch (type) { |
18 | 0 | #define __ENUMERATE_REGEX_TOKEN(x) \ |
19 | 0 | case TokenType::x: \ |
20 | 0 | return #x; |
21 | 0 | ENUMERATE_REGEX_TOKENS |
22 | 0 | #undef __ENUMERATE_REGEX_TOKEN |
23 | 0 | default: |
24 | 0 | VERIFY_NOT_REACHED(); |
25 | 0 | return "<Unknown>"; |
26 | 0 | } |
27 | 0 | } |
28 | | |
29 | | char const* Token::name() const |
30 | 0 | { |
31 | 0 | return name(m_type); |
32 | 0 | } |
33 | | |
34 | | Lexer::Lexer() |
35 | 0 | : GenericLexer(StringView {}) |
36 | 0 | { |
37 | 0 | } |
38 | | |
39 | | Lexer::Lexer(StringView const source) |
40 | 9.04k | : GenericLexer(source) |
41 | 9.04k | { |
42 | 9.04k | } |
43 | | |
44 | | void Lexer::back(size_t offset) |
45 | 34.8M | { |
46 | 34.8M | if (offset == m_index + 1) |
47 | 0 | offset = m_index; // 'position == 0' occurs twice. |
48 | | |
49 | 34.8M | VERIFY(offset <= m_index); |
50 | 34.8M | if (!offset) |
51 | 29.7M | return; |
52 | 5.18M | m_index -= offset; |
53 | 5.18M | m_previous_position = (m_index > 0) ? m_index - 1 : 0; |
54 | 5.18M | } |
55 | | |
56 | | char Lexer::consume() |
57 | 366M | { |
58 | 366M | m_previous_position = m_index; |
59 | 366M | return GenericLexer::consume(); |
60 | 366M | } |
61 | | |
62 | | void Lexer::reset() |
63 | 9.75k | { |
64 | 9.75k | m_index = 0; |
65 | 9.75k | m_current_token = { TokenType::Eof, 0, {} }; |
66 | 9.75k | m_previous_position = 0; |
67 | 9.75k | } |
68 | | |
69 | | Token Lexer::next() |
70 | 355M | { |
71 | 355M | size_t token_start_position; |
72 | | |
73 | 355M | auto begin_token = [&] { |
74 | 8.42M | token_start_position = m_index; |
75 | 8.42M | }; |
76 | | |
77 | 355M | auto commit_token = [&](auto type) -> Token& { |
78 | 8.42M | VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_input.length()); |
79 | 8.42M | auto substring = m_input.substring_view(token_start_position, m_previous_position - token_start_position + 1); |
80 | 8.42M | m_current_token = Token(type, token_start_position, substring); |
81 | 8.42M | return m_current_token; |
82 | 8.42M | }; |
83 | | |
84 | 355M | auto emit_token = [&](auto type) -> Token& { |
85 | 346M | m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1)); |
86 | 346M | consume(); |
87 | 346M | return m_current_token; |
88 | 346M | }; |
89 | | |
90 | 355M | auto match_escape_sequence = [&]() -> size_t { |
91 | 29.9M | switch (peek(1)) { |
92 | 419 | case '^': |
93 | 1.26k | case '.': |
94 | 1.51M | case '[': |
95 | 1.51M | case ']': |
96 | 5.12M | case '$': |
97 | 5.64M | case '(': |
98 | 6.19M | case ')': |
99 | 6.19M | case '|': |
100 | 6.21M | case '*': |
101 | 6.21M | case '+': |
102 | 6.21M | case '?': |
103 | 6.54M | case '{': |
104 | 8.42M | case '\\': |
105 | 8.42M | return 2; |
106 | 21.5M | default: |
107 | 21.5M | dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1)); |
108 | 21.5M | return 0; |
109 | 29.9M | } |
110 | 29.9M | }; |
111 | | |
112 | 355M | while (m_index < m_input.length()) { |
113 | 355M | auto ch = peek(); |
114 | 355M | if (ch == '(') |
115 | 15.3M | return emit_token(TokenType::LeftParen); |
116 | | |
117 | 339M | if (ch == ')') |
118 | 2.69M | return emit_token(TokenType::RightParen); |
119 | | |
120 | 336M | if (ch == '{') |
121 | 380k | return emit_token(TokenType::LeftCurly); |
122 | | |
123 | 336M | if (ch == '}') |
124 | 896k | return emit_token(TokenType::RightCurly); |
125 | | |
126 | 335M | if (ch == '[') |
127 | 1.06M | return emit_token(TokenType::LeftBracket); |
128 | | |
129 | 334M | if (ch == ']') |
130 | 251k | return emit_token(TokenType::RightBracket); |
131 | | |
132 | 334M | if (ch == '.') |
133 | 7.03M | return emit_token(TokenType::Period); |
134 | | |
135 | 327M | if (ch == '*') |
136 | 1.36M | return emit_token(TokenType::Asterisk); |
137 | | |
138 | 325M | if (ch == '+') |
139 | 1.16M | return emit_token(TokenType::Plus); |
140 | | |
141 | 324M | if (ch == '$') |
142 | 3.13M | return emit_token(TokenType::Dollar); |
143 | | |
144 | 321M | if (ch == '^') |
145 | 3.56M | return emit_token(TokenType::Circumflex); |
146 | | |
147 | 318M | if (ch == '|') |
148 | 20.1M | return emit_token(TokenType::Pipe); |
149 | | |
150 | 297M | if (ch == '?') |
151 | 2.57M | return emit_token(TokenType::Questionmark); |
152 | | |
153 | 295M | if (ch == ',') |
154 | 2.18M | return emit_token(TokenType::Comma); |
155 | | |
156 | 293M | if (ch == '/') |
157 | 1.01M | return emit_token(TokenType::Slash); |
158 | | |
159 | 292M | if (ch == '=') |
160 | 225k | return emit_token(TokenType::EqualSign); |
161 | | |
162 | 291M | if (ch == ':') |
163 | 839k | return emit_token(TokenType::Colon); |
164 | | |
165 | 291M | if (ch == '-') |
166 | 8.35M | return emit_token(TokenType::HyphenMinus); |
167 | | |
168 | 282M | if (ch == '\\') { |
169 | 29.9M | size_t escape = match_escape_sequence(); |
170 | 29.9M | if (escape > 0) { |
171 | 8.42M | begin_token(); |
172 | 25.2M | for (size_t i = 0; i < escape; ++i) |
173 | 16.8M | consume(); |
174 | 8.42M | return commit_token(TokenType::EscapeSequence); |
175 | 8.42M | } |
176 | 29.9M | } |
177 | | |
178 | 274M | return emit_token(TokenType::Char); |
179 | 282M | } |
180 | | |
181 | 24.8k | return Token(TokenType::Eof, m_index, {}); |
182 | 355M | } |
183 | | |
184 | | } |
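
For reference, below is a minimal sketch of how the lexer listed above might be driven from calling code. Only Lexer(StringView), next(), Token::name(), and the TokenType values are taken from the listing itself; the Token accessors type() and value(), the include paths, and the small main() harness are assumptions made for illustration.

    // Hypothetical driver for regex::Lexer; assumes the Serenity AK/LibRegex
    // headers are available and that Token exposes type()/value() accessors.
    #include <AK/Format.h>
    #include <AK/StringView.h>
    #include <LibRegex/RegexLexer.h>

    int main()
    {
        // "\\." should come back as a single EscapeSequence token (see
        // match_escape_sequence() above); plain letters come back as Char.
        regex::Lexer lexer("a\\.(b|c)+"sv);

        // next() produces one token per call and returns Eof once the
        // input is exhausted.
        for (auto token = lexer.next(); token.type() != regex::TokenType::Eof; token = lexer.next())
            outln("{} '{}'", token.name(), token.value());

        return 0;
    }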