/src/serenity/Userland/Libraries/LibRegex/RegexLexer.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include "RegexLexer.h" |
8 | | #include <AK/Assertions.h> |
9 | | #include <AK/Debug.h> |
10 | | #include <AK/Format.h> |
11 | | #include <stdio.h> |
12 | | |
13 | | namespace regex { |
14 | | |
15 | | char const* Token::name(TokenType const type) |
16 | 0 | { |
17 | 0 | switch (type) { |
18 | 0 | #define __ENUMERATE_REGEX_TOKEN(x) \ |
19 | 0 | case TokenType::x: \ |
20 | 0 | return #x; |
21 | 0 | ENUMERATE_REGEX_TOKENS |
22 | 0 | #undef __ENUMERATE_REGEX_TOKEN |
23 | 0 | default: |
24 | 0 | VERIFY_NOT_REACHED(); |
25 | 0 | return "<Unknown>"; |
26 | 0 | } |
27 | 0 | } |
28 | | |
29 | | char const* Token::name() const |
30 | 0 | { |
31 | 0 | return name(m_type); |
32 | 0 | } |
33 | | |
34 | | Lexer::Lexer() |
35 | 0 | : GenericLexer(StringView {}) |
36 | 0 | { |
37 | 0 | } |
38 | | |
39 | | Lexer::Lexer(StringView const source) |
40 | 9.04k | : GenericLexer(source) |
41 | 9.04k | { |
42 | 9.04k | } |
43 | | |
44 | | void Lexer::back(size_t offset) |
45 | 34.8M | { |
46 | 34.8M | if (offset == m_index + 1) |
47 | 0 | offset = m_index; // 'position == 0' occurs twice. |
48 | | |
49 | 34.8M | VERIFY(offset <= m_index); |
50 | 34.8M | if (!offset) |
51 | 29.7M | return; |
52 | 5.18M | m_index -= offset; |
53 | 5.18M | m_previous_position = (m_index > 0) ? m_index - 1 : 0; |
54 | 5.18M | } |
55 | | |
56 | | char Lexer::consume() |
57 | 366M | { |
58 | 366M | m_previous_position = m_index; |
59 | 366M | return GenericLexer::consume(); |
60 | 366M | } |
61 | | |
62 | | void Lexer::reset() |
63 | 9.75k | { |
64 | 9.75k | m_index = 0; |
65 | 9.75k | m_current_token = { TokenType::Eof, 0, {} }; |
66 | 9.75k | m_previous_position = 0; |
67 | 9.75k | } |
68 | | |
69 | | Token Lexer::next() |
70 | 355M | { |
71 | 355M | size_t token_start_position; |
72 | | |
73 | 355M | auto begin_token = [&] { |
74 | 8.42M | token_start_position = m_index; |
75 | 8.42M | }; |
76 | | |
77 | 355M | auto commit_token = [&](auto type) -> Token& { |
78 | 8.42M | VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_input.length()); |
79 | 8.42M | auto substring = m_input.substring_view(token_start_position, m_previous_position - token_start_position + 1); |
80 | 8.42M | m_current_token = Token(type, token_start_position, substring); |
81 | 8.42M | return m_current_token; |
82 | 8.42M | }; |
83 | | |
84 | 355M | auto emit_token = [&](auto type) -> Token& { |
85 | 346M | m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1)); |
86 | 346M | consume(); |
87 | 346M | return m_current_token; |
88 | 346M | }; |
89 | | |
90 | 355M | auto match_escape_sequence = [&]() -> size_t { |
91 | 29.9M | switch (peek(1)) { |
92 | 419 | case '^': |
93 | 1.26k | case '.': |
94 | 1.51M | case '[': |
95 | 1.51M | case ']': |
96 | 5.12M | case '$': |
97 | 5.64M | case '(': |
98 | 6.19M | case ')': |
99 | 6.19M | case '|': |
100 | 6.21M | case '*': |
101 | 6.21M | case '+': |
102 | 6.21M | case '?': |
103 | 6.54M | case '{': |
104 | 8.42M | case '\\': |
105 | 8.42M | return 2; |
106 | 21.5M | default: |
107 | 21.5M | dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1)); |
108 | 21.5M | return 0; |
109 | 29.9M | } |
110 | 29.9M | }; |
111 | | |
112 | 355M | while (m_index < m_input.length()) { |
113 | 355M | auto ch = peek(); |
114 | 355M | if (ch == '(') |
115 | 15.3M | return emit_token(TokenType::LeftParen); |
116 | | |
117 | 339M | if (ch == ')') |
118 | 2.69M | return emit_token(TokenType::RightParen); |
119 | | |
120 | 336M | if (ch == '{') |
121 | 380k | return emit_token(TokenType::LeftCurly); |
122 | | |
123 | 336M | if (ch == '}') |
124 | 896k | return emit_token(TokenType::RightCurly); |
125 | | |
126 | 335M | if (ch == '[') |
127 | 1.06M | return emit_token(TokenType::LeftBracket); |
128 | | |
129 | 334M | if (ch == ']') |
130 | 251k | return emit_token(TokenType::RightBracket); |
131 | | |
132 | 334M | if (ch == '.') |
133 | 7.03M | return emit_token(TokenType::Period); |
134 | | |
135 | 327M | if (ch == '*') |
136 | 1.36M | return emit_token(TokenType::Asterisk); |
137 | | |
138 | 325M | if (ch == '+') |
139 | 1.16M | return emit_token(TokenType::Plus); |
140 | | |
141 | 324M | if (ch == '$') |
142 | 3.13M | return emit_token(TokenType::Dollar); |
143 | | |
144 | 321M | if (ch == '^') |
145 | 3.56M | return emit_token(TokenType::Circumflex); |
146 | | |
147 | 318M | if (ch == '|') |
148 | 20.1M | return emit_token(TokenType::Pipe); |
149 | | |
150 | 297M | if (ch == '?') |
151 | 2.57M | return emit_token(TokenType::Questionmark); |
152 | | |
153 | 295M | if (ch == ',') |
154 | 2.18M | return emit_token(TokenType::Comma); |
155 | | |
156 | 293M | if (ch == '/') |
157 | 1.01M | return emit_token(TokenType::Slash); |
158 | | |
159 | 292M | if (ch == '=') |
160 | 225k | return emit_token(TokenType::EqualSign); |
161 | | |
162 | 291M | if (ch == ':') |
163 | 839k | return emit_token(TokenType::Colon); |
164 | | |
165 | 291M | if (ch == '-') |
166 | 8.35M | return emit_token(TokenType::HyphenMinus); |
167 | | |
168 | 282M | if (ch == '\\') { |
169 | 29.9M | size_t escape = match_escape_sequence(); |
170 | 29.9M | if (escape > 0) { |
171 | 8.42M | begin_token(); |
172 | 25.2M | for (size_t i = 0; i < escape; ++i) |
173 | 16.8M | consume(); |
174 | 8.42M | return commit_token(TokenType::EscapeSequence); |
175 | 8.42M | } |
176 | 29.9M | } |
177 | | |
178 | 274M | return emit_token(TokenType::Char); |
179 | 282M | } |
180 | | |
181 | 24.8k | return Token(TokenType::Eof, m_index, {}); |
182 | 355M | } |
183 | | |
184 | | } |
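
For reference, below is a minimal sketch of how the lexer listed above might be driven from calling code. Only Lexer(StringView), next(), Token::name(), and the TokenType values are taken from the listing itself; the Token accessors type() and value(), the include paths, and the small main() harness are assumptions made for illustration.

    // Hypothetical driver for regex::Lexer; assumes the Serenity AK/LibRegex
    // headers are available and that Token exposes type()/value() accessors.
    #include <AK/Format.h>
    #include <AK/StringView.h>
    #include <LibRegex/RegexLexer.h>

    int main()
    {
        // "\\." should come back as a single EscapeSequence token (see
        // match_escape_sequence() above); plain letters come back as Char.
        regex::Lexer lexer("a\\.(b|c)+"sv);

        // next() produces one token per call and returns Eof once the
        // input is exhausted.
        for (auto token = lexer.next(); token.type() != regex::TokenType::Eof; token = lexer.next())
            outln("{} '{}'", token.name(), token.value());

        return 0;
    }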