/src/serenity/Userland/Libraries/LibRegex/RegexLexer.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include "RegexLexer.h" |
8 | | #include <AK/Assertions.h> |
9 | | #include <AK/Debug.h> |
10 | | #include <AK/Format.h> |
11 | | #include <stdio.h> |
12 | | |
13 | | namespace regex { |
14 | | |
15 | | char const* Token::name(TokenType const type) |
16 | 0 | { |
17 | 0 | switch (type) { |
18 | 0 | #define __ENUMERATE_REGEX_TOKEN(x) \ |
19 | 0 | case TokenType::x: \ |
20 | 0 | return #x; |
21 | 0 | ENUMERATE_REGEX_TOKENS |
22 | 0 | #undef __ENUMERATE_REGEX_TOKEN |
23 | 0 | default: |
24 | 0 | VERIFY_NOT_REACHED(); |
25 | 0 | return "<Unknown>"; |
26 | 0 | } |
27 | 0 | } |
28 | | |
29 | | char const* Token::name() const |
30 | 0 | { |
31 | 0 | return name(m_type); |
32 | 0 | } |
33 | | |
34 | | Lexer::Lexer() |
35 | 0 | : GenericLexer(StringView {}) |
36 | 0 | { |
37 | 0 | } |
38 | | |
39 | | Lexer::Lexer(StringView const source) |
40 | 9.80k | : GenericLexer(source) |
41 | 9.80k | { |
42 | 9.80k | } |
43 | | |
44 | | void Lexer::back(size_t offset) |
45 | 36.9M | { |
46 | 36.9M | if (offset == m_index + 1) |
47 | 0 | offset = m_index; // 'position == 0' occurs twice. |
48 | | |
49 | 36.9M | VERIFY(offset <= m_index); |
50 | 36.9M | if (!offset) |
51 | 31.9M | return; |
52 | 4.97M | m_index -= offset; |
53 | 4.97M | m_previous_position = (m_index > 0) ? m_index - 1 : 0; |
54 | 4.97M | } |
55 | | |
56 | | char Lexer::consume() |
57 | 350M | { |
58 | 350M | m_previous_position = m_index; |
59 | 350M | return GenericLexer::consume(); |
60 | 350M | } |
61 | | |
62 | | void Lexer::reset() |
63 | 10.7k | { |
64 | 10.7k | m_index = 0; |
65 | 10.7k | m_current_token = { TokenType::Eof, 0, {} }; |
66 | 10.7k | m_previous_position = 0; |
67 | 10.7k | } |
68 | | |
69 | | Token Lexer::next() |
70 | 340M | { |
71 | 340M | size_t token_start_position; |
72 | | |
73 | 340M | auto begin_token = [&] { |
74 | 7.07M | token_start_position = m_index; |
75 | 7.07M | }; |
76 | | |
77 | 340M | auto commit_token = [&](auto type) -> Token& { |
78 | 7.07M | VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_input.length()); |
79 | 7.07M | auto substring = m_input.substring_view(token_start_position, m_previous_position - token_start_position + 1); |
80 | 7.07M | m_current_token = Token(type, token_start_position, substring); |
81 | 7.07M | return m_current_token; |
82 | 7.07M | }; |
83 | | |
84 | 340M | auto emit_token = [&](auto type) -> Token& { |
85 | 333M | m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1)); |
86 | 333M | consume(); |
87 | 333M | return m_current_token; |
88 | 333M | }; |
89 | | |
90 | 340M | auto match_escape_sequence = [&]() -> size_t { |
91 | 30.2M | switch (peek(1)) { |
92 | 719 | case '^': |
93 | 2.02k | case '.': |
94 | 1.03M | case '[': |
95 | 1.03M | case ']': |
96 | 4.58M | case '$': |
97 | 4.96M | case '(': |
98 | 5.34M | case ')': |
99 | 5.34M | case '|': |
100 | 5.36M | case '*': |
101 | 5.36M | case '+': |
102 | 5.36M | case '?': |
103 | 5.66M | case '{': |
104 | 7.07M | case '\\': |
105 | 7.07M | return 2; |
106 | 23.2M | default: |
107 | 23.2M | dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1)); |
108 | 23.2M | return 0; |
109 | 30.2M | } |
110 | 30.2M | }; |
111 | | |
112 | 340M | while (m_index < m_input.length()) { |
113 | 340M | auto ch = peek(); |
114 | 340M | if (ch == '(') |
115 | 11.7M | return emit_token(TokenType::LeftParen); |
116 | | |
117 | 328M | if (ch == ')') |
118 | 2.21M | return emit_token(TokenType::RightParen); |
119 | | |
120 | 326M | if (ch == '{') |
121 | 336k | return emit_token(TokenType::LeftCurly); |
122 | | |
123 | 326M | if (ch == '}') |
124 | 835k | return emit_token(TokenType::RightCurly); |
125 | | |
126 | 325M | if (ch == '[') |
127 | 835k | return emit_token(TokenType::LeftBracket); |
128 | | |
129 | 324M | if (ch == ']') |
130 | 277k | return emit_token(TokenType::RightBracket); |
131 | | |
132 | 324M | if (ch == '.') |
133 | 3.64M | return emit_token(TokenType::Period); |
134 | | |
135 | 320M | if (ch == '*') |
136 | 1.31M | return emit_token(TokenType::Asterisk); |
137 | | |
138 | 319M | if (ch == '+') |
139 | 836k | return emit_token(TokenType::Plus); |
140 | | |
141 | 318M | if (ch == '$') |
142 | 2.97M | return emit_token(TokenType::Dollar); |
143 | | |
144 | 315M | if (ch == '^') |
145 | 3.06M | return emit_token(TokenType::Circumflex); |
146 | | |
147 | 312M | if (ch == '|') |
148 | 17.5M | return emit_token(TokenType::Pipe); |
149 | | |
150 | 294M | if (ch == '?') |
151 | 2.63M | return emit_token(TokenType::Questionmark); |
152 | | |
153 | 292M | if (ch == ',') |
154 | 3.56M | return emit_token(TokenType::Comma); |
155 | | |
156 | 288M | if (ch == '/') |
157 | 899k | return emit_token(TokenType::Slash); |
158 | | |
159 | 287M | if (ch == '=') |
160 | 135k | return emit_token(TokenType::EqualSign); |
161 | | |
162 | 287M | if (ch == ':') |
163 | 1.86M | return emit_token(TokenType::Colon); |
164 | | |
165 | 285M | if (ch == '-') |
166 | 7.60M | return emit_token(TokenType::HyphenMinus); |
167 | | |
168 | 278M | if (ch == '\\') { |
169 | 30.2M | size_t escape = match_escape_sequence(); |
170 | 30.2M | if (escape > 0) { |
171 | 7.07M | begin_token(); |
172 | 21.2M | for (size_t i = 0; i < escape; ++i) |
173 | 14.1M | consume(); |
174 | 7.07M | return commit_token(TokenType::EscapeSequence); |
175 | 7.07M | } |
176 | 30.2M | } |
177 | | |
178 | 271M | return emit_token(TokenType::Char); |
179 | 278M | } |
180 | | |
181 | 21.4k | return Token(TokenType::Eof, m_index, {}); |
182 | 340M | } |
183 | | |
184 | | } |