/src/serenity/AK/GenericLexer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/Assertions.h> |
8 | | #include <AK/CharacterTypes.h> |
9 | | #include <AK/GenericLexer.h> |
10 | | #include <AK/ScopeGuard.h> |
11 | | #include <AK/StringBuilder.h> |
12 | | |
13 | | #ifndef KERNEL |
14 | | # include <AK/ByteString.h> |
15 | | # include <AK/Utf16View.h> |
16 | | #endif |
17 | | |
18 | | namespace AK { |
19 | | // Consume a number of characters |
20 | | StringView GenericLexer::consume(size_t count) |
21 | 6.56M | { |
22 | 6.56M | size_t start = m_index; |
23 | 6.56M | size_t length = min(count, m_input.length() - m_index); |
24 | 6.56M | m_index += length; |
25 | | |
26 | 6.56M | return m_input.substring_view(start, length); |
27 | 6.56M | } |
28 | | |
29 | | // Consume the rest of the input |
30 | | StringView GenericLexer::consume_all() |
31 | 1.50k | { |
32 | 1.50k | auto rest = m_input.substring_view(m_index, m_input.length() - m_index); |
33 | 1.50k | m_index = m_input.length(); |
34 | 1.50k | return rest; |
35 | 1.50k | } |
36 | | |
37 | | // Consume until a new line is found |
38 | | StringView GenericLexer::consume_line() |
39 | 1.05M | { |
40 | 1.05M | size_t start = m_index; |
41 | 66.8M | while (!is_eof() && peek() != '\r' && peek() != '\n') |
42 | 65.8M | m_index++; |
43 | 1.05M | size_t length = m_index - start; |
44 | | |
45 | 1.05M | consume_specific('\r'); |
46 | 1.05M | consume_specific('\n'); |
47 | | |
48 | 1.05M | return m_input.substring_view(start, length); |
49 | 1.05M | } |
50 | | |
51 | | // Consume and return characters until `stop` is peek'd |
52 | | StringView GenericLexer::consume_until(char stop) |
53 | 0 | { |
54 | 0 | size_t start = m_index; |
55 | 0 | while (!is_eof() && peek() != stop) |
56 | 0 | m_index++; |
57 | 0 | size_t length = m_index - start; |
58 | |
|
59 | 0 | return m_input.substring_view(start, length); |
60 | 0 | } |
61 | | |
62 | | // Consume and return characters until the string `stop` is found |
63 | | StringView GenericLexer::consume_until(char const* stop) |
64 | 49.6k | { |
65 | 49.6k | size_t start = m_index; |
66 | 8.15M | while (!is_eof() && !next_is(stop)) |
67 | 8.11M | m_index++; |
68 | 49.6k | size_t length = m_index - start; |
69 | | |
70 | 49.6k | return m_input.substring_view(start, length); |
71 | 49.6k | } |
72 | | |
73 | | // Consume and return characters until the string `stop` is found |
74 | | StringView GenericLexer::consume_until(StringView stop) |
75 | 0 | { |
76 | 0 | size_t start = m_index; |
77 | 0 | while (!is_eof() && !next_is(stop)) |
78 | 0 | m_index++; |
79 | 0 | size_t length = m_index - start; |
80 | |
|
81 | 0 | return m_input.substring_view(start, length); |
82 | 0 | } |
83 | | |
84 | | /* |
85 | | * Consume a string surrounded by single or double quotes. The returned |
86 | | * StringView does not include the quotes. An escape character can be provided |
87 | | * to capture the enclosing quotes. Please note that the escape character will |
88 | | * still be in the resulting StringView |
89 | | */ |
90 | | StringView GenericLexer::consume_quoted_string(char escape_char) |
91 | 725 | { |
92 | 725 | if (!next_is(is_quote)) |
93 | 0 | return {}; |
94 | | |
95 | 725 | char quote_char = consume(); |
96 | 725 | size_t start = m_index; |
97 | 17.9M | while (!is_eof()) { |
98 | 17.9M | if (next_is(escape_char)) |
99 | 101k | m_index++; |
100 | 17.8M | else if (next_is(quote_char)) |
101 | 678 | break; |
102 | 17.9M | m_index++; |
103 | 17.9M | } |
104 | 725 | size_t length = m_index - start; |
105 | | |
106 | 725 | if (peek() != quote_char) { |
107 | | // Restore the index in case the string is unterminated |
108 | 47 | m_index = start - 1; |
109 | 47 | return {}; |
110 | 47 | } |
111 | | |
112 | | // Ignore closing quote |
113 | 678 | ignore(); |
114 | | |
115 | 678 | return m_input.substring_view(start, length); |
116 | 725 | } |
117 | | |
118 | | template<Integral T> |
119 | | ErrorOr<T> GenericLexer::consume_decimal_integer() |
120 | 0 | { |
121 | 0 | using UnsignedT = MakeUnsigned<T>; |
122 | |
|
123 | 0 | ArmedScopeGuard rollback { [&, rollback_position = m_index] { |
124 | 0 | m_index = rollback_position; |
125 | 0 | } }; Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEhEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEaEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEtEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEsEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEjEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEiEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEmEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralElEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv |
126 | |
|
127 | 0 | bool has_minus_sign = false; |
128 | |
|
129 | 0 | if (next_is('+') || next_is('-')) |
130 | 0 | if (consume() == '-') |
131 | 0 | has_minus_sign = true; |
132 | |
|
133 | 0 | StringView number_view = consume_while(is_ascii_digit); |
134 | 0 | if (number_view.is_empty()) |
135 | 0 | return Error::from_errno(EINVAL); |
136 | | |
137 | 0 | auto maybe_number = StringUtils::convert_to_uint<UnsignedT>(number_view, TrimWhitespace::No); |
138 | 0 | if (!maybe_number.has_value()) |
139 | 0 | return Error::from_errno(ERANGE); |
140 | 0 | auto number = maybe_number.value(); |
141 | |
|
142 | 0 | if (!has_minus_sign) { |
143 | 0 | if (NumericLimits<T>::max() < number) // This is only possible in a signed case. |
144 | 0 | return Error::from_errno(ERANGE); |
145 | | |
146 | 0 | rollback.disarm(); |
147 | 0 | return number; |
148 | 0 | } else { |
149 | 0 | if constexpr (IsUnsigned<T>) { |
150 | 0 | if (number == 0) { |
151 | 0 | rollback.disarm(); |
152 | 0 | return 0; |
153 | 0 | } |
154 | 0 | return Error::from_errno(ERANGE); |
155 | 0 | } else { |
156 | 0 | static constexpr UnsignedT max_value = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1; |
157 | 0 | if (number > max_value) |
158 | 0 | return Error::from_errno(ERANGE); |
159 | 0 | rollback.disarm(); |
160 | 0 | return -number; |
161 | 0 | } |
162 | 0 | } |
163 | 0 | } Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEhEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEaEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEtEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEsEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEjEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEiEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEmEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralElEENS_7ErrorOrIT_NS_5ErrorEEEv |
164 | | |
165 | | #if !defined(KERNEL) |
166 | | LineTrackingLexer::Position LineTrackingLexer::position_for(size_t index) const |
167 | 60.9M | { |
168 | | // Sad case: we have no idea where the nearest newline is, so we have to |
169 | | // scan ahead a bit. |
170 | 66.8M | while (index > m_largest_known_line_start_position) { |
171 | 5.98M | auto next_newline = m_input.find('\n', m_largest_known_line_start_position); |
172 | 5.98M | if (!next_newline.has_value()) { |
173 | | // No more newlines, add the end of the input as a line start to avoid searching again. |
174 | 15.0k | m_line_start_positions->insert(m_input.length(), m_line_start_positions->size()); |
175 | 15.0k | m_largest_known_line_start_position = m_input.length(); |
176 | 15.0k | break; |
177 | 15.0k | } |
178 | 5.96M | m_line_start_positions->insert(next_newline.value() + 1, m_line_start_positions->size()); |
179 | 5.96M | m_largest_known_line_start_position = next_newline.value() + 1; |
180 | 5.96M | } |
181 | | // We should always have at least the first line start position. |
182 | 60.9M | auto previous_line_it = m_line_start_positions->find_largest_not_above_iterator(index); |
183 | 60.9M | auto previous_line_index = previous_line_it.key(); |
184 | | |
185 | 60.9M | auto line = *previous_line_it; |
186 | 60.9M | auto column = index - previous_line_index; |
187 | 60.9M | if (line == 0) { |
188 | | // First line, take into account the start position. |
189 | 44.0M | column += m_first_line_start_position.column; |
190 | 44.0M | } |
191 | | |
192 | 60.9M | line += m_first_line_start_position.line; |
193 | 60.9M | return { index, line, column }; |
194 | 60.9M | } |
195 | | #endif |
196 | | |
197 | | template ErrorOr<u8> GenericLexer::consume_decimal_integer<u8>(); |
198 | | template ErrorOr<i8> GenericLexer::consume_decimal_integer<i8>(); |
199 | | template ErrorOr<u16> GenericLexer::consume_decimal_integer<u16>(); |
200 | | template ErrorOr<i16> GenericLexer::consume_decimal_integer<i16>(); |
201 | | template ErrorOr<u32> GenericLexer::consume_decimal_integer<u32>(); |
202 | | template ErrorOr<i32> GenericLexer::consume_decimal_integer<i32>(); |
203 | | template ErrorOr<u64> GenericLexer::consume_decimal_integer<u64>(); |
204 | | template ErrorOr<i64> GenericLexer::consume_decimal_integer<i64>(); |
205 | | #ifdef AK_OS_MACOS |
206 | | template ErrorOr<size_t> GenericLexer::consume_decimal_integer<size_t>(); |
207 | | #endif |
208 | | |
209 | | #ifndef KERNEL |
210 | | Optional<ByteString> GenericLexer::consume_and_unescape_string(char escape_char) |
211 | 0 | { |
212 | 0 | auto view = consume_quoted_string(escape_char); |
213 | 0 | if (view.is_null()) |
214 | 0 | return {}; |
215 | | |
216 | 0 | StringBuilder builder; |
217 | 0 | for (size_t i = 0; i < view.length(); ++i) |
218 | 0 | builder.append(consume_escaped_character(escape_char)); |
219 | 0 | return builder.to_byte_string(); |
220 | 0 | } |
221 | | |
222 | | auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError> |
223 | 3.25M | { |
224 | 3.25M | if (!consume_specific("\\u"sv)) |
225 | 365 | return UnicodeEscapeError::MalformedUnicodeEscape; |
226 | | |
227 | 3.25M | if (next_is('{')) |
228 | 22 | return decode_code_point(); |
229 | 3.25M | return decode_single_or_paired_surrogate(combine_surrogate_pairs); |
230 | 3.25M | } |
231 | | |
232 | | auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError> |
233 | 22 | { |
234 | 22 | bool starts_with_open_bracket = consume_specific('{'); |
235 | 22 | VERIFY(starts_with_open_bracket); |
236 | | |
237 | 22 | u32 code_point = 0; |
238 | | |
239 | 108 | while (true) { |
240 | 108 | if (!next_is(is_ascii_hex_digit)) |
241 | 0 | return UnicodeEscapeError::MalformedUnicodeEscape; |
242 | | |
243 | 108 | auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume()); |
244 | 108 | if (new_code_point < code_point) |
245 | 0 | return UnicodeEscapeError::UnicodeEscapeOverflow; |
246 | | |
247 | 108 | code_point = new_code_point; |
248 | 108 | if (consume_specific('}')) |
249 | 22 | break; |
250 | 108 | } |
251 | | |
252 | 22 | if (is_unicode(code_point)) |
253 | 16 | return code_point; |
254 | 6 | return UnicodeEscapeError::UnicodeEscapeOverflow; |
255 | 22 | } |
256 | | |
257 | | auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError> |
258 | 4.21M | { |
259 | 4.21M | constexpr size_t surrogate_length = 4; |
260 | | |
261 | 4.37M | auto decode_one_surrogate = [&]() -> Optional<u16> { |
262 | 4.37M | u16 surrogate = 0; |
263 | | |
264 | 21.1M | for (size_t i = 0; i < surrogate_length; ++i) { |
265 | 17.1M | if (!next_is(is_ascii_hex_digit)) |
266 | 376k | return {}; |
267 | | |
268 | 16.7M | surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume()); |
269 | 16.7M | } |
270 | | |
271 | 3.99M | return surrogate; |
272 | 4.37M | }; |
273 | | |
274 | 4.21M | auto high_surrogate = decode_one_surrogate(); |
275 | 4.21M | if (!high_surrogate.has_value()) |
276 | 375k | return UnicodeEscapeError::MalformedUnicodeEscape; |
277 | 3.84M | if (!Utf16View::is_high_surrogate(*high_surrogate)) |
278 | 2.37M | return *high_surrogate; |
279 | 1.46M | if (!combine_surrogate_pairs || !consume_specific("\\u"sv)) |
280 | 1.30M | return *high_surrogate; |
281 | | |
282 | 158k | auto low_surrogate = decode_one_surrogate(); |
283 | 158k | if (!low_surrogate.has_value()) |
284 | 70 | return UnicodeEscapeError::MalformedUnicodeEscape; |
285 | 158k | if (Utf16View::is_low_surrogate(*low_surrogate)) |
286 | 22.8k | return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate); |
287 | | |
288 | 135k | retreat(6); |
289 | 135k | return *high_surrogate; |
290 | 158k | } |
291 | | #endif |
292 | | |
293 | | } |