/src/serenity/AK/GenericLexer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/Assertions.h> |
8 | | #include <AK/CharacterTypes.h> |
9 | | #include <AK/GenericLexer.h> |
10 | | #include <AK/StringBuilder.h> |
11 | | |
12 | | #ifndef KERNEL |
13 | | # include <AK/String.h> |
14 | | # include <AK/Utf16View.h> |
15 | | #endif |
16 | | |
17 | | namespace AK { |
18 | | // Consume a number of characters |
19 | | StringView GenericLexer::consume(size_t count) |
20 | 288 | { |
21 | 288 | if (count == 0) |
22 | 0 | return {}; |
23 | | |
24 | 288 | size_t start = m_index; |
25 | 288 | size_t length = min(count, m_input.length() - m_index); |
26 | 288 | m_index += length; |
27 | | |
28 | 288 | return m_input.substring_view(start, length); |
29 | 288 | } |
30 | | |
31 | | // Consume the rest of the input |
32 | | StringView GenericLexer::consume_all() |
33 | 0 | { |
34 | 0 | if (is_eof()) |
35 | 0 | return {}; |
36 | | |
37 | 0 | auto rest = m_input.substring_view(m_index, m_input.length() - m_index); |
38 | 0 | m_index = m_input.length(); |
39 | 0 | return rest; |
40 | 0 | } |
41 | | |
42 | | // Consume until a new line is found |
43 | | StringView GenericLexer::consume_line() |
44 | 0 | { |
45 | 0 | size_t start = m_index; |
46 | 0 | while (!is_eof() && peek() != '\r' && peek() != '\n') |
47 | 0 | m_index++; |
48 | 0 | size_t length = m_index - start; |
49 | |
|
50 | 0 | consume_specific('\r'); |
51 | 0 | consume_specific('\n'); |
52 | |
|
53 | 0 | if (length == 0) |
54 | 0 | return {}; |
55 | 0 | return m_input.substring_view(start, length); |
56 | 0 | } |
57 | | |
58 | | // Consume and return characters until `stop` is peek'd |
59 | | StringView GenericLexer::consume_until(char stop) |
60 | 0 | { |
61 | 0 | size_t start = m_index; |
62 | 0 | while (!is_eof() && peek() != stop) |
63 | 0 | m_index++; |
64 | 0 | size_t length = m_index - start; |
65 | |
|
66 | 0 | if (length == 0) |
67 | 0 | return {}; |
68 | 0 | return m_input.substring_view(start, length); |
69 | 0 | } |
70 | | |
71 | | // Consume and return characters until the string `stop` is found |
72 | | StringView GenericLexer::consume_until(char const* stop) |
73 | 0 | { |
74 | 0 | size_t start = m_index; |
75 | 0 | while (!is_eof() && !next_is(stop)) |
76 | 0 | m_index++; |
77 | 0 | size_t length = m_index - start; |
78 | |
|
79 | 0 | if (length == 0) |
80 | 0 | return {}; |
81 | 0 | return m_input.substring_view(start, length); |
82 | 0 | } |
83 | | |
84 | | // Consume and return characters until the string `stop` is found |
85 | | StringView GenericLexer::consume_until(StringView stop) |
86 | 0 | { |
87 | 0 | size_t start = m_index; |
88 | 0 | while (!is_eof() && !next_is(stop)) |
89 | 0 | m_index++; |
90 | 0 | size_t length = m_index - start; |
91 | |
|
92 | 0 | if (length == 0) |
93 | 0 | return {}; |
94 | 0 | return m_input.substring_view(start, length); |
95 | 0 | } |
96 | | |
97 | | /* |
98 | | * Consume a string surrounded by single or double quotes. The returned |
99 | | * StringView does not include the quotes. An escape character can be provided |
100 | | * to capture the enclosing quotes. Please note that the escape character will |
101 | | * still be in the resulting StringView |
102 | | */ |
103 | | StringView GenericLexer::consume_quoted_string(char escape_char) |
104 | 0 | { |
105 | 0 | if (!next_is(is_quote)) |
106 | 0 | return {}; |
107 | | |
108 | 0 | char quote_char = consume(); |
109 | 0 | size_t start = m_index; |
110 | 0 | while (!is_eof()) { |
111 | 0 | if (next_is(escape_char)) |
112 | 0 | m_index++; |
113 | 0 | else if (next_is(quote_char)) |
114 | 0 | break; |
115 | 0 | m_index++; |
116 | 0 | } |
117 | 0 | size_t length = m_index - start; |
118 | |
|
119 | 0 | if (peek() != quote_char) { |
120 | | // Restore the index in case the string is unterminated |
121 | 0 | m_index = start - 1; |
122 | 0 | return {}; |
123 | 0 | } |
124 | | |
125 | | // Ignore closing quote |
126 | 0 | ignore(); |
127 | |
|
128 | 0 | return m_input.substring_view(start, length); |
129 | 0 | } |
130 | | |
131 | | #ifndef KERNEL |
132 | | String GenericLexer::consume_and_unescape_string(char escape_char) |
133 | 0 | { |
134 | 0 | auto view = consume_quoted_string(escape_char); |
135 | 0 | if (view.is_null()) |
136 | 0 | return {}; |
137 | | |
138 | 0 | StringBuilder builder; |
139 | 0 | for (size_t i = 0; i < view.length(); ++i) |
140 | 0 | builder.append(consume_escaped_character(escape_char)); |
141 | 0 | return builder.to_string(); |
142 | 0 | } |
143 | | |
144 | | auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError> |
145 | 453 | { |
146 | 453 | if (!consume_specific("\\u"sv)) |
147 | 8 | return UnicodeEscapeError::MalformedUnicodeEscape; |
148 | | |
149 | 445 | if (next_is('{')) |
150 | 0 | return decode_code_point(); |
151 | 445 | return decode_single_or_paired_surrogate(combine_surrogate_pairs); |
152 | 445 | } |
153 | | |
154 | | auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError> |
155 | 0 | { |
156 | 0 | bool starts_with_open_bracket = consume_specific('{'); |
157 | 0 | VERIFY(starts_with_open_bracket); |
158 | | |
159 | 0 | u32 code_point = 0; |
160 | |
|
161 | 0 | while (true) { |
162 | 0 | if (!next_is(is_ascii_hex_digit)) |
163 | 0 | return UnicodeEscapeError::MalformedUnicodeEscape; |
164 | | |
165 | 0 | auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume()); |
166 | 0 | if (new_code_point < code_point) |
167 | 0 | return UnicodeEscapeError::UnicodeEscapeOverflow; |
168 | | |
169 | 0 | code_point = new_code_point; |
170 | 0 | if (consume_specific('}')) |
171 | 0 | break; |
172 | 0 | } |
173 | | |
174 | 0 | if (is_unicode(code_point)) |
175 | 0 | return code_point; |
176 | 0 | return UnicodeEscapeError::UnicodeEscapeOverflow; |
177 | 0 | } |
178 | | |
179 | | auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError> |
180 | 445 | { |
181 | 445 | constexpr size_t surrogate_length = 4; |
182 | | |
183 | 445 | auto decode_one_surrogate = [&]() -> Optional<u16> { |
184 | 445 | u16 surrogate = 0; |
185 | | |
186 | 2.21k | for (size_t i = 0; i < surrogate_length; ++i) { |
187 | 1.77k | if (!next_is(is_ascii_hex_digit)) |
188 | 4 | return {}; |
189 | | |
190 | 1.77k | surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume()); |
191 | 1.77k | } |
192 | | |
193 | 441 | return surrogate; |
194 | 445 | }; |
195 | | |
196 | 445 | auto high_surrogate = decode_one_surrogate(); |
197 | 445 | if (!high_surrogate.has_value()) |
198 | 4 | return UnicodeEscapeError::MalformedUnicodeEscape; |
199 | 441 | if (!Utf16View::is_high_surrogate(*high_surrogate)) |
200 | 427 | return *high_surrogate; |
201 | 14 | if (!combine_surrogate_pairs || !consume_specific("\\u"sv)) |
202 | 14 | return *high_surrogate; |
203 | | |
204 | 0 | auto low_surrogate = decode_one_surrogate(); |
205 | 0 | if (!low_surrogate.has_value()) |
206 | 0 | return UnicodeEscapeError::MalformedUnicodeEscape; |
207 | 0 | if (Utf16View::is_low_surrogate(*low_surrogate)) |
208 | 0 | return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate); |
209 | | |
210 | 0 | retreat(6); |
211 | 0 | return *high_surrogate; |
212 | 0 | } |
213 | | #endif |
214 | | |
215 | | } |