/src/solidity/liblangutil/Scanner.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * This file is part of solidity. |
3 | | * |
4 | | * solidity is free software: you can redistribute it and/or modify |
5 | | * it under the terms of the GNU General Public License as published by |
6 | | * the Free Software Foundation, either version 3 of the License, or |
7 | | * (at your option) any later version. |
8 | | * |
9 | | * solidity is distributed in the hope that it will be useful, |
10 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | | * GNU General Public License for more details. |
13 | | * |
14 | | * You should have received a copy of the GNU General Public License |
15 | | * along with solidity. If not, see <http://www.gnu.org/licenses/>. |
16 | | * |
17 | | * This file is derived from the file "scanner.cc", which was part of the |
18 | | * V8 project. The original copyright header follows: |
19 | | * |
20 | | * Copyright 2006-2012, the V8 project authors. All rights reserved. |
21 | | * Redistribution and use in source and binary forms, with or without |
22 | | * modification, are permitted provided that the following conditions are |
23 | | * met: |
24 | | * |
25 | | * * Redistributions of source code must retain the above copyright |
26 | | * notice, this list of conditions and the following disclaimer. |
27 | | * * Redistributions in binary form must reproduce the above |
28 | | * copyright notice, this list of conditions and the following |
29 | | * disclaimer in the documentation and/or other materials provided |
30 | | * with the distribution. |
31 | | * * Neither the name of Google Inc. nor the names of its |
32 | | * contributors may be used to endorse or promote products derived |
33 | | * from this software without specific prior written permission. |
34 | | * |
35 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
36 | | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
37 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
38 | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
39 | | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
40 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
41 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
42 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
43 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
44 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
45 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
46 | | */ |
47 | | /** |
48 | | * @author Christian <c@ethdev.com> |
49 | | * @date 2014 |
50 | | * Solidity scanner. |
51 | | */ |
52 | | |
53 | | #include <liblangutil/Common.h> |
54 | | #include <liblangutil/Exceptions.h> |
55 | | #include <liblangutil/Scanner.h> |
56 | | |
57 | | #include <boost/algorithm/string/classification.hpp> |
58 | | |
59 | | #include <optional> |
60 | | #include <string_view> |
61 | | #include <tuple> |
62 | | #include <array> |
63 | | |
64 | | using namespace std; |
65 | | |
66 | | namespace solidity::langutil |
67 | | { |
68 | | |
69 | | string to_string(ScannerError _errorCode) |
70 | 0 | { |
71 | 0 | switch (_errorCode) |
72 | 0 | { |
73 | 0 | case ScannerError::NoError: return "No error."; |
74 | 0 | case ScannerError::IllegalToken: return "Invalid token."; |
75 | 0 | case ScannerError::IllegalHexString: return "Expected even number of hex-nibbles."; |
76 | 0 | case ScannerError::IllegalHexDigit: return "Hexadecimal digit missing or invalid."; |
77 | 0 | case ScannerError::IllegalCommentTerminator: return "Expected multi-line comment-terminator."; |
78 | 0 | case ScannerError::IllegalEscapeSequence: return "Invalid escape sequence."; |
79 | 0 | case ScannerError::UnicodeCharacterInNonUnicodeString: return "Invalid character in string. If you are trying to use Unicode characters, use a unicode\"...\" string literal."; |
80 | 0 | case ScannerError::IllegalCharacterInString: return "Invalid character in string."; |
81 | 0 | case ScannerError::IllegalStringEndQuote: return "Expected string end-quote."; |
82 | 0 | case ScannerError::IllegalNumberSeparator: return "Invalid use of number separator '_'."; |
83 | 0 | case ScannerError::IllegalExponent: return "Invalid exponent."; |
84 | 0 | case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number."; |
85 | 0 | case ScannerError::OctalNotAllowed: return "Octal numbers not allowed."; |
86 | 0 | case ScannerError::DirectionalOverrideUnderflow: return "Unicode direction override underflow in comment or string literal."; |
87 | 0 | case ScannerError::DirectionalOverrideMismatch: return "Mismatching directional override markers in comment or string literal."; |
88 | 0 | default: |
89 | 0 | solAssert(false, "Unhandled case in to_string(ScannerError)"); |
90 | 0 | return ""; |
91 | 0 | } |
92 | 0 | } |
93 | | |
94 | | |
95 | | ostream& operator<<(ostream& os, ScannerError _errorCode) |
96 | 0 | { |
97 | 0 | return os << to_string(_errorCode); |
98 | 0 | } |
99 | | |
100 | | /// Scoped helper for literal recording. Automatically drops the literal |
101 | | /// if aborting the scanning before it's complete. |
102 | | enum LiteralType |
103 | | { |
104 | | LITERAL_TYPE_STRING, |
105 | | LITERAL_TYPE_NUMBER, // not really different from string type in behaviour |
106 | | LITERAL_TYPE_COMMENT |
107 | | }; |
108 | | |
109 | | class LiteralScope |
110 | | { |
111 | | public: |
112 | | explicit LiteralScope(Scanner* _self, enum LiteralType _type): |
113 | | m_type(_type), |
114 | | m_scanner(_self), |
115 | | m_complete(false) |
116 | 1.31M | { |
117 | 1.31M | if (_type == LITERAL_TYPE_COMMENT) |
118 | 0 | m_scanner->m_skippedComments[Scanner::NextNext].literal.clear(); |
119 | 1.31M | else |
120 | 1.31M | m_scanner->m_tokens[Scanner::NextNext].literal.clear(); |
121 | 1.31M | } |
122 | | ~LiteralScope() |
123 | 1.31M | { |
124 | 1.31M | if (!m_complete) |
125 | 0 | { |
126 | 0 | if (m_type == LITERAL_TYPE_COMMENT) |
127 | 0 | m_scanner->m_skippedComments[Scanner::NextNext].literal.clear(); |
128 | 0 | else |
129 | 0 | m_scanner->m_tokens[Scanner::NextNext].literal.clear(); |
130 | 0 | } |
131 | 1.31M | } |
132 | 1.31M | void complete() { m_complete = true; } |
133 | | |
134 | | private: |
135 | | enum LiteralType m_type; |
136 | | Scanner* m_scanner; |
137 | | bool m_complete; |
138 | | }; |
139 | | |
140 | | void Scanner::reset() |
141 | 14.7k | { |
142 | 14.7k | m_source.reset(); |
143 | 14.7k | m_kind = ScannerKind::Solidity; |
144 | 14.7k | m_char = m_source.get(); |
145 | 14.7k | skipWhitespace(); |
146 | 14.7k | next(); |
147 | 14.7k | next(); |
148 | 14.7k | next(); |
149 | 14.7k | } |
150 | | |
151 | | void Scanner::setPosition(size_t _offset) |
152 | 0 | { |
153 | 0 | m_char = m_source.setPosition(_offset); |
154 | 0 | scanToken(); |
155 | 0 | next(); |
156 | 0 | next(); |
157 | 0 | } |
158 | | |
159 | | bool Scanner::scanHexByte(char& o_scannedByte) |
160 | 44.4k | { |
161 | 44.4k | char x = 0; |
162 | 133k | for (size_t i = 0; i < 2; i++) |
163 | 88.8k | { |
164 | 88.8k | int d = hexValue(m_char); |
165 | 88.8k | if (d < 0) |
166 | 0 | { |
167 | 0 | rollback(i); |
168 | 0 | return false; |
169 | 0 | } |
170 | 88.8k | x = static_cast<char>(x * 16 + d); |
171 | 88.8k | advance(); |
172 | 88.8k | } |
173 | 44.4k | o_scannedByte = x; |
174 | 44.4k | return true; |
175 | 44.4k | } |
176 | | |
177 | | std::optional<unsigned> Scanner::scanUnicode() |
178 | 0 | { |
179 | 0 | unsigned x = 0; |
180 | 0 | for (size_t i = 0; i < 4; i++) |
181 | 0 | { |
182 | 0 | int d = hexValue(m_char); |
183 | 0 | if (d < 0) |
184 | 0 | { |
185 | 0 | rollback(i); |
186 | 0 | return {}; |
187 | 0 | } |
188 | 0 | x = x * 16 + static_cast<unsigned>(d); |
189 | 0 | advance(); |
190 | 0 | } |
191 | 0 | return x; |
192 | 0 | } |
193 | | |
194 | | // This supports codepoints between 0000 and FFFF. |
195 | | void Scanner::addUnicodeAsUTF8(unsigned codepoint) |
196 | 0 | { |
197 | 0 | if (codepoint <= 0x7f) |
198 | 0 | addLiteralChar(char(codepoint)); |
199 | 0 | else if (codepoint <= 0x7ff) |
200 | 0 | { |
201 | 0 | addLiteralChar(char(0xc0u | (codepoint >> 6u))); |
202 | 0 | addLiteralChar(char(0x80u | (codepoint & 0x3fu))); |
203 | 0 | } |
204 | 0 | else |
205 | 0 | { |
206 | 0 | addLiteralChar(char(0xe0u | (codepoint >> 12u))); |
207 | 0 | addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu))); |
208 | 0 | addLiteralChar(char(0x80u | (codepoint & 0x3fu))); |
209 | 0 | } |
210 | 0 | } |
211 | | |
212 | | void Scanner::rescan() |
213 | 39.1k | { |
214 | 39.1k | size_t rollbackTo = 0; |
215 | 39.1k | if (m_skippedComments[Current].literal.empty()) |
216 | 39.1k | rollbackTo = static_cast<size_t>(m_tokens[Current].location.start); |
217 | 0 | else |
218 | 0 | rollbackTo = static_cast<size_t>(m_skippedComments[Current].location.start); |
219 | 39.1k | m_char = m_source.rollback(m_source.position() - rollbackTo); |
220 | 39.1k | next(); |
221 | 39.1k | next(); |
222 | 39.1k | next(); |
223 | 39.1k | } |
224 | | |
225 | | // Ensure that tokens can be stored in a byte. |
226 | | BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100); |
227 | | |
228 | | Token Scanner::next() |
229 | 2.97M | { |
230 | 2.97M | m_tokens[Current] = std::move(m_tokens[Next]); |
231 | 2.97M | m_tokens[Next] = std::move(m_tokens[NextNext]); |
232 | 2.97M | m_skippedComments[Current] = std::move(m_skippedComments[Next]); |
233 | 2.97M | m_skippedComments[Next] = std::move(m_skippedComments[NextNext]); |
234 | | |
235 | 2.97M | scanToken(); |
236 | | |
237 | 2.97M | return m_tokens[Current].token; |
238 | 2.97M | } |
239 | | |
240 | | Token Scanner::selectToken(char _next, Token _then, Token _else) |
241 | 0 | { |
242 | 0 | advance(); |
243 | 0 | if (m_char == _next) |
244 | 0 | return selectToken(_then); |
245 | 0 | else |
246 | 0 | return _else; |
247 | 0 | } |
248 | | |
249 | | bool Scanner::skipWhitespace() |
250 | 1.35M | { |
251 | 1.35M | size_t const startPosition = sourcePos(); |
252 | 2.58M | while (isWhiteSpace(m_char)) |
253 | 1.23M | advance(); |
254 | | // Return whether or not we skipped any characters. |
255 | 1.35M | return sourcePos() != startPosition; |
256 | 1.35M | } |
257 | | |
258 | | bool Scanner::skipWhitespaceExceptUnicodeLinebreak() |
259 | 0 | { |
260 | 0 | size_t const startPosition = sourcePos(); |
261 | 0 | while (isWhiteSpace(m_char) && !isUnicodeLinebreak()) |
262 | 0 | advance(); |
263 | | // Return whether or not we skipped any characters. |
264 | 0 | return sourcePos() != startPosition; |
265 | 0 | } |
266 | | |
267 | | |
268 | | namespace |
269 | | { |
270 | | |
271 | | /// Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth. |
272 | | /// |
273 | | /// @returns ScannerError::NoError in case of successful parsing and directional encodings are paired |
274 | | /// and error code in case the input's lexical parser state is invalid and this error should be reported |
275 | | /// to the user. |
276 | | static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition) |
277 | 0 | { |
278 | 0 | static array<pair<string_view, int>, 5> constexpr directionalSequences{ |
279 | 0 | pair<string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override) |
280 | 0 | pair<string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override) |
281 | 0 | pair<string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding) |
282 | 0 | pair<string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding) |
283 | 0 | pair<string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting |
284 | 0 | }; |
285 | |
|
286 | 0 | size_t endPosition = _stream.position(); |
287 | 0 | _stream.setPosition(_startPosition); |
288 | |
|
289 | 0 | int directionOverrideDepth = 0; |
290 | |
|
291 | 0 | for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos) |
292 | 0 | { |
293 | 0 | _stream.setPosition(currentPos); |
294 | |
|
295 | 0 | for (auto const& [sequence, depthChange]: directionalSequences) |
296 | 0 | if (_stream.prefixMatch(sequence)) |
297 | 0 | directionOverrideDepth += depthChange; |
298 | |
|
299 | 0 | if (directionOverrideDepth < 0) |
300 | 0 | return ScannerError::DirectionalOverrideUnderflow; |
301 | 0 | } |
302 | | |
303 | 0 | _stream.setPosition(endPosition); |
304 | |
|
305 | 0 | return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatch : ScannerError::NoError; |
306 | 0 | } |
307 | | |
308 | | } |
309 | | |
310 | | Token Scanner::skipSingleLineComment() |
311 | 0 | { |
312 | | // Line terminator is not part of the comment. If it is a |
313 | | // non-ascii line terminator, it will result in a parser error. |
314 | 0 | size_t startPosition = m_source.position(); |
315 | 0 | while (!isUnicodeLinebreak()) |
316 | 0 | if (!advance()) |
317 | 0 | break; |
318 | |
|
319 | 0 | ScannerError unicodeDirectionError = validateBiDiMarkup(m_source, startPosition); |
320 | 0 | if (unicodeDirectionError != ScannerError::NoError) |
321 | 0 | return setError(unicodeDirectionError); |
322 | | |
323 | 0 | return Token::Whitespace; |
324 | 0 | } |
325 | | |
326 | | bool Scanner::atEndOfLine() const |
327 | 0 | { |
328 | 0 | return m_char == '\n' || m_char == '\r'; |
329 | 0 | } |
330 | | |
331 | | bool Scanner::tryScanEndOfLine() |
332 | 0 | { |
333 | 0 | if (m_char == '\n') |
334 | 0 | { |
335 | 0 | advance(); |
336 | 0 | return true; |
337 | 0 | } |
338 | | |
339 | 0 | if (m_char == '\r') |
340 | 0 | { |
341 | 0 | if (advance() && m_char == '\n') |
342 | 0 | advance(); |
343 | 0 | return true; |
344 | 0 | } |
345 | | |
346 | 0 | return false; |
347 | 0 | } |
348 | | |
349 | | size_t Scanner::scanSingleLineDocComment() |
350 | 0 | { |
351 | 0 | LiteralScope literal(this, LITERAL_TYPE_COMMENT); |
352 | 0 | size_t endPosition = m_source.position(); |
353 | |
|
354 | 0 | skipWhitespaceExceptUnicodeLinebreak(); |
355 | |
|
356 | 0 | while (!isSourcePastEndOfInput()) |
357 | 0 | { |
358 | 0 | endPosition = m_source.position(); |
359 | 0 | if (tryScanEndOfLine()) |
360 | 0 | { |
361 | | // Check if next line is also a single-line comment. |
362 | | // If any whitespaces were skipped, use source position before. |
363 | 0 | if (!skipWhitespaceExceptUnicodeLinebreak()) |
364 | 0 | endPosition = m_source.position(); |
365 | |
|
366 | 0 | if (!m_source.isPastEndOfInput(3) && |
367 | 0 | m_source.get(0) == '/' && |
368 | 0 | m_source.get(1) == '/' && |
369 | 0 | m_source.get(2) == '/') |
370 | 0 | { |
371 | 0 | if (!m_source.isPastEndOfInput(4) && m_source.get(3) == '/') |
372 | 0 | break; // "////" is not a documentation comment |
373 | 0 | m_char = m_source.advanceAndGet(3); |
374 | 0 | if (atEndOfLine()) |
375 | 0 | continue; |
376 | 0 | addCommentLiteralChar('\n'); |
377 | 0 | } |
378 | 0 | else |
379 | 0 | break; // next line is not a documentation comment, we are done |
380 | 0 | } |
381 | 0 | else if (isUnicodeLinebreak()) |
382 | | // Any line terminator that is not '\n' is considered to end the |
383 | | // comment. |
384 | 0 | break; |
385 | 0 | addCommentLiteralChar(m_char); |
386 | 0 | advance(); |
387 | 0 | } |
388 | 0 | literal.complete(); |
389 | 0 | return endPosition; |
390 | 0 | } |
391 | | |
392 | | Token Scanner::skipMultiLineComment() |
393 | 0 | { |
394 | 0 | size_t startPosition = m_source.position(); |
395 | 0 | while (!isSourcePastEndOfInput()) |
396 | 0 | { |
397 | 0 | char prevChar = m_char; |
398 | 0 | advance(); |
399 | | |
400 | | // If we have reached the end of the multi-line comment, we |
401 | | // consume the '/' and insert a whitespace. This way all |
402 | | // multi-line comments are treated as whitespace. |
403 | 0 | if (prevChar == '*' && m_char == '/') |
404 | 0 | { |
405 | 0 | ScannerError unicodeDirectionError = validateBiDiMarkup(m_source, startPosition); |
406 | 0 | if (unicodeDirectionError != ScannerError::NoError) |
407 | 0 | return setError(unicodeDirectionError); |
408 | | |
409 | 0 | m_char = ' '; |
410 | 0 | return Token::Whitespace; |
411 | 0 | } |
412 | 0 | } |
413 | | // Unterminated multi-line comment. |
414 | 0 | return setError(ScannerError::IllegalCommentTerminator); |
415 | 0 | } |
416 | | |
417 | | Token Scanner::scanMultiLineDocComment() |
418 | 0 | { |
419 | 0 | LiteralScope literal(this, LITERAL_TYPE_COMMENT); |
420 | 0 | bool endFound = false; |
421 | 0 | bool charsAdded = false; |
422 | |
|
423 | 0 | while (isWhiteSpace(m_char) && !atEndOfLine()) |
424 | 0 | advance(); |
425 | |
|
426 | 0 | while (!isSourcePastEndOfInput()) |
427 | 0 | { |
428 | | // handle newlines in multiline comments |
429 | 0 | if (atEndOfLine()) |
430 | 0 | { |
431 | 0 | skipWhitespace(); |
432 | 0 | if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '*') |
433 | 0 | { // it is unknown if this leads to the end of the comment |
434 | 0 | addCommentLiteralChar('*'); |
435 | 0 | advance(); |
436 | 0 | } |
437 | 0 | else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) != '/') |
438 | 0 | { // skip first '*' in subsequent lines |
439 | 0 | m_char = m_source.advanceAndGet(1); |
440 | 0 | if (atEndOfLine()) // ignores empty lines |
441 | 0 | continue; |
442 | 0 | if (charsAdded) |
443 | 0 | addCommentLiteralChar('\n'); // corresponds to the end of previous line |
444 | 0 | } |
445 | 0 | else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/') |
446 | 0 | { // if after newline the comment ends, don't insert the newline |
447 | 0 | m_char = m_source.advanceAndGet(2); |
448 | 0 | endFound = true; |
449 | 0 | break; |
450 | 0 | } |
451 | 0 | else if (charsAdded) |
452 | 0 | addCommentLiteralChar('\n'); |
453 | 0 | } |
454 | | |
455 | 0 | if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/') |
456 | 0 | { |
457 | 0 | m_char = m_source.advanceAndGet(2); |
458 | 0 | endFound = true; |
459 | 0 | break; |
460 | 0 | } |
461 | 0 | addCommentLiteralChar(m_char); |
462 | 0 | charsAdded = true; |
463 | 0 | advance(); |
464 | 0 | } |
465 | 0 | literal.complete(); |
466 | 0 | if (!endFound) |
467 | 0 | return setError(ScannerError::IllegalCommentTerminator); |
468 | 0 | else |
469 | 0 | return Token::CommentLiteral; |
470 | 0 | } |
471 | | |
472 | | Token Scanner::scanSlash() |
473 | 0 | { |
474 | 0 | int firstSlashPosition = static_cast<int>(sourcePos()); |
475 | 0 | advance(); |
476 | 0 | if (m_char == '/') |
477 | 0 | { |
478 | 0 | if (!advance()) /* double slash comment directly before EOS */ |
479 | 0 | return Token::Whitespace; |
480 | 0 | else if (m_char == '/') |
481 | 0 | { |
482 | 0 | advance(); //consume the last '/' at /// |
483 | | |
484 | | // "////" |
485 | 0 | if (m_char == '/') |
486 | 0 | return skipSingleLineComment(); |
487 | | // doxygen style /// comment |
488 | 0 | m_skippedComments[NextNext].location.start = firstSlashPosition; |
489 | 0 | m_skippedComments[NextNext].location.sourceName = m_sourceName; |
490 | 0 | m_skippedComments[NextNext].token = Token::CommentLiteral; |
491 | 0 | m_skippedComments[NextNext].location.end = static_cast<int>(scanSingleLineDocComment()); |
492 | 0 | return Token::Whitespace; |
493 | 0 | } |
494 | 0 | else |
495 | 0 | return skipSingleLineComment(); |
496 | 0 | } |
497 | 0 | else if (m_char == '*') |
498 | 0 | { |
499 | | // doxygen style /** natspec comment |
500 | 0 | if (!advance()) /* slash star comment before EOS */ |
501 | 0 | return setError(ScannerError::IllegalCommentTerminator); |
502 | 0 | else if (m_char == '*') |
503 | 0 | { |
504 | 0 | advance(); //consume the last '*' at /** |
505 | | |
506 | | // "/**/" |
507 | 0 | if (m_char == '/') |
508 | 0 | { |
509 | 0 | advance(); //skip the closing slash |
510 | 0 | return Token::Whitespace; |
511 | 0 | } |
512 | | // "/***" |
513 | 0 | if (m_char == '*') |
514 | | // "/***/" may be interpreted as empty natspec or skipped; skipping is simpler |
515 | 0 | return skipMultiLineComment(); |
516 | | // we actually have a multiline documentation comment |
517 | 0 | m_skippedComments[NextNext].location.start = firstSlashPosition; |
518 | 0 | m_skippedComments[NextNext].location.sourceName = m_sourceName; |
519 | 0 | Token comment = scanMultiLineDocComment(); |
520 | 0 | m_skippedComments[NextNext].location.end = static_cast<int>(sourcePos()); |
521 | 0 | m_skippedComments[NextNext].token = comment; |
522 | 0 | if (comment == Token::Illegal) |
523 | 0 | return Token::Illegal; // error already set |
524 | 0 | else |
525 | 0 | return Token::Whitespace; |
526 | 0 | } |
527 | 0 | else |
528 | 0 | return skipMultiLineComment(); |
529 | 0 | } |
530 | 0 | else if (m_char == '=') |
531 | 0 | return selectToken(Token::AssignDiv); |
532 | 0 | else |
533 | 0 | return Token::Div; |
534 | 0 | } |
535 | | |
536 | | void Scanner::scanToken() |
537 | 2.97M | { |
538 | 2.97M | m_tokens[NextNext] = {}; |
539 | 2.97M | m_skippedComments[NextNext] = {}; |
540 | | |
541 | 2.97M | Token token; |
542 | | // M and N are for the purposes of grabbing different type sizes |
543 | 2.97M | unsigned m; |
544 | 2.97M | unsigned n; |
545 | 2.97M | do |
546 | 4.21M | { |
547 | | // Remember the position of the next token |
548 | 4.21M | m_tokens[NextNext].location.start = static_cast<int>(sourcePos()); |
549 | 4.21M | switch (m_char) |
550 | 4.21M | { |
551 | 17.2k | case '"': |
552 | 17.2k | case '\'': |
553 | 17.2k | token = scanString(false); |
554 | 17.2k | break; |
555 | 0 | case '<': |
556 | | // < <= << <<= |
557 | 0 | advance(); |
558 | 0 | if (m_char == '=') |
559 | 0 | token = selectToken(Token::LessThanOrEqual); |
560 | 0 | else if (m_char == '<') |
561 | 0 | token = selectToken('=', Token::AssignShl, Token::SHL); |
562 | 0 | else |
563 | 0 | token = Token::LessThan; |
564 | 0 | break; |
565 | 0 | case '>': |
566 | | // > >= >> >>= >>> >>>= |
567 | 0 | advance(); |
568 | 0 | if (m_char == '=') |
569 | 0 | token = selectToken(Token::GreaterThanOrEqual); |
570 | 0 | else if (m_char == '>') |
571 | 0 | { |
572 | | // >> >>= >>> >>>= |
573 | 0 | advance(); |
574 | 0 | if (m_char == '=') |
575 | 0 | token = selectToken(Token::AssignSar); |
576 | 0 | else if (m_char == '>') |
577 | 0 | token = selectToken('=', Token::AssignShr, Token::SHR); |
578 | 0 | else |
579 | 0 | token = Token::SAR; |
580 | 0 | } |
581 | 0 | else |
582 | 0 | token = Token::GreaterThan; |
583 | 0 | break; |
584 | 0 | case '=': |
585 | | // = == => |
586 | 0 | advance(); |
587 | 0 | if (m_char == '=') |
588 | 0 | token = selectToken(Token::Equal); |
589 | 0 | else if (m_char == '>') |
590 | 0 | token = selectToken(Token::DoubleArrow); |
591 | 0 | else |
592 | 0 | token = Token::Assign; |
593 | 0 | break; |
594 | 0 | case '!': |
595 | | // ! != |
596 | 0 | advance(); |
597 | 0 | if (m_char == '=') |
598 | 0 | token = selectToken(Token::NotEqual); |
599 | 0 | else |
600 | 0 | token = Token::Not; |
601 | 0 | break; |
602 | 0 | case '+': |
603 | | // + ++ += |
604 | 0 | advance(); |
605 | 0 | if (m_char == '+') |
606 | 0 | token = selectToken(Token::Inc); |
607 | 0 | else if (m_char == '=') |
608 | 0 | token = selectToken(Token::AssignAdd); |
609 | 0 | else |
610 | 0 | token = Token::Add; |
611 | 0 | break; |
612 | 24.8k | case '-': |
613 | | // - -- -= -> |
614 | 24.8k | advance(); |
615 | 24.8k | if (m_char == '-') |
616 | 0 | token = selectToken(Token::Dec); |
617 | 24.8k | else if (m_char == '=') |
618 | 0 | token = selectToken(Token::AssignSub); |
619 | 24.8k | else if (m_char == '>') |
620 | 24.8k | token = selectToken(Token::RightArrow); |
621 | 0 | else |
622 | 0 | token = Token::Sub; |
623 | 24.8k | break; |
624 | 0 | case '*': |
625 | | // * ** *= |
626 | 0 | advance(); |
627 | 0 | if (m_char == '*') |
628 | 0 | token = selectToken(Token::Exp); |
629 | 0 | else if (m_char == '=') |
630 | 0 | token = selectToken(Token::AssignMul); |
631 | 0 | else |
632 | 0 | token = Token::Mul; |
633 | 0 | break; |
634 | 0 | case '%': |
635 | | // % %= |
636 | 0 | token = selectToken('=', Token::AssignMod, Token::Mod); |
637 | 0 | break; |
638 | 0 | case '/': |
639 | | // / // /* /= |
640 | 0 | token = scanSlash(); |
641 | 0 | break; |
642 | 0 | case '&': |
643 | | // & && &= |
644 | 0 | advance(); |
645 | 0 | if (m_char == '&') |
646 | 0 | token = selectToken(Token::And); |
647 | 0 | else if (m_char == '=') |
648 | 0 | token = selectToken(Token::AssignBitAnd); |
649 | 0 | else |
650 | 0 | token = Token::BitAnd; |
651 | 0 | break; |
652 | 0 | case '|': |
653 | | // | || |= |
654 | 0 | advance(); |
655 | 0 | if (m_char == '|') |
656 | 0 | token = selectToken(Token::Or); |
657 | 0 | else if (m_char == '=') |
658 | 0 | token = selectToken(Token::AssignBitOr); |
659 | 0 | else |
660 | 0 | token = Token::BitOr; |
661 | 0 | break; |
662 | 0 | case '^': |
663 | | // ^ ^= |
664 | 0 | token = selectToken('=', Token::AssignBitXor, Token::BitXor); |
665 | 0 | break; |
666 | 0 | case '.': |
667 | | // . Number |
668 | 0 | advance(); |
669 | 0 | if (isDecimalDigit(m_char)) |
670 | 0 | token = scanNumber('.'); |
671 | 0 | else |
672 | 0 | token = Token::Period; |
673 | 0 | break; |
674 | 53.8k | case ':': |
675 | | // : := |
676 | 53.8k | advance(); |
677 | 53.8k | if (m_char == '=') |
678 | 53.8k | token = selectToken(Token::AssemblyAssign); |
679 | 0 | else |
680 | 0 | token = Token::Colon; |
681 | 53.8k | break; |
682 | 0 | case ';': |
683 | 0 | token = selectToken(Token::Semicolon); |
684 | 0 | break; |
685 | 495k | case ',': |
686 | 495k | token = selectToken(Token::Comma); |
687 | 495k | break; |
688 | 343k | case '(': |
689 | 343k | token = selectToken(Token::LParen); |
690 | 343k | break; |
691 | 343k | case ')': |
692 | 343k | token = selectToken(Token::RParen); |
693 | 343k | break; |
694 | 0 | case '[': |
695 | 0 | token = selectToken(Token::LBrack); |
696 | 0 | break; |
697 | 0 | case ']': |
698 | 0 | token = selectToken(Token::RBrack); |
699 | 0 | break; |
700 | 167k | case '{': |
701 | 167k | token = selectToken(Token::LBrace); |
702 | 167k | break; |
703 | 134k | case '}': |
704 | 134k | token = selectToken(Token::RBrace); |
705 | 134k | break; |
706 | 0 | case '?': |
707 | 0 | token = selectToken(Token::Conditional); |
708 | 0 | break; |
709 | 0 | case '~': |
710 | 0 | token = selectToken(Token::BitNot); |
711 | 0 | break; |
712 | 2.63M | default: |
713 | 2.63M | if (isIdentifierStart(m_char)) |
714 | 900k | { |
715 | 900k | tie(token, m, n) = scanIdentifierOrKeyword(); |
716 | | |
717 | | // Special case for hexadecimal literals |
718 | 900k | if (token == Token::Hex) |
719 | 3.27k | { |
720 | | // reset |
721 | 3.27k | m = 0; |
722 | 3.27k | n = 0; |
723 | | |
724 | | // Special quoted hex string must follow |
725 | 3.27k | if (m_char == '"' || m_char == '\'') |
726 | 3.27k | token = scanHexString(); |
727 | 0 | else |
728 | 0 | token = setError(ScannerError::IllegalToken); |
729 | 3.27k | } |
730 | 897k | else if (token == Token::Unicode && m_kind != ScannerKind::Yul) |
731 | 0 | { |
732 | | // reset |
733 | 0 | m = 0; |
734 | 0 | n = 0; |
735 | | |
736 | | // Special quoted hex string must follow |
737 | 0 | if (m_char == '"' || m_char == '\'') |
738 | 0 | token = scanString(true); |
739 | 0 | else |
740 | 0 | token = setError(ScannerError::IllegalToken); |
741 | 0 | } |
742 | 900k | } |
743 | 1.73M | else if (isDecimalDigit(m_char)) |
744 | 393k | token = scanNumber(); |
745 | 1.33M | else if (skipWhitespace()) |
746 | 1.23M | token = Token::Whitespace; |
747 | 100k | else if (isSourcePastEndOfInput()) |
748 | 100k | token = Token::EOS; |
749 | 0 | else |
750 | 0 | token = selectErrorToken(ScannerError::IllegalToken); |
751 | 2.63M | break; |
752 | 4.21M | } |
753 | | // Continue scanning for tokens as long as we're just skipping |
754 | | // whitespace. |
755 | 4.21M | } |
756 | 4.21M | while (token == Token::Whitespace); |
757 | 2.97M | m_tokens[NextNext].location.end = static_cast<int>(sourcePos()); |
758 | 2.97M | m_tokens[NextNext].location.sourceName = m_sourceName; |
759 | 2.97M | m_tokens[NextNext].token = token; |
760 | 2.97M | m_tokens[NextNext].extendedTokenInfo = make_tuple(m, n); |
761 | 2.97M | } |
762 | | |
763 | | bool Scanner::scanEscape() |
764 | 0 | { |
765 | 0 | char c = m_char; |
766 | | |
767 | | // Skip escaped newlines. |
768 | 0 | if (tryScanEndOfLine()) |
769 | 0 | return true; |
770 | 0 | advance(); |
771 | |
|
772 | 0 | switch (c) |
773 | 0 | { |
774 | 0 | case '\'': // fall through |
775 | 0 | case '"': // fall through |
776 | 0 | case '\\': |
777 | 0 | break; |
778 | 0 | case 'n': |
779 | 0 | c = '\n'; |
780 | 0 | break; |
781 | 0 | case 'r': |
782 | 0 | c = '\r'; |
783 | 0 | break; |
784 | 0 | case 't': |
785 | 0 | c = '\t'; |
786 | 0 | break; |
787 | 0 | case 'u': |
788 | 0 | { |
789 | 0 | if (auto const codepoint = scanUnicode(); codepoint.has_value()) |
790 | 0 | addUnicodeAsUTF8(*codepoint); |
791 | 0 | else |
792 | 0 | return false; |
793 | 0 | return true; |
794 | 0 | } |
795 | 0 | case 'x': |
796 | 0 | if (!scanHexByte(c)) |
797 | 0 | return false; |
798 | 0 | break; |
799 | 0 | default: |
800 | 0 | return false; |
801 | 0 | } |
802 | | |
803 | 0 | addLiteralChar(c); |
804 | 0 | return true; |
805 | 0 | } |
806 | | |
807 | | bool Scanner::isUnicodeLinebreak() |
808 | 126k | { |
809 | 126k | if (0x0a <= m_char && m_char <= 0x0d) |
810 | | // line feed, vertical tab, form feed, carriage return |
811 | 0 | return true; |
812 | 126k | if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85) |
813 | | // NEL - U+0085, C2 85 in utf8 |
814 | 0 | return true; |
815 | 126k | if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && ( |
816 | 0 | uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9 |
817 | 0 | )) |
818 | | // LS - U+2028, E2 80 A8 in utf8 |
819 | | // PS - U+2029, E2 80 A9 in utf8 |
820 | 0 | return true; |
821 | 126k | return false; |
822 | 126k | } |
823 | | |
824 | | Token Scanner::scanString(bool const _isUnicode) |
825 | 17.2k | { |
826 | 17.2k | size_t startPosition = m_source.position(); |
827 | 17.2k | char const quote = m_char; |
828 | 17.2k | advance(); // consume quote |
829 | 17.2k | LiteralScope literal(this, LITERAL_TYPE_STRING); |
830 | 143k | while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak()) |
831 | 126k | { |
832 | 126k | char c = m_char; |
833 | 126k | advance(); |
834 | 126k | if (c == '\\') |
835 | 0 | { |
836 | 0 | if (isSourcePastEndOfInput() || !scanEscape()) |
837 | 0 | return setError(ScannerError::IllegalEscapeSequence); |
838 | 0 | } |
839 | 126k | else |
840 | 126k | { |
841 | | // Report error on non-printable characters in string literals, however |
842 | | // allow anything for unicode string literals, because their validity will |
843 | | // be verified later (in the syntax checker). |
844 | | // |
845 | | // We are using a manual range and not isprint() to avoid |
846 | | // any potential complications with locale. |
847 | 126k | if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f)) |
848 | 0 | { |
849 | 0 | if (m_kind == ScannerKind::Yul) |
850 | 0 | return setError(ScannerError::IllegalCharacterInString); |
851 | 0 | return setError(ScannerError::UnicodeCharacterInNonUnicodeString); |
852 | 0 | } |
853 | 126k | addLiteralChar(c); |
854 | 126k | } |
855 | 126k | } |
856 | 17.2k | if (m_char != quote) |
857 | 0 | return setError(ScannerError::IllegalStringEndQuote); |
858 | | |
859 | 17.2k | if (_isUnicode) |
860 | 0 | { |
861 | 0 | ScannerError unicodeDirectionError = validateBiDiMarkup(m_source, startPosition); |
862 | 0 | if (unicodeDirectionError != ScannerError::NoError) |
863 | 0 | return setError(unicodeDirectionError); |
864 | 0 | } |
865 | | |
866 | 17.2k | literal.complete(); |
867 | 17.2k | advance(); // consume quote |
868 | 17.2k | return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral; |
869 | 17.2k | } |
870 | | |
871 | | Token Scanner::scanHexString() |
872 | 3.27k | { |
873 | 3.27k | char const quote = m_char; |
874 | 3.27k | advance(); // consume quote |
875 | 3.27k | LiteralScope literal(this, LITERAL_TYPE_STRING); |
876 | 3.27k | bool allowUnderscore = false; |
877 | 47.7k | while (m_char != quote && !isSourcePastEndOfInput()) |
878 | 44.4k | { |
879 | 44.4k | char c = m_char; |
880 | | |
881 | 44.4k | if (scanHexByte(c)) |
882 | 44.4k | { |
883 | 44.4k | addLiteralChar(c); |
884 | 44.4k | allowUnderscore = true; |
885 | 44.4k | } |
886 | 0 | else if (c == '_') |
887 | 0 | { |
888 | 0 | advance(); |
889 | 0 | if (!allowUnderscore || m_char == quote) |
890 | 0 | return setError(ScannerError::IllegalNumberSeparator); |
891 | 0 | allowUnderscore = false; |
892 | 0 | } |
893 | 0 | else |
894 | 0 | return setError(ScannerError::IllegalHexString); |
895 | 44.4k | } |
896 | | |
897 | 3.27k | if (m_char != quote) |
898 | 0 | return setError(ScannerError::IllegalStringEndQuote); |
899 | | |
900 | 3.27k | literal.complete(); |
901 | 3.27k | advance(); // consume quote |
902 | 3.27k | return Token::HexStringLiteral; |
903 | 3.27k | } |
904 | | |
905 | | // Parse for regex [:digit:]+(_[:digit:]+)* |
906 | | void Scanner::scanDecimalDigits() |
907 | 149k | { |
908 | | // MUST begin with a decimal digit. |
909 | 149k | if (!isDecimalDigit(m_char)) |
910 | 10.7k | return; |
911 | | |
912 | | // May continue with decimal digit or underscore for grouping. |
913 | 139k | do |
914 | 438k | addLiteralCharAndAdvance(); |
915 | 438k | while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_')); |
916 | | |
917 | | // Defer further validation of underscore to SyntaxChecker. |
918 | 139k | } |
919 | | |
920 | | Token Scanner::scanNumber(char _charSeen) |
921 | 393k | { |
922 | 393k | enum { DECIMAL, HEX, BINARY } kind = DECIMAL; |
923 | 393k | LiteralScope literal(this, LITERAL_TYPE_NUMBER); |
924 | 393k | if (_charSeen == '.') |
925 | 0 | { |
926 | | // we have already seen a decimal point of the float |
927 | 0 | addLiteralChar('.'); |
928 | 0 | if (m_char == '_') |
929 | 0 | return setError(ScannerError::IllegalToken); |
930 | 0 | scanDecimalDigits(); // we know we have at least one digit |
931 | 0 | } |
932 | 393k | else |
933 | 393k | { |
934 | 393k | solAssert(_charSeen == 0, ""); |
935 | | // if the first character is '0' we must check for octals and hex |
936 | 393k | if (m_char == '0') |
937 | 254k | { |
938 | 254k | addLiteralCharAndAdvance(); |
939 | | // either 0, 0exxx, 0Exxx, 0.xxx or a hex number |
940 | 254k | if (m_char == 'x') |
941 | 243k | { |
942 | | // hex number |
943 | 243k | kind = HEX; |
944 | 243k | addLiteralCharAndAdvance(); |
945 | 243k | if (!isHexDigit(m_char)) |
946 | 0 | return setError(ScannerError::IllegalHexDigit); // we must have at least one hex digit after 'x' |
947 | | |
948 | 7.36M | while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation |
949 | 7.12M | addLiteralCharAndAdvance(); |
950 | 243k | } |
951 | 10.7k | else if (isDecimalDigit(m_char)) |
952 | | // We do not allow octal numbers |
953 | 0 | return setError(ScannerError::OctalNotAllowed); |
954 | 254k | } |
955 | | // Parse decimal digits and allow trailing fractional part. |
956 | 393k | if (kind == DECIMAL) |
957 | 149k | { |
958 | 149k | scanDecimalDigits(); // optional |
959 | 149k | if (m_char == '.') |
960 | 0 | { |
961 | 0 | if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') |
962 | 0 | { |
963 | | // Assume the input may be a floating point number with leading '_' in fraction part. |
964 | | // Recover by consuming it all but returning `Illegal` right away. |
965 | 0 | addLiteralCharAndAdvance(); // '.' |
966 | 0 | addLiteralCharAndAdvance(); // '_' |
967 | 0 | scanDecimalDigits(); |
968 | 0 | } |
969 | 0 | if (m_source.isPastEndOfInput() || !isDecimalDigit(m_source.get(1))) |
970 | 0 | { |
971 | | // A '.' has to be followed by a number. |
972 | 0 | literal.complete(); |
973 | 0 | return Token::Number; |
974 | 0 | } |
975 | 0 | addLiteralCharAndAdvance(); |
976 | 0 | scanDecimalDigits(); |
977 | 0 | } |
978 | 149k | } |
979 | 393k | } |
980 | | // scan exponent, if any |
981 | 393k | if (m_char == 'e' || m_char == 'E') |
982 | 0 | { |
983 | 0 | solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number"); |
984 | 0 | if (kind != DECIMAL) |
985 | 0 | return setError(ScannerError::IllegalExponent); |
986 | 0 | else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') |
987 | 0 | { |
988 | | // Recover from wrongly placed underscore as delimiter in literal with scientific |
989 | | // notation by consuming until the end. |
990 | 0 | addLiteralCharAndAdvance(); // 'e' |
991 | 0 | addLiteralCharAndAdvance(); // '_' |
992 | 0 | scanDecimalDigits(); |
993 | 0 | literal.complete(); |
994 | 0 | return Token::Number; |
995 | 0 | } |
996 | | // scan exponent |
997 | 0 | addLiteralCharAndAdvance(); // 'e' | 'E' |
998 | 0 | if (m_char == '+' || m_char == '-') |
999 | 0 | addLiteralCharAndAdvance(); |
1000 | 0 | if (!isDecimalDigit(m_char)) // we must have at least one decimal digit after 'e'/'E' |
1001 | 0 | return setError(ScannerError::IllegalExponent); |
1002 | 0 | scanDecimalDigits(); |
1003 | 0 | } |
1004 | | // The source character immediately following a numeric literal must |
1005 | | // not be an identifier start or a decimal digit; see ECMA-262 |
1006 | | // section 7.8.3, page 17 (note that we read only one decimal digit |
1007 | | // if the value is 0). |
1008 | 393k | if (isDecimalDigit(m_char) || isIdentifierStart(m_char)) |
1009 | 0 | return setError(ScannerError::IllegalNumberEnd); |
1010 | 393k | literal.complete(); |
1011 | 393k | return Token::Number; |
1012 | 393k | } |
1013 | | |
1014 | | tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword() |
1015 | 900k | { |
1016 | 900k | solAssert(isIdentifierStart(m_char), ""); |
1017 | 900k | LiteralScope literal(this, LITERAL_TYPE_STRING); |
1018 | 900k | addLiteralCharAndAdvance(); |
1019 | | // Scan the rest of the identifier characters. |
1020 | 4.85M | while (isIdentifierPart(m_char) || (m_char == '.' && m_kind == ScannerKind::Yul)) |
1021 | 3.95M | addLiteralCharAndAdvance(); |
1022 | 900k | literal.complete(); |
1023 | 900k | auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal); |
1024 | 900k | if (m_kind == ScannerKind::Yul) |
1025 | 870k | { |
1026 | | // Turn Solidity identifier into a Yul keyword |
1027 | 870k | if (m_tokens[NextNext].literal == "leave") |
1028 | 1.28k | return std::make_tuple(Token::Leave, 0, 0); |
1029 | | // Turn non-Yul keywords into identifiers. |
1030 | 869k | if (!TokenTraits::isYulKeyword(std::get<0>(token))) |
1031 | 746k | return std::make_tuple(Token::Identifier, 0, 0); |
1032 | 869k | } |
1033 | 152k | return token; |
1034 | 900k | } |
1035 | | |
1036 | | } // namespace solidity::langutil |