Coverage Report

Created: 2025-09-04 07:34

/src/solidity/liblangutil/Scanner.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * This file is part of solidity.
3
 *
4
 * solidity is free software: you can redistribute it and/or modify
5
 * it under the terms of the GNU General Public License as published by
6
 * the Free Software Foundation, either version 3 of the License, or
7
 * (at your option) any later version.
8
 *
9
 * solidity is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
 * GNU General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU General Public License
15
 * along with solidity.  If not, see <http://www.gnu.org/licenses/>.
16
 *
17
 * This file is derived from the file "scanner.cc", which was part of the
18
 * V8 project. The original copyright header follows:
19
 *
20
 * Copyright 2006-2012, the V8 project authors. All rights reserved.
21
 * Redistribution and use in source and binary forms, with or without
22
 * modification, are permitted provided that the following conditions are
23
 * met:
24
 *
25
 * * Redistributions of source code must retain the above copyright
26
 *   notice, this list of conditions and the following disclaimer.
27
 * * Redistributions in binary form must reproduce the above
28
 *   copyright notice, this list of conditions and the following
29
 *   disclaimer in the documentation and/or other materials provided
30
 *   with the distribution.
31
 * * Neither the name of Google Inc. nor the names of its
32
 *   contributors may be used to endorse or promote products derived
33
 *   from this software without specific prior written permission.
34
 *
35
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
39
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
45
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
*/
47
/**
48
 * @author Christian <c@ethdev.com>
49
 * @date 2014
50
 * Solidity scanner.
51
 */
52
53
#include <liblangutil/Common.h>
54
#include <liblangutil/Exceptions.h>
55
#include <liblangutil/Scanner.h>
56
57
#include <boost/algorithm/string/classification.hpp>
58
59
#include <optional>
60
#include <string_view>
61
#include <tuple>
62
#include <array>
63
64
65
namespace solidity::langutil
66
{
67
68
std::string to_string(ScannerError _errorCode)
69
2.24k
{
70
2.24k
  switch (_errorCode)
71
2.24k
  {
72
0
    case ScannerError::NoError: return "No error.";
73
542
    case ScannerError::IllegalToken: return "Invalid token.";
74
82
    case ScannerError::IllegalHexString: return "Expected even number of hex-nibbles.";
75
39
    case ScannerError::IllegalHexDigit: return "Hexadecimal digit missing or invalid.";
76
452
    case ScannerError::IllegalCommentTerminator: return "Expected multi-line comment-terminator.";
77
268
    case ScannerError::IllegalEscapeSequence: return "Invalid escape sequence.";
78
6
    case ScannerError::UnicodeCharacterInNonUnicodeString: return "Invalid character in string. If you are trying to use Unicode characters, use a unicode\"...\" string literal.";
79
43
    case ScannerError::IllegalCharacterInString: return "Invalid character in string.";
80
353
    case ScannerError::IllegalStringEndQuote: return "Expected string end-quote.";
81
20
    case ScannerError::IllegalNumberSeparator: return "Invalid use of number separator '_'.";
82
76
    case ScannerError::IllegalExponent: return "Invalid exponent.";
83
192
    case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
84
30
    case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
85
24
    case ScannerError::DirectionalOverrideUnderflow: return "Unicode direction override underflow in comment or string literal.";
86
113
    case ScannerError::DirectionalOverrideMismatch: return "Mismatching directional override markers in comment or string literal.";
87
0
    default:
88
0
      solAssert(false, "Unhandled case in to_string(ScannerError)");
89
0
      return "";
90
2.24k
  }
91
2.24k
}
92
93
94
std::ostream& operator<<(std::ostream& os, ScannerError _errorCode)
95
0
{
96
0
  return os << to_string(_errorCode);
97
0
}
98
99
/// Scoped helper for literal recording. Automatically drops the literal
100
/// if aborting the scanning before it's complete.
101
enum LiteralType
{
  LITERAL_TYPE_STRING = 0,  // literal is recorded on the token being scanned
  LITERAL_TYPE_NUMBER = 1,  // not really different from string type in behaviour
  LITERAL_TYPE_COMMENT = 2  // literal is recorded on the skipped comment instead
};
107
108
class LiteralScope
109
{
110
public:
111
  explicit LiteralScope(Scanner* _self, enum LiteralType _type):
112
    m_type(_type),
113
    m_scanner(_self),
114
    m_complete(false)
115
28.4M
  {
116
28.4M
    if (_type == LITERAL_TYPE_COMMENT)
117
459k
      m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
118
28.0M
    else
119
28.0M
      m_scanner->m_tokens[Scanner::NextNext].literal.clear();
120
28.4M
  }
121
  ~LiteralScope()
122
28.4M
  {
123
28.4M
    if (!m_complete)
124
12.5k
    {
125
12.5k
      if (m_type == LITERAL_TYPE_COMMENT)
126
0
        m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
127
12.5k
      else
128
12.5k
        m_scanner->m_tokens[Scanner::NextNext].literal.clear();
129
12.5k
    }
130
28.4M
  }
131
28.4M
  void complete() { m_complete = true; }
132
133
private:
134
  enum LiteralType m_type;
135
  Scanner* m_scanner;
136
  bool m_complete;
137
};
138
139
void Scanner::reset()
140
650k
{
141
650k
  m_source.reset();
142
650k
  m_kind = ScannerKind::Solidity;
143
650k
  m_char = m_source.get();
144
650k
  skipWhitespace();
145
650k
  next();
146
650k
  next();
147
650k
  next();
148
650k
}
149
150
void Scanner::setPosition(size_t _offset)
151
0
{
152
0
  m_char = m_source.setPosition(_offset);
153
0
  scanToken();
154
0
  next();
155
0
  next();
156
0
}
157
158
bool Scanner::scanHexByte(char& o_scannedByte)
159
3.62M
{
160
3.62M
  char x = 0;
161
10.8M
  for (size_t i = 0; i < 2; i++)
162
7.25M
  {
163
7.25M
    int d = hexValue(m_char);
164
7.25M
    if (d < 0)
165
2.58k
    {
166
2.58k
      rollback(i);
167
2.58k
      return false;
168
2.58k
    }
169
7.25M
    x = static_cast<char>(x * 16 + d);
170
7.25M
    advance();
171
7.25M
  }
172
3.62M
  o_scannedByte = x;
173
3.62M
  return true;
174
3.62M
}
175
176
std::optional<unsigned> Scanner::scanUnicode()
177
6.21k
{
178
6.21k
  unsigned x = 0;
179
26.2k
  for (size_t i = 0; i < 4; i++)
180
21.8k
  {
181
21.8k
    int d = hexValue(m_char);
182
21.8k
    if (d < 0)
183
1.79k
    {
184
1.79k
      rollback(i);
185
1.79k
      return {};
186
1.79k
    }
187
20.0k
    x = x * 16 + static_cast<unsigned>(d);
188
20.0k
    advance();
189
20.0k
  }
190
4.42k
  return x;
191
6.21k
}
192
193
// This supports codepoints between 0000 and FFFF.
194
void Scanner::addUnicodeAsUTF8(unsigned codepoint)
195
4.42k
{
196
4.42k
  if (codepoint <= 0x7f)
197
821
    addLiteralChar(char(codepoint));
198
3.60k
  else if (codepoint <= 0x7ff)
199
849
  {
200
849
    addLiteralChar(char(0xc0u | (codepoint >> 6u)));
201
849
    addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
202
849
  }
203
2.75k
  else
204
2.75k
  {
205
2.75k
    addLiteralChar(char(0xe0u | (codepoint >> 12u)));
206
2.75k
    addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu)));
207
2.75k
    addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
208
2.75k
  }
209
4.42k
}
210
211
void Scanner::rescan()
212
866k
{
213
866k
  size_t rollbackTo = 0;
214
866k
  if (m_skippedComments[Current].literal.empty())
215
848k
    rollbackTo = static_cast<size_t>(m_tokens[Current].location.start);
216
17.6k
  else
217
17.6k
    rollbackTo = static_cast<size_t>(m_skippedComments[Current].location.start);
218
866k
  m_char = m_source.rollback(m_source.position() - rollbackTo);
219
866k
  next();
220
866k
  next();
221
866k
  next();
222
866k
}
223
224
// Ensure that tokens can be stored in a byte.
225
BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100);
226
227
/// Advances the scanner by one token.
/// The Next and NextNext slots of both the skipped-comment and the token
/// arrays are shifted down by one, then a fresh token is scanned into NextNext.
/// @returns the token that is now in the Current slot.
Token Scanner::next()
{
  // Shift the skipped comments ...
  m_skippedComments[Current] = std::move(m_skippedComments[Next]);
  m_skippedComments[Next] = std::move(m_skippedComments[NextNext]);
  // ... and the tokens themselves.
  m_tokens[Current] = std::move(m_tokens[Next]);
  m_tokens[Next] = std::move(m_tokens[NextNext]);

  scanToken();

  return m_tokens[Current].token;
}
238
239
/// Consumes the current character and then picks between two tokens.
/// If the following character equals @a _next, the result of
/// selectToken(_then) is returned; otherwise @a _else is returned and that
/// character is left unconsumed.
Token Scanner::selectToken(char _next, Token _then, Token _else)
{
  advance();
  if (m_char != _next)
    return _else;
  return selectToken(_then);
}
247
248
bool Scanner::skipWhitespace()
249
29.1M
{
250
29.1M
  size_t const startPosition = sourcePos();
251
132M
  while (isWhiteSpace(m_char))
252
103M
    advance();
253
  // Return whether or not we skipped any characters.
254
29.1M
  return sourcePos() != startPosition;
255
29.1M
}
256
257
bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
258
723k
{
259
723k
  size_t const startPosition = sourcePos();
260
5.60M
  while (isWhiteSpace(m_char) && !isUnicodeLinebreak())
261
4.88M
    advance();
262
  // Return whether or not we skipped any characters.
263
723k
  return sourcePos() != startPosition;
264
723k
}
265
266
267
namespace
268
{
269
270
/// Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth.
271
///
272
/// @returns ScannerError::NoError in case of successful parsing and directional encodings are paired
273
///          and error code in case the input's lexical parser state is invalid and this error should be reported
274
///          to the user.
275
static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
276
176k
{
277
176k
  static std::array<std::pair<std::string_view, int>, 5> constexpr directionalSequences{
278
176k
    std::pair<std::string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
279
176k
    std::pair<std::string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
280
176k
    std::pair<std::string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
281
176k
    std::pair<std::string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
282
176k
    std::pair<std::string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting
283
176k
  };
284
285
176k
  size_t endPosition = _stream.position();
286
176k
  _stream.setPosition(_startPosition);
287
288
176k
  int directionOverrideDepth = 0;
289
290
3.52M
  for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
291
3.34M
  {
292
3.34M
    _stream.setPosition(currentPos);
293
294
3.34M
    for (auto const& [sequence, depthChange]: directionalSequences)
295
16.7M
      if (_stream.prefixMatch(sequence))
296
3.95k
        directionOverrideDepth += depthChange;
297
298
3.34M
    if (directionOverrideDepth < 0)
299
122
      return ScannerError::DirectionalOverrideUnderflow;
300
3.34M
  }
301
302
176k
  _stream.setPosition(endPosition);
303
304
176k
  return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatch : ScannerError::NoError;
305
176k
}
306
307
}
308
309
/// Skips the remainder of a single-line comment.
/// The line terminator is not part of the comment. If it is a non-ascii line
/// terminator, it will result in a parser error.
/// @returns Token::Whitespace, or the error token if the skipped text contains
///          unbalanced directional override markers.
Token Scanner::skipSingleLineComment()
{
  size_t const commentStart = m_source.position();

  // Consume until a line break is reached or the input cannot be advanced any further.
  while (!isUnicodeLinebreak() && advance())
  {
  }

  ScannerError const bidiStatus = validateBiDiMarkup(m_source, commentStart);
  if (bidiStatus == ScannerError::NoError)
    return Token::Whitespace;

  return setError(bidiStatus);
}
324
325
bool Scanner::atEndOfLine() const
326
2.99M
{
327
2.99M
  return m_char == '\n' || m_char == '\r';
328
2.99M
}
329
330
bool Scanner::tryScanEndOfLine()
331
11.8M
{
332
11.8M
  if (m_char == '\n')
333
364k
  {
334
364k
    advance();
335
364k
    return true;
336
364k
  }
337
338
11.4M
  if (m_char == '\r')
339
5.60k
  {
340
5.60k
    if (advance() && m_char == '\n')
341
1.56k
      advance();
342
5.60k
    return true;
343
5.60k
  }
344
345
11.4M
  return false;
346
11.4M
}
347
348
size_t Scanner::scanSingleLineDocComment()
349
355k
{
350
355k
  LiteralScope literal(this, LITERAL_TYPE_COMMENT);
351
355k
  size_t endPosition = m_source.position();
352
353
355k
  skipWhitespaceExceptUnicodeLinebreak();
354
355
11.7M
  while (!isSourcePastEndOfInput())
356
11.7M
  {
357
11.7M
    endPosition = m_source.position();
358
11.7M
    if (tryScanEndOfLine())
359
367k
    {
360
      // Check if next line is also a single-line comment.
361
      // If any whitespaces were skipped, use source position before.
362
367k
      if (!skipWhitespaceExceptUnicodeLinebreak())
363
56.0k
        endPosition = m_source.position();
364
365
367k
      if (!m_source.isPastEndOfInput(3) &&
366
367k
        m_source.get(0) == '/' &&
367
367k
        m_source.get(1) == '/' &&
368
367k
        m_source.get(2) == '/')
369
14.9k
      {
370
14.9k
        if (!m_source.isPastEndOfInput(4) && m_source.get(3) == '/')
371
523
          break; // "////" is not a documentation comment
372
14.3k
        m_char = m_source.advanceAndGet(3);
373
14.3k
        if (atEndOfLine())
374
2.72k
          continue;
375
11.6k
        addCommentLiteralChar('\n');
376
11.6k
      }
377
352k
      else
378
352k
        break; // next line is not a documentation comment, we are done
379
367k
    }
380
11.4M
    else if (isUnicodeLinebreak())
381
      // Any line terminator that is not '\n' is considered to end the
382
      // comment.
383
121
      break;
384
11.4M
    addCommentLiteralChar(m_char);
385
11.4M
    advance();
386
11.4M
  }
387
355k
  literal.complete();
388
355k
  return endPosition;
389
355k
}
390
391
/// Skips a multi-line comment up to and including its terminator.
/// @returns Token::Whitespace on success; the error token if the comment is
///          not terminated or contains unbalanced directional override markers.
Token Scanner::skipMultiLineComment()
{
  size_t const commentStart = m_source.position();

  while (!isSourcePastEndOfInput())
  {
    char const previous = m_char;
    advance();

    if (previous != '*' || m_char != '/')
      continue;

    // We have reached the end of the multi-line comment.
    ScannerError const bidiStatus = validateBiDiMarkup(m_source, commentStart);
    if (bidiStatus != ScannerError::NoError)
      return setError(bidiStatus);

    // Consume the '/' by replacing it with a whitespace. This way all
    // multi-line comments are treated as whitespace.
    m_char = ' ';
    return Token::Whitespace;
  }

  // Unterminated multi-line comment.
  return setError(ScannerError::IllegalCommentTerminator);
}
415
416
/// Scans the body of a multi-line documentation comment into the comment
/// literal of the NextNext slot.
/// A single leading '*' on continuation lines is dropped, empty lines are
/// ignored and line breaks are recorded as '\n' once text has been added.
/// @returns Token::CommentLiteral if the terminator was found, otherwise the
///          error token for ScannerError::IllegalCommentTerminator.
Token Scanner::scanMultiLineDocComment()
{
  LiteralScope literal(this, LITERAL_TYPE_COMMENT);
  bool terminated = false;
  bool literalHasText = false;

  // Skip leading whitespace on the opening line, but stay in front of its line break.
  while (isWhiteSpace(m_char) && !atEndOfLine())
    advance();

  while (!isSourcePastEndOfInput())
  {
    // handle newlines in multiline comments
    if (atEndOfLine())
    {
      skipWhitespace();
      bool const starAhead = !m_source.isPastEndOfInput(1) && m_source.get(0) == '*';
      if (starAhead)
      {
        char const follower = m_source.get(1);
        if (follower == '*')
        {
          // it is unknown if this leads to the end of the comment
          addCommentLiteralChar('*');
          advance();
        }
        else if (follower == '/')
        {
          // if after newline the comment ends, don't insert the newline
          m_char = m_source.advanceAndGet(2);
          terminated = true;
          break;
        }
        else
        {
          // skip first '*' in subsequent lines
          m_char = m_source.advanceAndGet(1);
          if (atEndOfLine()) // ignores empty lines
            continue;
          if (literalHasText)
            addCommentLiteralChar('\n'); // corresponds to the end of previous line
        }
      }
      else if (literalHasText)
        addCommentLiteralChar('\n');
    }

    if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/')
    {
      m_char = m_source.advanceAndGet(2);
      terminated = true;
      break;
    }

    addCommentLiteralChar(m_char);
    literalHasText = true;
    advance();
  }

  literal.complete();
  return terminated ? Token::CommentLiteral : setError(ScannerError::IllegalCommentTerminator);
}
470
471
/// Scans everything that can start with a '/': the division operators and all comment forms.
/// Documentation comments are recorded in the skipped-comment entry of the
/// NextNext slot; other comments are skipped.
/// @returns Token::Div, Token::AssignDiv, Token::Whitespace for comments, or
///          the error token if a comment could not be scanned.
Token Scanner::scanSlash()
{
  int const firstSlashPosition = static_cast<int>(sourcePos());
  advance();

  switch (m_char)
  {
  case '/':
  {
    if (!advance()) /* double slash comment directly before EOS */
      return Token::Whitespace;
    if (m_char != '/')
      return skipSingleLineComment();

    advance(); // consume the last '/' at ///

    // "////"
    if (m_char == '/')
      return skipSingleLineComment();

    // doxygen style /// comment
    auto& docComment = m_skippedComments[NextNext];
    docComment.location.start = firstSlashPosition;
    docComment.location.sourceName = m_sourceName;
    docComment.token = Token::CommentLiteral;
    int const commentEnd = static_cast<int>(scanSingleLineDocComment());
    docComment.location.end = commentEnd;
    return Token::Whitespace;
  }
  case '*':
  {
    if (!advance()) /* slash star comment before EOS */
      return setError(ScannerError::IllegalCommentTerminator);
    if (m_char != '*')
      return skipMultiLineComment();

    advance(); // consume the last '*' at /**

    // "/**/"
    if (m_char == '/')
    {
      advance(); // skip the closing slash
      return Token::Whitespace;
    }
    // "/***"
    // "/***/" may be interpreted as empty natspec or skipped; skipping is simpler
    if (m_char == '*')
      return skipMultiLineComment();

    // we actually have a multiline documentation comment (doxygen style /** natspec comment)
    auto& docComment = m_skippedComments[NextNext];
    docComment.location.start = firstSlashPosition;
    docComment.location.sourceName = m_sourceName;
    Token const comment = scanMultiLineDocComment();
    docComment.location.end = static_cast<int>(sourcePos());
    docComment.token = comment;
    // For Token::Illegal the error has already been set.
    return comment == Token::Illegal ? Token::Illegal : Token::Whitespace;
  }
  case '=':
    return selectToken(Token::AssignDiv);
  default:
    return Token::Div;
  }
}
534
535
void Scanner::scanToken()
536
61.6M
{
537
61.6M
  m_tokens[NextNext] = {};
538
61.6M
  m_skippedComments[NextNext] = {};
539
540
61.6M
  Token token;
541
  // M and N are for the purposes of grabbing different type sizes
542
61.6M
  unsigned m = 0;
543
61.6M
  unsigned n = 0;
544
61.6M
  do
545
88.3M
  {
546
    // Remember the position of the next token
547
88.3M
    m_tokens[NextNext].location.start = static_cast<int>(sourcePos());
548
88.3M
    switch (m_char)
549
88.3M
    {
550
636k
    case '"':
551
648k
    case '\'':
552
648k
      token = scanString(false);
553
648k
      break;
554
8.15k
    case '<':
555
      // < <= << <<=
556
8.15k
      advance();
557
8.15k
      if (m_char == '=')
558
325
        token = selectToken(Token::LessThanOrEqual);
559
7.83k
      else if (m_char == '<')
560
1.35k
        token = selectToken('=', Token::AssignShl, Token::SHL);
561
6.48k
      else
562
6.48k
        token = Token::LessThan;
563
8.15k
      break;
564
7.42k
    case '>':
565
      // > >= >> >>= >>> >>>=
566
7.42k
      advance();
567
7.42k
      if (m_char == '=')
568
2.14k
        token = selectToken(Token::GreaterThanOrEqual);
569
5.28k
      else if (m_char == '>')
570
1.92k
      {
571
        // >> >>= >>> >>>=
572
1.92k
        advance();
573
1.92k
        if (m_char == '=')
574
117
          token = selectToken(Token::AssignSar);
575
1.81k
        else if (m_char == '>')
576
738
          token = selectToken('=', Token::AssignShr, Token::SHR);
577
1.07k
        else
578
1.07k
          token = Token::SAR;
579
1.92k
      }
580
3.35k
      else
581
3.35k
        token = Token::GreaterThan;
582
7.42k
      break;
583
58.2k
    case '=':
584
      // = == =>
585
58.2k
      advance();
586
58.2k
      if (m_char == '=')
587
4.83k
        token = selectToken(Token::Equal);
588
53.4k
      else if (m_char == '>')
589
1.23k
        token = selectToken(Token::DoubleArrow);
590
52.1k
      else
591
52.1k
        token = Token::Assign;
592
58.2k
      break;
593
46.0k
    case '!':
594
      // ! !=
595
46.0k
      advance();
596
46.0k
      if (m_char == '=')
597
33.1k
        token = selectToken(Token::NotEqual);
598
12.8k
      else
599
12.8k
        token = Token::Not;
600
46.0k
      break;
601
11.0k
    case '+':
602
      // + ++ +=
603
11.0k
      advance();
604
11.0k
      if (m_char == '+')
605
4.12k
        token = selectToken(Token::Inc);
606
6.91k
      else if (m_char == '=')
607
699
        token = selectToken(Token::AssignAdd);
608
6.21k
      else
609
6.21k
        token = Token::Add;
610
11.0k
      break;
611
651k
    case '-':
612
      // - -- -= ->
613
651k
      advance();
614
651k
      if (m_char == '-')
615
1.88k
        token = selectToken(Token::Dec);
616
649k
      else if (m_char == '=')
617
397
        token = selectToken(Token::AssignSub);
618
649k
      else if (m_char == '>')
619
516k
        token = selectToken(Token::RightArrow);
620
133k
      else
621
133k
        token = Token::Sub;
622
651k
      break;
623
21.4k
    case '*':
624
      // * ** *=
625
21.4k
      advance();
626
21.4k
      if (m_char == '*')
627
9.62k
        token = selectToken(Token::Exp);
628
11.8k
      else if (m_char == '=')
629
175
        token = selectToken(Token::AssignMul);
630
11.6k
      else
631
11.6k
        token = Token::Mul;
632
21.4k
      break;
633
6.45k
    case '%':
634
      // % %=
635
6.45k
      token = selectToken('=', Token::AssignMod, Token::Mod);
636
6.45k
      break;
637
647k
    case '/':
638
      // /  // /* /=
639
647k
      token = scanSlash();
640
647k
      break;
641
4.93k
    case '&':
642
      // & && &=
643
4.93k
      advance();
644
4.93k
      if (m_char == '&')
645
1.57k
        token = selectToken(Token::And);
646
3.36k
      else if (m_char == '=')
647
496
        token = selectToken(Token::AssignBitAnd);
648
2.86k
      else
649
2.86k
        token = Token::BitAnd;
650
4.93k
      break;
651
4.90k
    case '|':
652
      // | || |=
653
4.90k
      advance();
654
4.90k
      if (m_char == '|')
655
1.92k
        token = selectToken(Token::Or);
656
2.97k
      else if (m_char == '=')
657
150
        token = selectToken(Token::AssignBitOr);
658
2.82k
      else
659
2.82k
        token = Token::BitOr;
660
4.90k
      break;
661
4.41k
    case '^':
662
      // ^ ^=
663
4.41k
      token = selectToken('=', Token::AssignBitXor, Token::BitXor);
664
4.41k
      break;
665
169k
    case '.':
666
      // . Number
667
169k
      advance();
668
169k
      if (m_kind != ScannerKind::ExperimentalSolidity && isDecimalDigit(m_char))
669
3.34k
        token = scanNumber('.');
670
165k
      else
671
165k
        token = Token::Period;
672
169k
      break;
673
3.11M
    case ':':
674
      // : :=
675
3.11M
      advance();
676
3.11M
      if (m_char == '=')
677
2.01M
        token = selectToken(Token::AssemblyAssign);
678
1.09M
      else
679
1.09M
        token = Token::Colon;
680
3.11M
      break;
681
216k
    case ';':
682
216k
      token = selectToken(Token::Semicolon);
683
216k
      break;
684
5.93M
    case ',':
685
5.93M
      token = selectToken(Token::Comma);
686
5.93M
      break;
687
7.37M
    case '(':
688
7.37M
      token = selectToken(Token::LParen);
689
7.37M
      break;
690
7.27M
    case ')':
691
7.27M
      token = selectToken(Token::RParen);
692
7.27M
      break;
693
128k
    case '[':
694
128k
      token = selectToken(Token::LBrack);
695
128k
      break;
696
121k
    case ']':
697
121k
      token = selectToken(Token::RBrack);
698
121k
      break;
699
3.24M
    case '{':
700
3.24M
      token = selectToken(Token::LBrace);
701
3.24M
      break;
702
2.88M
    case '}':
703
2.88M
      token = selectToken(Token::RBrace);
704
2.88M
      break;
705
2.40k
    case '?':
706
2.40k
      token = selectToken(Token::Conditional);
707
2.40k
      break;
708
10.7k
    case '~':
709
10.7k
      token = selectToken(Token::BitNot);
710
10.7k
      break;
711
55.7M
    default:
712
55.7M
      if (isIdentifierStart(m_char))
713
19.4M
      {
714
19.4M
        std::tie(token, m, n) = scanIdentifierOrKeyword();
715
716
        // Special case for hexadecimal literals
717
19.4M
        if (token == Token::Hex)
718
51.2k
        {
719
          // reset
720
51.2k
          m = 0;
721
51.2k
          n = 0;
722
723
          // Special quoted hex string must follow
724
51.2k
          if (m_char == '"' || m_char == '\'')
725
51.0k
            token = scanHexString();
726
203
          else
727
203
            token = setError(ScannerError::IllegalToken);
728
51.2k
        }
729
19.4M
        else if (token == Token::Unicode && m_kind != ScannerKind::Yul)
730
1.13k
        {
731
          // reset
732
1.13k
          m = 0;
733
1.13k
          n = 0;
734
735
          // Special quoted hex string must follow
736
1.13k
          if (m_char == '"' || m_char == '\'')
737
1.08k
            token = scanString(true);
738
51
          else
739
51
            token = setError(ScannerError::IllegalToken);
740
1.13k
        }
741
19.4M
      }
742
36.2M
      else if (isDecimalDigit(m_char))
743
7.85M
        token = scanNumber();
744
28.4M
      else if (skipWhitespace())
745
26.0M
        token = Token::Whitespace;
746
2.40M
      else if (isSourcePastEndOfInput())
747
2.36M
        token = Token::EOS;
748
38.6k
      else
749
38.6k
        token = selectErrorToken(ScannerError::IllegalToken);
750
55.7M
      break;
751
88.3M
    }
752
    // Continue scanning for tokens as long as we're just skipping
753
    // whitespace.
754
88.3M
  }
755
88.3M
  while (token == Token::Whitespace);
756
61.6M
  m_tokens[NextNext].location.end = static_cast<int>(sourcePos());
757
61.6M
  m_tokens[NextNext].location.sourceName = m_sourceName;
758
61.6M
  m_tokens[NextNext].token = token;
759
61.6M
  m_tokens[NextNext].extendedTokenInfo = std::make_tuple(m, n);
760
61.6M
}
761
762
bool Scanner::scanEscape()
763
19.6k
{
764
19.6k
  char c = m_char;
765
766
  // Skip escaped newlines.
767
19.6k
  if (tryScanEndOfLine())
768
2.88k
    return true;
769
16.7k
  advance();
770
771
16.7k
  switch (c)
772
16.7k
  {
773
1.08k
  case '\'':  // fall through
774
2.32k
  case '"':  // fall through
775
4.91k
  case '\\':
776
4.91k
    break;
777
1.03k
  case 'n':
778
1.03k
    c = '\n';
779
1.03k
    break;
780
1.24k
  case 'r':
781
1.24k
    c = '\r';
782
1.24k
    break;
783
1.17k
  case 't':
784
1.17k
    c = '\t';
785
1.17k
    break;
786
6.21k
  case 'u':
787
6.21k
  {
788
6.21k
    if (auto const codepoint = scanUnicode(); codepoint.has_value())
789
4.42k
      addUnicodeAsUTF8(*codepoint);
790
1.79k
    else
791
1.79k
      return false;
792
4.42k
    return true;
793
6.21k
  }
794
1.85k
  case 'x':
795
1.85k
    if (!scanHexByte(c))
796
480
      return false;
797
1.37k
    break;
798
1.37k
  default:
799
290
    return false;
800
16.7k
  }
801
802
9.74k
  addLiteralChar(c);
803
9.74k
  return true;
804
16.7k
}
805
806
bool Scanner::isUnicodeLinebreak()
807
27.5M
{
808
27.5M
  if (0x0a <= m_char && m_char <= 0x0d)
809
    // line feed, vertical tab, form feed, carriage return
810
194k
    return true;
811
27.4M
  if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85)
812
    // NEL - U+0085, C2 85 in utf8
813
43
    return true;
814
27.4M
  if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && (
815
5.20k
    uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9
816
5.20k
  ))
817
    // LS - U+2028, E2 80 A8  in utf8
818
    // PS - U+2029, E2 80 A9  in utf8
819
72
    return true;
820
27.4M
  return false;
821
27.4M
}
822
823
/// Scans a quoted string literal; the scanner must be positioned at the opening quote.
/// @param _isUnicode true for unicode"..." literals: these accept any character
///        and are additionally checked for balanced directional override markers.
/// @returns Token::StringLiteral or Token::UnicodeStringLiteral on success,
///          otherwise the error token for the problem that was found.
Token Scanner::scanString(bool const _isUnicode)
{
  size_t const literalStart = m_source.position();
  char const quote = m_char;
  advance();  // consume quote
  LiteralScope literal(this, LITERAL_TYPE_STRING);

  // for source location comments we allow multiline string literals
  while (m_char != quote && !isSourcePastEndOfInput() && (!isUnicodeLinebreak() || m_kind == ScannerKind::SpecialComment))
  {
    char current = m_char;
    advance();

    if (m_kind == ScannerKind::SpecialComment)
    {
      // Here a backslash drops itself and the character that follows it.
      if (current != '\\')
        addLiteralChar(current);
      else if (isSourcePastEndOfInput())
        return setError(ScannerError::IllegalEscapeSequence);
      else
        advance();
      continue;
    }

    if (current == '\\')
    {
      if (isSourcePastEndOfInput() || !scanEscape())
        return setError(ScannerError::IllegalEscapeSequence);
      continue;
    }

    // Report error on non-printable characters in string literals, however
    // allow anything for unicode string literals, because their validity will
    // be verified later (in the syntax checker).
    //
    // We are using a manual range and not isprint() to avoid
    // any potential complications with locale.
    bool const outsidePrintableRange = static_cast<unsigned>(current) <= 0x1f || static_cast<unsigned>(current) >= 0x7f;
    if (!_isUnicode && outsidePrintableRange)
    {
      if (m_kind == ScannerKind::Yul)
        return setError(ScannerError::IllegalCharacterInString);
      return setError(ScannerError::UnicodeCharacterInNonUnicodeString);
    }
    addLiteralChar(current);
  }

  if (m_char != quote)
    return setError(ScannerError::IllegalStringEndQuote);

  if (_isUnicode)
  {
    ScannerError const bidiStatus = validateBiDiMarkup(m_source, literalStart);
    if (bidiStatus != ScannerError::NoError)
      return setError(bidiStatus);
  }

  literal.complete();
  advance();  // consume quote
  return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral;
}
886
887
/// Scans the quoted part of a hex string literal; the scanner must be positioned at the opening quote.
/// The content consists of hex byte pairs. A single '_' is accepted between
/// two byte pairs, but not at the start, at the end or after another '_'.
/// @returns Token::HexStringLiteral on success, otherwise the error token.
Token Scanner::scanHexString()
{
  char const quote = m_char;
  advance();  // consume quote
  LiteralScope literal(this, LITERAL_TYPE_STRING);

  bool separatorAllowed = false;
  while (m_char != quote && !isSourcePastEndOfInput())
  {
    // Still holds the current character if scanHexByte() fails.
    char scanned = m_char;

    if (scanHexByte(scanned))
    {
      addLiteralChar(scanned);
      separatorAllowed = true;
      continue;
    }

    if (scanned != '_')
      return setError(ScannerError::IllegalHexString);

    advance();
    if (!separatorAllowed || m_char == quote)
      return setError(ScannerError::IllegalNumberSeparator);
    separatorAllowed = false;
  }

  if (m_char != quote)
    return setError(ScannerError::IllegalStringEndQuote);

  literal.complete();
  advance();  // consume quote
  return Token::HexStringLiteral;
}
920
921
// Parse for regex [:digit:]+(_[:digit:]+)*
922
void Scanner::scanDecimalDigits()
923
5.97M
{
924
  // MUST begin with a decimal digit.
925
5.97M
  if (!isDecimalDigit(m_char))
926
2.04M
    return;
927
928
  // May continue with decimal digit or underscore for grouping.
929
3.93M
  do
930
12.7M
    addLiteralCharAndAdvance();
931
12.7M
  while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_'));
932
933
  // Defer further validation of underscore to SyntaxChecker.
934
3.93M
}
935
936
/// Scans a number literal (decimal with optional fraction and exponent, or hex).
///
/// @param _charSeen either '.' when the decimal point of a fraction has already
/// been consumed, or 0 when scanning starts at the first character of the number.
/// @returns Token::Number on success, otherwise the result of setError() with
/// IllegalToken, IllegalHexDigit, OctalNotAllowed, IllegalExponent or IllegalNumberEnd.
///
/// Underscores inside the literal are kept in the literal text; apart from the
/// cases rejected below their placement is not checked here.
Token Scanner::scanNumber(char _charSeen)
{
  enum class Radix { Decimal, Hex, Binary };
  Radix radix = Radix::Decimal;
  LiteralScope literal(this, LITERAL_TYPE_NUMBER);

  // Look-ahead: is the character right after the current one an underscore?
  auto const underscoreFollows = [this]() {
    return !m_source.isPastEndOfInput(1) && m_source.get(1) == '_';
  };

  if (_charSeen != '.')
  {
    solAssert(_charSeen == 0, "");
    // A leading '0' is either a lone zero, the start of 0.xxx / 0exxx / 0Exxx,
    // or a hex prefix; octal literals are rejected.
    if (m_char == '0')
    {
      addLiteralCharAndAdvance();
      if (m_char == 'x')
      {
        radix = Radix::Hex;
        addLiteralCharAndAdvance();
        // At least one hex digit has to follow the 'x'.
        if (!isHexDigit(m_char))
          return setError(ScannerError::IllegalHexDigit);

        // Underscores are kept in the literal for later validation.
        do
          addLiteralCharAndAdvance();
        while (isHexDigit(m_char) || m_char == '_');
      }
      else if (isDecimalDigit(m_char))
        return setError(ScannerError::OctalNotAllowed);
    }

    // Integer part followed by an optional fractional part.
    if (radix == Radix::Decimal)
    {
      scanDecimalDigits();  // may consume nothing
      if (m_char == '.')
      {
        if (underscoreFollows())
        {
          // Looks like a fraction that starts with '_'. Swallow the '.', the '_'
          // and the digits behind it into the literal instead of stopping here.
          addLiteralCharAndAdvance(); // '.'
          addLiteralCharAndAdvance(); // '_'
          scanDecimalDigits();
        }

        bool const digitFollows = !m_source.isPastEndOfInput() && isDecimalDigit(m_source.get(1));
        if (!digitFollows)
        {
          // No digit after the next character: the number ends here.
          literal.complete();
          return Token::Number;
        }
        addLiteralCharAndAdvance();
        scanDecimalDigits();
      }
    }
  }
  else
  {
    // The decimal point was consumed before this call; only the fraction is left.
    addLiteralChar('.');
    if (m_char == '_')
      return setError(ScannerError::IllegalToken);
    // NOTE(review): presumably the caller only gets here when a digit follows - confirm.
    scanDecimalDigits();
  }

  // Optional exponent.
  if (m_char == 'e' || m_char == 'E')
  {
    solAssert(radix != Radix::Hex, "'e'/'E' must be scanned as part of the hex number");
    if (radix != Radix::Decimal)
      return setError(ScannerError::IllegalExponent);

    if (underscoreFollows())
    {
      // Misplaced underscore right after the exponent marker: consume the rest
      // of the literal and stop scanning the number here.
      addLiteralCharAndAdvance(); // 'e'
      addLiteralCharAndAdvance(); // '_'
      scanDecimalDigits();
      literal.complete();
      return Token::Number;
    }

    addLiteralCharAndAdvance(); // 'e' | 'E'
    if (m_char == '+' || m_char == '-')
      addLiteralCharAndAdvance();
    // At least one decimal digit has to follow the exponent marker (and sign).
    if (!isDecimalDigit(m_char))
      return setError(ScannerError::IllegalExponent);
    scanDecimalDigits();
  }

  // A numeric literal must not be followed directly by an identifier start or
  // a decimal digit; see ECMA-262 section 7.8.3, page 17 (only one decimal
  // digit is read if the value is 0).
  if (isDecimalDigit(m_char) || isIdentifierStart(m_char))
    return setError(ScannerError::IllegalNumberEnd);

  literal.complete();
  return Token::Number;
}
1029
1030
std::tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
1031
19.4M
{
1032
19.4M
  solAssert(isIdentifierStart(m_char), "");
1033
19.4M
  LiteralScope literal(this, LITERAL_TYPE_STRING);
1034
19.4M
  addLiteralCharAndAdvance();
1035
  // Scan the rest of the identifier characters.
1036
142M
  while (isIdentifierPart(m_char) || (m_char == '.' && m_kind == ScannerKind::Yul))
1037
122M
    addLiteralCharAndAdvance();
1038
19.4M
  literal.complete();
1039
1040
19.4M
  auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal);
1041
19.4M
  switch (m_kind)
1042
19.4M
  {
1043
1.09k
  case ScannerKind::SpecialComment:
1044
    // there are no keywords in special comments
1045
1.09k
    return std::make_tuple(Token::Identifier, 0, 0);
1046
1.57M
  case ScannerKind::Solidity:
1047
    // Turn experimental Solidity keywords that are not keywords in legacy Solidity into identifiers.
1048
1.57M
    if (TokenTraits::isExperimentalSolidityOnlyKeyword(std::get<0>(token)))
1049
168
      return std::make_tuple(Token::Identifier, 0, 0);
1050
1.57M
    break;
1051
17.8M
  case ScannerKind::Yul:
1052
    // Turn Solidity identifier into a Yul keyword
1053
17.8M
    if (m_tokens[NextNext].literal == "leave")
1054
78.4k
      return std::make_tuple(Token::Leave, 0, 0);
1055
    // Turn non-Yul keywords into identifiers.
1056
17.8M
    if (!TokenTraits::isYulKeyword(std::get<0>(token)))
1057
14.8M
      return std::make_tuple(Token::Identifier, 0, 0);
1058
2.97M
    break;
1059
2.97M
  case ScannerKind::ExperimentalSolidity:
1060
    // Turn legacy Solidity keywords that are not keywords in experimental Solidity into identifiers.
1061
5.64k
    if (!TokenTraits::isExperimentalSolidityKeyword(std::get<0>(token)))
1062
4.02k
      return std::make_tuple(Token::Identifier, 0, 0);
1063
1.61k
    break;
1064
19.4M
  }
1065
4.54M
  return token;
1066
19.4M
}
1067
1068
} // namespace solidity::langutil