Coverage Report

Created: 2022-08-24 06:40

/src/solidity/liblangutil/Scanner.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * This file is part of solidity.
3
 *
4
 * solidity is free software: you can redistribute it and/or modify
5
 * it under the terms of the GNU General Public License as published by
6
 * the Free Software Foundation, either version 3 of the License, or
7
 * (at your option) any later version.
8
 *
9
 * solidity is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
 * GNU General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU General Public License
15
 * along with solidity.  If not, see <http://www.gnu.org/licenses/>.
16
 *
17
 * This file is derived from the file "scanner.cc", which was part of the
18
 * V8 project. The original copyright header follows:
19
 *
20
 * Copyright 2006-2012, the V8 project authors. All rights reserved.
21
 * Redistribution and use in source and binary forms, with or without
22
 * modification, are permitted provided that the following conditions are
23
 * met:
24
 *
25
 * * Redistributions of source code must retain the above copyright
26
 *   notice, this list of conditions and the following disclaimer.
27
 * * Redistributions in binary form must reproduce the above
28
 *   copyright notice, this list of conditions and the following
29
 *   disclaimer in the documentation and/or other materials provided
30
 *   with the distribution.
31
 * * Neither the name of Google Inc. nor the names of its
32
 *   contributors may be used to endorse or promote products derived
33
 *   from this software without specific prior written permission.
34
 *
35
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
39
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
45
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
*/
47
/**
48
 * @author Christian <c@ethdev.com>
49
 * @date 2014
50
 * Solidity scanner.
51
 */
52
53
#include <liblangutil/Common.h>
54
#include <liblangutil/Exceptions.h>
55
#include <liblangutil/Scanner.h>
56
57
#include <boost/algorithm/string/classification.hpp>
58
59
#include <optional>
60
#include <string_view>
61
#include <tuple>
62
#include <array>
63
64
using namespace std;
65
66
namespace solidity::langutil
67
{
68
69
/// Translates a scanner error code into its human-readable message.
/// Asserts if the code has no message assigned.
string to_string(ScannerError _errorCode)
{
  char const* message = "";
  switch (_errorCode)
  {
    case ScannerError::NoError:
      message = "No error.";
      break;
    case ScannerError::IllegalToken:
      message = "Invalid token.";
      break;
    case ScannerError::IllegalHexString:
      message = "Expected even number of hex-nibbles.";
      break;
    case ScannerError::IllegalHexDigit:
      message = "Hexadecimal digit missing or invalid.";
      break;
    case ScannerError::IllegalCommentTerminator:
      message = "Expected multi-line comment-terminator.";
      break;
    case ScannerError::IllegalEscapeSequence:
      message = "Invalid escape sequence.";
      break;
    case ScannerError::UnicodeCharacterInNonUnicodeString:
      message = "Invalid character in string. If you are trying to use Unicode characters, use a unicode\"...\" string literal.";
      break;
    case ScannerError::IllegalCharacterInString:
      message = "Invalid character in string.";
      break;
    case ScannerError::IllegalStringEndQuote:
      message = "Expected string end-quote.";
      break;
    case ScannerError::IllegalNumberSeparator:
      message = "Invalid use of number separator '_'.";
      break;
    case ScannerError::IllegalExponent:
      message = "Invalid exponent.";
      break;
    case ScannerError::IllegalNumberEnd:
      message = "Identifier-start is not allowed at end of a number.";
      break;
    case ScannerError::OctalNotAllowed:
      message = "Octal numbers not allowed.";
      break;
    case ScannerError::DirectionalOverrideUnderflow:
      message = "Unicode direction override underflow in comment or string literal.";
      break;
    case ScannerError::DirectionalOverrideMismatch:
      message = "Mismatching directional override markers in comment or string literal.";
      break;
    default:
      // Every enumerator is expected to be handled above.
      solAssert(false, "Unhandled case in to_string(ScannerError)");
      break;
  }
  return message;
}
93
94
95
/// Writes the textual description of @a _errorCode to @a os.
ostream& operator<<(ostream& os, ScannerError _errorCode)
{
  os << to_string(_errorCode);
  return os;
}
99
100
/// Scoped helper for literal recording. Automatically drops the literal
101
/// if aborting the scanning before it's complete.
102
enum LiteralType
103
{
104
  LITERAL_TYPE_STRING,
105
  LITERAL_TYPE_NUMBER, // not really different from string type in behaviour
106
  LITERAL_TYPE_COMMENT
107
};
108
109
class LiteralScope
110
{
111
public:
112
  explicit LiteralScope(Scanner* _self, enum LiteralType _type):
113
    m_type(_type),
114
    m_scanner(_self),
115
    m_complete(false)
116
1.31M
  {
117
1.31M
    if (_type == LITERAL_TYPE_COMMENT)
118
0
      m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
119
1.31M
    else
120
1.31M
      m_scanner->m_tokens[Scanner::NextNext].literal.clear();
121
1.31M
  }
122
  ~LiteralScope()
123
1.31M
  {
124
1.31M
    if (!m_complete)
125
0
    {
126
0
      if (m_type == LITERAL_TYPE_COMMENT)
127
0
        m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
128
0
      else
129
0
        m_scanner->m_tokens[Scanner::NextNext].literal.clear();
130
0
    }
131
1.31M
  }
132
1.31M
  void complete() { m_complete = true; }
133
134
private:
135
  enum LiteralType m_type;
136
  Scanner* m_scanner;
137
  bool m_complete;
138
};
139
140
void Scanner::reset()
141
14.7k
{
142
14.7k
  m_source.reset();
143
14.7k
  m_kind = ScannerKind::Solidity;
144
14.7k
  m_char = m_source.get();
145
14.7k
  skipWhitespace();
146
14.7k
  next();
147
14.7k
  next();
148
14.7k
  next();
149
14.7k
}
150
151
void Scanner::setPosition(size_t _offset)
152
0
{
153
0
  m_char = m_source.setPosition(_offset);
154
0
  scanToken();
155
0
  next();
156
0
  next();
157
0
}
158
159
bool Scanner::scanHexByte(char& o_scannedByte)
160
44.4k
{
161
44.4k
  char x = 0;
162
133k
  for (size_t i = 0; i < 2; i++)
163
88.8k
  {
164
88.8k
    int d = hexValue(m_char);
165
88.8k
    if (d < 0)
166
0
    {
167
0
      rollback(i);
168
0
      return false;
169
0
    }
170
88.8k
    x = static_cast<char>(x * 16 + d);
171
88.8k
    advance();
172
88.8k
  }
173
44.4k
  o_scannedByte = x;
174
44.4k
  return true;
175
44.4k
}
176
177
std::optional<unsigned> Scanner::scanUnicode()
178
0
{
179
0
  unsigned x = 0;
180
0
  for (size_t i = 0; i < 4; i++)
181
0
  {
182
0
    int d = hexValue(m_char);
183
0
    if (d < 0)
184
0
    {
185
0
      rollback(i);
186
0
      return {};
187
0
    }
188
0
    x = x * 16 + static_cast<unsigned>(d);
189
0
    advance();
190
0
  }
191
0
  return x;
192
0
}
193
194
// This supports codepoints between 0000 and FFFF.
195
void Scanner::addUnicodeAsUTF8(unsigned codepoint)
196
0
{
197
0
  if (codepoint <= 0x7f)
198
0
    addLiteralChar(char(codepoint));
199
0
  else if (codepoint <= 0x7ff)
200
0
  {
201
0
    addLiteralChar(char(0xc0u | (codepoint >> 6u)));
202
0
    addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
203
0
  }
204
0
  else
205
0
  {
206
0
    addLiteralChar(char(0xe0u | (codepoint >> 12u)));
207
0
    addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu)));
208
0
    addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
209
0
  }
210
0
}
211
212
void Scanner::rescan()
213
39.1k
{
214
39.1k
  size_t rollbackTo = 0;
215
39.1k
  if (m_skippedComments[Current].literal.empty())
216
39.1k
    rollbackTo = static_cast<size_t>(m_tokens[Current].location.start);
217
0
  else
218
0
    rollbackTo = static_cast<size_t>(m_skippedComments[Current].location.start);
219
39.1k
  m_char = m_source.rollback(m_source.position() - rollbackTo);
220
39.1k
  next();
221
39.1k
  next();
222
39.1k
  next();
223
39.1k
}
224
225
// Ensure that tokens can be stored in a byte.
226
BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100);
227
228
/// Advances the scanner by one token: shifts the look-ahead slots of both
/// the tokens and the skipped comments by one and scans a new token.
/// @returns the token that is current after the shift.
Token Scanner::next()
{
  auto const shiftSlots = [](auto& _slots)
  {
    _slots[Current] = std::move(_slots[Next]);
    _slots[Next] = std::move(_slots[NextNext]);
  };
  shiftSlots(m_tokens);
  shiftSlots(m_skippedComments);

  scanToken();

  return m_tokens[Current].token;
}
239
240
/// Consumes the current character and then chooses between two tokens:
/// if the following character equals @a _next, @a _then is selected via
/// selectToken(Token); otherwise @a _else is returned as is.
Token Scanner::selectToken(char _next, Token _then, Token _else)
{
  advance();
  return m_char == _next ? selectToken(_then) : _else;
}
248
249
bool Scanner::skipWhitespace()
250
1.35M
{
251
1.35M
  size_t const startPosition = sourcePos();
252
2.58M
  while (isWhiteSpace(m_char))
253
1.23M
    advance();
254
  // Return whether or not we skipped any characters.
255
1.35M
  return sourcePos() != startPosition;
256
1.35M
}
257
258
bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
259
0
{
260
0
  size_t const startPosition = sourcePos();
261
0
  while (isWhiteSpace(m_char) && !isUnicodeLinebreak())
262
0
    advance();
263
  // Return whether or not we skipped any characters.
264
0
  return sourcePos() != startPosition;
265
0
}
266
267
268
namespace
269
{
270
271
/// Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth.
272
///
273
/// @returns ScannerError::NoError in case of successful parsing and directional encodings are paired
274
///          and error code in case the input's lexical parser state is invalid and this error should be reported
275
///          to the user.
276
static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
277
0
{
278
0
  static array<pair<string_view, int>, 5> constexpr directionalSequences{
279
0
    pair<string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
280
0
    pair<string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
281
0
    pair<string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
282
0
    pair<string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
283
0
    pair<string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting
284
0
  };
285
286
0
  size_t endPosition = _stream.position();
287
0
  _stream.setPosition(_startPosition);
288
289
0
  int directionOverrideDepth = 0;
290
291
0
  for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
292
0
  {
293
0
    _stream.setPosition(currentPos);
294
295
0
    for (auto const& [sequence, depthChange]: directionalSequences)
296
0
      if (_stream.prefixMatch(sequence))
297
0
        directionOverrideDepth += depthChange;
298
299
0
    if (directionOverrideDepth < 0)
300
0
      return ScannerError::DirectionalOverrideUnderflow;
301
0
  }
302
303
0
  _stream.setPosition(endPosition);
304
305
0
  return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatch : ScannerError::NoError;
306
0
}
307
308
}
309
310
/// Skips the rest of a single-line comment and checks the skipped range
/// for unbalanced directional override markers.
/// @returns Token::Whitespace, or the result of setError() if that check fails.
Token Scanner::skipSingleLineComment()
{
  size_t const commentStart = m_source.position();

  // Line terminator is not part of the comment. If it is a
  // non-ascii line terminator, it will result in a parser error.
  while (!isUnicodeLinebreak() && advance())
  {
  }

  ScannerError const bidiStatus = validateBiDiMarkup(m_source, commentStart);
  return bidiStatus == ScannerError::NoError ? Token::Whitespace : setError(bidiStatus);
}
325
326
bool Scanner::atEndOfLine() const
327
0
{
328
0
  return m_char == '\n' || m_char == '\r';
329
0
}
330
331
bool Scanner::tryScanEndOfLine()
332
0
{
333
0
  if (m_char == '\n')
334
0
  {
335
0
    advance();
336
0
    return true;
337
0
  }
338
339
0
  if (m_char == '\r')
340
0
  {
341
0
    if (advance() && m_char == '\n')
342
0
      advance();
343
0
    return true;
344
0
  }
345
346
0
  return false;
347
0
}
348
349
size_t Scanner::scanSingleLineDocComment()
350
0
{
351
0
  LiteralScope literal(this, LITERAL_TYPE_COMMENT);
352
0
  size_t endPosition = m_source.position();
353
354
0
  skipWhitespaceExceptUnicodeLinebreak();
355
356
0
  while (!isSourcePastEndOfInput())
357
0
  {
358
0
    endPosition = m_source.position();
359
0
    if (tryScanEndOfLine())
360
0
    {
361
      // Check if next line is also a single-line comment.
362
      // If any whitespaces were skipped, use source position before.
363
0
      if (!skipWhitespaceExceptUnicodeLinebreak())
364
0
        endPosition = m_source.position();
365
366
0
      if (!m_source.isPastEndOfInput(3) &&
367
0
        m_source.get(0) == '/' &&
368
0
        m_source.get(1) == '/' &&
369
0
        m_source.get(2) == '/')
370
0
      {
371
0
        if (!m_source.isPastEndOfInput(4) && m_source.get(3) == '/')
372
0
          break; // "////" is not a documentation comment
373
0
        m_char = m_source.advanceAndGet(3);
374
0
        if (atEndOfLine())
375
0
          continue;
376
0
        addCommentLiteralChar('\n');
377
0
      }
378
0
      else
379
0
        break; // next line is not a documentation comment, we are done
380
0
    }
381
0
    else if (isUnicodeLinebreak())
382
      // Any line terminator that is not '\n' is considered to end the
383
      // comment.
384
0
      break;
385
0
    addCommentLiteralChar(m_char);
386
0
    advance();
387
0
  }
388
0
  literal.complete();
389
0
  return endPosition;
390
0
}
391
392
/// Skips a multi-line comment up to and including its closing "*/" and checks
/// the skipped range for unbalanced directional override markers.
/// @returns Token::Whitespace on success, otherwise the result of setError()
///          (failed marker check or missing terminator).
Token Scanner::skipMultiLineComment()
{
  size_t const commentStart = m_source.position();

  while (!isSourcePastEndOfInput())
  {
    char const previous = m_char;
    advance();

    bool const terminatorSeen = previous == '*' && m_char == '/';
    if (!terminatorSeen)
      continue;

    ScannerError const bidiStatus = validateBiDiMarkup(m_source, commentStart);
    if (bidiStatus != ScannerError::NoError)
      return setError(bidiStatus);

    // We have reached the end of the multi-line comment: the '/' is
    // replaced by a whitespace. This way all multi-line comments are
    // treated as whitespace.
    m_char = ' ';
    return Token::Whitespace;
  }

  // Unterminated multi-line comment.
  return setError(ScannerError::IllegalCommentTerminator);
}
416
417
/// Records the text of a multi-line documentation comment. A single leading
/// '*' on continuation lines is dropped and lines are joined by '\n'.
/// @returns Token::CommentLiteral if the closing "*/" was found, otherwise
///          the result of setError(ScannerError::IllegalCommentTerminator).
Token Scanner::scanMultiLineDocComment()
{
  LiteralScope literal(this, LITERAL_TYPE_COMMENT);
  bool endFound = false;
  bool charsAdded = false;

  // Skip leading whitespace on the first line, but stay in front of a line end.
  for (; isWhiteSpace(m_char) && !atEndOfLine(); advance())
  {
  }

  while (!isSourcePastEndOfInput())
  {
    // handle newlines in multiline comments
    if (atEndOfLine())
    {
      skipWhitespace();

      bool const starAhead = !m_source.isPastEndOfInput(1) && m_source.get(0) == '*';
      if (starAhead)
      {
        char const afterStar = m_source.get(1);
        if (afterStar == '*')
        {
          // it is unknown if this leads to the end of the comment
          addCommentLiteralChar('*');
          advance();
        }
        else if (afterStar == '/')
        {
          // if after newline the comment ends, don't insert the newline
          m_char = m_source.advanceAndGet(2);
          endFound = true;
          break;
        }
        else
        {
          // skip first '*' in subsequent lines
          m_char = m_source.advanceAndGet(1);
          if (atEndOfLine()) // ignores empty lines
            continue;
          if (charsAdded)
            addCommentLiteralChar('\n'); // corresponds to the end of previous line
        }
      }
      else if (charsAdded)
        addCommentLiteralChar('\n');
    }

    bool const terminatorAhead =
      !m_source.isPastEndOfInput(1) &&
      m_source.get(0) == '*' &&
      m_source.get(1) == '/';
    if (terminatorAhead)
    {
      m_char = m_source.advanceAndGet(2);
      endFound = true;
      break;
    }

    addCommentLiteralChar(m_char);
    charsAdded = true;
    advance();
  }

  literal.complete();
  return endFound ? Token::CommentLiteral : setError(ScannerError::IllegalCommentTerminator);
}
471
472
/// Scans everything that can start with '/': the operators "/" and "/=",
/// single-line comments, multi-line comments and both forms of
/// documentation comments. Documentation comments are stored in
/// m_skippedComments[NextNext]; all comments are reported as
/// Token::Whitespace unless an error occurs.
Token Scanner::scanSlash()
{
  int const firstSlashPosition = static_cast<int>(sourcePos());
  advance();

  switch (m_char)
  {
  case '/':
  {
    // double slash comment directly before EOS
    if (!advance())
      return Token::Whitespace;
    if (m_char != '/')
      return skipSingleLineComment();

    advance(); // consume the last '/' at ///

    // "////"
    if (m_char == '/')
      return skipSingleLineComment();

    // doxygen style /// comment
    m_skippedComments[NextNext].location.start = firstSlashPosition;
    m_skippedComments[NextNext].location.sourceName = m_sourceName;
    m_skippedComments[NextNext].token = Token::CommentLiteral;
    m_skippedComments[NextNext].location.end = static_cast<int>(scanSingleLineDocComment());
    return Token::Whitespace;
  }
  case '*':
  {
    // slash star comment before EOS
    if (!advance())
      return setError(ScannerError::IllegalCommentTerminator);
    if (m_char != '*')
      return skipMultiLineComment();

    advance(); // consume the last '*' at /**

    // "/**/"
    if (m_char == '/')
    {
      advance(); // skip the closing slash
      return Token::Whitespace;
    }

    // "/***"
    // "/***/" may be interpreted as empty natspec or skipped; skipping is simpler
    if (m_char == '*')
      return skipMultiLineComment();

    // we actually have a multiline documentation comment (doxygen style /** natspec)
    m_skippedComments[NextNext].location.start = firstSlashPosition;
    m_skippedComments[NextNext].location.sourceName = m_sourceName;
    Token const comment = scanMultiLineDocComment();
    m_skippedComments[NextNext].location.end = static_cast<int>(sourcePos());
    m_skippedComments[NextNext].token = comment;

    // In the Illegal case the error has already been set.
    return comment == Token::Illegal ? Token::Illegal : Token::Whitespace;
  }
  case '=':
    return selectToken(Token::AssignDiv);
  default:
    return Token::Div;
  }
}
535
536
void Scanner::scanToken()
537
2.97M
{
538
2.97M
  m_tokens[NextNext] = {};
539
2.97M
  m_skippedComments[NextNext] = {};
540
541
2.97M
  Token token;
542
  // M and N are for the purposes of grabbing different type sizes
543
2.97M
  unsigned m;
544
2.97M
  unsigned n;
545
2.97M
  do
546
4.21M
  {
547
    // Remember the position of the next token
548
4.21M
    m_tokens[NextNext].location.start = static_cast<int>(sourcePos());
549
4.21M
    switch (m_char)
550
4.21M
    {
551
17.2k
    case '"':
552
17.2k
    case '\'':
553
17.2k
      token = scanString(false);
554
17.2k
      break;
555
0
    case '<':
556
      // < <= << <<=
557
0
      advance();
558
0
      if (m_char == '=')
559
0
        token = selectToken(Token::LessThanOrEqual);
560
0
      else if (m_char == '<')
561
0
        token = selectToken('=', Token::AssignShl, Token::SHL);
562
0
      else
563
0
        token = Token::LessThan;
564
0
      break;
565
0
    case '>':
566
      // > >= >> >>= >>> >>>=
567
0
      advance();
568
0
      if (m_char == '=')
569
0
        token = selectToken(Token::GreaterThanOrEqual);
570
0
      else if (m_char == '>')
571
0
      {
572
        // >> >>= >>> >>>=
573
0
        advance();
574
0
        if (m_char == '=')
575
0
          token = selectToken(Token::AssignSar);
576
0
        else if (m_char == '>')
577
0
          token = selectToken('=', Token::AssignShr, Token::SHR);
578
0
        else
579
0
          token = Token::SAR;
580
0
      }
581
0
      else
582
0
        token = Token::GreaterThan;
583
0
      break;
584
0
    case '=':
585
      // = == =>
586
0
      advance();
587
0
      if (m_char == '=')
588
0
        token = selectToken(Token::Equal);
589
0
      else if (m_char == '>')
590
0
        token = selectToken(Token::DoubleArrow);
591
0
      else
592
0
        token = Token::Assign;
593
0
      break;
594
0
    case '!':
595
      // ! !=
596
0
      advance();
597
0
      if (m_char == '=')
598
0
        token = selectToken(Token::NotEqual);
599
0
      else
600
0
        token = Token::Not;
601
0
      break;
602
0
    case '+':
603
      // + ++ +=
604
0
      advance();
605
0
      if (m_char == '+')
606
0
        token = selectToken(Token::Inc);
607
0
      else if (m_char == '=')
608
0
        token = selectToken(Token::AssignAdd);
609
0
      else
610
0
        token = Token::Add;
611
0
      break;
612
24.8k
    case '-':
613
      // - -- -= ->
614
24.8k
      advance();
615
24.8k
      if (m_char == '-')
616
0
        token = selectToken(Token::Dec);
617
24.8k
      else if (m_char == '=')
618
0
        token = selectToken(Token::AssignSub);
619
24.8k
      else if (m_char == '>')
620
24.8k
        token = selectToken(Token::RightArrow);
621
0
      else
622
0
        token = Token::Sub;
623
24.8k
      break;
624
0
    case '*':
625
      // * ** *=
626
0
      advance();
627
0
      if (m_char == '*')
628
0
        token = selectToken(Token::Exp);
629
0
      else if (m_char == '=')
630
0
        token = selectToken(Token::AssignMul);
631
0
      else
632
0
        token = Token::Mul;
633
0
      break;
634
0
    case '%':
635
      // % %=
636
0
      token = selectToken('=', Token::AssignMod, Token::Mod);
637
0
      break;
638
0
    case '/':
639
      // /  // /* /=
640
0
      token = scanSlash();
641
0
      break;
642
0
    case '&':
643
      // & && &=
644
0
      advance();
645
0
      if (m_char == '&')
646
0
        token = selectToken(Token::And);
647
0
      else if (m_char == '=')
648
0
        token = selectToken(Token::AssignBitAnd);
649
0
      else
650
0
        token = Token::BitAnd;
651
0
      break;
652
0
    case '|':
653
      // | || |=
654
0
      advance();
655
0
      if (m_char == '|')
656
0
        token = selectToken(Token::Or);
657
0
      else if (m_char == '=')
658
0
        token = selectToken(Token::AssignBitOr);
659
0
      else
660
0
        token = Token::BitOr;
661
0
      break;
662
0
    case '^':
663
      // ^ ^=
664
0
      token = selectToken('=', Token::AssignBitXor, Token::BitXor);
665
0
      break;
666
0
    case '.':
667
      // . Number
668
0
      advance();
669
0
      if (isDecimalDigit(m_char))
670
0
        token = scanNumber('.');
671
0
      else
672
0
        token = Token::Period;
673
0
      break;
674
53.8k
    case ':':
675
      // : :=
676
53.8k
      advance();
677
53.8k
      if (m_char == '=')
678
53.8k
        token = selectToken(Token::AssemblyAssign);
679
0
      else
680
0
        token = Token::Colon;
681
53.8k
      break;
682
0
    case ';':
683
0
      token = selectToken(Token::Semicolon);
684
0
      break;
685
495k
    case ',':
686
495k
      token = selectToken(Token::Comma);
687
495k
      break;
688
343k
    case '(':
689
343k
      token = selectToken(Token::LParen);
690
343k
      break;
691
343k
    case ')':
692
343k
      token = selectToken(Token::RParen);
693
343k
      break;
694
0
    case '[':
695
0
      token = selectToken(Token::LBrack);
696
0
      break;
697
0
    case ']':
698
0
      token = selectToken(Token::RBrack);
699
0
      break;
700
167k
    case '{':
701
167k
      token = selectToken(Token::LBrace);
702
167k
      break;
703
134k
    case '}':
704
134k
      token = selectToken(Token::RBrace);
705
134k
      break;
706
0
    case '?':
707
0
      token = selectToken(Token::Conditional);
708
0
      break;
709
0
    case '~':
710
0
      token = selectToken(Token::BitNot);
711
0
      break;
712
2.63M
    default:
713
2.63M
      if (isIdentifierStart(m_char))
714
900k
      {
715
900k
        tie(token, m, n) = scanIdentifierOrKeyword();
716
717
        // Special case for hexadecimal literals
718
900k
        if (token == Token::Hex)
719
3.27k
        {
720
          // reset
721
3.27k
          m = 0;
722
3.27k
          n = 0;
723
724
          // Special quoted hex string must follow
725
3.27k
          if (m_char == '"' || m_char == '\'')
726
3.27k
            token = scanHexString();
727
0
          else
728
0
            token = setError(ScannerError::IllegalToken);
729
3.27k
        }
730
897k
        else if (token == Token::Unicode && m_kind != ScannerKind::Yul)
731
0
        {
732
          // reset
733
0
          m = 0;
734
0
          n = 0;
735
736
          // Special quoted hex string must follow
737
0
          if (m_char == '"' || m_char == '\'')
738
0
            token = scanString(true);
739
0
          else
740
0
            token = setError(ScannerError::IllegalToken);
741
0
        }
742
900k
      }
743
1.73M
      else if (isDecimalDigit(m_char))
744
393k
        token = scanNumber();
745
1.33M
      else if (skipWhitespace())
746
1.23M
        token = Token::Whitespace;
747
100k
      else if (isSourcePastEndOfInput())
748
100k
        token = Token::EOS;
749
0
      else
750
0
        token = selectErrorToken(ScannerError::IllegalToken);
751
2.63M
      break;
752
4.21M
    }
753
    // Continue scanning for tokens as long as we're just skipping
754
    // whitespace.
755
4.21M
  }
756
4.21M
  while (token == Token::Whitespace);
757
2.97M
  m_tokens[NextNext].location.end = static_cast<int>(sourcePos());
758
2.97M
  m_tokens[NextNext].location.sourceName = m_sourceName;
759
2.97M
  m_tokens[NextNext].token = token;
760
2.97M
  m_tokens[NextNext].extendedTokenInfo = make_tuple(m, n);
761
2.97M
}
762
763
bool Scanner::scanEscape()
764
0
{
765
0
  char c = m_char;
766
767
  // Skip escaped newlines.
768
0
  if (tryScanEndOfLine())
769
0
    return true;
770
0
  advance();
771
772
0
  switch (c)
773
0
  {
774
0
  case '\'':  // fall through
775
0
  case '"':  // fall through
776
0
  case '\\':
777
0
    break;
778
0
  case 'n':
779
0
    c = '\n';
780
0
    break;
781
0
  case 'r':
782
0
    c = '\r';
783
0
    break;
784
0
  case 't':
785
0
    c = '\t';
786
0
    break;
787
0
  case 'u':
788
0
  {
789
0
    if (auto const codepoint = scanUnicode(); codepoint.has_value())
790
0
      addUnicodeAsUTF8(*codepoint);
791
0
    else
792
0
      return false;
793
0
    return true;
794
0
  }
795
0
  case 'x':
796
0
    if (!scanHexByte(c))
797
0
      return false;
798
0
    break;
799
0
  default:
800
0
    return false;
801
0
  }
802
803
0
  addLiteralChar(c);
804
0
  return true;
805
0
}
806
807
bool Scanner::isUnicodeLinebreak()
808
126k
{
809
126k
  if (0x0a <= m_char && m_char <= 0x0d)
810
    // line feed, vertical tab, form feed, carriage return
811
0
    return true;
812
126k
  if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85)
813
    // NEL - U+0085, C2 85 in utf8
814
0
    return true;
815
126k
  if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && (
816
0
    uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9
817
0
  ))
818
    // LS - U+2028, E2 80 A8  in utf8
819
    // PS - U+2029, E2 80 A9  in utf8
820
0
    return true;
821
126k
  return false;
822
126k
}
823
824
/// Scans a quoted string literal; the current character is the opening quote.
/// @param _isUnicode if true, characters outside of printable ASCII are
///        accepted and the literal is checked for unbalanced directional
///        override markers instead.
/// @returns Token::StringLiteral or Token::UnicodeStringLiteral on success,
///          otherwise the result of setError().
Token Scanner::scanString(bool const _isUnicode)
{
  size_t const literalStart = m_source.position();
  char const quote = m_char;
  advance();  // consume quote

  LiteralScope literal(this, LITERAL_TYPE_STRING);
  while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
  {
    char const current = m_char;
    advance();

    if (current == '\\')
    {
      if (isSourcePastEndOfInput() || !scanEscape())
        return setError(ScannerError::IllegalEscapeSequence);
      continue;
    }

    if (!_isUnicode)
    {
      // Report error on non-printable characters in string literals, however
      // allow anything for unicode string literals, because their validity will
      // be verified later (in the syntax checker).
      //
      // We are using a manual range and not isprint() to avoid
      // any potential complications with locale.
      unsigned const code = static_cast<unsigned>(current);
      bool const printableAscii = code > 0x1f && code < 0x7f;
      if (!printableAscii)
        return setError(
          m_kind == ScannerKind::Yul ?
          ScannerError::IllegalCharacterInString :
          ScannerError::UnicodeCharacterInNonUnicodeString
        );
    }

    addLiteralChar(current);
  }

  // The loop may also have stopped at the end of input or at a line break.
  if (m_char != quote)
    return setError(ScannerError::IllegalStringEndQuote);

  if (_isUnicode)
  {
    ScannerError const bidiStatus = validateBiDiMarkup(m_source, literalStart);
    if (bidiStatus != ScannerError::NoError)
      return setError(bidiStatus);
  }

  literal.complete();
  advance();  // consume quote
  return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral;
}
870
871
/// Scans the quoted part of a hex string literal; the current character is
/// the opening quote. The content consists of pairs of hex digits, where a
/// single underscore is allowed between two pairs.
/// @returns Token::HexStringLiteral on success, otherwise the result of setError().
Token Scanner::scanHexString()
{
  char const quote = m_char;
  advance();  // consume quote

  LiteralScope literal(this, LITERAL_TYPE_STRING);
  // An underscore is only valid directly after a complete byte.
  bool underscoreAllowed = false;

  while (m_char != quote && !isSourcePastEndOfInput())
  {
    char const ahead = m_char;
    char scannedByte = ahead;

    if (scanHexByte(scannedByte))
    {
      addLiteralChar(scannedByte);
      underscoreAllowed = true;
      continue;
    }

    if (ahead != '_')
      return setError(ScannerError::IllegalHexString);

    advance();
    // Reject leading, doubled and trailing underscores.
    if (!underscoreAllowed || m_char == quote)
      return setError(ScannerError::IllegalNumberSeparator);
    underscoreAllowed = false;
  }

  if (m_char != quote)
    return setError(ScannerError::IllegalStringEndQuote);

  literal.complete();
  advance();  // consume quote
  return Token::HexStringLiteral;
}
904
905
// Parse for regex [:digit:]+(_[:digit:]+)*
906
void Scanner::scanDecimalDigits()
907
149k
{
908
  // MUST begin with a decimal digit.
909
149k
  if (!isDecimalDigit(m_char))
910
10.7k
    return;
911
912
  // May continue with decimal digit or underscore for grouping.
913
139k
  do
914
438k
    addLiteralCharAndAdvance();
915
438k
  while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_'));
916
917
  // Defer further validation of underscore to SyntaxChecker.
918
139k
}
919
920
/// Scans a numeric literal (decimal, optionally with fraction and exponent, or hexadecimal).
/// @param _charSeen either '.' if the caller has already consumed the decimal point of a
///        literal that starts with its fractional part, or 0 otherwise (asserted).
/// @returns Token::Number on success, otherwise the result of setError().
/// Underscores inside the digits are kept in the literal and validated later.
Token Scanner::scanNumber(char _charSeen)
{
  enum { DECIMAL, HEX, BINARY } kind = DECIMAL;
  LiteralScope literal(this, LITERAL_TYPE_NUMBER);

  // True if there is a character after the current one and it is an underscore.
  auto const underscoreFollows = [this]() -> bool
  {
    return !m_source.isPastEndOfInput(1) && m_source.get(1) == '_';
  };
  // Finalizes the literal scanned so far and yields the number token.
  auto const finishNumber = [&literal]() -> Token
  {
    literal.complete();
    return Token::Number;
  };

  if (_charSeen != '.')
  {
    solAssert(_charSeen == 0, "");
    // A leading '0' needs special care: it may start a hex number, and octals are rejected.
    if (m_char == '0')
    {
      addLiteralCharAndAdvance();
      // What follows is one of: nothing (plain 0), an exponent, a fraction or a hex number.
      if (m_char == 'x')
      {
        kind = HEX;
        addLiteralCharAndAdvance();
        // At least one hex digit is required after the 'x'.
        if (!isHexDigit(m_char))
          return setError(ScannerError::IllegalHexDigit);

        // Underscores are kept in the literal for later validation.
        while (isHexDigit(m_char) || m_char == '_')
          addLiteralCharAndAdvance();
      }
      else if (isDecimalDigit(m_char))
        // Octal numbers are not allowed.
        return setError(ScannerError::OctalNotAllowed);
    }

    // Decimal digits, optionally followed by a fractional part.
    if (kind == DECIMAL)
    {
      scanDecimalDigits();  // may consume nothing
      if (m_char == '.')
      {
        if (underscoreFollows())
        {
          // The input may be a floating point number whose fraction part starts with '_'.
          // Recover by consuming the '.', the '_' and the digits that follow.
          addLiteralCharAndAdvance();
          addLiteralCharAndAdvance();
          scanDecimalDigits();
        }
        // A '.' only belongs to the number if a digit comes right after it.
        if (m_source.isPastEndOfInput() || !isDecimalDigit(m_source.get(1)))
          return finishNumber();
        addLiteralCharAndAdvance();
        scanDecimalDigits();
      }
    }
  }
  else
  {
    // The decimal point of the number has already been consumed by the caller.
    addLiteralChar('.');
    if (m_char == '_')
      return setError(ScannerError::IllegalToken);
    scanDecimalDigits();  // the caller is expected to have seen at least one digit
  }

  // Optional exponent.
  if (m_char == 'e' || m_char == 'E')
  {
    solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number");
    if (kind != DECIMAL)
      return setError(ScannerError::IllegalExponent);
    else if (underscoreFollows())
    {
      // An underscore was wrongly placed right after the exponent marker.
      // Recover by consuming the marker, the underscore and the digits up to the end.
      addLiteralCharAndAdvance();
      addLiteralCharAndAdvance();
      scanDecimalDigits();
      return finishNumber();
    }

    addLiteralCharAndAdvance();  // the 'e' or 'E' itself
    if (m_char == '+' || m_char == '-')
      addLiteralCharAndAdvance();
    // The exponent needs at least one decimal digit.
    if (!isDecimalDigit(m_char))
      return setError(ScannerError::IllegalExponent);
    scanDecimalDigits();
  }

  // A numeric literal must not be directly followed by an identifier start or
  // a decimal digit; see ECMA-262 section 7.8.3, page 17 (only a single
  // decimal digit is read if the value is 0).
  if (isDecimalDigit(m_char) || isIdentifierStart(m_char))
    return setError(ScannerError::IllegalNumberEnd);
  return finishNumber();
}
1013
1014
/// Scans an identifier or keyword starting at the current character, which must be
/// a valid identifier start (asserted). In Yul mode a '.' is accepted inside the identifier.
/// @returns the tuple produced by TokenTraits::fromIdentifierOrKeyword for the scanned
///          text, except in Yul mode where "leave" maps to Token::Leave and every
///          keyword that is not a Yul keyword is demoted to Token::Identifier.
tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
{
  solAssert(isIdentifierStart(m_char), "");
  LiteralScope literal(this, LITERAL_TYPE_STRING);
  // The first character is always consumed, then the rest of the identifier characters.
  do
    addLiteralCharAndAdvance();
  while (isIdentifierPart(m_char) || (m_char == '.' && m_kind == ScannerKind::Yul));
  literal.complete();

  auto const& scannedText = m_tokens[NextNext].literal;
  auto const classified = TokenTraits::fromIdentifierOrKeyword(scannedText);

  // Outside of Yul the classification is used as is.
  if (m_kind != ScannerKind::Yul)
    return classified;

  // Turn the Solidity identifier into a Yul keyword.
  if (scannedText == "leave")
    return std::make_tuple(Token::Leave, 0, 0);

  // Keywords that are not Yul keywords become plain identifiers.
  if (!TokenTraits::isYulKeyword(std::get<0>(classified)))
    return std::make_tuple(Token::Identifier, 0, 0);

  return classified;
}
1035
1036
} // namespace solidity::langutil