Coverage Report

Created: 2022-08-24 06:43

/src/solidity/liblangutil/Scanner.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * This file is part of solidity.
3
 *
4
 * solidity is free software: you can redistribute it and/or modify
5
 * it under the terms of the GNU General Public License as published by
6
 * the Free Software Foundation, either version 3 of the License, or
7
 * (at your option) any later version.
8
 *
9
 * solidity is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
 * GNU General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU General Public License
15
 * along with solidity.  If not, see <http://www.gnu.org/licenses/>.
16
 *
17
 * This file is derived from the file "scanner.cc", which was part of the
18
 * V8 project. The original copyright header follows:
19
 *
20
 * Copyright 2006-2012, the V8 project authors. All rights reserved.
21
 * Redistribution and use in source and binary forms, with or without
22
 * modification, are permitted provided that the following conditions are
23
 * met:
24
 *
25
 * * Redistributions of source code must retain the above copyright
26
 *   notice, this list of conditions and the following disclaimer.
27
 * * Redistributions in binary form must reproduce the above
28
 *   copyright notice, this list of conditions and the following
29
 *   disclaimer in the documentation and/or other materials provided
30
 *   with the distribution.
31
 * * Neither the name of Google Inc. nor the names of its
32
 *   contributors may be used to endorse or promote products derived
33
 *   from this software without specific prior written permission.
34
 *
35
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
39
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
45
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
*/
47
/**
48
 * @author Christian <c@ethdev.com>
49
 * @date 2014
50
 * Solidity scanner.
51
 */
52
53
#include <liblangutil/Common.h>
54
#include <liblangutil/Exceptions.h>
55
#include <liblangutil/Scanner.h>
56
57
#include <boost/algorithm/string/classification.hpp>
58
59
#include <optional>
60
#include <string_view>
61
#include <tuple>
62
#include <array>
63
64
using namespace std;
65
66
namespace solidity::langutil
67
{
68
69
/// Translates a scanner error code into the diagnostic text shown to the user.
///
/// @param _errorCode the error to describe.
/// @returns the message associated with @a _errorCode. An unknown code trips
///          the assertion below; the empty string is only a formal fallback.
string to_string(ScannerError _errorCode)
{
  char const* message = nullptr;
  switch (_errorCode)
  {
    case ScannerError::NoError: message = "No error."; break;
    case ScannerError::IllegalToken: message = "Invalid token."; break;
    case ScannerError::IllegalHexString: message = "Expected even number of hex-nibbles."; break;
    case ScannerError::IllegalHexDigit: message = "Hexadecimal digit missing or invalid."; break;
    case ScannerError::IllegalCommentTerminator: message = "Expected multi-line comment-terminator."; break;
    case ScannerError::IllegalEscapeSequence: message = "Invalid escape sequence."; break;
    case ScannerError::UnicodeCharacterInNonUnicodeString: message = "Invalid character in string. If you are trying to use Unicode characters, use a unicode\"...\" string literal."; break;
    case ScannerError::IllegalCharacterInString: message = "Invalid character in string."; break;
    case ScannerError::IllegalStringEndQuote: message = "Expected string end-quote."; break;
    case ScannerError::IllegalNumberSeparator: message = "Invalid use of number separator '_'."; break;
    case ScannerError::IllegalExponent: message = "Invalid exponent."; break;
    case ScannerError::IllegalNumberEnd: message = "Identifier-start is not allowed at end of a number."; break;
    case ScannerError::OctalNotAllowed: message = "Octal numbers not allowed."; break;
    case ScannerError::DirectionalOverrideUnderflow: message = "Unicode direction override underflow in comment or string literal."; break;
    case ScannerError::DirectionalOverrideMismatch: message = "Mismatching directional override markers in comment or string literal."; break;
    default: break;
  }

  if (message != nullptr)
    return message;

  // Every enumerator is expected to be covered by the switch above.
  solAssert(false, "Unhandled case in to_string(ScannerError)");
  return "";
}
93
94
95
/// Streams the human-readable message of @a _errorCode (see to_string) into @a os.
/// @returns @a os, to allow chaining.
ostream& operator<<(ostream& os, ScannerError _errorCode)
{
  os << to_string(_errorCode);
  return os;
}
99
100
/// Kind of literal being recorded. It is consumed by LiteralScope, the scoped
/// helper for literal recording that automatically drops the literal if the
/// scanning is aborted before it is complete.
enum LiteralType
{
  /// String-like literal, stored in the token slot.
  LITERAL_TYPE_STRING,
  /// Number literal; not really different from the string type in behaviour.
  LITERAL_TYPE_NUMBER,
  /// Documentation comment, stored in the skipped-comment slot.
  LITERAL_TYPE_COMMENT,
};
108
109
class LiteralScope
110
{
111
public:
112
  explicit LiteralScope(Scanner* _self, enum LiteralType _type):
113
    m_type(_type),
114
    m_scanner(_self),
115
    m_complete(false)
116
4.32M
  {
117
4.32M
    if (_type == LITERAL_TYPE_COMMENT)
118
91.6k
      m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
119
4.22M
    else
120
4.22M
      m_scanner->m_tokens[Scanner::NextNext].literal.clear();
121
4.32M
  }
122
  ~LiteralScope()
123
4.32M
  {
124
4.32M
    if (!m_complete)
125
2.85k
    {
126
2.85k
      if (m_type == LITERAL_TYPE_COMMENT)
127
0
        m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
128
2.85k
      else
129
2.85k
        m_scanner->m_tokens[Scanner::NextNext].literal.clear();
130
2.85k
    }
131
4.32M
  }
132
4.31M
  void complete() { m_complete = true; }
133
134
private:
135
  enum LiteralType m_type;
136
  Scanner* m_scanner;
137
  bool m_complete;
138
};
139
140
void Scanner::reset()
141
67.6k
{
142
67.6k
  m_source.reset();
143
67.6k
  m_kind = ScannerKind::Solidity;
144
67.6k
  m_char = m_source.get();
145
67.6k
  skipWhitespace();
146
67.6k
  next();
147
67.6k
  next();
148
67.6k
  next();
149
67.6k
}
150
151
void Scanner::setPosition(size_t _offset)
152
0
{
153
0
  m_char = m_source.setPosition(_offset);
154
0
  scanToken();
155
0
  next();
156
0
  next();
157
0
}
158
159
bool Scanner::scanHexByte(char& o_scannedByte)
160
522k
{
161
522k
  char x = 0;
162
1.56M
  for (size_t i = 0; i < 2; i++)
163
1.04M
  {
164
1.04M
    int d = hexValue(m_char);
165
1.04M
    if (d < 0)
166
186
    {
167
186
      rollback(i);
168
186
      return false;
169
186
    }
170
1.04M
    x = static_cast<char>(x * 16 + d);
171
1.04M
    advance();
172
1.04M
  }
173
522k
  o_scannedByte = x;
174
522k
  return true;
175
522k
}
176
177
std::optional<unsigned> Scanner::scanUnicode()
178
912
{
179
912
  unsigned x = 0;
180
3.45k
  for (size_t i = 0; i < 4; i++)
181
2.95k
  {
182
2.95k
    int d = hexValue(m_char);
183
2.95k
    if (d < 0)
184
410
    {
185
410
      rollback(i);
186
410
      return {};
187
410
    }
188
2.54k
    x = x * 16 + static_cast<unsigned>(d);
189
2.54k
    advance();
190
2.54k
  }
191
502
  return x;
192
912
}
193
194
// This supports codepoints between 0000 and FFFF.
195
void Scanner::addUnicodeAsUTF8(unsigned codepoint)
196
502
{
197
502
  if (codepoint <= 0x7f)
198
107
    addLiteralChar(char(codepoint));
199
395
  else if (codepoint <= 0x7ff)
200
78
  {
201
78
    addLiteralChar(char(0xc0u | (codepoint >> 6u)));
202
78
    addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
203
78
  }
204
317
  else
205
317
  {
206
317
    addLiteralChar(char(0xe0u | (codepoint >> 12u)));
207
317
    addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu)));
208
317
    addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
209
317
  }
210
502
}
211
212
void Scanner::rescan()
213
94.9k
{
214
94.9k
  size_t rollbackTo = 0;
215
94.9k
  if (m_skippedComments[Current].literal.empty())
216
91.9k
    rollbackTo = static_cast<size_t>(m_tokens[Current].location.start);
217
2.95k
  else
218
2.95k
    rollbackTo = static_cast<size_t>(m_skippedComments[Current].location.start);
219
94.9k
  m_char = m_source.rollback(m_source.position() - rollbackTo);
220
94.9k
  next();
221
94.9k
  next();
222
94.9k
  next();
223
94.9k
}
224
225
// Ensure that tokens can be stored in a byte.
226
BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100);
227
228
/// Advances the token window by one: Next becomes Current, NextNext becomes
/// Next, and a fresh token is scanned into NextNext. The recorded comments
/// are shifted in lockstep.
///
/// @returns the token that is now current.
Token Scanner::next()
{
  auto const shiftWindow = [](auto& _slots)
  {
    _slots[Current] = std::move(_slots[Next]);
    _slots[Next] = std::move(_slots[NextNext]);
  };
  shiftWindow(m_tokens);
  shiftWindow(m_skippedComments);

  scanToken();

  return m_tokens[Current].token;
}
239
240
/// Consumes the current character and picks between two tokens depending on
/// the one that follows.
///
/// @param _next character that selects @a _then.
/// @param _then token produced (via the single-argument selectToken overload)
///              if the following character equals @a _next.
/// @param _else token returned otherwise; the following character is left
///              as the current one.
Token Scanner::selectToken(char _next, Token _then, Token _else)
{
  advance();
  return m_char == _next ? selectToken(_then) : _else;
}
248
249
bool Scanner::skipWhitespace()
250
5.17M
{
251
5.17M
  size_t const startPosition = sourcePos();
252
24.6M
  while (isWhiteSpace(m_char))
253
19.5M
    advance();
254
  // Return whether or not we skipped any characters.
255
5.17M
  return sourcePos() != startPosition;
256
5.17M
}
257
258
bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
259
186k
{
260
186k
  size_t const startPosition = sourcePos();
261
1.37M
  while (isWhiteSpace(m_char) && !isUnicodeLinebreak())
262
1.18M
    advance();
263
  // Return whether or not we skipped any characters.
264
186k
  return sourcePos() != startPosition;
265
186k
}
266
267
268
namespace
269
{
270
271
/// Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth.
272
///
273
/// @returns ScannerError::NoError in case of successful parsing and directional encodings are paired
274
///          and error code in case the input's lexical parser state is invalid and this error should be reported
275
///          to the user.
276
static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
277
40.4k
{
278
40.4k
  static array<pair<string_view, int>, 5> constexpr directionalSequences{
279
40.4k
    pair<string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
280
40.4k
    pair<string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
281
40.4k
    pair<string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
282
40.4k
    pair<string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
283
40.4k
    pair<string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting
284
40.4k
  };
285
286
40.4k
  size_t endPosition = _stream.position();
287
40.4k
  _stream.setPosition(_startPosition);
288
289
40.4k
  int directionOverrideDepth = 0;
290
291
1.31M
  for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
292
1.27M
  {
293
1.27M
    _stream.setPosition(currentPos);
294
295
1.27M
    for (auto const& [sequence, depthChange]: directionalSequences)
296
6.36M
      if (_stream.prefixMatch(sequence))
297
121
        directionOverrideDepth += depthChange;
298
299
1.27M
    if (directionOverrideDepth < 0)
300
26
      return ScannerError::DirectionalOverrideUnderflow;
301
1.27M
  }
302
303
40.4k
  _stream.setPosition(endPosition);
304
305
40.4k
  return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatch : ScannerError::NoError;
306
40.4k
}
307
308
}
309
310
/// Skips the remainder of a single-line comment and validates its
/// bidirectional markup.
///
/// @returns Token::Whitespace, or the error token produced by setError() if
///          the directional override markers in the comment are unbalanced.
Token Scanner::skipSingleLineComment()
{
  // Line terminator is not part of the comment. If it is a
  // non-ascii line terminator, it will result in a parser error.
  size_t const commentStart = m_source.position();

  // Stop at a line break or when advance() reports that it could not move on.
  while (!isUnicodeLinebreak() && advance())
  {
  }

  ScannerError const bidiError = validateBiDiMarkup(m_source, commentStart);
  if (bidiError == ScannerError::NoError)
    return Token::Whitespace;
  return setError(bidiError);
}
325
326
bool Scanner::atEndOfLine() const
327
51.7k
{
328
51.7k
  return m_char == '\n' || m_char == '\r';
329
51.7k
}
330
331
bool Scanner::tryScanEndOfLine()
332
2.83M
{
333
2.83M
  if (m_char == '\n')
334
95.6k
  {
335
95.6k
    advance();
336
95.6k
    return true;
337
95.6k
  }
338
339
2.73M
  if (m_char == '\r')
340
282
  {
341
282
    if (advance() && m_char == '\n')
342
134
      advance();
343
282
    return true;
344
282
  }
345
346
2.73M
  return false;
347
2.73M
}
348
349
size_t Scanner::scanSingleLineDocComment()
350
90.7k
{
351
90.7k
  LiteralScope literal(this, LITERAL_TYPE_COMMENT);
352
90.7k
  size_t endPosition = m_source.position();
353
354
90.7k
  skipWhitespaceExceptUnicodeLinebreak();
355
356
2.83M
  while (!isSourcePastEndOfInput())
357
2.83M
  {
358
2.83M
    endPosition = m_source.position();
359
2.83M
    if (tryScanEndOfLine())
360
95.6k
    {
361
      // Check if next line is also a single-line comment.
362
      // If any whitespaces were skipped, use source position before.
363
95.6k
      if (!skipWhitespaceExceptUnicodeLinebreak())
364
14.3k
        endPosition = m_source.position();
365
366
95.6k
      if (!m_source.isPastEndOfInput(3) &&
367
95.6k
        m_source.get(0) == '/' &&
368
95.6k
        m_source.get(1) == '/' &&
369
95.6k
        m_source.get(2) == '/')
370
4.99k
      {
371
4.99k
        if (!m_source.isPastEndOfInput(4) && m_source.get(3) == '/')
372
63
          break; // "////" is not a documentation comment
373
4.92k
        m_char = m_source.advanceAndGet(3);
374
4.92k
        if (atEndOfLine())
375
189
          continue;
376
4.73k
        addCommentLiteralChar('\n');
377
4.73k
      }
378
90.7k
      else
379
90.7k
        break; // next line is not a documentation comment, we are done
380
95.6k
    }
381
2.73M
    else if (isUnicodeLinebreak())
382
      // Any line terminator that is not '\n' is considered to end the
383
      // comment.
384
13
      break;
385
2.74M
    addCommentLiteralChar(m_char);
386
2.74M
    advance();
387
2.74M
  }
388
90.7k
  literal.complete();
389
90.7k
  return endPosition;
390
90.7k
}
391
392
/// Skips a "/* ... */" comment that is not a documentation comment.
///
/// @returns Token::Whitespace for a properly terminated comment, otherwise the
///          error token produced by setError(): either a directional-override
///          error found inside the comment or IllegalCommentTerminator if the
///          input ends first.
Token Scanner::skipMultiLineComment()
{
  size_t const commentStart = m_source.position();
  while (!isSourcePastEndOfInput())
  {
    char const previous = m_char;
    advance();

    if (previous != '*' || m_char != '/')
      continue;

    // We have reached the end of the multi-line comment.
    ScannerError const bidiError = validateBiDiMarkup(m_source, commentStart);
    if (bidiError != ScannerError::NoError)
      return setError(bidiError);

    // Replace the closing '/' by a space instead of consuming it. This way
    // all multi-line comments are treated as whitespace.
    m_char = ' ';
    return Token::Whitespace;
  }
  // Unterminated multi-line comment.
  return setError(ScannerError::IllegalCommentTerminator);
}
416
417
/// Records the text of a "/** ... */" documentation comment into the comment
/// literal of the NextNext slot. A single leading '*' on continuation lines is
/// dropped and line breaks are normalised to '\n'.
///
/// @returns Token::CommentLiteral if the closing "*/" was found, otherwise the
///          error token for ScannerError::IllegalCommentTerminator.
Token Scanner::scanMultiLineDocComment()
{
  LiteralScope literal(this, LITERAL_TYPE_COMMENT);
  bool terminated = false;
  bool anyCharRecorded = false;

  // Skip whitespace on the opening line, but stay in front of its line break.
  while (isWhiteSpace(m_char) && !atEndOfLine())
    advance();

  while (!isSourcePastEndOfInput())
  {
    // handle newlines in multiline comments
    if (atEndOfLine())
    {
      skipWhitespace();
      bool const starAhead = !m_source.isPastEndOfInput(1) && m_source.get(0) == '*';

      if (starAhead && m_source.get(1) == '*')
      {
        // it is unknown if this leads to the end of the comment
        addCommentLiteralChar('*');
        advance();
      }
      else if (starAhead && m_source.get(1) != '/')
      {
        // skip first '*' in subsequent lines
        m_char = m_source.advanceAndGet(1);
        if (atEndOfLine()) // ignores empty lines
          continue;
        if (anyCharRecorded)
          addCommentLiteralChar('\n'); // corresponds to the end of previous line
      }
      else if (starAhead && m_source.get(1) == '/')
      {
        // if after newline the comment ends, don't insert the newline
        m_char = m_source.advanceAndGet(2);
        terminated = true;
        break;
      }
      else if (anyCharRecorded)
        addCommentLiteralChar('\n');
    }

    // Closing "*/" in the middle of a line.
    if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/')
    {
      m_char = m_source.advanceAndGet(2);
      terminated = true;
      break;
    }

    addCommentLiteralChar(m_char);
    anyCharRecorded = true;
    advance();
  }

  // The literal is kept even for an unterminated comment.
  literal.complete();
  if (terminated)
    return Token::CommentLiteral;
  return setError(ScannerError::IllegalCommentTerminator);
}
471
472
/// Scans everything that can start with '/': the operators "/" and "/=",
/// ordinary comments ("//", "////...", "/* */", "/***") and documentation
/// comments ("///", "/** */"). Documentation comments are recorded in the
/// NextNext skipped-comment slot.
///
/// @returns Token::Div or the result of selectToken(Token::AssignDiv) for the
///          operators, Token::Whitespace for a successfully skipped or recorded
///          comment, or an error token for a malformed comment.
Token Scanner::scanSlash()
{
  int const commentStart = static_cast<int>(sourcePos());
  advance();

  switch (m_char)
  {
  case '/':
  {
    if (!advance())
      // double slash comment directly before EOS
      return Token::Whitespace;
    if (m_char != '/')
      return skipSingleLineComment();

    advance(); // consume the last '/' at ///

    // "////"
    if (m_char == '/')
      return skipSingleLineComment();

    // doxygen style /// comment
    auto& docComment = m_skippedComments[NextNext];
    docComment.location.start = commentStart;
    docComment.location.sourceName = m_sourceName;
    docComment.token = Token::CommentLiteral;
    docComment.location.end = static_cast<int>(scanSingleLineDocComment());
    return Token::Whitespace;
  }
  case '*':
  {
    if (!advance())
      // slash star comment before EOS
      return setError(ScannerError::IllegalCommentTerminator);
    if (m_char != '*')
      return skipMultiLineComment();

    // doxygen style /** natspec comment
    advance(); // consume the last '*' at /**

    // "/**/"
    if (m_char == '/')
    {
      advance(); // skip the closing slash
      return Token::Whitespace;
    }
    // "/***"
    if (m_char == '*')
      // "/***/" may be interpreted as empty natspec or skipped; skipping is simpler
      return skipMultiLineComment();

    // we actually have a multiline documentation comment
    m_skippedComments[NextNext].location.start = commentStart;
    m_skippedComments[NextNext].location.sourceName = m_sourceName;
    Token const comment = scanMultiLineDocComment();
    m_skippedComments[NextNext].location.end = static_cast<int>(sourcePos());
    m_skippedComments[NextNext].token = comment;

    // For Token::Illegal the error has already been set.
    return comment == Token::Illegal ? Token::Illegal : Token::Whitespace;
  }
  case '=':
    return selectToken(Token::AssignDiv);
  default:
    return Token::Div;
  }
}
535
536
void Scanner::scanToken()
537
9.31M
{
538
9.31M
  m_tokens[NextNext] = {};
539
9.31M
  m_skippedComments[NextNext] = {};
540
541
9.31M
  Token token;
542
  // M and N are for the purposes of grabbing different type sizes
543
9.31M
  unsigned m;
544
9.31M
  unsigned n;
545
9.31M
  do
546
14.1M
  {
547
    // Remember the position of the next token
548
14.1M
    m_tokens[NextNext].location.start = static_cast<int>(sourcePos());
549
14.1M
    switch (m_char)
550
14.1M
    {
551
34.2k
    case '"':
552
35.1k
    case '\'':
553
35.1k
      token = scanString(false);
554
35.1k
      break;
555
2.67k
    case '<':
556
      // < <= << <<=
557
2.67k
      advance();
558
2.67k
      if (m_char == '=')
559
163
        token = selectToken(Token::LessThanOrEqual);
560
2.51k
      else if (m_char == '<')
561
582
        token = selectToken('=', Token::AssignShl, Token::SHL);
562
1.93k
      else
563
1.93k
        token = Token::LessThan;
564
2.67k
      break;
565
4.34k
    case '>':
566
      // > >= >> >>= >>> >>>=
567
4.34k
      advance();
568
4.34k
      if (m_char == '=')
569
1.60k
        token = selectToken(Token::GreaterThanOrEqual);
570
2.73k
      else if (m_char == '>')
571
895
      {
572
        // >> >>= >>> >>>=
573
895
        advance();
574
895
        if (m_char == '=')
575
106
          token = selectToken(Token::AssignSar);
576
789
        else if (m_char == '>')
577
164
          token = selectToken('=', Token::AssignShr, Token::SHR);
578
625
        else
579
625
          token = Token::SAR;
580
895
      }
581
1.84k
      else
582
1.84k
        token = Token::GreaterThan;
583
4.34k
      break;
584
14.6k
    case '=':
585
      // = == =>
586
14.6k
      advance();
587
14.6k
      if (m_char == '=')
588
2.11k
        token = selectToken(Token::Equal);
589
12.5k
      else if (m_char == '>')
590
561
        token = selectToken(Token::DoubleArrow);
591
11.9k
      else
592
11.9k
        token = Token::Assign;
593
14.6k
      break;
594
2.78k
    case '!':
595
      // ! !=
596
2.78k
      advance();
597
2.78k
      if (m_char == '=')
598
291
        token = selectToken(Token::NotEqual);
599
2.49k
      else
600
2.49k
        token = Token::Not;
601
2.78k
      break;
602
4.21k
    case '+':
603
      // + ++ +=
604
4.21k
      advance();
605
4.21k
      if (m_char == '+')
606
721
        token = selectToken(Token::Inc);
607
3.49k
      else if (m_char == '=')
608
194
        token = selectToken(Token::AssignAdd);
609
3.29k
      else
610
3.29k
        token = Token::Add;
611
4.21k
      break;
612
167k
    case '-':
613
      // - -- -= ->
614
167k
      advance();
615
167k
      if (m_char == '-')
616
868
        token = selectToken(Token::Dec);
617
166k
      else if (m_char == '=')
618
253
        token = selectToken(Token::AssignSub);
619
165k
      else if (m_char == '>')
620
156k
        token = selectToken(Token::RightArrow);
621
9.50k
      else
622
9.50k
        token = Token::Sub;
623
167k
      break;
624
9.10k
    case '*':
625
      // * ** *=
626
9.10k
      advance();
627
9.10k
      if (m_char == '*')
628
2.60k
        token = selectToken(Token::Exp);
629
6.49k
      else if (m_char == '=')
630
156
        token = selectToken(Token::AssignMul);
631
6.34k
      else
632
6.34k
        token = Token::Mul;
633
9.10k
      break;
634
2.19k
    case '%':
635
      // % %=
636
2.19k
      token = selectToken('=', Token::AssignMod, Token::Mod);
637
2.19k
      break;
638
136k
    case '/':
639
      // /  // /* /=
640
136k
      token = scanSlash();
641
136k
      break;
642
2.15k
    case '&':
643
      // & && &=
644
2.15k
      advance();
645
2.15k
      if (m_char == '&')
646
396
        token = selectToken(Token::And);
647
1.75k
      else if (m_char == '=')
648
486
        token = selectToken(Token::AssignBitAnd);
649
1.27k
      else
650
1.27k
        token = Token::BitAnd;
651
2.15k
      break;
652
1.73k
    case '|':
653
      // | || |=
654
1.73k
      advance();
655
1.73k
      if (m_char == '|')
656
655
        token = selectToken(Token::Or);
657
1.07k
      else if (m_char == '=')
658
122
        token = selectToken(Token::AssignBitOr);
659
953
      else
660
953
        token = Token::BitOr;
661
1.73k
      break;
662
3.59k
    case '^':
663
      // ^ ^=
664
3.59k
      token = selectToken('=', Token::AssignBitXor, Token::BitXor);
665
3.59k
      break;
666
9.02k
    case '.':
667
      // . Number
668
9.02k
      advance();
669
9.02k
      if (isDecimalDigit(m_char))
670
1.90k
        token = scanNumber('.');
671
7.12k
      else
672
7.12k
        token = Token::Period;
673
9.02k
      break;
674
429k
    case ':':
675
      // : :=
676
429k
      advance();
677
429k
      if (m_char == '=')
678
420k
        token = selectToken(Token::AssemblyAssign);
679
8.35k
      else
680
8.35k
        token = Token::Colon;
681
429k
      break;
682
63.5k
    case ';':
683
63.5k
      token = selectToken(Token::Semicolon);
684
63.5k
      break;
685
670k
    case ',':
686
670k
      token = selectToken(Token::Comma);
687
670k
      break;
688
1.14M
    case '(':
689
1.14M
      token = selectToken(Token::LParen);
690
1.14M
      break;
691
1.12M
    case ')':
692
1.12M
      token = selectToken(Token::RParen);
693
1.12M
      break;
694
32.5k
    case '[':
695
32.5k
      token = selectToken(Token::LBrack);
696
32.5k
      break;
697
30.3k
    case ']':
698
30.3k
      token = selectToken(Token::RBrack);
699
30.3k
      break;
700
515k
    case '{':
701
515k
      token = selectToken(Token::LBrace);
702
515k
      break;
703
464k
    case '}':
704
464k
      token = selectToken(Token::RBrace);
705
464k
      break;
706
801
    case '?':
707
801
      token = selectToken(Token::Conditional);
708
801
      break;
709
7.23k
    case '~':
710
7.23k
      token = selectToken(Token::BitNot);
711
7.23k
      break;
712
9.28M
    default:
713
9.28M
      if (isIdentifierStart(m_char))
714
3.65M
      {
715
3.65M
        tie(token, m, n) = scanIdentifierOrKeyword();
716
717
        // Special case for hexadecimal literals
718
3.65M
        if (token == Token::Hex)
719
5.94k
        {
720
          // reset
721
5.94k
          m = 0;
722
5.94k
          n = 0;
723
724
          // Special quoted hex string must follow
725
5.94k
          if (m_char == '"' || m_char == '\'')
726
5.93k
            token = scanHexString();
727
8
          else
728
8
            token = setError(ScannerError::IllegalToken);
729
5.94k
        }
730
3.65M
        else if (token == Token::Unicode && m_kind != ScannerKind::Yul)
731
294
        {
732
          // reset
733
294
          m = 0;
734
294
          n = 0;
735
736
          // Special quoted hex string must follow
737
294
          if (m_char == '"' || m_char == '\'')
738
285
            token = scanString(true);
739
9
          else
740
9
            token = setError(ScannerError::IllegalToken);
741
294
        }
742
3.65M
      }
743
5.62M
      else if (isDecimalDigit(m_char))
744
526k
        token = scanNumber();
745
5.10M
      else if (skipWhitespace())
746
4.71M
        token = Token::Whitespace;
747
380k
      else if (isSourcePastEndOfInput())
748
361k
        token = Token::EOS;
749
19.2k
      else
750
19.2k
        token = selectErrorToken(ScannerError::IllegalToken);
751
9.28M
      break;
752
14.1M
    }
753
    // Continue scanning for tokens as long as we're just skipping
754
    // whitespace.
755
14.1M
  }
756
14.1M
  while (token == Token::Whitespace);
757
9.31M
  m_tokens[NextNext].location.end = static_cast<int>(sourcePos());
758
9.31M
  m_tokens[NextNext].location.sourceName = m_sourceName;
759
9.31M
  m_tokens[NextNext].token = token;
760
9.31M
  m_tokens[NextNext].extendedTokenInfo = make_tuple(m, n);
761
9.31M
}
762
763
bool Scanner::scanEscape()
764
3.31k
{
765
3.31k
  char c = m_char;
766
767
  // Skip escaped newlines.
768
3.31k
  if (tryScanEndOfLine())
769
272
    return true;
770
3.04k
  advance();
771
772
3.04k
  switch (c)
773
3.04k
  {
774
75
  case '\'':  // fall through
775
298
  case '"':  // fall through
776
615
  case '\\':
777
615
    break;
778
211
  case 'n':
779
211
    c = '\n';
780
211
    break;
781
277
  case 'r':
782
277
    c = '\r';
783
277
    break;
784
224
  case 't':
785
224
    c = '\t';
786
224
    break;
787
912
  case 'u':
788
912
  {
789
912
    if (auto const codepoint = scanUnicode(); codepoint.has_value())
790
502
      addUnicodeAsUTF8(*codepoint);
791
410
    else
792
410
      return false;
793
502
    return true;
794
912
  }
795
704
  case 'x':
796
704
    if (!scanHexByte(c))
797
52
      return false;
798
652
    break;
799
652
  default:
800
100
    return false;
801
3.04k
  }
802
803
1.97k
  addLiteralChar(c);
804
1.97k
  return true;
805
3.04k
}
806
807
bool Scanner::isUnicodeLinebreak()
808
5.58M
{
809
5.58M
  if (0x0a <= m_char && m_char <= 0x0d)
810
    // line feed, vertical tab, form feed, carriage return
811
50.8k
    return true;
812
5.53M
  if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85)
813
    // NEL - U+0085, C2 85 in utf8
814
10
    return true;
815
5.53M
  if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && (
816
446
    uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9
817
446
  ))
818
    // LS - U+2028, E2 80 A8  in utf8
819
    // PS - U+2029, E2 80 A9  in utf8
820
27
    return true;
821
5.53M
  return false;
822
5.53M
}
823
824
/// Scans a quoted string literal; the current character is the opening quote,
/// which also determines the closing one.
///
/// @param _isUnicode true for unicode"..." literals, which may contain
///                   arbitrary bytes and have their directional markup validated.
/// @returns Token::UnicodeStringLiteral or Token::StringLiteral on success,
///          otherwise the error token produced by setError().
Token Scanner::scanString(bool const _isUnicode)
{
  size_t const literalStart = m_source.position();
  char const quote = m_char;
  advance();  // consume quote
  LiteralScope literal(this, LITERAL_TYPE_STRING);

  while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
  {
    char const c = m_char;
    advance();

    if (c == '\\')
    {
      if (isSourcePastEndOfInput() || !scanEscape())
        return setError(ScannerError::IllegalEscapeSequence);
      continue;
    }

    // Report error on non-printable characters in string literals, however
    // allow anything for unicode string literals, because their validity will
    // be verified later (in the syntax checker).
    //
    // We are using a manual range and not isprint() to avoid
    // any potential complications with locale.
    bool const outsidePrintableAscii = static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f;
    if (!_isUnicode && outsidePrintableAscii)
      return setError(
        m_kind == ScannerKind::Yul ?
        ScannerError::IllegalCharacterInString :
        ScannerError::UnicodeCharacterInNonUnicodeString
      );

    addLiteralChar(c);
  }

  // The loop also ends at end of input or at a line break.
  if (m_char != quote)
    return setError(ScannerError::IllegalStringEndQuote);

  if (_isUnicode)
  {
    ScannerError const bidiError = validateBiDiMarkup(m_source, literalStart);
    if (bidiError != ScannerError::NoError)
      return setError(bidiError);
  }

  literal.complete();
  advance();  // consume quote
  if (_isUnicode)
    return Token::UnicodeStringLiteral;
  return Token::StringLiteral;
}
870
871
/// Scans the quoted part of a hex"..." literal: pairs of hex digits, optionally
/// separated by single underscores. The current character is the opening quote.
///
/// @returns Token::HexStringLiteral on success, otherwise the error token
///          produced by setError().
Token Scanner::scanHexString()
{
  char const quote = m_char;
  advance();  // consume quote
  LiteralScope literal(this, LITERAL_TYPE_STRING);

  // An underscore is only accepted directly after a complete byte.
  bool separatorAllowed = false;
  while (m_char != quote && !isSourcePastEndOfInput())
  {
    // scanHexByte() only overwrites `c` on success, so on failure it still
    // holds the character at which the attempt started.
    char c = m_char;

    if (scanHexByte(c))
    {
      addLiteralChar(c);
      separatorAllowed = true;
      continue;
    }

    if (c != '_')
      return setError(ScannerError::IllegalHexString);

    advance();
    // Reject leading, doubled and trailing separators.
    if (!separatorAllowed || m_char == quote)
      return setError(ScannerError::IllegalNumberSeparator);
    separatorAllowed = false;
  }

  if (m_char != quote)
    return setError(ScannerError::IllegalStringEndQuote);

  literal.complete();
  advance();  // consume quote
  return Token::HexStringLiteral;
}
904
905
// Parse for regex [:digit:]+(_[:digit:]+)*
906
void Scanner::scanDecimalDigits()
907
424k
{
908
  // MUST begin with a decimal digit.
909
424k
  if (!isDecimalDigit(m_char))
910
217k
    return;
911
912
  // May continue with decimal digit or underscore for grouping.
913
206k
  do
914
1.56M
    addLiteralCharAndAdvance();
915
1.56M
  while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_'));
916
917
  // Defer further validation of underscore to SyntaxChecker.
918
206k
}
919
920
/// Scans a numeric literal starting at the current character.
///
/// @param _charSeen either '.', meaning the decimal point of a fraction has
///        already been consumed by the caller, or 0 (asserted) when scanning
///        starts at the first character of the number.
/// @returns Token::Number on success, otherwise the result of setError() with
///          IllegalToken, IllegalHexDigit, OctalNotAllowed, IllegalExponent or
///          IllegalNumberEnd.
///
/// Underscores inside digit runs are kept in the literal and not validated here.
Token Scanner::scanNumber(char _charSeen)
{
  enum { DECIMAL, HEX, BINARY } kind = DECIMAL;
  LiteralScope numberLiteral(this, LITERAL_TYPE_NUMBER);

  if (_charSeen != '.')
  {
    solAssert(_charSeen == 0, "");

    // A leading '0' needs special treatment: it may introduce a hex number
    // and must not be followed by further decimal digits (no octals).
    if (m_char == '0')
    {
      addLiteralCharAndAdvance();
      // Possible continuations: 0, 0exxx, 0Exxx, 0.xxx or a hex number.
      if (m_char == 'x')
      {
        kind = HEX;
        addLiteralCharAndAdvance();
        // At least one hex digit has to follow the 'x'.
        if (!isHexDigit(m_char))
          return setError(ScannerError::IllegalHexDigit);

        // The check above guarantees the first character is taken.
        // Underscores are kept for later validation.
        do
          addLiteralCharAndAdvance();
        while (isHexDigit(m_char) || m_char == '_');
      }
      else if (isDecimalDigit(m_char))
        // Octal numbers are not allowed.
        return setError(ScannerError::OctalNotAllowed);
    }

    // Integer part (possibly empty after a lone '0') with an optional fraction.
    if (kind == DECIMAL)
    {
      scanDecimalDigits();  // optional
      if (m_char == '.')
      {
        bool const underscoreAfterDot = !m_source.isPastEndOfInput(1) && m_source.get(1) == '_';
        if (underscoreAfterDot)
        {
          // Looks like a fraction whose first character is an underscore.
          // Recover by swallowing '.', '_' and any digits that follow into the literal.
          addLiteralCharAndAdvance(); // '.'
          addLiteralCharAndAdvance(); // '_'
          scanDecimalDigits();
        }

        // Note: if the recovery above ran, this inspects the character after
        // the new current position.
        bool const digitAfterCurrent = !m_source.isPastEndOfInput() && isDecimalDigit(m_source.get(1));
        if (!digitAfterCurrent)
        {
          // A '.' has to be followed by a number; finish the literal here
          // without consuming the current character.
          numberLiteral.complete();
          return Token::Number;
        }
        addLiteralCharAndAdvance();
        scanDecimalDigits();
      }
    }
  }
  else
  {
    // The decimal point of the fraction was consumed before this call.
    addLiteralChar('.');
    if (m_char == '_')
      return setError(ScannerError::IllegalToken);
    scanDecimalDigits();  // at least one digit is expected here
  }

  // Optional exponent.
  if (m_char == 'e' || m_char == 'E')
  {
    solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number");
    if (kind != DECIMAL)
      return setError(ScannerError::IllegalExponent);

    if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_')
    {
      // An underscore directly after the exponent marker is misplaced.
      // Recover by consuming the rest of the literal and finishing it.
      addLiteralCharAndAdvance(); // 'e'
      addLiteralCharAndAdvance(); // '_'
      scanDecimalDigits();
      numberLiteral.complete();
      return Token::Number;
    }

    addLiteralCharAndAdvance(); // 'e' | 'E'
    if (m_char == '+' || m_char == '-')
      addLiteralCharAndAdvance();
    // At least one decimal digit has to follow 'e'/'E' and the optional sign.
    if (!isDecimalDigit(m_char))
      return setError(ScannerError::IllegalExponent);
    scanDecimalDigits();
  }

  // The character right after a numeric literal must be neither a decimal
  // digit nor an identifier start; see ECMA-262 section 7.8.3, page 17
  // (note that only one decimal digit is read if the value is 0).
  if (isDecimalDigit(m_char) || isIdentifierStart(m_char))
    return setError(ScannerError::IllegalNumberEnd);

  numberLiteral.complete();
  return Token::Number;
}
1013
1014
/// Scans an identifier or keyword starting at the current character, which
/// must satisfy isIdentifierStart() (asserted).
///
/// In Yul mode a '.' is also accepted inside the identifier, the word "leave"
/// becomes Token::Leave, and any keyword that is not a Yul keyword is turned
/// into a plain identifier.
///
/// @returns the classification from TokenTraits::fromIdentifierOrKeyword(),
///          or (Token::Leave, 0, 0) / (Token::Identifier, 0, 0) for the Yul
///          overrides described above.
tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
{
  solAssert(isIdentifierStart(m_char), "");
  LiteralScope identifierLiteral(this, LITERAL_TYPE_STRING);

  // The first character was validated above; gather the remaining ones.
  addLiteralCharAndAdvance();
  while ((m_kind == ScannerKind::Yul && m_char == '.') || isIdentifierPart(m_char))
    addLiteralCharAndAdvance();
  identifierLiteral.complete();

  auto const& scannedText = m_tokens[NextNext].literal;
  auto const classified = TokenTraits::fromIdentifierOrKeyword(scannedText);

  // Outside of Yul the classification is used as is.
  if (m_kind != ScannerKind::Yul)
    return classified;

  // "leave" is a Yul keyword.
  if (scannedText == "leave")
    return std::make_tuple(Token::Leave, 0, 0);

  // Keywords that Yul does not know are ordinary identifiers there.
  if (!TokenTraits::isYulKeyword(std::get<0>(classified)))
    return std::make_tuple(Token::Identifier, 0, 0);

  return classified;
}
1035
1036
} // namespace solidity::langutil