/src/solidity/liblangutil/Scanner.h

Source (jump to first uncovered line)
/*
 * This file is part of solidity.
 *
 * solidity is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * solidity is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with solidity.  If not, see <http://www.gnu.org/licenses/>.
 *
 * This file is derived from the file "scanner.h", which was part of the
 * V8 project. The original copyright header follows:
 *
 * Copyright 2006-2012, the V8 project authors. All rights reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 * * Neither the name of Google Inc. nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * @author Christian <c@ethdev.com>
 * @date 2014
 * Solidity scanner.
 */

#pragma once

#include <liblangutil/Token.h>
#include <liblangutil/CharStream.h>
#include <liblangutil/SourceLocation.h>

#include <cstddef>
#include <iosfwd>
#include <memory>
#include <optional>
#include <string>
#include <tuple>

namespace solidity::langutil
{

// Forward declarations. None of these three names is referenced anywhere else
// in this header.
// NOTE(review): they may be leftovers from the V8 "scanner.h" this file is
// derived from -- confirm that no other file relies on them before removing.
class AstRawString;
class AstValueFactory;
class ParserRecorder;

/// Mode a Scanner operates in. Scanner stores one value of this type and
/// lets callers replace it through Scanner::setScannerMode(); the default
/// member initializer of that stored value is Solidity.
enum class ScannerKind
{
  Solidity = 0,
  Yul = 1,
};

/// Error code attached to a scanned token (readable through
/// Scanner::currentError()). NoError is what a freshly constructed token
/// descriptor holds.
/// NOTE(review): the code that produces each individual error is not part of
/// this header; the enumerator names are the only description available here.
enum class ScannerError
{
  NoError = 0,

  IllegalToken,
  IllegalHexString,
  IllegalHexDigit,
  IllegalCommentTerminator,
  IllegalEscapeSequence,
  UnicodeCharacterInNonUnicodeString,
  IllegalCharacterInString,
  IllegalStringEndQuote,
  IllegalNumberSeparator,
  IllegalExponent,
  IllegalNumberEnd,

  DirectionalOverrideUnderflow,
  DirectionalOverrideMismatch,

  OctalNotAllowed,
};

/// @returns a string for @a _errorCode.
/// NOTE(review): the definition is not in this header; confirm the exact text
/// it produces there.
std::string to_string(ScannerError _errorCode);
/// Stream insertion overload for ScannerError.
/// NOTE(review): presumably writes the to_string() text to @a os and returns
/// @a os -- confirm against the definition.
std::ostream& operator<<(std::ostream& os, ScannerError _errorCode);

/// Tokenizer working on a CharStream. It keeps descriptors for three tokens
/// (current, next, next-next) and for the comments skipped before them, and
/// exposes the first two (plus the token kind of the third) to its users.
class Scanner
{
  friend class LiteralScope;
public:
  /// Creates a scanner reading from @a _source, remembers a shared copy of the
  /// stream's name and calls reset().
  /// Only a reference to @a _source is stored, so the stream has to outlive
  /// the scanner.
  explicit Scanner(CharStream& _source):
    m_source(_source),
    m_sourceName{std::make_shared<std::string>(_source.name())}
  {
    reset();
  }

  /// Resets scanner to the start of input.
  void reset();

  /// Changes the scanner mode.
  void setScannerMode(ScannerKind _kind)
  {
    m_kind = _kind;

    // Invalidate lookahead buffer.
    rescan();
  }

  /// @returns the mode the scanner is currently in.
  ScannerKind scannerKind() const { return m_kind; }

  /// @returns the character stream this scanner reads from.
  CharStream const& charStream() const noexcept { return m_source; }

  /// @returns the next token and advances input
  Token next();

  /// Set scanner to a specific offset. This is used in error recovery.
  void setPosition(size_t _offset);

  ///@{
  ///@name Information about the current token

  /// @returns the current token
  Token currentToken() const
  {
    return m_tokens[Current].token;
  }

  /// @returns the current token combined with the two numbers stored in its
  /// extended token info.
  ElementaryTypeNameToken currentElementaryTypeNameToken() const
  {
    // Copy the pair out of the descriptor; the bindings are plain lvalues.
    auto [firstSize, secondSize] = m_tokens[Current].extendedTokenInfo;
    return ElementaryTypeNameToken(m_tokens[Current].token, firstSize, secondSize);
  }

  /// @returns the source location of the current token.
  SourceLocation currentLocation() const { return m_tokens[Current].location; }
  /// @returns the literal stored for the current token.
  std::string const& currentLiteral() const { return m_tokens[Current].literal; }
  /// @returns the extended token info (two unsigned values) of the current token.
  std::tuple<unsigned, unsigned> const& currentTokenInfo() const { return m_tokens[Current].extendedTokenInfo; }

  /// Retrieves the last error that occurred during lexical analysis.
  /// @note If no error occurred, the value is undefined.
  ScannerError currentError() const noexcept { return m_tokens[Current].error; }
  ///@}

  ///@{
  ///@name Information about the current comment token

  /// @returns the source location of the current skipped comment.
  SourceLocation currentCommentLocation() const { return m_skippedComments[Current].location; }
  /// @returns the literal of the current skipped comment.
  std::string const& currentCommentLiteral() const { return m_skippedComments[Current].literal; }
  /// Called by the parser during FunctionDefinition parsing to clear the current comment
  void clearCurrentCommentLiteral() { m_skippedComments[Current].literal.clear(); }

  ///@}

  ///@{
  ///@name Information about the next token

  /// @returns the next token without advancing input.
  Token peekNextToken() const { return m_tokens[Next].token; }
  /// @returns the source location of the next token without advancing input.
  SourceLocation peekLocation() const { return m_tokens[Next].location; }
  /// @returns the literal of the next token without advancing input.
  std::string const& peekLiteral() const { return m_tokens[Next].literal; }

  /// @returns the token after the next one without advancing input.
  Token peekNextNextToken() const { return m_tokens[NextNext].token; }
  ///@}

private:

  /// Stores @a _error in the NextNext token descriptor (the same slot the
  /// literal helpers below append to) and returns Token::Illegal, so that
  /// callers can write `return setError(...)`.
  Token setError(ScannerError _error) noexcept
  {
    m_tokens[NextNext].error = _error;
    return Token::Illegal;
  }

  /// Used for the current and look-ahead token and comments
  struct TokenDesc
  {
    Token token;
    SourceLocation location;
    std::string literal;
    ScannerError error = ScannerError::NoError;
    /// Two numbers attached to the token; currentElementaryTypeNameToken()
    /// forwards them as first and second size.
    std::tuple<unsigned, unsigned> extendedTokenInfo;
  };

  ///@{
  ///@name Literal buffer support

  /// Appends @a c to the literal of the NextNext token.
  void addLiteralChar(char c) { m_tokens[NextNext].literal.push_back(c); }
  /// Appends @a c to the literal of the NextNext skipped comment.
  void addCommentLiteralChar(char c) { m_skippedComments[NextNext].literal.push_back(c); }
  /// Appends the look-ahead character to the NextNext token literal, then advances.
  void addLiteralCharAndAdvance() { addLiteralChar(m_char); advance(); }
  void addUnicodeAsUTF8(unsigned codepoint);
  ///@}

  /// Moves the stream forward by one character and reloads the look-ahead
  /// character. @returns false once the stream reports being past the end of input.
  bool advance() { m_char = m_source.advanceAndGet(); return !m_source.isPastEndOfInput(); }
  /// Asks the stream to roll back by @a _amount and reloads the look-ahead
  /// character from the value the stream returns.
  void rollback(size_t _amount) { m_char = m_source.rollback(_amount); }
  /// Rolls back to the start of the current token and re-runs the scanner.
  void rescan();

  /// Advances by one character, records @a _err and @returns Token::Illegal.
  Token selectErrorToken(ScannerError _err) { advance(); return setError(_err); }
  /// Advances by one character and @returns @a _tok.
  Token selectToken(Token _tok) { advance(); return _tok; }
  /// If the next character is _next, advance and return _then, otherwise return _else.
  inline Token selectToken(char _next, Token _then, Token _else);

  bool scanHexByte(char& o_scannedByte);
  std::optional<unsigned> scanUnicode();

  /// Scans a single Solidity token.
  void scanToken();

  /// Skips all whitespace and @returns true if something was skipped.
  bool skipWhitespace();
  /// Skips all whitespace that is neither '\r' nor '\n'.
  bool skipWhitespaceExceptUnicodeLinebreak();
  Token skipSingleLineComment();
  Token skipMultiLineComment();

  /// Tests if current source position is CR, LF or CRLF.
  bool atEndOfLine() const;

  /// Tries to consume CR, LF or CRLF line terminators and returns success or failure.
  bool tryScanEndOfLine();

  void scanDecimalDigits();
  Token scanNumber(char _charSeen = 0);
  std::tuple<Token, unsigned, unsigned> scanIdentifierOrKeyword();

  Token scanString(bool const _isUnicode);
  Token scanHexString();
  /// Scans a single line comment and returns its corrected end position.
  size_t scanSingleLineDocComment();
  Token scanMultiLineDocComment();
  /// Scans a slash '/' and depending on the characters returns the appropriate token
  Token scanSlash();

  /// Scans an escape-sequence which is part of a string and adds the
  /// decoded character to the current literal. Returns true if a pattern
  /// is scanned.
  bool scanEscape();

  /// @returns true iff we are currently positioned at a unicode line break.
  bool isUnicodeLinebreak();

  /// Return the current source position.
  size_t sourcePos() const { return m_source.position(); }
  /// @returns true if the underlying stream reports being past the end of input.
  bool isSourcePastEndOfInput() const { return m_source.isPastEndOfInput(); }

  /// Indices into m_tokens and m_skippedComments.
  enum TokenIndex { Current, Next, NextNext };

  TokenDesc m_skippedComments[3] = {}; // desc for the current, next and nextnext skipped comment
  TokenDesc m_tokens[3] = {}; // desc for the current, next and nextnext token

  CharStream& m_source;
  std::shared_ptr<std::string const> m_sourceName;

  ScannerKind m_kind = ScannerKind::Solidity;

  /// one character look-ahead, equals 0 at end of input.
  /// Initialized to that end-of-input value so it never holds an
  /// indeterminate value before the first read from the stream.
  char m_char = 0;
};

}

Coverage Report

Created: 2022-08-24 06:31

Line	Count	Source (jump to first uncovered line)
1		/*
2		* This file is part of solidity.
3		*
4		* solidity is free software: you can redistribute it and/or modify
5		* it under the terms of the GNU General Public License as published by
6		* the Free Software Foundation, either version 3 of the License, or
7		* (at your option) any later version.
8		*
9		* solidity is distributed in the hope that it will be useful,
10		* but WITHOUT ANY WARRANTY; without even the implied warranty of
11		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		* GNU General Public License for more details.
13		*
14		* You should have received a copy of the GNU General Public License
15		* along with solidity. If not, see <http://www.gnu.org/licenses/>.
16		*
17		* This file is derived from the file "scanner.h", which was part of the
18		* V8 project. The original copyright header follows:
19		*
20		* Copyright 2006-2012, the V8 project authors. All rights reserved.
21		* Redistribution and use in source and binary forms, with or without
22		* modification, are permitted provided that the following conditions are
23		* met:
24		*
25		* * Redistributions of source code must retain the above copyright
26		* notice, this list of conditions and the following disclaimer.
27		* * Redistributions in binary form must reproduce the above
28		* copyright notice, this list of conditions and the following
29		* disclaimer in the documentation and/or other materials provided
30		* with the distribution.
31		* * Neither the name of Google Inc. nor the names of its
32		* contributors may be used to endorse or promote products derived
33		* from this software without specific prior written permission.
34		*
35		* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36		* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37		* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38		* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
39		* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40		* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41		* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42		* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43		* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44		* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
45		* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46		*/
47		/**
48		* @author Christian <c@ethdev.com>
49		* @date 2014
50		* Solidity scanner.
51		*/
52
53		#pragma once
54
55		#include <liblangutil/Token.h>
56		#include <liblangutil/CharStream.h>
57		#include <liblangutil/SourceLocation.h>
58
59		#include <optional>
60		#include <iosfwd>
61
62		namespace solidity::langutil
63		{
64
65		class AstRawString;
66		class AstValueFactory;
67		class ParserRecorder;
68
69		enum class ScannerKind
70		{
71		Solidity,
72		Yul
73		};
74
75		enum class ScannerError
76		{
77		NoError,
78
79		IllegalToken,
80		IllegalHexString,
81		IllegalHexDigit,
82		IllegalCommentTerminator,
83		IllegalEscapeSequence,
84		UnicodeCharacterInNonUnicodeString,
85		IllegalCharacterInString,
86		IllegalStringEndQuote,
87		IllegalNumberSeparator,
88		IllegalExponent,
89		IllegalNumberEnd,
90
91		DirectionalOverrideUnderflow,
92		DirectionalOverrideMismatch,
93
94		OctalNotAllowed,
95		};
96
97		std::string to_string(ScannerError _errorCode);
98		std::ostream& operator<<(std::ostream& os, ScannerError _errorCode);
99
100		class Scanner
101		{
102		friend class LiteralScope;
103		public:
104		explicit Scanner(CharStream& _source):
105		m_source(_source),
106		m_sourceName{std::make_shared<std::string>(_source.name())}
107	54.0k	{
108	54.0k	reset();
109	54.0k	}
110
111		/// Resets scanner to the start of input.
112		void reset();
113
114		/// Changes the scanner mode.
115		void setScannerMode(ScannerKind _kind)
116	74.6k	{
117	74.6k	m_kind = _kind;
118
119		// Invalidate lookahead buffer.
120	74.6k	rescan();
121	74.6k	}
122
123	14.8k	CharStream const& charStream() const noexcept { return m_source; }
124
125		/// @returns the next token and advances input
126		Token next();
127
128		/// Set scanner to a specific offset. This is used in error recovery.
129		void setPosition(size_t _offset);
130
131		///@{
132		///@name Information about the current token
133
134		/// @returns the current token
135		Token currentToken() const
136	11.4M	{
137	11.4M	return m_tokens[Current].token;
138	11.4M	}
139		ElementaryTypeNameToken currentElementaryTypeNameToken() const
140	12	{
141	12	unsigned firstSize;
142	12	unsigned secondSize;
143	12	std::tie(firstSize, secondSize) = m_tokens[Current].extendedTokenInfo;
144	12	return ElementaryTypeNameToken(m_tokens[Current].token, firstSize, secondSize);
145	12	}
146
147	4.92M	SourceLocation currentLocation() const { return m_tokens[Current].location; }
148	2.13M	std::string const& currentLiteral() const { return m_tokens[Current].literal; }
149	35.7k	std::tuple<unsigned, unsigned> const& currentTokenInfo() const { return m_tokens[Current].extendedTokenInfo; }
150
151		/// Retrieves the last error that occurred during lexical analysis.
152		/// @note If no error occurred, the value is undefined.
153	100	ScannerError currentError() const noexcept { return m_tokens[Current].error; }
154		///@}
155
156		///@{
157		///@name Information about the current comment token
158
159	33.4k	SourceLocation currentCommentLocation() const { return m_skippedComments[Current].location; }
160	1.59M	std::string const& currentCommentLiteral() const { return m_skippedComments[Current].literal; }
161		/// Called by the parser during FunctionDefinition parsing to clear the current comment
162	0	void clearCurrentCommentLiteral() { m_skippedComments[Current].literal.clear(); }
163
164	0	ScannerKind scannerKind() const { return m_kind; }
165
166		///@}
167
168		///@{
169		///@name Information about the next token
170
171		/// @returns the next token without advancing input.
172	106k	Token peekNextToken() const { return m_tokens[Next].token; }
173	0	SourceLocation peekLocation() const { return m_tokens[Next].location; }
174	0	std::string const& peekLiteral() const { return m_tokens[Next].literal; }
175
176	872	Token peekNextNextToken() const { return m_tokens[NextNext].token; }
177		///@}
178
179		private:
180
181		inline Token setError(ScannerError _error) noexcept
182	6.51k	{
183	6.51k	m_tokens[NextNext].error = _error;
184	6.51k	return Token::Illegal;
185	6.51k	}
186
187		/// Used for the current and look-ahead token and comments
188		struct TokenDesc
189		{
190		Token token;
191		SourceLocation location;
192		std::string literal;
193		ScannerError error = ScannerError::NoError;
194		std::tuple<unsigned, unsigned> extendedTokenInfo;
195		};
196
197		///@{
198		///@name Literal buffer support
199	21.3M	inline void addLiteralChar(char c) { m_tokens[NextNext].literal.push_back(c); }
200	1.38M	inline void addCommentLiteralChar(char c) { m_skippedComments[NextNext].literal.push_back(c); }
201	20.8M	inline void addLiteralCharAndAdvance() { addLiteralChar(m_char); advance(); }
202		void addUnicodeAsUTF8(unsigned codepoint);
203		///@}
204
205	36.4M	bool advance() { m_char = m_source.advanceAndGet(); return !m_source.isPastEndOfInput(); }
206	457	void rollback(size_t _amount) { m_char = m_source.rollback(_amount); }
207		/// Rolls back to the start of the current token and re-runs the scanner.
208		void rescan();
209
210	4.97k	inline Token selectErrorToken(ScannerError _err) { advance(); return setError(_err); }
211	2.56M	inline Token selectToken(Token _tok) { advance(); return _tok; }
212		/// If the next character is _next, advance and return _then, otherwise return _else.
213		inline Token selectToken(char _next, Token _then, Token _else);
214
215		bool scanHexByte(char& o_scannedByte);
216		std::optional<unsigned> scanUnicode();
217
218		/// Scans a single Solidity token.
219		void scanToken();
220
221		/// Skips all whitespace and @returns true if something was skipped.
222		bool skipWhitespace();
223		/// Skips all whitespace that are neither '\r' nor '\n'.
224		bool skipWhitespaceExceptUnicodeLinebreak();
225		Token skipSingleLineComment();
226		Token skipMultiLineComment();
227
228		/// Tests if current source position is CR, LF or CRLF.
229		bool atEndOfLine() const;
230
231		/// Tries to consume CR, LF or CRLF line terminators and returns success or failure.
232		bool tryScanEndOfLine();
233
234		void scanDecimalDigits();
235		Token scanNumber(char _charSeen = 0);
236		std::tuple<Token, unsigned, unsigned> scanIdentifierOrKeyword();
237
238		Token scanString(bool const _isUnicode);
239		Token scanHexString();
240		/// Scans a single line comment and returns its corrected end position.
241		size_t scanSingleLineDocComment();
242		Token scanMultiLineDocComment();
243		/// Scans a slash '/' and depending on the characters returns the appropriate token
244		Token scanSlash();
245
246		/// Scans an escape-sequence which is part of a string and adds the
247		/// decoded character to the current literal. Returns true if a pattern
248		/// is scanned.
249		bool scanEscape();
250
251		/// @returns true iff we are currently positioned at a unicode line break.
252		bool isUnicodeLinebreak();
253
254		/// Return the current source position.
255	18.9M	size_t sourcePos() const { return m_source.position(); }
256	2.18M	bool isSourcePastEndOfInput() const { return m_source.isPastEndOfInput(); }
257
258		enum TokenIndex { Current, Next, NextNext };
259
260		TokenDesc m_skippedComments[3] = {}; // desc for the current, next and nextnext skipped comment
261		TokenDesc m_tokens[3] = {}; // desc for the current, next and nextnext token
262
263		CharStream& m_source;
264		std::shared_ptr<std::string const> m_sourceName;
265
266		ScannerKind m_kind = ScannerKind::Solidity;
267
268		/// one character look-ahead, equals 0 at end of input
269		char m_char;
270		};
271
272		}