/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/Token.h

Source (jump to first uncovered line)
//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Tokens are the first level of abstraction above bytes used in pseudoparsing.
// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
// The tokens are wrapped into pseudo::Token, along with line/indent info.
//
// Unlike clang, we make multiple passes over the whole file, out-of-order.
// Therefore we retain the whole token sequence in memory. (This is feasible as
// we process one file at a time). pseudo::TokenStream holds such a stream.
// The initial stream holds the raw tokens read from the file, later passes
// operate on derived TokenStreams (e.g. with directives stripped).
//
// Similar facilities from clang that are *not* used:
//  - SourceManager: designed around multiple files and precise macro expansion.
//  - clang::Token: coupled to SourceManager, doesn't retain layout info.
//                  (pseudo::Token is similar, but without SourceLocations).
//  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
//                  (pseudo::TokenStream is similar, but a flat token list).
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_TOKEN_H
#define CLANG_PSEUDO_TOKEN_H

#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangStandard.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace clang {
class LangOptions;
namespace pseudo {

/// A single C++ or preprocessor token.
///
/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
/// SourceManager - we are not dealing with multiple files.
struct Token {
  /// An Index identifies a token within a stream.
  using Index = uint32_t;
  /// A sentinel Index indicating no token.
  constexpr static Index Invalid = std::numeric_limits<Index>::max();
  struct Range;

  /// The token text.
  ///
  /// Typically from the original source file, but may have been synthesized.
  StringRef text() const { return StringRef(Data, Length); }
  const char *Data = nullptr;
  uint32_t Length = 0;

  /// Zero-based line number for the start of the token.
  /// This refers to the original source file as written.
  uint32_t Line = 0;
  /// Width of whitespace before the first token on this line.
  uint8_t Indent = 0;
  /// Flags have some meaning defined by the function that produced this stream.
  uint8_t Flags = 0;
  /// Index into the original token stream (as raw-lexed from the source code).
  Index OriginalIndex = Invalid;
  // Helpers to get/set Flags based on `enum class`.
  template <class T> bool flag(T Mask) const {
    return Flags & uint8_t{llvm::to_underlying(Mask)};
  }
  template <class T> void setFlag(T Mask) {
    Flags |= uint8_t{llvm::to_underlying(Mask)};
  }

  /// Returns the next token in the stream. this may not be a sentinel.
  const Token &next() const {
    assert(Kind != tok::eof);
    return *(this + 1);
  }
  /// Returns the next token in the stream, skipping over comments.
  const Token &nextNC() const {
    const Token *T = this;
    do
      T = &T->next();
    while (T->Kind == tok::comment);
    return *T;
  }
  /// Returns the previous token in the stream. this may not be a sentinel.
  const Token &prev() const {
    assert(Kind != tok::eof);
    return *(this - 1);
  }
  /// Returns the bracket paired with this one, if any.
  const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }

  /// The type of token as determined by clang's lexer.
  clang::tok::TokenKind Kind = clang::tok::unknown;
  /// If this token is a paired bracket, the offset of the pair in the stream.
  int32_t Pair = 0;
};
// Compile-time guard on the size of Token: it must not exceed one pointer plus
// 24 bytes. Adding or reordering fields in Token can trip this check.
static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!");
/// Writes a Token to the given output stream.
/// (Only declared here; the output format is defined by the implementation.)
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);

/// A half-open interval [Begin, End) of token indices within a stream.
struct Token::Range {
  Index Begin = 0;
  Index End = 0;

  /// Number of tokens covered by the range (End - Begin).
  uint32_t size() const { return End - Begin; }
  /// Builds the zero-length range whose Begin and End are both the given index.
  static Range emptyAt(Index Index) {
    Range Empty;
    Empty.Begin = Index;
    Empty.End = Index;
    return Empty;
  }
};
/// Writes a Token::Range to the given output stream.
/// (Only declared here; the output format is defined by the implementation.)
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);

/// The full sequence of Tokens for one source file.
///
/// A stream either mirrors a raw file from disk or is derived from an earlier
/// stream; e.g. stripping comments from a TokenStream yields a new stream.
///
/// Sentinel 'eof' tokens bracket the sequence, e.g `int main();` becomes:
///       int      main   (        )        ;
///   eof kw_int   ident  l_paren  r_paren  semi   eof
///       front()                           back()
///       0        1      2        3        4      5
class TokenStream {
public:
  /// Creates an empty stream.
  ///
  /// A new stream accepts push() and is not yet finalized; the token sequence
  /// becomes accessible only once finalize() has been called.
  ///
  /// Payload is an opaque object which the stream takes ownership of,
  /// e.g. an allocator providing backing storage for synthesized token text.
  explicit TokenStream(std::shared_ptr<void> Payload = nullptr);

  /// Appends a token. The stream must not have been finalized yet.
  void push(Token T) {
    assert(!isFinalized());
    Storage.push_back(std::move(T));
  }

  /// Finalizes the stream: tokens become accessible, and no more tokens may
  /// be appended afterwards.
  void finalize();
  bool isFinalized() const;

  /// Returns the position of T within the stream.
  ///
  /// T must be a token of this stream, or its end sentinel. The start sentinel
  /// is not allowed.
  Token::Index index(const Token &T) const {
    assert(isFinalized());
    assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
    assert(&T != Storage.data() && "start sentinel");
    // Indices are relative to the first token, i.e. they skip the start
    // sentinel held at the front of Storage.
    const Token *Tok = &T;
    return Tok - Tokens.data();
  }

  /// All tokens of the finalized stream.
  ArrayRef<Token> tokens() const {
    assert(isFinalized());
    return Tokens;
  }
  /// The tokens of the finalized stream covered by the half-open range R.
  ArrayRef<Token> tokens(Token::Range R) const {
    const ArrayRef<Token> All = tokens();
    return All.slice(R.Begin, R.size());
  }

  /// Mutable view of all tokens of the finalized stream.
  MutableArrayRef<Token> tokens() {
    assert(isFinalized());
    return Tokens;
  }

  /// The first token. If the stream is empty this may be the end sentinel.
  const Token &front() const {
    assert(isFinalized());
    // Slot 0 of Storage is the start sentinel, so the first token is slot 1.
    constexpr size_t FirstAfterStartSentinel = 1;
    return Storage[FirstAfterStartSentinel];
  }

  /// Returns the shared payload.
  std::shared_ptr<void> getPayload() const { return Payload; }
  /// Attaches an additional payload to the stream.
  void addPayload(std::shared_ptr<void> P) {
    // Nothing held yet: simply adopt the new payload.
    if (!Payload) {
      Payload = std::move(P);
      return;
    }
    // Otherwise bundle the new and the existing payload into one shared pair,
    // so that the stream keeps both alive.
    using PayloadPair = std::pair<std::shared_ptr<void>, std::shared_ptr<void>>;
    Payload = std::make_shared<PayloadPair>(std::move(P), std::move(Payload));
  }

  /// Prints the tokens of this stream to the output stream.
  ///
  /// Whether newlines/spaces are present is preserved, but not how many.
  void print(llvm::raw_ostream &) const;

private:
  std::shared_ptr<void> Payload;

  MutableArrayRef<Token> Tokens;
  std::vector<Token> Storage; // eof + Tokens + eof
};
/// Writes a TokenStream to the given output stream.
/// (Only declared here; presumably forwards to TokenStream::print - confirm
/// against the implementation.)
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);

/// Extracts a raw token stream from the source code.
///
/// All tokens will reference the data of the provided string.
/// (NOTE(review): this implies the string must outlive the returned stream and
/// any streams derived from it - confirm against callers.)
/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
TokenStream lex(const std::string &, const clang::LangOptions &);
enum class LexFlags : uint8_t {
  /// Set on the token that begins a logical preprocessor line, i.e. a
  /// position at which a directive could start.
  ///
  /// Here, the first # is StartsPPLine, but second is not (same logical line).
  ///   #define X(error) \
  ///   #error // not a directive!
  ///
  /// Careful, the directive may not start exactly on the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 0x01,
  /// Set on tokens that contain trigraphs, escaped newlines, UCNs etc.
  /// The text() of such tokens will contain the raw trigraph.
  NeedsCleaning = 0x02,
};
/// Returns generic language options suitable for lexing/parsing a language.
///
/// With no arguments, the language is C++ and the standard is unspecified.
clang::LangOptions genericLangOpts(
    clang::Language = clang::Language::CXX,
    clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);

/// Decodes raw tokens written in the source code, returning a derived stream.
///
/// - escaped newlines within tokens are removed
/// - trigraphs are replaced with the characters they encode
/// - UCNs within raw_identifiers are replaced by the characters they encode
///   (UCNs within strings, comments etc are not translated)
/// - raw_identifier tokens are assigned their correct keyword type
/// - the >> token is split into separate > > tokens
///   (we use a modified grammar where >> is a nonterminal, not a token)
///
/// The StartsPPLine flag is preserved.
///
/// Cooking is applied to the whole stream, before preprocessing, whereas
/// formally we should only cook raw_identifiers that survive preprocessing.
/// (NOTE(review): reworded from an unclear original sentence - confirm intent.)
/// However, ignoring the Token::Kind of tokens in directives achieves the same.
/// (And having cooked token kinds in PP-disabled sections is useful for us).
TokenStream cook(const TokenStream &, const clang::LangOptions &);

/// Drops comment tokens, returning the result as a new, derived stream.
TokenStream stripComments(const TokenStream &);

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_TOKEN_H

Coverage Report

Created: 2024-01-17 10:31

Line	Count	Source (jump to first uncovered line)
1		//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8		//
9		// Tokens are the first level of abstraction above bytes used in pseudoparsing.
10		// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
11		// The tokens are wrapped into pseudo::Token, along with line/indent info.
12		//
13		// Unlike clang, we make multiple passes over the whole file, out-of-order.
14		// Therefore we retain the whole token sequence in memory. (This is feasible as
15		// we process one file at a time). pseudo::TokenStream holds such a stream.
16		// The initial stream holds the raw tokens read from the file, later passes
17		// operate on derived TokenStreams (e.g. with directives stripped).
18		//
19		// Similar facilities from clang that are *not* used:
20		// - SourceManager: designed around multiple files and precise macro expansion.
21		// - clang::Token: coupled to SourceManager, doesn't retain layout info.
22		// (pseudo::Token is similar, but without SourceLocations).
23		// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
24		// (pseudo::TokenStream is similar, but a flat token list).
25		//
26		//===----------------------------------------------------------------------===//
27
28		#ifndef CLANG_PSEUDO_TOKEN_H
29		#define CLANG_PSEUDO_TOKEN_H
30
31		#include "clang/Basic/LLVM.h"
32		#include "clang/Basic/LangStandard.h"
33		#include "clang/Basic/TokenKinds.h"
34		#include "llvm/ADT/ArrayRef.h"
35		#include "llvm/ADT/STLForwardCompat.h"
36		#include "llvm/Support/raw_ostream.h"
37		#include <cstdint>
38		#include <limits>
39		#include <memory>
40		#include <vector>
41
42		namespace clang {
43		class LangOptions;
44		namespace pseudo {
45
46		/// A single C++ or preprocessor token.
47		///
48		/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
49		/// SourceManager - we are not dealing with multiple files.
50		struct Token {
51		/// An Index identifies a token within a stream.
52		using Index = uint32_t;
53		/// A sentinel Index indicating no token.
54		constexpr static Index Invalid = std::numeric_limits<Index>::max();
55		struct Range;
56
57		/// The token text.
58		///
59		/// Typically from the original source file, but may have been synthesized.
60	43.1M	StringRef text() const { return StringRef(Data, Length); }
61		const char *Data = nullptr;
62		uint32_t Length = 0;
63
64		/// Zero-based line number for the start of the token.
65		/// This refers to the original source file as written.
66		uint32_t Line = 0;
67		/// Width of whitespace before the first token on this line.
68		uint8_t Indent = 0;
69		/// Flags have some meaning defined by the function that produced this stream.
70		uint8_t Flags = 0;
71		/// Index into the original token stream (as raw-lexed from the source code).
72		Index OriginalIndex = Invalid;
73		// Helpers to get/set Flags based on `enum class`.
74	106M	template <class T> bool flag(T Mask) const {
75	106M	return Flags & uint8_t{llvm::to_underlying(Mask)};
76	106M	}
77	1.32M	template <class T> void setFlag(T Mask) {
78	1.32M	Flags \|= uint8_t{llvm::to_underlying(Mask)};
79	1.32M	}
80
81		/// Returns the next token in the stream. this may not be a sentinel.
82	54.5k	const Token &next() const {
83	54.5k	assert(Kind != tok::eof);
84	0	return *(this + 1);
85	54.5k	}
86		/// Returns the next token in the stream, skipping over comments.
87	51.6k	const Token &nextNC() const {
88	51.6k	const Token *T = this;
89	51.6k	do
90	54.5k	T = &T->next();
91	54.5k	while (T->Kind == tok::comment);
92	51.6k	return *T;
93	51.6k	}
94		/// Returns the previous token in the stream. this may not be a sentinel.
95	8.79k	const Token &prev() const {
96	8.79k	assert(Kind != tok::eof);
97	0	return *(this - 1);
98	8.79k	}
99		/// Returns the bracket paired with this one, if any.
100	389k	const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
101
102		/// The type of token as determined by clang's lexer.
103		clang::tok::TokenKind Kind = clang::tok::unknown;
104		/// If this token is a paired bracket, the offset of the pair in the stream.
105		int32_t Pair = 0;
106		};
107		static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!");
108		llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
109
110		/// A half-open range of tokens within a stream.
111		struct Token::Range {
112		Index Begin = 0;
113		Index End = 0;
114
115	0	uint32_t size() const { return End - Begin; }
116	16.1k	static Range emptyAt(Index Index) { return Range{Index, Index}; }
117		};
118		llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
119
120		/// A complete sequence of Tokens representing a source file.
121		///
122		/// This may match a raw file from disk, or be derived from a previous stream.
123		/// For example, stripping comments from a TokenStream results in a new stream.
124		///
125		/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
126		///       int      main   (        )        ;
127		///   eof kw_int   ident  l_paren  r_paren  semi   eof
128		///       front()                           back()
129		///       0        1      2        3        4      5
130		class TokenStream {
131		public:
132		/// Create an empty stream.
133		///
134		/// Initially, the stream is appendable and not finalized.
135		/// The token sequence may only be accessed after finalize() is called.
136		///
137		/// Payload is an opaque object which will be owned by the stream.
138		/// e.g. an allocator to hold backing storage for synthesized token text.
139		explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
140
141		/// Append a token to the stream, which must not be finalized.
142	159M	void push(Token T) {
143	159M	assert(!isFinalized());
144	0	Storage.push_back(std::move(T));
145	159M	}
146
147		/// Finalize the token stream, allowing tokens to be accessed.
148		/// Tokens may no longer be appended.
149		void finalize();
150		bool isFinalized() const;
151
152		/// Returns the index of T within the stream.
153		///
154		/// T must be within the stream or the end sentinel (not the start sentinel).
155	552k	Token::Index index(const Token &T) const {
156	552k	assert(isFinalized());
157	0	assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
158	0	assert(&T != Storage.data() && "start sentinel");
159	0	return &T - Tokens.data();
160	552k	}
161
162	5.43M	ArrayRef<Token> tokens() const {
163	5.43M	assert(isFinalized());
164	0	return Tokens;
165	5.43M	}
166	131k	ArrayRef<Token> tokens(Token::Range R) const {
167	131k	return tokens().slice(R.Begin, R.End - R.Begin);
168	131k	}
169
170	0	MutableArrayRef<Token> tokens() {
171	0	assert(isFinalized());
172	0	return Tokens;
173	0	}
174
175		/// May return the end sentinel if the stream is empty.
176	9.56k	const Token &front() const {
177	9.56k	assert(isFinalized());
178	0	return Storage[1];
179	9.56k	}
180
181		/// Returns the shared payload.
182	19.1k	std::shared_ptr<void> getPayload() const { return Payload; }
183		/// Adds the given payload to the stream.
184	9.56k	void addPayload(std::shared_ptr<void> P) {
185	9.56k	if (!Payload)
186	0	Payload = std::move(P);
187	9.56k	else
188	9.56k	Payload = std::make_shared<
189	9.56k	std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>(
190	9.56k	std::move(P), std::move(Payload));
191	9.56k	}
192
193		/// Print the tokens in this stream to the output stream.
194		///
195		/// The presence of newlines/spaces is preserved, but not the quantity.
196		void print(llvm::raw_ostream &) const;
197
198		private:
199		std::shared_ptr<void> Payload;
200
201		MutableArrayRef<Token> Tokens;
202		std::vector<Token> Storage; // eof + Tokens + eof
203		};
204		llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
205
206		/// Extracts a raw token stream from the source code.
207		///
208		/// All tokens will reference the data of the provided string.
209		/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
210		TokenStream lex(const std::string &, const clang::LangOptions &);
211		enum class LexFlags : uint8_t {
212		/// Marks the token at the start of a logical preprocessor line.
213		/// This is a position where a directive might start.
214		///
215		/// Here, the first # is StartsPPLine, but second is not (same logical line).
216		/// #define X(error) \
217		/// #error // not a directive!
218		///
219		/// Careful, the directive may not start exactly on the StartsPPLine token:
220		/// /*comment*/ #include <foo.h>
221		StartsPPLine = 1 << 0,
222		/// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
223		/// The text() of such tokens will contain the raw trigraph.
224		NeedsCleaning = 1 << 1,
225		};
226		/// A generic lang options suitable for lexing/parsing a language.
227		clang::LangOptions genericLangOpts(
228		clang::Language = clang::Language::CXX,
229		clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
230
231		/// Decoding raw tokens written in the source code, returning a derived stream.
232		///
233		/// - escaped newlines within tokens are removed
234		/// - trigraphs are replaced with the characters they encode
235		/// - UCNs within raw_identifiers are replaced by the characters they encode
236		/// (UCNs within strings, comments etc are not translated)
237		/// - raw_identifier tokens are assigned their correct keyword type
238		/// - the >> token is split into separate > > tokens
239		/// (we use a modified grammar where >> is a nonterminal, not a token)
240		///
241		/// The StartsPPLine flag is preserved.
242		///
243		/// Formally the identifier correctly happens before preprocessing, while we
244		/// should only cook raw_identifiers that survive preprocessing.
245		/// However, ignoring the Token::Kind of tokens in directives achieves the same.
246		/// (And having cooked token kinds in PP-disabled sections is useful for us).
247		TokenStream cook(const TokenStream &, const clang::LangOptions &);
248
249		/// Drops comment tokens.
250		TokenStream stripComments(const TokenStream &);
251
252		} // namespace pseudo
253		} // namespace clang
254
255		#endif // CLANG_PSEUDO_TOKEN_H