Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
Line
Count
Source (jump to first uncovered line)
1
//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// Tokens are the first level of abstraction above bytes used in pseudoparsing.
10
// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
11
// The tokens are wrapped into pseudo::Token, along with line/indent info.
12
//
13
// Unlike clang, we make multiple passes over the whole file, out-of-order.
14
// Therefore we retain the whole token sequence in memory. (This is feasible as
15
// we process one file at a time). pseudo::TokenStream holds such a stream.
16
// The initial stream holds the raw tokens read from the file, later passes
17
// operate on derived TokenStreams (e.g. with directives stripped).
18
//
19
// Similar facilities from clang that are *not* used:
20
//  - SourceManager: designed around multiple files and precise macro expansion.
21
//  - clang::Token: coupled to SourceManager, doesn't retain layout info.
22
//                  (pseudo::Token is similar, but without SourceLocations).
23
//  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
24
//                  (pseudo::TokenStream is similar, but a flat token list).
25
//
26
//===----------------------------------------------------------------------===//
27
28
#ifndef CLANG_PSEUDO_TOKEN_H
29
#define CLANG_PSEUDO_TOKEN_H
30
31
#include "clang/Basic/LLVM.h"
32
#include "clang/Basic/LangStandard.h"
33
#include "clang/Basic/TokenKinds.h"
34
#include "llvm/ADT/ArrayRef.h"
35
#include "llvm/ADT/STLForwardCompat.h"
36
#include "llvm/Support/raw_ostream.h"
37
#include <cstdint>
38
#include <limits>
39
#include <memory>
40
#include <vector>
41
42
namespace clang {
43
class LangOptions;
44
namespace pseudo {
45
46
/// A single C++ or preprocessor token.
47
///
48
/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
49
/// SourceManager - we are not dealing with multiple files.
50
struct Token {
51
  /// An Index identifies a token within a stream.
52
  using Index = uint32_t;
53
  /// A sentinel Index indicating no token.
54
  constexpr static Index Invalid = std::numeric_limits<Index>::max();
55
  struct Range;
56
57
  /// The token text.
58
  ///
59
  /// Typically from the original source file, but may have been synthesized.
60
43.1M
  StringRef text() const { return StringRef(Data, Length); }
61
  const char *Data = nullptr;
62
  uint32_t Length = 0;
63
64
  /// Zero-based line number for the start of the token.
65
  /// This refers to the original source file as written.
66
  uint32_t Line = 0;
67
  /// Width of whitespace before the first token on this line.
68
  uint8_t Indent = 0;
69
  /// Flags have some meaning defined by the function that produced this stream.
70
  uint8_t Flags = 0;
71
  /// Index into the original token stream (as raw-lexed from the source code).
72
  Index OriginalIndex = Invalid;
73
  // Helpers to get/set Flags based on `enum class`.
74
106M
  template <class T> bool flag(T Mask) const {
75
106M
    return Flags & uint8_t{llvm::to_underlying(Mask)};
76
106M
  }
77
1.32M
  template <class T> void setFlag(T Mask) {
78
1.32M
    Flags |= uint8_t{llvm::to_underlying(Mask)};
79
1.32M
  }
80
81
  /// Returns the next token in the stream. `this` must not be a sentinel.
82
54.5k
  const Token &next() const {
83
54.5k
    assert(Kind != tok::eof);
84
0
    return *(this + 1);
85
54.5k
  }
86
  /// Returns the next token in the stream, skipping over comments.
87
51.6k
  const Token &nextNC() const {
88
51.6k
    const Token *T = this;
89
51.6k
    do
90
54.5k
      T = &T->next();
91
54.5k
    while (T->Kind == tok::comment);
92
51.6k
    return *T;
93
51.6k
  }
94
  /// Returns the previous token in the stream. `this` must not be a sentinel.
95
8.79k
  const Token &prev() const {
96
8.79k
    assert(Kind != tok::eof);
97
0
    return *(this - 1);
98
8.79k
  }
99
  /// Returns the bracket paired with this one, if any.
100
389k
  const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
101
102
  /// The type of token as determined by clang's lexer.
103
  clang::tok::TokenKind Kind = clang::tok::unknown;
104
  /// If this token is a paired bracket, the offset of the pair in the stream.
105
  int32_t Pair = 0;
106
};
107
static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!");
108
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
109
110
/// A half-open range of tokens within a stream.
111
struct Token::Range {
112
  Index Begin = 0;
113
  Index End = 0;
114
115
0
  uint32_t size() const { return End - Begin; }
116
16.1k
  static Range emptyAt(Index Index) { return Range{Index, Index}; }
117
};
118
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
119
120
/// A complete sequence of Tokens representing a source file.
121
///
122
/// This may match a raw file from disk, or be derived from a previous stream.
123
/// For example, stripping comments from a TokenStream results in a new stream.
124
///
125
/// A stream has sentinel 'eof' tokens at each end, e.g. `int main();` becomes:
126
///       int      main   (        )        ;
127
///   eof kw_int   ident  l_paren  r_paren  semi   eof
128
///       front()                           back()
129
///       0        1      2        3        4      5
130
class TokenStream {
131
public:
132
  /// Create an empty stream.
133
  ///
134
  /// Initially, the stream is appendable and not finalized.
135
  /// The token sequence may only be accessed after finalize() is called.
136
  ///
137
  /// Payload is an opaque object which will be owned by the stream.
138
  /// e.g. an allocator to hold backing storage for synthesized token text.
139
  explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
140
141
  /// Append a token to the stream, which must not be finalized.
142
159M
  void push(Token T) {
143
159M
    assert(!isFinalized());
144
0
    Storage.push_back(std::move(T));
145
159M
  }
146
147
  /// Finalize the token stream, allowing tokens to be accessed.
148
  /// Tokens may no longer be appended.
149
  void finalize();
150
  bool isFinalized() const;
151
152
  /// Returns the index of T within the stream.
153
  ///
154
  /// T must be within the stream or the end sentinel (not the start sentinel).
155
552k
  Token::Index index(const Token &T) const {
156
552k
    assert(isFinalized());
157
0
    assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
158
0
    assert(&T != Storage.data() && "start sentinel");
159
0
    return &T - Tokens.data();
160
552k
  }
161
162
5.43M
  ArrayRef<Token> tokens() const {
163
5.43M
    assert(isFinalized());
164
0
    return Tokens;
165
5.43M
  }
166
131k
  ArrayRef<Token> tokens(Token::Range R) const {
167
131k
    return tokens().slice(R.Begin, R.End - R.Begin);
168
131k
  }
169
170
0
  MutableArrayRef<Token> tokens() {
171
0
    assert(isFinalized());
172
0
    return Tokens;
173
0
  }
174
175
  /// May return the end sentinel if the stream is empty.
176
9.56k
  const Token &front() const {
177
9.56k
    assert(isFinalized());
178
0
    return Storage[1];
179
9.56k
  }
180
181
  /// Returns the shared payload.
182
19.1k
  std::shared_ptr<void> getPayload() const { return Payload; }
183
  /// Adds the given payload to the stream.
184
9.56k
  void addPayload(std::shared_ptr<void> P) {
185
9.56k
    if (!Payload)
186
0
      Payload = std::move(P);
187
9.56k
    else
188
9.56k
      Payload = std::make_shared<
189
9.56k
          std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>(
190
9.56k
          std::move(P), std::move(Payload));
191
9.56k
  }
192
193
  /// Print the tokens in this stream to the output stream.
194
  ///
195
  /// The presence of newlines/spaces is preserved, but not the quantity.
196
  void print(llvm::raw_ostream &) const;
197
198
private:
199
  std::shared_ptr<void> Payload;
200
201
  MutableArrayRef<Token> Tokens;
202
  std::vector<Token> Storage; // eof + Tokens + eof
203
};
204
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
205
206
/// Extracts a raw token stream from the source code.
207
///
208
/// All tokens will reference the data of the provided string.
209
/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
210
TokenStream lex(const std::string &, const clang::LangOptions &);
211
enum class LexFlags : uint8_t {
212
  /// Marks the token at the start of a logical preprocessor line.
213
  /// This is a position where a directive might start.
214
  ///
215
  /// Here, the first # is StartsPPLine, but second is not (same logical line).
216
  ///   #define X(error) \
217
  ///   #error // not a directive!
218
  ///
219
  /// Careful, the directive may not start exactly on the StartsPPLine token:
220
  ///   /*comment*/ #include <foo.h>
221
  StartsPPLine = 1 << 0,
222
  /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
223
  /// The text() of such tokens will contain the raw trigraph.
224
  NeedsCleaning = 1 << 1,
225
};
226
/// Generic lang options suitable for lexing/parsing a language.
227
clang::LangOptions genericLangOpts(
228
    clang::Language = clang::Language::CXX,
229
    clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
230
231
/// Decodes raw tokens written in the source code, returning a derived stream.
232
///
233
/// - escaped newlines within tokens are removed
234
/// - trigraphs are replaced with the characters they encode
235
/// - UCNs within raw_identifiers are replaced by the characters they encode
236
///   (UCNs within strings, comments etc are not translated)
237
/// - raw_identifier tokens are assigned their correct keyword type
238
/// - the >> token is split into separate > > tokens
239
///   (we use a modified grammar where >> is a nonterminal, not a token)
240
///
241
/// The StartsPPLine flag is preserved.
242
///
243
/// Formally the identifier correctly happens before preprocessing, while we
244
/// should only cook raw_identifiers that survive preprocessing.
245
/// However, ignoring the Token::Kind of tokens in directives achieves the same.
246
/// (And having cooked token kinds in PP-disabled sections is useful for us).
247
TokenStream cook(const TokenStream &, const clang::LangOptions &);
248
249
/// Drops comment tokens.
250
TokenStream stripComments(const TokenStream &);
251
252
} // namespace pseudo
253
} // namespace clang
254
255
#endif // CLANG_PSEUDO_TOKEN_H