/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // Tokens are the first level of abstraction above bytes used in pseudoparsing. |
10 | | // We use clang's lexer to scan the bytes (in raw mode, with no preprocessor). |
11 | | // The tokens are wrapped into pseudo::Token, along with line/indent info. |
12 | | // |
13 | | // Unlike clang, we make multiple passes over the whole file, out-of-order. |
14 | | // Therefore we retain the whole token sequence in memory. (This is feasible as |
15 | | // we process one file at a time). pseudo::TokenStream holds such a stream. |
16 | | // The initial stream holds the raw tokens read from the file, later passes |
17 | | // operate on derived TokenStreams (e.g. with directives stripped). |
18 | | // |
19 | | // Similar facilities from clang that are *not* used: |
20 | | // - SourceManager: designed around multiple files and precise macro expansion. |
21 | | // - clang::Token: coupled to SourceManager, doesn't retain layout info. |
22 | | // (pseudo::Token is similar, but without SourceLocations). |
23 | | // - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros. |
24 | | // (pseudo::TokenStream is similar, but a flat token list). |
25 | | // |
26 | | //===----------------------------------------------------------------------===// |
27 | | |
28 | | #ifndef CLANG_PSEUDO_TOKEN_H |
29 | | #define CLANG_PSEUDO_TOKEN_H |
30 | | |
31 | | #include "clang/Basic/LLVM.h" |
32 | | #include "clang/Basic/LangStandard.h" |
33 | | #include "clang/Basic/TokenKinds.h" |
34 | | #include "llvm/ADT/ArrayRef.h" |
35 | | #include "llvm/ADT/STLForwardCompat.h" |
36 | | #include "llvm/Support/raw_ostream.h" |
37 | | #include <cstdint> |
38 | | #include <limits> |
39 | | #include <memory> |
40 | | #include <vector> |
41 | | |
42 | | namespace clang { |
43 | | class LangOptions; |
44 | | namespace pseudo { |
45 | | |
46 | | /// A single C++ or preprocessor token. |
47 | | /// |
48 | | /// Unlike clang::Token and syntax::Token, these tokens are not connected to a |
49 | | /// SourceManager - we are not dealing with multiple files. |
50 | | struct Token { |
51 | | /// An Index identifies a token within a stream. |
52 | | using Index = uint32_t; |
53 | | /// A sentinel Index indicating no token. |
54 | | constexpr static Index Invalid = std::numeric_limits<Index>::max(); |
55 | | struct Range; |
56 | | |
57 | | /// The token text: a view of Length bytes starting at Data. |
58 | | /// |
59 | | /// Typically from the original source file, but may have been synthesized. |
60 | 43.1M | StringRef text() const { return StringRef(Data, Length); } |
61 | | const char *Data = nullptr; |
62 | | uint32_t Length = 0; |
63 | | |
64 | | /// Zero-based line number for the start of the token. |
65 | | /// This refers to the original source file as written. |
66 | | uint32_t Line = 0; |
67 | | /// Width of whitespace before the first token on this line. |
68 | | uint8_t Indent = 0; |
69 | | /// Flags have some meaning defined by the function that produced this stream. |
70 | | uint8_t Flags = 0; |
71 | | /// Index into the original token stream (as raw-lexed from the source code). |
72 | | Index OriginalIndex = Invalid; |
73 | | // Helpers to test/set bits of Flags, using the values of an `enum class` as bit masks. |
74 | 106M | template <class T> bool flag(T Mask) const { |
75 | 106M | return Flags & uint8_t{llvm::to_underlying(Mask)}; |
76 | 106M | } |
77 | 1.32M | template <class T> void setFlag(T Mask) { |
78 | 1.32M | Flags |= uint8_t{llvm::to_underlying(Mask)}; |
79 | 1.32M | } |
80 | | |
81 | | /// Returns the next token in the stream. `this` must not be an eof sentinel (asserted). |
82 | 54.5k | const Token &next() const { |
83 | 54.5k | assert(Kind != tok::eof); |
84 | 0 | return *(this + 1); |
85 | 54.5k | } |
86 | | /// Returns the next token in the stream, skipping over comments. |
87 | 51.6k | const Token &nextNC() const { |
88 | 51.6k | const Token *T = this; |
89 | 51.6k | do |
90 | 54.5k | T = &T->next(); |
91 | 54.5k | while (T->Kind == tok::comment); |
92 | 51.6k | return *T; |
93 | 51.6k | } |
94 | | /// Returns the previous token in the stream. `this` must not be an eof sentinel (asserted). |
95 | 8.79k | const Token &prev() const { |
96 | 8.79k | assert(Kind != tok::eof); |
97 | 0 | return *(this - 1); |
98 | 8.79k | } |
99 | | /// Returns the bracket paired with this one, or nullptr if there is none (Pair == 0). |
100 | 389k | const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; } |
101 | | |
102 | | /// The type of token as determined by clang's lexer. |
103 | | clang::tok::TokenKind Kind = clang::tok::unknown; |
104 | | /// If this token is a paired bracket, the signed offset from this token to its pair in the stream. Zero means no pair. |
105 | | int32_t Pair = 0; |
106 | | }; |
107 | | static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!"); |
108 | | llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); |
109 | | |
110 | | /// A half-open range [Begin, End) of token indices within a stream. |
111 | | struct Token::Range { |
112 | | Index Begin = 0; |
113 | | Index End = 0; |
114 | | |
115 | 0 | uint32_t size() const { return End - Begin; } |
116 | 16.1k | static Range emptyAt(Index Index) { return Range{Index, Index}; } |
117 | | }; |
118 | | llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &); |
119 | | |
120 | | /// A complete sequence of Tokens representing a source file. |
121 | | /// |
122 | | /// This may match a raw file from disk, or be derived from a previous stream. |
123 | | /// For example, stripping comments from a TokenStream results in a new stream. |
124 | | /// |
125 | | /// A stream has sentinel 'eof' tokens at each end, e.g. `int main();` becomes: |
126 | | ///       int      main   (        )        ; |
127 | | ///   eof kw_int   ident  l_paren  r_paren  semi   eof |
128 | | ///       front()                           back() |
129 | | ///       0        1      2        3        4      5 |
130 | | class TokenStream { |
131 | | public: |
132 | | /// Create an empty stream. |
133 | | /// |
134 | | /// Initially, the stream is appendable and not finalized. |
135 | | /// The token sequence may only be accessed after finalize() is called. |
136 | | /// |
137 | | /// Payload is an opaque object which will be owned by the stream. |
138 | | /// e.g. an allocator to hold backing storage for synthesized token text. |
139 | | explicit TokenStream(std::shared_ptr<void> Payload = nullptr); |
140 | | |
141 | | /// Append a token to the stream, which must not be finalized. |
142 | 159M | void push(Token T) { |
143 | 159M | assert(!isFinalized()); |
144 | 0 | Storage.push_back(std::move(T)); |
145 | 159M | } |
146 | | |
147 | | /// Finalize the token stream, allowing tokens to be accessed. |
148 | | /// Tokens may no longer be appended. |
149 | | void finalize(); |
150 | | bool isFinalized() const; |
151 | | |
152 | | /// Returns the index of T within the stream. |
153 | | /// |
154 | | /// T must be within the stream or the end sentinel (not the start sentinel). |
155 | 552k | Token::Index index(const Token &T) const { |
156 | 552k | assert(isFinalized()); |
157 | 0 | assert(&T >= Storage.data() && &T < Storage.data() + Storage.size()); |
158 | 0 | assert(&T != Storage.data() && "start sentinel"); |
159 | 0 | return &T - Tokens.data(); |
160 | 552k | } |
161 | | |
162 | 5.43M | ArrayRef<Token> tokens() const { |
163 | 5.43M | assert(isFinalized()); |
164 | 0 | return Tokens; |
165 | 5.43M | } |
166 | 131k | ArrayRef<Token> tokens(Token::Range R) const { |
167 | 131k | return tokens().slice(R.Begin, R.End - R.Begin); |
168 | 131k | } |
169 | | |
170 | 0 | MutableArrayRef<Token> tokens() { |
171 | 0 | assert(isFinalized()); |
172 | 0 | return Tokens; |
173 | 0 | } |
174 | | |
175 | | /// Returns the token following the start sentinel. May return the end sentinel if the stream is empty. |
176 | 9.56k | const Token &front() const { |
177 | 9.56k | assert(isFinalized()); |
178 | 0 | return Storage[1]; |
179 | 9.56k | } |
180 | | |
181 | | /// Returns the shared payload. |
182 | 19.1k | std::shared_ptr<void> getPayload() const { return Payload; } |
183 | | /// Adds the given payload to the stream. If a payload is already held, both are retained together in a shared pair. |
184 | 9.56k | void addPayload(std::shared_ptr<void> P) { |
185 | 9.56k | if (!Payload) |
186 | 0 | Payload = std::move(P); |
187 | 9.56k | else |
188 | 9.56k | Payload = std::make_shared< |
189 | 9.56k | std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>( |
190 | 9.56k | std::move(P), std::move(Payload)); |
191 | 9.56k | } |
192 | | |
193 | | /// Print the tokens in this stream to the output stream. |
194 | | /// |
195 | | /// The presence of newlines/spaces is preserved, but not the quantity. |
196 | | void print(llvm::raw_ostream &) const; |
197 | | |
198 | | private: |
199 | | std::shared_ptr<void> Payload; |
200 | | |
201 | | MutableArrayRef<Token> Tokens; |
202 | | std::vector<Token> Storage; // eof + Tokens + eof |
203 | | }; |
204 | | llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &); |
205 | | |
206 | | /// Extracts a raw token stream from the source code. |
207 | | /// |
208 | | /// All tokens will reference the data of the provided string, so the string must outlive the returned stream. |
209 | | /// "Word-like" tokens such as identifiers and keywords will be raw_identifier. |
210 | | TokenStream lex(const std::string &, const clang::LangOptions &); |
211 | | enum class LexFlags : uint8_t { |
212 | | /// Marks the token at the start of a logical preprocessor line. |
213 | | /// This is a position where a directive might start. |
214 | | /// |
215 | | /// Here, the first # is StartsPPLine, but second is not (same logical line). |
216 | | /// #define X(error) \ |
217 | | /// #error // not a directive! |
218 | | /// |
219 | | /// Careful, the directive may not start exactly on the StartsPPLine token: |
220 | | /// /*comment*/ #include <foo.h> |
221 | | StartsPPLine = 1 << 0, |
222 | | /// Marks tokens containing trigraphs, escaped newlines, UCNs etc. |
223 | | /// The text() of such tokens will contain the raw trigraph. |
224 | | NeedsCleaning = 1 << 1, |
225 | | }; |
226 | | /// Generic LangOptions suitable for lexing/parsing a language. |
227 | | clang::LangOptions genericLangOpts( |
228 | | clang::Language = clang::Language::CXX, |
229 | | clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); |
230 | | |
231 | | /// Decodes raw tokens as written in the source code, returning a derived stream. |
232 | | /// |
233 | | /// - escaped newlines within tokens are removed |
234 | | /// - trigraphs are replaced with the characters they encode |
235 | | /// - UCNs within raw_identifiers are replaced by the characters they encode |
236 | | /// (UCNs within strings, comments etc are not translated) |
237 | | /// - raw_identifier tokens are assigned their correct keyword type |
238 | | /// - the >> token is split into separate > > tokens |
239 | | /// (we use a modified grammar where >> is a nonterminal, not a token) |
240 | | /// |
241 | | /// The StartsPPLine flag is preserved. |
242 | | /// |
243 | | /// Here the cooking of identifiers happens before preprocessing, while formally |
244 | | /// we should only cook raw_identifiers that survive preprocessing. |
245 | | /// However, ignoring the Token::Kind of tokens in directives achieves the same. |
246 | | /// (And having cooked token kinds in PP-disabled sections is useful for us). |
247 | | TokenStream cook(const TokenStream &, const clang::LangOptions &); |
248 | | |
249 | | /// Returns a derived stream with comment tokens dropped. |
250 | | TokenStream stripComments(const TokenStream &); |
251 | | |
252 | | } // namespace pseudo |
253 | | } // namespace clang |
254 | | |
255 | | #endif // CLANG_PSEUDO_TOKEN_H |