/src/llvm-project/clang/lib/Lex/DependencyDirectivesScanner.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- DependencyDirectivesScanner.cpp ------------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | /// |
9 | | /// \file |
10 | | /// This is the interface for scanning header and source files to get the |
11 | | /// minimum necessary preprocessor directives for evaluating includes. It |
12 | | /// reduces the source down to #define, #include, #import, @import, and any |
13 | | /// conditional preprocessor logic that contains one of those. |
14 | | /// |
15 | | //===----------------------------------------------------------------------===// |
16 | | |
17 | | #include "clang/Lex/DependencyDirectivesScanner.h" |
18 | | #include "clang/Basic/CharInfo.h" |
19 | | #include "clang/Basic/Diagnostic.h" |
20 | | #include "clang/Lex/LexDiagnostic.h" |
21 | | #include "clang/Lex/Lexer.h" |
22 | | #include "clang/Lex/Pragma.h" |
23 | | #include "llvm/ADT/ScopeExit.h" |
24 | | #include "llvm/ADT/SmallString.h" |
25 | | #include "llvm/ADT/StringMap.h" |
26 | | #include "llvm/ADT/StringSwitch.h" |
27 | | #include <optional> |
28 | | |
29 | | using namespace clang; |
30 | | using namespace clang::dependency_directives_scan; |
31 | | using namespace llvm; |
32 | | |
33 | | namespace { |
34 | | |
35 | | struct DirectiveWithTokens { |
36 | | DirectiveKind Kind; |
37 | | unsigned NumTokens; |
38 | | |
39 | | DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens) |
40 | 0 | : Kind(Kind), NumTokens(NumTokens) {} |
41 | | }; |
42 | | |
43 | | /// Does an efficient "scan" of the sources to detect the presence of |
44 | | /// preprocessor (or module import) directives and collects the raw lexed tokens |
45 | | /// for those directives so that the \p Lexer can "replay" them when the file is |
46 | | /// included. |
47 | | /// |
48 | | /// Note that the behavior of the raw lexer is affected by the language mode, |
49 | | /// while at this point we want to do a scan and collect tokens once, |
50 | | /// irrespective of the language mode that the file will get included in. To |
51 | | /// compensate for that the \p Lexer, while "replaying", will adjust a token |
52 | | /// where appropriate, when it could affect the preprocessor's state. |
53 | | /// For example in a directive like |
54 | | /// |
55 | | /// \code |
56 | | /// #if __has_cpp_attribute(clang::fallthrough) |
57 | | /// \endcode |
58 | | /// |
59 | | /// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2 |
60 | | /// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon' |
61 | | /// while in C++ mode. |
struct Scanner {
  // The lexer is constructed over the whole input buffer; the scan itself
  // drives it via seek()/LexFromRawLexer as directives are discovered.
  Scanner(StringRef Input,
          SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
          DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
      : Input(Input), Tokens(Tokens), Diags(Diags),
        InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
        TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
                 Input.end()) {}

  /// Fixed language options used for the language-agnostic scan (see the
  /// struct-level comment about replaying tokens in the real language mode).
  static LangOptions getLangOptsForDepScanning() {
    LangOptions LangOpts;
    // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
    LangOpts.ObjC = true;
    LangOpts.LineComment = true;
    // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
    // R"()" literals.
    return LangOpts;
  }

  /// Lex the provided source and emit the directive tokens.
  ///
  /// \returns True on error.
  bool scan(SmallVectorImpl<Directive> &Directives);

private:
  /// Lexes next token and advances \p First and the \p Lexer.
  [[nodiscard]] dependency_directives_scan::Token &
  lexToken(const char *&First, const char *const End);

  /// Like lexToken, but lexes in include-filename mode so <...> is a single
  /// token.
  dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
                                                        const char *const End);

  void skipLine(const char *&First, const char *const End);
  void skipDirective(StringRef Name, const char *&First, const char *const End);

  /// Returns the spelling of a string literal or identifier after performing
  /// any processing needed to handle \c clang::Token::NeedsCleaning.
  StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);

  /// Lexes next token and if it is identifier returns its string, otherwise
  /// it skips the current line and returns \p std::nullopt.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] std::optional<StringRef>
  tryLexIdentifierOrSkipLine(const char *&First, const char *const End);

  /// Used when it is certain that next token is an identifier.
  [[nodiscard]] StringRef lexIdentifier(const char *&First,
                                        const char *const End);

  /// Lexes next token and returns true iff it is an identifier that matches \p
  /// Id, otherwise it skips the current line and returns false.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
                                                const char *&First,
                                                const char *const End);

  /// Lexes next token and returns true iff it matches the kind \p K.
  /// Otherwise it skips the current line and returns false.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
                                           const char *const End);

  /// Lexes next token and if it is string literal, returns its string.
  /// Otherwise, it skips the current line and returns \p std::nullopt.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] std::optional<StringRef>
  tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);

  // Main scan loop plus per-construct handlers; each returns true on error.
  [[nodiscard]] bool scanImpl(const char *First, const char *const End);
  [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
  [[nodiscard]] bool lexAt(const char *&First, const char *const End);
  [[nodiscard]] bool lexModule(const char *&First, const char *const End);
  [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
                               const char *const End);
  [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
  [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
  [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
  [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
                                const char *const End);
  [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
                                            const char *&First,
                                            const char *const End);
  void lexPPDirectiveBody(const char *&First, const char *const End);

  /// Commits the tokens of the directive currently being lexed into \p Tokens
  /// and records it as a directive of kind \p Kind.
  DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
    Tokens.append(CurDirToks);
    DirsWithToks.emplace_back(Kind, CurDirToks.size());
    CurDirToks.clear();
    return DirsWithToks.back();
  }
  /// Removes the most recently committed directive along with its tokens.
  void popDirective() {
    Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
  }
  /// Kind of the most recently committed directive, or pp_none if none.
  DirectiveKind topDirective() const {
    return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
  }

  // Offset of \p CurPtr within the input buffer.
  unsigned getOffsetAt(const char *CurPtr) const {
    return CurPtr - Input.data();
  }

  /// Reports a diagnostic if the diagnostic engine is provided. Always returns
  /// true at the end.
  bool reportError(const char *CurPtr, unsigned Err);

  // Interned storage for token spellings that needed cleaning (see
  // cleanStringIfNeeded); the map keys own the cleaned strings.
  StringMap<char> SplitIds;
  StringRef Input;
  SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
  DiagnosticsEngine *Diags;
  SourceLocation InputSourceLoc;

  // Start of the most recently seen non-comment token while scanning lines
  // (updated in skipLine/lexPPLine).
  const char *LastTokenPtr = nullptr;
  /// Keeps track of the tokens for the currently lexed directive. Once a
  /// directive is fully lexed and "committed" then the tokens get appended to
  /// \p Tokens and \p CurDirToks is cleared for the next directive.
  SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
  /// The directives that were lexed along with the number of tokens that each
  /// directive contains. The tokens of all the directives are kept in \p Tokens
  /// vector, in the same order as the directives order in \p DirsWithToks.
  SmallVector<DirectiveWithTokens, 64> DirsWithToks;
  LangOptions LangOpts;
  Lexer TheLexer;
};
193 | | |
194 | | } // end anonymous namespace |
195 | | |
196 | 0 | bool Scanner::reportError(const char *CurPtr, unsigned Err) { |
197 | 0 | if (!Diags) |
198 | 0 | return true; |
199 | 0 | assert(CurPtr >= Input.data() && "invalid buffer ptr"); |
200 | 0 | Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err); |
201 | 0 | return true; |
202 | 0 | } |
203 | | |
204 | 0 | static void skipOverSpaces(const char *&First, const char *const End) { |
205 | 0 | while (First != End && isHorizontalWhitespace(*First)) |
206 | 0 | ++First; |
207 | 0 | } |
208 | | |
209 | | [[nodiscard]] static bool isRawStringLiteral(const char *First, |
210 | 0 | const char *Current) { |
211 | 0 | assert(First <= Current); |
212 | | |
213 | | // Check if we can even back up. |
214 | 0 | if (*Current != '"' || First == Current) |
215 | 0 | return false; |
216 | | |
217 | | // Check for an "R". |
218 | 0 | --Current; |
219 | 0 | if (*Current != 'R') |
220 | 0 | return false; |
221 | 0 | if (First == Current || !isAsciiIdentifierContinue(*--Current)) |
222 | 0 | return true; |
223 | | |
224 | | // Check for a prefix of "u", "U", or "L". |
225 | 0 | if (*Current == 'u' || *Current == 'U' || *Current == 'L') |
226 | 0 | return First == Current || !isAsciiIdentifierContinue(*--Current); |
227 | | |
228 | | // Check for a prefix of "u8". |
229 | 0 | if (*Current != '8' || First == Current || *Current-- != 'u') |
230 | 0 | return false; |
231 | 0 | return First == Current || !isAsciiIdentifierContinue(*--Current); |
232 | 0 | } |
233 | | |
// Skips a raw string literal. On entry \p First points at the '"' that opens
// R"delim( ... )delim". Leaves \p First just past the closing '"', or at End
// when the literal is unterminated.
static void skipRawString(const char *&First, const char *const End) {
  assert(First[0] == '"');
  assert(First[-1] == 'R');

  // Collect the delimiter between the '"' and the '('.
  const char *Last = ++First;
  while (Last != End && *Last != '(')
    ++Last;
  if (Last == End) {
    First = Last; // Hit the end... just give up.
    return;
  }

  StringRef Terminator(First, Last - First);
  for (;;) {
    // Move First to just past the next ")".
    First = Last;
    while (First != End && *First != ')')
      ++First;
    if (First == End)
      return;
    ++First;

    // Look ahead for the terminator sequence.
    Last = First;
    while (Last != End && size_t(Last - First) < Terminator.size() &&
           Terminator[Last - First] == *Last)
      ++Last;

    // Check if we hit it (or the end of the file).
    if (Last == End) {
      First = Last;
      return;
    }
    if (size_t(Last - First) < Terminator.size())
      continue;
    if (*Last != '"')
      continue;
    First = Last + 1;
    return;
  }
}
275 | | |
276 | | // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n) |
277 | 0 | static unsigned isEOL(const char *First, const char *const End) { |
278 | 0 | if (First == End) |
279 | 0 | return 0; |
280 | 0 | if (End - First > 1 && isVerticalWhitespace(First[0]) && |
281 | 0 | isVerticalWhitespace(First[1]) && First[0] != First[1]) |
282 | 0 | return 2; |
283 | 0 | return !!isVerticalWhitespace(First[0]); |
284 | 0 | } |
285 | | |
// Skips a (non-raw) string/character literal or an angle-bracketed include
// filename; \p First points at the opening '\'', '"', or '<'. Handles escape
// sequences and backslash line-continuations; stops at a bare newline since
// these literals do not span physical lines.
static void skipString(const char *&First, const char *const End) {
  assert(*First == '\'' || *First == '"' || *First == '<');
  const char Terminator = *First == '<' ? '>' : *First;
  for (++First; First != End && *First != Terminator; ++First) {
    // String and character literals don't extend past the end of the line.
    if (isVerticalWhitespace(*First))
      return;
    if (*First != '\\')
      continue;
    // Skip past backslash to the next character. This ensures that the
    // character right after it is skipped as well, which matters if it's
    // the terminator.
    if (++First == End)
      return;
    if (!isWhitespace(*First))
      continue;
    // Whitespace after the backslash might indicate a line continuation.
    const char *FirstAfterBackslashPastSpace = First;
    skipOverSpaces(FirstAfterBackslashPastSpace, End);
    if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
      // Advance the character pointer to the next line for the next
      // iteration.
      First = FirstAfterBackslashPastSpace + NLSize - 1;
    }
  }
  if (First != End)
    ++First; // Finish off the string.
}
314 | | |
315 | | // Returns the length of the skipped newline |
316 | 0 | static unsigned skipNewline(const char *&First, const char *End) { |
317 | 0 | if (First == End) |
318 | 0 | return 0; |
319 | 0 | assert(isVerticalWhitespace(*First)); |
320 | 0 | unsigned Len = isEOL(First, End); |
321 | 0 | assert(Len && "expected newline"); |
322 | 0 | First += Len; |
323 | 0 | return Len; |
324 | 0 | } |
325 | | |
// True if the newline just skipped (of length \p EOLLen, with \p First now
// positioned past it) was immediately preceded by a '\' line-continuation.
static bool wasLineContinuation(const char *First, unsigned EOLLen) {
  const char *BeforeEOL = First - (int)EOLLen - 1;
  return *BeforeEOL == '\\';
}
329 | | |
// Advances \p First to the next newline that is not preceded by a '\'
// line-continuation, without interpreting strings or comments. Stops at End
// if no such newline exists.
static void skipToNewlineRaw(const char *&First, const char *const End) {
  for (;;) {
    if (First == End)
      return;

    unsigned Len = isEOL(First, End);
    if (Len)
      return;

    do {
      if (++First == End)
        return;
      Len = isEOL(First, End);
    } while (!Len);

    // A backslash directly before the newline continues the logical line;
    // otherwise we are done.
    if (First[-1] != '\\')
      return;

    First += Len;
    // Keep skipping lines...
  }
}
352 | | |
// Skips a "//" comment; \p First must point at the first '/'. Leaves \p First
// at the line's terminating newline (line-continuations extend the comment).
static void skipLineComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '/');
  First += 2;
  skipToNewlineRaw(First, End);
}
358 | | |
// Skips a "/*...*/" block comment; \p First points at the '/'. Leaves \p First
// just past the closing "*/", or at End if the comment is unterminated.
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');
  // Shortest terminated comment is "/**/" (4 chars); anything shorter cannot
  // close before the buffer ends.
  if (End - First < 4) {
    First = End;
    return;
  }
  // Start at index 3 so each iteration can test the (First[-1], First[0])
  // pair for "*/".
  for (First += 3; First != End; ++First)
    if (First[-1] == '*' && First[0] == '/') {
      ++First;
      return;
    }
}
371 | | |
/// \returns True if the current single quotation mark character is a C++ 14
/// digit separator, e.g. the quotes in 1'000'000, which must not be treated
/// as the start of a character literal.
static bool isQuoteCppDigitSeparator(const char *const Start,
                                     const char *const Cur,
                                     const char *const End) {
  assert(*Cur == '\'' && "expected quotation character");
  // skipLine called in places where we don't expect a valid number
  // body before `start` on the same line, so always return false at the start.
  if (Start == Cur)
    return false;
  // The previous character must be a valid PP number character.
  // Make sure that the L, u, U, u8 prefixes don't get marked as a
  // separator though.
  char Prev = *(Cur - 1);
  if (Prev == 'L' || Prev == 'U' || Prev == 'u')
    return false;
  if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
    return false;
  if (!isPreprocessingNumberBody(Prev))
    return false;
  // The next character should be a valid identifier body character.
  return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
}
395 | | |
// Skips to the start of the next logical line, honoring backslash
// line-continuations and stepping over string/character literals and comments
// so that newlines inside them are not mistaken for line ends. Records
// LastTokenPtr for each non-comment token encountered.
void Scanner::skipLine(const char *&First, const char *const End) {
  for (;;) {
    assert(First <= End);
    if (First == End)
      return;

    if (isVerticalWhitespace(*First)) {
      skipNewline(First, End);
      return;
    }
    const char *Start = First;
    while (First != End && !isVerticalWhitespace(*First)) {
      // Iterate over strings correctly to avoid comments and newlines.
      if (*First == '"' ||
          (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
        LastTokenPtr = First;
        if (isRawStringLiteral(Start, First))
          skipRawString(First, End);
        else
          skipString(First, End);
        continue;
      }

      // Iterate over comments correctly.
      if (*First != '/' || End - First < 2) {
        LastTokenPtr = First;
        ++First;
        continue;
      }

      if (First[1] == '/') {
        // "//...".
        skipLineComment(First, End);
        continue;
      }

      if (First[1] != '*') {
        LastTokenPtr = First;
        ++First;
        continue;
      }

      // "/*...*/".
      skipBlockComment(First, End);
    }
    if (First == End)
      return;

    // Skip over the newline.
    unsigned Len = skipNewline(First, End);
    if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
      break;
  }
}
450 | | |
451 | | void Scanner::skipDirective(StringRef Name, const char *&First, |
452 | 0 | const char *const End) { |
453 | 0 | if (llvm::StringSwitch<bool>(Name) |
454 | 0 | .Case("warning", true) |
455 | 0 | .Case("error", true) |
456 | 0 | .Default(false)) |
457 | | // Do not process quotes or comments. |
458 | 0 | skipToNewlineRaw(First, End); |
459 | 0 | else |
460 | 0 | skipLine(First, End); |
461 | 0 | } |
462 | | |
// Skips horizontal whitespace, backslash line-continuations, block comments,
// and a trailing line comment. Stops before any other character (including a
// bare newline).
static void skipWhitespace(const char *&First, const char *const End) {
  for (;;) {
    assert(First <= End);
    skipOverSpaces(First, End);

    if (End - First < 2)
      return;

    // A backslash followed by a newline splices the lines together.
    if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
      skipNewline(++First, End);
      continue;
    }

    // Check for a non-comment character.
    if (First[0] != '/')
      return;

    // "// ...".
    if (First[1] == '/') {
      skipLineComment(First, End);
      return;
    }

    // Cannot be a comment.
    if (First[1] != '*')
      return;

    // "/*...*/".
    skipBlockComment(First, End);
  }
}
494 | | |
// Lexes tokens up to and including the terminating ';' of a module/import
// directive, commits them as a directive of kind \p Kind, and checks that
// only whitespace follows on the line.
//
// \returns true on error (missing ';' before EOF, or trailing tokens).
bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
                                     const char *const End) {
  const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
  for (;;) {
    const dependency_directives_scan::Token &Tok = lexToken(First, End);
    if (Tok.is(tok::eof))
      return reportError(
          DirectiveLoc,
          diag::err_dep_source_scanner_missing_semi_after_at_import);
    if (Tok.is(tok::semi))
      break;
  }
  pushDirective(Kind);
  skipWhitespace(First, End);
  if (First == End)
    return false;
  if (!isVerticalWhitespace(*First))
    return reportError(
        DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
  skipNewline(First, End);
  return false;
}
517 | | |
518 | | dependency_directives_scan::Token &Scanner::lexToken(const char *&First, |
519 | 0 | const char *const End) { |
520 | 0 | clang::Token Tok; |
521 | 0 | TheLexer.LexFromRawLexer(Tok); |
522 | 0 | First = Input.data() + TheLexer.getCurrentBufferOffset(); |
523 | 0 | assert(First <= End); |
524 | | |
525 | 0 | unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); |
526 | 0 | CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), |
527 | 0 | Tok.getFlags()); |
528 | 0 | return CurDirToks.back(); |
529 | 0 | } |
530 | | |
531 | | dependency_directives_scan::Token & |
532 | 0 | Scanner::lexIncludeFilename(const char *&First, const char *const End) { |
533 | 0 | clang::Token Tok; |
534 | 0 | TheLexer.LexIncludeFilename(Tok); |
535 | 0 | First = Input.data() + TheLexer.getCurrentBufferOffset(); |
536 | 0 | assert(First <= End); |
537 | | |
538 | 0 | unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); |
539 | 0 | CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), |
540 | 0 | Tok.getFlags()); |
541 | 0 | return CurDirToks.back(); |
542 | 0 | } |
543 | | |
544 | 0 | void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) { |
545 | 0 | while (true) { |
546 | 0 | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
547 | 0 | if (Tok.is(tok::eod)) |
548 | 0 | break; |
549 | 0 | } |
550 | 0 | } |
551 | | |
StringRef
Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
  // Fast path: the token's spelling is exactly its bytes in the input buffer.
  bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
  if (LLVM_LIKELY(!NeedsCleaning))
    return Input.slice(Tok.Offset, Tok.getEnd());

  // Slow path: rebuild the spelling one (possibly multi-byte) source
  // character at a time.
  SmallString<64> Spelling;
  Spelling.resize(Tok.Length);

  // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
  // in the Lexer). Currently we cannot see them due to our LangOpts.

  unsigned SpellingLength = 0;
  const char *BufPtr = Input.begin() + Tok.Offset;
  const char *AfterIdent = Input.begin() + Tok.getEnd();
  while (BufPtr < AfterIdent) {
    auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[SpellingLength++] = Char;
    BufPtr += Size;
  }

  // Intern the cleaned spelling in SplitIds so the returned StringRef stays
  // valid for the lifetime of the Scanner.
  return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
      .first->first();
}
576 | | |
577 | | std::optional<StringRef> |
578 | 0 | Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) { |
579 | 0 | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
580 | 0 | if (Tok.isNot(tok::raw_identifier)) { |
581 | 0 | if (!Tok.is(tok::eod)) |
582 | 0 | skipLine(First, End); |
583 | 0 | return std::nullopt; |
584 | 0 | } |
585 | | |
586 | 0 | return cleanStringIfNeeded(Tok); |
587 | 0 | } |
588 | | |
589 | 0 | StringRef Scanner::lexIdentifier(const char *&First, const char *const End) { |
590 | 0 | std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End); |
591 | 0 | assert(Id && "expected identifier token"); |
592 | 0 | return *Id; |
593 | 0 | } |
594 | | |
595 | | bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First, |
596 | 0 | const char *const End) { |
597 | 0 | if (std::optional<StringRef> FoundId = |
598 | 0 | tryLexIdentifierOrSkipLine(First, End)) { |
599 | 0 | if (*FoundId == Id) |
600 | 0 | return true; |
601 | 0 | skipLine(First, End); |
602 | 0 | } |
603 | 0 | return false; |
604 | 0 | } |
605 | | |
606 | | bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First, |
607 | 0 | const char *const End) { |
608 | 0 | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
609 | 0 | if (Tok.is(K)) |
610 | 0 | return true; |
611 | 0 | skipLine(First, End); |
612 | 0 | return false; |
613 | 0 | } |
614 | | |
615 | | std::optional<StringRef> |
616 | | Scanner::tryLexStringLiteralOrSkipLine(const char *&First, |
617 | 0 | const char *const End) { |
618 | 0 | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
619 | 0 | if (!tok::isStringLiteral(Tok.Kind)) { |
620 | 0 | if (!Tok.is(tok::eod)) |
621 | 0 | skipLine(First, End); |
622 | 0 | return std::nullopt; |
623 | 0 | } |
624 | | |
625 | 0 | return cleanStringIfNeeded(Tok); |
626 | 0 | } |
627 | | |
628 | 0 | bool Scanner::lexAt(const char *&First, const char *const End) { |
629 | | // Handle "@import". |
630 | | |
631 | | // Lex '@'. |
632 | 0 | const dependency_directives_scan::Token &AtTok = lexToken(First, End); |
633 | 0 | assert(AtTok.is(tok::at)); |
634 | 0 | (void)AtTok; |
635 | |
|
636 | 0 | if (!isNextIdentifierOrSkipLine("import", First, End)) |
637 | 0 | return false; |
638 | 0 | return lexModuleDirectiveBody(decl_at_import, First, End); |
639 | 0 | } |
640 | | |
// Handles a line beginning with "import", "export", or "module", deciding
// whether it is a C++ module/import declaration worth recording as a
// directive.
//
// \returns true on error.
bool Scanner::lexModule(const char *&First, const char *const End) {
  StringRef Id = lexIdentifier(First, End);
  bool Export = false;
  if (Id == "export") {
    Export = true;
    std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
    if (!NextId)
      return false;
    Id = *NextId;
  }

  if (Id != "module" && Id != "import") {
    skipLine(First, End);
    return false;
  }

  skipWhitespace(First, End);

  // Ignore this as a module directive if the next character can't be part of
  // an import.

  switch (*First) {
  case ':':
  case '<':
  case '"':
    break;
  default:
    if (!isAsciiIdentifierContinue(*First)) {
      skipLine(First, End);
      return false;
    }
  }

  // \p First was advanced manually by skipWhitespace above, so re-sync the
  // lexer before lexing the directive body.
  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);

  DirectiveKind Kind;
  if (Id == "module")
    Kind = Export ? cxx_export_module_decl : cxx_module_decl;
  else
    Kind = Export ? cxx_export_import_decl : cxx_import_decl;

  return lexModuleDirectiveBody(Kind, First, End);
}
684 | | |
// Handles the _Pragma("...") operator form: extracts the string, converts it
// to the equivalent #pragma text, and scans that text with a nested Scanner.
//
// \returns true on error.
bool Scanner::lex_Pragma(const char *&First, const char *const End) {
  if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
    return false;

  std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);

  if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
    return false;

  // Rewrite the string literal into #pragma directive form.
  SmallString<64> Buffer(*Str);
  prepare_PragmaString(Buffer);

  // Use a new scanner instance since the tokens will be inside the allocated
  // string. We should already have captured all the relevant tokens in the
  // current scanner.
  SmallVector<dependency_directives_scan::Token> DiscardTokens;
  const char *Begin = Buffer.c_str();
  Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
                        InputSourceLoc};

  PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
  if (PragmaScanner.lexPragma(Begin, Buffer.end()))
    return true;

  // If the nested scan committed nothing, the pragma is irrelevant for
  // dependency scanning; otherwise record it under the same kind.
  DirectiveKind K = PragmaScanner.topDirective();
  if (K == pp_none) {
    skipLine(First, End);
    return false;
  }

  assert(Begin == Buffer.end());
  pushDirective(K);
  return false;
}
719 | | |
// Handles "#pragma ...", recording the pragmas relevant to dependency
// scanning: once, push_macro/pop_macro, include_alias, clang system_header,
// and clang module import. Everything else is skipped.
//
// \returns true on error.
bool Scanner::lexPragma(const char *&First, const char *const End) {
  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
  if (!FoundId)
    return false;

  StringRef Id = *FoundId;
  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
                  .Case("once", pp_pragma_once)
                  .Case("push_macro", pp_pragma_push_macro)
                  .Case("pop_macro", pp_pragma_pop_macro)
                  .Case("include_alias", pp_pragma_include_alias)
                  .Default(pp_none);
  if (Kind != pp_none) {
    lexPPDirectiveBody(First, End);
    pushDirective(Kind);
    return false;
  }

  if (Id != "clang") {
    skipLine(First, End);
    return false;
  }

  FoundId = tryLexIdentifierOrSkipLine(First, End);
  if (!FoundId)
    return false;
  Id = *FoundId;

  // #pragma clang system_header
  if (Id == "system_header") {
    lexPPDirectiveBody(First, End);
    pushDirective(pp_pragma_system_header);
    return false;
  }

  if (Id != "module") {
    skipLine(First, End);
    return false;
  }

  // #pragma clang module.
  if (!isNextIdentifierOrSkipLine("import", First, End))
    return false;

  // #pragma clang module import.
  lexPPDirectiveBody(First, End);
  pushDirective(pp_pragma_import);
  return false;
}
769 | | |
// Handles "#endif", dropping conditional regions that turned out to contain
// no relevant directives so they never appear in the output.
//
// \returns true on error.
bool Scanner::lexEndif(const char *&First, const char *const End) {
  // Strip out "#else" if it's empty.
  if (topDirective() == pp_else)
    popDirective();

  // If "#ifdef" is empty, strip it and skip the "#endif".
  //
  // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
  // we can skip empty `#if` and `#elif` blocks as well after scanning for a
  // literal __has_include in the condition. Even without that rule we could
  // drop the tokens if we scan for identifiers in the condition and find none.
  if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) {
    popDirective();
    skipLine(First, End);
    return false;
  }

  return lexDefault(pp_endif, First, End);
}
789 | | |
// Lexes the remainder of the directive line and commits it as a directive of
// kind \p Kind.
//
// \returns true on error (never; this path cannot fail).
bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
                         const char *const End) {
  lexPPDirectiveBody(First, End);
  pushDirective(Kind);
  return false;
}
796 | | |
// Cheap first-character filter: a line can only hold a dependency directive
// if it starts with '#' or '@', or with the leading letter of "import",
// "export", "module", or "_Pragma".
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm' || First == '_';
}
809 | | |
/// Scan one logical source line, recording a directive if the line is
/// relevant to dependency scanning. \p First is advanced past the line.
/// Returns true on a fatal scanning error, false to continue scanning.
bool Scanner::lexPPLine(const char *&First, const char *const End) {
  assert(First != End);

  skipWhitespace(First, End);
  assert(First <= End);
  if (First == End)
    return false;

  // Cheap single-character filter: lines that cannot start a directive,
  // module keyword, "@import", or "_Pragma" are skipped wholesale.
  if (!isStartOfRelevantLine(*First)) {
    skipLine(First, End);
    assert(First <= End);
    return false;
  }

  // Remember where the last potentially-relevant token started; scan() uses
  // this to decide whether to emit tokens_present_before_eof.
  LastTokenPtr = First;

  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);

  auto ScEx1 = make_scope_exit([&]() {
    /// Clear Scanner's CurDirToks before returning, in case we didn't push a
    /// new directive.
    CurDirToks.clear();
  });

  // Handle "@import".
  if (*First == '@')
    return lexAt(First, End);

  // Possible C++ module keyword ("import"/"export"/"module") at line start.
  if (*First == 'i' || *First == 'e' || *First == 'm')
    return lexModule(First, End);

  if (*First == '_') {
    if (isNextIdentifierOrSkipLine("_Pragma", First, End))
      return lex_Pragma(First, End);
    return false;
  }

  // Handle preprocessing directives.

  // Put the lexer into directive mode for the rest of this line; the scope
  // exit restores normal lexing no matter which path returns below.
  TheLexer.setParsingPreprocessorDirective(true);
  auto ScEx2 = make_scope_exit(
      [&]() { TheLexer.setParsingPreprocessorDirective(false); });

  // Lex '#'.
  const dependency_directives_scan::Token &HashTok = lexToken(First, End);
  if (HashTok.is(tok::hashhash)) {
    // A \p tok::hashhash at this location is passed by the preprocessor to the
    // parser to interpret, like any other token. So for dependency scanning
    // skip it like a normal token not affecting the preprocessor.
    skipLine(First, End);
    assert(First <= End);
    return false;
  }
  assert(HashTok.is(tok::hash));
  (void)HashTok;

  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
  if (!FoundId)
    return false;

  StringRef Id = *FoundId;

  // "#pragma" has its own sub-grammar (e.g. "#pragma clang module import").
  if (Id == "pragma")
    return lexPragma(First, End);

  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
                  .Case("include", pp_include)
                  .Case("__include_macros", pp___include_macros)
                  .Case("define", pp_define)
                  .Case("undef", pp_undef)
                  .Case("import", pp_import)
                  .Case("include_next", pp_include_next)
                  .Case("if", pp_if)
                  .Case("ifdef", pp_ifdef)
                  .Case("ifndef", pp_ifndef)
                  .Case("elif", pp_elif)
                  .Case("elifdef", pp_elifdef)
                  .Case("elifndef", pp_elifndef)
                  .Case("else", pp_else)
                  .Case("endif", pp_endif)
                  .Default(pp_none);
  if (Kind == pp_none) {
    // Unrecognized directive: irrelevant for dependency scanning.
    skipDirective(Id, First, End);
    return false;
  }

  // "#endif" may strip an empty conditional block; handle it specially.
  if (Kind == pp_endif)
    return lexEndif(First, End);

  switch (Kind) {
  case pp_include:
  case pp___include_macros:
  case pp_include_next:
  case pp_import:
    // Lex the filename in header-name mode (e.g. <...> as one token) before
    // recording the rest of the directive body.
    lexIncludeFilename(First, End);
    break;
  default:
    break;
  }

  // Everything else.
  return lexDefault(Kind, First, End);
}
913 | | |
/// Advance \p First past a UTF-8 byte-order mark (EF BB BF), if the input
/// begins with one; otherwise leave it untouched.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  constexpr char BOM[3] = {'\xef', '\xbb', '\xbf'};
  const bool HasBOM = (End - First) >= 3 && First[0] == BOM[0] &&
                      First[1] == BOM[1] && First[2] == BOM[2];
  if (HasBOM)
    First += 3;
}
919 | | |
920 | 0 | bool Scanner::scanImpl(const char *First, const char *const End) { |
921 | 0 | skipUTF8ByteOrderMark(First, End); |
922 | 0 | while (First != End) |
923 | 0 | if (lexPPLine(First, End)) |
924 | 0 | return true; |
925 | 0 | return false; |
926 | 0 | } |
927 | | |
928 | 0 | bool Scanner::scan(SmallVectorImpl<Directive> &Directives) { |
929 | 0 | bool Error = scanImpl(Input.begin(), Input.end()); |
930 | |
|
931 | 0 | if (!Error) { |
932 | | // Add an EOF on success. |
933 | 0 | if (LastTokenPtr && |
934 | 0 | (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset)) |
935 | 0 | pushDirective(tokens_present_before_eof); |
936 | 0 | pushDirective(pp_eof); |
937 | 0 | } |
938 | |
|
939 | 0 | ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens; |
940 | 0 | for (const DirectiveWithTokens &DirWithToks : DirsWithToks) { |
941 | 0 | assert(RemainingTokens.size() >= DirWithToks.NumTokens); |
942 | 0 | Directives.emplace_back(DirWithToks.Kind, |
943 | 0 | RemainingTokens.take_front(DirWithToks.NumTokens)); |
944 | 0 | RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens); |
945 | 0 | } |
946 | 0 | assert(RemainingTokens.empty()); |
947 | | |
948 | 0 | return Error; |
949 | 0 | } |
950 | | |
951 | | bool clang::scanSourceForDependencyDirectives( |
952 | | StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, |
953 | | SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags, |
954 | 0 | SourceLocation InputSourceLoc) { |
955 | 0 | return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives); |
956 | 0 | } |
957 | | |
958 | | void clang::printDependencyDirectivesAsSource( |
959 | | StringRef Source, |
960 | | ArrayRef<dependency_directives_scan::Directive> Directives, |
961 | 0 | llvm::raw_ostream &OS) { |
962 | | // Add a space separator where it is convenient for testing purposes. |
963 | 0 | auto needsSpaceSeparator = |
964 | 0 | [](tok::TokenKind Prev, |
965 | 0 | const dependency_directives_scan::Token &Tok) -> bool { |
966 | 0 | if (Prev == Tok.Kind) |
967 | 0 | return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square, |
968 | 0 | tok::r_square); |
969 | 0 | if (Prev == tok::raw_identifier && |
970 | 0 | Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal, |
971 | 0 | tok::char_constant, tok::header_name)) |
972 | 0 | return true; |
973 | 0 | if (Prev == tok::r_paren && |
974 | 0 | Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal, |
975 | 0 | tok::char_constant, tok::unknown)) |
976 | 0 | return true; |
977 | 0 | if (Prev == tok::comma && |
978 | 0 | Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less)) |
979 | 0 | return true; |
980 | 0 | return false; |
981 | 0 | }; |
982 | |
|
983 | 0 | for (const dependency_directives_scan::Directive &Directive : Directives) { |
984 | 0 | if (Directive.Kind == tokens_present_before_eof) |
985 | 0 | OS << "<TokBeforeEOF>"; |
986 | 0 | std::optional<tok::TokenKind> PrevTokenKind; |
987 | 0 | for (const dependency_directives_scan::Token &Tok : Directive.Tokens) { |
988 | 0 | if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok)) |
989 | 0 | OS << ' '; |
990 | 0 | PrevTokenKind = Tok.Kind; |
991 | 0 | OS << Source.slice(Tok.Offset, Tok.getEnd()); |
992 | 0 | } |
993 | 0 | } |
994 | 0 | } |