/src/llvm-project/clang-tools-extra/pseudo/lib/Lex.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "clang-pseudo/Token.h" |
10 | | #include "clang/Basic/IdentifierTable.h" |
11 | | #include "clang/Basic/SourceLocation.h" |
12 | | #include "clang/Basic/TokenKinds.h" |
13 | | #include "clang/Lex/Lexer.h" |
14 | | #include "clang/Lex/LiteralSupport.h" |
15 | | |
16 | | namespace clang { |
17 | | namespace pseudo { |
18 | | |
19 | 9.56k | TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) { |
20 | 9.56k | clang::SourceLocation Start; |
21 | | // Tokenize using clang's lexer in raw mode. |
22 | | // std::string guarantees null-termination, which the lexer needs. |
23 | 9.56k | clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(), |
24 | 9.56k | Code.data() + Code.size()); |
25 | 9.56k | Lexer.SetCommentRetentionState(true); |
26 | | |
27 | 9.56k | TokenStream Result; |
28 | 9.56k | clang::Token CT; |
29 | | // Index into the token stream of original source code. |
30 | 9.56k | Token::Index TokenIndex = 0; |
31 | 9.56k | unsigned LastOffset = 0; |
32 | 9.56k | unsigned Line = 0; |
33 | 9.56k | unsigned Indent = 0; |
34 | 52.9M | for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof; |
35 | 52.9M | Lexer.LexFromRawLexer(CT)) { |
36 | 52.9M | unsigned Offset = |
37 | 52.9M | CT.getLocation().getRawEncoding() - Start.getRawEncoding(); |
38 | | |
39 | 52.9M | Token Tok; |
40 | 52.9M | Tok.Data = &Code[Offset]; |
41 | 52.9M | Tok.Length = CT.getLength(); |
42 | 52.9M | Tok.Kind = CT.getKind(); |
43 | | |
44 | | // Update current line number and indentation from raw source code. |
45 | 52.9M | unsigned NewLineStart = 0; |
46 | 160M | for (unsigned I = LastOffset; I < Offset; ++I) { |
47 | 107M | if (Code[I] == '\n') { |
48 | 1.30M | NewLineStart = I + 1; |
49 | 1.30M | ++Line; |
50 | 1.30M | } |
51 | 107M | } |
52 | 52.9M | if (NewLineStart || !LastOffset) { |
53 | 1.23M | Indent = 0; |
54 | 1.23M | for (char C : StringRef(Code).slice(NewLineStart, Offset)) { |
55 | 332k | if (C == ' ') |
56 | 1.88k | ++Indent; |
57 | 330k | else if (C == '\t') |
58 | 2.86k | Indent += 8; |
59 | 327k | else |
60 | 327k | break; |
61 | 332k | } |
62 | 1.23M | } |
63 | 52.9M | Tok.Indent = Indent; |
64 | 52.9M | Tok.Line = Line; |
65 | | |
66 | 52.9M | if (CT.isAtStartOfLine()) |
67 | 1.17M | Tok.setFlag(LexFlags::StartsPPLine); |
68 | 52.9M | if (CT.needsCleaning() || CT.hasUCN()) |
69 | 155k | Tok.setFlag(LexFlags::NeedsCleaning); |
70 | | |
71 | 52.9M | Tok.OriginalIndex = TokenIndex++; |
72 | 52.9M | Result.push(Tok); |
73 | 52.9M | LastOffset = Offset; |
74 | 52.9M | } |
75 | 9.56k | Result.finalize(); |
76 | 9.56k | return Result; |
77 | 9.56k | } |
78 | | |
79 | 9.56k | TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) { |
80 | 9.56k | auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>(); |
81 | 9.56k | clang::IdentifierTable Identifiers(LangOpts); |
82 | 9.56k | TokenStream Result(CleanedStorage); |
83 | 9.56k | Result.addPayload(Code.getPayload()); |
84 | 52.9M | for (auto Tok : Code.tokens()) { |
85 | 52.9M | if (Tok.flag(LexFlags::NeedsCleaning)) { |
86 | | // Remove escaped newlines and trigraphs. |
87 | 155k | llvm::SmallString<64> CleanBuffer; |
88 | 155k | const char *Pos = Tok.text().begin(); |
89 | 34.4M | while (Pos < Tok.text().end()) { |
90 | 34.3M | auto [Char, CharSize] = |
91 | 34.3M | clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts); |
92 | 34.3M | CleanBuffer.push_back(Char); |
93 | 34.3M | assert(CharSize != 0 && "no progress!"); |
94 | 0 | Pos += CharSize; |
95 | 34.3M | } |
96 | 155k | llvm::StringRef Text = CleanBuffer; |
97 | 155k | llvm::SmallString<64> UCNBuffer; |
98 | | // A surface reading of the standard suggests UCNs might appear anywhere. |
99 | | // But we need only decode them in raw_identifiers. |
100 | | // - they cannot appear in punctuation/keyword tokens, because UCNs |
101 | | // cannot encode basic characters outside of literals [lex.charset] |
102 | | // - they can appear in literals, but we need not unescape them now. |
103 | | // We treat them as escape sequences when evaluating the literal. |
104 | | // - comments are handled similarly to literals |
105 | | // This is good fortune, because expandUCNs requires its input to be a |
106 | | // reasonably valid identifier (e.g. without stray backslashes). |
107 | 155k | if (Tok.Kind == tok::raw_identifier) { |
108 | 13.5k | clang::expandUCNs(UCNBuffer, CleanBuffer); |
109 | 13.5k | Text = UCNBuffer; |
110 | 13.5k | } |
111 | | |
112 | 155k | Tok.Data = Text.copy(*CleanedStorage).data(); |
113 | 155k | Tok.Length = Text.size(); |
114 | 155k | Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning); |
115 | 155k | } |
116 | | |
117 | 52.9M | if (Tok.Kind == tok::raw_identifier) { |
118 | | // Cook raw_identifiers into identifier, keyword, etc. |
119 | 3.12M | Tok.Kind = Identifiers.get(Tok.text()).getTokenID(); |
120 | 49.7M | } else if (Tok.Kind == tok::greatergreater) { |
121 | | // Split the greatergreater token. |
122 | | // FIXME: split lessless token to support Cuda triple angle brackets <<<. |
123 | 228k | assert(Tok.text() == ">>"); |
124 | 0 | Tok.Kind = tok::greater; |
125 | 228k | Tok.Length = 1; |
126 | 228k | Result.push(Tok); |
127 | | // Line is wrong if the first greater is followed by an escaped newline! |
128 | 228k | Tok.Data = Tok.text().data() + 1; |
129 | 228k | } |
130 | | |
131 | 0 | Result.push(std::move(Tok)); |
132 | 52.9M | } |
133 | | |
134 | 9.56k | Result.finalize(); |
135 | 9.56k | return Result; |
136 | 9.56k | } |
137 | | |
138 | | } // namespace pseudo |
139 | | } // namespace clang |