/src/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | /// |
9 | | /// \file |
10 | | /// This file implements FormatTokenLexer, which tokenizes a source file |
11 | | /// into a FormatToken stream suitable for ClangFormat. |
12 | | /// |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "FormatTokenLexer.h" |
16 | | #include "FormatToken.h" |
17 | | #include "clang/Basic/SourceLocation.h" |
18 | | #include "clang/Basic/SourceManager.h" |
19 | | #include "clang/Format/Format.h" |
20 | | #include "llvm/Support/Regex.h" |
21 | | |
22 | | namespace clang { |
23 | | namespace format { |
24 | | |
25 | | FormatTokenLexer::FormatTokenLexer( |
26 | | const SourceManager &SourceMgr, FileID ID, unsigned Column, |
27 | | const FormatStyle &Style, encoding::Encoding Encoding, |
28 | | llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, |
29 | | IdentifierTable &IdentTable) |
30 | | : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), |
31 | | Column(Column), TrailingWhitespace(0), |
32 | | LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), |
33 | | Style(Style), IdentTable(IdentTable), Keywords(IdentTable), |
34 | | Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), |
35 | | FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), |
36 | 4.04k | MacroBlockEndRegex(Style.MacroBlockEnd) { |
37 | 4.04k | Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts)); |
38 | 4.04k | Lex->SetKeepWhitespaceMode(true); |
39 | | |
40 | 12.1k | for (const std::string &ForEachMacro : Style.ForEachMacros) { |
41 | 12.1k | auto Identifier = &IdentTable.get(ForEachMacro); |
42 | 12.1k | Macros.insert({Identifier, TT_ForEachMacro}); |
43 | 12.1k | } |
44 | 4.04k | for (const std::string &IfMacro : Style.IfMacros) { |
45 | 4.04k | auto Identifier = &IdentTable.get(IfMacro); |
46 | 4.04k | Macros.insert({Identifier, TT_IfMacro}); |
47 | 4.04k | } |
48 | 4.04k | for (const std::string &AttributeMacro : Style.AttributeMacros) { |
49 | 4.04k | auto Identifier = &IdentTable.get(AttributeMacro); |
50 | 4.04k | Macros.insert({Identifier, TT_AttributeMacro}); |
51 | 4.04k | } |
52 | 8.08k | for (const std::string &StatementMacro : Style.StatementMacros) { |
53 | 8.08k | auto Identifier = &IdentTable.get(StatementMacro); |
54 | 8.08k | Macros.insert({Identifier, TT_StatementMacro}); |
55 | 8.08k | } |
56 | 4.04k | for (const std::string &TypenameMacro : Style.TypenameMacros) { |
57 | 0 | auto Identifier = &IdentTable.get(TypenameMacro); |
58 | 0 | Macros.insert({Identifier, TT_TypenameMacro}); |
59 | 0 | } |
60 | 4.04k | for (const std::string &NamespaceMacro : Style.NamespaceMacros) { |
61 | 0 | auto Identifier = &IdentTable.get(NamespaceMacro); |
62 | 0 | Macros.insert({Identifier, TT_NamespaceMacro}); |
63 | 0 | } |
64 | 4.04k | for (const std::string &WhitespaceSensitiveMacro : |
65 | 20.2k | Style.WhitespaceSensitiveMacros) { |
66 | 20.2k | auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro); |
67 | 20.2k | Macros.insert({Identifier, TT_UntouchableMacroFunc}); |
68 | 20.2k | } |
69 | 4.04k | for (const std::string &StatementAttributeLikeMacro : |
70 | 4.04k | Style.StatementAttributeLikeMacros) { |
71 | 4.04k | auto Identifier = &IdentTable.get(StatementAttributeLikeMacro); |
72 | 4.04k | Macros.insert({Identifier, TT_StatementAttributeLikeMacro}); |
73 | 4.04k | } |
74 | | |
75 | 4.04k | for (const auto &TypeName : Style.TypeNames) |
76 | 0 | TypeNames.insert(&IdentTable.get(TypeName)); |
77 | 4.04k | } |
78 | | |
79 | 4.04k | ArrayRef<FormatToken *> FormatTokenLexer::lex() { |
80 | 4.04k | assert(Tokens.empty()); |
81 | 0 | assert(FirstInLineIndex == 0); |
82 | 52.9M | do { |
83 | 52.9M | Tokens.push_back(getNextToken()); |
84 | 52.9M | if (Style.isJavaScript()) { |
85 | 0 | tryParseJSRegexLiteral(); |
86 | 0 | handleTemplateStrings(); |
87 | 0 | } |
88 | 52.9M | if (Style.Language == FormatStyle::LK_TextProto) |
89 | 0 | tryParsePythonComment(); |
90 | 52.9M | tryMergePreviousTokens(); |
91 | 52.9M | if (Style.isCSharp()) { |
92 | | // This needs to come after tokens have been merged so that C# |
93 | | // string literals are correctly identified. |
94 | 0 | handleCSharpVerbatimAndInterpolatedStrings(); |
95 | 0 | } |
96 | 52.9M | if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) |
97 | 2.60M | FirstInLineIndex = Tokens.size() - 1; |
98 | 52.9M | } while (Tokens.back()->isNot(tok::eof)); |
99 | 4.04k | return Tokens; |
100 | 4.04k | } |
101 | | |
102 | 52.9M | void FormatTokenLexer::tryMergePreviousTokens() { |
103 | 52.9M | if (tryMerge_TMacro()) |
104 | 0 | return; |
105 | 52.9M | if (tryMergeConflictMarkers()) |
106 | 261k | return; |
107 | 52.7M | if (tryMergeLessLess()) |
108 | 17.6k | return; |
109 | 52.6M | if (tryMergeGreaterGreater()) |
110 | 9 | return; |
111 | 52.6M | if (tryMergeForEach()) |
112 | 0 | return; |
113 | 52.6M | if (Style.isCpp() && tryTransformTryUsageForC()) |
114 | 2.76k | return; |
115 | | |
116 | 52.6M | if (Style.isJavaScript() || Style.isCSharp()) { |
117 | 0 | static const tok::TokenKind NullishCoalescingOperator[] = {tok::question, |
118 | 0 | tok::question}; |
119 | 0 | static const tok::TokenKind NullPropagatingOperator[] = {tok::question, |
120 | 0 | tok::period}; |
121 | 0 | static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater}; |
122 | |
|
123 | 0 | if (tryMergeTokens(FatArrow, TT_FatArrow)) |
124 | 0 | return; |
125 | 0 | if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) { |
126 | | // Treat like the "||" operator (as opposed to the ternary ?). |
127 | 0 | Tokens.back()->Tok.setKind(tok::pipepipe); |
128 | 0 | return; |
129 | 0 | } |
130 | 0 | if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) { |
131 | | // Treat like a regular "." access. |
132 | 0 | Tokens.back()->Tok.setKind(tok::period); |
133 | 0 | return; |
134 | 0 | } |
135 | 0 | if (tryMergeNullishCoalescingEqual()) |
136 | 0 | return; |
137 | 0 | } |
138 | | |
139 | 52.6M | if (Style.isCSharp()) { |
140 | 0 | static const tok::TokenKind CSharpNullConditionalLSquare[] = { |
141 | 0 | tok::question, tok::l_square}; |
142 | |
|
143 | 0 | if (tryMergeCSharpKeywordVariables()) |
144 | 0 | return; |
145 | 0 | if (tryMergeCSharpStringLiteral()) |
146 | 0 | return; |
147 | 0 | if (tryTransformCSharpForEach()) |
148 | 0 | return; |
149 | 0 | if (tryMergeTokens(CSharpNullConditionalLSquare, |
150 | 0 | TT_CSharpNullConditionalLSquare)) { |
151 | | // Treat like a regular "[" operator. |
152 | 0 | Tokens.back()->Tok.setKind(tok::l_square); |
153 | 0 | return; |
154 | 0 | } |
155 | 0 | } |
156 | | |
157 | 52.6M | if (tryMergeNSStringLiteral()) |
158 | 372 | return; |
159 | | |
160 | 52.6M | if (Style.isJavaScript()) { |
161 | 0 | static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal}; |
162 | 0 | static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, |
163 | 0 | tok::equal}; |
164 | 0 | static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater, |
165 | 0 | tok::greaterequal}; |
166 | 0 | static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star}; |
167 | 0 | static const tok::TokenKind JSExponentiationEqual[] = {tok::star, |
168 | 0 | tok::starequal}; |
169 | 0 | static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal}; |
170 | 0 | static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal}; |
171 | | |
172 | | // FIXME: Investigate what token type gives the correct operator priority. |
173 | 0 | if (tryMergeTokens(JSIdentity, TT_BinaryOperator)) |
174 | 0 | return; |
175 | 0 | if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator)) |
176 | 0 | return; |
177 | 0 | if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator)) |
178 | 0 | return; |
179 | 0 | if (tryMergeTokens(JSExponentiation, TT_JsExponentiation)) |
180 | 0 | return; |
181 | 0 | if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) { |
182 | 0 | Tokens.back()->Tok.setKind(tok::starequal); |
183 | 0 | return; |
184 | 0 | } |
185 | 0 | if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) || |
186 | 0 | tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) { |
187 | | // Treat like the "=" assignment operator. |
188 | 0 | Tokens.back()->Tok.setKind(tok::equal); |
189 | 0 | return; |
190 | 0 | } |
191 | 0 | if (tryMergeJSPrivateIdentifier()) |
192 | 0 | return; |
193 | 0 | } |
194 | | |
195 | 52.6M | if (Style.Language == FormatStyle::LK_Java) { |
196 | 0 | static const tok::TokenKind JavaRightLogicalShiftAssign[] = { |
197 | 0 | tok::greater, tok::greater, tok::greaterequal}; |
198 | 0 | if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator)) |
199 | 0 | return; |
200 | 0 | } |
201 | | |
202 | 52.6M | if (Style.isVerilog()) { |
203 | | // Merge the number following a base like `'h?a0`. |
204 | 0 | if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) && |
205 | 0 | Tokens.end()[-2]->is(tok::numeric_constant) && |
206 | 0 | Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier, |
207 | 0 | tok::question) && |
208 | 0 | tryMergeTokens(2, TT_Unknown)) { |
209 | 0 | return; |
210 | 0 | } |
211 | | // Part select. |
212 | 0 | if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}}, |
213 | 0 | TT_BitFieldColon)) { |
214 | 0 | return; |
215 | 0 | } |
216 | | // Xnor. The combined token is treated as a caret which can also be either a |
217 | | // unary or binary operator. The actual type is determined in |
218 | | // TokenAnnotator. We also check the token length so we know it is not |
219 | | // already a merged token. |
220 | 0 | if (Tokens.back()->TokenText.size() == 1 && |
221 | 0 | tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}}, |
222 | 0 | TT_BinaryOperator)) { |
223 | 0 | Tokens.back()->Tok.setKind(tok::caret); |
224 | 0 | return; |
225 | 0 | } |
226 | | // Signed shift and distribution weight. |
227 | 0 | if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) { |
228 | 0 | Tokens.back()->Tok.setKind(tok::lessless); |
229 | 0 | return; |
230 | 0 | } |
231 | 0 | if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) { |
232 | 0 | Tokens.back()->Tok.setKind(tok::greatergreater); |
233 | 0 | return; |
234 | 0 | } |
235 | 0 | if (tryMergeTokensAny({{tok::lessless, tok::equal}, |
236 | 0 | {tok::lessless, tok::lessequal}, |
237 | 0 | {tok::greatergreater, tok::equal}, |
238 | 0 | {tok::greatergreater, tok::greaterequal}, |
239 | 0 | {tok::colon, tok::equal}, |
240 | 0 | {tok::colon, tok::slash}}, |
241 | 0 | TT_BinaryOperator)) { |
242 | 0 | Tokens.back()->ForcedPrecedence = prec::Assignment; |
243 | 0 | return; |
244 | 0 | } |
245 | | // Exponentiation, signed shift, case equality, and wildcard equality. |
246 | 0 | if (tryMergeTokensAny({{tok::star, tok::star}, |
247 | 0 | {tok::lessless, tok::less}, |
248 | 0 | {tok::greatergreater, tok::greater}, |
249 | 0 | {tok::exclaimequal, tok::equal}, |
250 | 0 | {tok::exclaimequal, tok::question}, |
251 | 0 | {tok::equalequal, tok::equal}, |
252 | 0 | {tok::equalequal, tok::question}}, |
253 | 0 | TT_BinaryOperator)) { |
254 | 0 | return; |
255 | 0 | } |
256 | | // Module paths in specify blocks and the implication and boolean equality |
257 | | // operators. |
258 | 0 | if (tryMergeTokensAny({{tok::plusequal, tok::greater}, |
259 | 0 | {tok::plus, tok::star, tok::greater}, |
260 | 0 | {tok::minusequal, tok::greater}, |
261 | 0 | {tok::minus, tok::star, tok::greater}, |
262 | 0 | {tok::less, tok::arrow}, |
263 | 0 | {tok::equal, tok::greater}, |
264 | 0 | {tok::star, tok::greater}, |
265 | 0 | {tok::pipeequal, tok::greater}, |
266 | 0 | {tok::pipe, tok::arrow}, |
267 | 0 | {tok::hash, tok::minus, tok::hash}, |
268 | 0 | {tok::hash, tok::equal, tok::hash}}, |
269 | 0 | TT_BinaryOperator) || |
270 | 0 | Tokens.back()->is(tok::arrow)) { |
271 | 0 | Tokens.back()->ForcedPrecedence = prec::Comma; |
272 | 0 | return; |
273 | 0 | } |
274 | 0 | } |
275 | 52.6M | } |
276 | | |
277 | 52.6M | bool FormatTokenLexer::tryMergeNSStringLiteral() { |
278 | 52.6M | if (Tokens.size() < 2) |
279 | 4.04k | return false; |
280 | 52.6M | auto &At = *(Tokens.end() - 2); |
281 | 52.6M | auto &String = *(Tokens.end() - 1); |
282 | 52.6M | if (At->isNot(tok::at) || String->isNot(tok::string_literal)) |
283 | 52.6M | return false; |
284 | 372 | At->Tok.setKind(tok::string_literal); |
285 | 372 | At->TokenText = StringRef(At->TokenText.begin(), |
286 | 372 | String->TokenText.end() - At->TokenText.begin()); |
287 | 372 | At->ColumnWidth += String->ColumnWidth; |
288 | 372 | At->setType(TT_ObjCStringLiteral); |
289 | 372 | Tokens.erase(Tokens.end() - 1); |
290 | 372 | return true; |
291 | 52.6M | } |
292 | | |
293 | 0 | bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { |
294 | | // Merges #idenfier into a single identifier with the text #identifier |
295 | | // but the token tok::identifier. |
296 | 0 | if (Tokens.size() < 2) |
297 | 0 | return false; |
298 | 0 | auto &Hash = *(Tokens.end() - 2); |
299 | 0 | auto &Identifier = *(Tokens.end() - 1); |
300 | 0 | if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier)) |
301 | 0 | return false; |
302 | 0 | Hash->Tok.setKind(tok::identifier); |
303 | 0 | Hash->TokenText = |
304 | 0 | StringRef(Hash->TokenText.begin(), |
305 | 0 | Identifier->TokenText.end() - Hash->TokenText.begin()); |
306 | 0 | Hash->ColumnWidth += Identifier->ColumnWidth; |
307 | 0 | Hash->setType(TT_JsPrivateIdentifier); |
308 | 0 | Tokens.erase(Tokens.end() - 1); |
309 | 0 | return true; |
310 | 0 | } |
311 | | |
312 | | // Search for verbatim or interpolated string literals @"ABC" or |
313 | | // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to |
314 | | // prevent splitting of @, $ and ". |
315 | | // Merging of multiline verbatim strings with embedded '"' is handled in |
316 | | // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing. |
317 | 0 | bool FormatTokenLexer::tryMergeCSharpStringLiteral() { |
318 | 0 | if (Tokens.size() < 2) |
319 | 0 | return false; |
320 | | |
321 | | // Look for @"aaaaaa" or $"aaaaaa". |
322 | 0 | const auto String = *(Tokens.end() - 1); |
323 | 0 | if (String->isNot(tok::string_literal)) |
324 | 0 | return false; |
325 | | |
326 | 0 | auto Prefix = *(Tokens.end() - 2); |
327 | 0 | if (Prefix->isNot(tok::at) && Prefix->TokenText != "$") |
328 | 0 | return false; |
329 | | |
330 | 0 | if (Tokens.size() > 2) { |
331 | 0 | const auto Tok = *(Tokens.end() - 3); |
332 | 0 | if ((Tok->TokenText == "$" && Prefix->is(tok::at)) || |
333 | 0 | (Tok->is(tok::at) && Prefix->TokenText == "$")) { |
334 | | // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens. |
335 | 0 | Tok->ColumnWidth += Prefix->ColumnWidth; |
336 | 0 | Tokens.erase(Tokens.end() - 2); |
337 | 0 | Prefix = Tok; |
338 | 0 | } |
339 | 0 | } |
340 | | |
341 | | // Convert back into just a string_literal. |
342 | 0 | Prefix->Tok.setKind(tok::string_literal); |
343 | 0 | Prefix->TokenText = |
344 | 0 | StringRef(Prefix->TokenText.begin(), |
345 | 0 | String->TokenText.end() - Prefix->TokenText.begin()); |
346 | 0 | Prefix->ColumnWidth += String->ColumnWidth; |
347 | 0 | Prefix->setType(TT_CSharpStringLiteral); |
348 | 0 | Tokens.erase(Tokens.end() - 1); |
349 | 0 | return true; |
350 | 0 | } |
351 | | |
352 | | // Valid C# attribute targets: |
353 | | // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets |
354 | | const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = { |
355 | | "assembly", "module", "field", "event", "method", |
356 | | "param", "property", "return", "type", |
357 | | }; |
358 | | |
359 | 0 | bool FormatTokenLexer::tryMergeNullishCoalescingEqual() { |
360 | 0 | if (Tokens.size() < 2) |
361 | 0 | return false; |
362 | 0 | auto &NullishCoalescing = *(Tokens.end() - 2); |
363 | 0 | auto &Equal = *(Tokens.end() - 1); |
364 | 0 | if (NullishCoalescing->getType() != TT_NullCoalescingOperator || |
365 | 0 | Equal->isNot(tok::equal)) { |
366 | 0 | return false; |
367 | 0 | } |
368 | 0 | NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens. |
369 | 0 | NullishCoalescing->TokenText = |
370 | 0 | StringRef(NullishCoalescing->TokenText.begin(), |
371 | 0 | Equal->TokenText.end() - NullishCoalescing->TokenText.begin()); |
372 | 0 | NullishCoalescing->ColumnWidth += Equal->ColumnWidth; |
373 | 0 | NullishCoalescing->setType(TT_NullCoalescingEqual); |
374 | 0 | Tokens.erase(Tokens.end() - 1); |
375 | 0 | return true; |
376 | 0 | } |
377 | | |
378 | 0 | bool FormatTokenLexer::tryMergeCSharpKeywordVariables() { |
379 | 0 | if (Tokens.size() < 2) |
380 | 0 | return false; |
381 | 0 | const auto At = *(Tokens.end() - 2); |
382 | 0 | if (At->isNot(tok::at)) |
383 | 0 | return false; |
384 | 0 | const auto Keyword = *(Tokens.end() - 1); |
385 | 0 | if (Keyword->TokenText == "$") |
386 | 0 | return false; |
387 | 0 | if (!Keywords.isCSharpKeyword(*Keyword)) |
388 | 0 | return false; |
389 | | |
390 | 0 | At->Tok.setKind(tok::identifier); |
391 | 0 | At->TokenText = StringRef(At->TokenText.begin(), |
392 | 0 | Keyword->TokenText.end() - At->TokenText.begin()); |
393 | 0 | At->ColumnWidth += Keyword->ColumnWidth; |
394 | 0 | At->setType(Keyword->getType()); |
395 | 0 | Tokens.erase(Tokens.end() - 1); |
396 | 0 | return true; |
397 | 0 | } |
398 | | |
399 | | // In C# transform identifier foreach into kw_foreach |
400 | 0 | bool FormatTokenLexer::tryTransformCSharpForEach() { |
401 | 0 | if (Tokens.size() < 1) |
402 | 0 | return false; |
403 | 0 | auto &Identifier = *(Tokens.end() - 1); |
404 | 0 | if (Identifier->isNot(tok::identifier)) |
405 | 0 | return false; |
406 | 0 | if (Identifier->TokenText != "foreach") |
407 | 0 | return false; |
408 | | |
409 | 0 | Identifier->setType(TT_ForEachMacro); |
410 | 0 | Identifier->Tok.setKind(tok::kw_for); |
411 | 0 | return true; |
412 | 0 | } |
413 | | |
414 | 52.6M | bool FormatTokenLexer::tryMergeForEach() { |
415 | 52.6M | if (Tokens.size() < 2) |
416 | 4.04k | return false; |
417 | 52.6M | auto &For = *(Tokens.end() - 2); |
418 | 52.6M | auto &Each = *(Tokens.end() - 1); |
419 | 52.6M | if (For->isNot(tok::kw_for)) |
420 | 52.6M | return false; |
421 | 798 | if (Each->isNot(tok::identifier)) |
422 | 237 | return false; |
423 | 561 | if (Each->TokenText != "each") |
424 | 561 | return false; |
425 | | |
426 | 0 | For->setType(TT_ForEachMacro); |
427 | 0 | For->Tok.setKind(tok::kw_for); |
428 | |
|
429 | 0 | For->TokenText = StringRef(For->TokenText.begin(), |
430 | 0 | Each->TokenText.end() - For->TokenText.begin()); |
431 | 0 | For->ColumnWidth += Each->ColumnWidth; |
432 | 0 | Tokens.erase(Tokens.end() - 1); |
433 | 0 | return true; |
434 | 561 | } |
435 | | |
436 | 52.6M | bool FormatTokenLexer::tryTransformTryUsageForC() { |
437 | 52.6M | if (Tokens.size() < 2) |
438 | 4.04k | return false; |
439 | 52.6M | auto &Try = *(Tokens.end() - 2); |
440 | 52.6M | if (Try->isNot(tok::kw_try)) |
441 | 52.6M | return false; |
442 | 2.76k | auto &Next = *(Tokens.end() - 1); |
443 | 2.76k | if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment)) |
444 | 0 | return false; |
445 | | |
446 | 2.76k | if (Tokens.size() > 2) { |
447 | 2.76k | auto &At = *(Tokens.end() - 3); |
448 | 2.76k | if (At->is(tok::at)) |
449 | 0 | return false; |
450 | 2.76k | } |
451 | | |
452 | 2.76k | Try->Tok.setKind(tok::identifier); |
453 | 2.76k | return true; |
454 | 2.76k | } |
455 | | |
456 | 52.7M | bool FormatTokenLexer::tryMergeLessLess() { |
457 | | // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. |
458 | 52.7M | if (Tokens.size() < 3) |
459 | 8.08k | return false; |
460 | | |
461 | 52.6M | auto First = Tokens.end() - 3; |
462 | 52.6M | if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less)) |
463 | 52.5M | return false; |
464 | | |
465 | | // Only merge if there currently is no whitespace between the two "<". |
466 | 133k | if (First[1]->hasWhitespaceBefore()) |
467 | 0 | return false; |
468 | | |
469 | 133k | auto X = Tokens.size() > 3 ? First[-1] : nullptr; |
470 | 133k | if (X && X->is(tok::less)) |
471 | 92.1k | return false; |
472 | | |
473 | 41.4k | auto Y = First[2]; |
474 | 41.4k | if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less)) |
475 | 23.7k | return false; |
476 | | |
477 | 17.6k | First[0]->Tok.setKind(tok::lessless); |
478 | 17.6k | First[0]->TokenText = "<<"; |
479 | 17.6k | First[0]->ColumnWidth += 1; |
480 | 17.6k | Tokens.erase(Tokens.end() - 2); |
481 | 17.6k | return true; |
482 | 41.4k | } |
483 | | |
484 | 52.6M | bool FormatTokenLexer::tryMergeGreaterGreater() { |
485 | | // Merge kw_operator,greater,greater into kw_operator,greatergreater. |
486 | 52.6M | if (Tokens.size() < 2) |
487 | 4.04k | return false; |
488 | | |
489 | 52.6M | auto First = Tokens.end() - 2; |
490 | 52.6M | if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater)) |
491 | 51.7M | return false; |
492 | | |
493 | | // Only merge if there currently is no whitespace between the first two ">". |
494 | 975k | if (First[1]->hasWhitespaceBefore()) |
495 | 14.5k | return false; |
496 | | |
497 | 960k | auto Tok = Tokens.size() > 2 ? First[-1] : nullptr; |
498 | 960k | if (Tok && Tok->isNot(tok::kw_operator)) |
499 | 960k | return false; |
500 | | |
501 | 9 | First[0]->Tok.setKind(tok::greatergreater); |
502 | 9 | First[0]->TokenText = ">>"; |
503 | 9 | First[0]->ColumnWidth += 1; |
504 | 9 | Tokens.erase(Tokens.end() - 1); |
505 | 9 | return true; |
506 | 960k | } |
507 | | |
508 | | bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, |
509 | 0 | TokenType NewType) { |
510 | 0 | if (Tokens.size() < Kinds.size()) |
511 | 0 | return false; |
512 | | |
513 | 0 | SmallVectorImpl<FormatToken *>::const_iterator First = |
514 | 0 | Tokens.end() - Kinds.size(); |
515 | 0 | for (unsigned i = 0; i < Kinds.size(); ++i) |
516 | 0 | if (First[i]->isNot(Kinds[i])) |
517 | 0 | return false; |
518 | | |
519 | 0 | return tryMergeTokens(Kinds.size(), NewType); |
520 | 0 | } |
521 | | |
522 | 0 | bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) { |
523 | 0 | if (Tokens.size() < Count) |
524 | 0 | return false; |
525 | | |
526 | 0 | SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count; |
527 | 0 | unsigned AddLength = 0; |
528 | 0 | for (size_t i = 1; i < Count; ++i) { |
529 | | // If there is whitespace separating the token and the previous one, |
530 | | // they should not be merged. |
531 | 0 | if (First[i]->hasWhitespaceBefore()) |
532 | 0 | return false; |
533 | 0 | AddLength += First[i]->TokenText.size(); |
534 | 0 | } |
535 | | |
536 | 0 | Tokens.resize(Tokens.size() - Count + 1); |
537 | 0 | First[0]->TokenText = StringRef(First[0]->TokenText.data(), |
538 | 0 | First[0]->TokenText.size() + AddLength); |
539 | 0 | First[0]->ColumnWidth += AddLength; |
540 | 0 | First[0]->setType(NewType); |
541 | 0 | return true; |
542 | 0 | } |
543 | | |
544 | | bool FormatTokenLexer::tryMergeTokensAny( |
545 | 0 | ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) { |
546 | 0 | return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) { |
547 | 0 | return tryMergeTokens(Kinds, NewType); |
548 | 0 | }); |
549 | 0 | } |
550 | | |
551 | | // Returns \c true if \p Tok can only be followed by an operand in JavaScript. |
552 | 0 | bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { |
553 | | // NB: This is not entirely correct, as an r_paren can introduce an operand |
554 | | // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough |
555 | | // corner case to not matter in practice, though. |
556 | 0 | return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, |
557 | 0 | tok::r_brace, tok::l_square, tok::semi, tok::exclaim, |
558 | 0 | tok::colon, tok::question, tok::tilde) || |
559 | 0 | Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, |
560 | 0 | tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, |
561 | 0 | tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) || |
562 | 0 | Tok->isBinaryOperator(); |
563 | 0 | } |
564 | | |
565 | 0 | bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { |
566 | 0 | if (!Prev) |
567 | 0 | return true; |
568 | | |
569 | | // Regex literals can only follow after prefix unary operators, not after |
570 | | // postfix unary operators. If the '++' is followed by a non-operand |
571 | | // introducing token, the slash here is the operand and not the start of a |
572 | | // regex. |
573 | | // `!` is an unary prefix operator, but also a post-fix operator that casts |
574 | | // away nullability, so the same check applies. |
575 | 0 | if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) |
576 | 0 | return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]); |
577 | | |
578 | | // The previous token must introduce an operand location where regex |
579 | | // literals can occur. |
580 | 0 | if (!precedesOperand(Prev)) |
581 | 0 | return false; |
582 | | |
583 | 0 | return true; |
584 | 0 | } |
585 | | |
586 | | // Tries to parse a JavaScript Regex literal starting at the current token, |
587 | | // if that begins with a slash and is in a location where JavaScript allows |
588 | | // regex literals. Changes the current token to a regex literal and updates |
589 | | // its text if successful. |
590 | 0 | void FormatTokenLexer::tryParseJSRegexLiteral() { |
591 | 0 | FormatToken *RegexToken = Tokens.back(); |
592 | 0 | if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) |
593 | 0 | return; |
594 | | |
595 | 0 | FormatToken *Prev = nullptr; |
596 | 0 | for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) { |
597 | | // NB: Because previous pointers are not initialized yet, this cannot use |
598 | | // Token.getPreviousNonComment. |
599 | 0 | if (FT->isNot(tok::comment)) { |
600 | 0 | Prev = FT; |
601 | 0 | break; |
602 | 0 | } |
603 | 0 | } |
604 | |
|
605 | 0 | if (!canPrecedeRegexLiteral(Prev)) |
606 | 0 | return; |
607 | | |
608 | | // 'Manually' lex ahead in the current file buffer. |
609 | 0 | const char *Offset = Lex->getBufferLocation(); |
610 | 0 | const char *RegexBegin = Offset - RegexToken->TokenText.size(); |
611 | 0 | StringRef Buffer = Lex->getBuffer(); |
612 | 0 | bool InCharacterClass = false; |
613 | 0 | bool HaveClosingSlash = false; |
614 | 0 | for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) { |
615 | | // Regular expressions are terminated with a '/', which can only be |
616 | | // escaped using '\' or a character class between '[' and ']'. |
617 | | // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. |
618 | 0 | switch (*Offset) { |
619 | 0 | case '\\': |
620 | | // Skip the escaped character. |
621 | 0 | ++Offset; |
622 | 0 | break; |
623 | 0 | case '[': |
624 | 0 | InCharacterClass = true; |
625 | 0 | break; |
626 | 0 | case ']': |
627 | 0 | InCharacterClass = false; |
628 | 0 | break; |
629 | 0 | case '/': |
630 | 0 | if (!InCharacterClass) |
631 | 0 | HaveClosingSlash = true; |
632 | 0 | break; |
633 | 0 | } |
634 | 0 | } |
635 | | |
636 | 0 | RegexToken->setType(TT_RegexLiteral); |
637 | | // Treat regex literals like other string_literals. |
638 | 0 | RegexToken->Tok.setKind(tok::string_literal); |
639 | 0 | RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); |
640 | 0 | RegexToken->ColumnWidth = RegexToken->TokenText.size(); |
641 | |
|
642 | 0 | resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); |
643 | 0 | } |
644 | | |
645 | | static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, |
646 | 0 | bool Interpolated) { |
647 | 0 | auto Repeated = [&Begin, End]() { |
648 | 0 | return Begin + 1 < End && Begin[1] == Begin[0]; |
649 | 0 | }; |
650 | | |
651 | | // Look for a terminating '"' in the current file buffer. |
652 | | // Make no effort to format code within an interpolated or verbatim string. |
653 | | // |
654 | | // Interpolated strings could contain { } with " characters inside. |
655 | | // $"{x ?? "null"}" |
656 | | // should not be split into $"{x ?? ", null, "}" but should be treated as a |
657 | | // single string-literal. |
658 | | // |
659 | | // We opt not to try and format expressions inside {} within a C# |
660 | | // interpolated string. Formatting expressions within an interpolated string |
661 | | // would require similar work as that done for JavaScript template strings |
662 | | // in `handleTemplateStrings()`. |
663 | 0 | for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) { |
664 | 0 | switch (*Begin) { |
665 | 0 | case '\\': |
666 | 0 | if (!Verbatim) |
667 | 0 | ++Begin; |
668 | 0 | break; |
669 | 0 | case '{': |
670 | 0 | if (Interpolated) { |
671 | | // {{ inside an interpolated string is escaped, so skip it. |
672 | 0 | if (Repeated()) |
673 | 0 | ++Begin; |
674 | 0 | else |
675 | 0 | ++UnmatchedOpeningBraceCount; |
676 | 0 | } |
677 | 0 | break; |
678 | 0 | case '}': |
679 | 0 | if (Interpolated) { |
680 | | // }} inside an interpolated string is escaped, so skip it. |
681 | 0 | if (Repeated()) |
682 | 0 | ++Begin; |
683 | 0 | else if (UnmatchedOpeningBraceCount > 0) |
684 | 0 | --UnmatchedOpeningBraceCount; |
685 | 0 | else |
686 | 0 | return End; |
687 | 0 | } |
688 | 0 | break; |
689 | 0 | case '"': |
690 | 0 | if (UnmatchedOpeningBraceCount > 0) |
691 | 0 | break; |
692 | | // "" within a verbatim string is an escaped double quote: skip it. |
693 | 0 | if (Verbatim && Repeated()) { |
694 | 0 | ++Begin; |
695 | 0 | break; |
696 | 0 | } |
697 | 0 | return Begin; |
698 | 0 | } |
699 | 0 | } |
700 | | |
701 | 0 | return End; |
702 | 0 | } |
703 | | |
704 | 0 | void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() { |
705 | 0 | FormatToken *CSharpStringLiteral = Tokens.back(); |
706 | |
|
707 | 0 | if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral)) |
708 | 0 | return; |
709 | | |
710 | 0 | auto &TokenText = CSharpStringLiteral->TokenText; |
711 | |
|
712 | 0 | bool Verbatim = false; |
713 | 0 | bool Interpolated = false; |
714 | 0 | if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) { |
715 | 0 | Verbatim = true; |
716 | 0 | Interpolated = true; |
717 | 0 | } else if (TokenText.starts_with(R"(@")")) { |
718 | 0 | Verbatim = true; |
719 | 0 | } else if (TokenText.starts_with(R"($")")) { |
720 | 0 | Interpolated = true; |
721 | 0 | } |
722 | | |
723 | | // Deal with multiline strings. |
724 | 0 | if (!Verbatim && !Interpolated) |
725 | 0 | return; |
726 | | |
727 | 0 | const char *StrBegin = Lex->getBufferLocation() - TokenText.size(); |
728 | 0 | const char *Offset = StrBegin; |
729 | 0 | if (Verbatim && Interpolated) |
730 | 0 | Offset += 3; |
731 | 0 | else |
732 | 0 | Offset += 2; |
733 | |
|
734 | 0 | const auto End = Lex->getBuffer().end(); |
735 | 0 | Offset = lexCSharpString(Offset, End, Verbatim, Interpolated); |
736 | | |
737 | | // Make no attempt to format code properly if a verbatim string is |
738 | | // unterminated. |
739 | 0 | if (Offset >= End) |
740 | 0 | return; |
741 | | |
742 | 0 | StringRef LiteralText(StrBegin, Offset - StrBegin + 1); |
743 | 0 | TokenText = LiteralText; |
744 | | |
745 | | // Adjust width for potentially multiline string literals. |
746 | 0 | size_t FirstBreak = LiteralText.find('\n'); |
747 | 0 | StringRef FirstLineText = FirstBreak == StringRef::npos |
748 | 0 | ? LiteralText |
749 | 0 | : LiteralText.substr(0, FirstBreak); |
750 | 0 | CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs( |
751 | 0 | FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth, |
752 | 0 | Encoding); |
753 | 0 | size_t LastBreak = LiteralText.rfind('\n'); |
754 | 0 | if (LastBreak != StringRef::npos) { |
755 | 0 | CSharpStringLiteral->IsMultiline = true; |
756 | 0 | unsigned StartColumn = 0; |
757 | 0 | CSharpStringLiteral->LastLineColumnWidth = |
758 | 0 | encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1), |
759 | 0 | StartColumn, Style.TabWidth, Encoding); |
760 | 0 | } |
761 | |
|
762 | 0 | assert(Offset < End); |
763 | 0 | resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1))); |
764 | 0 | } |
765 | | |
// Recognizes JavaScript template strings (`...${expr}...`), which the raw
// C++ lexer cannot tokenize. Tracks nesting of template strings and
// interpolated expressions via StateStack, re-lexes the template-string text
// by hand, and turns the backtick token into a string_literal covering it.
void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    // A '{' inside an interpolation opens a nested scope; push NORMAL so the
    // matching '}' below is not mistaken for the end of the interpolation.
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, this '}' closed an interpolation;
    // fall through and continue lexing the surrounding template string.
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    // A backtick opens a new template string.
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // Closing backtick: the template string ends here.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Resume normal lexing right after the consumed text ('`' or '${').
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}
831 | | |
832 | 0 | void FormatTokenLexer::tryParsePythonComment() { |
833 | 0 | FormatToken *HashToken = Tokens.back(); |
834 | 0 | if (!HashToken->isOneOf(tok::hash, tok::hashhash)) |
835 | 0 | return; |
836 | | // Turn the remainder of this line into a comment. |
837 | 0 | const char *CommentBegin = |
838 | 0 | Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#" |
839 | 0 | size_t From = CommentBegin - Lex->getBuffer().begin(); |
840 | 0 | size_t To = Lex->getBuffer().find_first_of('\n', From); |
841 | 0 | if (To == StringRef::npos) |
842 | 0 | To = Lex->getBuffer().size(); |
843 | 0 | size_t Len = To - From; |
844 | 0 | HashToken->setType(TT_LineComment); |
845 | 0 | HashToken->Tok.setKind(tok::comment); |
846 | 0 | HashToken->TokenText = Lex->getBuffer().substr(From, Len); |
847 | 0 | SourceLocation Loc = To < Lex->getBuffer().size() |
848 | 0 | ? Lex->getSourceLocation(CommentBegin + Len) |
849 | 0 | : SourceMgr.getLocForEndOfFile(ID); |
850 | 0 | resetLexer(SourceMgr.getFileOffset(Loc)); |
851 | 0 | } |
852 | | |
853 | 52.9M | bool FormatTokenLexer::tryMerge_TMacro() { |
854 | 52.9M | if (Tokens.size() < 4) |
855 | 12.0k | return false; |
856 | 52.9M | FormatToken *Last = Tokens.back(); |
857 | 52.9M | if (Last->isNot(tok::r_paren)) |
858 | 52.3M | return false; |
859 | | |
860 | 577k | FormatToken *String = Tokens[Tokens.size() - 2]; |
861 | 577k | if (String->isNot(tok::string_literal) || String->IsMultiline) |
862 | 570k | return false; |
863 | | |
864 | 6.63k | if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren)) |
865 | 6.62k | return false; |
866 | | |
867 | 6 | FormatToken *Macro = Tokens[Tokens.size() - 4]; |
868 | 6 | if (Macro->TokenText != "_T") |
869 | 6 | return false; |
870 | | |
871 | 0 | const char *Start = Macro->TokenText.data(); |
872 | 0 | const char *End = Last->TokenText.data() + Last->TokenText.size(); |
873 | 0 | String->TokenText = StringRef(Start, End - Start); |
874 | 0 | String->IsFirst = Macro->IsFirst; |
875 | 0 | String->LastNewlineOffset = Macro->LastNewlineOffset; |
876 | 0 | String->WhitespaceRange = Macro->WhitespaceRange; |
877 | 0 | String->OriginalColumn = Macro->OriginalColumn; |
878 | 0 | String->ColumnWidth = encoding::columnWidthWithTabs( |
879 | 0 | String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); |
880 | 0 | String->NewlinesBefore = Macro->NewlinesBefore; |
881 | 0 | String->HasUnescapedNewline = Macro->HasUnescapedNewline; |
882 | |
|
883 | 0 | Tokens.pop_back(); |
884 | 0 | Tokens.pop_back(); |
885 | 0 | Tokens.pop_back(); |
886 | 0 | Tokens.back() = String; |
887 | 0 | if (FirstInLineIndex >= Tokens.size()) |
888 | 0 | FirstInLineIndex = Tokens.size() - 1; |
889 | 0 | return true; |
890 | 6 | } |
891 | | |
892 | 52.9M | bool FormatTokenLexer::tryMergeConflictMarkers() { |
893 | 52.9M | if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)) |
894 | 50.4M | return false; |
895 | | |
896 | | // Conflict lines look like: |
897 | | // <marker> <text from the vcs> |
898 | | // For example: |
899 | | // >>>>>>> /file/in/file/system at revision 1234 |
900 | | // |
901 | | // We merge all tokens in a line that starts with a conflict marker |
902 | | // into a single token with a special token type that the unwrapped line |
903 | | // parser will use to correctly rebuild the underlying code. |
904 | | |
905 | 2.51M | FileID ID; |
906 | | // Get the position of the first token in the line. |
907 | 2.51M | unsigned FirstInLineOffset; |
908 | 2.51M | std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( |
909 | 2.51M | Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); |
910 | 2.51M | StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer(); |
911 | | // Calculate the offset of the start of the current line. |
912 | 2.51M | auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); |
913 | 2.51M | if (LineOffset == StringRef::npos) |
914 | 9.46k | LineOffset = 0; |
915 | 2.50M | else |
916 | 2.50M | ++LineOffset; |
917 | | |
918 | 2.51M | auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); |
919 | 2.51M | StringRef LineStart; |
920 | 2.51M | if (FirstSpace == StringRef::npos) |
921 | 852 | LineStart = Buffer.substr(LineOffset); |
922 | 2.51M | else |
923 | 2.51M | LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); |
924 | | |
925 | 2.51M | TokenType Type = TT_Unknown; |
926 | 2.51M | if (LineStart == "<<<<<<<" || LineStart == ">>>>") { |
927 | 261k | Type = TT_ConflictStart; |
928 | 2.25M | } else if (LineStart == "|||||||" || LineStart == "=======" || |
929 | 2.25M | LineStart == "====") { |
930 | 42 | Type = TT_ConflictAlternative; |
931 | 2.25M | } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") { |
932 | 57 | Type = TT_ConflictEnd; |
933 | 57 | } |
934 | | |
935 | 2.51M | if (Type != TT_Unknown) { |
936 | 261k | FormatToken *Next = Tokens.back(); |
937 | | |
938 | 261k | Tokens.resize(FirstInLineIndex + 1); |
939 | | // We do not need to build a complete token here, as we will skip it |
940 | | // during parsing anyway (as we must not touch whitespace around conflict |
941 | | // markers). |
942 | 261k | Tokens.back()->setType(Type); |
943 | 261k | Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); |
944 | | |
945 | 261k | Tokens.push_back(Next); |
946 | 261k | return true; |
947 | 261k | } |
948 | | |
949 | 2.25M | return false; |
950 | 2.51M | } |
951 | | |
952 | 725k | FormatToken *FormatTokenLexer::getStashedToken() { |
953 | | // Create a synthesized second '>' or '<' token. |
954 | 725k | Token Tok = FormatTok->Tok; |
955 | 725k | StringRef TokenText = FormatTok->TokenText; |
956 | | |
957 | 725k | unsigned OriginalColumn = FormatTok->OriginalColumn; |
958 | 725k | FormatTok = new (Allocator.Allocate()) FormatToken; |
959 | 725k | FormatTok->Tok = Tok; |
960 | 725k | SourceLocation TokLocation = |
961 | 725k | FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1); |
962 | 725k | FormatTok->Tok.setLocation(TokLocation); |
963 | 725k | FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation); |
964 | 725k | FormatTok->TokenText = TokenText; |
965 | 725k | FormatTok->ColumnWidth = 1; |
966 | 725k | FormatTok->OriginalColumn = OriginalColumn + 1; |
967 | | |
968 | 725k | return FormatTok; |
969 | 725k | } |
970 | | |
971 | | /// Truncate the current token to the new length and make the lexer continue |
972 | | /// from the end of the truncated token. Used for other languages that have |
973 | | /// different token boundaries, like JavaScript in which a comment ends at a |
974 | | /// line break regardless of whether the line break follows a backslash. Also |
975 | | /// used to set the lexer to the end of whitespace if the lexer regards |
976 | | /// whitespace and an unrecognized symbol as one token. |
977 | 777 | void FormatTokenLexer::truncateToken(size_t NewLen) { |
978 | 777 | assert(NewLen <= FormatTok->TokenText.size()); |
979 | 0 | resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation( |
980 | 777 | Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen))); |
981 | 777 | FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen); |
982 | 777 | FormatTok->ColumnWidth = encoding::columnWidthWithTabs( |
983 | 777 | FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth, |
984 | 777 | Encoding); |
985 | 777 | FormatTok->Tok.setLength(NewLen); |
986 | 777 | } |
987 | | |
988 | | /// Count the length of leading whitespace in a token. |
989 | 56.7M | static size_t countLeadingWhitespace(StringRef Text) { |
990 | | // Basically counting the length matched by this regex. |
991 | | // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+" |
992 | | // Directly using the regex turned out to be slow. With the regex |
993 | | // version formatting all files in this directory took about 1.25 |
994 | | // seconds. This version took about 0.5 seconds. |
995 | 56.7M | const unsigned char *const Begin = Text.bytes_begin(); |
996 | 56.7M | const unsigned char *const End = Text.bytes_end(); |
997 | 56.7M | const unsigned char *Cur = Begin; |
998 | 89.9M | while (Cur < End) { |
999 | 85.4M | if (isspace(Cur[0])) { |
1000 | 33.2M | ++Cur; |
1001 | 52.2M | } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) { |
1002 | | // A '\' followed by a newline always escapes the newline, regardless |
1003 | | // of whether there is another '\' before it. |
1004 | | // The source has a null byte at the end. So the end of the entire input |
1005 | | // isn't reached yet. Also the lexer doesn't break apart an escaped |
1006 | | // newline. |
1007 | 1.82k | assert(End - Cur >= 2); |
1008 | 0 | Cur += 2; |
1009 | 52.2M | } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' && |
1010 | 52.2M | (Cur[3] == '\n' || Cur[3] == '\r')) { |
1011 | | // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the |
1012 | | // characters are quoted individually in this comment because if we write |
1013 | | // them together some compilers warn that we have a trigraph in the code. |
1014 | 0 | assert(End - Cur >= 4); |
1015 | 0 | Cur += 4; |
1016 | 52.2M | } else { |
1017 | 52.2M | break; |
1018 | 52.2M | } |
1019 | 85.4M | } |
1020 | 56.7M | return Cur - Begin; |
1021 | 56.7M | } |
1022 | | |
// Produces the next FormatToken: returns a stashed half-token if one is
// pending, otherwise reads raw tokens, folds all leading whitespace into the
// token's bookkeeping fields (NewlinesBefore, LastNewlineOffset,
// WhitespaceRange, OriginalColumn, ColumnWidth), and applies
// language-specific post-processing (JS/Java comment truncation, Verilog
// token rewriting, C++ macro classification).
FormatToken *FormatTokenLexer::getNextToken() {
  // A previously split '>>'/'<<' left its second half stashed; emit it now.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Whitespace carried over from the previous token (e.g. trimmed comment
  // tail) is counted as preceding this token.
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    // Split a token that begins with whitespace but contains more: keep only
    // the whitespace here; the rest is re-lexed.
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    // Walk the whitespace character by character, updating newline counts
    // and the current column.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (guard against TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
      case '?':
      case '/':
        // The text was entirely whitespace when this loop was entered. Thus
        // this has to be an escape sequence.
        assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
               Text.substr(i, 4) == "\?\?/\r" ||
               Text.substr(i, 4) == "\?\?/\n" ||
               (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
                           Text.substr(i - 1, 4) == "\?\?/\n")) ||
               (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
                           Text.substr(i - 2, 4) == "\?\?/\n")));
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  // Tokens the raw lexer could not classify are kept as implicit string
  // literals.
  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
      FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
    size_t BackslashPos = FormatTok->TokenText.find('\\');
    while (BackslashPos != StringRef::npos) {
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
        truncateToken(BackslashPos + 1);
        break;
      }
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (Tokens.size() > 0 &&
                 Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the identifier to its keyword kind, then demote keywords that
    // are plain identifiers in the current language.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->is(tok::greatergreater)) {
    // Split '>>' into '>' now and stash the second '>' for the next call.
    FormatTok->Tok.setKind(tok::greater);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (FormatTok->is(tok::lessless)) {
    // Split '<<' into '<' now and stash the second '<' for the next call.
    FormatTok->Tok.setKind(tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  if (Style.isVerilog() && Tokens.size() > 0 &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    // Classify configured macro names (skipped right after "#define", where
    // the name being defined must not be substituted), macro-block
    // begin/end markers, and configured type names.
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
              tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
    }
  }

  return FormatTok;
}
1259 | | |
1260 | 0 | bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) { |
1261 | | // In Verilog the quote is not a character literal. |
1262 | | // |
1263 | | // Make the backtick and double backtick identifiers to match against them |
1264 | | // more easily. |
1265 | | // |
1266 | | // In Verilog an escaped identifier starts with backslash and ends with |
1267 | | // whitespace. Unless that whitespace is an escaped newline. A backslash can |
1268 | | // also begin an escaped newline outside of an escaped identifier. We check |
1269 | | // for that outside of the Regex since we can't use negative lookhead |
1270 | | // assertions. Simply changing the '*' to '+' breaks stuff as the escaped |
1271 | | // identifier may have a length of 0 according to Section A.9.3. |
1272 | | // FIXME: If there is an escaped newline in the middle of an escaped |
1273 | | // identifier, allow for pasting the two lines together, But escaped |
1274 | | // identifiers usually occur only in generated code anyway. |
1275 | 0 | static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re" |
1276 | 0 | "(\r?\n|\r)|[^[:space:]])*)"); |
1277 | |
|
1278 | 0 | SmallVector<StringRef, 4> Matches; |
1279 | 0 | const char *Start = Lex->getBufferLocation(); |
1280 | 0 | if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start), |
1281 | 0 | &Matches)) { |
1282 | 0 | return false; |
1283 | 0 | } |
1284 | | // There is a null byte at the end of the buffer, so we don't have to check |
1285 | | // Start[1] is within the buffer. |
1286 | 0 | if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n')) |
1287 | 0 | return false; |
1288 | 0 | size_t Len = Matches[0].size(); |
1289 | | |
1290 | | // The kind has to be an identifier so we can match it against those defined |
1291 | | // in Keywords. The kind has to be set before the length because the setLength |
1292 | | // function checks that the kind is not an annotation. |
1293 | 0 | Tok.setKind(tok::raw_identifier); |
1294 | 0 | Tok.setLength(Len); |
1295 | 0 | Tok.setLocation(Lex->getSourceLocation(Start, Len)); |
1296 | 0 | Tok.setRawIdentifierData(Start); |
1297 | 0 | Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false); |
1298 | 0 | return true; |
1299 | 0 | } |
1300 | | |
// Reads one raw token from the lexer into Tok, fixes up its kind for
// formatting purposes (unterminated strings, JS/proto char constants), and
// tracks the clang-format on/off state to mark tokens as Finalized.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  // For Verilog, first see if there is a special token, and fall back to the
  // normal lexer if there isn't one.
  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
    Lex->LexFromRawLexer(Tok.Tok);
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                            Tok.Tok.getLength());
  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.isJavaScript() && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  // Order matters around the Finalized assignment: a "clang-format on"
  // comment is itself formatted (state cleared before the assignment), while
  // a "clang-format off" comment only finalizes the tokens AFTER it (state
  // set after the assignment).
  if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
    FormattingDisabled = true;
}
1330 | | |
1331 | 777 | void FormatTokenLexer::resetLexer(unsigned Offset) { |
1332 | 777 | StringRef Buffer = SourceMgr.getBufferData(ID); |
1333 | 777 | LangOpts = getFormattingLangOpts(Style); |
1334 | 777 | Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts, |
1335 | 777 | Buffer.begin(), Buffer.begin() + Offset, Buffer.end())); |
1336 | 777 | Lex->SetKeepWhitespaceMode(true); |
1337 | 777 | TrailingWhitespace = 0; |
1338 | 777 | } |
1339 | | |
1340 | | } // namespace format |
1341 | | } // namespace clang |