/src/llvm-project/clang/lib/AST/CommentLexer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- CommentLexer.cpp -------------------------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "clang/AST/CommentLexer.h" |
10 | | #include "clang/AST/CommentCommandTraits.h" |
11 | | #include "clang/AST/CommentDiagnostic.h" |
12 | | #include "clang/Basic/CharInfo.h" |
13 | | #include "llvm/ADT/StringExtras.h" |
14 | | #include "llvm/ADT/StringSwitch.h" |
15 | | #include "llvm/Support/ConvertUTF.h" |
16 | | #include "llvm/Support/ErrorHandling.h" |
17 | | |
18 | | namespace clang { |
19 | | namespace comments { |
20 | | |
21 | 0 | void Token::dump(const Lexer &L, const SourceManager &SM) const { |
22 | 0 | llvm::errs() << "comments::Token Kind=" << Kind << " "; |
23 | 0 | Loc.print(llvm::errs(), SM); |
24 | 0 | llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; |
25 | 0 | } |
26 | | |
27 | 0 | static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { |
28 | 0 | return isLetter(C); |
29 | 0 | } |
30 | | |
31 | 0 | static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { |
32 | 0 | return isDigit(C); |
33 | 0 | } |
34 | | |
35 | 0 | static inline bool isHTMLHexCharacterReferenceCharacter(char C) { |
36 | 0 | return isHexDigit(C); |
37 | 0 | } |
38 | | |
39 | | static inline StringRef convertCodePointToUTF8( |
40 | | llvm::BumpPtrAllocator &Allocator, |
41 | 0 | unsigned CodePoint) { |
42 | 0 | char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); |
43 | 0 | char *ResolvedPtr = Resolved; |
44 | 0 | if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) |
45 | 0 | return StringRef(Resolved, ResolvedPtr - Resolved); |
46 | 0 | else |
47 | 0 | return StringRef(); |
48 | 0 | } |
49 | | |
50 | | namespace { |
51 | | |
52 | | #include "clang/AST/CommentHTMLTags.inc" |
53 | | #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" |
54 | | |
55 | | } // end anonymous namespace |
56 | | |
57 | 0 | StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { |
58 | | // Fast path, first check a few most widely used named character references. |
59 | 0 | return llvm::StringSwitch<StringRef>(Name) |
60 | 0 | .Case("amp", "&") |
61 | 0 | .Case("lt", "<") |
62 | 0 | .Case("gt", ">") |
63 | 0 | .Case("quot", "\"") |
64 | 0 | .Case("apos", "\'") |
65 | | // Slow path. |
66 | 0 | .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); |
67 | 0 | } |
68 | | |
69 | 0 | StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { |
70 | 0 | unsigned CodePoint = 0; |
71 | 0 | for (unsigned i = 0, e = Name.size(); i != e; ++i) { |
72 | 0 | assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); |
73 | 0 | CodePoint *= 10; |
74 | 0 | CodePoint += Name[i] - '0'; |
75 | 0 | } |
76 | 0 | return convertCodePointToUTF8(Allocator, CodePoint); |
77 | 0 | } |
78 | | |
79 | 0 | StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { |
80 | 0 | unsigned CodePoint = 0; |
81 | 0 | for (unsigned i = 0, e = Name.size(); i != e; ++i) { |
82 | 0 | CodePoint *= 16; |
83 | 0 | const char C = Name[i]; |
84 | 0 | assert(isHTMLHexCharacterReferenceCharacter(C)); |
85 | 0 | CodePoint += llvm::hexDigitValue(C); |
86 | 0 | } |
87 | 0 | return convertCodePointToUTF8(Allocator, CodePoint); |
88 | 0 | } |
89 | | |
90 | 0 | void Lexer::skipLineStartingDecorations() { |
91 | | // This function should be called only for C comments |
92 | 0 | assert(CommentState == LCS_InsideCComment); |
93 | | |
94 | 0 | if (BufferPtr == CommentEnd) |
95 | 0 | return; |
96 | | |
97 | 0 | const char *NewBufferPtr = BufferPtr; |
98 | 0 | while (isHorizontalWhitespace(*NewBufferPtr)) |
99 | 0 | if (++NewBufferPtr == CommentEnd) |
100 | 0 | return; |
101 | 0 | if (*NewBufferPtr == '*') |
102 | 0 | BufferPtr = NewBufferPtr + 1; |
103 | 0 | } |
104 | | |
105 | | namespace { |
106 | | /// Returns pointer to the first newline character in the string. |
107 | 0 | const char *findNewline(const char *BufferPtr, const char *BufferEnd) { |
108 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
109 | 0 | if (isVerticalWhitespace(*BufferPtr)) |
110 | 0 | return BufferPtr; |
111 | 0 | } |
112 | 0 | return BufferEnd; |
113 | 0 | } |
114 | | |
/// Skips one newline sequence ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \returns the pointer just past the newline, or \p BufferPtr unchanged when
/// it already equals \p BufferEnd. A non-empty input must start with '\n' or
/// '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *AfterCR = BufferPtr + 1;
  // Treat "\r\n" as a single newline.
  if (AfterCR != BufferEnd && *AfterCR == '\n')
    return AfterCR + 1;
  return AfterCR;
}
129 | | |
130 | | const char *skipNamedCharacterReference(const char *BufferPtr, |
131 | 0 | const char *BufferEnd) { |
132 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
133 | 0 | if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) |
134 | 0 | return BufferPtr; |
135 | 0 | } |
136 | 0 | return BufferEnd; |
137 | 0 | } |
138 | | |
139 | | const char *skipDecimalCharacterReference(const char *BufferPtr, |
140 | 0 | const char *BufferEnd) { |
141 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
142 | 0 | if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) |
143 | 0 | return BufferPtr; |
144 | 0 | } |
145 | 0 | return BufferEnd; |
146 | 0 | } |
147 | | |
148 | | const char *skipHexCharacterReference(const char *BufferPtr, |
149 | 0 | const char *BufferEnd) { |
150 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
151 | 0 | if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) |
152 | 0 | return BufferPtr; |
153 | 0 | } |
154 | 0 | return BufferEnd; |
155 | 0 | } |
156 | | |
157 | 0 | bool isHTMLIdentifierStartingCharacter(char C) { |
158 | 0 | return isLetter(C); |
159 | 0 | } |
160 | | |
161 | 0 | bool isHTMLIdentifierCharacter(char C) { |
162 | 0 | return isAlphanumeric(C); |
163 | 0 | } |
164 | | |
165 | 0 | const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { |
166 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
167 | 0 | if (!isHTMLIdentifierCharacter(*BufferPtr)) |
168 | 0 | return BufferPtr; |
169 | 0 | } |
170 | 0 | return BufferEnd; |
171 | 0 | } |
172 | | |
/// Skips an HTML string quoted in single or double quotes. \p BufferPtr must
/// point at the opening quote. A quote character immediately preceded by a
/// backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delim = *BufferPtr;
  assert(Delim == '\"' || Delim == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool PrecededByBackslash = Cur[-1] == '\\';
    if (*Cur == Delim && !PrecededByBackslash)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
190 | | |
191 | 0 | const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { |
192 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
193 | 0 | if (!isWhitespace(*BufferPtr)) |
194 | 0 | return BufferPtr; |
195 | 0 | } |
196 | 0 | return BufferEnd; |
197 | 0 | } |
198 | | |
199 | 0 | bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { |
200 | 0 | return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; |
201 | 0 | } |
202 | | |
203 | 0 | bool isCommandNameStartCharacter(char C) { |
204 | 0 | return isLetter(C); |
205 | 0 | } |
206 | | |
207 | 0 | bool isCommandNameCharacter(char C) { |
208 | 0 | return isAlphanumeric(C); |
209 | 0 | } |
210 | | |
211 | 0 | const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { |
212 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
213 | 0 | if (!isCommandNameCharacter(*BufferPtr)) |
214 | 0 | return BufferPtr; |
215 | 0 | } |
216 | 0 | return BufferEnd; |
217 | 0 | } |
218 | | |
219 | | /// Return the one past end pointer for BCPL comments. |
220 | | /// Handles newlines escaped with backslash or trigraph for backslahs. |
221 | 0 | const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
222 | 0 | const char *CurPtr = BufferPtr; |
223 | 0 | while (CurPtr != BufferEnd) { |
224 | 0 | while (!isVerticalWhitespace(*CurPtr)) { |
225 | 0 | CurPtr++; |
226 | 0 | if (CurPtr == BufferEnd) |
227 | 0 | return BufferEnd; |
228 | 0 | } |
229 | | // We found a newline, check if it is escaped. |
230 | 0 | const char *EscapePtr = CurPtr - 1; |
231 | 0 | while(isHorizontalWhitespace(*EscapePtr)) |
232 | 0 | EscapePtr--; |
233 | |
|
234 | 0 | if (*EscapePtr == '\\' || |
235 | 0 | (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && |
236 | 0 | EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { |
237 | | // We found an escaped newline. |
238 | 0 | CurPtr = skipNewline(CurPtr, BufferEnd); |
239 | 0 | } else |
240 | 0 | return CurPtr; // Not an escaped newline. |
241 | 0 | } |
242 | 0 | return BufferEnd; |
243 | 0 | } |
244 | | |
245 | | /// Return the one past end pointer for C comments. |
246 | | /// Very dumb, does not handle escaped newlines or trigraphs. |
247 | 0 | const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
248 | 0 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
249 | 0 | if (*BufferPtr == '*') { |
250 | 0 | assert(BufferPtr + 1 != BufferEnd); |
251 | 0 | if (*(BufferPtr + 1) == '/') |
252 | 0 | return BufferPtr; |
253 | 0 | } |
254 | 0 | } |
255 | 0 | llvm_unreachable("buffer end hit before '*/' was seen"); |
256 | 0 | } |
257 | | |
258 | | } // end anonymous namespace |
259 | | |
260 | | void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, |
261 | 0 | tok::TokenKind Kind) { |
262 | 0 | const unsigned TokLen = TokEnd - BufferPtr; |
263 | 0 | Result.setLocation(getSourceLocation(BufferPtr)); |
264 | 0 | Result.setKind(Kind); |
265 | 0 | Result.setLength(TokLen); |
266 | 0 | #ifndef NDEBUG |
267 | 0 | Result.TextPtr = "<UNSET>"; |
268 | 0 | Result.IntVal = 7; |
269 | 0 | #endif |
270 | 0 | BufferPtr = TokEnd; |
271 | 0 | } |
272 | | |
273 | 0 | const char *Lexer::skipTextToken() { |
274 | 0 | const char *TokenPtr = BufferPtr; |
275 | 0 | assert(TokenPtr < CommentEnd); |
276 | 0 | StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r"; |
277 | |
|
278 | 0 | again: |
279 | 0 | size_t End = |
280 | 0 | StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols); |
281 | 0 | if (End == StringRef::npos) |
282 | 0 | return CommentEnd; |
283 | | |
284 | | // Doxygen doesn't recognize any commands in a one-line double quotation. |
285 | | // If we don't find an ending quotation mark, we pretend it never began. |
286 | 0 | if (*(TokenPtr + End) == '\"') { |
287 | 0 | TokenPtr += End + 1; |
288 | 0 | End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\""); |
289 | 0 | if (End != StringRef::npos && *(TokenPtr + End) == '\"') |
290 | 0 | TokenPtr += End + 1; |
291 | 0 | goto again; |
292 | 0 | } |
293 | 0 | return TokenPtr + End; |
294 | 0 | } |
295 | | |
296 | 0 | void Lexer::lexCommentText(Token &T) { |
297 | 0 | assert(CommentState == LCS_InsideBCPLComment || |
298 | 0 | CommentState == LCS_InsideCComment); |
299 | | |
300 | | // Handles lexing non-command text, i.e. text and newline. |
301 | 0 | auto HandleNonCommandToken = [&]() -> void { |
302 | 0 | assert(State == LS_Normal); |
303 | | |
304 | 0 | const char *TokenPtr = BufferPtr; |
305 | 0 | assert(TokenPtr < CommentEnd); |
306 | 0 | switch (*TokenPtr) { |
307 | 0 | case '\n': |
308 | 0 | case '\r': |
309 | 0 | TokenPtr = skipNewline(TokenPtr, CommentEnd); |
310 | 0 | formTokenWithChars(T, TokenPtr, tok::newline); |
311 | |
|
312 | 0 | if (CommentState == LCS_InsideCComment) |
313 | 0 | skipLineStartingDecorations(); |
314 | 0 | return; |
315 | | |
316 | 0 | default: |
317 | 0 | return formTextToken(T, skipTextToken()); |
318 | 0 | } |
319 | 0 | }; |
320 | |
|
321 | 0 | if (!ParseCommands) |
322 | 0 | return HandleNonCommandToken(); |
323 | | |
324 | 0 | switch (State) { |
325 | 0 | case LS_Normal: |
326 | 0 | break; |
327 | 0 | case LS_VerbatimBlockFirstLine: |
328 | 0 | lexVerbatimBlockFirstLine(T); |
329 | 0 | return; |
330 | 0 | case LS_VerbatimBlockBody: |
331 | 0 | lexVerbatimBlockBody(T); |
332 | 0 | return; |
333 | 0 | case LS_VerbatimLineText: |
334 | 0 | lexVerbatimLineText(T); |
335 | 0 | return; |
336 | 0 | case LS_HTMLStartTag: |
337 | 0 | lexHTMLStartTag(T); |
338 | 0 | return; |
339 | 0 | case LS_HTMLEndTag: |
340 | 0 | lexHTMLEndTag(T); |
341 | 0 | return; |
342 | 0 | } |
343 | | |
344 | 0 | assert(State == LS_Normal); |
345 | 0 | const char *TokenPtr = BufferPtr; |
346 | 0 | assert(TokenPtr < CommentEnd); |
347 | 0 | switch(*TokenPtr) { |
348 | 0 | case '\\': |
349 | 0 | case '@': { |
350 | | // Commands that start with a backslash and commands that start with |
351 | | // 'at' have equivalent semantics. But we keep information about the |
352 | | // exact syntax in AST for comments. |
353 | 0 | tok::TokenKind CommandKind = |
354 | 0 | (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; |
355 | 0 | TokenPtr++; |
356 | 0 | if (TokenPtr == CommentEnd) { |
357 | 0 | formTextToken(T, TokenPtr); |
358 | 0 | return; |
359 | 0 | } |
360 | 0 | char C = *TokenPtr; |
361 | 0 | switch (C) { |
362 | 0 | default: |
363 | 0 | break; |
364 | | |
365 | 0 | case '\\': case '@': case '&': case '$': |
366 | 0 | case '#': case '<': case '>': case '%': |
367 | 0 | case '\"': case '.': case ':': |
368 | | // This is one of \\ \@ \& \$ etc escape sequences. |
369 | 0 | TokenPtr++; |
370 | 0 | if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { |
371 | | // This is the \:: escape sequence. |
372 | 0 | TokenPtr++; |
373 | 0 | } |
374 | 0 | StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); |
375 | 0 | formTokenWithChars(T, TokenPtr, tok::text); |
376 | 0 | T.setText(UnescapedText); |
377 | 0 | return; |
378 | 0 | } |
379 | | |
380 | | // Don't make zero-length commands. |
381 | 0 | if (!isCommandNameStartCharacter(*TokenPtr)) { |
382 | 0 | formTextToken(T, TokenPtr); |
383 | 0 | return; |
384 | 0 | } |
385 | | |
386 | 0 | TokenPtr = skipCommandName(TokenPtr, CommentEnd); |
387 | 0 | unsigned Length = TokenPtr - (BufferPtr + 1); |
388 | | |
389 | | // Hardcoded support for lexing LaTeX formula commands |
390 | | // \f$ \f( \f) \f[ \f] \f{ \f} as a single command. |
391 | 0 | if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { |
392 | 0 | C = *TokenPtr; |
393 | 0 | if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' || |
394 | 0 | C == '{' || C == '}') { |
395 | 0 | TokenPtr++; |
396 | 0 | Length++; |
397 | 0 | } |
398 | 0 | } |
399 | |
|
400 | 0 | StringRef CommandName(BufferPtr + 1, Length); |
401 | |
|
402 | 0 | const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); |
403 | 0 | if (!Info) { |
404 | 0 | if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { |
405 | 0 | StringRef CorrectedName = Info->Name; |
406 | 0 | SourceLocation Loc = getSourceLocation(BufferPtr); |
407 | 0 | SourceLocation EndLoc = getSourceLocation(TokenPtr); |
408 | 0 | SourceRange FullRange = SourceRange(Loc, EndLoc); |
409 | 0 | SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); |
410 | 0 | Diag(Loc, diag::warn_correct_comment_command_name) |
411 | 0 | << FullRange << CommandName << CorrectedName |
412 | 0 | << FixItHint::CreateReplacement(CommandRange, CorrectedName); |
413 | 0 | } else { |
414 | 0 | formTokenWithChars(T, TokenPtr, tok::unknown_command); |
415 | 0 | T.setUnknownCommandName(CommandName); |
416 | 0 | Diag(T.getLocation(), diag::warn_unknown_comment_command_name) |
417 | 0 | << SourceRange(T.getLocation(), T.getEndLocation()); |
418 | 0 | return; |
419 | 0 | } |
420 | 0 | } |
421 | 0 | if (Info->IsVerbatimBlockCommand) { |
422 | 0 | setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); |
423 | 0 | return; |
424 | 0 | } |
425 | 0 | if (Info->IsVerbatimLineCommand) { |
426 | 0 | setupAndLexVerbatimLine(T, TokenPtr, Info); |
427 | 0 | return; |
428 | 0 | } |
429 | 0 | formTokenWithChars(T, TokenPtr, CommandKind); |
430 | 0 | T.setCommandID(Info->getID()); |
431 | 0 | return; |
432 | 0 | } |
433 | | |
434 | 0 | case '&': |
435 | 0 | lexHTMLCharacterReference(T); |
436 | 0 | return; |
437 | | |
438 | 0 | case '<': { |
439 | 0 | TokenPtr++; |
440 | 0 | if (TokenPtr == CommentEnd) { |
441 | 0 | formTextToken(T, TokenPtr); |
442 | 0 | return; |
443 | 0 | } |
444 | 0 | const char C = *TokenPtr; |
445 | 0 | if (isHTMLIdentifierStartingCharacter(C)) |
446 | 0 | setupAndLexHTMLStartTag(T); |
447 | 0 | else if (C == '/') |
448 | 0 | setupAndLexHTMLEndTag(T); |
449 | 0 | else |
450 | 0 | formTextToken(T, TokenPtr); |
451 | 0 | return; |
452 | 0 | } |
453 | | |
454 | 0 | default: |
455 | 0 | return HandleNonCommandToken(); |
456 | 0 | } |
457 | 0 | } |
458 | | |
459 | | void Lexer::setupAndLexVerbatimBlock(Token &T, |
460 | | const char *TextBegin, |
461 | 0 | char Marker, const CommandInfo *Info) { |
462 | 0 | assert(Info->IsVerbatimBlockCommand); |
463 | | |
464 | 0 | VerbatimBlockEndCommandName.clear(); |
465 | 0 | VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); |
466 | 0 | VerbatimBlockEndCommandName.append(Info->EndCommandName); |
467 | |
|
468 | 0 | formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); |
469 | 0 | T.setVerbatimBlockID(Info->getID()); |
470 | | |
471 | | // If there is a newline following the verbatim opening command, skip the |
472 | | // newline so that we don't create an tok::verbatim_block_line with empty |
473 | | // text content. |
474 | 0 | if (BufferPtr != CommentEnd && |
475 | 0 | isVerticalWhitespace(*BufferPtr)) { |
476 | 0 | BufferPtr = skipNewline(BufferPtr, CommentEnd); |
477 | 0 | State = LS_VerbatimBlockBody; |
478 | 0 | return; |
479 | 0 | } |
480 | | |
481 | 0 | State = LS_VerbatimBlockFirstLine; |
482 | 0 | } |
483 | | |
484 | 0 | void Lexer::lexVerbatimBlockFirstLine(Token &T) { |
485 | 0 | again: |
486 | 0 | assert(BufferPtr < CommentEnd); |
487 | | |
488 | | // FIXME: It would be better to scan the text once, finding either the block |
489 | | // end command or newline. |
490 | | // |
491 | | // Extract current line. |
492 | 0 | const char *Newline = findNewline(BufferPtr, CommentEnd); |
493 | 0 | StringRef Line(BufferPtr, Newline - BufferPtr); |
494 | | |
495 | | // Look for end command in current line. |
496 | 0 | size_t Pos = Line.find(VerbatimBlockEndCommandName); |
497 | 0 | const char *TextEnd; |
498 | 0 | const char *NextLine; |
499 | 0 | if (Pos == StringRef::npos) { |
500 | | // Current line is completely verbatim. |
501 | 0 | TextEnd = Newline; |
502 | 0 | NextLine = skipNewline(Newline, CommentEnd); |
503 | 0 | } else if (Pos == 0) { |
504 | | // Current line contains just an end command. |
505 | 0 | const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); |
506 | 0 | StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); |
507 | 0 | formTokenWithChars(T, End, tok::verbatim_block_end); |
508 | 0 | T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); |
509 | 0 | State = LS_Normal; |
510 | 0 | return; |
511 | 0 | } else { |
512 | | // There is some text, followed by end command. Extract text first. |
513 | 0 | TextEnd = BufferPtr + Pos; |
514 | 0 | NextLine = TextEnd; |
515 | | // If there is only whitespace before end command, skip whitespace. |
516 | 0 | if (isWhitespace(BufferPtr, TextEnd)) { |
517 | 0 | BufferPtr = TextEnd; |
518 | 0 | goto again; |
519 | 0 | } |
520 | 0 | } |
521 | | |
522 | 0 | StringRef Text(BufferPtr, TextEnd - BufferPtr); |
523 | 0 | formTokenWithChars(T, NextLine, tok::verbatim_block_line); |
524 | 0 | T.setVerbatimBlockText(Text); |
525 | |
|
526 | 0 | State = LS_VerbatimBlockBody; |
527 | 0 | } |
528 | | |
529 | 0 | void Lexer::lexVerbatimBlockBody(Token &T) { |
530 | 0 | assert(State == LS_VerbatimBlockBody); |
531 | | |
532 | 0 | if (CommentState == LCS_InsideCComment) |
533 | 0 | skipLineStartingDecorations(); |
534 | |
|
535 | 0 | if (BufferPtr == CommentEnd) { |
536 | 0 | formTokenWithChars(T, BufferPtr, tok::verbatim_block_line); |
537 | 0 | T.setVerbatimBlockText(""); |
538 | 0 | return; |
539 | 0 | } |
540 | | |
541 | 0 | lexVerbatimBlockFirstLine(T); |
542 | 0 | } |
543 | | |
544 | | void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
545 | 0 | const CommandInfo *Info) { |
546 | 0 | assert(Info->IsVerbatimLineCommand); |
547 | 0 | formTokenWithChars(T, TextBegin, tok::verbatim_line_name); |
548 | 0 | T.setVerbatimLineID(Info->getID()); |
549 | |
|
550 | 0 | State = LS_VerbatimLineText; |
551 | 0 | } |
552 | | |
553 | 0 | void Lexer::lexVerbatimLineText(Token &T) { |
554 | 0 | assert(State == LS_VerbatimLineText); |
555 | | |
556 | | // Extract current line. |
557 | 0 | const char *Newline = findNewline(BufferPtr, CommentEnd); |
558 | 0 | StringRef Text(BufferPtr, Newline - BufferPtr); |
559 | 0 | formTokenWithChars(T, Newline, tok::verbatim_line_text); |
560 | 0 | T.setVerbatimLineText(Text); |
561 | |
|
562 | 0 | State = LS_Normal; |
563 | 0 | } |
564 | | |
565 | 0 | void Lexer::lexHTMLCharacterReference(Token &T) { |
566 | 0 | const char *TokenPtr = BufferPtr; |
567 | 0 | assert(*TokenPtr == '&'); |
568 | 0 | TokenPtr++; |
569 | 0 | if (TokenPtr == CommentEnd) { |
570 | 0 | formTextToken(T, TokenPtr); |
571 | 0 | return; |
572 | 0 | } |
573 | 0 | const char *NamePtr; |
574 | 0 | bool isNamed = false; |
575 | 0 | bool isDecimal = false; |
576 | 0 | char C = *TokenPtr; |
577 | 0 | if (isHTMLNamedCharacterReferenceCharacter(C)) { |
578 | 0 | NamePtr = TokenPtr; |
579 | 0 | TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); |
580 | 0 | isNamed = true; |
581 | 0 | } else if (C == '#') { |
582 | 0 | TokenPtr++; |
583 | 0 | if (TokenPtr == CommentEnd) { |
584 | 0 | formTextToken(T, TokenPtr); |
585 | 0 | return; |
586 | 0 | } |
587 | 0 | C = *TokenPtr; |
588 | 0 | if (isHTMLDecimalCharacterReferenceCharacter(C)) { |
589 | 0 | NamePtr = TokenPtr; |
590 | 0 | TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); |
591 | 0 | isDecimal = true; |
592 | 0 | } else if (C == 'x' || C == 'X') { |
593 | 0 | TokenPtr++; |
594 | 0 | NamePtr = TokenPtr; |
595 | 0 | TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); |
596 | 0 | } else { |
597 | 0 | formTextToken(T, TokenPtr); |
598 | 0 | return; |
599 | 0 | } |
600 | 0 | } else { |
601 | 0 | formTextToken(T, TokenPtr); |
602 | 0 | return; |
603 | 0 | } |
604 | 0 | if (NamePtr == TokenPtr || TokenPtr == CommentEnd || |
605 | 0 | *TokenPtr != ';') { |
606 | 0 | formTextToken(T, TokenPtr); |
607 | 0 | return; |
608 | 0 | } |
609 | 0 | StringRef Name(NamePtr, TokenPtr - NamePtr); |
610 | 0 | TokenPtr++; // Skip semicolon. |
611 | 0 | StringRef Resolved; |
612 | 0 | if (isNamed) |
613 | 0 | Resolved = resolveHTMLNamedCharacterReference(Name); |
614 | 0 | else if (isDecimal) |
615 | 0 | Resolved = resolveHTMLDecimalCharacterReference(Name); |
616 | 0 | else |
617 | 0 | Resolved = resolveHTMLHexCharacterReference(Name); |
618 | |
|
619 | 0 | if (Resolved.empty()) { |
620 | 0 | formTextToken(T, TokenPtr); |
621 | 0 | return; |
622 | 0 | } |
623 | 0 | formTokenWithChars(T, TokenPtr, tok::text); |
624 | 0 | T.setText(Resolved); |
625 | 0 | } |
626 | | |
627 | 0 | void Lexer::setupAndLexHTMLStartTag(Token &T) { |
628 | 0 | assert(BufferPtr[0] == '<' && |
629 | 0 | isHTMLIdentifierStartingCharacter(BufferPtr[1])); |
630 | 0 | const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); |
631 | 0 | StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); |
632 | 0 | if (!isHTMLTagName(Name)) { |
633 | 0 | formTextToken(T, TagNameEnd); |
634 | 0 | return; |
635 | 0 | } |
636 | | |
637 | 0 | formTokenWithChars(T, TagNameEnd, tok::html_start_tag); |
638 | 0 | T.setHTMLTagStartName(Name); |
639 | |
|
640 | 0 | BufferPtr = skipWhitespace(BufferPtr, CommentEnd); |
641 | |
|
642 | 0 | const char C = *BufferPtr; |
643 | 0 | if (BufferPtr != CommentEnd && |
644 | 0 | (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) |
645 | 0 | State = LS_HTMLStartTag; |
646 | 0 | } |
647 | | |
648 | 0 | void Lexer::lexHTMLStartTag(Token &T) { |
649 | 0 | assert(State == LS_HTMLStartTag); |
650 | | |
651 | 0 | const char *TokenPtr = BufferPtr; |
652 | 0 | char C = *TokenPtr; |
653 | 0 | if (isHTMLIdentifierCharacter(C)) { |
654 | 0 | TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); |
655 | 0 | StringRef Ident(BufferPtr, TokenPtr - BufferPtr); |
656 | 0 | formTokenWithChars(T, TokenPtr, tok::html_ident); |
657 | 0 | T.setHTMLIdent(Ident); |
658 | 0 | } else { |
659 | 0 | switch (C) { |
660 | 0 | case '=': |
661 | 0 | TokenPtr++; |
662 | 0 | formTokenWithChars(T, TokenPtr, tok::html_equals); |
663 | 0 | break; |
664 | 0 | case '\"': |
665 | 0 | case '\'': { |
666 | 0 | const char *OpenQuote = TokenPtr; |
667 | 0 | TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); |
668 | 0 | const char *ClosingQuote = TokenPtr; |
669 | 0 | if (TokenPtr != CommentEnd) // Skip closing quote. |
670 | 0 | TokenPtr++; |
671 | 0 | formTokenWithChars(T, TokenPtr, tok::html_quoted_string); |
672 | 0 | T.setHTMLQuotedString(StringRef(OpenQuote + 1, |
673 | 0 | ClosingQuote - (OpenQuote + 1))); |
674 | 0 | break; |
675 | 0 | } |
676 | 0 | case '>': |
677 | 0 | TokenPtr++; |
678 | 0 | formTokenWithChars(T, TokenPtr, tok::html_greater); |
679 | 0 | State = LS_Normal; |
680 | 0 | return; |
681 | 0 | case '/': |
682 | 0 | TokenPtr++; |
683 | 0 | if (TokenPtr != CommentEnd && *TokenPtr == '>') { |
684 | 0 | TokenPtr++; |
685 | 0 | formTokenWithChars(T, TokenPtr, tok::html_slash_greater); |
686 | 0 | } else |
687 | 0 | formTextToken(T, TokenPtr); |
688 | |
|
689 | 0 | State = LS_Normal; |
690 | 0 | return; |
691 | 0 | } |
692 | 0 | } |
693 | | |
694 | | // Now look ahead and return to normal state if we don't see any HTML tokens |
695 | | // ahead. |
696 | 0 | BufferPtr = skipWhitespace(BufferPtr, CommentEnd); |
697 | 0 | if (BufferPtr == CommentEnd) { |
698 | 0 | State = LS_Normal; |
699 | 0 | return; |
700 | 0 | } |
701 | | |
702 | 0 | C = *BufferPtr; |
703 | 0 | if (!isHTMLIdentifierStartingCharacter(C) && |
704 | 0 | C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') { |
705 | 0 | State = LS_Normal; |
706 | 0 | return; |
707 | 0 | } |
708 | 0 | } |
709 | | |
710 | 0 | void Lexer::setupAndLexHTMLEndTag(Token &T) { |
711 | 0 | assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); |
712 | | |
713 | 0 | const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); |
714 | 0 | const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); |
715 | 0 | StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); |
716 | 0 | if (!isHTMLTagName(Name)) { |
717 | 0 | formTextToken(T, TagNameEnd); |
718 | 0 | return; |
719 | 0 | } |
720 | | |
721 | 0 | const char *End = skipWhitespace(TagNameEnd, CommentEnd); |
722 | |
|
723 | 0 | formTokenWithChars(T, End, tok::html_end_tag); |
724 | 0 | T.setHTMLTagEndName(Name); |
725 | |
|
726 | 0 | if (BufferPtr != CommentEnd && *BufferPtr == '>') |
727 | 0 | State = LS_HTMLEndTag; |
728 | 0 | } |
729 | | |
730 | 0 | void Lexer::lexHTMLEndTag(Token &T) { |
731 | 0 | assert(BufferPtr != CommentEnd && *BufferPtr == '>'); |
732 | | |
733 | 0 | formTokenWithChars(T, BufferPtr + 1, tok::html_greater); |
734 | 0 | State = LS_Normal; |
735 | 0 | } |
736 | | |
737 | | Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, |
738 | | const CommandTraits &Traits, SourceLocation FileLoc, |
739 | | const char *BufferStart, const char *BufferEnd, bool ParseCommands) |
740 | | : Allocator(Allocator), Diags(Diags), Traits(Traits), |
741 | | BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart), |
742 | | FileLoc(FileLoc), ParseCommands(ParseCommands), |
743 | 0 | CommentState(LCS_BeforeComment), State(LS_Normal) {} |
744 | | |
745 | 0 | void Lexer::lex(Token &T) { |
746 | 0 | again: |
747 | 0 | switch (CommentState) { |
748 | 0 | case LCS_BeforeComment: |
749 | 0 | if (BufferPtr == BufferEnd) { |
750 | 0 | formTokenWithChars(T, BufferPtr, tok::eof); |
751 | 0 | return; |
752 | 0 | } |
753 | | |
754 | 0 | assert(*BufferPtr == '/'); |
755 | 0 | BufferPtr++; // Skip first slash. |
756 | 0 | switch(*BufferPtr) { |
757 | 0 | case '/': { // BCPL comment. |
758 | 0 | BufferPtr++; // Skip second slash. |
759 | |
|
760 | 0 | if (BufferPtr != BufferEnd) { |
761 | | // Skip Doxygen magic marker, if it is present. |
762 | | // It might be missing because of a typo //< or /*<, or because we |
763 | | // merged this non-Doxygen comment into a bunch of Doxygen comments |
764 | | // around it: /** ... */ /* ... */ /** ... */ |
765 | 0 | const char C = *BufferPtr; |
766 | 0 | if (C == '/' || C == '!') |
767 | 0 | BufferPtr++; |
768 | 0 | } |
769 | | |
770 | | // Skip less-than symbol that marks trailing comments. |
771 | | // Skip it even if the comment is not a Doxygen one, because //< and /*< |
772 | | // are frequent typos. |
773 | 0 | if (BufferPtr != BufferEnd && *BufferPtr == '<') |
774 | 0 | BufferPtr++; |
775 | |
|
776 | 0 | CommentState = LCS_InsideBCPLComment; |
777 | 0 | if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) |
778 | 0 | State = LS_Normal; |
779 | 0 | CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); |
780 | 0 | goto again; |
781 | 0 | } |
782 | 0 | case '*': { // C comment. |
783 | 0 | BufferPtr++; // Skip star. |
784 | | |
785 | | // Skip Doxygen magic marker. |
786 | 0 | const char C = *BufferPtr; |
787 | 0 | if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') |
788 | 0 | BufferPtr++; |
789 | | |
790 | | // Skip less-than symbol that marks trailing comments. |
791 | 0 | if (BufferPtr != BufferEnd && *BufferPtr == '<') |
792 | 0 | BufferPtr++; |
793 | |
|
794 | 0 | CommentState = LCS_InsideCComment; |
795 | 0 | State = LS_Normal; |
796 | 0 | CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); |
797 | 0 | goto again; |
798 | 0 | } |
799 | 0 | default: |
800 | 0 | llvm_unreachable("second character of comment should be '/' or '*'"); |
801 | 0 | } |
802 | | |
803 | 0 | case LCS_BetweenComments: { |
804 | | // Consecutive comments are extracted only if there is only whitespace |
805 | | // between them. So we can search for the start of the next comment. |
806 | 0 | const char *EndWhitespace = BufferPtr; |
807 | 0 | while(EndWhitespace != BufferEnd && *EndWhitespace != '/') |
808 | 0 | EndWhitespace++; |
809 | | |
810 | | // Turn any whitespace between comments (and there is only whitespace |
811 | | // between them -- guaranteed by comment extraction) into a newline. We |
812 | | // have two newlines between C comments in total (first one was synthesized |
813 | | // after a comment). |
814 | 0 | formTokenWithChars(T, EndWhitespace, tok::newline); |
815 | |
|
816 | 0 | CommentState = LCS_BeforeComment; |
817 | 0 | break; |
818 | 0 | } |
819 | | |
820 | 0 | case LCS_InsideBCPLComment: |
821 | 0 | case LCS_InsideCComment: |
822 | 0 | if (BufferPtr != CommentEnd) { |
823 | 0 | lexCommentText(T); |
824 | 0 | break; |
825 | 0 | } else { |
826 | | // Skip C comment closing sequence. |
827 | 0 | if (CommentState == LCS_InsideCComment) { |
828 | 0 | assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); |
829 | 0 | BufferPtr += 2; |
830 | 0 | assert(BufferPtr <= BufferEnd); |
831 | | |
832 | | // Synthenize newline just after the C comment, regardless if there is |
833 | | // actually a newline. |
834 | 0 | formTokenWithChars(T, BufferPtr, tok::newline); |
835 | |
|
836 | 0 | CommentState = LCS_BetweenComments; |
837 | 0 | break; |
838 | 0 | } else { |
839 | | // Don't synthesized a newline after BCPL comment. |
840 | 0 | CommentState = LCS_BetweenComments; |
841 | 0 | goto again; |
842 | 0 | } |
843 | 0 | } |
844 | 0 | } |
845 | 0 | } |
846 | | |
847 | | StringRef Lexer::getSpelling(const Token &Tok, |
848 | 0 | const SourceManager &SourceMgr) const { |
849 | 0 | SourceLocation Loc = Tok.getLocation(); |
850 | 0 | std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); |
851 | |
|
852 | 0 | bool InvalidTemp = false; |
853 | 0 | StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); |
854 | 0 | if (InvalidTemp) |
855 | 0 | return StringRef(); |
856 | | |
857 | 0 | const char *Begin = File.data() + LocInfo.second; |
858 | 0 | return StringRef(Begin, Tok.getLength()); |
859 | 0 | } |
860 | | |
861 | | } // end namespace comments |
862 | | } // end namespace clang |