/src/llvm-project/clang-tools-extra/pseudo/lib/Lex.cpp

Source (jump to first uncovered line)
//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Token.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/LiteralSupport.h"

namespace clang {
namespace pseudo {

// Splits Code into a stream of raw tokens using clang's lexer in raw mode.
//
// Comments are retained as tokens. For every token (up to, but excluding, eof)
// the stream records its text, length and kind, the 0-based line it starts on,
// the indentation of that line (a space counts 1, a tab counts 8), its index
// in the original token sequence, and the StartsPPLine / NeedsCleaning flags.
//
// Token text points directly into Code's buffer, so presumably Code must
// outlive the returned stream.
TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
  // Default-constructed location: token offsets are measured relative to it.
  const clang::SourceLocation Origin;
  const char *const BufferBegin = Code.data();
  const char *const BufferEnd = BufferBegin + Code.size();
  // The lexer requires a null-terminated buffer; std::string guarantees one.
  clang::Lexer RawLexer(Origin, LangOpts, BufferBegin, BufferBegin, BufferEnd);
  RawLexer.SetCommentRetentionState(true);

  TokenStream Stream;
  // Position of the next token within the original source's token sequence.
  Token::Index NextIndex = 0;
  // Offset of the previously emitted token, and the running line/indent state.
  unsigned PrevOffset = 0;
  unsigned CurLine = 0;
  unsigned CurIndent = 0;

  clang::Token Raw;
  while (true) {
    RawLexer.LexFromRawLexer(Raw);
    if (Raw.getKind() == clang::tok::eof)
      break;

    const unsigned Offset =
        Raw.getLocation().getRawEncoding() - Origin.getRawEncoding();

    // Count the newlines between the start of the previous token and the
    // start of this one, remembering where the most recent line begins.
    // A non-zero LineBegin doubles as the "saw a newline" marker.
    unsigned LineBegin = 0;
    unsigned Scan = PrevOffset;
    while (Scan < Offset) {
      if (Code[Scan] == '\n') {
        ++CurLine;
        LineBegin = Scan + 1;
      }
      ++Scan;
    }

    // Recompute indentation when a new line has started, or while the
    // previous token offset is still zero (i.e. at the start of the buffer).
    if (LineBegin != 0 || PrevOffset == 0) {
      CurIndent = 0;
      for (char C : StringRef(Code).slice(LineBegin, Offset)) {
        if (C != ' ' && C != '\t')
          break;
        CurIndent += (C == '\t') ? 8 : 1;
      }
    }

    Token Tok;
    Tok.Data = &Code[Offset];
    Tok.Length = Raw.getLength();
    Tok.Kind = Raw.getKind();
    Tok.Line = CurLine;
    Tok.Indent = CurIndent;
    Tok.OriginalIndex = NextIndex++;

    if (Raw.isAtStartOfLine())
      Tok.setFlag(LexFlags::StartsPPLine);
    // Tokens flagged by the lexer as needing cleaning, or containing UCNs,
    // are marked so that cook() can normalize their text later.
    if (Raw.needsCleaning() || Raw.hasUCN())
      Tok.setFlag(LexFlags::NeedsCleaning);

    Stream.push(Tok);
    PrevOffset = Offset;
  }

  Stream.finalize();
  return Stream;
}

// Produces a "cooked" copy of the token stream Code:
//  - tokens flagged NeedsCleaning get their text rewritten (escaped newlines
//    and trigraphs removed; UCNs expanded for raw identifiers only) into
//    storage owned by the result, and the flag is cleared;
//  - raw_identifier tokens are resolved to their identifier/keyword kind;
//  - each `>>` token is split into two `>` tokens.
// The payload of Code is attached to the result.
TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
  auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
  clang::IdentifierTable Identifiers(LangOpts);
  TokenStream Result(CleanedStorage);
  Result.addPayload(Code.getPayload());

  // Each token is copied on purpose: the copy is edited and then pushed.
  for (auto Cooked : Code.tokens()) {
    const bool IsRawIdentifier = Cooked.Kind == tok::raw_identifier;

    if (Cooked.flag(LexFlags::NeedsCleaning)) {
      // Re-read the token one logical character at a time, which drops
      // escaped newlines and decodes trigraphs.
      llvm::SmallString<64> CleanBuffer;
      const char *const TextEnd = Cooked.text().end();
      for (const char *Cursor = Cooked.text().begin(); Cursor < TextEnd;) {
        auto [Char, CharSize] =
            clang::Lexer::getCharAndSizeNoWarn(Cursor, LangOpts);
        CleanBuffer.push_back(Char);
        assert(CharSize != 0 && "no progress!");
        Cursor += CharSize;
      }

      // On a surface reading of the standard UCNs could show up anywhere,
      // yet decoding them is only required for raw_identifiers:
      //  - punctuation/keyword tokens cannot contain them, since outside of
      //    literals a UCN cannot encode a basic character [lex.charset]
      //  - literals may contain them, but they are left escaped here and
      //    treated as escape sequences when the literal is evaluated
      //  - comments are handled like literals
      // That is convenient, as expandUCNs expects a reasonably valid
      // identifier as input (e.g. no stray backslashes).
      llvm::SmallString<64> UCNBuffer;
      llvm::StringRef Text = CleanBuffer;
      if (IsRawIdentifier) {
        clang::expandUCNs(UCNBuffer, CleanBuffer);
        Text = UCNBuffer;
      }

      // Persist the cleaned text in the result's allocator and mark the
      // token as clean.
      Cooked.Data = Text.copy(*CleanedStorage).data();
      Cooked.Length = Text.size();
      const auto CleaningMask =
          static_cast<decltype(Cooked.Flags)>(LexFlags::NeedsCleaning);
      Cooked.Flags &= ~CleaningMask;
    }

    if (IsRawIdentifier) {
      // Look the spelling up to obtain the identifier/keyword token kind.
      Cooked.Kind = Identifiers.get(Cooked.text()).getTokenID();
    } else if (Cooked.Kind == tok::greatergreater) {
      // Emit `>>` as two separate `>` tokens: push the first one here, then
      // advance the text pointer so the common push below emits the second.
      // FIXME: split lessless token to support Cuda triple angle brackets <<<.
      assert(Cooked.text() == ">>");
      Cooked.Kind = tok::greater;
      Cooked.Length = 1;
      Result.push(Cooked);
      // Line is wrong if the first greater is followed by an escaped newline!
      Cooked.Data = Cooked.text().data() + 1;
    }

    Result.push(std::move(Cooked));
  }

  Result.finalize();
  return Result;
}

} // namespace pseudo
} // namespace clang

Line	Count	Source (jump to first uncovered line)
1		//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8
9		#include "clang-pseudo/Token.h"
10		#include "clang/Basic/IdentifierTable.h"
11		#include "clang/Basic/SourceLocation.h"
12		#include "clang/Basic/TokenKinds.h"
13		#include "clang/Lex/Lexer.h"
14		#include "clang/Lex/LiteralSupport.h"
15
16		namespace clang {
17		namespace pseudo {
18
19	9.56k	TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
20	9.56k	clang::SourceLocation Start;
21		// Tokenize using clang's lexer in raw mode.
22		// std::string guarantees null-termination, which the lexer needs.
23	9.56k	clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
24	9.56k	Code.data() + Code.size());
25	9.56k	Lexer.SetCommentRetentionState(true);
26
27	9.56k	TokenStream Result;
28	9.56k	clang::Token CT;
29		// Index into the token stream of original source code.
30	9.56k	Token::Index TokenIndex = 0;
31	9.56k	unsigned LastOffset = 0;
32	9.56k	unsigned Line = 0;
33	9.56k	unsigned Indent = 0;
34	52.9M	for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
35	52.9M	Lexer.LexFromRawLexer(CT)) {
36	52.9M	unsigned Offset =
37	52.9M	CT.getLocation().getRawEncoding() - Start.getRawEncoding();
38
39	52.9M	Token Tok;
40	52.9M	Tok.Data = &Code[Offset];
41	52.9M	Tok.Length = CT.getLength();
42	52.9M	Tok.Kind = CT.getKind();
43
44		// Update current line number and indentation from raw source code.
45	52.9M	unsigned NewLineStart = 0;
46	160M	for (unsigned I = LastOffset; I < Offset; ++I) {
47	107M	if (Code[I] == '\n') {
48	1.30M	NewLineStart = I + 1;
49	1.30M	++Line;
50	1.30M	}
51	107M	}
52	52.9M	if (NewLineStart || !LastOffset) {
53	1.23M	Indent = 0;
54	1.23M	for (char C : StringRef(Code).slice(NewLineStart, Offset)) {
55	332k	if (C == ' ')
56	1.88k	++Indent;
57	330k	else if (C == '\t')
58	2.86k	Indent += 8;
59	327k	else
60	327k	break;
61	332k	}
62	1.23M	}
63	52.9M	Tok.Indent = Indent;
64	52.9M	Tok.Line = Line;
65
66	52.9M	if (CT.isAtStartOfLine())
67	1.17M	Tok.setFlag(LexFlags::StartsPPLine);
68	52.9M	if (CT.needsCleaning() || CT.hasUCN())
69	155k	Tok.setFlag(LexFlags::NeedsCleaning);
70
71	52.9M	Tok.OriginalIndex = TokenIndex++;
72	52.9M	Result.push(Tok);
73	52.9M	LastOffset = Offset;
74	52.9M	}
75	9.56k	Result.finalize();
76	9.56k	return Result;
77	9.56k	}
78
79	9.56k	TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
80	9.56k	auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
81	9.56k	clang::IdentifierTable Identifiers(LangOpts);
82	9.56k	TokenStream Result(CleanedStorage);
83	9.56k	Result.addPayload(Code.getPayload());
84	52.9M	for (auto Tok : Code.tokens()) {
85	52.9M	if (Tok.flag(LexFlags::NeedsCleaning)) {
86		// Remove escaped newlines and trigraphs.
87	155k	llvm::SmallString<64> CleanBuffer;
88	155k	const char *Pos = Tok.text().begin();
89	34.4M	while (Pos < Tok.text().end()) {
90	34.3M	auto [Char, CharSize] =
91	34.3M	clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts);
92	34.3M	CleanBuffer.push_back(Char);
93	34.3M	assert(CharSize != 0 && "no progress!");
94	0	Pos += CharSize;
95	34.3M	}
96	155k	llvm::StringRef Text = CleanBuffer;
97	155k	llvm::SmallString<64> UCNBuffer;
98		// A surface reading of the standard suggests UCNs might appear anywhere.
99		// But we need only decode them in raw_identifiers.
100		// - they cannot appear in punctuation/keyword tokens, because UCNs
101		// cannot encode basic characters outside of literals [lex.charset]
102		// - they can appear in literals, but we need not unescape them now.
103		// We treat them as escape sequences when evaluating the literal.
104		// - comments are handled similarly to literals
105		// This is good fortune, because expandUCNs requires its input to be a
106		// reasonably valid identifier (e.g. without stray backslashes).
107	155k	if (Tok.Kind == tok::raw_identifier) {
108	13.5k	clang::expandUCNs(UCNBuffer, CleanBuffer);
109	13.5k	Text = UCNBuffer;
110	13.5k	}
111
112	155k	Tok.Data = Text.copy(*CleanedStorage).data();
113	155k	Tok.Length = Text.size();
114	155k	Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
115	155k	}
116
117	52.9M	if (Tok.Kind == tok::raw_identifier) {
118		// Cook raw_identifiers into identifier, keyword, etc.
119	3.12M	Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
120	49.7M	} else if (Tok.Kind == tok::greatergreater) {
121		// Split the greatergreater token.
122		// FIXME: split lessless token to support Cuda triple angle brackets <<<.
123	228k	assert(Tok.text() == ">>");
124	0	Tok.Kind = tok::greater;
125	228k	Tok.Length = 1;
126	228k	Result.push(Tok);
127		// Line is wrong if the first greater is followed by an escaped newline!
128	228k	Tok.Data = Tok.text().data() + 1;
129	228k	}
130
131	0	Result.push(std::move(Tok));
132	52.9M	}
133
134	9.56k	Result.finalize();
135	9.56k	return Result;
136	9.56k	}
137
138		} // namespace pseudo
139		} // namespace clang

Coverage Report

Created: 2024-01-17 10:31