Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang-tools-extra/pseudo/lib/Lex.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang-pseudo/Token.h"
10
#include "clang/Basic/IdentifierTable.h"
11
#include "clang/Basic/SourceLocation.h"
12
#include "clang/Basic/TokenKinds.h"
13
#include "clang/Lex/Lexer.h"
14
#include "clang/Lex/LiteralSupport.h"
15
16
namespace clang {
17
namespace pseudo {
18
19
9.56k
TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
20
9.56k
  clang::SourceLocation Start;
21
  // Tokenize using clang's lexer in raw mode.
22
  // std::string guarantees null-termination, which the lexer needs.
23
9.56k
  clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
24
9.56k
                     Code.data() + Code.size());
25
9.56k
  Lexer.SetCommentRetentionState(true);
26
27
9.56k
  TokenStream Result;
28
9.56k
  clang::Token CT;
29
  // Index into the token stream of original source code.
30
9.56k
  Token::Index TokenIndex = 0;
31
9.56k
  unsigned LastOffset = 0;
32
9.56k
  unsigned Line = 0;
33
9.56k
  unsigned Indent = 0;
34
52.9M
  for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
35
52.9M
       Lexer.LexFromRawLexer(CT)) {
36
52.9M
    unsigned Offset =
37
52.9M
        CT.getLocation().getRawEncoding() - Start.getRawEncoding();
38
39
52.9M
    Token Tok;
40
52.9M
    Tok.Data = &Code[Offset];
41
52.9M
    Tok.Length = CT.getLength();
42
52.9M
    Tok.Kind = CT.getKind();
43
44
    // Update current line number and indentation from raw source code.
45
52.9M
    unsigned NewLineStart = 0;
46
160M
    for (unsigned I = LastOffset; I < Offset; ++I) {
47
107M
      if (Code[I] == '\n') {
48
1.30M
        NewLineStart = I + 1;
49
1.30M
        ++Line;
50
1.30M
      }
51
107M
    }
52
52.9M
    if (NewLineStart || !LastOffset) {
53
1.23M
      Indent = 0;
54
1.23M
      for (char C : StringRef(Code).slice(NewLineStart, Offset)) {
55
332k
        if (C == ' ')
56
1.88k
          ++Indent;
57
330k
        else if (C == '\t')
58
2.86k
          Indent += 8;
59
327k
        else
60
327k
          break;
61
332k
      }
62
1.23M
    }
63
52.9M
    Tok.Indent = Indent;
64
52.9M
    Tok.Line = Line;
65
66
52.9M
    if (CT.isAtStartOfLine())
67
1.17M
      Tok.setFlag(LexFlags::StartsPPLine);
68
52.9M
    if (CT.needsCleaning() || CT.hasUCN())
69
155k
      Tok.setFlag(LexFlags::NeedsCleaning);
70
71
52.9M
    Tok.OriginalIndex = TokenIndex++;
72
52.9M
    Result.push(Tok);
73
52.9M
    LastOffset = Offset;
74
52.9M
  }
75
9.56k
  Result.finalize();
76
9.56k
  return Result;
77
9.56k
}
78
79
9.56k
TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
80
9.56k
  auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
81
9.56k
  clang::IdentifierTable Identifiers(LangOpts);
82
9.56k
  TokenStream Result(CleanedStorage);
83
9.56k
  Result.addPayload(Code.getPayload());
84
52.9M
  for (auto Tok : Code.tokens()) {
85
52.9M
    if (Tok.flag(LexFlags::NeedsCleaning)) {
86
      // Remove escaped newlines and trigraphs.
87
155k
      llvm::SmallString<64> CleanBuffer;
88
155k
      const char *Pos = Tok.text().begin();
89
34.4M
      while (Pos < Tok.text().end()) {
90
34.3M
        auto [Char, CharSize] =
91
34.3M
            clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts);
92
34.3M
        CleanBuffer.push_back(Char);
93
34.3M
        assert(CharSize != 0 && "no progress!");
94
0
        Pos += CharSize;
95
34.3M
      }
96
155k
      llvm::StringRef Text = CleanBuffer;
97
155k
      llvm::SmallString<64> UCNBuffer;
98
      // A surface reading of the standard suggests UCNs might appear anywhere.
99
      // But we need only decode them in raw_identifiers.
100
      //  - they cannot appear in punctuation/keyword tokens, because UCNs
101
      //    cannot encode basic characters outside of literals [lex.charset]
102
      //  - they can appear in literals, but we need not unescape them now.
103
      //    We treat them as escape sequences when evaluating the literal.
104
      //  - comments are handled similarly to literals
105
      // This is good fortune, because expandUCNs requires its input to be a
106
      // reasonably valid identifier (e.g. without stray backslashes).
107
155k
      if (Tok.Kind == tok::raw_identifier) {
108
13.5k
        clang::expandUCNs(UCNBuffer, CleanBuffer);
109
13.5k
        Text = UCNBuffer;
110
13.5k
      }
111
112
155k
      Tok.Data = Text.copy(*CleanedStorage).data();
113
155k
      Tok.Length = Text.size();
114
155k
      Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
115
155k
    }
116
117
52.9M
    if (Tok.Kind == tok::raw_identifier) {
118
      // Cook raw_identifiers into identifier, keyword, etc.
119
3.12M
      Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
120
49.7M
    } else if (Tok.Kind == tok::greatergreater) {
121
      // Split the greatergreater token.
122
      // FIXME: split lessless token to support Cuda triple angle brackets <<<.
123
228k
      assert(Tok.text() == ">>");
124
0
      Tok.Kind = tok::greater;
125
228k
      Tok.Length = 1;
126
228k
      Result.push(Tok);
127
      // Line is wrong if the first greater is followed by an escaped newline!
128
228k
      Tok.Data = Tok.text().data() + 1;
129
228k
    }
130
131
0
    Result.push(std::move(Tok));
132
52.9M
  }
133
134
9.56k
  Result.finalize();
135
9.56k
  return Result;
136
9.56k
}
137
138
} // namespace pseudo
139
} // namespace clang